├── code ├── __init__.py ├── chp1 │ ├── __init__.py │ ├── downloading_a_page.py │ ├── retrying_downloads.py │ ├── setting_user_agent.py │ ├── throttle.py │ ├── sitemap_crawler.py │ ├── id_iteration_crawler.py │ ├── link_crawler.py │ ├── advanced_link_crawler_using_requests.py │ └── advanced_link_crawler.py ├── chp2 │ ├── __init__.py │ ├── lxml_brokenhtml.py │ ├── xpath_scraper.py │ ├── lxml_scraper.py │ ├── beautifulsoup.py │ ├── family_trees.py │ ├── regex.py │ ├── beautifulsoup_brokenhtml.py │ ├── csv_callback.py │ ├── test_scrapers.py │ ├── all_scrapers.py │ └── advanced_link_crawler.py ├── chp3 │ ├── __init__.py │ ├── url_parsing.py │ ├── rediscache.py │ ├── requests_cache_link_crawler.py │ ├── advanced_link_crawler.py │ ├── downloader.py │ ├── downloader_requests_cache.py │ └── diskcache.py ├── chp6 │ ├── __init__.py │ ├── login_form_requests.py │ ├── submit_login_form.py │ ├── mechanize_form.py │ ├── login.py │ ├── firefox_sessions.py │ ├── selenium_forms.py │ └── edit.py ├── chp7 │ ├── __init__.py │ ├── investigate_form.py │ ├── image_processing.py │ ├── test_samples.py │ ├── register_with_ocr.py │ ├── register_with_api.py │ ├── using_captcha_api.py │ └── captcha_api.py ├── chp8 │ ├── __init__.py │ └── example │ │ ├── example │ │ ├── __init__.py │ │ ├── spiders │ │ │ ├── __init__.py │ │ │ └── country.py │ │ ├── items.py │ │ ├── pipelines.py │ │ ├── middlewares.py │ │ └── settings.py │ │ └── scrapy.cfg ├── chp9 │ ├── __init__.py │ ├── gap_scraper_callback.py │ ├── facebook_graph.py │ ├── scrape_google.py │ ├── facebook_selenium.py │ └── bmw_scraper.py ├── example_config.cfg ├── chp5 │ ├── lxml_attempt.py │ ├── pyqt_search_browser_render.py │ ├── json_one_req.py │ ├── selenium_search.py │ ├── pyqt_webkit.py │ ├── json_scraper.py │ ├── pyqt_search.py │ └── browser_render.py └── chp4 │ ├── extract_list.py │ ├── alexa_callback.py │ ├── redis_queue.py │ ├── advanced_link_crawler.py │ ├── threaded_crawler.py │ └── threaded_crawler_with_queue.py ├── data ├── captcha_samples │ ├── sample1.png │ ├── sample10.png │ ├── sample100.png │ ├── sample11.png │ ├── sample12.png │ ├── sample13.png │ ├── sample14.png │ ├── sample15.png │ ├── sample16.png │ ├── sample17.png │ ├── sample18.png │ ├── sample19.png │ ├── sample2.png │ ├── sample20.png │ ├── sample21.png │ ├── sample22.png │ ├── sample23.png │ ├── sample24.png │ ├── sample25.png │ ├── sample26.png │ ├── sample27.png │ ├── sample28.png │ ├── sample29.png │ ├── sample3.png │ ├── sample30.png │ ├── sample31.png │ ├── sample32.png │ ├── sample33.png │ ├── sample34.png │ ├── sample35.png │ ├── sample36.png │ ├── sample37.png │ ├── sample38.png │ ├── sample39.png │ ├── sample4.png │ ├── sample40.png │ ├── sample41.png │ ├── sample42.png │ ├── sample43.png │ ├── sample44.png │ ├── sample45.png │ ├── sample46.png │ ├── sample47.png │ ├── sample48.png │ ├── sample49.png │ ├── sample5.png │ ├── sample50.png │ ├── sample51.png │ ├── sample52.png │ ├── sample53.png │ ├── sample54.png │ ├── sample55.png │ ├── sample56.png │ ├── sample57.png │ ├── sample58.png │ ├── sample59.png │ ├── sample6.png │ ├── sample60.png │ ├── sample61.png │ ├── sample62.png │ ├── sample63.png │ ├── sample64.png │ ├── sample65.png │ ├── sample66.png │ ├── sample67.png │ ├── sample68.png │ ├── sample69.png │ ├── sample7.png │ ├── sample70.png │ ├── sample71.png │ ├── sample72.png │ ├── sample73.png │ ├── sample74.png │ ├── sample75.png │ ├── sample76.png │ ├── sample77.png │ ├── sample78.png │ ├── sample79.png │ ├── sample8.png │ ├── sample80.png │ ├── sample81.png │ ├── 
sample82.png │ ├── sample83.png │ ├── sample84.png │ ├── sample85.png │ ├── sample86.png │ ├── sample87.png │ ├── sample88.png │ ├── sample89.png │ ├── sample9.png │ ├── sample90.png │ ├── sample91.png │ ├── sample92.png │ ├── sample93.png │ ├── sample94.png │ ├── sample95.png │ ├── sample96.png │ ├── sample97.png │ ├── sample98.png │ ├── sample99.png │ └── samples.csv └── .gitignore ├── .gitignore └── README.md /code/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /code/chp1/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /code/chp2/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /code/chp3/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /code/chp6/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /code/chp7/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /code/chp8/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /code/chp9/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /code/chp8/example/example/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /code/example_config.cfg: -------------------------------------------------------------------------------- 1 | [captcha_api] 2 | key=ERU285FKDSL28311 3 | -------------------------------------------------------------------------------- /data/captcha_samples/sample1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample1.png -------------------------------------------------------------------------------- /data/captcha_samples/sample10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample10.png -------------------------------------------------------------------------------- /data/captcha_samples/sample100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample100.png -------------------------------------------------------------------------------- /data/captcha_samples/sample11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample11.png -------------------------------------------------------------------------------- /data/captcha_samples/sample12.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample12.png -------------------------------------------------------------------------------- /data/captcha_samples/sample13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample13.png -------------------------------------------------------------------------------- /data/captcha_samples/sample14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample14.png -------------------------------------------------------------------------------- /data/captcha_samples/sample15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample15.png -------------------------------------------------------------------------------- /data/captcha_samples/sample16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample16.png -------------------------------------------------------------------------------- /data/captcha_samples/sample17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample17.png -------------------------------------------------------------------------------- /data/captcha_samples/sample18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample18.png -------------------------------------------------------------------------------- /data/captcha_samples/sample19.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample19.png -------------------------------------------------------------------------------- /data/captcha_samples/sample2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample2.png -------------------------------------------------------------------------------- /data/captcha_samples/sample20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample20.png -------------------------------------------------------------------------------- /data/captcha_samples/sample21.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample21.png -------------------------------------------------------------------------------- /data/captcha_samples/sample22.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample22.png -------------------------------------------------------------------------------- /data/captcha_samples/sample23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample23.png 
-------------------------------------------------------------------------------- /data/captcha_samples/sample24.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample24.png -------------------------------------------------------------------------------- /data/captcha_samples/sample25.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample25.png -------------------------------------------------------------------------------- /data/captcha_samples/sample26.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample26.png -------------------------------------------------------------------------------- /data/captcha_samples/sample27.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample27.png -------------------------------------------------------------------------------- /data/captcha_samples/sample28.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample28.png -------------------------------------------------------------------------------- /data/captcha_samples/sample29.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample29.png -------------------------------------------------------------------------------- /data/captcha_samples/sample3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample3.png -------------------------------------------------------------------------------- /data/captcha_samples/sample30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample30.png -------------------------------------------------------------------------------- /data/captcha_samples/sample31.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample31.png -------------------------------------------------------------------------------- /data/captcha_samples/sample32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample32.png -------------------------------------------------------------------------------- /data/captcha_samples/sample33.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample33.png -------------------------------------------------------------------------------- /data/captcha_samples/sample34.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample34.png -------------------------------------------------------------------------------- /data/captcha_samples/sample35.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample35.png -------------------------------------------------------------------------------- /data/captcha_samples/sample36.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample36.png -------------------------------------------------------------------------------- /data/captcha_samples/sample37.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample37.png -------------------------------------------------------------------------------- /data/captcha_samples/sample38.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample38.png -------------------------------------------------------------------------------- /data/captcha_samples/sample39.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample39.png -------------------------------------------------------------------------------- /data/captcha_samples/sample4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample4.png -------------------------------------------------------------------------------- /data/captcha_samples/sample40.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample40.png -------------------------------------------------------------------------------- /data/captcha_samples/sample41.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample41.png -------------------------------------------------------------------------------- /data/captcha_samples/sample42.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample42.png -------------------------------------------------------------------------------- /data/captcha_samples/sample43.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample43.png -------------------------------------------------------------------------------- /data/captcha_samples/sample44.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample44.png -------------------------------------------------------------------------------- /data/captcha_samples/sample45.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample45.png -------------------------------------------------------------------------------- /data/captcha_samples/sample46.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample46.png 
-------------------------------------------------------------------------------- /data/captcha_samples/sample47.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample47.png -------------------------------------------------------------------------------- /data/captcha_samples/sample48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample48.png -------------------------------------------------------------------------------- /data/captcha_samples/sample49.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample49.png -------------------------------------------------------------------------------- /data/captcha_samples/sample5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample5.png -------------------------------------------------------------------------------- /data/captcha_samples/sample50.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample50.png -------------------------------------------------------------------------------- /data/captcha_samples/sample51.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample51.png -------------------------------------------------------------------------------- /data/captcha_samples/sample52.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample52.png -------------------------------------------------------------------------------- /data/captcha_samples/sample53.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample53.png -------------------------------------------------------------------------------- /data/captcha_samples/sample54.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample54.png -------------------------------------------------------------------------------- /data/captcha_samples/sample55.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample55.png -------------------------------------------------------------------------------- /data/captcha_samples/sample56.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample56.png -------------------------------------------------------------------------------- /data/captcha_samples/sample57.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample57.png -------------------------------------------------------------------------------- /data/captcha_samples/sample58.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample58.png -------------------------------------------------------------------------------- /data/captcha_samples/sample59.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample59.png -------------------------------------------------------------------------------- /data/captcha_samples/sample6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample6.png -------------------------------------------------------------------------------- /data/captcha_samples/sample60.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample60.png -------------------------------------------------------------------------------- /data/captcha_samples/sample61.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample61.png -------------------------------------------------------------------------------- /data/captcha_samples/sample62.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample62.png -------------------------------------------------------------------------------- /data/captcha_samples/sample63.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample63.png -------------------------------------------------------------------------------- /data/captcha_samples/sample64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample64.png -------------------------------------------------------------------------------- /data/captcha_samples/sample65.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample65.png -------------------------------------------------------------------------------- /data/captcha_samples/sample66.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample66.png -------------------------------------------------------------------------------- /data/captcha_samples/sample67.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample67.png -------------------------------------------------------------------------------- /data/captcha_samples/sample68.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample68.png -------------------------------------------------------------------------------- /data/captcha_samples/sample69.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample69.png 
-------------------------------------------------------------------------------- /data/captcha_samples/sample7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample7.png -------------------------------------------------------------------------------- /data/captcha_samples/sample70.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample70.png -------------------------------------------------------------------------------- /data/captcha_samples/sample71.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample71.png -------------------------------------------------------------------------------- /data/captcha_samples/sample72.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample72.png -------------------------------------------------------------------------------- /data/captcha_samples/sample73.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample73.png -------------------------------------------------------------------------------- /data/captcha_samples/sample74.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample74.png -------------------------------------------------------------------------------- /data/captcha_samples/sample75.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample75.png -------------------------------------------------------------------------------- /data/captcha_samples/sample76.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample76.png -------------------------------------------------------------------------------- /data/captcha_samples/sample77.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample77.png -------------------------------------------------------------------------------- /data/captcha_samples/sample78.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample78.png -------------------------------------------------------------------------------- /data/captcha_samples/sample79.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample79.png -------------------------------------------------------------------------------- /data/captcha_samples/sample8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample8.png -------------------------------------------------------------------------------- /data/captcha_samples/sample80.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample80.png -------------------------------------------------------------------------------- /data/captcha_samples/sample81.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample81.png -------------------------------------------------------------------------------- /data/captcha_samples/sample82.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample82.png -------------------------------------------------------------------------------- /data/captcha_samples/sample83.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample83.png -------------------------------------------------------------------------------- /data/captcha_samples/sample84.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample84.png -------------------------------------------------------------------------------- /data/captcha_samples/sample85.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample85.png -------------------------------------------------------------------------------- /data/captcha_samples/sample86.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample86.png -------------------------------------------------------------------------------- /data/captcha_samples/sample87.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample87.png -------------------------------------------------------------------------------- /data/captcha_samples/sample88.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample88.png -------------------------------------------------------------------------------- /data/captcha_samples/sample89.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample89.png -------------------------------------------------------------------------------- /data/captcha_samples/sample9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample9.png -------------------------------------------------------------------------------- /data/captcha_samples/sample90.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample90.png -------------------------------------------------------------------------------- /data/captcha_samples/sample91.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample91.png 
-------------------------------------------------------------------------------- /data/captcha_samples/sample92.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample92.png -------------------------------------------------------------------------------- /data/captcha_samples/sample93.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample93.png -------------------------------------------------------------------------------- /data/captcha_samples/sample94.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample94.png -------------------------------------------------------------------------------- /data/captcha_samples/sample95.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample95.png -------------------------------------------------------------------------------- /data/captcha_samples/sample96.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample96.png -------------------------------------------------------------------------------- /data/captcha_samples/sample97.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample97.png -------------------------------------------------------------------------------- /data/captcha_samples/sample98.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample98.png -------------------------------------------------------------------------------- /data/captcha_samples/sample99.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample99.png -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except these files 4 | !captcha_samples/ 5 | !captcha_samples/* 6 | !.gitignore 7 | -------------------------------------------------------------------------------- /code/chp8/example/example/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /code/chp5/lxml_attempt.py: -------------------------------------------------------------------------------- 1 | from lxml.html import fromstring 2 | from chp3.downloader import Downloader 3 | 4 | D = Downloader() 5 | html = D('http://example.webscraping.com/search') 6 | tree = fromstring(html) 7 | tree.cssselect('div#results a') 8 | -------------------------------------------------------------------------------- /code/chp2/lxml_brokenhtml.py: -------------------------------------------------------------------------------- 1 | from lxml.html import fromstring, tostring 2 | 3 | broken_html = '<ul class=country><li>Area<li>Population</ul>' 4 | 5 | tree = fromstring(broken_html) # parse the HTML 6 | fixed_html = tostring(tree, pretty_print=True) 7 | print(fixed_html) 8 | -------------------------------------------------------------------------------- /code/chp7/investigate_form.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from chp6.login import parse_form 3 | 4 | REGISTER_URL = 'http://example.webscraping.com/user/register' 5 | 6 | session = requests.Session() 7 | 8 | html = session.get(REGISTER_URL) 9 | form = parse_form(html.content) 10 | print(form) 11 | -------------------------------------------------------------------------------- /code/chp6/login_form_requests.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | LOGIN_URL = 'http://example.webscraping.com/user/login' 4 | LOGIN_EMAIL = 'example@webscraping.com' 5 | LOGIN_PASSWORD = 'example' 6 | data = {'email': LOGIN_EMAIL, 'password': LOGIN_PASSWORD} 7 | 8 | response = requests.post(LOGIN_URL, data) 9 | print(response.url) 10 | -------------------------------------------------------------------------------- /code/chp8/example/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = example.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = example 12 | -------------------------------------------------------------------------------- /code/chp8/example/example/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class CountryItem(scrapy.Item): 12 | name = scrapy.Field() 13 | population = scrapy.Field() 14 | -------------------------------------------------------------------------------- /code/chp2/xpath_scraper.py: -------------------------------------------------------------------------------- 1 | from lxml.html import fromstring 2 | from chp1.advanced_link_crawler import download 3 | 4 | url = 'http://example.webscraping.com/places/default/view/UnitedKingdom-239' 5 | html = download(url) 6 | 7 | tree = fromstring(html) 8 | area = tree.xpath('//tr[@id="places_area__row"]/td[@class="w2p_fw"]/text()')[0] 9 | print(area) 10 | -------------------------------------------------------------------------------- /code/chp2/lxml_scraper.py: -------------------------------------------------------------------------------- 1 | from lxml.html import fromstring 2 | from
chp1.advanced_link_crawler import download 3 | 4 | url = 'http://example.webscraping.com/places/default/view/UnitedKingdom-239' 5 | html = download(url) 6 | 7 | tree = fromstring(html) 8 | td = tree.cssselect('tr#places_area__row > td.w2p_fw')[0] 9 | area = td.text_content() 10 | print(area) 11 | -------------------------------------------------------------------------------- /code/chp8/example/example/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class ExamplePipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /code/chp1/downloading_a_page.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | from urllib.error import URLError, HTTPError, ContentTooShortError 3 | 4 | 5 | def download(url): 6 | print('Downloading:', url) 7 | try: 8 | html = urllib.request.urlopen(url).read() 9 | except (URLError, HTTPError, ContentTooShortError) as e: 10 | print('Download error:', e.reason) 11 | html = None 12 | return html 13 | -------------------------------------------------------------------------------- /code/chp5/pyqt_search_browser_render.py: -------------------------------------------------------------------------------- 1 | from chp5.browser_render import BrowserRender 2 | 3 | br = BrowserRender() 4 | br.download('http://example.webscraping.com/search') 5 | br.attr('#search_term', 'value', '.') 6 | br.text('#page_size option:checked', '1000') 7 | br.click('#search') 8 | elements = br.wait_load('#results a') 9 | 10 | countries = [e.toPlainText().strip() for e in elements] 11 | print(countries) 12 | -------------------------------------------------------------------------------- /code/chp9/gap_scraper_callback.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from lxml import etree 3 | 4 | def scrape_callback(url, html): 5 | if url.endswith('.xml'): 6 | # Parse the sitemap XML file 7 | resp = requests.get(url) 8 | tree = etree.fromstring(resp.content) 9 | links = [e[0].text for e in tree] 10 | return links 11 | else: 12 | # Add scraping code here 13 | pass 14 | -------------------------------------------------------------------------------- /code/chp2/beautifulsoup.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | from chp1.advanced_link_crawler import download 3 | 4 | url = 'http://example.webscraping.com/places/default/view/UnitedKingdom-239' 5 | html = download(url) 6 | soup = BeautifulSoup(html, 'html5lib') 7 | 8 | # locate the area row 9 | tr = soup.find(attrs={'id': 'places_area__row'}) 10 | td = tr.find(attrs={'class': 'w2p_fw'}) # locate the data 11 | area = td.text # extract the data 12 | print(area) 13 | -------------------------------------------------------------------------------- /code/chp6/submit_login_form.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlencode 2 | from urllib.request import Request, urlopen 3 | 4 | LOGIN_URL = 'http://example.webscraping.com/user/login' 5 | LOGIN_EMAIL = 'example@webscraping.com' 6 | LOGIN_PASSWORD = 'example' 7 | 
data = {'email': LOGIN_EMAIL, 'password': LOGIN_PASSWORD} 8 | encoded_data = urlencode(data) 9 | request = Request(LOGIN_URL, encoded_data.encode('utf-8')) 10 | response = urlopen(request) 11 | print(response.geturl()) 12 | -------------------------------------------------------------------------------- /code/chp2/family_trees.py: -------------------------------------------------------------------------------- 1 | from lxml.html import fromstring 2 | from chp1.advanced_link_crawler import download 3 | 4 | url = 'http://example.webscraping.com/places/default/view/UnitedKingdom-239' 5 | html = download(url) 6 | 7 | tree = fromstring(html) 8 | table = tree.xpath('//table')[0] 9 | 10 | print('Children:', table.getchildren()) 11 | print('Parent:', table.getparent()) 12 | print('Previous Sibling:', table.getprevious()) 13 | print('Next Sibling:', table.getnext()) 14 | print('All Siblings:', list(table.itersiblings())) 15 | -------------------------------------------------------------------------------- /code/chp4/extract_list.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from zipfile import ZipFile 3 | from io import TextIOWrapper, BytesIO 4 | import requests 5 | 6 | resp = requests.get('http://s3.amazonaws.com/alexa-static/top-1m.csv.zip', stream=True) 7 | urls = [] # top 1 million URL's will be stored in this list 8 | with ZipFile(BytesIO(resp.content)) as zf: 9 | csv_filename = zf.namelist()[0] 10 | with zf.open(csv_filename) as csv_file: 11 | for _, website in csv.reader(TextIOWrapper(csv_file)): 12 | urls.append('http://' + website) 13 | -------------------------------------------------------------------------------- /code/chp5/json_one_req.py: -------------------------------------------------------------------------------- 1 | from csv import DictWriter 2 | import requests 3 | 4 | 5 | PAGE_SIZE = 1000 6 | 7 | template_url = 'http://example.webscraping.com/ajax/' + \ 8 | 'search.json?page=0&page_size={}&search_term=.' 
9 | 10 | resp = requests.get(template_url.format(PAGE_SIZE)) 11 | data = resp.json() 12 | records = data.get('records') 13 | 14 | with open('../data/countries.csv', 'w') as countries_file: 15 | wrtr = DictWriter(countries_file, fieldnames=records[0].keys()) 16 | wrtr.writeheader() 17 | wrtr.writerows(records) 18 | -------------------------------------------------------------------------------- /code/chp5/selenium_search.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | 3 | driver = webdriver.Firefox() 4 | driver.get('http://example.webscraping.com/search') 5 | driver.find_element_by_id('search_term').send_keys('.') 6 | js = "document.getElementById('page_size').options[1].text = '1000';" 7 | driver.execute_script(js) 8 | driver.find_element_by_id('search').click() 9 | driver.implicitly_wait(30) 10 | links = driver.find_elements_by_css_selector('#results a') 11 | countries = [link.text for link in links] 12 | print(countries) 13 | 14 | driver.close() 15 | -------------------------------------------------------------------------------- /code/chp6/mechanize_form.py: -------------------------------------------------------------------------------- 1 | import mechanize 2 | 3 | LOGIN_URL = 'http://example.webscraping.com/user/login' 4 | LOGIN_EMAIL = 'example@webscraping.com' 5 | LOGIN_PASSWORD = 'example' 6 | COUNTRY_URL = 'http://example.webscraping.com/places/default/edit/United-Kingdom-239' 7 | 8 | 9 | br = mechanize.Browser() 10 | br.open(LOGIN_URL) 11 | br.select_form(nr=0) 12 | br['email'] = LOGIN_EMAIL 13 | br['password'] = LOGIN_PASSWORD 14 | response = br.submit() 15 | br.open(COUNTRY_URL) 16 | br.select_form(nr=0) 17 | br['population'] = str(int(br['population']) + 1) 18 | br.submit() 19 | -------------------------------------------------------------------------------- /code/chp2/regex.py: -------------------------------------------------------------------------------- 1 | import re 2 | from chp1.advanced_link_crawler import download 3 | 4 | url = 'http://example.webscraping.com/places/default/view/UnitedKingdom-239' 5 | html = download(url) 6 | 7 | print(re.findall(r'<td class="w2p_fw">(.*?)</td>', html)) 8 | 9 | print(re.findall('<td class="w2p_fw">(.*?)</td>', html)[1]) 10 | 11 | print(re.findall('<tr id="places_area__row"><td class="w2p_fw">(.*?)</td>', html)) 12 | 13 | print(re.findall('''<tr id="places_area__row">.*?<td\s*class=["\']w2p_fw["\']>(.*?)</td>''', html)) 14 | -------------------------------------------------------------------------------- /code/chp1/retrying_downloads.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | from urllib.error import URLError, HTTPError, ContentTooShortError 3 | 4 | 5 | def download(url, num_retries=2): 6 | print('Downloading:', url) 7 | try: 8 | html = urllib.request.urlopen(url).read() 9 | except (URLError, HTTPError, ContentTooShortError) as e: 10 | print('Download error:', e.reason) 11 | html = None 12 | if num_retries > 0: 13 | if hasattr(e, 'code') and 500 <= e.code < 600: 14 | # recursively retry 5xx HTTP errors 15 | return download(url, num_retries - 1) 16 | return html 17 | -------------------------------------------------------------------------------- /code/chp9/facebook_graph.py: -------------------------------------------------------------------------------- 1 | from facebook import GraphAPI 2 | from configparser import ConfigParser 3 | 4 | 5 | def get_page_details(access_token, page): 6 | graph = GraphAPI(access_token, version='2.7') 7 | return graph.get_object(page, fields='about,events,feed,picture') 8 | 9 | 10 | if __name__ == '__main__': 11 |
config = ConfigParser() 12 | # This script assumes you have the following config 13 | # set up with a section facebook and key access_token 14 | config.read('../../config/api.cfg') 15 | access_token = config.get('facebook', 'access_token') 16 | print(get_page_details(access_token, 'PacktPub')) 17 | -------------------------------------------------------------------------------- /code/chp5/pyqt_webkit.py: -------------------------------------------------------------------------------- 1 | import lxml.html 2 | try: 3 | from PySide.QtGui import * 4 | from PySide.QtCore import * 5 | from PySide.QtWebKit import * 6 | except ImportError: 7 | from PyQt4.QtGui import * 8 | from PyQt4.QtCore import * 9 | from PyQt4.QtWebKit import * 10 | 11 | url = 'http://example.webscraping.com/dynamic' 12 | app = QApplication([]) 13 | webview = QWebView() 14 | loop = QEventLoop() 15 | webview.loadFinished.connect(loop.quit) 16 | webview.load(QUrl(url)) 17 | loop.exec_() 18 | html = webview.page().mainFrame().toHtml() 19 | tree = lxml.html.fromstring(html) 20 | print(tree.cssselect('#result')[0].text_content()) 21 | -------------------------------------------------------------------------------- /code/chp2/beautifulsoup_brokenhtml.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | from chp1.advanced_link_crawler import download 3 | 4 | broken_html = '<ul class=country><li>Area<li>Population</ul>' 5 | 6 | soup = BeautifulSoup(broken_html, 'html.parser') 7 | fixed_html = soup.prettify() 8 | print(fixed_html) 9 | 10 | # still broken, so try a different parser 11 | 12 | soup = BeautifulSoup(broken_html, 'html5lib') 13 | fixed_html = soup.prettify() 14 | print(fixed_html) 15 | 16 | # now we can try and extract the data from the html 17 | 18 | ul = soup.find('ul', attrs={'class': 'country'}) 19 | print(ul.find('li')) # returns just the first match 20 | print(ul.find_all('li')) # returns all matches 21 | -------------------------------------------------------------------------------- /code/chp1/setting_user_agent.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | from urllib.error import URLError, HTTPError, ContentTooShortError 3 | 4 | 5 | def download(url, num_retries=2, user_agent='wswp'): 6 | print('Downloading:', url) 7 | request = urllib.request.Request(url) 8 | request.add_header('User-agent', user_agent) 9 | try: 10 | html = urllib.request.urlopen(request).read() 11 | except (URLError, HTTPError, ContentTooShortError) as e: 12 | print('Download error:', e.reason) 13 | html = None 14 | if num_retries > 0: 15 | if hasattr(e, 'code') and 500 <= e.code < 600: 16 | # recursively retry 5xx HTTP errors, keeping the same user agent 17 | return download(url, num_retries - 1, user_agent) 18 | return html 19 | -------------------------------------------------------------------------------- /code/chp9/scrape_google.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from urllib.parse import parse_qs, urlparse 3 | from lxml.html import fromstring 4 | 5 | # get results from search 6 | html = requests.get('https://www.google.com/search?q=test') 7 | tree = fromstring(html.content) 8 | results = tree.cssselect('h3.r a') 9 | print(results) 10 | 11 | # grab the first link 12 | link = results[0].get('href') 13 | print(link) 14 | 15 | # parse the destination url from the querystring 16 | qs = urlparse(link).query 17 | parsed_qs = parse_qs(qs) 18 | print(parsed_qs) 19 | print(parsed_qs.get('q', [])) 20 | 21 | 22 | # as one list
23 | links = [] 24 | for result in results: 25 | link = result.get('href') 26 | qs = urlparse(link).query 27 | links.extend(parse_qs(qs).get('q', [])) 28 | 29 | print(links) 30 | -------------------------------------------------------------------------------- /code/chp7/image_processing.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | from lxml.html import fromstring 3 | from PIL import Image 4 | import base64 5 | 6 | 7 | def get_b64_string(html): 8 | tree = fromstring(html) 9 | img_data = tree.cssselect('div#recaptcha img')[0].get('src') 10 | img_data = img_data.partition(',')[-1] 11 | return img_data 12 | 13 | 14 | def get_captcha_img(html): 15 | tree = fromstring(html) 16 | img_data = tree.cssselect('div#recaptcha img')[0].get('src') 17 | img_data = img_data.partition(',')[-1] 18 | binary_img_data = base64.b64decode(img_data) 19 | img = Image.open(BytesIO(binary_img_data)) 20 | return img 21 | 22 | 23 | def img_to_bw(img): 24 | gray = img.convert('L') 25 | bw = gray.point(lambda x: 0 if x < 1 else 255, '1') 26 | return bw 27 | -------------------------------------------------------------------------------- /code/chp9/facebook_selenium.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | 3 | 4 | def get_driver(): 5 | try: 6 | return webdriver.PhantomJS() 7 | except: 8 | return webdriver.Firefox() 9 | 10 | 11 | def facebook(username, password, url): 12 | driver = get_driver() 13 | driver.get('https://facebook.com') 14 | driver.find_element_by_id('email').send_keys(username) 15 | driver.find_element_by_id('pass').send_keys(password) 16 | driver.find_element_by_id('loginbutton').submit() 17 | driver.implicitly_wait(30) 18 | # wait until the search box is available, 19 | # which means it has successfully logged in 20 | search = driver.find_element_by_name('q') 21 | # now logged in so can go to the page of interest 22 | driver.get(url) 23 | # add code to scrape data of interest here ... 
24 | -------------------------------------------------------------------------------- /code/chp4/alexa_callback.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from zipfile import ZipFile 3 | from io import TextIOWrapper, BytesIO 4 | import requests 5 | 6 | 7 | class AlexaCallback: 8 | def __init__(self, max_urls=500): 9 | self.max_urls = max_urls 10 | self.seed_url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip' 11 | self.urls = [] 12 | 13 | def __call__(self): 14 | resp = requests.get(self.seed_url, stream=True) 15 | with ZipFile(BytesIO(resp.content)) as zf: 16 | csv_filename = zf.namelist()[0] 17 | with zf.open(csv_filename) as csv_file: 18 | for _, website in csv.reader(TextIOWrapper(csv_file)): 19 | self.urls.append('http://' + website) 20 | if len(self.urls) == self.max_urls: 21 | break 22 | -------------------------------------------------------------------------------- /code/chp9/bmw_scraper.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | import requests 4 | 5 | 6 | url = 'https://c2b-services.bmw.com/c2b-localsearch/services/api/v3/clients/BMWDIGITAL_DLO/DE/pois?country=DE&category=BM&maxResults=%d&language=en&lat=52.507537768880056&lng=13.425269635701511' 7 | jsonp = requests.get(url % 1000) 8 | pure_json = jsonp.text[jsonp.text.index('(') + 1: jsonp.text.rindex(')')] 9 | dealers = json.loads(pure_json) 10 | print(dealers.keys()) 11 | print(dealers['count']) 12 | print(dealers['data']['pois'][0]) 13 | 14 | with open('../../data/bmw.csv', 'w') as fp: 15 | writer = csv.writer(fp) 16 | writer.writerow(['Name', 'Latitude', 'Longitude']) 17 | for dealer in dealers['data']['pois']: 18 | name = dealer['name'] 19 | lat, lng = dealer['lat'], dealer['lng'] 20 | writer.writerow([name, lat, lng]) 21 | -------------------------------------------------------------------------------- /code/chp5/json_scraper.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import string 3 | 4 | PAGE_SIZE = 10 5 | 6 | template_url = 'http://example.webscraping.com/ajax/' + \ 7 | 'search.json?page={}&page_size={}&search_term={}' 8 | 9 | countries = set() 10 | 11 | for letter in string.ascii_lowercase: 12 | print('Searching with %s' % letter) 13 | page = 0 14 | while True: 15 | resp = requests.get(template_url.format(page, PAGE_SIZE, letter)) 16 | data = resp.json() 17 | print('adding %d more records from page %d' % 18 | (len(data.get('records')), page)) 19 | for record in data.get('records'): 20 | countries.add(record['country']) 21 | page += 1 22 | if page >= data['num_pages']: 23 | break 24 | 25 | with open('../data/countries.txt', 'w') as countries_file: 26 | countries_file.write('\n'.join(sorted(countries))) 27 | -------------------------------------------------------------------------------- /code/chp2/csv_callback.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import re 3 | from lxml.html import fromstring 4 | 5 | 6 | class CsvCallback: 7 | def __init__(self): 8 | self.writer = csv.writer(open('../data/countries.csv', 'w')) 9 | self.fields = ('area', 'population', 'iso', 'country', 'capital', 10 | 'continent', 'tld', 'currency_code', 'currency_name', 11 | 'phone', 'postal_code_format', 'postal_code_regex', 12 | 'languages', 'neighbours') 13 | self.writer.writerow(self.fields) 14 | 15 | def __call__(self, url, html): 16 | if re.search('/view/', url): 17 | tree 
= fromstring(html) 18 | all_rows = [ 19 | tree.xpath('//tr[@id="places_%s__row"]/td[@class="w2p_fw"]' % field)[0].text_content() 20 | for field in self.fields] 21 | self.writer.writerow(all_rows) 22 | -------------------------------------------------------------------------------- /code/chp1/throttle.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse 2 | import time 3 | 4 | 5 | class Throttle: 6 | """ Add a delay between downloads to the same domain 7 | """ 8 | def __init__(self, delay): 9 | # amount of delay between downloads for each domain 10 | self.delay = delay 11 | # timestamp of when a domain was last accessed 12 | self.domains = {} 13 | 14 | def wait(self, url): 15 | domain = urlparse(url).netloc 16 | last_accessed = self.domains.get(domain) 17 | 18 | if self.delay > 0 and last_accessed is not None: 19 | sleep_secs = self.delay - (time.time() - last_accessed) 20 | if sleep_secs > 0: 21 | # domain has been accessed recently 22 | # so need to sleep 23 | time.sleep(sleep_secs) 24 | # update the last accessed time 25 | self.domains[domain] = time.time() 26 | -------------------------------------------------------------------------------- /code/chp3/url_parsing.py: -------------------------------------------------------------------------------- 1 | import re 2 | from urllib.parse import urlsplit 3 | 4 | # how to manage converting urls into filenames 5 | 6 | url = 'http://example.webscraping.com/places/default/view/Australia-1' 7 | filename = re.sub('[^/0-9a-zA-Z\-.,;_ ]', '_', url) 8 | filename = '/'.join(segment[:255] for segment in filename.split('/')) 9 | print(filename) 10 | 11 | # how to handle edge case where we need to append index.html for parent urls 12 | # such as http://example.webscraping.com/index/ 13 | 14 | components = urlsplit('http://example.webscraping.com/index/') 15 | print(components) 16 | print(components.path) 17 | path = components.path 18 | if not path: 19 | path = '/index.html' 20 | elif path.endswith('/'): 21 | path += 'index.html' 22 | filename = components.netloc + path + components.query 23 | filename = re.sub('[^/0-9a-zA-Z\-.,;_ ]', '_', filename) 24 | filename = '/'.join(segment[:255] for segment in filename.split('/')) 25 | print(filename) 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | *~ 60 | */.*~ 61 | .*/ 62 | *.rdb 63 | config/ 64 | -------------------------------------------------------------------------------- /code/chp8/example/example/spiders/country.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.linkextractors import LinkExtractor 4 | from scrapy.spiders import CrawlSpider, Rule 5 | from example.items import CountryItem 6 | 7 | 8 | class CountrySpider(CrawlSpider): 9 | name = 'country' 10 | allowed_domains = ['example.webscraping.com'] 11 | start_urls = ['http://example.webscraping.com/'] 12 | 13 | rules = ( 14 | Rule(LinkExtractor(allow=r'/index/', deny=r'/user/'), 15 | follow=True), 16 | Rule(LinkExtractor(allow=r'/view/', deny=r'/user/'), 17 | callback='parse_item'), 18 | ) 19 | 20 | def parse_item(self, response): 21 | item = CountryItem() 22 | name_css = 'tr#places_country__row td.w2p_fw::text' 23 | item['name'] = response.css(name_css).extract() 24 | pop_xpath = '//tr[@id="places_population__row"]/td[@class="w2p_fw"]/text()' 25 | item['population'] = response.xpath(pop_xpath).extract() 26 | return item 27 | -------------------------------------------------------------------------------- /code/chp2/test_scrapers.py: -------------------------------------------------------------------------------- 1 | import time 2 | import re 3 | from chp2.all_scrapers import re_scraper, bs_scraper, \ 4 | lxml_scraper, lxml_xpath_scraper 5 | from chp1.advanced_link_crawler import download 6 | 7 | NUM_ITERATIONS = 1000 # number of times to test each scraper 8 | html = download('http://example.webscraping.com/places/view/United-Kingdom-239') 9 | 10 | scrapers = [ 11 | ('Regular expressions', re_scraper), 12 | ('BeautifulSoup', bs_scraper), 13 | ('Lxml', lxml_scraper), 14 | ('Xpath', lxml_xpath_scraper)] 15 | 16 | for name, scraper in scrapers: 17 | # record start time of scrape 18 | start = time.time() 19 | for i in range(NUM_ITERATIONS): 20 | if scraper == re_scraper: 21 | re.purge() 22 | result = scraper(html) 23 | # check scraped result is as expected 24 | assert result['area'] == '244,820 square kilometres' 25 | # record end time of scrape and output the total 26 | end = time.time() 27 | print('%s: %.2f seconds' % (name, end - start)) 28 | -------------------------------------------------------------------------------- /code/chp7/test_samples.py: -------------------------------------------------------------------------------- 1 | import os 2 | from PIL import Image 3 | from csv import reader 4 | from chp7.register import ocr 5 | from chp7.image_processing import img_to_bw 6 | 7 | SAMPLES_DIR = os.path.join( 8 | os.path.dirname(os.path.realpath(__file__)), 9 | '..', '..', 'data', 'captcha_samples') 10 | 11 | 12 | def get_rdr(samples_folder=SAMPLES_DIR): 13 | return reader(open(os.path.join(samples_folder, 'samples.csv'))) 14 | 15 | 16 | def test_samples(samples_folder=SAMPLES_DIR): 17 | rdr = get_rdr(samples_folder=samples_folder) 18 | results = {'correct': 0, 'incorrect': 0} 19 | for fname, txt in rdr: 20 | img = 
Image.open(os.path.join(samples_folder, fname)) 21 | captcha = ocr(img) 22 | if captcha == txt: 23 | results['correct'] += 1 24 | else: 25 | results['incorrect'] += 1 26 | print('accuracy: {}%'.format(results['correct'] / 100.0)) 27 | print('results: ', results) 28 | return results 29 | 30 | if __name__ == '__main__': 31 | test_samples() 32 | -------------------------------------------------------------------------------- /code/chp5/pyqt_search.py: -------------------------------------------------------------------------------- 1 | try: 2 | from PySide.QtGui import * 3 | from PySide.QtCore import * 4 | from PySide.QtWebKit import * 5 | except ImportError: 6 | from PyQt4.QtGui import * 7 | from PyQt4.QtCore import * 8 | from PyQt4.QtWebKit import * 9 | 10 | 11 | app = QApplication([]) 12 | webview = QWebView() 13 | loop = QEventLoop() 14 | webview.loadFinished.connect(loop.quit) 15 | webview.load(QUrl('http://example.webscraping.com/search')) 16 | loop.exec_() 17 | webview.show() 18 | frame = webview.page().mainFrame() 19 | frame.findFirstElement('#search_term').setAttribute('value', '.') 20 | frame.findFirstElement('#page_size option:checked').setPlainText('1000') 21 | frame.findFirstElement('#search').evaluateJavaScript('this.click()') 22 | # app.exec_() ## Uncomment and this will become a blocking event 23 | 24 | elements = None 25 | while not elements: 26 | app.processEvents() 27 | elements = frame.findAllElements('#results a') 28 | 29 | 30 | countries = [e.toPlainText().strip() for e in elements] 31 | print(countries) 32 | -------------------------------------------------------------------------------- /code/chp6/login.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from lxml.html import fromstring 3 | 4 | 5 | LOGIN_URL = 'http://example.webscraping.com/user/login' 6 | LOGIN_EMAIL = 'example@webscraping.com' 7 | LOGIN_PASSWORD = 'example' 8 | 9 | 10 | def parse_form(html): 11 | tree = fromstring(html) 12 | data = {} 13 | for e in tree.cssselect('form input'): 14 | if e.get('name'): 15 | data[e.get('name')] = e.get('value') 16 | return data 17 | 18 | 19 | def login(session=None): 20 | """ Login to example website. 
21 | params: 22 | session: request lib session object or None 23 | returns tuple(response, session) 24 | """ 25 | if session is None: 26 | html = requests.get(LOGIN_URL) 27 | else: 28 | html = session.get(LOGIN_URL) 29 | data = parse_form(html.content) 30 | data['email'] = LOGIN_EMAIL 31 | data['password'] = LOGIN_PASSWORD 32 | if session is None: 33 | response = requests.post(LOGIN_URL, data, cookies=html.cookies) 34 | else: 35 | response = session.post(LOGIN_URL, data) 36 | assert 'login' not in response.url 37 | return response, session 38 | -------------------------------------------------------------------------------- /code/chp1/sitemap_crawler.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | import re 3 | 4 | from urllib.error import URLError, HTTPError, ContentTooShortError 5 | 6 | 7 | def download(url, num_retries=2, user_agent='wswp', charset='utf-8'): 8 | print('Downloading:', url) 9 | request = urllib.request.Request(url) 10 | request.add_header('User-agent', user_agent) 11 | try: 12 | resp = urllib.request.urlopen(request) 13 | cs = resp.headers.get_content_charset() 14 | if not cs: 15 | cs = charset 16 | html = resp.read().decode(cs) 17 | except (URLError, HTTPError, ContentTooShortError) as e: 18 | print('Download error:', e.reason) 19 | html = None 20 | if num_retries > 0: 21 | if hasattr(e, 'code') and 500 <= e.code < 600: 22 | # recursively retry 5xx HTTP errors 23 | return download(url, num_retries - 1) 24 | return html 25 | 26 | 27 | def crawl_sitemap(url): 28 | # download the sitemap file 29 | sitemap = download(url) 30 | # extract the sitemap links 31 | links = re.findall('<loc>(.*?)</loc>', sitemap) 32 | # download each link 33 | for link in links: 34 | html = download(link) 35 | # scrape html here 36 | -------------------------------------------------------------------------------- /code/chp1/id_iteration_crawler.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import urllib.request 3 | from urllib.error import URLError, HTTPError, ContentTooShortError 4 | 5 | 6 | def download(url, num_retries=2, user_agent='wswp', charset='utf-8'): 7 | print('Downloading:', url) 8 | request = urllib.request.Request(url) 9 | request.add_header('User-agent', user_agent) 10 | try: 11 | resp = urllib.request.urlopen(request) 12 | cs = resp.headers.get_content_charset() 13 | if not cs: 14 | cs = charset 15 | html = resp.read().decode(cs) 16 | except (URLError, HTTPError, ContentTooShortError) as e: 17 | print('Download error:', e.reason) 18 | html = None 19 | if num_retries > 0: 20 | if hasattr(e, 'code') and 500 <= e.code < 600: 21 | # recursively retry 5xx HTTP errors 22 | return download(url, num_retries - 1) 23 | return html 24 | 25 | 26 | def crawl_site(url, max_errors=5): 27 | num_errors = 0 28 | for page in itertools.count(1): 29 | pg_url = '{}{}'.format(url, page) 30 | html = download(pg_url) 31 | if html is None: 32 | num_errors += 1 33 | if num_errors == max_errors: 34 | # reached max number of errors, so exit 35 | break 36 | else: 37 | num_errors = 0 38 | # success - can scrape the result 39 | -------------------------------------------------------------------------------- /code/chp7/register_with_ocr.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import string 3 | import pytesseract 4 | from lxml.html import fromstring 5 | from chp6.login import parse_form 6 | from chp7.image_processing import
get_captcha_img, img_to_bw 7 | 8 | REGISTER_URL = 'http://example.webscraping.com/user/register' 9 | 10 | 11 | def register(first_name, last_name, email, password): 12 | session = requests.Session() 13 | html = session.get(REGISTER_URL) 14 | form = parse_form(html.content) 15 | form['first_name'] = first_name 16 | form['last_name'] = last_name 17 | form['email'] = email 18 | form['password'] = form['password_two'] = password 19 | img = get_captcha_img(html.content) 20 | captcha = ocr(img) 21 | form['recaptcha_response_field'] = captcha 22 | resp = session.post(html.url, form) 23 | success = '/user/register' not in resp.url 24 | if not success: 25 | form_errors = fromstring(resp.content).cssselect('div.error') 26 | print('Form Errors:') 27 | print('\n'.join( 28 | (' {}: {}'.format(f.get('id'), f.text) for f in form_errors))) 29 | return success 30 | 31 | 32 | def ocr(img): 33 | bw = img_to_bw(img) 34 | captcha = pytesseract.image_to_string(bw) 35 | cleaned = ''.join(c for c in captcha.lower() if c in string.ascii_lowercase) 36 | if len(cleaned) != len(captcha): 37 | print('removed bad characters: {}'.format(set(captcha) - set(cleaned))) 38 | return cleaned 39 | -------------------------------------------------------------------------------- /code/chp7/register_with_api.py: -------------------------------------------------------------------------------- 1 | from configparser import ConfigParser 2 | import requests 3 | from lxml.html import fromstring 4 | from chp6.login import parse_form 5 | from chp7.image_processing import get_captcha_img 6 | from chp7.captcha_api import CaptchaAPI 7 | 8 | REGISTER_URL = 'http://example.webscraping.com/user/register' 9 | 10 | 11 | def get_api_key(): 12 | config = ConfigParser() 13 | config.read('../config/api.cfg') 14 | return config.get('captcha_api', 'key') 15 | 16 | 17 | def register(first_name, last_name, email, password): 18 | session = requests.Session() 19 | html = session.get(REGISTER_URL) 20 | form = parse_form(html.content) 21 | form['first_name'] = first_name 22 | form['last_name'] = last_name 23 | form['email'] = email 24 | form['password'] = form['password_two'] = password 25 | api_key = get_api_key() 26 | img = get_captcha_img(html.content) 27 | api = CaptchaAPI(api_key) 28 | captcha_id, captcha = api.solve(img) 29 | form['recaptcha_response_field'] = captcha 30 | resp = session.post(html.url, form) 31 | success = '/user/register' not in resp.url 32 | if success: 33 | api.report(captcha_id, 1) 34 | else: 35 | form_errors = fromstring(resp.content).cssselect('div.error') 36 | print('Form Errors:') 37 | print('\n'.join( 38 | (' {}: {}'.format(f.get('id'), f.text) for f in form_errors))) 39 | if 'invalid' in [f.text for f in form_errors]: 40 | api.report(captcha_id, 0) 41 | return success 42 | -------------------------------------------------------------------------------- /code/chp6/firefox_sessions.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import glob 4 | import requests 5 | 6 | from lxml.html import fromstring 7 | 8 | 9 | def find_ff_sessions(): 10 | paths = [ 11 | '~/.mozilla/firefox/*.default', 12 | '~/Library/Application Support/Firefox/Profiles/*.default', 13 | '%APPDATA%/Roaming/Mozilla/Firefox/Profiles/*.default' 14 | ] 15 | for path in paths: 16 | filename = os.path.join(path, 'sessionstore.js') 17 | matches = glob.glob(os.path.expanduser(filename)) 18 | if matches: 19 | return matches[0] 20 | 21 | 22 | def load_ff_sessions(session_filename): 23 | cookies = {} 
24 | if os.path.exists(session_filename): 25 | json_data = json.loads(open(session_filename, 'rb').read()) 26 | for window in json_data.get('windows', []): 27 | for cookie in window.get('cookies', []): 28 | cookies[cookie.get('name')] = cookie.get('value') 29 | else: 30 | print('Session filename does not exist:', session_filename) 31 | return cookies 32 | 33 | 34 | def session_login(): 35 | session_filename = find_ff_sessions() 36 | assert session_filename is not None 37 | cookies = load_ff_sessions(session_filename) 38 | print('found cookies: ', cookies) 39 | url = 'http://example.webscraping.com' 40 | html = requests.get(url, cookies=cookies) 41 | tree = fromstring(html.content) 42 | print(tree.cssselect('ul#navbar li a')[0].text_content()) 43 | return html 44 | 45 | 46 | if __name__ == '__main__': 47 | session_login() 48 | -------------------------------------------------------------------------------- /code/chp2/all_scrapers.py: -------------------------------------------------------------------------------- 1 | import re 2 | from bs4 import BeautifulSoup 3 | from lxml.html import fromstring 4 | 5 | FIELDS = ('area', 'population', 'iso', 'country', 'capital', 6 | 'continent', 'tld', 'currency_code', 'currency_name', 7 | 'phone', 'postal_code_format', 'postal_code_regex', 8 | 'languages', 'neighbours') 9 | 10 | 11 | def re_scraper(html): 12 | """ Using regex to extract data from country pages. """ 13 | results = {} 14 | for field in FIELDS: 15 | results[field] = re.search( 16 | '<tr id="places_%s__row">.*?<td class="w2p_fw">(.*?)</td>' 17 | % field, html).groups()[0] 18 | return results 19 | 20 | 21 | def bs_scraper(html): 22 | """ Using beautifulsoup to extract data from country pages. """ 23 | soup = BeautifulSoup(html, 'html.parser') 24 | results = {} 25 | for field in FIELDS: 26 | results[field] = soup.find('table').find( 27 | 'tr', id='places_%s__row' % field).find( 28 | 'td', class_='w2p_fw').text 29 | return results 30 | 31 | 32 | def lxml_scraper(html): 33 | """ Using lxml and cssselect to extract data from country pages. """ 34 | tree = fromstring(html) 35 | results = {} 36 | for field in FIELDS: 37 | results[field] = tree.cssselect( 38 | 'table > tr#places_%s__row > td.w2p_fw' % field)[0].text_content() 39 | return results 40 | 41 | 42 | def lxml_xpath_scraper(html): 43 | """ Using lxml and xpath to extract data from country pages. """ 44 | tree = fromstring(html) 45 | results = {} 46 | for field in FIELDS: 47 | results[field] = tree.xpath( 48 | '//tr[@id="places_%s__row"]/td[@class="w2p_fw"]' % field)[0].text_content() 49 | return results 50 | -------------------------------------------------------------------------------- /code/chp3/rediscache.py: -------------------------------------------------------------------------------- 1 | import json 2 | import zlib 3 | from datetime import datetime, timedelta 4 | from redis import StrictRedis 5 | 6 | 7 | class RedisCache: 8 | """ RedisCache helps store urls and their responses to Redis 9 | Initialization components: 10 | client: a Redis client connected to the key-value database for 11 | the webcrawling cache (if not set, a localhost:6379 12 | default connection is used).
13 | expires (datetime.timedelta): timedelta when content will expire 14 | (default: 30 days ago) 15 | encoding (str): character encoding for serialization 16 | compress (bool): boolean indicating whether compression with zlib should be used 17 | """ 18 | def __init__(self, client=None, expires=timedelta(days=30), encoding='utf-8', compress=True): 19 | self.client = (StrictRedis(host='localhost', port=6379, db=0) 20 | if client is None else client) 21 | self.expires = expires 22 | self.encoding = encoding 23 | self.compress = compress 24 | 25 | def __getitem__(self, url): 26 | """Load data from Redis for given URL""" 27 | record = self.client.get(url) 28 | if record: 29 | if self.compress: 30 | record = zlib.decompress(record) 31 | return json.loads(record.decode(self.encoding)) 32 | else: 33 | # URL has not yet been cached 34 | raise KeyError(url + ' does not exist') 35 | 36 | def __setitem__(self, url, result): 37 | """Save data to Redis for given url""" 38 | data = bytes(json.dumps(result), self.encoding) 39 | if self.compress: 40 | data = zlib.compress(data) 41 | self.client.setex(url, self.expires, data) 42 | -------------------------------------------------------------------------------- /code/chp6/selenium_forms.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.common.keys import Keys 3 | from selenium.webdriver.common.by import By 4 | from selenium.webdriver.support.ui import WebDriverWait 5 | from selenium.webdriver.support import expected_conditions as EC 6 | 7 | 8 | LOGIN_URL = 'http://example.webscraping.com/user/login' 9 | LOGIN_EMAIL = 'example@webscraping.com' 10 | LOGIN_PASSWORD = 'example' 11 | COUNTRY_URL = 'http://example.webscraping.com/places/default/edit/United-Kingdom-239' 12 | 13 | 14 | def get_driver(): 15 | try: 16 | return webdriver.PhantomJS() 17 | except Exception: 18 | return webdriver.Firefox() 19 | 20 | 21 | def login(driver): 22 | driver.get(LOGIN_URL) 23 | driver.find_element_by_id('auth_user_email').send_keys(LOGIN_EMAIL) 24 | driver.find_element_by_id('auth_user_password').send_keys( 25 | LOGIN_PASSWORD + Keys.RETURN) 26 | pg_loaded = WebDriverWait(driver, 10).until( 27 | EC.presence_of_element_located((By.ID, "results"))) 28 | assert 'login' not in driver.current_url 29 | 30 | 31 | def add_population(driver): 32 | driver.get(COUNTRY_URL) 33 | population = driver.find_element_by_id('places_population') 34 | new_population = int(population.get_attribute('value')) + 1 35 | population.clear() 36 | population.send_keys(new_population) 37 | driver.find_element_by_xpath('//input[@type="submit"]').click() 38 | pg_loaded = WebDriverWait(driver, 10).until( 39 | EC.presence_of_element_located((By.ID, "places_population__row"))) 40 | test_population = int(driver.find_element_by_css_selector( 41 | '#places_population__row .w2p_fw').text.replace(',', '')) 42 | assert test_population == new_population 43 | 44 | 45 | if __name__ == '__main__': 46 | driver = get_driver() 47 | login(driver) 48 | add_population(driver) 49 | driver.quit() 50 | -------------------------------------------------------------------------------- /code/chp6/edit.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | from lxml.html import fromstring 4 | from chp6.login import login, parse_form 5 | 6 | COUNTRY_URL = 'http://example.webscraping.com/places/default/edit/United-Kingdom-239' 7 | VIEW_URL = 
'http://example.webscraping.com/places/default/view/United-Kingdom-239' 8 | 9 | 10 | def get_population(): 11 | html = requests.get(VIEW_URL) 12 | tree = fromstring(html.content) 13 | population = tree.cssselect( 14 | '#places_population__row .w2p_fw')[0].text_content() 15 | return int(population.replace(',', '')) 16 | 17 | 18 | def add_population(): 19 | session = requests.Session() 20 | response, session = login(session=session) 21 | country_html = session.get(COUNTRY_URL) 22 | data = parse_form(country_html.content) 23 | print('population is: ', data['population']) 24 | data['population'] = int(data['population']) + 1 25 | response = session.post(COUNTRY_URL, data=data) 26 | test_population = get_population() 27 | print('population is now:', test_population) 28 | assert test_population == data['population'] 29 | 30 | 31 | def get_currency(): 32 | html = requests.get(VIEW_URL) 33 | tree = fromstring(html.content) 34 | currency = tree.cssselect( 35 | '#places_currency_name__row .w2p_fw')[0].text_content() 36 | return currency 37 | 38 | 39 | def change_currency(): 40 | session = requests.Session() 41 | response, session = login(session=session) 42 | country_html = session.get(COUNTRY_URL) 43 | data = parse_form(country_html.content) 44 | print('currency is: ', data['currency_name']) 45 | data['currency_name'] = 'British pounds' 46 | response = session.post(COUNTRY_URL, data=data) 47 | test_currency = get_currency() 48 | print('currency is now: ', test_currency) 49 | assert test_currency == data['currency_name'] 50 | 51 | 52 | if __name__ == '__main__': 53 | add_population() 54 | -------------------------------------------------------------------------------- /code/chp1/link_crawler.py: -------------------------------------------------------------------------------- 1 | import re 2 | import urllib.request 3 | from urllib.parse import urljoin 4 | from urllib.error import URLError, HTTPError, ContentTooShortError 5 | 6 | 7 | def download(url, num_retries=2, user_agent='wswp', charset='utf-8'): 8 | print('Downloading:', url) 9 | request = urllib.request.Request(url) 10 | request.add_header('User-agent', user_agent) 11 | try: 12 | resp = urllib.request.urlopen(request) 13 | cs = resp.headers.get_content_charset() 14 | if not cs: 15 | cs = charset 16 | html = resp.read().decode(cs) 17 | except (URLError, HTTPError, ContentTooShortError) as e: 18 | print('Download error:', e.reason) 19 | html = None 20 | if num_retries > 0: 21 | if hasattr(e, 'code') and 500 <= e.code < 600: 22 | # recursively retry 5xx HTTP errors 23 | return download(url, num_retries - 1) 24 | return html 25 | 26 | 27 | def link_crawler(start_url, link_regex): 28 | " Crawl from the given start URL following links matched by link_regex " 29 | crawl_queue = [start_url] 30 | # keep track which URL's have seen before 31 | seen = set(crawl_queue) 32 | while crawl_queue: 33 | url = crawl_queue.pop() 34 | html = download(url) 35 | if not html: 36 | continue 37 | # filter for links matching our regular expression 38 | for link in get_links(html): 39 | if re.match(link_regex, link): 40 | abs_link = urljoin(start_url, link) 41 | if abs_link not in seen: 42 | seen.add(abs_link) 43 | crawl_queue.append(abs_link) 44 | 45 | 46 | def get_links(html): 47 | " Return a list of links from html " 48 | # a regular expression to extract all links from the webpage 49 | webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE) 50 | # list of all links from the webpage 51 | return webpage_regex.findall(html) 52 |
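A minimal usage sketch for the `link_crawler` above, assuming it is run from the `code` folder so that `chp1` is importable (see the README note on module imports later in this repo). The start URL and link pattern are illustrative assumptions about the example site's layout, not values taken from these scripts; because `re.match` anchors at the start of each extracted `href`, the pattern allows an arbitrary path prefix before `index` or `view`:

```
# Illustrative only: crawl the demo site, following index and view pages.
# The URL and regex below are assumptions for this example, not values
# defined anywhere in the scripts above.
from chp1.link_crawler import link_crawler

link_crawler('http://example.webscraping.com', '.*/(index|view)/')
```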
-------------------------------------------------------------------------------- /code/chp8/example/example/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ExampleSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /code/chp4/redis_queue.py: -------------------------------------------------------------------------------- 1 | # Based loosely on the Redis Cookbook FIFO Queue: http://www.rediscookbook.org/implement_a_fifo_queue.html 2 | from redis import StrictRedis 3 | 4 | 5 | class RedisQueue: 6 | """ RedisQueue helps store urls to crawl to Redis 7 | Initialization components: 8 | client: a Redis client connected to the key-value database for 9 | the webcrawling cache (if not set, a localhost:6379 10 | default connection is used). 
11 | db (int): which database to use for Redis 12 | queue_name (str): name for queue (default: wswp) 13 | """ 14 | 15 | def __init__(self, client=None, db=0, queue_name='wswp'): 16 | self.client = (StrictRedis(host='localhost', port=6379, db=db) 17 | if client is None else client) 18 | self.name = "queue:%s" % queue_name 19 | self.seen_set = "seen:%s" % queue_name 20 | self.depth = "depth:%s" % queue_name 21 | 22 | def __len__(self): 23 | return self.client.llen(self.name) 24 | 25 | def push(self, element): 26 | """Push an element to the tail of the queue""" 27 | if isinstance(element, list): 28 | element = [e for e in element if not self.already_seen(e)] 29 | self.client.lpush(self.name, *element) 30 | self.client.sadd(self.seen_set, *element) 31 | elif not self.already_seen(element): 32 | self.client.lpush(self.name, element) 33 | self.client.sadd(self.seen_set, element) 34 | 35 | def already_seen(self, element): 36 | """ determine if an element has already been seen """ 37 | return self.client.sismember(self.seen_set, element) 38 | 39 | def set_depth(self, element, depth): 40 | """ Set the seen hash and depth """ 41 | self.client.hset(self.depth, element, depth) 42 | 43 | def get_depth(self, element): 44 | """ Get the seen hash and depth """ 45 | return (lambda dep: int(dep) if dep else 0)(self.client.hget(self.depth, element)) 46 | 47 | def pop(self): 48 | """Pop an element from the head of the queue""" 49 | return self.client.rpop(self.name).decode('utf-8') 50 | -------------------------------------------------------------------------------- /code/chp7/using_captcha_api.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import base64 3 | from configparser import ConfigParser 4 | from time import sleep 5 | from lxml.html import fromstring 6 | from chp6.login import parse_form 7 | from chp7.image_processing import get_captcha_img, get_b64_string 8 | 9 | API_URL = 'https://www.9kw.eu/index.cgi' 10 | REGISTER_URL = 'http://example.webscraping.com/user/register' 11 | 12 | 13 | def get_api_key(): 14 | config = ConfigParser() 15 | config.read('../config/api.cfg') 16 | return config.get('captcha_api', 'key') 17 | 18 | 19 | def send_captcha(api_key, img_data): 20 | data = { 21 | 'action': 'usercaptchaupload', 22 | 'apikey': api_key, 23 | 'file-upload-01': img_data, 24 | 'base64': '1', 25 | 'selfsolve': '1', 26 | 'json': '1', 27 | 'maxtimeout': '300' 28 | } 29 | resp = requests.post(API_URL, data) 30 | return resp.json() 31 | 32 | 33 | def get_captcha_text(api_key, captcha_id): 34 | data = { 35 | 'action': 'usercaptchacorrectdata', 36 | 'id': captcha_id, 37 | 'apikey': api_key, 38 | 'json': '1', 39 | } 40 | resp = requests.get(API_URL, data) 41 | print('captcha text response:', resp.json()) 42 | answer = resp.json().get('answer') 43 | return answer 44 | 45 | 46 | def register(first_name, last_name, email, password): 47 | session = requests.Session() 48 | html = session.get(REGISTER_URL) 49 | form = parse_form(html.content) 50 | form['first_name'] = first_name 51 | form['last_name'] = last_name 52 | form['email'] = email 53 | form['password'] = form['password_two'] = password 54 | img_data = get_b64_string(html.content) 55 | img = get_captcha_img(html.content) 56 | img.show() # This will show the image locally when run 57 | api_key = get_api_key() 58 | captcha_id = send_captcha(api_key, img_data) 59 | print('submitted captcha, got id:', captcha_id) 60 | sleep(300) 61 | captcha = get_captcha_text(api_key, captcha_id) 62 | print('captcha 
solve:', captcha) 63 | form['recaptcha_response_field'] = captcha 64 | resp = session.post(html.url, form) 65 | success = '/user/register' not in resp.url 66 | if not success: 67 | form_errors = fromstring(resp.content).cssselect('div.error') 68 | print('Form Errors:') 69 | print('\n'.join( 70 | (' {}: {}'.format(f.get('id'), f.text) for f in form_errors))) 71 | return success 72 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Web Scraping with Python 2 | 3 | Welcome to the code repository for [Web Scraping with Python, Second Edition](https://www.packtpub.com/big-data-and-business-intelligence/python-web-scraping-second-edition)! I hope you find the code and data here useful. If you have any questions reach out to @kjam on Twitter or GitHub. 4 | 5 | ### Code Structure 6 | 7 | All of the code samples are in folders separated by chapter. Scripts are intended to be run from the `code` folder, allowing you to easily import from the chapters. 8 | 9 | ### Code Examples 10 | 11 | I have not included every code sample you've found in the book, but I have included a majority of the finished scripts. Although these are included, I encourage you to write out each code sample on your own and use these only as a reference. 12 | 13 | ### Firefox Issues 14 | 15 | Depending on your version of Firefox and Selenium, you may run into JavaScript errors. Here are some fixes: 16 | * Use an older version of Firefox 17 | * Upgrade Selenium to >=3.0.2 and download the [geckodriver](https://github.com/mozilla/geckodriver/releases). Make sure the geckodriver is findable by your PATH variable. You can do this by adding this line to your `.bashrc` or `.bash_profile`. (Wondering what these are? Please read the Appendix C on learning the command line). 18 | * Use [PhantomJS](http://phantomjs.org/) with Selenium (change your browser line to `webdriver.PhantomJS('path/to/your/phantomjs/installation')`) 19 | * Use Chrome, InternetExplorer or any other [supported browser](http://www.seleniumhq.org/about/platforms.jsp) 20 | 21 | Feel free to reach out if you have any questions! 22 | 23 | ### Issues with Module Import 24 | 25 | Seeing chp1 ModuleNotFound errors? Try adding this snippet to the file: 26 | 27 | ``` 28 | import os 29 | import sys 30 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))) 31 | ``` 32 | 33 | What this does is append the main module to your system path, which is where Python looks for imports. On some installations, I have noticed the current directory is not immediately added (common practice), so this code *explicitly* adds that directory to your path. 34 | 35 | 36 | ### Corrections? 37 | 38 | If you find any issues in these code examples, feel free to submit an Issue or Pull Request. I appreciate your input! 39 | 40 | 41 | ### First edition repository 42 | 43 | If you are looking for the first edition's repository, you can find it here: [Web Scraping with Python, First Edition](https://bitbucket.org/wswp/) 44 | 45 | ### Questions? 46 | 47 | Reach out to @kjam on Twitter or GitHub. @kjam is also often on freenode. 
:) 48 | -------------------------------------------------------------------------------- /data/captcha_samples/samples.csv: -------------------------------------------------------------------------------- 1 | sample1.png,watch 2 | sample2.png,clean 3 | sample3.png,forward 4 | sample4.png,secret 5 | sample5.png,square 6 | sample6.png,sweet 7 | sample7.png,flight 8 | sample8.png,number 9 | sample9.png,parcel 10 | sample10.png,linen 11 | sample11.png,attack 12 | sample12.png,comfort 13 | sample13.png,healthy 14 | sample14.png,woman 15 | sample15.png,between 16 | sample16.png,fruit 17 | sample17.png,office 18 | sample18.png,electric 19 | sample19.png,light 20 | sample20.png,reward 21 | sample21.png,powder 22 | sample22.png,damage 23 | sample23.png,thick 24 | sample24.png,tomorrow 25 | sample25.png,white 26 | sample26.png,together 27 | sample27.png,trick 28 | sample28.png,sister 29 | sample29.png,tongue 30 | sample30.png,because 31 | sample31.png,again 32 | sample32.png,tooth 33 | sample33.png,almost 34 | sample34.png,board 35 | sample35.png,stitch 36 | sample36.png,spoon 37 | sample37.png,paste 38 | sample38.png,memory 39 | sample39.png,guide 40 | sample40.png,electric 41 | sample41.png,regret 42 | sample42.png,harbor 43 | sample43.png,prose 44 | sample44.png,circle 45 | sample45.png,flight 46 | sample46.png,motion 47 | sample47.png,cause 48 | sample48.png,front 49 | sample49.png,question 50 | sample50.png,drawer 51 | sample51.png,present 52 | sample52.png,elastic 53 | sample53.png,laugh 54 | sample54.png,rhythm 55 | sample55.png,angle 56 | sample56.png,porter 57 | sample57.png,purpose 58 | sample58.png,event 59 | sample59.png,effect 60 | sample60.png,history 61 | sample61.png,tired 62 | sample62.png,animal 63 | sample63.png,steam 64 | sample64.png,normal 65 | sample65.png,scissors 66 | sample66.png,while 67 | sample67.png,print 68 | sample68.png,behavior 69 | sample69.png,impulse 70 | sample70.png,quiet 71 | sample71.png,level 72 | sample72.png,basin 73 | sample73.png,every 74 | sample74.png,peace 75 | sample75.png,right 76 | sample76.png,month 77 | sample77.png,science 78 | sample78.png,river 79 | sample79.png,frame 80 | sample80.png,stocking 81 | sample81.png,pencil 82 | sample82.png,table 83 | sample83.png,common 84 | sample84.png,store 85 | sample85.png,ornament 86 | sample86.png,belief 87 | sample87.png,across 88 | sample88.png,history 89 | sample89.png,harmony 90 | sample90.png,young 91 | sample91.png,summer 92 | sample92.png,yellow 93 | sample93.png,medical 94 | sample94.png,current 95 | sample95.png,amount 96 | sample96.png,skirt 97 | sample97.png,serious 98 | sample98.png,paper 99 | sample99.png,round 100 | sample100.png,stamp 101 | -------------------------------------------------------------------------------- /code/chp5/browser_render.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import re 4 | import csv 5 | import time 6 | try: 7 | from PySide.QtGui import QApplication 8 | from PySide.QtCore import QUrl, QEventLoop, QTimer 9 | from PySide.QtWebKit import QWebView 10 | except ImportError: 11 | from PyQt4.QtGui import QApplication 12 | from PyQt4.QtCore import QUrl, QEventLoop, QTimer 13 | from PyQt4.QtWebKit import QWebView 14 | import lxml.html 15 | 16 | 17 | class BrowserRender(QWebView): 18 | def __init__(self, display=True): 19 | self.app = QApplication([]) 20 | QWebView.__init__(self) 21 | if display: 22 | self.show() # show the browser 23 | 24 | def open(self, url, timeout=60): 25 | """Wait for 
download to complete and return result""" 26 | loop = QEventLoop() 27 | timer = QTimer() 28 | timer.setSingleShot(True) 29 | timer.timeout.connect(loop.quit) 30 | self.loadFinished.connect(loop.quit) 31 | self.load(QUrl(url)) 32 | timer.start(timeout * 1000) 33 | loop.exec_() # delay here until download finished 34 | if timer.isActive(): 35 | # downloaded successfully 36 | timer.stop() 37 | return self.html() 38 | else: 39 | # timed out 40 | print('Request timed out:', url) 41 | 42 | def html(self): 43 | """Shortcut to return the current HTML""" 44 | return self.page().mainFrame().toHtml() 45 | 46 | def find(self, pattern): 47 | """Find all elements that match the pattern""" 48 | return self.page().mainFrame().findAllElements(pattern) 49 | 50 | def attr(self, pattern, name, value): 51 | """Set attribute for matching elements""" 52 | for e in self.find(pattern): 53 | e.setAttribute(name, value) 54 | 55 | def text(self, pattern, value): 56 | """Set text for matching elements""" 57 | for e in self.find(pattern): 58 | e.setPlainText(value) 59 | 60 | def click(self, pattern): 61 | """Click matching elements""" 62 | for e in self.find(pattern): 63 | e.evaluateJavaScript("this.click()") 64 | 65 | def wait_load(self, pattern, timeout=60): 66 | """Wait for this pattern to be found in webpage and return matches""" 67 | deadline = time.time() + timeout 68 | while time.time() < deadline: 69 | self.app.processEvents() 70 | matches = self.find(pattern) 71 | if matches: 72 | return matches 73 | print('Wait load timed out') 74 | 75 | 76 | def main(): 77 | br = BrowserRender() 78 | br.open('http://example.webscraping.com/search') 79 | br.attr('#search_term', 'value', '.') 80 | br.text('#page_size option:checked', '1000') 81 | br.click('#search') 82 | 83 | elements = br.wait_load('#results a') 84 | writer = csv.writer(open('countries.csv', 'w')) 85 | for country in [e.toPlainText().strip() for e in elements]: 86 | writer.writerow([country]) 87 | 88 | 89 | if __name__ == '__main__': 90 | main() 91 | -------------------------------------------------------------------------------- /code/chp7/captcha_api.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import re 3 | import time 4 | import requests 5 | from io import BytesIO 6 | 7 | 8 | class CaptchaAPI: 9 | def __init__(self, api_key, timeout=120): 10 | self.api_key = api_key 11 | self.timeout = timeout 12 | self.url = 'https://www.9kw.eu/index.cgi' 13 | 14 | def solve(self, img): 15 | """Submit CAPTCHA and return result when ready 16 | """ 17 | img_buffer = BytesIO() 18 | img.save(img_buffer, format="PNG") 19 | img_data = img_buffer.getvalue() 20 | captcha_id = self.send(img_data) 21 | start_time = time.time() 22 | while time.time() < start_time + self.timeout: 23 | try: 24 | resp = self.get(captcha_id) 25 | except CaptchaError: 26 | pass # CAPTCHA still not ready 27 | else: 28 | if resp.get('answer') != 'NO DATA': 29 | if resp.get('answer') == 'ERROR NO USER': 30 | raise CaptchaError( 31 | 'Error: no user available to solve CAPTCHA') 32 | else: 33 | print('CAPTCHA solved!') 34 | return captcha_id, resp.get('answer') 35 | print('Waiting for CAPTCHA ...') 36 | time.sleep(1) 37 | 38 | raise CaptchaError('Error: API timeout') 39 | 40 | def send(self, img_data): 41 | """Send CAPTCHA for solving """ 42 | print('Submitting CAPTCHA') 43 | data = { 44 | 'action': 'usercaptchaupload', 45 | 'apikey': self.api_key, 46 | 'file-upload-01': base64.b64encode(img_data), 47 | 'base64': '1', 48 | 'selfsolve': '1', 49
| 'json': '1', 50 | 'maxtimeout': str(self.timeout) 51 | } 52 | result = requests.post(self.url, data) 53 | self.check(result.text) 54 | return result.json() 55 | 56 | def get(self, captcha_id): 57 | """Get result of solved CAPTCHA""" 58 | data = { 59 | 'action': 'usercaptchacorrectdata', 60 | 'id': captcha_id, 61 | 'apikey': self.api_key, 62 | 'info': '1', 63 | 'json': '1', 64 | } 65 | result = requests.get(self.url, data) 66 | self.check(result.text) 67 | return result.json() 68 | 69 | def check(self, result): 70 | """Check result of API and raise error if error code""" 71 | if re.match(r'00\d\d \w+', result): 72 | raise CaptchaError('API error: ' + result) 73 | 74 | def report(self, captcha_id, correct): 75 | """ Report back whether captcha was correct or not""" 76 | data = { 77 | 'action': 'usercaptchacorrectback', 78 | 'id': captcha_id, 79 | 'apikey': self.api_key, 80 | 'correct': (lambda c: 1 if c else 2)(correct), 81 | 'json': '1', 82 | } 83 | resp = requests.get(self.url, data) 84 | return resp.json() 85 | 86 | 87 | class CaptchaError(Exception): 88 | pass 89 | -------------------------------------------------------------------------------- /code/chp3/requests_cache_link_crawler.py: -------------------------------------------------------------------------------- 1 | import re 2 | from urllib import robotparser 3 | from urllib.parse import urljoin 4 | from datetime import timedelta 5 | from chp3.downloader_requests_cache import Downloader 6 | 7 | import requests_cache 8 | 9 | 10 | def get_robots_parser(robots_url): 11 | " Return the robots parser object using the robots_url " 12 | rp = robotparser.RobotFileParser() 13 | rp.set_url(robots_url) 14 | rp.read() 15 | return rp 16 | 17 | 18 | def get_links(html): 19 | " Return a list of links (using simple regex matching) from the html content " 20 | # a regular expression to extract all links from the webpage 21 | webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE) 22 | # list of all links from the webpage 23 | return webpage_regex.findall(html) 24 | 25 | 26 | def link_crawler(start_url, link_regex, robots_url=None, user_agent='wswp', 27 | proxies=None, delay=3, max_depth=4, num_retries=2, expires=timedelta(days=30)): 28 | """ Crawl from the given start URL following links matched by link_regex. In the current 29 | implementation, we do not actually scrape any information.
30 | 31 | args: 32 | start_url (str): web site to start crawl 33 | link_regex (str): regex to match for links 34 | kwargs: 35 | robots_url (str): url of the site's robots.txt (default: start_url + /robots.txt) 36 | user_agent (str): user agent (default: wswp) 37 | proxies (list of dicts): a list of possible dicts for http / https proxies 38 | For formatting, see the requests library 39 | delay (int): seconds to throttle between requests to one domain (default: 3) 40 | max_depth (int): maximum crawl depth (to avoid traps) (default: 4) 41 | num_retries (int): # of retries when 5xx error (default: 2) 42 | expires (timedelta): timedelta for cache expirations (default: 30 days) 43 | """ 44 | crawl_queue = [start_url] 45 | # keep track which URL's have seen before 46 | seen = {} 47 | requests_cache.install_cache(backend='redis', expire_after=expires) 48 | if not robots_url: 49 | robots_url = '{}/robots.txt'.format(start_url) 50 | rp = get_robots_parser(robots_url) 51 | D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies) 52 | while crawl_queue: 53 | url = crawl_queue.pop() 54 | # check url passes robots.txt restrictions 55 | if rp.can_fetch(user_agent, url): 56 | depth = seen.get(url, 0) 57 | if depth == max_depth: 58 | print('Skipping %s due to depth' % url) 59 | continue 60 | html = D(url, num_retries=num_retries) 61 | if not html: 62 | continue 63 | # TODO: add actual data scraping here 64 | # filter for links matching our regular expression 65 | for link in get_links(html): 66 | if re.match(link_regex, link): 67 | abs_link = urljoin(start_url, link) 68 | if abs_link not in seen: 69 | seen[abs_link] = depth + 1 70 | crawl_queue.append(abs_link) 71 | else: 72 | print('Blocked by robots.txt:', url) 73 | -------------------------------------------------------------------------------- /code/chp3/advanced_link_crawler.py: -------------------------------------------------------------------------------- 1 | import re 2 | from urllib import robotparser 3 | from urllib.parse import urljoin 4 | from chp3.downloader import Downloader 5 | 6 | 7 | def get_robots_parser(robots_url): 8 | " Return the robots parser object using the robots_url " 9 | rp = robotparser.RobotFileParser() 10 | rp.set_url(robots_url) 11 | rp.read() 12 | return rp 13 | 14 | 15 | def get_links(html): 16 | " Return a list of links (using simple regex matching) from the html content " 17 | # a regular expression to extract all links from the webpage 18 | webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE) 19 | # list of all links from the webpage 20 | return webpage_regex.findall(html) 21 | 22 | 23 | def link_crawler(start_url, link_regex, robots_url=None, user_agent='wswp', 24 | proxies=None, delay=3, max_depth=4, num_retries=2, cache={}, scraper_callback=None): 25 | """ Crawl from the given start URL following links matched by link_regex. In the current 26 | implementation, we do not actually scrape any information.
27 | 28 | args: 29 | start_url (str): web site to start crawl 30 | link_regex (str): regex to match for links 31 | kwargs: 32 | robots_url (str): url of the site's robots.txt (default: start_url + /robots.txt) 33 | user_agent (str): user agent (default: wswp) 34 | proxies (list of dicts): a list of possible dicts for http / https proxies 35 | For formatting, see the requests library 36 | delay (int): seconds to throttle between requests to one domain (default: 3) 37 | max_depth (int): maximum crawl depth (to avoid traps) (default: 4) 38 | num_retries (int): # of retries when 5xx error (default: 2) 39 | cache (dict): cache dict with urls as keys and dicts for responses (default: {}) 40 | scraper_callback: function to be called on url and html content 41 | """ 42 | crawl_queue = [start_url] 43 | # keep track which URL's have seen before 44 | seen = {} 45 | if not robots_url: 46 | robots_url = '{}/robots.txt'.format(start_url) 47 | rp = get_robots_parser(robots_url) 48 | D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies, cache=cache) 49 | while crawl_queue: 50 | url = crawl_queue.pop() 51 | # check url passes robots.txt restrictions 52 | if rp.can_fetch(user_agent, url): 53 | depth = seen.get(url, 0) 54 | if depth == max_depth: 55 | print('Skipping %s due to depth' % url) 56 | continue 57 | html = D(url, num_retries=num_retries) 58 | if not html: 59 | continue 60 | if scraper_callback: 61 | links = scraper_callback(url, html) or [] 62 | else: 63 | links = [] 64 | # filter for links matching our regular expression 65 | for link in get_links(html) + links: 66 | if re.match(link_regex, link): 67 | abs_link = urljoin(start_url, link) 68 | if abs_link not in seen: 69 | seen[abs_link] = depth + 1 70 | crawl_queue.append(abs_link) 71 | else: 72 | print('Blocked by robots.txt:', url) 73 | -------------------------------------------------------------------------------- /code/chp3/downloader.py: -------------------------------------------------------------------------------- 1 | from random import choice 2 | import requests 3 | 4 | from chp1.throttle import Throttle 5 | 6 | 7 | class Downloader: 8 | """ Downloader class to use cache and requests for downloading pages. 
9 | For contructor, pass: 10 | delay (int): # of secs delay between requests (default: 5) 11 | user_agent (str): user agent string (default: 'wswp') 12 | proxies (list[dict]): list of possible proxies, each 13 | must be a dict with http / https keys and proxy values 14 | cache (dict or dict-like obj): keys: urls, values: dicts with keys (html, code) 15 | timeout (float/int): number of seconds to wait until timeout 16 | """ 17 | def __init__(self, delay=5, user_agent='wswp', proxies=None, cache={}, 18 | timeout=60): 19 | self.throttle = Throttle(delay) 20 | self.user_agent = user_agent 21 | self.proxies = proxies 22 | self.cache = cache 23 | self.num_retries = None # we will set this per request 24 | self.timeout = timeout 25 | 26 | def __call__(self, url, num_retries=2): 27 | """ Call the downloader class, which will return HTML from cache 28 | or download it 29 | args: 30 | url (str): url to download 31 | kwargs: 32 | num_retries (int): # times to retry if 5xx code (default: 2) 33 | """ 34 | self.num_retries = num_retries 35 | try: 36 | result = self.cache[url] 37 | print('Loaded from cache:', url) 38 | except KeyError: 39 | result = None 40 | if result and self.num_retries and 500 <= result['code'] < 600: 41 | # server error so ignore result from cache 42 | # and re-download 43 | result = None 44 | if result is None: 45 | # result was not loaded from cache, need to download 46 | self.throttle.wait(url) 47 | proxies = choice(self.proxies) if self.proxies else None 48 | headers = {'User-Agent': self.user_agent} 49 | result = self.download(url, headers, proxies) 50 | self.cache[url] = result 51 | return result['html'] 52 | 53 | def download(self, url, headers, proxies): 54 | """ Download a and return the page content 55 | args: 56 | url (str): URL 57 | headers (dict): dict of headers (like user_agent) 58 | proxies (dict): proxy dict w/ keys 'http'/'https', values 59 | are strs (i.e. 'http(s)://IP') (default: None) 60 | """ 61 | print('Downloading:', url) 62 | try: 63 | resp = requests.get(url, headers=headers, proxies=proxies, 64 | timeout=self.timeout) 65 | html = resp.text 66 | if resp.status_code >= 400: 67 | print('Download error:', resp.text) 68 | html = None 69 | if self.num_retries and 500 <= resp.status_code < 600: 70 | # recursively retry 5xx HTTP errors 71 | self.num_retries -= 1 72 | return self.download(url, headers, proxies) 73 | except requests.exceptions.RequestException as e: 74 | print('Download error:', e) 75 | return {'html': None, 'code': 500} 76 | return {'html': html, 'code': resp.status_code} 77 | -------------------------------------------------------------------------------- /code/chp8/example/example/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for example project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'example' 13 | 14 | SPIDER_MODULES = ['example.spiders'] 15 | NEWSPIDER_MODULE = 'example.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'example (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | DOWNLOAD_DELAY = 5 31 | # The download delay setting will honor only one of: 32 | CONCURRENT_REQUESTS_PER_DOMAIN = 1 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'example.middlewares.ExampleSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'example.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'example.pipelines.ExamplePipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /code/chp3/downloader_requests_cache.py: 
-------------------------------------------------------------------------------- 1 | from random import choice 2 | import requests 3 | import requests_cache 4 | 5 | from chp1.throttle import Throttle 6 | 7 | 8 | class Downloader: 9 | """ Downloader class to use cache and requests for downloading pages. 10 | For contructor, pass: 11 | delay (int): # of secs delay between requests (default: 5) 12 | user_agent (str): user agent string (default: 'wswp') 13 | proxies (list[dict]): list of possible proxies, each 14 | must be a dict with http / https keys and proxy values 15 | timeout (float/int): number of seconds to wait until timeout 16 | """ 17 | def __init__(self, delay=5, user_agent='wswp', proxies=None, 18 | timeout=60): 19 | self.throttle = Throttle(delay) 20 | self.user_agent = user_agent 21 | self.proxies = proxies 22 | self.num_retries = None # we will set this per request 23 | self.timeout = timeout 24 | 25 | def __call__(self, url, num_retries=2): 26 | """ Call the downloader class, which will return HTML from cache 27 | or download it 28 | args: 29 | url (str): url to download 30 | kwargs: 31 | num_retries (int): # times to retry if 5xx code (default: 2) 32 | """ 33 | self.num_retries = num_retries 34 | proxies = choice(self.proxies) if self.proxies else None 35 | headers = {'User-Agent': self.user_agent} 36 | result = self.download(url, headers, proxies) 37 | return result['html'] 38 | 39 | def make_throttle_hook(self, throttle=None): 40 | """ 41 | Modified from: https://requests-cache.readthedocs.io/en/latest/user_guide.html 42 | Returns a response hook function which sleeps for `timeout` seconds if 43 | response is not cached 44 | """ 45 | def hook(response, *args, **kwargs): 46 | """ see requests hook documentation for more information""" 47 | if not getattr(response, 'from_cache', False): 48 | throttle.wait(response.url) 49 | print('Downloading:', response.url) 50 | else: 51 | print('Returning from cache:', response.url) 52 | return response 53 | return hook 54 | 55 | def download(self, url, headers, proxies): 56 | """ Download a and return the page content 57 | args: 58 | url (str): URL 59 | headers (dict): dict of headers (like user_agent) 60 | proxies (dict): proxy dict w/ keys 'http'/'https', values 61 | are strs (i.e. 
'http(s)://IP') (default: None) 62 | """ 63 | session = requests_cache.CachedSession() 64 | session.hooks = {'response': self.make_throttle_hook(self.throttle)} 65 | 66 | try: 67 | resp = session.get(url, headers=headers, proxies=proxies, 68 | timeout=self.timeout) 69 | html = resp.text 70 | if resp.status_code >= 400: 71 | print('Download error:', resp.text) 72 | html = None 73 | if self.num_retries and 500 <= resp.status_code < 600: 74 | # recursively retry 5xx HTTP errors 75 | self.num_retries -= 1 76 | return self.download(url, headers, proxies) 77 | except requests.exceptions.RequestException as e: 78 | print('Download error:', e) 79 | return {'html': None, 'code': 500} 80 | return {'html': html, 'code': resp.status_code} 81 | -------------------------------------------------------------------------------- /code/chp3/diskcache.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import re 4 | import zlib 5 | 6 | from datetime import datetime, timedelta 7 | from urllib.parse import urlsplit 8 | 9 | 10 | class DiskCache: 11 | """ DiskCache helps store urls and their responses to disk 12 | Intialization components: 13 | cache_dir (str): abs file path or relative file path 14 | for cache directory (default: ../data/cache) 15 | max_len (int): maximum filename length (default: 255) 16 | compress (bool): use zlib compression (default: True) 17 | encoding (str): character encoding for compression (default: utf-8) 18 | expires (datetime.timedelta): timedelta when content will expire 19 | (default: 30 days ago) 20 | """ 21 | def __init__(self, cache_dir='../data/cache', max_len=255, compress=True, 22 | encoding='utf-8', expires=timedelta(days=30)): 23 | self.cache_dir = cache_dir 24 | self.max_len = max_len 25 | self.compress = compress 26 | self.encoding = encoding 27 | self.expires = expires 28 | 29 | def url_to_path(self, url): 30 | """ Return file system path string for given URL """ 31 | components = urlsplit(url) 32 | # append index.html to empty paths 33 | path = components.path 34 | if not path: 35 | path = '/index.html' 36 | elif path.endswith('/'): 37 | path += 'index.html' 38 | filename = components.netloc + path + components.query 39 | # replace invalid characters 40 | filename = re.sub(r'[^/0-9a-zA-Z\-.,;_ ]', '_', filename) 41 | # restrict maximum number of characters 42 | filename = '/'.join(seg[:self.max_len] for seg in filename.split('/')) 43 | return os.path.join(self.cache_dir, filename) 44 | 45 | def __getitem__(self, url): 46 | """Load data from disk for given URL""" 47 | path = self.url_to_path(url) 48 | if os.path.exists(path): 49 | mode = ('rb' if self.compress else 'r') 50 | with open(path, mode) as fp: 51 | if self.compress: 52 | data = zlib.decompress(fp.read()).decode(self.encoding) 53 | data = json.loads(data) 54 | else: 55 | data = json.load(fp) 56 | exp_date = data.get('expires') 57 | if exp_date and datetime.strptime(exp_date, 58 | '%Y-%m-%dT%H:%M:%S') <= datetime.utcnow(): 59 | print('Cache expired!', exp_date) 60 | raise KeyError(url + ' has expired.') 61 | return data 62 | else: 63 | # URL has not yet been cached 64 | raise KeyError(url + ' does not exist') 65 | 66 | def __setitem__(self, url, result): 67 | """Save data to disk for given url""" 68 | path = self.url_to_path(url) 69 | folder = os.path.dirname(path) 70 | if not os.path.exists(folder): 71 | os.makedirs(folder) 72 | mode = ('wb' if self.compress else 'w') 73 | # Note: the timespec command requires Py3.6+ (if using 3.X you can 74 | # 
export using isoformat() and import with '%Y-%m-%dT%H:%M:%S.%f') 75 | result['expires'] = (datetime.utcnow() + self.expires).isoformat( 76 | timespec='seconds') 77 | with open(path, mode) as fp: 78 | if self.compress: 79 | data = bytes(json.dumps(result), self.encoding) 80 | fp.write(zlib.compress(data)) 81 | else: 82 | json.dump(result, fp) 83 | -------------------------------------------------------------------------------- /code/chp1/advanced_link_crawler_using_requests.py: -------------------------------------------------------------------------------- 1 | import re 2 | from urllib import robotparser 3 | from urllib.parse import urljoin 4 | 5 | import requests 6 | from chp1.throttle import Throttle 7 | 8 | 9 | def download(url, num_retries=2, user_agent='wswp', proxies=None): 10 | """ Download a given URL and return the page content 11 | args: 12 | url (str): URL 13 | kwargs: 14 | user_agent (str): user agent (default: wswp) 15 | proxies (dict): proxy dict w/ keys 'http' and 'https', values 16 | are strs (i.e. 'http(s)://IP') (default: None) 17 | num_retries (int): # of retries if a 5xx error is seen (default: 2) 18 | """ 19 | print('Downloading:', url) 20 | headers = {'User-Agent': user_agent} 21 | try: 22 | resp = requests.get(url, headers=headers, proxies=proxies) 23 | html = resp.text 24 | if resp.status_code >= 400: 25 | print('Download error:', resp.text) 26 | html = None 27 | if num_retries and 500 <= resp.status_code < 600: 28 | # recursively retry 5xx HTTP errors 29 | return download(url, num_retries - 1, user_agent=user_agent, proxies=proxies) 30 | except requests.exceptions.RequestException as e: 31 | print('Download error:', e) 32 | html = None 33 | return html 34 | 35 | 36 | def get_robots_parser(robots_url): 37 | " Return the robots parser object using the robots_url " 38 | rp = robotparser.RobotFileParser() 39 | rp.set_url(robots_url) 40 | rp.read() 41 | return rp 42 | 43 | 44 | def get_links(html): 45 | """ Return a list of links (using simple regex matching) 46 | from the html content """ 47 | # a regular expression to extract all links from the webpage 48 | webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE) 49 | # list of all links from the webpage 50 | return webpage_regex.findall(html) 51 | 52 | 53 | def link_crawler(start_url, link_regex, robots_url=None, user_agent='wswp', 54 | proxies=None, delay=3, max_depth=4): 55 | """ Crawl from the given start URL following links matched by link_regex. 56 | In the current implementation, we do not actually scrape any information. 57 | 58 | args: 59 | start_url (str): web site to start crawl 60 | link_regex (str): regex to match for links 61 | kwargs: 62 | robots_url (str): url of the site's robots.txt 63 | (default: start_url + /robots.txt) 64 | user_agent (str): user agent (default: wswp) 65 | proxies (dict): proxy dict w/ keys 'http' and 'https', values 66 | are strs (i.e.
'http(s)://IP') (default: None) 67 | delay (int): seconds to throttle between requests 68 | to one domain (default: 3) 69 | max_depth (int): maximum crawl depth (to avoid traps) (default: 4) 70 | """ 71 | crawl_queue = [start_url] 72 | # keep track which URL's have seen before 73 | seen = {} 74 | if not robots_url: 75 | robots_url = '{}/robots.txt'.format(start_url) 76 | rp = get_robots_parser(robots_url) 77 | throttle = Throttle(delay) 78 | while crawl_queue: 79 | url = crawl_queue.pop() 80 | # check url passes robots.txt restrictions 81 | if rp.can_fetch(user_agent, url): 82 | depth = seen.get(url, 0) 83 | if depth == max_depth: 84 | print('Skipping %s due to depth' % url) 85 | continue 86 | throttle.wait(url) 87 | html = download(url, user_agent=user_agent, proxies=proxies) 88 | if not html: 89 | continue 90 | # TODO: add actual data scraping here 91 | # filter for links matching our regular expression 92 | for link in get_links(html): 93 | if re.match(link_regex, link): 94 | abs_link = urljoin(start_url, link) 95 | if abs_link not in seen: 96 | seen[abs_link] = depth + 1 97 | crawl_queue.append(abs_link) 98 | else: 99 | print('Blocked by robots.txt:', url) 100 | -------------------------------------------------------------------------------- /code/chp1/advanced_link_crawler.py: -------------------------------------------------------------------------------- 1 | import re 2 | import urllib.request 3 | from urllib import robotparser 4 | from urllib.parse import urljoin 5 | from urllib.error import URLError, HTTPError, ContentTooShortError 6 | from chp1.throttle import Throttle 7 | 8 | 9 | def download(url, num_retries=2, user_agent='wswp', charset='utf-8', proxy=None): 10 | """ Download a given URL and return the page content 11 | args: 12 | url (str): URL 13 | kwargs: 14 | user_agent (str): user agent (default: wswp) 15 | charset (str): charset if website does not include one in headers 16 | proxy (str): proxy url, ex 'http://IP' (default: None) 17 | num_retries (int): number of retries if a 5xx error is seen (default: 2) 18 | """ 19 | print('Downloading:', url) 20 | request = urllib.request.Request(url) 21 | request.add_header('User-agent', user_agent) 22 | try: 23 | if proxy: 24 | proxy_support = urllib.request.ProxyHandler({'http': proxy}) 25 | opener = urllib.request.build_opener(proxy_support) 26 | urllib.request.install_opener(opener) 27 | resp = urllib.request.urlopen(request) 28 | cs = resp.headers.get_content_charset() 29 | if not cs: 30 | cs = charset 31 | html = resp.read().decode(cs) 32 | except (URLError, HTTPError, ContentTooShortError) as e: 33 | print('Download error:', e.reason) 34 | html = None 35 | if num_retries > 0: 36 | if hasattr(e, 'code') and 500 <= e.code < 600: 37 | # recursively retry 5xx HTTP errors 38 | return download(url, num_retries - 1, user_agent=user_agent, charset=charset, proxy=proxy) 39 | return html 40 | 41 | 42 | def get_robots_parser(robots_url): 43 | " Return the robots parser object using the robots_url " 44 | rp = robotparser.RobotFileParser() 45 | rp.set_url(robots_url) 46 | rp.read() 47 | return rp 48 | 49 | 50 | def get_links(html): 51 | " Return a list of links (using simple regex matching) from the html content " 52 | # a regular expression to extract all links from the webpage 53 | webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE) 54 | # list of all links from the webpage 55 | return webpage_regex.findall(html) 56 | 57 | 58 | def link_crawler(start_url, link_regex, robots_url=None, user_agent='wswp', 59 | proxy=None, delay=3, max_depth=4): 60 | """ Crawl from the given
start URL following links matched by link_regex. In the current 61 | implementation, we do not actually scrape any information. 62 | 63 | args: 64 | start_url (str): web site to start crawl 65 | link_regex (str): regex to match for links 66 | kwargs: 67 | robots_url (str): url of the site's robots.txt (default: start_url + /robots.txt) 68 | user_agent (str): user agent (default: wswp) 69 | proxy (str): proxy url, ex 'http://IP' (default: None) 70 | delay (int): seconds to throttle between requests to one domain (default: 3) 71 | max_depth (int): maximum crawl depth (to avoid traps) (default: 4) 72 | """ 73 | crawl_queue = [start_url] 74 | # keep track which URL's have seen before 75 | seen = {} 76 | if not robots_url: 77 | robots_url = '{}/robots.txt'.format(start_url) 78 | rp = get_robots_parser(robots_url) 79 | throttle = Throttle(delay) 80 | while crawl_queue: 81 | url = crawl_queue.pop() 82 | # check url passes robots.txt restrictions 83 | if rp.can_fetch(user_agent, url): 84 | depth = seen.get(url, 0) 85 | if depth == max_depth: 86 | print('Skipping %s due to depth' % url) 87 | continue 88 | throttle.wait(url) 89 | html = download(url, user_agent=user_agent, proxy=proxy) 90 | if not html: 91 | continue 92 | # TODO: add actual data scraping here 93 | # filter for links matching our regular expression 94 | for link in get_links(html): 95 | if re.match(link_regex, link): 96 | abs_link = urljoin(start_url, link) 97 | if abs_link not in seen: 98 | seen[abs_link] = depth + 1 99 | crawl_queue.append(abs_link) 100 | else: 101 | print('Blocked by robots.txt:', url) 102 | -------------------------------------------------------------------------------- /code/chp4/advanced_link_crawler.py: -------------------------------------------------------------------------------- 1 | import re 2 | import socket 3 | from urllib import robotparser 4 | from urllib.parse import urljoin, urlparse 5 | from chp3.downloader import Downloader 6 | 7 | socket.setdefaulttimeout(60) 8 | 9 | 10 | def get_robots_parser(robots_url): 11 | " Return the robots parser object using the robots_url " 12 | try: 13 | rp = robotparser.RobotFileParser() 14 | rp.set_url(robots_url) 15 | rp.read() 16 | return rp 17 | except Exception as e: 18 | print('Error finding robots_url:', robots_url, e) 19 | 20 | 21 | def get_links(html): 22 | " Return a list of links (using simple regex matching) from the html content " 23 | # a regular expression to extract all links from the webpage 24 | webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE) 25 | # list of all links from the webpage 26 | return webpage_regex.findall(html) 27 | 28 | 29 | def link_crawler(start_url, link_regex, robots_url=None, user_agent='wswp', 30 | proxies=None, delay=3, max_depth=4, num_retries=2, cache={}, scraper_callback=None): 31 | """ Crawl from the given start URL following links matched by link_regex. In the current 32 | implementation, we do not actually scrape any information.
33 | 34 | args: 35 | start_url (str or list of strs): web site(s) to start crawl 36 | link_regex (str): regex to match for links 37 | kwargs: 38 | robots_url (str): url of the site's robots.txt (default: start_url + /robots.txt) 39 | user_agent (str): user agent (default: wswp) 40 | proxies (list of dicts): a list of possible dicts for http / https proxies 41 | For formatting, see the requests library 42 | delay (int): seconds to throttle between requests to one domain (default: 3) 43 | max_depth (int): maximum crawl depth (to avoid traps) (default: 4) 44 | num_retries (int): # of retries when 5xx error (default: 2) 45 | cache (dict): cache dict with urls as keys and dicts for responses (default: {}) 46 | scraper_callback: function to be called on url and html content 47 | """ 48 | if isinstance(start_url, list): 49 | crawl_queue = start_url 50 | else: 51 | crawl_queue = [start_url] 52 | # keep track which URL's have seen before 53 | seen, robots = {}, {} 54 | D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies, cache=cache) 55 | while crawl_queue: 56 | url = crawl_queue.pop() 57 | no_robots = False 58 | if 'http' not in url: 59 | continue 60 | domain = '{}://{}'.format(urlparse(url).scheme, urlparse(url).netloc) 61 | rp = robots.get(domain) 62 | if not rp and domain not in robots: 63 | robots_url = '{}/robots.txt'.format(domain) 64 | rp = get_robots_parser(robots_url) 65 | if not rp: 66 | # issue finding robots.txt, still crawl 67 | no_robots = True 68 | robots[domain] = rp 69 | elif domain in robots: 70 | no_robots = True 71 | # check url passes robots.txt restrictions 72 | if no_robots or rp.can_fetch(user_agent, url): 73 | depth = seen.get(url, 0) 74 | if depth == max_depth: 75 | print('Skipping %s due to depth' % url) 76 | continue 77 | html = D(url, num_retries=num_retries) 78 | if not html: 79 | continue 80 | if scraper_callback: 81 | links = scraper_callback(url, html) or [] 82 | else: 83 | links = [] 84 | # filter for links matching our regular expression 85 | for link in get_links(html) + links: 86 | if re.match(link_regex, link): 87 | if 'http' not in link: 88 | if link.startswith('//'): 89 | link = '{}:{}'.format(urlparse(url).scheme, link) 90 | elif link.startswith('://'): 91 | link = '{}{}'.format(urlparse(url).scheme, link) 92 | else: 93 | link = urljoin(domain, link) 94 | 95 | if link not in seen: 96 | seen[link] = depth + 1 97 | crawl_queue.append(link) 98 | else: 99 | print('Blocked by robots.txt:', url) 100 | 101 | 102 | if __name__ == '__main__': 103 | from chp4.alexa_callback import AlexaCallback 104 | from chp3.rediscache import RedisCache 105 | from time import time 106 | AC = AlexaCallback() 107 | AC() 108 | start_time = time() 109 | link_crawler(AC.urls, '$^', cache=RedisCache()) 110 | print('Total time: %ss' % (time() - start_time)) 111 | -------------------------------------------------------------------------------- /code/chp2/advanced_link_crawler.py: -------------------------------------------------------------------------------- 1 | import re 2 | import urllib.request 3 | from urllib import robotparser 4 | from urllib.parse import urljoin 5 | from urllib.error import URLError, HTTPError, ContentTooShortError 6 | from lxml.html import fromstring 7 | from chp1.throttle import Throttle 8 | 9 | 10 | def download(url, num_retries=2, user_agent='wswp', charset='utf-8', proxy=None): 11 | """ Download a given URL and return the page content 12 | args: 13 | url (str): URL 14 | kwargs: 15 | user_agent (str): user agent (default: wswp) 16 | charset (str): 
charset if website does not include one in headers 17 | proxy (str): proxy url, ex 'http://IP' (default: None) 18 | num_retries (int): number of retries if a 5xx error is seen (default: 2) 19 | """ 20 | print('Downloading:', url) 21 | request = urllib.request.Request(url) 22 | request.add_header('User-agent', user_agent) 23 | try: 24 | if proxy: 25 | proxy_support = urllib.request.ProxyHandler({'http': proxy}) 26 | opener = urllib.request.build_opener(proxy_support) 27 | urllib.request.install_opener(opener) 28 | resp = urllib.request.urlopen(request) 29 | cs = resp.headers.get_content_charset() 30 | if not cs: 31 | cs = charset 32 | html = resp.read().decode(cs) 33 | except (URLError, HTTPError, ContentTooShortError) as e: 34 | print('Download error:', e) 35 | html = None 36 | if num_retries > 0: 37 | if hasattr(e, 'code') and 500 <= e.code < 600: 38 | # recursively retry 5xx HTTP errors 39 | return download(url, num_retries - 1, user_agent=user_agent, charset=charset, proxy=proxy) 40 | return html 41 | 42 | 43 | def get_robots_parser(robots_url): 44 | " Return the robots parser object using the robots_url " 45 | rp = robotparser.RobotFileParser() 46 | rp.set_url(robots_url) 47 | rp.read() 48 | return rp 49 | 50 | 51 | def get_links(html): 52 | " Return a list of links (using simple regex matching) from the html content " 53 | # a regular expression to extract all links from the webpage 54 | webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE) 55 | # list of all links from the webpage 56 | return webpage_regex.findall(html) 57 | 58 | 59 | def scrape_callback(url, html): 60 | """ Scrape each row from the country data using XPath and lxml """ 61 | fields = ('area', 'population', 'iso', 'country', 'capital', 62 | 'continent', 'tld', 'currency_code', 'currency_name', 63 | 'phone', 'postal_code_format', 'postal_code_regex', 64 | 'languages', 'neighbours') 65 | if re.search('/view/', url): 66 | tree = fromstring(html) 67 | all_rows = [ 68 | tree.xpath('//tr[@id="places_%s__row"]/td[@class="w2p_fw"]' % field)[0].text_content() 69 | for field in fields] 70 | print(url, all_rows) 71 | 72 | 73 | def link_crawler(start_url, link_regex, robots_url=None, user_agent='wswp', 74 | proxy=None, delay=3, max_depth=4, scrape_callback=None): 75 | """ Crawl from the given start URL following links matched by link_regex. In the current 76 | implementation, we do not actually scrape any information.
77 | 78 | args: 79 | start_url (str): web site to start crawl 80 | link_regex (str): regex to match for links 81 | kwargs: 82 | robots_url (str): url of the site's robots.txt (default: start_url + /robots.txt) 83 | user_agent (str): user agent (default: wswp) 84 | proxy (str): proxy url, ex 'http://IP' (default: None) 85 | delay (int): seconds to throttle between requests to one domain (default: 3) 86 | max_depth (int): maximum crawl depth (to avoid traps) (default: 4) 87 | scrape_callback (function): function to call after each download (default: None) 88 | """ 89 | crawl_queue = [start_url] 90 | # keep track which URL's have seen before 91 | seen = {} 92 | data = [] 93 | if not robots_url: 94 | robots_url = '{}/robots.txt'.format(start_url) 95 | rp = get_robots_parser(robots_url) 96 | throttle = Throttle(delay) 97 | while crawl_queue: 98 | url = crawl_queue.pop() 99 | # check url passes robots.txt restrictions 100 | if rp.can_fetch(user_agent, url): 101 | depth = seen.get(url, 0) 102 | if depth == max_depth: 103 | print('Skipping %s due to depth' % url) 104 | continue 105 | throttle.wait(url) 106 | html = download(url, user_agent=user_agent, proxy=proxy) 107 | if not html: 108 | continue 109 | if scrape_callback: 110 | data.extend(scrape_callback(url, html) or []) 111 | # filter for links matching our regular expression 112 | for link in get_links(html): 113 | if re.match(link_regex, link): 114 | abs_link = urljoin(start_url, link) 115 | if abs_link not in seen: 116 | seen[abs_link] = depth + 1 117 | crawl_queue.append(abs_link) 118 | else: 119 | print('Blocked by robots.txt:', url) 120 | -------------------------------------------------------------------------------- /code/chp4/threaded_crawler.py: -------------------------------------------------------------------------------- 1 | import re 2 | import socket 3 | import threading 4 | import time 5 | from urllib import robotparser 6 | from urllib.parse import urljoin, urlparse 7 | from chp3.downloader import Downloader 8 | 9 | SLEEP_TIME = 1 10 | socket.setdefaulttimeout(60) 11 | 12 | 13 | def get_robots_parser(robots_url): 14 | " Return the robots parser object using the robots_url " 15 | try: 16 | rp = robotparser.RobotFileParser() 17 | rp.set_url(robots_url) 18 | rp.read() 19 | return rp 20 | except Exception as e: 21 | print('Error finding robots_url:', robots_url, e) 22 | 23 | 24 | def get_links(html): 25 | " Return a list of links (using simple regex matching) from the html content " 26 | # a regular expression to extract all links from the webpage 27 | webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE) 28 | # list of all links from the webpage 29 | return webpage_regex.findall(html) 30 | 31 | 32 | def threaded_crawler(start_url, link_regex, user_agent='wswp', proxies=None, 33 | delay=3, max_depth=4, num_retries=2, cache={}, max_threads=10, scraper_callback=None): 34 | """ Crawl from the given start URLs following links matched by link_regex. In this 35 | implementation, we do not actually scrape any information.
36 | 37 | args: 38 | start_url (str or list of strs): web site(s) to start crawl 39 | link_regex (str): regex to match for links 40 | kwargs: 41 | user_agent (str): user agent (default: wswp) 42 | proxies (list of dicts): a list of possible dicts for http / https proxies 43 | For formatting, see the requests library 44 | delay (int): seconds to throttle between requests to one domain (default: 3) 45 | max_depth (int): maximum crawl depth (to avoid traps) (default: 4) 46 | num_retries (int): # of retries when 5xx error (default: 2) 47 | cache (dict): cache dict with urls as keys and dicts for responses (default: {}) 48 | scraper_callback: function to be called on url and html content 49 | """ 50 | if isinstance(start_url, list): 51 | crawl_queue = start_url 52 | else: 53 | crawl_queue = [start_url] 54 | # keep track which URL's have seen before 55 | seen, robots = {}, {} 56 | D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies, cache=cache) 57 | 58 | def process_queue(): 59 | while crawl_queue: 60 | url = crawl_queue.pop() 61 | no_robots = False 62 | if not url or 'http' not in url: 63 | continue 64 | domain = '{}://{}'.format(urlparse(url).scheme, urlparse(url).netloc) 65 | rp = robots.get(domain) 66 | if not rp and domain not in robots: 67 | robots_url = '{}/robots.txt'.format(domain) 68 | rp = get_robots_parser(robots_url) 69 | if not rp: 70 | # issue finding robots.txt, still crawl 71 | no_robots = True 72 | robots[domain] = rp 73 | elif domain in robots: 74 | no_robots = True 75 | # check url passes robots.txt restrictions 76 | if no_robots or rp.can_fetch(user_agent, url): 77 | depth = seen.get(url, 0) 78 | if depth == max_depth: 79 | print('Skipping %s due to depth' % url) 80 | continue 81 | html = D(url, num_retries=num_retries) 82 | if not html: 83 | continue 84 | if scraper_callback: 85 | links = scraper_callback(url, html) or [] 86 | else: 87 | links = [] 88 | # filter for links matching our regular expression 89 | for link in get_links(html) + links: 90 | if re.match(link_regex, link): 91 | if 'http' not in link: 92 | if link.startswith('//'): 93 | link = '{}:{}'.format(urlparse(url).scheme, link) 94 | elif link.startswith('://'): 95 | link = '{}{}'.format(urlparse(url).scheme, link) 96 | else: 97 | link = urljoin(domain, link) 98 | if link not in seen: 99 | seen[link] = depth + 1 100 | crawl_queue.append(link) 101 | else: 102 | print('Blocked by robots.txt:', url) 103 | 104 | # wait for all download threads to finish 105 | threads = [] 106 | print(max_threads) 107 | while threads or crawl_queue: 108 | for thread in threads: 109 | if not thread.is_alive(): 110 | threads.remove(thread) 111 | while len(threads) < max_threads and crawl_queue: 112 | # can start some more threads 113 | thread = threading.Thread(target=process_queue) 114 | thread.setDaemon(True) # set daemon so main thread can exit w/ ctrl-c 115 | thread.start() 116 | threads.append(thread) 117 | print(threads) 118 | for thread in threads: 119 | thread.join() 120 | 121 | time.sleep(SLEEP_TIME) 122 | 123 | 124 | if __name__ == '__main__': 125 | from chp4.alexa_callback import AlexaCallback 126 | from chp3.rediscache import RedisCache 127 | import argparse 128 | 129 | parser = argparse.ArgumentParser(description='Threaded link crawler') 130 | parser.add_argument('max_threads', type=int, help='maximum number of threads', 131 | nargs='?', default=5) 132 | parser.add_argument('url_pattern', type=str, help='regex pattern for url matching', 133 | nargs='?', default='$^') 134 | par_args = parser.parse_args() 
135 | AC = AlexaCallback() 136 | AC() 137 | start_time = time.time() 138 | threaded_crawler(AC.urls, par_args.url_pattern, cache=RedisCache(), 139 | max_threads=par_args.max_threads) 140 | print('Total time: %ss' % (time.time() - start_time)) 141 | -------------------------------------------------------------------------------- /code/chp4/threaded_crawler_with_queue.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import re 3 | import socket 4 | import threading 5 | import time 6 | from urllib import robotparser 7 | from urllib.parse import urljoin, urlparse 8 | from chp3.downloader import Downloader 9 | from chp4.redis_queue import RedisQueue 10 | 11 | 12 | SLEEP_TIME = 1 13 | socket.setdefaulttimeout(60) 14 | 15 | 16 | def get_robots_parser(robots_url): 17 | " Return the robots parser object using the robots_url " 18 | try: 19 | rp = robotparser.RobotFileParser() 20 | rp.set_url(robots_url) 21 | rp.read() 22 | return rp 23 | except Exception as e: 24 | print('Error finding robots_url:', robots_url, e) 25 | 26 | 27 | def clean_link(url, domain, link): 28 | if link.startswith('//'): 29 | link = '{}:{}'.format(urlparse(url).scheme, link) 30 | elif link.startswith('://'): 31 | link = '{}{}'.format(urlparse(url).scheme, link) 32 | else: 33 | link = urljoin(domain, link) 34 | return link 35 | 36 | 37 | def get_links(html, link_regex): 38 | " Return a list of links (using simple regex matching) from the html content " 39 | # a regular expression to extract all links from the webpage 40 | webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE) 41 | # list of all links from the webpage 42 | links = webpage_regex.findall(html) 43 | links = (link for link in links if re.match(link_regex, link)) 44 | return links 45 | 46 | 47 | def threaded_crawler_rq(start_url, link_regex, user_agent='wswp', proxies=None, 48 | delay=3, max_depth=4, num_retries=2, cache={}, max_threads=10, scraper_callback=None): 49 | """ Crawl from the given start URLs following links matched by link_regex. In this 50 | implementation, we do not actually scrape any information.
51 | 52 | args: 53 | start_url (str or list of strs): web site(s) to start crawl 54 | link_regex (str): regex to match for links 55 | kwargs: 56 | user_agent (str): user agent (default: wswp) 57 | proxies (list of dicts): a list of possible dicts 58 | for http / https proxies 59 | For formatting, see the requests library 60 | delay (int): seconds to throttle between requests to one domain 61 | (default: 3) 62 | max_depth (int): maximum crawl depth (to avoid traps) (default: 4) 63 | num_retries (int): # of retries when 5xx error (default: 2) 64 | cache (dict): cache dict with urls as keys 65 | and dicts for responses (default: {}) 66 | scraper_callback: function to be called on url and html content 67 | """ 68 | crawl_queue = RedisQueue() 69 | crawl_queue.push(start_url) 70 | # keep track which URL's have seen before 71 | robots = {} 72 | D = Downloader(delay=delay, user_agent=user_agent, 73 | proxies=proxies, cache=cache) 74 | 75 | def process_queue(): 76 | while len(crawl_queue): 77 | url = crawl_queue.pop() 78 | no_robots = False 79 | if not url or 'http' not in url: 80 | continue 81 | domain = '{}://{}'.format(urlparse(url).scheme, 82 | urlparse(url).netloc) 83 | rp = robots.get(domain) 84 | if not rp and domain not in robots: 85 | robots_url = '{}/robots.txt'.format(domain) 86 | rp = get_robots_parser(robots_url) 87 | if not rp: 88 | # issue finding robots.txt, still crawl 89 | no_robots = True 90 | robots[domain] = rp 91 | elif domain in robots: 92 | no_robots = True 93 | # check url passes robots.txt restrictions 94 | if no_robots or rp.can_fetch(user_agent, url): 95 | depth = crawl_queue.get_depth(url) 96 | if depth == max_depth: 97 | print('Skipping %s due to depth' % url) 98 | continue 99 | html = D(url, num_retries=num_retries) 100 | if not html: 101 | continue 102 | if scraper_callback: 103 | links = scraper_callback(url, html) or [] 104 | else: 105 | links = [] 106 | # filter for links matching our regular expression 107 | for link in list(get_links(html, link_regex)) + links: 108 | if 'http' not in link: 109 | link = clean_link(url, domain, link) 110 | crawl_queue.push(link) 111 | crawl_queue.set_depth(link, depth + 1) 112 | else: 113 | print('Blocked by robots.txt:', url) 114 | 115 | # wait for all download threads to finish 116 | threads = [] 117 | while threads or len(crawl_queue): 118 | for thread in threads: 119 | if not thread.is_alive(): 120 | threads.remove(thread) 121 | while len(threads) < max_threads and crawl_queue: 122 | # can start some more threads 123 | thread = threading.Thread(target=process_queue) 124 | thread.setDaemon(True) # set daemon so main thread can exit w/ ctrl-c 125 | thread.start() 126 | threads.append(thread) 127 | 128 | for thread in threads: 129 | thread.join() 130 | 131 | time.sleep(SLEEP_TIME) 132 | 133 | 134 | def mp_threaded_crawler(*args, **kwargs): 135 | """ create a multiprocessing threaded crawler """ 136 | processes = [] 137 | num_procs = kwargs.pop('num_procs') 138 | if not num_procs: 139 | num_procs = multiprocessing.cpu_count() 140 | for _ in range(num_procs): 141 | proc = multiprocessing.Process(target=threaded_crawler_rq, 142 | args=args, kwargs=kwargs) 143 | proc.start() 144 | processes.append(proc) 145 | # wait for processes to complete 146 | for proc in processes: 147 | proc.join() 148 | 149 | 150 | if __name__ == '__main__': 151 | from chp4.alexa_callback import AlexaCallback 152 | from chp3.rediscache import RedisCache 153 | import argparse 154 | 155 | parser = argparse.ArgumentParser(description='Multiprocessing threaded 
link crawler') 156 | parser.add_argument('max_threads', type=int, help='maximum number of threads', 157 | nargs='?', default=5) 158 | parser.add_argument('num_procs', type=int, help='number of processes', 159 | nargs='?', default=None) 160 | parser.add_argument('url_pattern', type=str, help='regex pattern for url matching', 161 | nargs='?', default='$^') 162 | par_args = parser.parse_args() 163 | 164 | AC = AlexaCallback() 165 | AC() 166 | start_time = time.time() 167 | 168 | mp_threaded_crawler(AC.urls, par_args.url_pattern, cache=RedisCache(), 169 | num_procs=par_args.num_procs, max_threads=par_args.max_threads) 170 | print('Total time: %ss' % (time.time() - start_time)) 171 | --------------------------------------------------------------------------------
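
Usage sketch (not part of the repository): one possible way to drive the chapter 1 requests-based crawler shown above, assuming the code/ directory is on PYTHONPATH and a reachable target site; the start URL and the link pattern below are placeholders that would need to be adapted to the site being crawled.

# hypothetical driver script; the URL and the link regex are placeholders
from chp1.advanced_link_crawler_using_requests import link_crawler

if __name__ == '__main__':
    # follow only hrefs matching the pattern, throttled to one request per
    # domain every 3 seconds and limited to a shallow crawl depth
    link_crawler('http://example.com', r'/(index|view)/',
                 user_agent='wswp', delay=3, max_depth=2)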