├── code
├── __init__.py
├── chp1
│ ├── __init__.py
│ ├── downloading_a_page.py
│ ├── retrying_downloads.py
│ ├── setting_user_agent.py
│ ├── throttle.py
│ ├── sitemap_crawler.py
│ ├── id_iteration_crawler.py
│ ├── link_crawler.py
│ ├── advanced_link_crawler_using_requests.py
│ └── advanced_link_crawler.py
├── chp2
│ ├── __init__.py
│ ├── lxml_brokenhtml.py
│ ├── xpath_scraper.py
│ ├── lxml_scraper.py
│ ├── beautifulsoup.py
│ ├── family_trees.py
│ ├── regex.py
│ ├── beautifulsoup_brokenhtml.py
│ ├── csv_callback.py
│ ├── test_scrapers.py
│ ├── all_scrapers.py
│ └── advanced_link_crawler.py
├── chp3
│ ├── __init__.py
│ ├── url_parsing.py
│ ├── rediscache.py
│ ├── requests_cache_link_crawler.py
│ ├── advanced_link_crawler.py
│ ├── downloader.py
│ ├── downloader_requests_cache.py
│ └── diskcache.py
├── chp6
│ ├── __init__.py
│ ├── login_form_requests.py
│ ├── submit_login_form.py
│ ├── mechanize_form.py
│ ├── login.py
│ ├── firefox_sessions.py
│ ├── selenium_forms.py
│ └── edit.py
├── chp7
│ ├── __init__.py
│ ├── investigate_form.py
│ ├── image_processing.py
│ ├── test_samples.py
│ ├── register_with_ocr.py
│ ├── register_with_api.py
│ ├── using_captcha_api.py
│ └── captcha_api.py
├── chp8
│ ├── __init__.py
│ └── example
│   ├── example
│   │ ├── __init__.py
│   │ ├── spiders
│   │ │ ├── __init__.py
│   │ │ └── country.py
│   │ ├── items.py
│   │ ├── pipelines.py
│   │ ├── middlewares.py
│   │ └── settings.py
│   └── scrapy.cfg
├── chp9
│ ├── __init__.py
│ ├── gap_scraper_callback.py
│ ├── facebook_graph.py
│ ├── scrape_google.py
│ ├── facebook_selenium.py
│ └── bmw_scraper.py
├── example_config.cfg
├── chp5
│ ├── lxml_attempt.py
│ ├── pyqt_search_browser_render.py
│ ├── json_one_req.py
│ ├── selenium_search.py
│ ├── pyqt_webkit.py
│ ├── json_scraper.py
│ ├── pyqt_search.py
│ └── browser_render.py
└── chp4
  ├── extract_list.py
  ├── alexa_callback.py
  ├── redis_queue.py
  ├── advanced_link_crawler.py
  ├── threaded_crawler.py
  └── threaded_crawler_with_queue.py
├── data
├── captcha_samples
│ ├── sample1.png
│ ├── sample10.png
│ ├── sample100.png
│ ├── sample11.png
│ ├── sample12.png
│ ├── sample13.png
│ ├── sample14.png
│ ├── sample15.png
│ ├── sample16.png
│ ├── sample17.png
│ ├── sample18.png
│ ├── sample19.png
│ ├── sample2.png
│ ├── sample20.png
│ ├── sample21.png
│ ├── sample22.png
│ ├── sample23.png
│ ├── sample24.png
│ ├── sample25.png
│ ├── sample26.png
│ ├── sample27.png
│ ├── sample28.png
│ ├── sample29.png
│ ├── sample3.png
│ ├── sample30.png
│ ├── sample31.png
│ ├── sample32.png
│ ├── sample33.png
│ ├── sample34.png
│ ├── sample35.png
│ ├── sample36.png
│ ├── sample37.png
│ ├── sample38.png
│ ├── sample39.png
│ ├── sample4.png
│ ├── sample40.png
│ ├── sample41.png
│ ├── sample42.png
│ ├── sample43.png
│ ├── sample44.png
│ ├── sample45.png
│ ├── sample46.png
│ ├── sample47.png
│ ├── sample48.png
│ ├── sample49.png
│ ├── sample5.png
│ ├── sample50.png
│ ├── sample51.png
│ ├── sample52.png
│ ├── sample53.png
│ ├── sample54.png
│ ├── sample55.png
│ ├── sample56.png
│ ├── sample57.png
│ ├── sample58.png
│ ├── sample59.png
│ ├── sample6.png
│ ├── sample60.png
│ ├── sample61.png
│ ├── sample62.png
│ ├── sample63.png
│ ├── sample64.png
│ ├── sample65.png
│ ├── sample66.png
│ ├── sample67.png
│ ├── sample68.png
│ ├── sample69.png
│ ├── sample7.png
│ ├── sample70.png
│ ├── sample71.png
│ ├── sample72.png
│ ├── sample73.png
│ ├── sample74.png
│ ├── sample75.png
│ ├── sample76.png
│ ├── sample77.png
│ ├── sample78.png
│ ├── sample79.png
│ ├── sample8.png
│ ├── sample80.png
│ ├── sample81.png
│ ├── sample82.png
│ ├── sample83.png
│ ├── sample84.png
│ ├── sample85.png
│ ├── sample86.png
│ ├── sample87.png
│ ├── sample88.png
│ ├── sample89.png
│ ├── sample9.png
│ ├── sample90.png
│ ├── sample91.png
│ ├── sample92.png
│ ├── sample93.png
│ ├── sample94.png
│ ├── sample95.png
│ ├── sample96.png
│ ├── sample97.png
│ ├── sample98.png
│ ├── sample99.png
│ └── samples.csv
└── .gitignore
├── .gitignore
└── README.md
/code/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/code/chp1/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/code/chp2/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/code/chp3/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/code/chp6/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/code/chp7/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/code/chp8/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/code/chp9/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/code/chp8/example/example/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/code/example_config.cfg:
--------------------------------------------------------------------------------
1 | [captcha_api]
2 | key=ERU285FKDSL28311
3 |
--------------------------------------------------------------------------------
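The config above holds a single placeholder key for a CAPTCHA-solving API under the [captcha_api] section. As a minimal sketch (not a file from this repository), the chp7 scripts such as captcha_api.py would presumably load it with the standard-library configparser along these lines; the relative path used here is an assumption:

    from configparser import ConfigParser

    config = ConfigParser()
    config.read('example_config.cfg')           # path relative to the repo's code/ directory (assumed)
    api_key = config.get('captcha_api', 'key')  # -> 'ERU285FKDSL28311'
    print(api_key)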
/data/captcha_samples/sample1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample1.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample10.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample100.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample100.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample11.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample12.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample13.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample14.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample15.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample16.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample17.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample17.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample18.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample18.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample19.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample19.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample2.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample20.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample20.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample21.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample21.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample22.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample22.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample23.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample23.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample24.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample24.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample25.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample25.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample26.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample26.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample27.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample27.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample28.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample28.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample29.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample29.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample3.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample30.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample30.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample31.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample31.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample32.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample32.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample33.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample33.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample34.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample34.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample35.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample35.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample36.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample36.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample37.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample37.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample38.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample38.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample39.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample39.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample4.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample40.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample40.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample41.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample41.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample42.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample42.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample43.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample43.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample44.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample44.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample45.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample45.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample46.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample46.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample47.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample47.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample48.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample48.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample49.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample49.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample5.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample50.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample50.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample51.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample51.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample52.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample52.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample53.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample53.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample54.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample54.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample55.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample55.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample56.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample56.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample57.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample57.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample58.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample58.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample59.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample59.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample6.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample60.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample60.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample61.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample61.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample62.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample62.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample63.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample63.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample64.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample64.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample65.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample65.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample66.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample66.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample67.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample67.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample68.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample68.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample69.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample69.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample7.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample70.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample70.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample71.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample71.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample72.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample72.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample73.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample73.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample74.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample74.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample75.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample75.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample76.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample76.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample77.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample77.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample78.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample78.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample79.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample79.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample8.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample80.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample80.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample81.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample81.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample82.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample82.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample83.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample83.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample84.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample84.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample85.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample85.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample86.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample86.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample87.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample87.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample88.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample88.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample89.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample89.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample9.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample90.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample90.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample91.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample91.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample92.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample92.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample93.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample93.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample94.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample94.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample95.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample95.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample96.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample96.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample97.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample97.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample98.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample98.png
--------------------------------------------------------------------------------
/data/captcha_samples/sample99.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample99.png
--------------------------------------------------------------------------------
/data/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore everything in this directory
2 | *
3 | # Except these files
4 | !captcha_samples/
5 | !captcha_samples/*
6 | !.gitignore
7 |
--------------------------------------------------------------------------------
/code/chp8/example/example/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/code/chp5/lxml_attempt.py:
--------------------------------------------------------------------------------
1 | from lxml.html import fromstring
2 | from chp3.downloader import Downloader
3 |
4 | D = Downloader()
5 | html = D('http://example.webscraping.com/search')
6 | tree = fromstring(html)
7 | tree.cssselect('div#results a')
8 |
--------------------------------------------------------------------------------
/code/chp2/lxml_brokenhtml.py:
--------------------------------------------------------------------------------
1 | from lxml.html import fromstring, tostring
2 |
3 | broken_html = '<ul class=country><li>Area<li>Population</ul>'
4 |
5 | tree = fromstring(broken_html) # parse the HTML
6 | fixed_html = tostring(tree, pretty_print=True)
7 | print(fixed_html)
8 |
--------------------------------------------------------------------------------
/code/chp7/investigate_form.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from chp6.login import parse_form
3 |
4 | REGISTER_URL = 'http://example.webscraping.com/user/register'
5 |
6 | session = requests.Session()
7 |
8 | html = session.get(REGISTER_URL)
9 | form = parse_form(html.content)
10 | print(form)
11 |
--------------------------------------------------------------------------------
/code/chp6/login_form_requests.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 | LOGIN_URL = 'http://example.webscraping.com/user/login'
4 | LOGIN_EMAIL = 'example@webscraping.com'
5 | LOGIN_PASSWORD = 'example'
6 | data = {'email': LOGIN_EMAIL, 'password': LOGIN_PASSWORD}
7 |
8 | response = requests.post(LOGIN_URL, data)
9 | print(response.url)
10 |
--------------------------------------------------------------------------------
/code/chp8/example/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = example.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = example
12 |
--------------------------------------------------------------------------------
/code/chp8/example/example/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class CountryItem(scrapy.Item):
12 | name = scrapy.Field()
13 | population = scrapy.Field()
14 |
--------------------------------------------------------------------------------
/code/chp2/xpath_scraper.py:
--------------------------------------------------------------------------------
1 | from lxml.html import fromstring
2 | from chp1.advanced_link_crawler import download
3 |
4 | url = 'http://example.webscraping.com/places/default/view/UnitedKingdom-239'
5 | html = download(url)
6 |
7 | tree = fromstring(html)
8 | area = tree.xpath('//tr[@id="places_area__row"]/td[@class="w2p_fw"]/text()')[0]
9 | print(area)
10 |
--------------------------------------------------------------------------------
/code/chp2/lxml_scraper.py:
--------------------------------------------------------------------------------
1 | from lxml.html import fromstring
2 | from chp1.advanced_link_crawler import download
3 |
4 | url = 'http://example.webscraping.com/places/default/view/UnitedKingdom-239'
5 | html = download(url)
6 |
7 | tree = fromstring(html)
8 | td = tree.cssselect('tr#places_area__row > td.w2p_fw')[0]
9 | area = td.text_content()
10 | print(area)
11 |
--------------------------------------------------------------------------------
/code/chp8/example/example/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class ExamplePipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
--------------------------------------------------------------------------------
/code/chp1/downloading_a_page.py:
--------------------------------------------------------------------------------
1 | import urllib.request
2 | from urllib.error import URLError, HTTPError, ContentTooShortError
3 |
4 |
5 | def download(url):
6 | print('Downloading:', url)
7 | try:
8 | html = urllib.request.urlopen(url).read()
9 | except (URLError, HTTPError, ContentTooShortError) as e:
10 | print('Download error:', e.reason)
11 | html = None
12 | return html
13 |
--------------------------------------------------------------------------------
/code/chp5/pyqt_search_browser_render.py:
--------------------------------------------------------------------------------
1 | from chp5.browser_render import BrowserRender
2 |
3 | br = BrowserRender()
4 | br.download('http://example.webscraping.com/search')
5 | br.attr('#search_term', 'value', '.')
6 | br.text('#page_size option:checked', '1000')
7 | br.click('#search')
8 | elements = br.wait_load('#results a')
9 |
10 | countries = [e.toPlainText().strip() for e in elements]
11 | print(countries)
12 |
--------------------------------------------------------------------------------
/code/chp9/gap_scraper_callback.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from lxml import etree
3 |
4 | def scrape_callback(url, html):
5 | if url.endswith('.xml'):
6 | # Parse the sitemap XML file
7 | resp = requests.get(url)
8 | tree = etree.fromstring(resp.content)
9 | links = [e[0].text for e in tree]
10 | return links
11 | else:
12 | # Add scraping code here
13 | pass
14 |
--------------------------------------------------------------------------------
/code/chp2/beautifulsoup.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | from chp1.advanced_link_crawler import download
3 |
4 | url = 'http://example.webscraping.com/places/default/view/UnitedKingdom-239'
5 | html = download(url)
6 | soup = BeautifulSoup(html, 'html5lib')
7 |
8 | # locate the area row
9 | tr = soup.find(attrs={'id': 'places_area__row'})
10 | td = tr.find(attrs={'class': 'w2p_fw'}) # locate the data
11 | area = td.text # extract the data
12 | print(area)
13 |
--------------------------------------------------------------------------------
/code/chp6/submit_login_form.py:
--------------------------------------------------------------------------------
1 | from urllib.parse import urlencode
2 | from urllib.request import Request, urlopen
3 |
4 | LOGIN_URL = 'http://example.webscraping.com/user/login'
5 | LOGIN_EMAIL = 'example@webscraping.com'
6 | LOGIN_PASSWORD = 'example'
7 | data = {'email': LOGIN_EMAIL, 'password': LOGIN_PASSWORD}
8 | encoded_data = urlencode(data)
9 | request = Request(LOGIN_URL, encoded_data.encode('utf-8'))
10 | response = urlopen(request)
11 | print(response.geturl())
12 |
--------------------------------------------------------------------------------
/code/chp2/family_trees.py:
--------------------------------------------------------------------------------
1 | from lxml.html import fromstring
2 | from chp1.advanced_link_crawler import download
3 |
4 | url = 'http://example.webscraping.com/places/default/view/UnitedKingdom-239'
5 | html = download(url)
6 |
7 | tree = fromstring(html)
8 | table = tree.xpath('//table')[0]
9 |
10 | print('Children:', table.getchildren())
11 | print('Parent:', table.getparent())
12 | print('Previous Sibling:', table.getprevious())
13 | print('Next Sibling:', table.getnext())
14 | print('All Siblings:', list(table.itersiblings()))
15 |
--------------------------------------------------------------------------------
/code/chp4/extract_list.py:
--------------------------------------------------------------------------------
1 | import csv
2 | from zipfile import ZipFile
3 | from io import TextIOWrapper, BytesIO
4 | import requests
5 |
6 | resp = requests.get('http://s3.amazonaws.com/alexa-static/top-1m.csv.zip', stream=True)
7 | urls = [] # top 1 million URL's will be stored in this list
8 | with ZipFile(BytesIO(resp.content)) as zf:
9 | csv_filename = zf.namelist()[0]
10 | with zf.open(csv_filename) as csv_file:
11 | for _, website in csv.reader(TextIOWrapper(csv_file)):
12 | urls.append('http://' + website)
13 |
--------------------------------------------------------------------------------
/code/chp5/json_one_req.py:
--------------------------------------------------------------------------------
1 | from csv import DictWriter
2 | import requests
3 |
4 |
5 | PAGE_SIZE = 1000
6 |
7 | template_url = 'http://example.webscraping.com/ajax/' + \
8 | 'search.json?page=0&page_size={}&search_term=.'
9 |
10 | resp = requests.get(template_url.format(PAGE_SIZE))
11 | data = resp.json()
12 | records = data.get('records')
13 |
14 | with open('../data/countries.csv', 'w') as countries_file:
15 | wrtr = DictWriter(countries_file, fieldnames=records[0].keys())
16 | wrtr.writeheader()
17 | wrtr.writerows(records)
18 |
--------------------------------------------------------------------------------
/code/chp5/selenium_search.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 |
3 | driver = webdriver.Firefox()
4 | driver.get('http://example.webscraping.com/search')
5 | driver.find_element_by_id('search_term').send_keys('.')
6 | js = "document.getElementById('page_size').options[1].text = '1000';"
7 | driver.execute_script(js)
8 | driver.find_element_by_id('search').click()
9 | driver.implicitly_wait(30)
10 | links = driver.find_elements_by_css_selector('#results a')
11 | countries = [link.text for link in links]
12 | print(countries)
13 |
14 | driver.close()
15 |
--------------------------------------------------------------------------------
/code/chp6/mechanize_form.py:
--------------------------------------------------------------------------------
1 | import mechanize
2 |
3 | LOGIN_URL = 'http://example.webscraping.com/user/login'
4 | LOGIN_EMAIL = 'example@webscraping.com'
5 | LOGIN_PASSWORD = 'example'
6 | COUNTRY_URL = 'http://example.webscraping.com/places/default/edit/United-Kingdom-239'
7 |
8 |
9 | br = mechanize.Browser()
10 | br.open(LOGIN_URL)
11 | br.select_form(nr=0)
12 | br['email'] = LOGIN_EMAIL
13 | br['password'] = LOGIN_PASSWORD
14 | response = br.submit()
15 | br.open(COUNTRY_URL)
16 | br.select_form(nr=0)
17 | br['population'] = str(int(br['population']) + 1)
18 | br.submit()
19 |
--------------------------------------------------------------------------------
/code/chp2/regex.py:
--------------------------------------------------------------------------------
1 | import re
2 | from chp1.advanced_link_crawler import download
3 |
4 | url = 'http://example.webscraping.com/places/default/view/UnitedKingdom-239'
5 | html = download(url)
6 |
7 | print(re.findall(r'<td class="w2p_fw">(.*?)</td>', html))
8 | 
9 | print(re.findall('<td class="w2p_fw">(.*?)</td>', html)[1])
10 | 
11 | print(re.findall('<tr id="places_area__row"><td class="w2p_fl"><label for="places_area" id="places_area__label">Area: </label></td><td class="w2p_fw">(.*?)</td>', html))
12 | 
13 | print(re.findall('''<tr id="places_area__row">.*?<td\s*class=["\']w2p_fw["\']>(.*?)</td>''', html))
14 |
--------------------------------------------------------------------------------
/code/chp1/retrying_downloads.py:
--------------------------------------------------------------------------------
1 | import urllib.request
2 | from urllib.error import URLError, HTTPError, ContentTooShortError
3 |
4 |
5 | def download(url, num_retries=2):
6 | print('Downloading:', url)
7 | try:
8 | html = urllib.request.urlopen(url).read()
9 | except (URLError, HTTPError, ContentTooShortError) as e:
10 | print('Download error:', e.reason)
11 | html = None
12 | if num_retries > 0:
13 | if hasattr(e, 'code') and 500 <= e.code < 600:
14 | # recursively retry 5xx HTTP errors
15 | return download(url, num_retries - 1)
16 | return html
17 |
--------------------------------------------------------------------------------
/code/chp9/facebook_graph.py:
--------------------------------------------------------------------------------
1 | from facebook import GraphAPI
2 | from configparser import ConfigParser
3 |
4 |
5 | def get_page_details(access_token, page):
6 | graph = GraphAPI(access_token, version='2.7')
7 | return graph.get_object(page, fields='about,events,feed,picture')
8 |
9 |
10 | if __name__ == '__main__':
11 | config = ConfigParser()
12 | # This script assumes you have the following config
13 | # set up with a section facebook and key access_token
14 | config.read('../../config/api.cfg')
15 | access_token = config.get('facebook', 'access_token')
16 | print(get_page_details(access_token, 'PacktPub'))
17 |
--------------------------------------------------------------------------------
/code/chp5/pyqt_webkit.py:
--------------------------------------------------------------------------------
1 | import lxml.html
2 | try:
3 | from PySide.QtGui import *
4 | from PySide.QtCore import *
5 | from PySide.QtWebKit import *
6 | except ImportError:
7 | from PyQt4.QtGui import *
8 | from PyQt4.QtCore import *
9 | from PyQt4.QtWebKit import *
10 |
11 | url = 'http://example.webscraping.com/dynamic'
12 | app = QApplication([])
13 | webview = QWebView()
14 | loop = QEventLoop()
15 | webview.loadFinished.connect(loop.quit)
16 | webview.load(QUrl(url))
17 | loop.exec_()
18 | html = webview.page().mainFrame().toHtml()
19 | tree = lxml.html.fromstring(html)
20 | print(tree.cssselect('#result')[0].text_content())
21 |
--------------------------------------------------------------------------------
/code/chp2/beautifulsoup_brokenhtml.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | from chp1.advanced_link_crawler import download
3 |
4 | broken_html = '<ul class=country><li>Area<li>Population</ul>'
5 |
6 | soup = BeautifulSoup(broken_html, 'html.parser')
7 | fixed_html = soup.prettify()
8 | print(fixed_html)
9 |
10 | # still broken, so try a different parser
11 |
12 | soup = BeautifulSoup(broken_html, 'html5lib')
13 | fixed_html = soup.prettify()
14 | print(fixed_html)
15 |
16 | # now we can try and extract the data from the html
17 |
18 | ul = soup.find('ul', attrs={'class': 'country'})
19 | print(ul.find('li')) # returns just the first match
20 | print(ul.find_all('li')) # returns all matches
21 |
--------------------------------------------------------------------------------
/code/chp1/setting_user_agent.py:
--------------------------------------------------------------------------------
1 | import urllib.request
2 | from urllib.error import URLError, HTTPError, ContentTooShortError
3 |
4 |
5 | def download(url, num_retries=2, user_agent='wswp'):
6 | print('Downloading:', url)
7 | request = urllib.request.Request(url)
8 | request.add_header('User-agent', user_agent)
9 | try:
10 | html = urllib.request.urlopen(request).read()
11 | except (URLError, HTTPError, ContentTooShortError) as e:
12 | print('Download error:', e.reason)
13 | html = None
14 | if num_retries > 0:
15 | if hasattr(e, 'code') and 500 <= e.code < 600:
16 | # recursively retry 5xx HTTP errors
17 | return download(url, num_retries - 1)
18 | return html
19 |
--------------------------------------------------------------------------------
/code/chp9/scrape_google.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from urllib.parse import parse_qs, urlparse
3 | from lxml.html import fromstring
4 |
5 | # get results from search
6 | html = requests.get('https://www.google.com/search?q=test')
7 | tree = fromstring(html.content)
8 | results = tree.cssselect('h3.r a')
9 | print(results)
10 |
11 | # grab the first link
12 | link = results[0].get('href')
13 | print(link)
14 |
15 | # parse the destination url from the querystring
16 | qs = urlparse(link).query
17 | parsed_qs = parse_qs(qs)
18 | print(parsed_qs)
19 | print(parsed_qs.get('q', []))
20 |
21 |
22 | # as one list
23 | links = []
24 | for result in results:
25 | link = result.get('href')
26 | qs = urlparse(link).query
27 | links.extend(parse_qs(qs).get('q', []))
28 |
29 | print(links)
30 |
--------------------------------------------------------------------------------
/code/chp7/image_processing.py:
--------------------------------------------------------------------------------
1 | from io import BytesIO
2 | from lxml.html import fromstring
3 | from PIL import Image
4 | import base64
5 |
6 |
7 | def get_b64_string(html):
8 | tree = fromstring(html)
9 | img_data = tree.cssselect('div#recaptcha img')[0].get('src')
10 | img_data = img_data.partition(',')[-1]
11 | return img_data
12 |
13 |
14 | def get_captcha_img(html):
15 | tree = fromstring(html)
16 | img_data = tree.cssselect('div#recaptcha img')[0].get('src')
17 | img_data = img_data.partition(',')[-1]
18 | binary_img_data = base64.b64decode(img_data)
19 | img = Image.open(BytesIO(binary_img_data))
20 | return img
21 |
22 |
23 | def img_to_bw(img):
24 | gray = img.convert('L')
25 | bw = gray.point(lambda x: 0 if x < 1 else 255, '1')
26 | return bw
27 |
--------------------------------------------------------------------------------
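For orientation, a hypothetical chaining of the helpers above on a registration page's HTML (the html variable would come from elsewhere, e.g. a requests session; register_with_ocr.py in this chapter presumably does the real work):

    # html = session.get(REGISTER_URL).text   # fetched elsewhere (assumed)
    img = get_captcha_img(html)   # decode the base64-encoded <img> src into a PIL Image
    bw = img_to_bw(img)           # threshold to pure black and white before OCR
    bw.save('captcha_bw.png')     # inspect the cleaned-up CAPTCHA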
/code/chp9/facebook_selenium.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 |
3 |
4 | def get_driver():
5 | try:
6 | return webdriver.PhantomJS()
7 | except:
8 | return webdriver.Firefox()
9 |
10 |
11 | def facebook(username, password, url):
12 | driver = get_driver()
13 | driver.get('https://facebook.com')
14 | driver.find_element_by_id('email').send_keys(username)
15 | driver.find_element_by_id('pass').send_keys(password)
16 | driver.find_element_by_id('loginbutton').submit()
17 | driver.implicitly_wait(30)
18 | # wait until the search box is available,
19 | # which means it has successfully logged in
20 | search = driver.find_element_by_name('q')
21 | # now logged in so can go to the page of interest
22 | driver.get(url)
23 | # add code to scrape data of interest here ...
24 |
--------------------------------------------------------------------------------
/code/chp4/alexa_callback.py:
--------------------------------------------------------------------------------
1 | import csv
2 | from zipfile import ZipFile
3 | from io import TextIOWrapper, BytesIO
4 | import requests
5 |
6 |
7 | class AlexaCallback:
8 | def __init__(self, max_urls=500):
9 | self.max_urls = max_urls
10 | self.seed_url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip'
11 | self.urls = []
12 |
13 | def __call__(self):
14 | resp = requests.get(self.seed_url, stream=True)
15 | with ZipFile(BytesIO(resp.content)) as zf:
16 | csv_filename = zf.namelist()[0]
17 | with zf.open(csv_filename) as csv_file:
18 | for _, website in csv.reader(TextIOWrapper(csv_file)):
19 | self.urls.append('http://' + website)
20 | if len(self.urls) == self.max_urls:
21 | break
22 |
--------------------------------------------------------------------------------
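A short usage sketch for AlexaCallback (illustrative only; in the repo the chp4 threaded crawlers are the intended consumers, and the wiring below is an assumption):

    from chp4.alexa_callback import AlexaCallback

    alexa = AlexaCallback(max_urls=10)
    alexa()                  # download and unpack the Alexa top-1m CSV
    print(len(alexa.urls))   # at most 10 seed URLs
    print(alexa.urls[:3])    # e.g. hand these to a crawler or push them onto a queue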
/code/chp9/bmw_scraper.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import json
3 | import requests
4 |
5 |
6 | url = 'https://c2b-services.bmw.com/c2b-localsearch/services/api/v3/clients/BMWDIGITAL_DLO/DE/pois?country=DE&category=BM&maxResults=%d&language=en&lat=52.507537768880056&lng=13.425269635701511'
7 | jsonp = requests.get(url % 1000)
8 | pure_json = jsonp.text[jsonp.text.index('(') + 1: jsonp.text.rindex(')')]
9 | dealers = json.loads(pure_json)
10 | print(dealers.keys())
11 | print(dealers['count'])
12 | print(dealers['data']['pois'][0])
13 |
14 | with open('../../data/bmw.csv', 'w') as fp:
15 | writer = csv.writer(fp)
16 | writer.writerow(['Name', 'Latitude', 'Longitude'])
17 | for dealer in dealers['data']['pois']:
18 | name = dealer['name']
19 | lat, lng = dealer['lat'], dealer['lng']
20 | writer.writerow([name, lat, lng])
21 |
--------------------------------------------------------------------------------
/code/chp5/json_scraper.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import string
3 |
4 | PAGE_SIZE = 10
5 |
6 | template_url = 'http://example.webscraping.com/ajax/' + \
7 | 'search.json?page={}&page_size={}&search_term={}'
8 |
9 | countries = set()
10 |
11 | for letter in string.ascii_lowercase:
12 | print('Searching with %s' % letter)
13 | page = 0
14 | while True:
15 | resp = requests.get(template_url.format(page, PAGE_SIZE, letter))
16 | data = resp.json()
17 | print('adding %d more records from page %d' %
18 | (len(data.get('records')), page))
19 | for record in data.get('records'):
20 | countries.add(record['country'])
21 | page += 1
22 | if page >= data['num_pages']:
23 | break
24 |
25 | with open('../data/countries.txt', 'w') as countries_file:
26 | countries_file.write('\n'.join(sorted(countries)))
27 |
--------------------------------------------------------------------------------
/code/chp2/csv_callback.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import re
3 | from lxml.html import fromstring
4 |
5 |
6 | class CsvCallback:
7 | def __init__(self):
8 | self.writer = csv.writer(open('../data/countries.csv', 'w'))
9 | self.fields = ('area', 'population', 'iso', 'country', 'capital',
10 | 'continent', 'tld', 'currency_code', 'currency_name',
11 | 'phone', 'postal_code_format', 'postal_code_regex',
12 | 'languages', 'neighbours')
13 | self.writer.writerow(self.fields)
14 |
15 | def __call__(self, url, html):
16 | if re.search('/view/', url):
17 | tree = fromstring(html)
18 | all_rows = [
19 | tree.xpath('//tr[@id="places_%s__row"]/td[@class="w2p_fw"]' % field)[0].text_content()
20 | for field in self.fields]
21 | self.writer.writerow(all_rows)
22 |
--------------------------------------------------------------------------------
/code/chp1/throttle.py:
--------------------------------------------------------------------------------
1 | from urllib.parse import urlparse
2 | import time
3 |
4 |
5 | class Throttle:
6 | """ Add a delay between downloads to the same domain
7 | """
8 | def __init__(self, delay):
9 | # amount of delay between downloads for each domain
10 | self.delay = delay
11 | # timestamp of when a domain was last accessed
12 | self.domains = {}
13 |
14 | def wait(self, url):
15 | domain = urlparse(url).netloc
16 | last_accessed = self.domains.get(domain)
17 |
18 | if self.delay > 0 and last_accessed is not None:
19 | sleep_secs = self.delay - (time.time() - last_accessed)
20 | if sleep_secs > 0:
21 | # domain has been accessed recently
22 | # so need to sleep
23 | time.sleep(sleep_secs)
24 | # update the last accessed time
25 | self.domains[domain] = time.time()
26 |
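27 | # A minimal usage sketch (editor's note, not part of the original file); the
28 | # delay value and URL are illustrative:
29 | # throttle = Throttle(delay=2)
30 | # throttle.wait('http://example.webscraping.com/')  # first call returns immediately
31 | # throttle.wait('http://example.webscraping.com/')  # second call sleeps ~2 seconds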
--------------------------------------------------------------------------------
/code/chp3/url_parsing.py:
--------------------------------------------------------------------------------
1 | import re
2 | from urllib.parse import urlsplit
3 |
4 | # how to manage converting urls into filenames
5 |
6 | url = 'http://example.webscraping.com/places/default/view/Australia-1'
7 | filename = re.sub(r'[^/0-9a-zA-Z\-.,;_ ]', '_', url)
8 | filename = '/'.join(segment[:255] for segment in filename.split('/'))
9 | print(filename)
10 |
11 | # how to handle edge case where we need to append index.html for parent urls
12 | # such as http://example.webscraping.com/index/
13 |
14 | components = urlsplit('http://example.webscraping.com/index/')
15 | print(components)
16 | print(components.path)
17 | path = components.path
18 | if not path:
19 | path = '/index.html'
20 | elif path.endswith('/'):
21 | path += 'index.html'
22 | filename = components.netloc + path + components.query
23 | filename = re.sub(r'[^/0-9a-zA-Z\-.,;_ ]', '_', filename)
24 | filename = '/'.join(segment[:255] for segment in filename.split('/'))
25 | print(filename)
26 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | # C extensions
6 | *.so
7 |
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 |
26 | # PyInstaller
27 | # Usually these files are written by a python script from a template
28 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 |
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 |
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .coverage.*
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 | *,cover
45 |
46 | # Translations
47 | *.mo
48 | *.pot
49 |
50 | # Django stuff:
51 | *.log
52 |
53 | # Sphinx documentation
54 | docs/_build/
55 |
56 | # PyBuilder
57 | target/
58 |
59 | *~
60 | */.*~
61 | .*/
62 | *.rdb
63 | config/
64 |
--------------------------------------------------------------------------------
/code/chp8/example/example/spiders/country.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from scrapy.linkextractors import LinkExtractor
4 | from scrapy.spiders import CrawlSpider, Rule
5 | from example.items import CountryItem
6 |
7 |
8 | class CountrySpider(CrawlSpider):
9 | name = 'country'
10 | allowed_domains = ['example.webscraping.com']
11 | start_urls = ['http://example.webscraping.com/']
12 |
13 | rules = (
14 | Rule(LinkExtractor(allow=r'/index/', deny=r'/user/'),
15 | follow=True),
16 | Rule(LinkExtractor(allow=r'/view/', deny=r'/user/'),
17 | callback='parse_item'),
18 | )
19 |
20 | def parse_item(self, response):
21 | item = CountryItem()
22 | name_css = 'tr#places_country__row td.w2p_fw::text'
23 | item['name'] = response.css(name_css).extract()
24 | pop_xpath = '//tr[@id="places_population__row"]/td[@class="w2p_fw"]/text()'
25 | item['population'] = response.xpath(pop_xpath).extract()
26 | return item
27 |
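28 | # Editor's note: this spider can be run from the chp8/example project directory
29 | # with the standard Scrapy CLI, for example:
30 | #   scrapy crawl country -o countries.csv
31 | # (the output filename is illustrative, not part of the original code).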
--------------------------------------------------------------------------------
/code/chp2/test_scrapers.py:
--------------------------------------------------------------------------------
1 | import time
2 | import re
3 | from chp2.all_scrapers import re_scraper, bs_scraper, \
4 | lxml_scraper, lxml_xpath_scraper
5 | from chp1.advanced_link_crawler import download
6 |
7 | NUM_ITERATIONS = 1000 # number of times to test each scraper
8 | html = download('http://example.webscraping.com/places/view/United-Kingdom-239')
9 |
10 | scrapers = [
11 | ('Regular expressions', re_scraper),
12 | ('BeautifulSoup', bs_scraper),
13 | ('Lxml', lxml_scraper),
14 | ('Xpath', lxml_xpath_scraper)]
15 |
16 | for name, scraper in scrapers:
17 | # record start time of scrape
18 | start = time.time()
19 | for i in range(NUM_ITERATIONS):
20 | if scraper == re_scraper:
21 | re.purge()
22 | result = scraper(html)
23 | # check scraped result is as expected
24 | assert result['area'] == '244,820 square kilometres'
25 | # record end time of scrape and output the total
26 | end = time.time()
27 | print('%s: %.2f seconds' % (name, end - start))
28 |
--------------------------------------------------------------------------------
/code/chp7/test_samples.py:
--------------------------------------------------------------------------------
1 | import os
2 | from PIL import Image
3 | from csv import reader
4 | from chp7.register_with_ocr import ocr
5 | from chp7.image_processing import img_to_bw
6 |
7 | SAMPLES_DIR = os.path.join(
8 | os.path.dirname(os.path.realpath(__file__)),
9 | '..', '..', 'data', 'captcha_samples')
10 |
11 |
12 | def get_rdr(samples_folder=SAMPLES_DIR):
13 | return reader(open(os.path.join(samples_folder, 'samples.csv')))
14 |
15 |
16 | def test_samples(samples_folder=SAMPLES_DIR):
17 | rdr = get_rdr(samples_folder=samples_folder)
18 | results = {'correct': 0, 'incorrect': 0}
19 | for fname, txt in rdr:
20 | img = Image.open(os.path.join(samples_folder, fname))
21 | captcha = ocr(img)
22 | if captcha == txt:
23 | results['correct'] += 1
24 | else:
25 | results['incorrect'] += 1
26 |     print('accuracy: {}%'.format(100.0 * results['correct'] / (results['correct'] + results['incorrect'])))
27 | print('results: ', results)
28 | return results
29 |
30 | if __name__ == '__main__':
31 | test_samples()
32 |
--------------------------------------------------------------------------------
/code/chp5/pyqt_search.py:
--------------------------------------------------------------------------------
1 | try:
2 | from PySide.QtGui import *
3 | from PySide.QtCore import *
4 | from PySide.QtWebKit import *
5 | except ImportError:
6 | from PyQt4.QtGui import *
7 | from PyQt4.QtCore import *
8 | from PyQt4.QtWebKit import *
9 |
10 |
11 | app = QApplication([])
12 | webview = QWebView()
13 | loop = QEventLoop()
14 | webview.loadFinished.connect(loop.quit)
15 | webview.load(QUrl('http://example.webscraping.com/search'))
16 | loop.exec_()
17 | webview.show()
18 | frame = webview.page().mainFrame()
19 | frame.findFirstElement('#search_term').setAttribute('value', '.')
20 | frame.findFirstElement('#page_size option:checked').setPlainText('1000')
21 | frame.findFirstElement('#search').evaluateJavaScript('this.click()')
22 | # app.exec_()  # uncomment to run the Qt event loop (this call blocks)
23 |
24 | elements = None
25 | while not elements:
26 | app.processEvents()
27 | elements = frame.findAllElements('#results a')
28 |
29 |
30 | countries = [e.toPlainText().strip() for e in elements]
31 | print(countries)
32 |
--------------------------------------------------------------------------------
/code/chp6/login.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from lxml.html import fromstring
3 |
4 |
5 | LOGIN_URL = 'http://example.webscraping.com/user/login'
6 | LOGIN_EMAIL = 'example@webscraping.com'
7 | LOGIN_PASSWORD = 'example'
8 |
9 |
10 | def parse_form(html):
11 | tree = fromstring(html)
12 | data = {}
13 | for e in tree.cssselect('form input'):
14 | if e.get('name'):
15 | data[e.get('name')] = e.get('value')
16 | return data
17 |
18 |
19 | def login(session=None):
20 | """ Login to example website.
21 | params:
22 |             session: requests library Session object or None
23 | returns tuple(response, session)
24 | """
25 | if session is None:
26 | html = requests.get(LOGIN_URL)
27 | else:
28 | html = session.get(LOGIN_URL)
29 | data = parse_form(html.content)
30 | data['email'] = LOGIN_EMAIL
31 | data['password'] = LOGIN_PASSWORD
32 | if session is None:
33 | response = requests.post(LOGIN_URL, data, cookies=html.cookies)
34 | else:
35 | response = session.post(LOGIN_URL, data)
36 | assert 'login' not in response.url
37 | return response, session
38 |
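39 | # A minimal usage sketch (editor's note, not part of the original file):
40 | # import requests
41 | # response, session = login(session=requests.Session())
42 | # # the authenticated session can then be reused, e.g. with the edit URL from chp6/edit.py:
43 | # country_html = session.get('http://example.webscraping.com/places/default/edit/United-Kingdom-239')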
--------------------------------------------------------------------------------
/code/chp1/sitemap_crawler.py:
--------------------------------------------------------------------------------
1 | import urllib.request
2 | import re
3 |
4 | from urllib.error import URLError, HTTPError, ContentTooShortError
5 |
6 |
7 | def download(url, num_retries=2, user_agent='wswp', charset='utf-8'):
8 | print('Downloading:', url)
9 | request = urllib.request.Request(url)
10 | request.add_header('User-agent', user_agent)
11 | try:
12 | resp = urllib.request.urlopen(request)
13 | cs = resp.headers.get_content_charset()
14 | if not cs:
15 | cs = charset
16 | html = resp.read().decode(cs)
17 | except (URLError, HTTPError, ContentTooShortError) as e:
18 | print('Download error:', e.reason)
19 | html = None
20 | if num_retries > 0:
21 | if hasattr(e, 'code') and 500 <= e.code < 600:
22 | # recursively retry 5xx HTTP errors
23 | return download(url, num_retries - 1)
24 | return html
25 |
26 |
27 | def crawl_sitemap(url):
28 | # download the sitemap file
29 | sitemap = download(url)
30 | # extract the sitemap links
31 |     links = re.findall('<loc>(.*?)</loc>', sitemap)
32 | # download each link
33 | for link in links:
34 | html = download(link)
35 | # scrape html here
36 |
--------------------------------------------------------------------------------
/code/chp1/id_iteration_crawler.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | import urllib.request
3 | from urllib.error import URLError, HTTPError, ContentTooShortError
4 |
5 |
6 | def download(url, num_retries=2, user_agent='wswp', charset='utf-8'):
7 | print('Downloading:', url)
8 | request = urllib.request.Request(url)
9 | request.add_header('User-agent', user_agent)
10 | try:
11 | resp = urllib.request.urlopen(request)
12 | cs = resp.headers.get_content_charset()
13 | if not cs:
14 | cs = charset
15 | html = resp.read().decode(cs)
16 | except (URLError, HTTPError, ContentTooShortError) as e:
17 | print('Download error:', e.reason)
18 | html = None
19 | if num_retries > 0:
20 | if hasattr(e, 'code') and 500 <= e.code < 600:
21 | # recursively retry 5xx HTTP errors
22 | return download(url, num_retries - 1)
23 | return html
24 |
25 |
26 | def crawl_site(url, max_errors=5):
27 | num_errors = 0
28 | for page in itertools.count(1):
29 | pg_url = '{}{}'.format(url, page)
30 | html = download(pg_url)
31 | if html is None:
32 | num_errors += 1
33 | if num_errors == max_errors:
34 | # reached max number of errors, so exit
35 | break
36 | else:
37 | num_errors = 0
38 | # success - can scrape the result
39 |
--------------------------------------------------------------------------------
/code/chp7/register_with_ocr.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import string
3 | import pytesseract
4 | from lxml.html import fromstring
5 | from chp6.login import parse_form
6 | from chp7.image_processing import get_captcha_img, img_to_bw
7 |
8 | REGISTER_URL = 'http://example.webscraping.com/user/register'
9 |
10 |
11 | def register(first_name, last_name, email, password):
12 | session = requests.Session()
13 | html = session.get(REGISTER_URL)
14 | form = parse_form(html.content)
15 | form['first_name'] = first_name
16 | form['last_name'] = last_name
17 | form['email'] = email
18 | form['password'] = form['password_two'] = password
19 | img = get_captcha_img(html.content)
20 | captcha = ocr(img)
21 | form['recaptcha_response_field'] = captcha
22 | resp = session.post(html.url, form)
23 | success = '/user/register' not in resp.url
24 | if not success:
25 | form_errors = fromstring(resp.content).cssselect('div.error')
26 | print('Form Errors:')
27 | print('\n'.join(
28 | (' {}: {}'.format(f.get('id'), f.text) for f in form_errors)))
29 | return success
30 |
31 |
32 | def ocr(img):
33 | bw = img_to_bw(img)
34 | captcha = pytesseract.image_to_string(bw)
35 | cleaned = ''.join(c for c in captcha.lower() if c in string.ascii_lowercase)
36 | if len(cleaned) != len(captcha):
37 | print('removed bad characters: {}'.format(set(captcha) - set(cleaned)))
38 | return cleaned
39 |
--------------------------------------------------------------------------------
/code/chp7/register_with_api.py:
--------------------------------------------------------------------------------
1 | from configparser import ConfigParser
2 | import requests
3 | from lxml.html import fromstring
4 | from chp6.login import parse_form
5 | from chp7.image_processing import get_captcha_img
6 | from chp7.captcha_api import CaptchaAPI
7 |
8 | REGISTER_URL = 'http://example.webscraping.com/user/register'
9 |
10 |
11 | def get_api_key():
12 | config = ConfigParser()
13 | config.read('../config/api.cfg')
14 | return config.get('captcha_api', 'key')
15 |
16 |
17 | def register(first_name, last_name, email, password):
18 | session = requests.Session()
19 | html = session.get(REGISTER_URL)
20 | form = parse_form(html.content)
21 | form['first_name'] = first_name
22 | form['last_name'] = last_name
23 | form['email'] = email
24 | form['password'] = form['password_two'] = password
25 | api_key = get_api_key()
26 | img = get_captcha_img(html.content)
27 | api = CaptchaAPI(api_key)
28 | captcha_id, captcha = api.solve(img)
29 | form['recaptcha_response_field'] = captcha
30 | resp = session.post(html.url, form)
31 | success = '/user/register' not in resp.url
32 | if success:
33 | api.report(captcha_id, 1)
34 | else:
35 | form_errors = fromstring(resp.content).cssselect('div.error')
36 | print('Form Errors:')
37 | print('\n'.join(
38 | (' {}: {}'.format(f.get('id'), f.text) for f in form_errors)))
39 | if 'invalid' in [f.text for f in form_errors]:
40 | api.report(captcha_id, 0)
41 | return success
42 |
--------------------------------------------------------------------------------
/code/chp6/firefox_sessions.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import glob
4 | import requests
5 |
6 | from lxml.html import fromstring
7 |
8 |
9 | def find_ff_sessions():
10 | paths = [
11 | '~/.mozilla/firefox/*.default',
12 | '~/Library/Application Support/Firefox/Profiles/*.default',
13 | '%APPDATA%/Roaming/Mozilla/Firefox/Profiles/*.default'
14 | ]
15 | for path in paths:
16 | filename = os.path.join(path, 'sessionstore.js')
17 | matches = glob.glob(os.path.expanduser(filename))
18 | if matches:
19 | return matches[0]
20 |
21 |
22 | def load_ff_sessions(session_filename):
23 | cookies = {}
24 | if os.path.exists(session_filename):
25 | json_data = json.loads(open(session_filename, 'rb').read())
26 | for window in json_data.get('windows', []):
27 | for cookie in window.get('cookies', []):
28 | cookies[cookie.get('name')] = cookie.get('value')
29 | else:
30 | print('Session filename does not exist:', session_filename)
31 | return cookies
32 |
33 |
34 | def session_login():
35 | session_filename = find_ff_sessions()
36 | assert session_filename is not None
37 | cookies = load_ff_sessions(session_filename)
38 | print('found cookies: ', cookies)
39 | url = 'http://example.webscraping.com'
40 | html = requests.get(url, cookies=cookies)
41 | tree = fromstring(html.content)
42 | print(tree.cssselect('ul#navbar li a')[0].text_content())
43 | return html
44 |
45 |
46 | if __name__ == '__main__':
47 | session_login()
48 |
--------------------------------------------------------------------------------
/code/chp2/all_scrapers.py:
--------------------------------------------------------------------------------
1 | import re
2 | from bs4 import BeautifulSoup
3 | from lxml.html import fromstring
4 |
5 | FIELDS = ('area', 'population', 'iso', 'country', 'capital',
6 | 'continent', 'tld', 'currency_code', 'currency_name',
7 | 'phone', 'postal_code_format', 'postal_code_regex',
8 | 'languages', 'neighbours')
9 |
10 |
11 | def re_scraper(html):
12 | """ Using regex to extract data from country pages. """
13 | results = {}
14 | for field in FIELDS:
15 | results[field] = re.search(
16 |             '<tr id="places_%s__row">.*?<td class="w2p_fw">(.*?)</td>'
17 |             % field, html).groups()[0]
18 | return results
19 |
20 |
21 | def bs_scraper(html):
22 | """ Using beautifulsoup to extract data from country pages. """
23 | soup = BeautifulSoup(html, 'html.parser')
24 | results = {}
25 | for field in FIELDS:
26 | results[field] = soup.find('table').find(
27 | 'tr', id='places_%s__row' % field).find(
28 | 'td', class_='w2p_fw').text
29 | return results
30 |
31 |
32 | def lxml_scraper(html):
33 | """ Using lxml and cssselect to extract data from country pages. """
34 | tree = fromstring(html)
35 | results = {}
36 | for field in FIELDS:
37 | results[field] = tree.cssselect(
38 | 'table > tr#places_%s__row > td.w2p_fw' % field)[0].text_content()
39 | return results
40 |
41 |
42 | def lxml_xpath_scraper(html):
43 | """ Using lxml and xpath to extract data from country pages. """
44 | tree = fromstring(html)
45 | results = {}
46 | for field in FIELDS:
47 | results[field] = tree.xpath(
48 | '//tr[@id="places_%s__row"]/td[@class="w2p_fw"]' % field)[0].text_content()
49 | return results
50 |
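51 | # A quick usage sketch (editor's note), assuming `html` is a downloaded country
52 | # page such as the United Kingdom page used in chp2/test_scrapers.py:
53 | # print(lxml_xpath_scraper(html)['area'])  # e.g. '244,820 square kilometres'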
--------------------------------------------------------------------------------
/code/chp3/rediscache.py:
--------------------------------------------------------------------------------
1 | import json
2 | import zlib
3 | from datetime import datetime, timedelta
4 | from redis import StrictRedis
5 |
6 |
7 | class RedisCache:
8 | """ RedisCache helps store urls and their responses to Redis
9 | Initialization components:
10 | client: a Redis client connected to the key-value database for
11 | the webcrawling cache (if not set, a localhost:6379
12 | default connection is used).
13 | expires (datetime.timedelta): timedelta when content will expire
14 |                 (default: 30 days)
15 | encoding (str): character encoding for serialization
16 | compress (bool): boolean indicating whether compression with zlib should be used
17 | """
18 | def __init__(self, client=None, expires=timedelta(days=30), encoding='utf-8', compress=True):
19 | self.client = (StrictRedis(host='localhost', port=6379, db=0)
20 | if client is None else client)
21 | self.expires = expires
22 | self.encoding = encoding
23 | self.compress = compress
24 |
25 | def __getitem__(self, url):
26 | """Load data from Redis for given URL"""
27 | record = self.client.get(url)
28 | if record:
29 | if self.compress:
30 | record = zlib.decompress(record)
31 | return json.loads(record.decode(self.encoding))
32 | else:
33 | # URL has not yet been cached
34 | raise KeyError(url + ' does not exist')
35 |
36 | def __setitem__(self, url, result):
37 | """Save data to Redis for given url"""
38 | data = bytes(json.dumps(result), self.encoding)
39 | if self.compress:
40 | data = zlib.compress(data)
41 | self.client.setex(url, self.expires, data)
42 |
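43 | # A minimal usage sketch (editor's note), assuming a Redis server on localhost:6379:
44 | # cache = RedisCache()
45 | # cache['http://example.webscraping.com/'] = {'html': '<html>...</html>', 'code': 200}
46 | # print(cache['http://example.webscraping.com/']['code'])  # -> 200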
--------------------------------------------------------------------------------
/code/chp6/selenium_forms.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from selenium.webdriver.common.keys import Keys
3 | from selenium.webdriver.common.by import By
4 | from selenium.webdriver.support.ui import WebDriverWait
5 | from selenium.webdriver.support import expected_conditions as EC
6 |
7 |
8 | LOGIN_URL = 'http://example.webscraping.com/user/login'
9 | LOGIN_EMAIL = 'example@webscraping.com'
10 | LOGIN_PASSWORD = 'example'
11 | COUNTRY_URL = 'http://example.webscraping.com/places/default/edit/United-Kingdom-239'
12 |
13 |
14 | def get_driver():
15 | try:
16 | return webdriver.PhantomJS()
17 | except Exception:
18 | return webdriver.Firefox()
19 |
20 |
21 | def login(driver):
22 | driver.get(LOGIN_URL)
23 | driver.find_element_by_id('auth_user_email').send_keys(LOGIN_EMAIL)
24 | driver.find_element_by_id('auth_user_password').send_keys(
25 | LOGIN_PASSWORD + Keys.RETURN)
26 | pg_loaded = WebDriverWait(driver, 10).until(
27 | EC.presence_of_element_located((By.ID, "results")))
28 | assert 'login' not in driver.current_url
29 |
30 |
31 | def add_population(driver):
32 | driver.get(COUNTRY_URL)
33 | population = driver.find_element_by_id('places_population')
34 | new_population = int(population.get_attribute('value')) + 1
35 | population.clear()
36 | population.send_keys(new_population)
37 | driver.find_element_by_xpath('//input[@type="submit"]').click()
38 | pg_loaded = WebDriverWait(driver, 10).until(
39 | EC.presence_of_element_located((By.ID, "places_population__row")))
40 | test_population = int(driver.find_element_by_css_selector(
41 | '#places_population__row .w2p_fw').text.replace(',', ''))
42 | assert test_population == new_population
43 |
44 |
45 | if __name__ == '__main__':
46 | driver = get_driver()
47 | login(driver)
48 | add_population(driver)
49 | driver.quit()
50 |
--------------------------------------------------------------------------------
/code/chp6/edit.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import json
3 | from lxml.html import fromstring
4 | from chp6.login import login, parse_form
5 |
6 | COUNTRY_URL = 'http://example.webscraping.com/places/default/edit/United-Kingdom-239'
7 | VIEW_URL = 'http://example.webscraping.com/places/default/view/United-Kingdom-239'
8 |
9 |
10 | def get_population():
11 | html = requests.get(VIEW_URL)
12 | tree = fromstring(html.content)
13 | population = tree.cssselect(
14 | '#places_population__row .w2p_fw')[0].text_content()
15 | return int(population.replace(',', ''))
16 |
17 |
18 | def add_population():
19 | session = requests.Session()
20 | response, session = login(session=session)
21 | country_html = session.get(COUNTRY_URL)
22 | data = parse_form(country_html.content)
23 | print('population is: ', data['population'])
24 | data['population'] = int(data['population']) + 1
25 | response = session.post(COUNTRY_URL, data=data)
26 | test_population = get_population()
27 | print('population is now:', test_population)
28 | assert test_population == data['population']
29 |
30 |
31 | def get_currency():
32 | html = requests.get(VIEW_URL)
33 | tree = fromstring(html.content)
34 | currency = tree.cssselect(
35 | '#places_currency_name__row .w2p_fw')[0].text_content()
36 | return currency
37 |
38 |
39 | def change_currency():
40 | session = requests.Session()
41 | response, session = login(session=session)
42 | country_html = session.get(COUNTRY_URL)
43 | data = parse_form(country_html.content)
44 | print('currency is: ', data['currency_name'])
45 | data['currency_name'] = 'British pounds'
46 | response = session.post(COUNTRY_URL, data=data)
47 | test_currency = get_currency()
48 | print('currency is now: ', test_currency)
49 | assert test_currency == data['currency_name']
50 |
51 |
52 | if __name__ == '__main__':
53 | add_population()
54 |
--------------------------------------------------------------------------------
/code/chp1/link_crawler.py:
--------------------------------------------------------------------------------
1 | import re
2 | import urllib.request
3 | from urllib.parse import urljoin
4 | from urllib.error import URLError, HTTPError, ContentTooShortError
5 |
6 |
7 | def download(url, num_retries=2, user_agent='wswp', charset='utf-8'):
8 | print('Downloading:', url)
9 | request = urllib.request.Request(url)
10 | request.add_header('User-agent', user_agent)
11 | try:
12 | resp = urllib.request.urlopen(request)
13 | cs = resp.headers.get_content_charset()
14 | if not cs:
15 | cs = charset
16 | html = resp.read().decode(cs)
17 | except (URLError, HTTPError, ContentTooShortError) as e:
18 | print('Download error:', e.reason)
19 | html = None
20 | if num_retries > 0:
21 | if hasattr(e, 'code') and 500 <= e.code < 600:
22 | # recursively retry 5xx HTTP errors
23 | return download(url, num_retries - 1)
24 | return html
25 |
26 |
27 | def link_crawler(start_url, link_regex):
28 | " Crawl from the given start URL following links matched by link_regex "
29 | crawl_queue = [start_url]
30 |     # keep track of which URLs have been seen before
31 | seen = set(crawl_queue)
32 | while crawl_queue:
33 | url = crawl_queue.pop()
34 | html = download(url)
35 | if not html:
36 | continue
37 | # filter for links matching our regular expression
38 | for link in get_links(html):
39 | if re.match(link_regex, link):
40 | abs_link = urljoin(start_url, link)
41 | if abs_link not in seen:
42 | seen.add(abs_link)
43 | crawl_queue.append(abs_link)
44 |
45 |
46 | def get_links(html):
47 | " Return a list of links from html "
48 | # a regular expression to extract all links from the webpage
49 |     webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE)
50 | # list of all links from the webpage
51 | return webpage_regex.findall(html)
52 |
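53 | # A minimal usage sketch (editor's note); the link regex shown is illustrative:
54 | # link_crawler('http://example.webscraping.com', '/(index|view)/')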
--------------------------------------------------------------------------------
/code/chp8/example/example/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class ExampleSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 |     def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 |     def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 |     def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 |     def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/code/chp4/redis_queue.py:
--------------------------------------------------------------------------------
1 | # Based loosely on the Redis Cookbook FIFO Queue: http://www.rediscookbook.org/implement_a_fifo_queue.html
2 | from redis import StrictRedis
3 |
4 |
5 | class RedisQueue:
6 | """ RedisQueue helps store urls to crawl to Redis
7 | Initialization components:
8 | client: a Redis client connected to the key-value database for
9 | the webcrawling cache (if not set, a localhost:6379
10 | default connection is used).
11 | db (int): which database to use for Redis
12 | queue_name (str): name for queue (default: wswp)
13 | """
14 |
15 | def __init__(self, client=None, db=0, queue_name='wswp'):
16 | self.client = (StrictRedis(host='localhost', port=6379, db=db)
17 | if client is None else client)
18 | self.name = "queue:%s" % queue_name
19 | self.seen_set = "seen:%s" % queue_name
20 | self.depth = "depth:%s" % queue_name
21 |
22 | def __len__(self):
23 | return self.client.llen(self.name)
24 |
25 | def push(self, element):
26 | """Push an element to the tail of the queue"""
27 | if isinstance(element, list):
28 | element = [e for e in element if not self.already_seen(e)]
29 | self.client.lpush(self.name, *element)
30 | self.client.sadd(self.seen_set, *element)
31 | elif not self.already_seen(element):
32 | self.client.lpush(self.name, element)
33 | self.client.sadd(self.seen_set, element)
34 |
35 | def already_seen(self, element):
36 | """ determine if an element has already been seen """
37 | return self.client.sismember(self.seen_set, element)
38 |
39 | def set_depth(self, element, depth):
40 | """ Set the seen hash and depth """
41 | self.client.hset(self.depth, element, depth)
42 |
43 | def get_depth(self, element):
44 | """ Get the seen hash and depth """
45 |         return int(self.client.hget(self.depth, element) or 0)
46 |
47 | def pop(self):
48 | """Pop an element from the head of the queue"""
49 | return self.client.rpop(self.name).decode('utf-8')
50 |
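51 | # A minimal usage sketch (editor's note), assuming a Redis server on localhost:6379:
52 | # queue = RedisQueue()
53 | # queue.push(['http://example.webscraping.com/'])  # duplicates are filtered via the seen set
54 | # url = queue.pop()
55 | # queue.set_depth(url, 1)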
--------------------------------------------------------------------------------
/code/chp7/using_captcha_api.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import base64
3 | from configparser import ConfigParser
4 | from time import sleep
5 | from lxml.html import fromstring
6 | from chp6.login import parse_form
7 | from chp7.image_processing import get_captcha_img, get_b64_string
8 |
9 | API_URL = 'https://www.9kw.eu/index.cgi'
10 | REGISTER_URL = 'http://example.webscraping.com/user/register'
11 |
12 |
13 | def get_api_key():
14 | config = ConfigParser()
15 | config.read('../config/api.cfg')
16 | return config.get('captcha_api', 'key')
17 |
18 |
19 | def send_captcha(api_key, img_data):
20 | data = {
21 | 'action': 'usercaptchaupload',
22 | 'apikey': api_key,
23 | 'file-upload-01': img_data,
24 | 'base64': '1',
25 | 'selfsolve': '1',
26 | 'json': '1',
27 | 'maxtimeout': '300'
28 | }
29 | resp = requests.post(API_URL, data)
30 | return resp.json()
31 |
32 |
33 | def get_captcha_text(api_key, captcha_id):
34 | data = {
35 | 'action': 'usercaptchacorrectdata',
36 | 'id': captcha_id,
37 | 'apikey': api_key,
38 | 'json': '1',
39 | }
40 | resp = requests.get(API_URL, data)
41 | print('captcha text response:', resp.json())
42 | answer = resp.json().get('answer')
43 | return answer
44 |
45 |
46 | def register(first_name, last_name, email, password):
47 | session = requests.Session()
48 | html = session.get(REGISTER_URL)
49 | form = parse_form(html.content)
50 | form['first_name'] = first_name
51 | form['last_name'] = last_name
52 | form['email'] = email
53 | form['password'] = form['password_two'] = password
54 | img_data = get_b64_string(html.content)
55 | img = get_captcha_img(html.content)
56 | img.show() # This will show the image locally when run
57 | api_key = get_api_key()
58 | captcha_id = send_captcha(api_key, img_data)
59 | print('submitted captcha, got id:', captcha_id)
60 | sleep(300)
61 | captcha = get_captcha_text(api_key, captcha_id)
62 | print('captcha solve:', captcha)
63 | form['recaptcha_response_field'] = captcha
64 | resp = session.post(html.url, form)
65 | success = '/user/register' not in resp.url
66 | if not success:
67 | form_errors = fromstring(resp.content).cssselect('div.error')
68 | print('Form Errors:')
69 | print('\n'.join(
70 | (' {}: {}'.format(f.get('id'), f.text) for f in form_errors)))
71 | return success
72 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Web Scraping with Python
2 |
3 | Welcome to the code repository for [Web Scraping with Python, Second Edition](https://www.packtpub.com/big-data-and-business-intelligence/python-web-scraping-second-edition)! I hope you find the code and data here useful. If you have any questions reach out to @kjam on Twitter or GitHub.
4 |
5 | ### Code Structure
6 |
7 | All of the code samples are in folders separated by chapter. Scripts are intended to be run from the `code` folder, allowing you to easily import from the chapters.
8 |
9 | ### Code Examples
10 |
11 | I have not included every code sample you'll find in the book, but I have included the majority of the finished scripts. Even so, I encourage you to write out each code sample on your own and use these only as a reference.
12 |
13 | ### Firefox Issues
14 |
15 | Depending on your version of Firefox and Selenium, you may run into JavaScript errors. Here are some fixes:
16 | * Use an older version of Firefox
17 | * Upgrade Selenium to >=3.0.2 and download the [geckodriver](https://github.com/mozilla/geckodriver/releases). Make sure the geckodriver can be found via your PATH variable; you can do this by adding its location to the PATH exported in your `.bashrc` or `.bash_profile`. (Wondering what these are? Please read Appendix C on learning the command line.)
18 | * Use [PhantomJS](http://phantomjs.org/) with Selenium (change your browser line to `webdriver.PhantomJS('path/to/your/phantomjs/installation')`)
19 | * Use Chrome, Internet Explorer, or any other [supported browser](http://www.seleniumhq.org/about/platforms.jsp)
20 |
21 | Feel free to reach out if you have any questions!
22 |
23 | ### Issues with Module Import
24 |
25 | Seeing chp1 ModuleNotFoundError errors? Try adding this snippet to the top of the file you are running:
26 |
27 | ```
28 | import os
29 | import sys
30 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir)))
31 | ```
32 |
33 | What this does is append the parent directory (the main `code` folder) to your system path, which is where Python looks for imports. On some installations, I have noticed this directory is not added automatically, so this code *explicitly* adds it to your path.
34 |
35 |
36 | ### Corrections?
37 |
38 | If you find any issues in these code examples, feel free to submit an Issue or Pull Request. I appreciate your input!
39 |
40 |
41 | ### First edition repository
42 |
43 | If you are looking for the first edition's repository, you can find it here: [Web Scraping with Python, First Edition](https://bitbucket.org/wswp/)
44 |
45 | ### Questions?
46 |
47 | Reach out to @kjam on Twitter or GitHub. @kjam is also often on freenode. :)
48 |
--------------------------------------------------------------------------------
/data/captcha_samples/samples.csv:
--------------------------------------------------------------------------------
1 | sample1.png,watch
2 | sample2.png,clean
3 | sample3.png,forward
4 | sample4.png,secret
5 | sample5.png,square
6 | sample6.png,sweet
7 | sample7.png,flight
8 | sample8.png,number
9 | sample9.png,parcel
10 | sample10.png,linen
11 | sample11.png,attack
12 | sample12.png,comfort
13 | sample13.png,healthy
14 | sample14.png,woman
15 | sample15.png,between
16 | sample16.png,fruit
17 | sample17.png,office
18 | sample18.png,electric
19 | sample19.png,light
20 | sample20.png,reward
21 | sample21.png,powder
22 | sample22.png,damage
23 | sample23.png,thick
24 | sample24.png,tomorrow
25 | sample25.png,white
26 | sample26.png,together
27 | sample27.png,trick
28 | sample28.png,sister
29 | sample29.png,tongue
30 | sample30.png,because
31 | sample31.png,again
32 | sample32.png,tooth
33 | sample33.png,almost
34 | sample34.png,board
35 | sample35.png,stitch
36 | sample36.png,spoon
37 | sample37.png,paste
38 | sample38.png,memory
39 | sample39.png,guide
40 | sample40.png,electric
41 | sample41.png,regret
42 | sample42.png,harbor
43 | sample43.png,prose
44 | sample44.png,circle
45 | sample45.png,flight
46 | sample46.png,motion
47 | sample47.png,cause
48 | sample48.png,front
49 | sample49.png,question
50 | sample50.png,drawer
51 | sample51.png,present
52 | sample52.png,elastic
53 | sample53.png,laugh
54 | sample54.png,rhythm
55 | sample55.png,angle
56 | sample56.png,porter
57 | sample57.png,purpose
58 | sample58.png,event
59 | sample59.png,effect
60 | sample60.png,history
61 | sample61.png,tired
62 | sample62.png,animal
63 | sample63.png,steam
64 | sample64.png,normal
65 | sample65.png,scissors
66 | sample66.png,while
67 | sample67.png,print
68 | sample68.png,behavior
69 | sample69.png,impulse
70 | sample70.png,quiet
71 | sample71.png,level
72 | sample72.png,basin
73 | sample73.png,every
74 | sample74.png,peace
75 | sample75.png,right
76 | sample76.png,month
77 | sample77.png,science
78 | sample78.png,river
79 | sample79.png,frame
80 | sample80.png,stocking
81 | sample81.png,pencil
82 | sample82.png,table
83 | sample83.png,common
84 | sample84.png,store
85 | sample85.png,ornament
86 | sample86.png,belief
87 | sample87.png,across
88 | sample88.png,history
89 | sample89.png,harmony
90 | sample90.png,young
91 | sample91.png,summer
92 | sample92.png,yellow
93 | sample93.png,medical
94 | sample94.png,current
95 | sample95.png,amount
96 | sample96.png,skirt
97 | sample97.png,serious
98 | sample98.png,paper
99 | sample99.png,round
100 | sample100.png,stamp
101 |
--------------------------------------------------------------------------------
/code/chp5/browser_render.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import re
4 | import csv
5 | import time
6 | try:
7 | from PySide.QtGui import QApplication
8 | from PySide.QtCore import QUrl, QEventLoop, QTimer
9 | from PySide.QtWebKit import QWebView
10 | except ImportError:
11 | from PyQt4.QtGui import QApplication
12 | from PyQt4.QtCore import QUrl, QEventLoop, QTimer
13 | from PyQt4.QtWebKit import QWebView
14 | import lxml.html
15 |
16 |
17 | class BrowserRender(QWebView):
18 | def __init__(self, display=True):
19 | self.app = QApplication([])
20 | QWebView.__init__(self)
21 | if display:
22 | self.show() # show the browser
23 |
24 | def open(self, url, timeout=60):
25 | """Wait for download to complete and return result"""
26 | loop = QEventLoop()
27 | timer = QTimer()
28 | timer.setSingleShot(True)
29 | timer.timeout.connect(loop.quit)
30 | self.loadFinished.connect(loop.quit)
31 | self.load(QUrl(url))
32 | timer.start(timeout * 1000)
33 | loop.exec_() # delay here until download finished
34 | if timer.isActive():
35 | # downloaded successfully
36 | timer.stop()
37 | return self.html()
38 | else:
39 | # timed out
40 |             print('Request timed out:', url)
41 |
42 | def html(self):
43 | """Shortcut to return the current HTML"""
44 | return self.page().mainFrame().toHtml()
45 |
46 | def find(self, pattern):
47 | """Find all elements that match the pattern"""
48 | return self.page().mainFrame().findAllElements(pattern)
49 |
50 | def attr(self, pattern, name, value):
51 | """Set attribute for matching elements"""
52 | for e in self.find(pattern):
53 | e.setAttribute(name, value)
54 |
55 | def text(self, pattern, value):
56 | """Set attribute for matching elements"""
57 | for e in self.find(pattern):
58 | e.setPlainText(value)
59 |
60 | def click(self, pattern):
61 | """Click matching elements"""
62 | for e in self.find(pattern):
63 | e.evaluateJavaScript("this.click()")
64 |
65 | def wait_load(self, pattern, timeout=60):
66 | """Wait for this pattern to be found in webpage and return matches"""
67 | deadline = time.time() + timeout
68 | while time.time() < deadline:
69 | self.app.processEvents()
70 | matches = self.find(pattern)
71 | if matches:
72 | return matches
73 | print('Wait load timed out')
74 |
75 |
76 | def main():
77 | br = BrowserRender()
78 | br.open('http://example.webscraping.com/search')
79 | br.attr('#search_term', 'value', '.')
80 | br.text('#page_size option:checked', '1000')
81 | br.click('#search')
82 |
83 | elements = br.wait_load('#results a')
84 | writer = csv.writer(open('countries.csv', 'w'))
85 | for country in [e.toPlainText().strip() for e in elements]:
86 | writer.writerow([country])
87 |
88 |
89 | if __name__ == '__main__':
90 | main()
91 |
--------------------------------------------------------------------------------
/code/chp7/captcha_api.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import re
3 | import time
4 | import requests
5 | from io import BytesIO
6 |
7 |
8 | class CaptchaAPI:
9 | def __init__(self, api_key, timeout=120):
10 | self.api_key = api_key
11 | self.timeout = timeout
12 | self.url = 'https://www.9kw.eu/index.cgi'
13 |
14 | def solve(self, img):
15 | """Submit CAPTCHA and return result when ready
16 | """
17 | img_buffer = BytesIO()
18 | img.save(img_buffer, format="PNG")
19 | img_data = img_buffer.getvalue()
20 | captcha_id = self.send(img_data)
21 | start_time = time.time()
22 | while time.time() < start_time + self.timeout:
23 | try:
24 | resp = self.get(captcha_id)
25 | except CaptchaError:
26 | pass # CAPTCHA still not ready
27 | else:
28 | if resp.get('answer') != 'NO DATA':
29 | if resp.get('answer') == 'ERROR NO USER':
30 | raise CaptchaError(
31 | 'Error: no user available to solve CAPTCHA')
32 | else:
33 | print('CAPTCHA solved!')
34 | return captcha_id, resp.get('answer')
35 | print('Waiting for CAPTCHA ...')
36 | time.sleep(1)
37 |
38 | raise CaptchaError('Error: API timeout')
39 |
40 | def send(self, img_data):
41 | """Send CAPTCHA for solving """
42 | print('Submitting CAPTCHA')
43 | data = {
44 | 'action': 'usercaptchaupload',
45 | 'apikey': self.api_key,
46 | 'file-upload-01': base64.b64encode(img_data),
47 | 'base64': '1',
48 | 'selfsolve': '1',
49 | 'json': '1',
50 | 'maxtimeout': str(self.timeout)
51 | }
52 | result = requests.post(self.url, data)
53 | self.check(result.text)
54 | return result.json()
55 |
56 | def get(self, captcha_id):
57 | """Get result of solved CAPTCHA"""
58 | data = {
59 | 'action': 'usercaptchacorrectdata',
60 | 'id': captcha_id,
61 | 'apikey': self.api_key,
62 | 'info': '1',
63 | 'json': '1',
64 | }
65 | result = requests.get(self.url, data)
66 | self.check(result.text)
67 | return result.json()
68 |
69 | def check(self, result):
70 | """Check result of API and raise error if error code"""
71 |         if re.match(r'00\d\d \w+', result):
72 | raise CaptchaError('API error: ' + result)
73 |
74 | def report(self, captcha_id, correct):
75 | """ Report back whether captcha was correct or not"""
76 | data = {
77 | 'action': 'usercaptchacorrectback',
78 | 'id': captcha_id,
79 | 'apikey': self.api_key,
80 |             'correct': 1 if correct else 2,
81 | 'json': '1',
82 | }
83 | resp = requests.get(self.url, data)
84 | return resp.json()
85 |
86 |
87 | class CaptchaError(Exception):
88 | pass
89 |
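90 | # A minimal usage sketch (editor's note), assuming a valid 9kw.eu API key and a
91 | # PIL Image object `img` (see chp7/register_with_api.py for the full workflow):
92 | # api = CaptchaAPI(api_key)
93 | # captcha_id, text = api.solve(img)
94 | # api.report(captcha_id, correct=1)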
--------------------------------------------------------------------------------
/code/chp3/requests_cache_link_crawler.py:
--------------------------------------------------------------------------------
1 | import re
2 | from urllib import robotparser
3 | from urllib.parse import urljoin
4 | from datetime import timedelta
5 | from chp3.downloader_requests_cache import Downloader
6 |
7 | import requests_cache
8 |
9 |
10 | def get_robots_parser(robots_url):
11 | " Return the robots parser object using the robots_url "
12 | rp = robotparser.RobotFileParser()
13 | rp.set_url(robots_url)
14 | rp.read()
15 | return rp
16 |
17 |
18 | def get_links(html):
19 | " Return a list of links (using simple regex matching) from the html content "
20 | # a regular expression to extract all links from the webpage
21 |     webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE)
22 | # list of all links from the webpage
23 | return webpage_regex.findall(html)
24 |
25 |
26 | def link_crawler(start_url, link_regex, robots_url=None, user_agent='wswp',
27 | proxies=None, delay=3, max_depth=4, num_retries=2, expires=timedelta(days=30)):
28 | """ Crawl from the given start URL following links matched by link_regex. In the current
29 |         implementation, we do not actually scrape any information.
30 |
31 | args:
32 | start_url (str): web site to start crawl
33 | link_regex (str): regex to match for links
34 | kwargs:
35 | robots_url (str): url of the site's robots.txt (default: start_url + /robots.txt)
36 | user_agent (str): user agent (default: wswp)
37 | proxies (list of dicts): a list of possible dicts for http / https proxies
38 | For formatting, see the requests library
39 | delay (int): seconds to throttle between requests to one domain (default: 3)
40 | max_depth (int): maximum crawl depth (to avoid traps) (default: 4)
41 | num_retries (int): # of retries when 5xx error (default: 2)
42 | expires (timedelta): timedelta for cache expirations (default: 30 days)
43 | """
44 | crawl_queue = [start_url]
45 |     # keep track of which URLs have been seen before
46 | seen = {}
47 | requests_cache.install_cache(backend='redis', expire_after=expires)
48 | if not robots_url:
49 | robots_url = '{}/robots.txt'.format(start_url)
50 | rp = get_robots_parser(robots_url)
51 | D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies)
52 | while crawl_queue:
53 | url = crawl_queue.pop()
54 | # check url passes robots.txt restrictions
55 | if rp.can_fetch(user_agent, url):
56 | depth = seen.get(url, 0)
57 | if depth == max_depth:
58 | print('Skipping %s due to depth' % url)
59 | continue
60 | html = D(url, num_retries=num_retries)
61 | if not html:
62 | continue
63 | # TODO: add actual data scraping here
64 | # filter for links matching our regular expression
65 | for link in get_links(html):
66 | if re.match(link_regex, link):
67 | abs_link = urljoin(start_url, link)
68 | if abs_link not in seen:
69 | seen[abs_link] = depth + 1
70 | crawl_queue.append(abs_link)
71 | else:
72 | print('Blocked by robots.txt:', url)
73 |
--------------------------------------------------------------------------------
/code/chp3/advanced_link_crawler.py:
--------------------------------------------------------------------------------
1 | import re
2 | from urllib import robotparser
3 | from urllib.parse import urljoin
4 | from chp3.downloader import Downloader
5 |
6 |
7 | def get_robots_parser(robots_url):
8 | " Return the robots parser object using the robots_url "
9 | rp = robotparser.RobotFileParser()
10 | rp.set_url(robots_url)
11 | rp.read()
12 | return rp
13 |
14 |
15 | def get_links(html):
16 | " Return a list of links (using simple regex matching) from the html content "
17 | # a regular expression to extract all links from the webpage
18 |     webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE)
19 | # list of all links from the webpage
20 | return webpage_regex.findall(html)
21 |
22 |
23 | def link_crawler(start_url, link_regex, robots_url=None, user_agent='wswp',
24 | proxies=None, delay=3, max_depth=4, num_retries=2, cache={}, scraper_callback=None):
25 | """ Crawl from the given start URL following links matched by link_regex. In the current
26 | implementation, we do not actually scrape any information.
27 |
28 | args:
29 | start_url (str): web site to start crawl
30 | link_regex (str): regex to match for links
31 | kwargs:
32 | robots_url (str): url of the site's robots.txt (default: start_url + /robots.txt)
33 | user_agent (str): user agent (default: wswp)
34 | proxies (list of dicts): a list of possible dicts for http / https proxies
35 | For formatting, see the requests library
36 | delay (int): seconds to throttle between requests to one domain (default: 3)
37 | max_depth (int): maximum crawl depth (to avoid traps) (default: 4)
38 | num_retries (int): # of retries when 5xx error (default: 2)
39 | cache (dict): cache dict with urls as keys and dicts for responses (default: {})
40 | scraper_callback: function to be called on url and html content
41 | """
42 | crawl_queue = [start_url]
43 |     # keep track of which URLs have been seen before
44 | seen = {}
45 | if not robots_url:
46 | robots_url = '{}/robots.txt'.format(start_url)
47 | rp = get_robots_parser(robots_url)
48 | D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies, cache=cache)
49 | while crawl_queue:
50 | url = crawl_queue.pop()
51 | # check url passes robots.txt restrictions
52 | if rp.can_fetch(user_agent, url):
53 | depth = seen.get(url, 0)
54 | if depth == max_depth:
55 | print('Skipping %s due to depth' % url)
56 | continue
57 | html = D(url, num_retries=num_retries)
58 | if not html:
59 | continue
60 | if scraper_callback:
61 | links = scraper_callback(url, html) or []
62 | else:
63 | links = []
64 | # filter for links matching our regular expression
65 | for link in get_links(html) + links:
66 | if re.match(link_regex, link):
67 | abs_link = urljoin(start_url, link)
68 | if abs_link not in seen:
69 | seen[abs_link] = depth + 1
70 | crawl_queue.append(abs_link)
71 | else:
72 | print('Blocked by robots.txt:', url)
73 |
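74 | # A minimal usage sketch (editor's note), combining this crawler with the
75 | # CsvCallback scraper from chp2 and the default in-memory dict cache:
76 | # from chp2.csv_callback import CsvCallback
77 | # link_crawler('http://example.webscraping.com', '/(index|view)/',
78 | #              scraper_callback=CsvCallback())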
--------------------------------------------------------------------------------
/code/chp3/downloader.py:
--------------------------------------------------------------------------------
1 | from random import choice
2 | import requests
3 |
4 | from chp1.throttle import Throttle
5 |
6 |
7 | class Downloader:
8 | """ Downloader class to use cache and requests for downloading pages.
9 |     For the constructor, pass:
10 | delay (int): # of secs delay between requests (default: 5)
11 | user_agent (str): user agent string (default: 'wswp')
12 | proxies (list[dict]): list of possible proxies, each
13 | must be a dict with http / https keys and proxy values
14 | cache (dict or dict-like obj): keys: urls, values: dicts with keys (html, code)
15 | timeout (float/int): number of seconds to wait until timeout
16 | """
17 | def __init__(self, delay=5, user_agent='wswp', proxies=None, cache={},
18 | timeout=60):
19 | self.throttle = Throttle(delay)
20 | self.user_agent = user_agent
21 | self.proxies = proxies
22 | self.cache = cache
23 | self.num_retries = None # we will set this per request
24 | self.timeout = timeout
25 |
26 | def __call__(self, url, num_retries=2):
27 | """ Call the downloader class, which will return HTML from cache
28 | or download it
29 | args:
30 | url (str): url to download
31 | kwargs:
32 | num_retries (int): # times to retry if 5xx code (default: 2)
33 | """
34 | self.num_retries = num_retries
35 | try:
36 | result = self.cache[url]
37 | print('Loaded from cache:', url)
38 | except KeyError:
39 | result = None
40 | if result and self.num_retries and 500 <= result['code'] < 600:
41 | # server error so ignore result from cache
42 | # and re-download
43 | result = None
44 | if result is None:
45 | # result was not loaded from cache, need to download
46 | self.throttle.wait(url)
47 | proxies = choice(self.proxies) if self.proxies else None
48 | headers = {'User-Agent': self.user_agent}
49 | result = self.download(url, headers, proxies)
50 | self.cache[url] = result
51 | return result['html']
52 |
53 | def download(self, url, headers, proxies):
54 | """ Download a and return the page content
55 | args:
56 | url (str): URL
57 | headers (dict): dict of headers (like user_agent)
58 | proxies (dict): proxy dict w/ keys 'http'/'https', values
59 | are strs (i.e. 'http(s)://IP') (default: None)
60 | """
61 | print('Downloading:', url)
62 | try:
63 | resp = requests.get(url, headers=headers, proxies=proxies,
64 | timeout=self.timeout)
65 | html = resp.text
66 | if resp.status_code >= 400:
67 | print('Download error:', resp.text)
68 | html = None
69 | if self.num_retries and 500 <= resp.status_code < 600:
70 | # recursively retry 5xx HTTP errors
71 | self.num_retries -= 1
72 | return self.download(url, headers, proxies)
73 | except requests.exceptions.RequestException as e:
74 | print('Download error:', e)
75 | return {'html': None, 'code': 500}
76 | return {'html': html, 'code': resp.status_code}
77 |
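78 | # A minimal usage sketch (editor's note), using a plain dict as the cache:
79 | # D = Downloader(delay=3, user_agent='wswp', cache={})
80 | # html = D('http://example.webscraping.com/')  # downloaded and stored in the cache
81 | # html = D('http://example.webscraping.com/')  # now served from the dict cache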
--------------------------------------------------------------------------------
/code/chp8/example/example/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for example project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'example'
13 |
14 | SPIDER_MODULES = ['example.spiders']
15 | NEWSPIDER_MODULE = 'example.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'example (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = True
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | DOWNLOAD_DELAY = 5
31 | # The download delay setting will honor only one of:
32 | CONCURRENT_REQUESTS_PER_DOMAIN = 1
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'example.middlewares.ExampleSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'example.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | #ITEM_PIPELINES = {
68 | # 'example.pipelines.ExamplePipeline': 300,
69 | #}
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/code/chp3/downloader_requests_cache.py:
--------------------------------------------------------------------------------
1 | from random import choice
2 | import requests
3 | import requests_cache
4 |
5 | from chp1.throttle import Throttle
6 |
7 |
8 | class Downloader:
9 | """ Downloader class to use cache and requests for downloading pages.
10 |     For the constructor, pass:
11 | delay (int): # of secs delay between requests (default: 5)
12 | user_agent (str): user agent string (default: 'wswp')
13 | proxies (list[dict]): list of possible proxies, each
14 | must be a dict with http / https keys and proxy values
15 | timeout (float/int): number of seconds to wait until timeout
16 | """
17 | def __init__(self, delay=5, user_agent='wswp', proxies=None,
18 | timeout=60):
19 | self.throttle = Throttle(delay)
20 | self.user_agent = user_agent
21 | self.proxies = proxies
22 | self.num_retries = None # we will set this per request
23 | self.timeout = timeout
24 |
25 | def __call__(self, url, num_retries=2):
26 | """ Call the downloader class, which will return HTML from cache
27 | or download it
28 | args:
29 | url (str): url to download
30 | kwargs:
31 | num_retries (int): # times to retry if 5xx code (default: 2)
32 | """
33 | self.num_retries = num_retries
34 | proxies = choice(self.proxies) if self.proxies else None
35 | headers = {'User-Agent': self.user_agent}
36 | result = self.download(url, headers, proxies)
37 | return result['html']
38 |
39 | def make_throttle_hook(self, throttle=None):
40 | """
41 | Modified from: https://requests-cache.readthedocs.io/en/latest/user_guide.html
42 |         Returns a response hook function which waits on the given throttle
43 |         when the response was not served from the cache
44 | """
45 | def hook(response, *args, **kwargs):
46 | """ see requests hook documentation for more information"""
47 | if not getattr(response, 'from_cache', False):
48 | throttle.wait(response.url)
49 | print('Downloading:', response.url)
50 | else:
51 | print('Returning from cache:', response.url)
52 | return response
53 | return hook
54 |
55 | def download(self, url, headers, proxies):
56 |         """ Download a URL and return the page content
57 | args:
58 | url (str): URL
59 | headers (dict): dict of headers (like user_agent)
60 | proxies (dict): proxy dict w/ keys 'http'/'https', values
61 | are strs (i.e. 'http(s)://IP') (default: None)
62 | """
63 | session = requests_cache.CachedSession()
64 | session.hooks = {'response': self.make_throttle_hook(self.throttle)}
65 |
66 | try:
67 | resp = session.get(url, headers=headers, proxies=proxies,
68 | timeout=self.timeout)
69 | html = resp.text
70 | if resp.status_code >= 400:
71 | print('Download error:', resp.text)
72 | html = None
73 | if self.num_retries and 500 <= resp.status_code < 600:
74 | # recursively retry 5xx HTTP errors
75 | self.num_retries -= 1
76 | return self.download(url, headers, proxies)
77 | except requests.exceptions.RequestException as e:
78 | print('Download error:', e)
79 | return {'html': None, 'code': 500}
80 | return {'html': html, 'code': resp.status_code}
81 |
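82 | # A minimal usage sketch (illustrative addition, not part of the original module;
83 | # the URL is a placeholder and the default requests-cache backend is assumed):
84 | if __name__ == '__main__':
85 |     downloader = Downloader(delay=2, user_agent='wswp')
86 |     html = downloader('http://example.com')
87 |     # a repeat request for the same URL should be answered from the cache
88 |     html = downloader('http://example.com')
89 |     print(html is not None)
90 |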
--------------------------------------------------------------------------------
/code/chp3/diskcache.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import re
4 | import zlib
5 |
6 | from datetime import datetime, timedelta
7 | from urllib.parse import urlsplit
8 |
9 |
10 | class DiskCache:
11 | """ DiskCache helps store urls and their responses to disk
12 |         Initialization parameters:
13 | cache_dir (str): abs file path or relative file path
14 | for cache directory (default: ../data/cache)
15 | max_len (int): maximum filename length (default: 255)
16 | compress (bool): use zlib compression (default: True)
17 | encoding (str): character encoding for compression (default: utf-8)
18 | expires (datetime.timedelta): timedelta when content will expire
19 |                 (default: 30 days)
20 | """
21 | def __init__(self, cache_dir='../data/cache', max_len=255, compress=True,
22 | encoding='utf-8', expires=timedelta(days=30)):
23 | self.cache_dir = cache_dir
24 | self.max_len = max_len
25 | self.compress = compress
26 | self.encoding = encoding
27 | self.expires = expires
28 |
29 | def url_to_path(self, url):
30 | """ Return file system path string for given URL """
31 | components = urlsplit(url)
32 | # append index.html to empty paths
33 | path = components.path
34 | if not path:
35 | path = '/index.html'
36 | elif path.endswith('/'):
37 | path += 'index.html'
38 | filename = components.netloc + path + components.query
39 | # replace invalid characters
40 | filename = re.sub(r'[^/0-9a-zA-Z\-.,;_ ]', '_', filename)
41 | # restrict maximum number of characters
42 | filename = '/'.join(seg[:self.max_len] for seg in filename.split('/'))
43 | return os.path.join(self.cache_dir, filename)
44 |
45 | def __getitem__(self, url):
46 | """Load data from disk for given URL"""
47 | path = self.url_to_path(url)
48 | if os.path.exists(path):
49 | mode = ('rb' if self.compress else 'r')
50 | with open(path, mode) as fp:
51 | if self.compress:
52 | data = zlib.decompress(fp.read()).decode(self.encoding)
53 | data = json.loads(data)
54 | else:
55 | data = json.load(fp)
56 | exp_date = data.get('expires')
57 | if exp_date and datetime.strptime(exp_date,
58 | '%Y-%m-%dT%H:%M:%S') <= datetime.utcnow():
59 | print('Cache expired!', exp_date)
60 | raise KeyError(url + ' has expired.')
61 | return data
62 | else:
63 | # URL has not yet been cached
64 | raise KeyError(url + ' does not exist')
65 |
66 | def __setitem__(self, url, result):
67 | """Save data to disk for given url"""
68 | path = self.url_to_path(url)
69 | folder = os.path.dirname(path)
70 | if not os.path.exists(folder):
71 | os.makedirs(folder)
72 | mode = ('wb' if self.compress else 'w')
73 |         # Note: the timespec argument requires Python 3.6+ (on earlier 3.x versions,
74 |         # export with isoformat() and parse with '%Y-%m-%dT%H:%M:%S.%f' instead)
75 | result['expires'] = (datetime.utcnow() + self.expires).isoformat(
76 | timespec='seconds')
77 | with open(path, mode) as fp:
78 | if self.compress:
79 | data = bytes(json.dumps(result), self.encoding)
80 | fp.write(zlib.compress(data))
81 | else:
82 | json.dump(result, fp)
83 |
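84 | # A minimal usage sketch (illustrative addition; the URL and cached dict are placeholders):
85 | if __name__ == '__main__':
86 |     cache = DiskCache(cache_dir='../data/cache')
87 |     cache['http://example.com/'] = {'html': '<html>example</html>', 'code': 200}
88 |     # the entry stays readable until the expiry timedelta (30 days by default) passes
89 |     print(cache['http://example.com/']['code'])
90 |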
--------------------------------------------------------------------------------
/code/chp1/advanced_link_crawler_using_requests.py:
--------------------------------------------------------------------------------
1 | import re
2 | from urllib import robotparser
3 | from urllib.parse import urljoin
4 |
5 | import requests
6 | from chp1.throttle import Throttle
7 |
8 |
9 | def download(url, num_retries=2, user_agent='wswp', proxies=None):
10 | """ Download a given URL and return the page content
11 | args:
12 | url (str): URL
13 | kwargs:
14 | user_agent (str): user agent (default: wswp)
15 | proxies (dict): proxy dict w/ keys 'http' and 'https', values
16 | are strs (i.e. 'http(s)://IP') (default: None)
17 | num_retries (int): # of retries if a 5xx error is seen (default: 2)
18 | """
19 | print('Downloading:', url)
20 | headers = {'User-Agent': user_agent}
21 | try:
22 | resp = requests.get(url, headers=headers, proxies=proxies)
23 | html = resp.text
24 | if resp.status_code >= 400:
25 | print('Download error:', resp.text)
26 | html = None
27 | if num_retries and 500 <= resp.status_code < 600:
28 | # recursively retry 5xx HTTP errors
29 | return download(url, num_retries - 1)
30 | except requests.exceptions.RequestException as e:
31 | print('Download error:', e)
32 | html = None
33 | return html
34 |
35 |
36 | def get_robots_parser(robots_url):
37 | " Return the robots parser object using the robots_url "
38 | rp = robotparser.RobotFileParser()
39 | rp.set_url(robots_url)
40 | rp.read()
41 | return rp
42 |
43 |
44 | def get_links(html):
45 | """ Return a list of links (using simple regex matching)
46 | from the html content """
47 | # a regular expression to extract all links from the webpage
48 |     webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE)
49 | # list of all links from the webpage
50 | return webpage_regex.findall(html)
51 |
52 |
53 | def link_crawler(start_url, link_regex, robots_url=None, user_agent='wswp',
54 | proxies=None, delay=3, max_depth=4):
55 | """ Crawl from the given start URL following links matched by link_regex.
56 | In the current implementation, we do not actually scrape any information.
57 |
58 | args:
59 | start_url (str): web site to start crawl
60 | link_regex (str): regex to match for links
61 | kwargs:
62 | robots_url (str): url of the site's robots.txt
63 | (default: start_url + /robots.txt)
64 | user_agent (str): user agent (default: wswp)
65 | proxies (dict): proxy dict w/ keys 'http' and 'https', values
66 | are strs (i.e. 'http(s)://IP') (default: None)
67 | delay (int): seconds to throttle between requests
68 | to one domain (default: 3)
69 | max_depth (int): maximum crawl depth (to avoid traps) (default: 4)
70 | """
71 | crawl_queue = [start_url]
72 |     # keep track of which URLs have been seen before
73 | seen = {}
74 | if not robots_url:
75 | robots_url = '{}/robots.txt'.format(start_url)
76 | rp = get_robots_parser(robots_url)
77 | throttle = Throttle(delay)
78 | while crawl_queue:
79 | url = crawl_queue.pop()
80 | # check url passes robots.txt restrictions
81 | if rp.can_fetch(user_agent, url):
82 | depth = seen.get(url, 0)
83 | if depth == max_depth:
84 | print('Skipping %s due to depth' % url)
85 | continue
86 | throttle.wait(url)
87 | html = download(url, user_agent=user_agent, proxies=proxies)
88 | if not html:
89 | continue
90 | # TODO: add actual data scraping here
91 | # filter for links matching our regular expression
92 | for link in get_links(html):
93 | if re.match(link_regex, link):
94 | abs_link = urljoin(start_url, link)
95 | if abs_link not in seen:
96 | seen[abs_link] = depth + 1
97 | crawl_queue.append(abs_link)
98 | else:
99 | print('Blocked by robots.txt:', url)
100 |
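101 | # An illustrative invocation (added as a sketch; the start URL and link pattern are
102 | # placeholders for the book's example site and should be adjusted to the target site):
103 | if __name__ == '__main__':
104 |     link_crawler('http://example.webscraping.com', '/(index|view)/',
105 |                  user_agent='wswp', max_depth=2)
106 |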
--------------------------------------------------------------------------------
/code/chp1/advanced_link_crawler.py:
--------------------------------------------------------------------------------
1 | import re
2 | import urllib.request
3 | from urllib import robotparser
4 | from urllib.parse import urljoin
5 | from urllib.error import URLError, HTTPError, ContentTooShortError
6 | from chp1.throttle import Throttle
7 |
8 |
9 | def download(url, num_retries=2, user_agent='wswp', charset='utf-8', proxy=None):
10 | """ Download a given URL and return the page content
11 | args:
12 | url (str): URL
13 | kwargs:
14 | user_agent (str): user agent (default: wswp)
15 | charset (str): charset if website does not include one in headers
16 | proxy (str): proxy url, ex 'http://IP' (default: None)
17 | num_retries (int): number of retries if a 5xx error is seen (default: 2)
18 | """
19 | print('Downloading:', url)
20 | request = urllib.request.Request(url)
21 | request.add_header('User-agent', user_agent)
22 | try:
23 | if proxy:
24 | proxy_support = urllib.request.ProxyHandler({'http': proxy})
25 | opener = urllib.request.build_opener(proxy_support)
26 | urllib.request.install_opener(opener)
27 | resp = urllib.request.urlopen(request)
28 | cs = resp.headers.get_content_charset()
29 | if not cs:
30 | cs = charset
31 | html = resp.read().decode(cs)
32 | except (URLError, HTTPError, ContentTooShortError) as e:
33 | print('Download error:', e.reason)
34 | html = None
35 | if num_retries > 0:
36 | if hasattr(e, 'code') and 500 <= e.code < 600:
37 | # recursively retry 5xx HTTP errors
38 | return download(url, num_retries - 1)
39 | return html
40 |
41 |
42 | def get_robots_parser(robots_url):
43 | " Return the robots parser object using the robots_url "
44 | rp = robotparser.RobotFileParser()
45 | rp.set_url(robots_url)
46 | rp.read()
47 | return rp
48 |
49 |
50 | def get_links(html):
51 | " Return a list of links (using simple regex matching) from the html content "
52 | # a regular expression to extract all links from the webpage
53 |     webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE)
54 | # list of all links from the webpage
55 | return webpage_regex.findall(html)
56 |
57 |
58 | def link_crawler(start_url, link_regex, robots_url=None, user_agent='wswp',
59 | proxy=None, delay=3, max_depth=4):
60 | """ Crawl from the given start URL following links matched by link_regex. In the current
61 |     implementation, we do not actually scrape any information.
62 |
63 | args:
64 | start_url (str): web site to start crawl
65 | link_regex (str): regex to match for links
66 | kwargs:
67 | robots_url (str): url of the site's robots.txt (default: start_url + /robots.txt)
68 | user_agent (str): user agent (default: wswp)
69 | proxy (str): proxy url, ex 'http://IP' (default: None)
70 | delay (int): seconds to throttle between requests to one domain (default: 3)
71 | max_depth (int): maximum crawl depth (to avoid traps) (default: 4)
72 | """
73 | crawl_queue = [start_url]
74 |     # keep track of which URLs have been seen before
75 | seen = {}
76 | if not robots_url:
77 | robots_url = '{}/robots.txt'.format(start_url)
78 | rp = get_robots_parser(robots_url)
79 | throttle = Throttle(delay)
80 | while crawl_queue:
81 | url = crawl_queue.pop()
82 | # check url passes robots.txt restrictions
83 | if rp.can_fetch(user_agent, url):
84 | depth = seen.get(url, 0)
85 | if depth == max_depth:
86 | print('Skipping %s due to depth' % url)
87 | continue
88 | throttle.wait(url)
89 | html = download(url, user_agent=user_agent, proxy=proxy)
90 | if not html:
91 | continue
92 | # TODO: add actual data scraping here
93 | # filter for links matching our regular expression
94 | for link in get_links(html):
95 | if re.match(link_regex, link):
96 | abs_link = urljoin(start_url, link)
97 | if abs_link not in seen:
98 | seen[abs_link] = depth + 1
99 | crawl_queue.append(abs_link)
100 | else:
101 | print('Blocked by robots.txt:', url)
102 |
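103 | # An illustrative single-page fetch (added as a sketch; the URL is a placeholder).
104 | # The charset argument is only used when the response headers do not declare one:
105 | if __name__ == '__main__':
106 |     page = download('http://example.com', user_agent='wswp', charset='utf-8')
107 |     print(page is not None)
108 |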
--------------------------------------------------------------------------------
/code/chp4/advanced_link_crawler.py:
--------------------------------------------------------------------------------
1 | import re
2 | import socket
3 | from urllib import robotparser
4 | from urllib.parse import urljoin, urlparse
5 | from chp3.downloader import Downloader
6 |
7 | socket.setdefaulttimeout(60)
8 |
9 |
10 | def get_robots_parser(robots_url):
11 | " Return the robots parser object using the robots_url "
12 | try:
13 | rp = robotparser.RobotFileParser()
14 | rp.set_url(robots_url)
15 | rp.read()
16 | return rp
17 | except Exception as e:
18 | print('Error finding robots_url:', robots_url, e)
19 |
20 |
21 | def get_links(html):
22 | " Return a list of links (using simple regex matching) from the html content "
23 | # a regular expression to extract all links from the webpage
24 |     webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE)
25 | # list of all links from the webpage
26 | return webpage_regex.findall(html)
27 |
28 |
29 | def link_crawler(start_url, link_regex, robots_url=None, user_agent='wswp',
30 | proxies=None, delay=3, max_depth=4, num_retries=2, cache={}, scraper_callback=None):
31 | """ Crawl from the given start URL following links matched by link_regex. In the current
32 |     implementation, we do not actually scrape any information.
33 |
34 | args:
35 | start_url (str or list of strs): web site(s) to start crawl
36 | link_regex (str): regex to match for links
37 | kwargs:
38 | robots_url (str): url of the site's robots.txt (default: start_url + /robots.txt)
39 | user_agent (str): user agent (default: wswp)
40 | proxies (list of dicts): a list of possible dicts for http / https proxies
41 | For formatting, see the requests library
42 | delay (int): seconds to throttle between requests to one domain (default: 3)
43 | max_depth (int): maximum crawl depth (to avoid traps) (default: 4)
44 | num_retries (int): # of retries when 5xx error (default: 2)
45 | cache (dict): cache dict with urls as keys and dicts for responses (default: {})
46 | scraper_callback: function to be called on url and html content
47 | """
48 | if isinstance(start_url, list):
49 | crawl_queue = start_url
50 | else:
51 | crawl_queue = [start_url]
52 |     # keep track of which URLs have been seen before
53 | seen, robots = {}, {}
54 | D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies, cache=cache)
55 | while crawl_queue:
56 | url = crawl_queue.pop()
57 | no_robots = False
58 | if 'http' not in url:
59 | continue
60 | domain = '{}://{}'.format(urlparse(url).scheme, urlparse(url).netloc)
61 | rp = robots.get(domain)
62 | if not rp and domain not in robots:
63 | robots_url = '{}/robots.txt'.format(domain)
64 | rp = get_robots_parser(robots_url)
65 | if not rp:
66 | # issue finding robots.txt, still crawl
67 | no_robots = True
68 | robots[domain] = rp
69 | elif domain in robots:
70 | no_robots = True
71 | # check url passes robots.txt restrictions
72 | if no_robots or rp.can_fetch(user_agent, url):
73 | depth = seen.get(url, 0)
74 | if depth == max_depth:
75 | print('Skipping %s due to depth' % url)
76 | continue
77 | html = D(url, num_retries=num_retries)
78 | if not html:
79 | continue
80 | if scraper_callback:
81 | links = scraper_callback(url, html) or []
82 | else:
83 | links = []
84 | # filter for links matching our regular expression
85 | for link in get_links(html) + links:
86 | if re.match(link_regex, link):
87 | if 'http' not in link:
88 | if link.startswith('//'):
89 | link = '{}:{}'.format(urlparse(url).scheme, link)
90 | elif link.startswith('://'):
91 | link = '{}{}'.format(urlparse(url).scheme, link)
92 | else:
93 | link = urljoin(domain, link)
94 |
95 | if link not in seen:
96 | seen[link] = depth + 1
97 | crawl_queue.append(link)
98 | else:
99 | print('Blocked by robots.txt:', url)
100 |
101 |
102 | if __name__ == '__main__':
103 | from chp4.alexa_callback import AlexaCallback
104 | from chp3.rediscache import RedisCache
105 | from time import time
106 | AC = AlexaCallback()
107 | AC()
108 | start_time = time()
109 | link_crawler(AC.urls, '$^', cache=RedisCache())
110 | print('Total time: %ss' % (time() - start_time))
111 |
--------------------------------------------------------------------------------
/code/chp2/advanced_link_crawler.py:
--------------------------------------------------------------------------------
1 | import re
2 | import urllib.request
3 | from urllib import robotparser
4 | from urllib.parse import urljoin
5 | from urllib.error import URLError, HTTPError, ContentTooShortError
6 | from lxml.html import fromstring
7 | from chp1.throttle import Throttle
8 |
9 |
10 | def download(url, num_retries=2, user_agent='wswp', charset='utf-8', proxy=None):
11 | """ Download a given URL and return the page content
12 | args:
13 | url (str): URL
14 | kwargs:
15 | user_agent (str): user agent (default: wswp)
16 | charset (str): charset if website does not include one in headers
17 | proxy (str): proxy url, ex 'http://IP' (default: None)
18 | num_retries (int): number of retries if a 5xx error is seen (default: 2)
19 | """
20 | print('Downloading:', url)
21 | request = urllib.request.Request(url)
22 | request.add_header('User-agent', user_agent)
23 | try:
24 | if proxy:
25 | proxy_support = urllib.request.ProxyHandler({'http': proxy})
26 | opener = urllib.request.build_opener(proxy_support)
27 | urllib.request.install_opener(opener)
28 | resp = urllib.request.urlopen(request)
29 | cs = resp.headers.get_content_charset()
30 | if not cs:
31 | cs = charset
32 | html = resp.read().decode(cs)
33 | except (URLError, HTTPError, ContentTooShortError) as e:
34 | print('Download error:', e)
35 | html = None
36 | if num_retries > 0:
37 | if hasattr(e, 'code') and 500 <= e.code < 600:
38 | # recursively retry 5xx HTTP errors
39 | return download(url, num_retries - 1)
40 | return html
41 |
42 |
43 | def get_robots_parser(robots_url):
44 | " Return the robots parser object using the robots_url "
45 | rp = robotparser.RobotFileParser()
46 | rp.set_url(robots_url)
47 | rp.read()
48 | return rp
49 |
50 |
51 | def get_links(html):
52 | " Return a list of links (using simple regex matching) from the html content "
53 | # a regular expression to extract all links from the webpage
54 |     webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE)
55 | # list of all links from the webpage
56 | return webpage_regex.findall(html)
57 |
58 |
59 | def scrape_callback(url, html):
60 | """ Scrape each row from the country data using XPath and lxml """
61 | fields = ('area', 'population', 'iso', 'country', 'capital',
62 | 'continent', 'tld', 'currency_code', 'currency_name',
63 | 'phone', 'postal_code_format', 'postal_code_regex',
64 | 'languages', 'neighbours')
65 | if re.search('/view/', url):
66 | tree = fromstring(html)
67 | all_rows = [
68 | tree.xpath('//tr[@id="places_%s__row"]/td[@class="w2p_fw"]' % field)[0].text_content()
69 | for field in fields]
70 | print(url, all_rows)
71 |
72 |
73 | def link_crawler(start_url, link_regex, robots_url=None, user_agent='wswp',
74 | proxy=None, delay=3, max_depth=4, scrape_callback=None):
75 |     """ Crawl from the given start URL following links matched by link_regex. Unless a
76 |     scrape_callback is provided, no information is actually scraped.
77 |
78 | args:
79 | start_url (str): web site to start crawl
80 | link_regex (str): regex to match for links
81 | kwargs:
82 | robots_url (str): url of the site's robots.txt (default: start_url + /robots.txt)
83 | user_agent (str): user agent (default: wswp)
84 | proxy (str): proxy url, ex 'http://IP' (default: None)
85 | delay (int): seconds to throttle between requests to one domain (default: 3)
86 | max_depth (int): maximum crawl depth (to avoid traps) (default: 4)
87 | scrape_callback (function): function to call after each download (default: None)
88 | """
89 | crawl_queue = [start_url]
90 |     # keep track of which URLs have been seen before
91 | seen = {}
92 | data = []
93 | if not robots_url:
94 | robots_url = '{}/robots.txt'.format(start_url)
95 | rp = get_robots_parser(robots_url)
96 | throttle = Throttle(delay)
97 | while crawl_queue:
98 | url = crawl_queue.pop()
99 | # check url passes robots.txt restrictions
100 | if rp.can_fetch(user_agent, url):
101 | depth = seen.get(url, 0)
102 | if depth == max_depth:
103 | print('Skipping %s due to depth' % url)
104 | continue
105 | throttle.wait(url)
106 | html = download(url, user_agent=user_agent, proxy=proxy)
107 | if not html:
108 | continue
109 | if scrape_callback:
110 | data.extend(scrape_callback(url, html) or [])
111 | # filter for links matching our regular expression
112 | for link in get_links(html):
113 | if re.match(link_regex, link):
114 | abs_link = urljoin(start_url, link)
115 | if abs_link not in seen:
116 | seen[abs_link] = depth + 1
117 | crawl_queue.append(abs_link)
118 | else:
119 | print('Blocked by robots.txt:', url)
120 |
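121 | # An illustrative run wiring in the scrape_callback defined above (added as a sketch;
122 | # the start URL and link pattern are placeholders for the book's example country pages):
123 | if __name__ == '__main__':
124 |     link_crawler('http://example.webscraping.com', '/(index|view)/',
125 |                  scrape_callback=scrape_callback)
126 |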
--------------------------------------------------------------------------------
/code/chp4/threaded_crawler.py:
--------------------------------------------------------------------------------
1 | import re
2 | import socket
3 | import threading
4 | import time
5 | from urllib import robotparser
6 | from urllib.parse import urljoin, urlparse
7 | from chp3.downloader import Downloader
8 |
9 | SLEEP_TIME = 1
10 | socket.setdefaulttimeout(60)
11 |
12 |
13 | def get_robots_parser(robots_url):
14 | " Return the robots parser object using the robots_url "
15 | try:
16 | rp = robotparser.RobotFileParser()
17 | rp.set_url(robots_url)
18 | rp.read()
19 | return rp
20 | except Exception as e:
21 | print('Error finding robots_url:', robots_url, e)
22 |
23 |
24 | def get_links(html):
25 | " Return a list of links (using simple regex matching) from the html content "
26 | # a regular expression to extract all links from the webpage
27 |     webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE)
28 | # list of all links from the webpage
29 | return webpage_regex.findall(html)
30 |
31 |
32 | def threaded_crawler(start_url, link_regex, user_agent='wswp', proxies=None,
33 | delay=3, max_depth=4, num_retries=2, cache={}, max_threads=10, scraper_callback=None):
34 | """ Crawl from the given start URLs following links matched by link_regex. In this
35 | implementation, we do not actually scrape any information.
36 |
37 | args:
38 | start_url (str or list of strs): web site(s) to start crawl
39 | link_regex (str): regex to match for links
40 | kwargs:
41 | user_agent (str): user agent (default: wswp)
42 | proxies (list of dicts): a list of possible dicts for http / https proxies
43 | For formatting, see the requests library
44 | delay (int): seconds to throttle between requests to one domain (default: 3)
45 | max_depth (int): maximum crawl depth (to avoid traps) (default: 4)
46 | num_retries (int): # of retries when 5xx error (default: 2)
47 | cache (dict): cache dict with urls as keys and dicts for responses (default: {})
48 | scraper_callback: function to be called on url and html content
49 | """
50 | if isinstance(start_url, list):
51 | crawl_queue = start_url
52 | else:
53 | crawl_queue = [start_url]
54 |     # keep track of which URLs have been seen before
55 | seen, robots = {}, {}
56 | D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies, cache=cache)
57 |
58 | def process_queue():
59 | while crawl_queue:
60 | url = crawl_queue.pop()
61 | no_robots = False
62 | if not url or 'http' not in url:
63 | continue
64 | domain = '{}://{}'.format(urlparse(url).scheme, urlparse(url).netloc)
65 | rp = robots.get(domain)
66 | if not rp and domain not in robots:
67 | robots_url = '{}/robots.txt'.format(domain)
68 | rp = get_robots_parser(robots_url)
69 | if not rp:
70 | # issue finding robots.txt, still crawl
71 | no_robots = True
72 | robots[domain] = rp
73 | elif domain in robots:
74 | no_robots = True
75 | # check url passes robots.txt restrictions
76 | if no_robots or rp.can_fetch(user_agent, url):
77 | depth = seen.get(url, 0)
78 | if depth == max_depth:
79 | print('Skipping %s due to depth' % url)
80 | continue
81 | html = D(url, num_retries=num_retries)
82 | if not html:
83 | continue
84 | if scraper_callback:
85 | links = scraper_callback(url, html) or []
86 | else:
87 | links = []
88 | # filter for links matching our regular expression
89 | for link in get_links(html) + links:
90 | if re.match(link_regex, link):
91 | if 'http' not in link:
92 | if link.startswith('//'):
93 | link = '{}:{}'.format(urlparse(url).scheme, link)
94 | elif link.startswith('://'):
95 | link = '{}{}'.format(urlparse(url).scheme, link)
96 | else:
97 | link = urljoin(domain, link)
98 | if link not in seen:
99 | seen[link] = depth + 1
100 | crawl_queue.append(link)
101 | else:
102 | print('Blocked by robots.txt:', url)
103 |
104 | # wait for all download threads to finish
105 | threads = []
106 | print(max_threads)
107 | while threads or crawl_queue:
108 | for thread in threads:
109 | if not thread.is_alive():
110 | threads.remove(thread)
111 | while len(threads) < max_threads and crawl_queue:
112 | # can start some more threads
113 | thread = threading.Thread(target=process_queue)
114 |             thread.daemon = True  # set daemon so main thread can exit w/ ctrl-c
115 | thread.start()
116 | threads.append(thread)
117 | print(threads)
118 | for thread in threads:
119 | thread.join()
120 |
121 | time.sleep(SLEEP_TIME)
122 |
123 |
124 | if __name__ == '__main__':
125 | from chp4.alexa_callback import AlexaCallback
126 | from chp3.rediscache import RedisCache
127 | import argparse
128 |
129 | parser = argparse.ArgumentParser(description='Threaded link crawler')
130 | parser.add_argument('max_threads', type=int, help='maximum number of threads',
131 | nargs='?', default=5)
132 | parser.add_argument('url_pattern', type=str, help='regex pattern for url matching',
133 | nargs='?', default='$^')
134 | par_args = parser.parse_args()
135 | AC = AlexaCallback()
136 | AC()
137 | start_time = time.time()
138 | threaded_crawler(AC.urls, par_args.url_pattern, cache=RedisCache(),
139 | max_threads=par_args.max_threads)
140 | print('Total time: %ss' % (time.time() - start_time))
141 |
--------------------------------------------------------------------------------
/code/chp4/threaded_crawler_with_queue.py:
--------------------------------------------------------------------------------
1 | import multiprocessing
2 | import re
3 | import socket
4 | import threading
5 | import time
6 | from urllib import robotparser
7 | from urllib.parse import urljoin, urlparse
8 | from chp3.downloader import Downloader
9 | from chp4.redis_queue import RedisQueue
10 |
11 |
12 | SLEEP_TIME = 1
13 | socket.setdefaulttimeout(60)
14 |
15 |
16 | def get_robots_parser(robots_url):
17 | " Return the robots parser object using the robots_url "
18 | try:
19 | rp = robotparser.RobotFileParser()
20 | rp.set_url(robots_url)
21 | rp.read()
22 | return rp
23 | except Exception as e:
24 | print('Error finding robots_url:', robots_url, e)
25 |
26 |
27 | def clean_link(url, domain, link):
28 | if link.startswith('//'):
29 | link = '{}:{}'.format(urlparse(url).scheme, link)
30 | elif link.startswith('://'):
31 | link = '{}{}'.format(urlparse(url).scheme, link)
32 | else:
33 | link = urljoin(domain, link)
34 | return link
35 |
36 |
37 | def get_links(html, link_regex):
38 | " Return a list of links (using simple regex matching) from the html content "
39 | # a regular expression to extract all links from the webpage
40 |     webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE)
41 | # list of all links from the webpage
42 | links = webpage_regex.findall(html)
43 | links = (link for link in links if re.match(link_regex, link))
44 | return links
45 |
46 |
47 | def threaded_crawler_rq(start_url, link_regex, user_agent='wswp', proxies=None,
48 | delay=3, max_depth=4, num_retries=2, cache={}, max_threads=10, scraper_callback=None):
49 | """ Crawl from the given start URLs following links matched by link_regex. In this
50 | implementation, we do not actually scrape any information.
51 |
52 | args:
53 | start_url (str or list of strs): web site(s) to start crawl
54 | link_regex (str): regex to match for links
55 | kwargs:
56 | user_agent (str): user agent (default: wswp)
57 | proxies (list of dicts): a list of possible dicts
58 | for http / https proxies
59 | For formatting, see the requests library
60 | delay (int): seconds to throttle between requests to one domain
61 | (default: 3)
62 | max_depth (int): maximum crawl depth (to avoid traps) (default: 4)
63 | num_retries (int): # of retries when 5xx error (default: 2)
64 | cache (dict): cache dict with urls as keys
65 | and dicts for responses (default: {})
66 | scraper_callback: function to be called on url and html content
67 | """
68 | crawl_queue = RedisQueue()
69 | crawl_queue.push(start_url)
70 |     # track robots.txt parsers per domain (URL/depth tracking lives in the RedisQueue)
71 | robots = {}
72 | D = Downloader(delay=delay, user_agent=user_agent,
73 | proxies=proxies, cache=cache)
74 |
75 | def process_queue():
76 | while len(crawl_queue):
77 | url = crawl_queue.pop()
78 | no_robots = False
79 | if not url or 'http' not in url:
80 | continue
81 | domain = '{}://{}'.format(urlparse(url).scheme,
82 | urlparse(url).netloc)
83 | rp = robots.get(domain)
84 | if not rp and domain not in robots:
85 | robots_url = '{}/robots.txt'.format(domain)
86 | rp = get_robots_parser(robots_url)
87 | if not rp:
88 | # issue finding robots.txt, still crawl
89 | no_robots = True
90 | robots[domain] = rp
91 | elif domain in robots:
92 | no_robots = True
93 | # check url passes robots.txt restrictions
94 | if no_robots or rp.can_fetch(user_agent, url):
95 | depth = crawl_queue.get_depth(url)
96 | if depth == max_depth:
97 | print('Skipping %s due to depth' % url)
98 | continue
99 | html = D(url, num_retries=num_retries)
100 | if not html:
101 | continue
102 | if scraper_callback:
103 | links = scraper_callback(url, html) or []
104 | else:
105 | links = []
106 | # filter for links matching our regular expression
107 | for link in list(get_links(html, link_regex)) + links:
108 | if 'http' not in link:
109 | link = clean_link(url, domain, link)
110 | crawl_queue.push(link)
111 | crawl_queue.set_depth(link, depth + 1)
112 | else:
113 | print('Blocked by robots.txt:', url)
114 |
115 | # wait for all download threads to finish
116 | threads = []
117 | while threads or len(crawl_queue):
118 | for thread in threads:
119 | if not thread.is_alive():
120 | threads.remove(thread)
121 | while len(threads) < max_threads and crawl_queue:
122 | # can start some more threads
123 | thread = threading.Thread(target=process_queue)
124 |             thread.daemon = True  # set daemon so main thread can exit w/ ctrl-c
125 | thread.start()
126 | threads.append(thread)
127 |
128 | for thread in threads:
129 | thread.join()
130 |
131 | time.sleep(SLEEP_TIME)
132 |
133 |
134 | def mp_threaded_crawler(*args, **kwargs):
135 | """ create a multiprocessing threaded crawler """
136 | processes = []
137 | num_procs = kwargs.pop('num_procs')
138 | if not num_procs:
139 | num_procs = multiprocessing.cpu_count()
140 | for _ in range(num_procs):
141 | proc = multiprocessing.Process(target=threaded_crawler_rq,
142 | args=args, kwargs=kwargs)
143 | proc.start()
144 | processes.append(proc)
145 | # wait for processes to complete
146 | for proc in processes:
147 | proc.join()
148 |
149 |
150 | if __name__ == '__main__':
151 | from chp4.alexa_callback import AlexaCallback
152 | from chp3.rediscache import RedisCache
153 | import argparse
154 |
155 | parser = argparse.ArgumentParser(description='Multiprocessing threaded link crawler')
156 | parser.add_argument('max_threads', type=int, help='maximum number of threads',
157 | nargs='?', default=5)
158 | parser.add_argument('num_procs', type=int, help='number of processes',
159 | nargs='?', default=None)
160 | parser.add_argument('url_pattern', type=str, help='regex pattern for url matching',
161 | nargs='?', default='$^')
162 | par_args = parser.parse_args()
163 |
164 | AC = AlexaCallback()
165 | AC()
166 | start_time = time.time()
167 |
168 | mp_threaded_crawler(AC.urls, par_args.url_pattern, cache=RedisCache(),
169 | num_procs=par_args.num_procs, max_threads=par_args.max_threads)
170 | print('Total time: %ss' % (time.time() - start_time))
171 |
--------------------------------------------------------------------------------