├── code ├── __init__.py ├── chp1 │ ├── __init__.py │ ├── downloading_a_page.py │ ├── retrying_downloads.py │ ├── setting_user_agent.py │ ├── throttle.py │ ├── sitemap_crawler.py │ ├── id_iteration_crawler.py │ ├── link_crawler.py │ ├── advanced_link_crawler_using_requests.py │ └── advanced_link_crawler.py ├── chp2 │ ├── __init__.py │ ├── lxml_brokenhtml.py │ ├── xpath_scraper.py │ ├── lxml_scraper.py │ ├── beautifulsoup.py │ ├── family_trees.py │ ├── regex.py │ ├── beautifulsoup_brokenhtml.py │ ├── csv_callback.py │ ├── test_scrapers.py │ ├── all_scrapers.py │ └── advanced_link_crawler.py ├── chp3 │ ├── __init__.py │ ├── url_parsing.py │ ├── rediscache.py │ ├── requests_cache_link_crawler.py │ ├── advanced_link_crawler.py │ ├── downloader.py │ ├── downloader_requests_cache.py │ └── diskcache.py ├── chp6 │ ├── __init__.py │ ├── login_form_requests.py │ ├── submit_login_form.py │ ├── mechanize_form.py │ ├── login.py │ ├── firefox_sessions.py │ ├── selenium_forms.py │ └── edit.py ├── chp7 │ ├── __init__.py │ ├── investigate_form.py │ ├── image_processing.py │ ├── test_samples.py │ ├── register_with_ocr.py │ ├── register_with_api.py │ ├── using_captcha_api.py │ └── captcha_api.py ├── chp8 │ ├── __init__.py │ └── example │ │ ├── example │ │ ├── __init__.py │ │ ├── spiders │ │ │ ├── __init__.py │ │ │ └── country.py │ │ ├── items.py │ │ ├── pipelines.py │ │ ├── middlewares.py │ │ └── settings.py │ │ └── scrapy.cfg ├── chp9 │ ├── __init__.py │ ├── gap_scraper_callback.py │ ├── facebook_graph.py │ ├── scrape_google.py │ ├── facebook_selenium.py │ └── bmw_scraper.py ├── example_config.cfg ├── chp5 │ ├── lxml_attempt.py │ ├── pyqt_search_browser_render.py │ ├── json_one_req.py │ ├── selenium_search.py │ ├── pyqt_webkit.py │ ├── json_scraper.py │ ├── pyqt_search.py │ └── browser_render.py └── chp4 │ ├── extract_list.py │ ├── alexa_callback.py │ ├── redis_queue.py │ ├── advanced_link_crawler.py │ ├── threaded_crawler.py │ └── threaded_crawler_with_queue.py ├── data ├── captcha_samples │ ├── sample1.png │ ├── sample10.png │ ├── sample100.png │ ├── sample11.png │ ├── sample12.png │ ├── sample13.png │ ├── sample14.png │ ├── sample15.png │ ├── sample16.png │ ├── sample17.png │ ├── sample18.png │ ├── sample19.png │ ├── sample2.png │ ├── sample20.png │ ├── sample21.png │ ├── sample22.png │ ├── sample23.png │ ├── sample24.png │ ├── sample25.png │ ├── sample26.png │ ├── sample27.png │ ├── sample28.png │ ├── sample29.png │ ├── sample3.png │ ├── sample30.png │ ├── sample31.png │ ├── sample32.png │ ├── sample33.png │ ├── sample34.png │ ├── sample35.png │ ├── sample36.png │ ├── sample37.png │ ├── sample38.png │ ├── sample39.png │ ├── sample4.png │ ├── sample40.png │ ├── sample41.png │ ├── sample42.png │ ├── sample43.png │ ├── sample44.png │ ├── sample45.png │ ├── sample46.png │ ├── sample47.png │ ├── sample48.png │ ├── sample49.png │ ├── sample5.png │ ├── sample50.png │ ├── sample51.png │ ├── sample52.png │ ├── sample53.png │ ├── sample54.png │ ├── sample55.png │ ├── sample56.png │ ├── sample57.png │ ├── sample58.png │ ├── sample59.png │ ├── sample6.png │ ├── sample60.png │ ├── sample61.png │ ├── sample62.png │ ├── sample63.png │ ├── sample64.png │ ├── sample65.png │ ├── sample66.png │ ├── sample67.png │ ├── sample68.png │ ├── sample69.png │ ├── sample7.png │ ├── sample70.png │ ├── sample71.png │ ├── sample72.png │ ├── sample73.png │ ├── sample74.png │ ├── sample75.png │ ├── sample76.png │ ├── sample77.png │ ├── sample78.png │ ├── sample79.png │ ├── sample8.png │ ├── sample80.png │ ├── sample81.png │ ├── 
sample82.png │ ├── sample83.png │ ├── sample84.png │ ├── sample85.png │ ├── sample86.png │ ├── sample87.png │ ├── sample88.png │ ├── sample89.png │ ├── sample9.png │ ├── sample90.png │ ├── sample91.png │ ├── sample92.png │ ├── sample93.png │ ├── sample94.png │ ├── sample95.png │ ├── sample96.png │ ├── sample97.png │ ├── sample98.png │ ├── sample99.png │ └── samples.csv └── .gitignore ├── .gitignore └── README.md /code/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /code/chp1/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /code/chp2/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /code/chp3/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /code/chp6/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /code/chp7/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /code/chp8/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /code/chp9/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /code/chp8/example/example/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /code/example_config.cfg: -------------------------------------------------------------------------------- 1 | [captcha_api] 2 | key=ERU285FKDSL28311 3 | -------------------------------------------------------------------------------- /data/captcha_samples/sample1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample1.png -------------------------------------------------------------------------------- /data/captcha_samples/sample10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample10.png -------------------------------------------------------------------------------- /data/captcha_samples/sample100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample100.png -------------------------------------------------------------------------------- /data/captcha_samples/sample11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample11.png -------------------------------------------------------------------------------- /data/captcha_samples/sample12.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample12.png -------------------------------------------------------------------------------- /data/captcha_samples/sample13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample13.png -------------------------------------------------------------------------------- /data/captcha_samples/sample14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample14.png -------------------------------------------------------------------------------- /data/captcha_samples/sample15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample15.png -------------------------------------------------------------------------------- /data/captcha_samples/sample16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample16.png -------------------------------------------------------------------------------- /data/captcha_samples/sample17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample17.png -------------------------------------------------------------------------------- /data/captcha_samples/sample18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample18.png -------------------------------------------------------------------------------- /data/captcha_samples/sample19.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample19.png -------------------------------------------------------------------------------- /data/captcha_samples/sample2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample2.png -------------------------------------------------------------------------------- /data/captcha_samples/sample20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample20.png -------------------------------------------------------------------------------- /data/captcha_samples/sample21.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample21.png -------------------------------------------------------------------------------- /data/captcha_samples/sample22.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample22.png -------------------------------------------------------------------------------- /data/captcha_samples/sample23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample23.png 
-------------------------------------------------------------------------------- /data/captcha_samples/sample24.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample24.png -------------------------------------------------------------------------------- /data/captcha_samples/sample25.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample25.png -------------------------------------------------------------------------------- /data/captcha_samples/sample26.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample26.png -------------------------------------------------------------------------------- /data/captcha_samples/sample27.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample27.png -------------------------------------------------------------------------------- /data/captcha_samples/sample28.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample28.png -------------------------------------------------------------------------------- /data/captcha_samples/sample29.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample29.png -------------------------------------------------------------------------------- /data/captcha_samples/sample3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample3.png -------------------------------------------------------------------------------- /data/captcha_samples/sample30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample30.png -------------------------------------------------------------------------------- /data/captcha_samples/sample31.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample31.png -------------------------------------------------------------------------------- /data/captcha_samples/sample32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample32.png -------------------------------------------------------------------------------- /data/captcha_samples/sample33.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample33.png -------------------------------------------------------------------------------- /data/captcha_samples/sample34.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample34.png -------------------------------------------------------------------------------- /data/captcha_samples/sample35.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample35.png -------------------------------------------------------------------------------- /data/captcha_samples/sample36.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample36.png -------------------------------------------------------------------------------- /data/captcha_samples/sample37.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample37.png -------------------------------------------------------------------------------- /data/captcha_samples/sample38.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample38.png -------------------------------------------------------------------------------- /data/captcha_samples/sample39.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample39.png -------------------------------------------------------------------------------- /data/captcha_samples/sample4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample4.png -------------------------------------------------------------------------------- /data/captcha_samples/sample40.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample40.png -------------------------------------------------------------------------------- /data/captcha_samples/sample41.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample41.png -------------------------------------------------------------------------------- /data/captcha_samples/sample42.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample42.png -------------------------------------------------------------------------------- /data/captcha_samples/sample43.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample43.png -------------------------------------------------------------------------------- /data/captcha_samples/sample44.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample44.png -------------------------------------------------------------------------------- /data/captcha_samples/sample45.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample45.png -------------------------------------------------------------------------------- /data/captcha_samples/sample46.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample46.png 
-------------------------------------------------------------------------------- /data/captcha_samples/sample47.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample47.png -------------------------------------------------------------------------------- /data/captcha_samples/sample48.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample48.png -------------------------------------------------------------------------------- /data/captcha_samples/sample49.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample49.png -------------------------------------------------------------------------------- /data/captcha_samples/sample5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample5.png -------------------------------------------------------------------------------- /data/captcha_samples/sample50.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample50.png -------------------------------------------------------------------------------- /data/captcha_samples/sample51.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample51.png -------------------------------------------------------------------------------- /data/captcha_samples/sample52.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample52.png -------------------------------------------------------------------------------- /data/captcha_samples/sample53.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample53.png -------------------------------------------------------------------------------- /data/captcha_samples/sample54.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample54.png -------------------------------------------------------------------------------- /data/captcha_samples/sample55.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample55.png -------------------------------------------------------------------------------- /data/captcha_samples/sample56.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample56.png -------------------------------------------------------------------------------- /data/captcha_samples/sample57.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample57.png -------------------------------------------------------------------------------- /data/captcha_samples/sample58.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample58.png -------------------------------------------------------------------------------- /data/captcha_samples/sample59.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample59.png -------------------------------------------------------------------------------- /data/captcha_samples/sample6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample6.png -------------------------------------------------------------------------------- /data/captcha_samples/sample60.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample60.png -------------------------------------------------------------------------------- /data/captcha_samples/sample61.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample61.png -------------------------------------------------------------------------------- /data/captcha_samples/sample62.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample62.png -------------------------------------------------------------------------------- /data/captcha_samples/sample63.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample63.png -------------------------------------------------------------------------------- /data/captcha_samples/sample64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample64.png -------------------------------------------------------------------------------- /data/captcha_samples/sample65.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample65.png -------------------------------------------------------------------------------- /data/captcha_samples/sample66.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample66.png -------------------------------------------------------------------------------- /data/captcha_samples/sample67.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample67.png -------------------------------------------------------------------------------- /data/captcha_samples/sample68.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample68.png -------------------------------------------------------------------------------- /data/captcha_samples/sample69.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample69.png 
-------------------------------------------------------------------------------- /data/captcha_samples/sample7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample7.png -------------------------------------------------------------------------------- /data/captcha_samples/sample70.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample70.png -------------------------------------------------------------------------------- /data/captcha_samples/sample71.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample71.png -------------------------------------------------------------------------------- /data/captcha_samples/sample72.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample72.png -------------------------------------------------------------------------------- /data/captcha_samples/sample73.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample73.png -------------------------------------------------------------------------------- /data/captcha_samples/sample74.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample74.png -------------------------------------------------------------------------------- /data/captcha_samples/sample75.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample75.png -------------------------------------------------------------------------------- /data/captcha_samples/sample76.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample76.png -------------------------------------------------------------------------------- /data/captcha_samples/sample77.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample77.png -------------------------------------------------------------------------------- /data/captcha_samples/sample78.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample78.png -------------------------------------------------------------------------------- /data/captcha_samples/sample79.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample79.png -------------------------------------------------------------------------------- /data/captcha_samples/sample8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample8.png -------------------------------------------------------------------------------- /data/captcha_samples/sample80.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample80.png -------------------------------------------------------------------------------- /data/captcha_samples/sample81.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample81.png -------------------------------------------------------------------------------- /data/captcha_samples/sample82.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample82.png -------------------------------------------------------------------------------- /data/captcha_samples/sample83.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample83.png -------------------------------------------------------------------------------- /data/captcha_samples/sample84.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample84.png -------------------------------------------------------------------------------- /data/captcha_samples/sample85.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample85.png -------------------------------------------------------------------------------- /data/captcha_samples/sample86.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample86.png -------------------------------------------------------------------------------- /data/captcha_samples/sample87.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample87.png -------------------------------------------------------------------------------- /data/captcha_samples/sample88.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample88.png -------------------------------------------------------------------------------- /data/captcha_samples/sample89.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample89.png -------------------------------------------------------------------------------- /data/captcha_samples/sample9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample9.png -------------------------------------------------------------------------------- /data/captcha_samples/sample90.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample90.png -------------------------------------------------------------------------------- /data/captcha_samples/sample91.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample91.png 
-------------------------------------------------------------------------------- /data/captcha_samples/sample92.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample92.png -------------------------------------------------------------------------------- /data/captcha_samples/sample93.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample93.png -------------------------------------------------------------------------------- /data/captcha_samples/sample94.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample94.png -------------------------------------------------------------------------------- /data/captcha_samples/sample95.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample95.png -------------------------------------------------------------------------------- /data/captcha_samples/sample96.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample96.png -------------------------------------------------------------------------------- /data/captcha_samples/sample97.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample97.png -------------------------------------------------------------------------------- /data/captcha_samples/sample98.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample98.png -------------------------------------------------------------------------------- /data/captcha_samples/sample99.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kjam/wswp/HEAD/data/captcha_samples/sample99.png -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except these files 4 | !captcha_samples/ 5 | !captcha_samples/* 6 | !.gitignore 7 | -------------------------------------------------------------------------------- /code/chp8/example/example/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /code/chp5/lxml_attempt.py: -------------------------------------------------------------------------------- 1 | from lxml.html import fromstring 2 | from chp3.downloader import Downloader 3 | 4 | D = Downloader() 5 | html = D('http://example.webscraping.com/search') 6 | tree = fromstring(html) 7 | tree.cssselect('div#results a') 8 | -------------------------------------------------------------------------------- /code/chp2/lxml_brokenhtml.py: -------------------------------------------------------------------------------- 1 | from lxml.html import fromstring, tostring 2 | 3 | broken_html = '<ul class=country><li>Area<li>Population</ul>' 4 | 5 | tree = fromstring(broken_html) # parse the HTML 6 | fixed_html = tostring(tree, pretty_print=True) 7 | print(fixed_html) 8 | -------------------------------------------------------------------------------- /code/chp7/investigate_form.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from chp6.login import parse_form 3 | 4 | REGISTER_URL = 'http://example.webscraping.com/user/register' 5 | 6 | session = requests.Session() 7 | 8 | html = session.get(REGISTER_URL) 9 | form = parse_form(html.content) 10 | print(form) 11 | -------------------------------------------------------------------------------- /code/chp6/login_form_requests.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | LOGIN_URL = 'http://example.webscraping.com/user/login' 4 | LOGIN_EMAIL = 'example@webscraping.com' 5 | LOGIN_PASSWORD = 'example' 6 | data = {'email': LOGIN_EMAIL, 'password': LOGIN_PASSWORD} 7 | 8 | response = requests.post(LOGIN_URL, data) 9 | print(response.url) 10 | -------------------------------------------------------------------------------- /code/chp8/example/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = example.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = example 12 | -------------------------------------------------------------------------------- /code/chp8/example/example/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class CountryItem(scrapy.Item): 12 | name = scrapy.Field() 13 | population = scrapy.Field() 14 | -------------------------------------------------------------------------------- /code/chp2/xpath_scraper.py: -------------------------------------------------------------------------------- 1 | from lxml.html import fromstring 2 | from chp1.advanced_link_crawler import download 3 | 4 | url = 'http://example.webscraping.com/places/default/view/UnitedKingdom-239' 5 | html = download(url) 6 | 7 | tree = fromstring(html) 8 | area = tree.xpath('//tr[@id="places_area__row"]/td[@class="w2p_fw"]/text()')[0] 9 | print(area) 10 | -------------------------------------------------------------------------------- /code/chp2/lxml_scraper.py: -------------------------------------------------------------------------------- 1 | from lxml.html import fromstring 2 | from
chp1.advanced_link_crawler import download 3 | 4 | url = 'http://example.webscraping.com/places/default/view/UnitedKingdom-239' 5 | html = download(url) 6 | 7 | tree = fromstring(html) 8 | td = tree.cssselect('tr#places_area__row > td.w2p_fw')[0] 9 | area = td.text_content() 10 | print(area) 11 | -------------------------------------------------------------------------------- /code/chp8/example/example/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class ExamplePipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /code/chp1/downloading_a_page.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | from urllib.error import URLError, HTTPError, ContentTooShortError 3 | 4 | 5 | def download(url): 6 | print('Downloading:', url) 7 | try: 8 | html = urllib.request.urlopen(url).read() 9 | except (URLError, HTTPError, ContentTooShortError) as e: 10 | print('Download error:', e.reason) 11 | html = None 12 | return html 13 | -------------------------------------------------------------------------------- /code/chp5/pyqt_search_browser_render.py: -------------------------------------------------------------------------------- 1 | from chp5.browser_render import BrowserRender 2 | 3 | br = BrowserRender() 4 | br.download('http://example.webscraping.com/search') 5 | br.attr('#search_term', 'value', '.') 6 | br.text('#page_size option:checked', '1000') 7 | br.click('#search') 8 | elements = br.wait_load('#results a') 9 | 10 | countries = [e.toPlainText().strip() for e in elements] 11 | print(countries) 12 | -------------------------------------------------------------------------------- /code/chp9/gap_scraper_callback.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from lxml import etree 3 | 4 | def scrape_callback(url, html): 5 | if url.endswith('.xml'): 6 | # Parse the sitemap XML file 7 | resp = requests.get(url) 8 | tree = etree.fromstring(resp.content) 9 | links = [e[0].text for e in tree] 10 | return links 11 | else: 12 | # Add scraping code here 13 | pass 14 | -------------------------------------------------------------------------------- /code/chp2/beautifulsoup.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | from chp1.advanced_link_crawler import download 3 | 4 | url = 'http://example.webscraping.com/places/default/view/UnitedKingdom-239' 5 | html = download(url) 6 | soup = BeautifulSoup(html, 'html5lib') 7 | 8 | # locate the area row 9 | tr = soup.find(attrs={'id': 'places_area__row'}) 10 | td = tr.find(attrs={'class': 'w2p_fw'}) # locate the data 11 | area = td.text # extract the data 12 | print(area) 13 | -------------------------------------------------------------------------------- /code/chp6/submit_login_form.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlencode 2 | from urllib.request import Request, urlopen 3 | 4 | LOGIN_URL = 'http://example.webscraping.com/user/login' 5 | LOGIN_EMAIL = 'example@webscraping.com' 6 | LOGIN_PASSWORD = 'example' 7 | 
data = {'email': LOGIN_EMAIL, 'password': LOGIN_PASSWORD} 8 | encoded_data = urlencode(data) 9 | request = Request(LOGIN_URL, encoded_data.encode('utf-8')) 10 | response = urlopen(request) 11 | print(response.geturl()) 12 | -------------------------------------------------------------------------------- /code/chp2/family_trees.py: -------------------------------------------------------------------------------- 1 | from lxml.html import fromstring 2 | from chp1.advanced_link_crawler import download 3 | 4 | url = 'http://example.webscraping.com/places/default/view/UnitedKingdom-239' 5 | html = download(url) 6 | 7 | tree = fromstring(html) 8 | table = tree.xpath('//table')[0] 9 | 10 | print('Children:', table.getchildren()) 11 | print('Parent:', table.getparent()) 12 | print('Previous Sibling:', table.getprevious()) 13 | print('Next Sibling:', table.getnext()) 14 | print('All Siblings:', list(table.itersiblings())) 15 | -------------------------------------------------------------------------------- /code/chp4/extract_list.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from zipfile import ZipFile 3 | from io import TextIOWrapper, BytesIO 4 | import requests 5 | 6 | resp = requests.get('http://s3.amazonaws.com/alexa-static/top-1m.csv.zip', stream=True) 7 | urls = [] # top 1 million URL's will be stored in this list 8 | with ZipFile(BytesIO(resp.content)) as zf: 9 | csv_filename = zf.namelist()[0] 10 | with zf.open(csv_filename) as csv_file: 11 | for _, website in csv.reader(TextIOWrapper(csv_file)): 12 | urls.append('http://' + website) 13 | -------------------------------------------------------------------------------- /code/chp5/json_one_req.py: -------------------------------------------------------------------------------- 1 | from csv import DictWriter 2 | import requests 3 | 4 | 5 | PAGE_SIZE = 1000 6 | 7 | template_url = 'http://example.webscraping.com/ajax/' + \ 8 | 'search.json?page=0&page_size={}&search_term=.' 
9 | 10 | resp = requests.get(template_url.format(PAGE_SIZE)) 11 | data = resp.json() 12 | records = data.get('records') 13 | 14 | with open('../data/countries.csv', 'w') as countries_file: 15 | wrtr = DictWriter(countries_file, fieldnames=records[0].keys()) 16 | wrtr.writeheader() 17 | wrtr.writerows(records) 18 | -------------------------------------------------------------------------------- /code/chp5/selenium_search.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | 3 | driver = webdriver.Firefox() 4 | driver.get('http://example.webscraping.com/search') 5 | driver.find_element_by_id('search_term').send_keys('.') 6 | js = "document.getElementById('page_size').options[1].text = '1000';" 7 | driver.execute_script(js) 8 | driver.find_element_by_id('search').click() 9 | driver.implicitly_wait(30) 10 | links = driver.find_elements_by_css_selector('#results a') 11 | countries = [link.text for link in links] 12 | print(countries) 13 | 14 | driver.close() 15 | -------------------------------------------------------------------------------- /code/chp6/mechanize_form.py: -------------------------------------------------------------------------------- 1 | import mechanize 2 | 3 | LOGIN_URL = 'http://example.webscraping.com/user/login' 4 | LOGIN_EMAIL = 'example@webscraping.com' 5 | LOGIN_PASSWORD = 'example' 6 | COUNTRY_URL = 'http://example.webscraping.com/places/default/edit/United-Kingdom-239' 7 | 8 | 9 | br = mechanize.Browser() 10 | br.open(LOGIN_URL) 11 | br.select_form(nr=0) 12 | br['email'] = LOGIN_EMAIL 13 | br['password'] = LOGIN_PASSWORD 14 | response = br.submit() 15 | br.open(COUNTRY_URL) 16 | br.select_form(nr=0) 17 | br['population'] = str(int(br['population']) + 1) 18 | br.submit() 19 | -------------------------------------------------------------------------------- /code/chp2/regex.py: -------------------------------------------------------------------------------- 1 | import re 2 | from chp1.advanced_link_crawler import download 3 | 4 | url = 'http://example.webscraping.com/places/default/view/UnitedKingdom-239' 5 | html = download(url) 6 | 7 | print(re.findall(r'<td class="w2p_fw">(.*?)</td>', html)) 8 | 9 | print(re.findall('<td class="w2p_fw">(.*?)</td>', html)[1]) 10 | 11 | print(re.findall('<tr id="places_area__row"><td class="w2p_fw">(.*?)</td>', html)) 12 | 13 | print(re.findall('''<tr id="places_area__row">.*?<td\s*class=["\']w2p_fw["\']>(.*?)</td>''', html)) 14 | -------------------------------------------------------------------------------- /code/chp1/retrying_downloads.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | from urllib.error import URLError, HTTPError, ContentTooShortError 3 | 4 | 5 | def download(url, num_retries=2): 6 | print('Downloading:', url) 7 | try: 8 | html = urllib.request.urlopen(url).read() 9 | except (URLError, HTTPError, ContentTooShortError) as e: 10 | print('Download error:', e.reason) 11 | html = None 12 | if num_retries > 0: 13 | if hasattr(e, 'code') and 500 <= e.code < 600: 14 | # recursively retry 5xx HTTP errors 15 | return download(url, num_retries - 1) 16 | return html 17 | -------------------------------------------------------------------------------- /code/chp9/facebook_graph.py: -------------------------------------------------------------------------------- 1 | from facebook import GraphAPI 2 | from configparser import ConfigParser 3 | 4 | 5 | def get_page_details(access_token, page): 6 | graph = GraphAPI(access_token, version='2.7') 7 | return graph.get_object(page, fields='about,events,feed,picture') 8 | 9 | 10 | if __name__ == '__main__': 11 |
config = ConfigParser() 12 | # This script assumes you have the following config 13 | # set up with a section facebook and key access_token 14 | config.read('../../config/api.cfg') 15 | access_token = config.get('facebook', 'access_token') 16 | print(get_page_details(access_token, 'PacktPub')) 17 | -------------------------------------------------------------------------------- /code/chp5/pyqt_webkit.py: -------------------------------------------------------------------------------- 1 | import lxml.html 2 | try: 3 | from PySide.QtGui import * 4 | from PySide.QtCore import * 5 | from PySide.QtWebKit import * 6 | except ImportError: 7 | from PyQt4.QtGui import * 8 | from PyQt4.QtCore import * 9 | from PyQt4.QtWebKit import * 10 | 11 | url = 'http://example.webscraping.com/dynamic' 12 | app = QApplication([]) 13 | webview = QWebView() 14 | loop = QEventLoop() 15 | webview.loadFinished.connect(loop.quit) 16 | webview.load(QUrl(url)) 17 | loop.exec_() 18 | html = webview.page().mainFrame().toHtml() 19 | tree = lxml.html.fromstring(html) 20 | print(tree.cssselect('#result')[0].text_content()) 21 | -------------------------------------------------------------------------------- /code/chp2/beautifulsoup_brokenhtml.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | from chp1.advanced_link_crawler import download 3 | 4 | broken_html = '<ul class=country><li>Area<li>Population</ul>' 5 | 6 | soup = BeautifulSoup(broken_html, 'html.parser') 7 | fixed_html = soup.prettify() 8 | print(fixed_html) 9 | 10 | # still broken, so try a different parser 11 | 12 | soup = BeautifulSoup(broken_html, 'html5lib') 13 | fixed_html = soup.prettify() 14 | print(fixed_html) 15 | 16 | # now we can try and extract the data from the html 17 | 18 | ul = soup.find('ul', attrs={'class': 'country'}) 19 | print(ul.find('li')) # returns just the first match 20 | print(ul.find_all('li')) # returns all matches 21 | -------------------------------------------------------------------------------- /code/chp1/setting_user_agent.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | from urllib.error import URLError, HTTPError, ContentTooShortError 3 | 4 | 5 | def download(url, num_retries=2, user_agent='wswp'): 6 | print('Downloading:', url) 7 | request = urllib.request.Request(url) 8 | request.add_header('User-agent', user_agent) 9 | try: 10 | html = urllib.request.urlopen(request).read() 11 | except (URLError, HTTPError, ContentTooShortError) as e: 12 | print('Download error:', e.reason) 13 | html = None 14 | if num_retries > 0: 15 | if hasattr(e, 'code') and 500 <= e.code < 600: 16 | # recursively retry 5xx HTTP errors, keeping the same user agent 17 | return download(url, num_retries - 1, user_agent) 18 | return html 19 | -------------------------------------------------------------------------------- /code/chp9/scrape_google.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from urllib.parse import parse_qs, urlparse 3 | from lxml.html import fromstring 4 | 5 | # get results from search 6 | html = requests.get('https://www.google.com/search?q=test') 7 | tree = fromstring(html.content) 8 | results = tree.cssselect('h3.r a') 9 | print(results) 10 | 11 | # grab the first link 12 | link = results[0].get('href') 13 | print(link) 14 | 15 | # parse the destination url from the querystring 16 | qs = urlparse(link).query 17 | parsed_qs = parse_qs(qs) 18 | print(parsed_qs) 19 | print(parsed_qs.get('q', [])) 20 | 21 | 22 | # as one list
23 | links = [] 24 | for result in results: 25 | link = result.get('href') 26 | qs = urlparse(link).query 27 | links.extend(parse_qs(qs).get('q', [])) 28 | 29 | print(links) 30 | -------------------------------------------------------------------------------- /code/chp7/image_processing.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | from lxml.html import fromstring 3 | from PIL import Image 4 | import base64 5 | 6 | 7 | def get_b64_string(html): 8 | tree = fromstring(html) 9 | img_data = tree.cssselect('div#recaptcha img')[0].get('src') 10 | img_data = img_data.partition(',')[-1] 11 | return img_data 12 | 13 | 14 | def get_captcha_img(html): 15 | tree = fromstring(html) 16 | img_data = tree.cssselect('div#recaptcha img')[0].get('src') 17 | img_data = img_data.partition(',')[-1] 18 | binary_img_data = base64.b64decode(img_data) 19 | img = Image.open(BytesIO(binary_img_data)) 20 | return img 21 | 22 | 23 | def img_to_bw(img): 24 | gray = img.convert('L') 25 | bw = gray.point(lambda x: 0 if x < 1 else 255, '1') 26 | return bw 27 | -------------------------------------------------------------------------------- /code/chp9/facebook_selenium.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | 3 | 4 | def get_driver(): 5 | try: 6 | return webdriver.PhantomJS() 7 | except: 8 | return webdriver.Firefox() 9 | 10 | 11 | def facebook(username, password, url): 12 | driver = get_driver() 13 | driver.get('https://facebook.com') 14 | driver.find_element_by_id('email').send_keys(username) 15 | driver.find_element_by_id('pass').send_keys(password) 16 | driver.find_element_by_id('loginbutton').submit() 17 | driver.implicitly_wait(30) 18 | # wait until the search box is available, 19 | # which means it has successfully logged in 20 | search = driver.find_element_by_name('q') 21 | # now logged in so can go to the page of interest 22 | driver.get(url) 23 | # add code to scrape data of interest here ... 
24 | -------------------------------------------------------------------------------- /code/chp4/alexa_callback.py: -------------------------------------------------------------------------------- 1 | import csv 2 | from zipfile import ZipFile 3 | from io import TextIOWrapper, BytesIO 4 | import requests 5 | 6 | 7 | class AlexaCallback: 8 | def __init__(self, max_urls=500): 9 | self.max_urls = max_urls 10 | self.seed_url = 'http://s3.amazonaws.com/alexa-static/top-1m.csv.zip' 11 | self.urls = [] 12 | 13 | def __call__(self): 14 | resp = requests.get(self.seed_url, stream=True) 15 | with ZipFile(BytesIO(resp.content)) as zf: 16 | csv_filename = zf.namelist()[0] 17 | with zf.open(csv_filename) as csv_file: 18 | for _, website in csv.reader(TextIOWrapper(csv_file)): 19 | self.urls.append('http://' + website) 20 | if len(self.urls) == self.max_urls: 21 | break 22 | -------------------------------------------------------------------------------- /code/chp9/bmw_scraper.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | import requests 4 | 5 | 6 | url = 'https://c2b-services.bmw.com/c2b-localsearch/services/api/v3/clients/BMWDIGITAL_DLO/DE/pois?country=DE&category=BM&maxResults=%d&language=en&lat=52.507537768880056&lng=13.425269635701511' 7 | jsonp = requests.get(url % 1000) 8 | pure_json = jsonp.text[jsonp.text.index('(') + 1: jsonp.text.rindex(')')] 9 | dealers = json.loads(pure_json) 10 | print(dealers.keys()) 11 | print(dealers['count']) 12 | print(dealers['data']['pois'][0]) 13 | 14 | with open('../../data/bmw.csv', 'w') as fp: 15 | writer = csv.writer(fp) 16 | writer.writerow(['Name', 'Latitude', 'Longitude']) 17 | for dealer in dealers['data']['pois']: 18 | name = dealer['name'] 19 | lat, lng = dealer['lat'], dealer['lng'] 20 | writer.writerow([name, lat, lng]) 21 | -------------------------------------------------------------------------------- /code/chp5/json_scraper.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import string 3 | 4 | PAGE_SIZE = 10 5 | 6 | template_url = 'http://example.webscraping.com/ajax/' + \ 7 | 'search.json?page={}&page_size={}&search_term={}' 8 | 9 | countries = set() 10 | 11 | for letter in string.ascii_lowercase: 12 | print('Searching with %s' % letter) 13 | page = 0 14 | while True: 15 | resp = requests.get(template_url.format(page, PAGE_SIZE, letter)) 16 | data = resp.json() 17 | print('adding %d more records from page %d' % 18 | (len(data.get('records')), page)) 19 | for record in data.get('records'): 20 | countries.add(record['country']) 21 | page += 1 22 | if page >= data['num_pages']: 23 | break 24 | 25 | with open('../data/countries.txt', 'w') as countries_file: 26 | countries_file.write('\n'.join(sorted(countries))) 27 | -------------------------------------------------------------------------------- /code/chp2/csv_callback.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import re 3 | from lxml.html import fromstring 4 | 5 | 6 | class CsvCallback: 7 | def __init__(self): 8 | self.writer = csv.writer(open('../data/countries.csv', 'w')) 9 | self.fields = ('area', 'population', 'iso', 'country', 'capital', 10 | 'continent', 'tld', 'currency_code', 'currency_name', 11 | 'phone', 'postal_code_format', 'postal_code_regex', 12 | 'languages', 'neighbours') 13 | self.writer.writerow(self.fields) 14 | 15 | def __call__(self, url, html): 16 | if re.search('/view/', url): 17 | tree 
= fromstring(html) 18 | all_rows = [ 19 | tree.xpath('//tr[@id="places_%s__row"]/td[@class="w2p_fw"]' % field)[0].text_content() 20 | for field in self.fields] 21 | self.writer.writerow(all_rows) 22 | -------------------------------------------------------------------------------- /code/chp1/throttle.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse 2 | import time 3 | 4 | 5 | class Throttle: 6 | """ Add a delay between downloads to the same domain 7 | """ 8 | def __init__(self, delay): 9 | # amount of delay between downloads for each domain 10 | self.delay = delay 11 | # timestamp of when a domain was last accessed 12 | self.domains = {} 13 | 14 | def wait(self, url): 15 | domain = urlparse(url).netloc 16 | last_accessed = self.domains.get(domain) 17 | 18 | if self.delay > 0 and last_accessed is not None: 19 | sleep_secs = self.delay - (time.time() - last_accessed) 20 | if sleep_secs > 0: 21 | # domain has been accessed recently 22 | # so need to sleep 23 | time.sleep(sleep_secs) 24 | # update the last accessed time 25 | self.domains[domain] = time.time() 26 | -------------------------------------------------------------------------------- /code/chp3/url_parsing.py: -------------------------------------------------------------------------------- 1 | import re 2 | from urllib.parse import urlsplit 3 | 4 | # how to manage converting urls into filenames 5 | 6 | url = 'http://example.webscraping.com/places/default/view/Australia-1' 7 | filename = re.sub('[^/0-9a-zA-Z\-.,;_ ]', '_', url) 8 | filename = '/'.join(segment[:255] for segment in filename.split('/')) 9 | print(filename) 10 | 11 | # how to handle edge case where we need to append index.html for parent urls 12 | # such as http://example.webscraping.com/index/ 13 | 14 | components = urlsplit('http://example.webscraping.com/index/') 15 | print(components) 16 | print(components.path) 17 | path = components.path 18 | if not path: 19 | path = '/index.html' 20 | elif path.endswith('/'): 21 | path += 'index.html' 22 | filename = components.netloc + path + components.query 23 | filename = re.sub('[^/0-9a-zA-Z\-.,;_ ]', '_', filename) 24 | filename = '/'.join(segment[:255] for segment in filename.split('/')) 25 | print(filename) 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | *~ 60 | */.*~ 61 | .*/ 62 | *.rdb 63 | config/ 64 | -------------------------------------------------------------------------------- /code/chp8/example/example/spiders/country.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.linkextractors import LinkExtractor 4 | from scrapy.spiders import CrawlSpider, Rule 5 | from example.items import CountryItem 6 | 7 | 8 | class CountrySpider(CrawlSpider): 9 | name = 'country' 10 | allowed_domains = ['example.webscraping.com'] 11 | start_urls = ['http://example.webscraping.com/'] 12 | 13 | rules = ( 14 | Rule(LinkExtractor(allow=r'/index/', deny=r'/user/'), 15 | follow=True), 16 | Rule(LinkExtractor(allow=r'/view/', deny=r'/user/'), 17 | callback='parse_item'), 18 | ) 19 | 20 | def parse_item(self, response): 21 | item = CountryItem() 22 | name_css = 'tr#places_country__row td.w2p_fw::text' 23 | item['name'] = response.css(name_css).extract() 24 | pop_xpath = '//tr[@id="places_population__row"]/td[@class="w2p_fw"]/text()' 25 | item['population'] = response.xpath(pop_xpath).extract() 26 | return item 27 | -------------------------------------------------------------------------------- /code/chp2/test_scrapers.py: -------------------------------------------------------------------------------- 1 | import time 2 | import re 3 | from chp2.all_scrapers import re_scraper, bs_scraper, \ 4 | lxml_scraper, lxml_xpath_scraper 5 | from chp1.advanced_link_crawler import download 6 | 7 | NUM_ITERATIONS = 1000 # number of times to test each scraper 8 | html = download('http://example.webscraping.com/places/view/United-Kingdom-239') 9 | 10 | scrapers = [ 11 | ('Regular expressions', re_scraper), 12 | ('BeautifulSoup', bs_scraper), 13 | ('Lxml', lxml_scraper), 14 | ('Xpath', lxml_xpath_scraper)] 15 | 16 | for name, scraper in scrapers: 17 | # record start time of scrape 18 | start = time.time() 19 | for i in range(NUM_ITERATIONS): 20 | if scraper == re_scraper: 21 | re.purge() 22 | result = scraper(html) 23 | # check scraped result is as expected 24 | assert result['area'] == '244,820 square kilometres' 25 | # record end time of scrape and output the total 26 | end = time.time() 27 | print('%s: %.2f seconds' % (name, end - start)) 28 | -------------------------------------------------------------------------------- /code/chp7/test_samples.py: -------------------------------------------------------------------------------- 1 | import os 2 | from PIL import Image 3 | from csv import reader 4 | from chp7.register import ocr 5 | from chp7.image_processing import img_to_bw 6 | 7 | SAMPLES_DIR = os.path.join( 8 | os.path.dirname(os.path.realpath(__file__)), 9 | '..', '..', 'data', 'captcha_samples') 10 | 11 | 12 | def get_rdr(samples_folder=SAMPLES_DIR): 13 | return reader(open(os.path.join(samples_folder, 'samples.csv'))) 14 | 15 | 16 | def test_samples(samples_folder=SAMPLES_DIR): 17 | rdr = get_rdr(samples_folder=samples_folder) 18 | results = {'correct': 0, 'incorrect': 0} 19 | for fname, txt in rdr: 20 | img = 
Image.open(os.path.join(samples_folder, fname)) 21 | captcha = ocr(img) 22 | if captcha == txt: 23 | results['correct'] += 1 24 | else: 25 | results['incorrect'] += 1 26 | print('accuracy: {}%'.format(results['correct'] / 100.0)) 27 | print('results: ', results) 28 | return results 29 | 30 | if __name__ == '__main__': 31 | test_samples() 32 | -------------------------------------------------------------------------------- /code/chp5/pyqt_search.py: -------------------------------------------------------------------------------- 1 | try: 2 | from PySide.QtGui import * 3 | from PySide.QtCore import * 4 | from PySide.QtWebKit import * 5 | except ImportError: 6 | from PyQt4.QtGui import * 7 | from PyQt4.QtCore import * 8 | from PyQt4.QtWebKit import * 9 | 10 | 11 | app = QApplication([]) 12 | webview = QWebView() 13 | loop = QEventLoop() 14 | webview.loadFinished.connect(loop.quit) 15 | webview.load(QUrl('http://example.webscraping.com/search')) 16 | loop.exec_() 17 | webview.show() 18 | frame = webview.page().mainFrame() 19 | frame.findFirstElement('#search_term').setAttribute('value', '.') 20 | frame.findFirstElement('#page_size option:checked').setPlainText('1000') 21 | frame.findFirstElement('#search').evaluateJavaScript('this.click()') 22 | # app.exec_() ## Uncomment and this will become a blocking event 23 | 24 | elements = None 25 | while not elements: 26 | app.processEvents() 27 | elements = frame.findAllElements('#results a') 28 | 29 | 30 | countries = [e.toPlainText().strip() for e in elements] 31 | print(countries) 32 | -------------------------------------------------------------------------------- /code/chp6/login.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from lxml.html import fromstring 3 | 4 | 5 | LOGIN_URL = 'http://example.webscraping.com/user/login' 6 | LOGIN_EMAIL = 'example@webscraping.com' 7 | LOGIN_PASSWORD = 'example' 8 | 9 | 10 | def parse_form(html): 11 | tree = fromstring(html) 12 | data = {} 13 | for e in tree.cssselect('form input'): 14 | if e.get('name'): 15 | data[e.get('name')] = e.get('value') 16 | return data 17 | 18 | 19 | def login(session=None): 20 | """ Login to example website. 
21 | params: 22 | session: request lib session object or None 23 | returns tuple(response, session) 24 | """ 25 | if session is None: 26 | html = requests.get(LOGIN_URL) 27 | else: 28 | html = session.get(LOGIN_URL) 29 | data = parse_form(html.content) 30 | data['email'] = LOGIN_EMAIL 31 | data['password'] = LOGIN_PASSWORD 32 | if session is None: 33 | response = requests.post(LOGIN_URL, data, cookies=html.cookies) 34 | else: 35 | response = session.post(LOGIN_URL, data) 36 | assert 'login' not in response.url 37 | return response, session 38 | -------------------------------------------------------------------------------- /code/chp1/sitemap_crawler.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | import re 3 | 4 | from urllib.error import URLError, HTTPError, ContentTooShortError 5 | 6 | 7 | def download(url, num_retries=2, user_agent='wswp', charset='utf-8'): 8 | print('Downloading:', url) 9 | request = urllib.request.Request(url) 10 | request.add_header('User-agent', user_agent) 11 | try: 12 | resp = urllib.request.urlopen(request) 13 | cs = resp.headers.get_content_charset() 14 | if not cs: 15 | cs = charset 16 | html = resp.read().decode(cs) 17 | except (URLError, HTTPError, ContentTooShortError) as e: 18 | print('Download error:', e.reason) 19 | html = None 20 | if num_retries > 0: 21 | if hasattr(e, 'code') and 500 <= e.code < 600: 22 | # recursively retry 5xx HTTP errors 23 | return download(url, num_retries - 1) 24 | return html 25 | 26 | 27 | def crawl_sitemap(url): 28 | # download the sitemap file 29 | sitemap = download(url) 30 | # extract the sitemap links 31 | links = re.findall('<loc>(.*?)</loc>', sitemap) 32 | # download each link 33 | for link in links: 34 | html = download(link) 35 | # scrape html here 36 | -------------------------------------------------------------------------------- /code/chp1/id_iteration_crawler.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import urllib.request 3 | from urllib.error import URLError, HTTPError, ContentTooShortError 4 | 5 | 6 | def download(url, num_retries=2, user_agent='wswp', charset='utf-8'): 7 | print('Downloading:', url) 8 | request = urllib.request.Request(url) 9 | request.add_header('User-agent', user_agent) 10 | try: 11 | resp = urllib.request.urlopen(request) 12 | cs = resp.headers.get_content_charset() 13 | if not cs: 14 | cs = charset 15 | html = resp.read().decode(cs) 16 | except (URLError, HTTPError, ContentTooShortError) as e: 17 | print('Download error:', e.reason) 18 | html = None 19 | if num_retries > 0: 20 | if hasattr(e, 'code') and 500 <= e.code < 600: 21 | # recursively retry 5xx HTTP errors 22 | return download(url, num_retries - 1) 23 | return html 24 | 25 | 26 | def crawl_site(url, max_errors=5): 27 | num_errors = 0 28 | for page in itertools.count(1): 29 | pg_url = '{}{}'.format(url, page) 30 | html = download(pg_url) 31 | if html is None: 32 | num_errors += 1 33 | if num_errors == max_errors: 34 | # reached max number of errors, so exit 35 | break 36 | else: 37 | num_errors = 0 38 | # success - can scrape the result 39 | -------------------------------------------------------------------------------- /code/chp7/register_with_ocr.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import string 3 | import pytesseract 4 | from lxml.html import fromstring 5 | from chp6.login import parse_form 6 | from chp7.image_processing import
get_captcha_img, img_to_bw 7 | 8 | REGISTER_URL = 'http://example.webscraping.com/user/register' 9 | 10 | 11 | def register(first_name, last_name, email, password): 12 | session = requests.Session() 13 | html = session.get(REGISTER_URL) 14 | form = parse_form(html.content) 15 | form['first_name'] = first_name 16 | form['last_name'] = last_name 17 | form['email'] = email 18 | form['password'] = form['password_two'] = password 19 | img = get_captcha_img(html.content) 20 | captcha = ocr(img) 21 | form['recaptcha_response_field'] = captcha 22 | resp = session.post(html.url, form) 23 | success = '/user/register' not in resp.url 24 | if not success: 25 | form_errors = fromstring(resp.content).cssselect('div.error') 26 | print('Form Errors:') 27 | print('\n'.join( 28 | (' {}: {}'.format(f.get('id'), f.text) for f in form_errors))) 29 | return success 30 | 31 | 32 | def ocr(img): 33 | bw = img_to_bw(img) 34 | captcha = pytesseract.image_to_string(bw) 35 | cleaned = ''.join(c for c in captcha.lower() if c in string.ascii_lowercase) 36 | if len(cleaned) != len(captcha): 37 | print('removed bad characters: {}'.format(set(captcha) - set(cleaned))) 38 | return cleaned 39 | -------------------------------------------------------------------------------- /code/chp7/register_with_api.py: -------------------------------------------------------------------------------- 1 | from configparser import ConfigParser 2 | import requests 3 | from lxml.html import fromstring 4 | from chp6.login import parse_form 5 | from chp7.image_processing import get_captcha_img 6 | from chp7.captcha_api import CaptchaAPI 7 | 8 | REGISTER_URL = 'http://example.webscraping.com/user/register' 9 | 10 | 11 | def get_api_key(): 12 | config = ConfigParser() 13 | config.read('../config/api.cfg') 14 | return config.get('captcha_api', 'key') 15 | 16 | 17 | def register(first_name, last_name, email, password): 18 | session = requests.Session() 19 | html = session.get(REGISTER_URL) 20 | form = parse_form(html.content) 21 | form['first_name'] = first_name 22 | form['last_name'] = last_name 23 | form['email'] = email 24 | form['password'] = form['password_two'] = password 25 | api_key = get_api_key() 26 | img = get_captcha_img(html.content) 27 | api = CaptchaAPI(api_key) 28 | captcha_id, captcha = api.solve(img) 29 | form['recaptcha_response_field'] = captcha 30 | resp = session.post(html.url, form) 31 | success = '/user/register' not in resp.url 32 | if success: 33 | api.report(captcha_id, 1) 34 | else: 35 | form_errors = fromstring(resp.content).cssselect('div.error') 36 | print('Form Errors:') 37 | print('\n'.join( 38 | (' {}: {}'.format(f.get('id'), f.text) for f in form_errors))) 39 | if 'invalid' in [f.text for f in form_errors]: 40 | api.report(captcha_id, 0) 41 | return success 42 | -------------------------------------------------------------------------------- /code/chp6/firefox_sessions.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import glob 4 | import requests 5 | 6 | from lxml.html import fromstring 7 | 8 | 9 | def find_ff_sessions(): 10 | paths = [ 11 | '~/.mozilla/firefox/*.default', 12 | '~/Library/Application Support/Firefox/Profiles/*.default', 13 | '%APPDATA%/Roaming/Mozilla/Firefox/Profiles/*.default' 14 | ] 15 | for path in paths: 16 | filename = os.path.join(path, 'sessionstore.js') 17 | matches = glob.glob(os.path.expanduser(filename)) 18 | if matches: 19 | return matches[0] 20 | 21 | 22 | def load_ff_sessions(session_filename): 23 | cookies = {} 
24 | if os.path.exists(session_filename): 25 | json_data = json.loads(open(session_filename, 'rb').read()) 26 | for window in json_data.get('windows', []): 27 | for cookie in window.get('cookies', []): 28 | cookies[cookie.get('name')] = cookie.get('value') 29 | else: 30 | print('Session filename does not exist:', session_filename) 31 | return cookies 32 | 33 | 34 | def session_login(): 35 | session_filename = find_ff_sessions() 36 | assert session_filename is not None 37 | cookies = load_ff_sessions(session_filename) 38 | print('found cookies: ', cookies) 39 | url = 'http://example.webscraping.com' 40 | html = requests.get(url, cookies=cookies) 41 | tree = fromstring(html.content) 42 | print(tree.cssselect('ul#navbar li a')[0].text_content()) 43 | return html 44 | 45 | 46 | if __name__ == '__main__': 47 | session_login() 48 | -------------------------------------------------------------------------------- /code/chp2/all_scrapers.py: -------------------------------------------------------------------------------- 1 | import re 2 | from bs4 import BeautifulSoup 3 | from lxml.html import fromstring 4 | 5 | FIELDS = ('area', 'population', 'iso', 'country', 'capital', 6 | 'continent', 'tld', 'currency_code', 'currency_name', 7 | 'phone', 'postal_code_format', 'postal_code_regex', 8 | 'languages', 'neighbours') 9 | 10 | 11 | def re_scraper(html): 12 | """ Using regex to extract data from country pages. """ 13 | results = {} 14 | for field in FIELDS: 15 | results[field] = re.search( 16 | '<tr id="places_%s__row">.*?<td class="w2p_fw">(.*?)</td>' 17 | % field, html).groups()[0] 18 | return results 19 | 20 | 21 | def bs_scraper(html): 22 | """ Using beautifulsoup to extract data from country pages. """ 23 | soup = BeautifulSoup(html, 'html.parser') 24 | results = {} 25 | for field in FIELDS: 26 | results[field] = soup.find('table').find( 27 | 'tr', id='places_%s__row' % field).find( 28 | 'td', class_='w2p_fw').text 29 | return results 30 | 31 | 32 | def lxml_scraper(html): 33 | """ Using lxml and cssselect to extract data from country pages. """ 34 | tree = fromstring(html) 35 | results = {} 36 | for field in FIELDS: 37 | results[field] = tree.cssselect( 38 | 'table > tr#places_%s__row > td.w2p_fw' % field)[0].text_content() 39 | return results 40 | 41 | 42 | def lxml_xpath_scraper(html): 43 | """ Using lxml and xpath to extract data from country pages. """ 44 | tree = fromstring(html) 45 | results = {} 46 | for field in FIELDS: 47 | results[field] = tree.xpath( 48 | '//tr[@id="places_%s__row"]/td[@class="w2p_fw"]' % field)[0].text_content() 49 | return results 50 | -------------------------------------------------------------------------------- /code/chp3/rediscache.py: -------------------------------------------------------------------------------- 1 | import json 2 | import zlib 3 | from datetime import datetime, timedelta 4 | from redis import StrictRedis 5 | 6 | 7 | class RedisCache: 8 | """ RedisCache helps store urls and their responses to Redis 9 | Initialization components: 10 | client: a Redis client connected to the key-value database for 11 | the webcrawling cache (if not set, a localhost:6379 12 | default connection is used).
13 | expires (datetime.timedelta): timedelta when content will expire 14 | (default: 30 days ago) 15 | encoding (str): character encoding for serialization 16 | compress (bool): boolean indicating whether compression with zlib should be used 17 | """ 18 | def __init__(self, client=None, expires=timedelta(days=30), encoding='utf-8', compress=True): 19 | self.client = (StrictRedis(host='localhost', port=6379, db=0) 20 | if client is None else client) 21 | self.expires = expires 22 | self.encoding = encoding 23 | self.compress = compress 24 | 25 | def __getitem__(self, url): 26 | """Load data from Redis for given URL""" 27 | record = self.client.get(url) 28 | if record: 29 | if self.compress: 30 | record = zlib.decompress(record) 31 | return json.loads(record.decode(self.encoding)) 32 | else: 33 | # URL has not yet been cached 34 | raise KeyError(url + ' does not exist') 35 | 36 | def __setitem__(self, url, result): 37 | """Save data to Redis for given url""" 38 | data = bytes(json.dumps(result), self.encoding) 39 | if self.compress: 40 | data = zlib.compress(data) 41 | self.client.setex(url, self.expires, data) 42 | -------------------------------------------------------------------------------- /code/chp6/selenium_forms.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.common.keys import Keys 3 | from selenium.webdriver.common.by import By 4 | from selenium.webdriver.support.ui import WebDriverWait 5 | from selenium.webdriver.support import expected_conditions as EC 6 | 7 | 8 | LOGIN_URL = 'http://example.webscraping.com/user/login' 9 | LOGIN_EMAIL = 'example@webscraping.com' 10 | LOGIN_PASSWORD = 'example' 11 | COUNTRY_URL = 'http://example.webscraping.com/places/default/edit/United-Kingdom-239' 12 | 13 | 14 | def get_driver(): 15 | try: 16 | return webdriver.PhantomJS() 17 | except Exception: 18 | return webdriver.Firefox() 19 | 20 | 21 | def login(driver): 22 | driver.get(LOGIN_URL) 23 | driver.find_element_by_id('auth_user_email').send_keys(LOGIN_EMAIL) 24 | driver.find_element_by_id('auth_user_password').send_keys( 25 | LOGIN_PASSWORD + Keys.RETURN) 26 | pg_loaded = WebDriverWait(driver, 10).until( 27 | EC.presence_of_element_located((By.ID, "results"))) 28 | assert 'login' not in driver.current_url 29 | 30 | 31 | def add_population(driver): 32 | driver.get(COUNTRY_URL) 33 | population = driver.find_element_by_id('places_population') 34 | new_population = int(population.get_attribute('value')) + 1 35 | population.clear() 36 | population.send_keys(new_population) 37 | driver.find_element_by_xpath('//input[@type="submit"]').click() 38 | pg_loaded = WebDriverWait(driver, 10).until( 39 | EC.presence_of_element_located((By.ID, "places_population__row"))) 40 | test_population = int(driver.find_element_by_css_selector( 41 | '#places_population__row .w2p_fw').text.replace(',', '')) 42 | assert test_population == new_population 43 | 44 | 45 | if __name__ == '__main__': 46 | driver = get_driver() 47 | login(driver) 48 | add_population(driver) 49 | driver.quit() 50 | -------------------------------------------------------------------------------- /code/chp6/edit.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | from lxml.html import fromstring 4 | from chp6.login import login, parse_form 5 | 6 | COUNTRY_URL = 'http://example.webscraping.com/places/default/edit/United-Kingdom-239' 7 | VIEW_URL = 
'http://example.webscraping.com/places/default/view/United-Kingdom-239' 8 | 9 | 10 | def get_population(): 11 | html = requests.get(VIEW_URL) 12 | tree = fromstring(html.content) 13 | population = tree.cssselect( 14 | '#places_population__row .w2p_fw')[0].text_content() 15 | return int(population.replace(',', '')) 16 | 17 | 18 | def add_population(): 19 | session = requests.Session() 20 | response, session = login(session=session) 21 | country_html = session.get(COUNTRY_URL) 22 | data = parse_form(country_html.content) 23 | print('population is: ', data['population']) 24 | data['population'] = int(data['population']) + 1 25 | response = session.post(COUNTRY_URL, data=data) 26 | test_population = get_population() 27 | print('population is now:', test_population) 28 | assert test_population == data['population'] 29 | 30 | 31 | def get_currency(): 32 | html = requests.get(VIEW_URL) 33 | tree = fromstring(html.content) 34 | currency = tree.cssselect( 35 | '#places_currency_name__row .w2p_fw')[0].text_content() 36 | return currency 37 | 38 | 39 | def change_currency(): 40 | session = requests.Session() 41 | response, session = login(session=session) 42 | country_html = session.get(COUNTRY_URL) 43 | data = parse_form(country_html.content) 44 | print('currency is: ', data['currency_name']) 45 | data['currency_name'] = 'British pounds' 46 | response = session.post(COUNTRY_URL, data=data) 47 | test_currency = get_currency() 48 | print('currency is now: ', test_currency) 49 | assert test_currency == data['currency_name'] 50 | 51 | 52 | if __name__ == '__main__': 53 | add_population() 54 | -------------------------------------------------------------------------------- /code/chp1/link_crawler.py: -------------------------------------------------------------------------------- 1 | import re 2 | import urllib.request 3 | from urllib.parse import urljoin 4 | from urllib.error import URLError, HTTPError, ContentTooShortError 5 | 6 | 7 | def download(url, num_retries=2, user_agent='wswp', charset='utf-8'): 8 | print('Downloading:', url) 9 | request = urllib.request.Request(url) 10 | request.add_header('User-agent', user_agent) 11 | try: 12 | resp = urllib.request.urlopen(request) 13 | cs = resp.headers.get_content_charset() 14 | if not cs: 15 | cs = charset 16 | html = resp.read().decode(cs) 17 | except (URLError, HTTPError, ContentTooShortError) as e: 18 | print('Download error:', e.reason) 19 | html = None 20 | if num_retries > 0: 21 | if hasattr(e, 'code') and 500 <= e.code < 600: 22 | # recursively retry 5xx HTTP errors 23 | return download(url, num_retries - 1) 24 | return html 25 | 26 | 27 | def link_crawler(start_url, link_regex): 28 | " Crawl from the given start URL following links matched by link_regex " 29 | crawl_queue = [start_url] 30 | # keep track which URL's have seen before 31 | seen = set(crawl_queue) 32 | while crawl_queue: 33 | url = crawl_queue.pop() 34 | html = download(url) 35 | if not html: 36 | continue 37 | # filter for links matching our regular expression 38 | for link in get_links(html): 39 | if re.match(link_regex, link): 40 | abs_link = urljoin(start_url, link) 41 | if abs_link not in seen: 42 | seen.add(abs_link) 43 | crawl_queue.append(abs_link) 44 | 45 | 46 | def get_links(html): 47 | " Return a list of links from html " 48 | # a regular expression to extract all links from the webpage 49 | webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE) 50 | # list of all links from the webpage 51 | return webpage_regex.findall(html) 52 |
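A minimal usage sketch for the `link_crawler` above, assuming it is run from the `code` folder so that `chp1` is importable (see the README note on module imports later in this repo). The start URL and link pattern are illustrative assumptions about the example site's layout, not values taken from these scripts; because `re.match` anchors at the start of each extracted `href`, the pattern allows an arbitrary path prefix before `index` or `view`:

```
# Illustrative only: crawl the demo site, following index and view pages.
# The URL and regex below are assumptions for this example, not values
# defined anywhere in the scripts above.
from chp1.link_crawler import link_crawler

link_crawler('http://example.webscraping.com', '.*/(index|view)/')
```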
-------------------------------------------------------------------------------- /code/chp8/example/example/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ExampleSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | -------------------------------------------------------------------------------- /code/chp4/redis_queue.py: -------------------------------------------------------------------------------- 1 | # Based loosely on the Redis Cookbook FIFO Queue: http://www.rediscookbook.org/implement_a_fifo_queue.html 2 | from redis import StrictRedis 3 | 4 | 5 | class RedisQueue: 6 | """ RedisQueue helps store urls to crawl to Redis 7 | Initialization components: 8 | client: a Redis client connected to the key-value database for 9 | the webcrawling cache (if not set, a localhost:6379 10 | default connection is used). 
11 | db (int): which database to use for Redis 12 | queue_name (str): name for queue (default: wswp) 13 | """ 14 | 15 | def __init__(self, client=None, db=0, queue_name='wswp'): 16 | self.client = (StrictRedis(host='localhost', port=6379, db=db) 17 | if client is None else client) 18 | self.name = "queue:%s" % queue_name 19 | self.seen_set = "seen:%s" % queue_name 20 | self.depth = "depth:%s" % queue_name 21 | 22 | def __len__(self): 23 | return self.client.llen(self.name) 24 | 25 | def push(self, element): 26 | """Push an element to the tail of the queue""" 27 | if isinstance(element, list): 28 | element = [e for e in element if not self.already_seen(e)] 29 | self.client.lpush(self.name, *element) 30 | self.client.sadd(self.seen_set, *element) 31 | elif not self.already_seen(element): 32 | self.client.lpush(self.name, element) 33 | self.client.sadd(self.seen_set, element) 34 | 35 | def already_seen(self, element): 36 | """ determine if an element has already been seen """ 37 | return self.client.sismember(self.seen_set, element) 38 | 39 | def set_depth(self, element, depth): 40 | """ Set the seen hash and depth """ 41 | self.client.hset(self.depth, element, depth) 42 | 43 | def get_depth(self, element): 44 | """ Get the seen hash and depth """ 45 | return (lambda dep: int(dep) if dep else 0)(self.client.hget(self.depth, element)) 46 | 47 | def pop(self): 48 | """Pop an element from the head of the queue""" 49 | return self.client.rpop(self.name).decode('utf-8') 50 | -------------------------------------------------------------------------------- /code/chp7/using_captcha_api.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import base64 3 | from configparser import ConfigParser 4 | from time import sleep 5 | from lxml.html import fromstring 6 | from chp6.login import parse_form 7 | from chp7.image_processing import get_captcha_img, get_b64_string 8 | 9 | API_URL = 'https://www.9kw.eu/index.cgi' 10 | REGISTER_URL = 'http://example.webscraping.com/user/register' 11 | 12 | 13 | def get_api_key(): 14 | config = ConfigParser() 15 | config.read('../config/api.cfg') 16 | return config.get('captcha_api', 'key') 17 | 18 | 19 | def send_captcha(api_key, img_data): 20 | data = { 21 | 'action': 'usercaptchaupload', 22 | 'apikey': api_key, 23 | 'file-upload-01': img_data, 24 | 'base64': '1', 25 | 'selfsolve': '1', 26 | 'json': '1', 27 | 'maxtimeout': '300' 28 | } 29 | resp = requests.post(API_URL, data) 30 | return resp.json() 31 | 32 | 33 | def get_captcha_text(api_key, captcha_id): 34 | data = { 35 | 'action': 'usercaptchacorrectdata', 36 | 'id': captcha_id, 37 | 'apikey': api_key, 38 | 'json': '1', 39 | } 40 | resp = requests.get(API_URL, data) 41 | print('captcha text response:', resp.json()) 42 | answer = resp.json().get('answer') 43 | return answer 44 | 45 | 46 | def register(first_name, last_name, email, password): 47 | session = requests.Session() 48 | html = session.get(REGISTER_URL) 49 | form = parse_form(html.content) 50 | form['first_name'] = first_name 51 | form['last_name'] = last_name 52 | form['email'] = email 53 | form['password'] = form['password_two'] = password 54 | img_data = get_b64_string(html.content) 55 | img = get_captcha_img(html.content) 56 | img.show() # This will show the image locally when run 57 | api_key = get_api_key() 58 | captcha_id = send_captcha(api_key, img_data) 59 | print('submitted captcha, got id:', captcha_id) 60 | sleep(300) 61 | captcha = get_captcha_text(api_key, captcha_id) 62 | print('captcha 
solve:', captcha) 63 | form['recaptcha_response_field'] = captcha 64 | resp = session.post(html.url, form) 65 | success = '/user/register' not in resp.url 66 | if not success: 67 | form_errors = fromstring(resp.content).cssselect('div.error') 68 | print('Form Errors:') 69 | print('\n'.join( 70 | (' {}: {}'.format(f.get('id'), f.text) for f in form_errors))) 71 | return success 72 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Web Scraping with Python 2 | 3 | Welcome to the code repository for [Web Scraping with Python, Second Edition](https://www.packtpub.com/big-data-and-business-intelligence/python-web-scraping-second-edition)! I hope you find the code and data here useful. If you have any questions reach out to @kjam on Twitter or GitHub. 4 | 5 | ### Code Structure 6 | 7 | All of the code samples are in folders separated by chapter. Scripts are intended to be run from the `code` folder, allowing you to easily import from the chapters. 8 | 9 | ### Code Examples 10 | 11 | I have not included every code sample you've found in the book, but I have included a majority of the finished scripts. Although these are included, I encourage you to write out each code sample on your own and use these only as a reference. 12 | 13 | ### Firefox Issues 14 | 15 | Depending on your version of Firefox and Selenium, you may run into JavaScript errors. Here are some fixes: 16 | * Use an older version of Firefox 17 | * Upgrade Selenium to >=3.0.2 and download the [geckodriver](https://github.com/mozilla/geckodriver/releases). Make sure the geckodriver is findable by your PATH variable. You can do this by adding this line to your `.bashrc` or `.bash_profile`. (Wondering what these are? Please read the Appendix C on learning the command line). 18 | * Use [PhantomJS](http://phantomjs.org/) with Selenium (change your browser line to `webdriver.PhantomJS('path/to/your/phantomjs/installation')`) 19 | * Use Chrome, InternetExplorer or any other [supported browser](http://www.seleniumhq.org/about/platforms.jsp) 20 | 21 | Feel free to reach out if you have any questions! 22 | 23 | ### Issues with Module Import 24 | 25 | Seeing chp1 ModuleNotFound errors? Try adding this snippet to the file: 26 | 27 | ``` 28 | import os 29 | import sys 30 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))) 31 | ``` 32 | 33 | What this does is append the main module to your system path, which is where Python looks for imports. On some installations, I have noticed the current directory is not immediately added (common practice), so this code *explicitly* adds that directory to your path. 34 | 35 | 36 | ### Corrections? 37 | 38 | If you find any issues in these code examples, feel free to submit an Issue or Pull Request. I appreciate your input! 39 | 40 | 41 | ### First edition repository 42 | 43 | If you are looking for the first edition's repository, you can find it here: [Web Scraping with Python, First Edition](https://bitbucket.org/wswp/) 44 | 45 | ### Questions? 46 | 47 | Reach out to @kjam on Twitter or GitHub. @kjam is also often on freenode. 
:) 48 | -------------------------------------------------------------------------------- /data/captcha_samples/samples.csv: -------------------------------------------------------------------------------- 1 | sample1.png,watch 2 | sample2.png,clean 3 | sample3.png,forward 4 | sample4.png,secret 5 | sample5.png,square 6 | sample6.png,sweet 7 | sample7.png,flight 8 | sample8.png,number 9 | sample9.png,parcel 10 | sample10.png,linen 11 | sample11.png,attack 12 | sample12.png,comfort 13 | sample13.png,healthy 14 | sample14.png,woman 15 | sample15.png,between 16 | sample16.png,fruit 17 | sample17.png,office 18 | sample18.png,electric 19 | sample19.png,light 20 | sample20.png,reward 21 | sample21.png,powder 22 | sample22.png,damage 23 | sample23.png,thick 24 | sample24.png,tomorrow 25 | sample25.png,white 26 | sample26.png,together 27 | sample27.png,trick 28 | sample28.png,sister 29 | sample29.png,tongue 30 | sample30.png,because 31 | sample31.png,again 32 | sample32.png,tooth 33 | sample33.png,almost 34 | sample34.png,board 35 | sample35.png,stitch 36 | sample36.png,spoon 37 | sample37.png,paste 38 | sample38.png,memory 39 | sample39.png,guide 40 | sample40.png,electric 41 | sample41.png,regret 42 | sample42.png,harbor 43 | sample43.png,prose 44 | sample44.png,circle 45 | sample45.png,flight 46 | sample46.png,motion 47 | sample47.png,cause 48 | sample48.png,front 49 | sample49.png,question 50 | sample50.png,drawer 51 | sample51.png,present 52 | sample52.png,elastic 53 | sample53.png,laugh 54 | sample54.png,rhythm 55 | sample55.png,angle 56 | sample56.png,porter 57 | sample57.png,purpose 58 | sample58.png,event 59 | sample59.png,effect 60 | sample60.png,history 61 | sample61.png,tired 62 | sample62.png,animal 63 | sample63.png,steam 64 | sample64.png,normal 65 | sample65.png,scissors 66 | sample66.png,while 67 | sample67.png,print 68 | sample68.png,behavior 69 | sample69.png,impulse 70 | sample70.png,quiet 71 | sample71.png,level 72 | sample72.png,basin 73 | sample73.png,every 74 | sample74.png,peace 75 | sample75.png,right 76 | sample76.png,month 77 | sample77.png,science 78 | sample78.png,river 79 | sample79.png,frame 80 | sample80.png,stocking 81 | sample81.png,pencil 82 | sample82.png,table 83 | sample83.png,common 84 | sample84.png,store 85 | sample85.png,ornament 86 | sample86.png,belief 87 | sample87.png,across 88 | sample88.png,history 89 | sample89.png,harmony 90 | sample90.png,young 91 | sample91.png,summer 92 | sample92.png,yellow 93 | sample93.png,medical 94 | sample94.png,current 95 | sample95.png,amount 96 | sample96.png,skirt 97 | sample97.png,serious 98 | sample98.png,paper 99 | sample99.png,round 100 | sample100.png,stamp 101 | -------------------------------------------------------------------------------- /code/chp5/browser_render.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import re 4 | import csv 5 | import time 6 | try: 7 | from PySide.QtGui import QApplication 8 | from PySide.QtCore import QUrl, QEventLoop, QTimer 9 | from PySide.QtWebKit import QWebView 10 | except ImportError: 11 | from PyQt4.QtGui import QApplication 12 | from PyQt4.QtCore import QUrl, QEventLoop, QTimer 13 | from PyQt4.QtWebKit import QWebView 14 | import lxml.html 15 | 16 | 17 | class BrowserRender(QWebView): 18 | def __init__(self, display=True): 19 | self.app = QApplication([]) 20 | QWebView.__init__(self) 21 | if display: 22 | self.show() # show the browser 23 | 24 | def open(self, url, timeout=60): 25 | """Wait for 
download to complete and return result""" 26 | loop = QEventLoop() 27 | timer = QTimer() 28 | timer.setSingleShot(True) 29 | timer.timeout.connect(loop.quit) 30 | self.loadFinished.connect(loop.quit) 31 | self.load(QUrl(url)) 32 | timer.start(timeout * 1000) 33 | loop.exec_() # delay here until download finished 34 | if timer.isActive(): 35 | # downloaded successfully 36 | timer.stop() 37 | return self.html() 38 | else: 39 | # timed out 40 | print('Request timed out:', url) 41 | 42 | def html(self): 43 | """Shortcut to return the current HTML""" 44 | return self.page().mainFrame().toHtml() 45 | 46 | def find(self, pattern): 47 | """Find all elements that match the pattern""" 48 | return self.page().mainFrame().findAllElements(pattern) 49 | 50 | def attr(self, pattern, name, value): 51 | """Set attribute for matching elements""" 52 | for e in self.find(pattern): 53 | e.setAttribute(name, value) 54 | 55 | def text(self, pattern, value): 56 | """Set text for matching elements""" 57 | for e in self.find(pattern): 58 | e.setPlainText(value) 59 | 60 | def click(self, pattern): 61 | """Click matching elements""" 62 | for e in self.find(pattern): 63 | e.evaluateJavaScript("this.click()") 64 | 65 | def wait_load(self, pattern, timeout=60): 66 | """Wait for this pattern to be found in webpage and return matches""" 67 | deadline = time.time() + timeout 68 | while time.time() < deadline: 69 | self.app.processEvents() 70 | matches = self.find(pattern) 71 | if matches: 72 | return matches 73 | print('Wait load timed out') 74 | 75 | 76 | def main(): 77 | br = BrowserRender() 78 | br.open('http://example.webscraping.com/search') 79 | br.attr('#search_term', 'value', '.') 80 | br.text('#page_size option:checked', '1000') 81 | br.click('#search') 82 | 83 | elements = br.wait_load('#results a') 84 | writer = csv.writer(open('countries.csv', 'w')) 85 | for country in [e.toPlainText().strip() for e in elements]: 86 | writer.writerow([country]) 87 | 88 | 89 | if __name__ == '__main__': 90 | main() 91 | -------------------------------------------------------------------------------- /code/chp7/captcha_api.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import re 3 | import time 4 | import requests 5 | from io import BytesIO 6 | 7 | 8 | class CaptchaAPI: 9 | def __init__(self, api_key, timeout=120): 10 | self.api_key = api_key 11 | self.timeout = timeout 12 | self.url = 'https://www.9kw.eu/index.cgi' 13 | 14 | def solve(self, img): 15 | """Submit CAPTCHA and return result when ready 16 | """ 17 | img_buffer = BytesIO() 18 | img.save(img_buffer, format="PNG") 19 | img_data = img_buffer.getvalue() 20 | captcha_id = self.send(img_data) 21 | start_time = time.time() 22 | while time.time() < start_time + self.timeout: 23 | try: 24 | resp = self.get(captcha_id) 25 | except CaptchaError: 26 | pass # CAPTCHA still not ready 27 | else: 28 | if resp.get('answer') != 'NO DATA': 29 | if resp.get('answer') == 'ERROR NO USER': 30 | raise CaptchaError( 31 | 'Error: no user available to solve CAPTCHA') 32 | else: 33 | print('CAPTCHA solved!') 34 | return captcha_id, resp.get('answer') 35 | print('Waiting for CAPTCHA ...') 36 | time.sleep(1) 37 | 38 | raise CaptchaError('Error: API timeout') 39 | 40 | def send(self, img_data): 41 | """Send CAPTCHA for solving """ 42 | print('Submitting CAPTCHA') 43 | data = { 44 | 'action': 'usercaptchaupload', 45 | 'apikey': self.api_key, 46 | 'file-upload-01': base64.b64encode(img_data), 47 | 'base64': '1', 48 | 'selfsolve': '1', 49
| 'json': '1', 50 | 'maxtimeout': str(self.timeout) 51 | } 52 | result = requests.post(self.url, data) 53 | self.check(result.text) 54 | return result.json() 55 | 56 | def get(self, captcha_id): 57 | """Get result of solved CAPTCHA""" 58 | data = { 59 | 'action': 'usercaptchacorrectdata', 60 | 'id': captcha_id, 61 | 'apikey': self.api_key, 62 | 'info': '1', 63 | 'json': '1', 64 | } 65 | result = requests.get(self.url, data) 66 | self.check(result.text) 67 | return result.json() 68 | 69 | def check(self, result): 70 | """Check result of API and raise error if error code""" 71 | if re.match(r'00\d\d \w+', result): 72 | raise CaptchaError('API error: ' + result) 73 | 74 | def report(self, captcha_id, correct): 75 | """ Report back whether captcha was correct or not""" 76 | data = { 77 | 'action': 'usercaptchacorrectback', 78 | 'id': captcha_id, 79 | 'apikey': self.api_key, 80 | 'correct': (lambda c: 1 if c else 2)(correct), 81 | 'json': '1', 82 | } 83 | resp = requests.get(self.url, data) 84 | return resp.json() 85 | 86 | 87 | class CaptchaError(Exception): 88 | pass 89 | -------------------------------------------------------------------------------- /code/chp3/requests_cache_link_crawler.py: -------------------------------------------------------------------------------- 1 | import re 2 | from urllib import robotparser 3 | from urllib.parse import urljoin 4 | from datetime import timedelta 5 | from chp3.downloader_requests_cache import Downloader 6 | 7 | import requests_cache 8 | 9 | 10 | def get_robots_parser(robots_url): 11 | " Return the robots parser object using the robots_url " 12 | rp = robotparser.RobotFileParser() 13 | rp.set_url(robots_url) 14 | rp.read() 15 | return rp 16 | 17 | 18 | def get_links(html): 19 | " Return a list of links (using simple regex matching) from the html content " 20 | # a regular expression to extract all links from the webpage 21 | webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE) 22 | # list of all links from the webpage 23 | return webpage_regex.findall(html) 24 | 25 | 26 | def link_crawler(start_url, link_regex, robots_url=None, user_agent='wswp', 27 | proxies=None, delay=3, max_depth=4, num_retries=2, expires=timedelta(days=30)): 28 | """ Crawl from the given start URL following links matched by link_regex. In the current 29 | implementation, we do not actually scrape any information.
30 | 31 | args: 32 | start_url (str): web site to start crawl 33 | link_regex (str): regex to match for links 34 | kwargs: 35 | robots_url (str): url of the site's robots.txt (default: start_url + /robots.txt) 36 | user_agent (str): user agent (default: wswp) 37 | proxies (list of dicts): a list of possible dicts for http / https proxies 38 | For formatting, see the requests library 39 | delay (int): seconds to throttle between requests to one domain (default: 3) 40 | max_depth (int): maximum crawl depth (to avoid traps) (default: 4) 41 | num_retries (int): # of retries when 5xx error (default: 2) 42 | expires (timedelta): timedelta for cache expirations (default: 30 days) 43 | """ 44 | crawl_queue = [start_url] 45 | # keep track which URL's have seen before 46 | seen = {} 47 | requests_cache.install_cache(backend='redis', expire_after=expires) 48 | if not robots_url: 49 | robots_url = '{}/robots.txt'.format(start_url) 50 | rp = get_robots_parser(robots_url) 51 | D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies) 52 | while crawl_queue: 53 | url = crawl_queue.pop() 54 | # check url passes robots.txt restrictions 55 | if rp.can_fetch(user_agent, url): 56 | depth = seen.get(url, 0) 57 | if depth == max_depth: 58 | print('Skipping %s due to depth' % url) 59 | continue 60 | html = D(url, num_retries=num_retries) 61 | if not html: 62 | continue 63 | # TODO: add actual data scraping here 64 | # filter for links matching our regular expression 65 | for link in get_links(html): 66 | if re.match(link_regex, link): 67 | abs_link = urljoin(start_url, link) 68 | if abs_link not in seen: 69 | seen[abs_link] = depth + 1 70 | crawl_queue.append(abs_link) 71 | else: 72 | print('Blocked by robots.txt:', url) 73 | -------------------------------------------------------------------------------- /code/chp3/advanced_link_crawler.py: -------------------------------------------------------------------------------- 1 | import re 2 | from urllib import robotparser 3 | from urllib.parse import urljoin 4 | from chp3.downloader import Downloader 5 | 6 | 7 | def get_robots_parser(robots_url): 8 | " Return the robots parser object using the robots_url " 9 | rp = robotparser.RobotFileParser() 10 | rp.set_url(robots_url) 11 | rp.read() 12 | return rp 13 | 14 | 15 | def get_links(html): 16 | " Return a list of links (using simple regex matching) from the html content " 17 | # a regular expression to extract all links from the webpage 18 | webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE) 19 | # list of all links from the webpage 20 | return webpage_regex.findall(html) 21 | 22 | 23 | def link_crawler(start_url, link_regex, robots_url=None, user_agent='wswp', 24 | proxies=None, delay=3, max_depth=4, num_retries=2, cache={}, scraper_callback=None): 25 | """ Crawl from the given start URL following links matched by link_regex. In the current 26 | implementation, we do not actually scrape any information.
27 | 28 | args: 29 | start_url (str): web site to start crawl 30 | link_regex (str): regex to match for links 31 | kwargs: 32 | robots_url (str): url of the site's robots.txt (default: start_url + /robots.txt) 33 | user_agent (str): user agent (default: wswp) 34 | proxies (list of dicts): a list of possible dicts for http / https proxies 35 | For formatting, see the requests library 36 | delay (int): seconds to throttle between requests to one domain (default: 3) 37 | max_depth (int): maximum crawl depth (to avoid traps) (default: 4) 38 | num_retries (int): # of retries when 5xx error (default: 2) 39 | cache (dict): cache dict with urls as keys and dicts for responses (default: {}) 40 | scraper_callback: function to be called on url and html content 41 | """ 42 | crawl_queue = [start_url] 43 | # keep track which URL's have seen before 44 | seen = {} 45 | if not robots_url: 46 | robots_url = '{}/robots.txt'.format(start_url) 47 | rp = get_robots_parser(robots_url) 48 | D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies, cache=cache) 49 | while crawl_queue: 50 | url = crawl_queue.pop() 51 | # check url passes robots.txt restrictions 52 | if rp.can_fetch(user_agent, url): 53 | depth = seen.get(url, 0) 54 | if depth == max_depth: 55 | print('Skipping %s due to depth' % url) 56 | continue 57 | html = D(url, num_retries=num_retries) 58 | if not html: 59 | continue 60 | if scraper_callback: 61 | links = scraper_callback(url, html) or [] 62 | else: 63 | links = [] 64 | # filter for links matching our regular expression 65 | for link in get_links(html) + links: 66 | if re.match(link_regex, link): 67 | abs_link = urljoin(start_url, link) 68 | if abs_link not in seen: 69 | seen[abs_link] = depth + 1 70 | crawl_queue.append(abs_link) 71 | else: 72 | print('Blocked by robots.txt:', url) 73 | -------------------------------------------------------------------------------- /code/chp3/downloader.py: -------------------------------------------------------------------------------- 1 | from random import choice 2 | import requests 3 | 4 | from chp1.throttle import Throttle 5 | 6 | 7 | class Downloader: 8 | """ Downloader class to use cache and requests for downloading pages. 
9 | For contructor, pass: 10 | delay (int): # of secs delay between requests (default: 5) 11 | user_agent (str): user agent string (default: 'wswp') 12 | proxies (list[dict]): list of possible proxies, each 13 | must be a dict with http / https keys and proxy values 14 | cache (dict or dict-like obj): keys: urls, values: dicts with keys (html, code) 15 | timeout (float/int): number of seconds to wait until timeout 16 | """ 17 | def __init__(self, delay=5, user_agent='wswp', proxies=None, cache={}, 18 | timeout=60): 19 | self.throttle = Throttle(delay) 20 | self.user_agent = user_agent 21 | self.proxies = proxies 22 | self.cache = cache 23 | self.num_retries = None # we will set this per request 24 | self.timeout = timeout 25 | 26 | def __call__(self, url, num_retries=2): 27 | """ Call the downloader class, which will return HTML from cache 28 | or download it 29 | args: 30 | url (str): url to download 31 | kwargs: 32 | num_retries (int): # times to retry if 5xx code (default: 2) 33 | """ 34 | self.num_retries = num_retries 35 | try: 36 | result = self.cache[url] 37 | print('Loaded from cache:', url) 38 | except KeyError: 39 | result = None 40 | if result and self.num_retries and 500 <= result['code'] < 600: 41 | # server error so ignore result from cache 42 | # and re-download 43 | result = None 44 | if result is None: 45 | # result was not loaded from cache, need to download 46 | self.throttle.wait(url) 47 | proxies = choice(self.proxies) if self.proxies else None 48 | headers = {'User-Agent': self.user_agent} 49 | result = self.download(url, headers, proxies) 50 | self.cache[url] = result 51 | return result['html'] 52 | 53 | def download(self, url, headers, proxies): 54 | """ Download a and return the page content 55 | args: 56 | url (str): URL 57 | headers (dict): dict of headers (like user_agent) 58 | proxies (dict): proxy dict w/ keys 'http'/'https', values 59 | are strs (i.e. 'http(s)://IP') (default: None) 60 | """ 61 | print('Downloading:', url) 62 | try: 63 | resp = requests.get(url, headers=headers, proxies=proxies, 64 | timeout=self.timeout) 65 | html = resp.text 66 | if resp.status_code >= 400: 67 | print('Download error:', resp.text) 68 | html = None 69 | if self.num_retries and 500 <= resp.status_code < 600: 70 | # recursively retry 5xx HTTP errors 71 | self.num_retries -= 1 72 | return self.download(url, headers, proxies) 73 | except requests.exceptions.RequestException as e: 74 | print('Download error:', e) 75 | return {'html': None, 'code': 500} 76 | return {'html': html, 'code': resp.status_code} 77 | -------------------------------------------------------------------------------- /code/chp8/example/example/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for example project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'example' 13 | 14 | SPIDER_MODULES = ['example.spiders'] 15 | NEWSPIDER_MODULE = 'example.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'example (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = True 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | DOWNLOAD_DELAY = 5 31 | # The download delay setting will honor only one of: 32 | CONCURRENT_REQUESTS_PER_DOMAIN = 1 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'example.middlewares.ExampleSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'example.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'example.pipelines.ExamplePipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /code/chp3/downloader_requests_cache.py: 
-------------------------------------------------------------------------------- 1 | from random import choice 2 | import requests 3 | import requests_cache 4 | 5 | from chp1.throttle import Throttle 6 | 7 | 8 | class Downloader: 9 | """ Downloader class to use cache and requests for downloading pages. 10 | For contructor, pass: 11 | delay (int): # of secs delay between requests (default: 5) 12 | user_agent (str): user agent string (default: 'wswp') 13 | proxies (list[dict]): list of possible proxies, each 14 | must be a dict with http / https keys and proxy values 15 | timeout (float/int): number of seconds to wait until timeout 16 | """ 17 | def __init__(self, delay=5, user_agent='wswp', proxies=None, 18 | timeout=60): 19 | self.throttle = Throttle(delay) 20 | self.user_agent = user_agent 21 | self.proxies = proxies 22 | self.num_retries = None # we will set this per request 23 | self.timeout = timeout 24 | 25 | def __call__(self, url, num_retries=2): 26 | """ Call the downloader class, which will return HTML from cache 27 | or download it 28 | args: 29 | url (str): url to download 30 | kwargs: 31 | num_retries (int): # times to retry if 5xx code (default: 2) 32 | """ 33 | self.num_retries = num_retries 34 | proxies = choice(self.proxies) if self.proxies else None 35 | headers = {'User-Agent': self.user_agent} 36 | result = self.download(url, headers, proxies) 37 | return result['html'] 38 | 39 | def make_throttle_hook(self, throttle=None): 40 | """ 41 | Modified from: https://requests-cache.readthedocs.io/en/latest/user_guide.html 42 | Returns a response hook function which sleeps for `timeout` seconds if 43 | response is not cached 44 | """ 45 | def hook(response, *args, **kwargs): 46 | """ see requests hook documentation for more information""" 47 | if not getattr(response, 'from_cache', False): 48 | throttle.wait(response.url) 49 | print('Downloading:', response.url) 50 | else: 51 | print('Returning from cache:', response.url) 52 | return response 53 | return hook 54 | 55 | def download(self, url, headers, proxies): 56 | """ Download a and return the page content 57 | args: 58 | url (str): URL 59 | headers (dict): dict of headers (like user_agent) 60 | proxies (dict): proxy dict w/ keys 'http'/'https', values 61 | are strs (i.e. 
'http(s)://IP') (default: None) 62 | """ 63 | session = requests_cache.CachedSession() 64 | session.hooks = {'response': self.make_throttle_hook(self.throttle)} 65 | 66 | try: 67 | resp = session.get(url, headers=headers, proxies=proxies, 68 | timeout=self.timeout) 69 | html = resp.text 70 | if resp.status_code >= 400: 71 | print('Download error:', resp.text) 72 | html = None 73 | if self.num_retries and 500 <= resp.status_code < 600: 74 | # recursively retry 5xx HTTP errors 75 | self.num_retries -= 1 76 | return self.download(url, headers, proxies) 77 | except requests.exceptions.RequestException as e: 78 | print('Download error:', e) 79 | return {'html': None, 'code': 500} 80 | return {'html': html, 'code': resp.status_code} 81 | -------------------------------------------------------------------------------- /code/chp3/diskcache.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import re 4 | import zlib 5 | 6 | from datetime import datetime, timedelta 7 | from urllib.parse import urlsplit 8 | 9 | 10 | class DiskCache: 11 | """ DiskCache helps store urls and their responses to disk 12 | Intialization components: 13 | cache_dir (str): abs file path or relative file path 14 | for cache directory (default: ../data/cache) 15 | max_len (int): maximum filename length (default: 255) 16 | compress (bool): use zlib compression (default: True) 17 | encoding (str): character encoding for compression (default: utf-8) 18 | expires (datetime.timedelta): timedelta when content will expire 19 | (default: 30 days ago) 20 | """ 21 | def __init__(self, cache_dir='../data/cache', max_len=255, compress=True, 22 | encoding='utf-8', expires=timedelta(days=30)): 23 | self.cache_dir = cache_dir 24 | self.max_len = max_len 25 | self.compress = compress 26 | self.encoding = encoding 27 | self.expires = expires 28 | 29 | def url_to_path(self, url): 30 | """ Return file system path string for given URL """ 31 | components = urlsplit(url) 32 | # append index.html to empty paths 33 | path = components.path 34 | if not path: 35 | path = '/index.html' 36 | elif path.endswith('/'): 37 | path += 'index.html' 38 | filename = components.netloc + path + components.query 39 | # replace invalid characters 40 | filename = re.sub(r'[^/0-9a-zA-Z\-.,;_ ]', '_', filename) 41 | # restrict maximum number of characters 42 | filename = '/'.join(seg[:self.max_len] for seg in filename.split('/')) 43 | return os.path.join(self.cache_dir, filename) 44 | 45 | def __getitem__(self, url): 46 | """Load data from disk for given URL""" 47 | path = self.url_to_path(url) 48 | if os.path.exists(path): 49 | mode = ('rb' if self.compress else 'r') 50 | with open(path, mode) as fp: 51 | if self.compress: 52 | data = zlib.decompress(fp.read()).decode(self.encoding) 53 | data = json.loads(data) 54 | else: 55 | data = json.load(fp) 56 | exp_date = data.get('expires') 57 | if exp_date and datetime.strptime(exp_date, 58 | '%Y-%m-%dT%H:%M:%S') <= datetime.utcnow(): 59 | print('Cache expired!', exp_date) 60 | raise KeyError(url + ' has expired.') 61 | return data 62 | else: 63 | # URL has not yet been cached 64 | raise KeyError(url + ' does not exist') 65 | 66 | def __setitem__(self, url, result): 67 | """Save data to disk for given url""" 68 | path = self.url_to_path(url) 69 | folder = os.path.dirname(path) 70 | if not os.path.exists(folder): 71 | os.makedirs(folder) 72 | mode = ('wb' if self.compress else 'w') 73 | # Note: the timespec command requires Py3.6+ (if using 3.X you can 74 | # 
export using isoformat() and import with '%Y-%m-%dT%H:%M:%S.%f') 75 | result['expires'] = (datetime.utcnow() + self.expires).isoformat( 76 | timespec='seconds') 77 | with open(path, mode) as fp: 78 | if self.compress: 79 | data = bytes(json.dumps(result), self.encoding) 80 | fp.write(zlib.compress(data)) 81 | else: 82 | json.dump(result, fp) 83 | -------------------------------------------------------------------------------- /code/chp1/advanced_link_crawler_using_requests.py: -------------------------------------------------------------------------------- 1 | import re 2 | from urllib import robotparser 3 | from urllib.parse import urljoin 4 | 5 | import requests 6 | from chp1.throttle import Throttle 7 | 8 | 9 | def download(url, num_retries=2, user_agent='wswp', proxies=None): 10 | """ Download a given URL and return the page content 11 | args: 12 | url (str): URL 13 | kwargs: 14 | user_agent (str): user agent (default: wswp) 15 | proxies (dict): proxy dict w/ keys 'http' and 'https', values 16 | are strs (i.e. 'http(s)://IP') (default: None) 17 | num_retries (int): # of retries if a 5xx error is seen (default: 2) 18 | """ 19 | print('Downloading:', url) 20 | headers = {'User-Agent': user_agent} 21 | try: 22 | resp = requests.get(url, headers=headers, proxies=proxies) 23 | html = resp.text 24 | if resp.status_code >= 400: 25 | print('Download error:', resp.text) 26 | html = None 27 | if num_retries and 500 <= resp.status_code < 600: 28 | # recursively retry 5xx HTTP errors 29 | return download(url, num_retries - 1, user_agent=user_agent, proxies=proxies) 30 | except requests.exceptions.RequestException as e: 31 | print('Download error:', e) 32 | html = None 33 | return html 34 | 35 | 36 | def get_robots_parser(robots_url): 37 | " Return the robots parser object using the robots_url " 38 | rp = robotparser.RobotFileParser() 39 | rp.set_url(robots_url) 40 | rp.read() 41 | return rp 42 | 43 | 44 | def get_links(html): 45 | """ Return a list of links (using simple regex matching) 46 | from the html content """ 47 | # a regular expression to extract all links from the webpage 48 | webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE) 49 | # list of all links from the webpage 50 | return webpage_regex.findall(html) 51 | 52 | 53 | def link_crawler(start_url, link_regex, robots_url=None, user_agent='wswp', 54 | proxies=None, delay=3, max_depth=4): 55 | """ Crawl from the given start URL following links matched by link_regex. 56 | In the current implementation, we do not actually scrape any information. 57 | 58 | args: 59 | start_url (str): web site to start crawl 60 | link_regex (str): regex to match for links 61 | kwargs: 62 | robots_url (str): url of the site's robots.txt 63 | (default: start_url + /robots.txt) 64 | user_agent (str): user agent (default: wswp) 65 | proxies (dict): proxy dict w/ keys 'http' and 'https', values 66 | are strs (i.e.
'http(s)://IP') (default: None) 67 | delay (int): seconds to throttle between requests 68 | to one domain (default: 3) 69 | max_depth (int): maximum crawl depth (to avoid traps) (default: 4) 70 | """ 71 | crawl_queue = [start_url] 72 | # keep track which URL's have seen before 73 | seen = {} 74 | if not robots_url: 75 | robots_url = '{}/robots.txt'.format(start_url) 76 | rp = get_robots_parser(robots_url) 77 | throttle = Throttle(delay) 78 | while crawl_queue: 79 | url = crawl_queue.pop() 80 | # check url passes robots.txt restrictions 81 | if rp.can_fetch(user_agent, url): 82 | depth = seen.get(url, 0) 83 | if depth == max_depth: 84 | print('Skipping %s due to depth' % url) 85 | continue 86 | throttle.wait(url) 87 | html = download(url, user_agent=user_agent, proxies=proxies) 88 | if not html: 89 | continue 90 | # TODO: add actual data scraping here 91 | # filter for links matching our regular expression 92 | for link in get_links(html): 93 | if re.match(link_regex, link): 94 | abs_link = urljoin(start_url, link) 95 | if abs_link not in seen: 96 | seen[abs_link] = depth + 1 97 | crawl_queue.append(abs_link) 98 | else: 99 | print('Blocked by robots.txt:', url) 100 | -------------------------------------------------------------------------------- /code/chp1/advanced_link_crawler.py: -------------------------------------------------------------------------------- 1 | import re 2 | import urllib.request 3 | from urllib import robotparser 4 | from urllib.parse import urljoin 5 | from urllib.error import URLError, HTTPError, ContentTooShortError 6 | from chp1.throttle import Throttle 7 | 8 | 9 | def download(url, num_retries=2, user_agent='wswp', charset='utf-8', proxy=None): 10 | """ Download a given URL and return the page content 11 | args: 12 | url (str): URL 13 | kwargs: 14 | user_agent (str): user agent (default: wswp) 15 | charset (str): charset if website does not include one in headers 16 | proxy (str): proxy url, ex 'http://IP' (default: None) 17 | num_retries (int): number of retries if a 5xx error is seen (default: 2) 18 | """ 19 | print('Downloading:', url) 20 | request = urllib.request.Request(url) 21 | request.add_header('User-agent', user_agent) 22 | try: 23 | if proxy: 24 | proxy_support = urllib.request.ProxyHandler({'http': proxy}) 25 | opener = urllib.request.build_opener(proxy_support) 26 | urllib.request.install_opener(opener) 27 | resp = urllib.request.urlopen(request) 28 | cs = resp.headers.get_content_charset() 29 | if not cs: 30 | cs = charset 31 | html = resp.read().decode(cs) 32 | except (URLError, HTTPError, ContentTooShortError) as e: 33 | print('Download error:', e.reason) 34 | html = None 35 | if num_retries > 0: 36 | if hasattr(e, 'code') and 500 <= e.code < 600: 37 | # recursively retry 5xx HTTP errors 38 | return download(url, num_retries - 1, user_agent=user_agent, charset=charset, proxy=proxy) 39 | return html 40 | 41 | 42 | def get_robots_parser(robots_url): 43 | " Return the robots parser object using the robots_url " 44 | rp = robotparser.RobotFileParser() 45 | rp.set_url(robots_url) 46 | rp.read() 47 | return rp 48 | 49 | 50 | def get_links(html): 51 | " Return a list of links (using simple regex matching) from the html content " 52 | # a regular expression to extract all links from the webpage 53 | webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE) 54 | # list of all links from the webpage 55 | return webpage_regex.findall(html) 56 | 57 | 58 | def link_crawler(start_url, link_regex, robots_url=None, user_agent='wswp', 59 | proxy=None, delay=3, max_depth=4): 60 | """ Crawl from the given
start URL following links matched by link_regex. In the current 61 | implementation, we do not actually scrape any information. 62 | 63 | args: 64 | start_url (str): web site to start crawl 65 | link_regex (str): regex to match for links 66 | kwargs: 67 | robots_url (str): url of the site's robots.txt (default: start_url + /robots.txt) 68 | user_agent (str): user agent (default: wswp) 69 | proxy (str): proxy url, ex 'http://IP' (default: None) 70 | delay (int): seconds to throttle between requests to one domain (default: 3) 71 | max_depth (int): maximum crawl depth (to avoid traps) (default: 4) 72 | """ 73 | crawl_queue = [start_url] 74 | # keep track which URL's have seen before 75 | seen = {} 76 | if not robots_url: 77 | robots_url = '{}/robots.txt'.format(start_url) 78 | rp = get_robots_parser(robots_url) 79 | throttle = Throttle(delay) 80 | while crawl_queue: 81 | url = crawl_queue.pop() 82 | # check url passes robots.txt restrictions 83 | if rp.can_fetch(user_agent, url): 84 | depth = seen.get(url, 0) 85 | if depth == max_depth: 86 | print('Skipping %s due to depth' % url) 87 | continue 88 | throttle.wait(url) 89 | html = download(url, user_agent=user_agent, proxy=proxy) 90 | if not html: 91 | continue 92 | # TODO: add actual data scraping here 93 | # filter for links matching our regular expression 94 | for link in get_links(html): 95 | if re.match(link_regex, link): 96 | abs_link = urljoin(start_url, link) 97 | if abs_link not in seen: 98 | seen[abs_link] = depth + 1 99 | crawl_queue.append(abs_link) 100 | else: 101 | print('Blocked by robots.txt:', url) 102 | -------------------------------------------------------------------------------- /code/chp4/advanced_link_crawler.py: -------------------------------------------------------------------------------- 1 | import re 2 | import socket 3 | from urllib import robotparser 4 | from urllib.parse import urljoin, urlparse 5 | from chp3.downloader import Downloader 6 | 7 | socket.setdefaulttimeout(60) 8 | 9 | 10 | def get_robots_parser(robots_url): 11 | " Return the robots parser object using the robots_url " 12 | try: 13 | rp = robotparser.RobotFileParser() 14 | rp.set_url(robots_url) 15 | rp.read() 16 | return rp 17 | except Exception as e: 18 | print('Error finding robots_url:', robots_url, e) 19 | 20 | 21 | def get_links(html): 22 | " Return a list of links (using simple regex matching) from the html content " 23 | # a regular expression to extract all links from the webpage 24 | webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE) 25 | # list of all links from the webpage 26 | return webpage_regex.findall(html) 27 | 28 | 29 | def link_crawler(start_url, link_regex, robots_url=None, user_agent='wswp', 30 | proxies=None, delay=3, max_depth=4, num_retries=2, cache={}, scraper_callback=None): 31 | """ Crawl from the given start URL following links matched by link_regex. In the current 32 | implementation, we do not actually scrape any information.
33 | 34 | args: 35 | start_url (str or list of strs): web site(s) to start crawl 36 | link_regex (str): regex to match for links 37 | kwargs: 38 | robots_url (str): url of the site's robots.txt (default: start_url + /robots.txt) 39 | user_agent (str): user agent (default: wswp) 40 | proxies (list of dicts): a list of possible dicts for http / https proxies 41 | For formatting, see the requests library 42 | delay (int): seconds to throttle between requests to one domain (default: 3) 43 | max_depth (int): maximum crawl depth (to avoid traps) (default: 4) 44 | num_retries (int): # of retries when 5xx error (default: 2) 45 | cache (dict): cache dict with urls as keys and dicts for responses (default: {}) 46 | scraper_callback: function to be called on url and html content 47 | """ 48 | if isinstance(start_url, list): 49 | crawl_queue = start_url 50 | else: 51 | crawl_queue = [start_url] 52 | # keep track which URL's have seen before 53 | seen, robots = {}, {} 54 | D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies, cache=cache) 55 | while crawl_queue: 56 | url = crawl_queue.pop() 57 | no_robots = False 58 | if 'http' not in url: 59 | continue 60 | domain = '{}://{}'.format(urlparse(url).scheme, urlparse(url).netloc) 61 | rp = robots.get(domain) 62 | if not rp and domain not in robots: 63 | robots_url = '{}/robots.txt'.format(domain) 64 | rp = get_robots_parser(robots_url) 65 | if not rp: 66 | # issue finding robots.txt, still crawl 67 | no_robots = True 68 | robots[domain] = rp 69 | elif domain in robots: 70 | no_robots = True 71 | # check url passes robots.txt restrictions 72 | if no_robots or rp.can_fetch(user_agent, url): 73 | depth = seen.get(url, 0) 74 | if depth == max_depth: 75 | print('Skipping %s due to depth' % url) 76 | continue 77 | html = D(url, num_retries=num_retries) 78 | if not html: 79 | continue 80 | if scraper_callback: 81 | links = scraper_callback(url, html) or [] 82 | else: 83 | links = [] 84 | # filter for links matching our regular expression 85 | for link in get_links(html) + links: 86 | if re.match(link_regex, link): 87 | if 'http' not in link: 88 | if link.startswith('//'): 89 | link = '{}:{}'.format(urlparse(url).scheme, link) 90 | elif link.startswith('://'): 91 | link = '{}{}'.format(urlparse(url).scheme, link) 92 | else: 93 | link = urljoin(domain, link) 94 | 95 | if link not in seen: 96 | seen[link] = depth + 1 97 | crawl_queue.append(link) 98 | else: 99 | print('Blocked by robots.txt:', url) 100 | 101 | 102 | if __name__ == '__main__': 103 | from chp4.alexa_callback import AlexaCallback 104 | from chp3.rediscache import RedisCache 105 | from time import time 106 | AC = AlexaCallback() 107 | AC() 108 | start_time = time() 109 | link_crawler(AC.urls, '$^', cache=RedisCache()) 110 | print('Total time: %ss' % (time() - start_time)) 111 | -------------------------------------------------------------------------------- /code/chp2/advanced_link_crawler.py: -------------------------------------------------------------------------------- 1 | import re 2 | import urllib.request 3 | from urllib import robotparser 4 | from urllib.parse import urljoin 5 | from urllib.error import URLError, HTTPError, ContentTooShortError 6 | from lxml.html import fromstring 7 | from chp1.throttle import Throttle 8 | 9 | 10 | def download(url, num_retries=2, user_agent='wswp', charset='utf-8', proxy=None): 11 | """ Download a given URL and return the page content 12 | args: 13 | url (str): URL 14 | kwargs: 15 | user_agent (str): user agent (default: wswp) 16 | charset (str): 
charset if website does not include one in headers 17 | proxy (str): proxy url, ex 'http://IP' (default: None) 18 | num_retries (int): number of retries if a 5xx error is seen (default: 2) 19 | """ 20 | print('Downloading:', url) 21 | request = urllib.request.Request(url) 22 | request.add_header('User-agent', user_agent) 23 | try: 24 | if proxy: 25 | proxy_support = urllib.request.ProxyHandler({'http': proxy}) 26 | opener = urllib.request.build_opener(proxy_support) 27 | urllib.request.install_opener(opener) 28 | resp = urllib.request.urlopen(request) 29 | cs = resp.headers.get_content_charset() 30 | if not cs: 31 | cs = charset 32 | html = resp.read().decode(cs) 33 | except (URLError, HTTPError, ContentTooShortError) as e: 34 | print('Download error:', e) 35 | html = None 36 | if num_retries > 0: 37 | if hasattr(e, 'code') and 500 <= e.code < 600: 38 | # recursively retry 5xx HTTP errors 39 | return download(url, num_retries - 1, user_agent=user_agent, charset=charset, proxy=proxy) 40 | return html 41 | 42 | 43 | def get_robots_parser(robots_url): 44 | " Return the robots parser object using the robots_url " 45 | rp = robotparser.RobotFileParser() 46 | rp.set_url(robots_url) 47 | rp.read() 48 | return rp 49 | 50 | 51 | def get_links(html): 52 | " Return a list of links (using simple regex matching) from the html content " 53 | # a regular expression to extract all links from the webpage 54 | webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE) 55 | # list of all links from the webpage 56 | return webpage_regex.findall(html) 57 | 58 | 59 | def scrape_callback(url, html): 60 | """ Scrape each row from the country data using XPath and lxml """ 61 | fields = ('area', 'population', 'iso', 'country', 'capital', 62 | 'continent', 'tld', 'currency_code', 'currency_name', 63 | 'phone', 'postal_code_format', 'postal_code_regex', 64 | 'languages', 'neighbours') 65 | if re.search('/view/', url): 66 | tree = fromstring(html) 67 | all_rows = [ 68 | tree.xpath('//tr[@id="places_%s__row"]/td[@class="w2p_fw"]' % field)[0].text_content() 69 | for field in fields] 70 | print(url, all_rows) 71 | 72 | 73 | def link_crawler(start_url, link_regex, robots_url=None, user_agent='wswp', 74 | proxy=None, delay=3, max_depth=4, scrape_callback=None): 75 | """ Crawl from the given start URL following links matched by link_regex. In the current 76 | implementation, we do not actually scrape any information.
77 | 78 | args: 79 | start_url (str): web site to start crawl 80 | link_regex (str): regex to match for links 81 | kwargs: 82 | robots_url (str): url of the site's robots.txt (default: start_url + /robots.txt) 83 | user_agent (str): user agent (default: wswp) 84 | proxy (str): proxy url, ex 'http://IP' (default: None) 85 | delay (int): seconds to throttle between requests to one domain (default: 3) 86 | max_depth (int): maximum crawl depth (to avoid traps) (default: 4) 87 | scrape_callback (function): function to call after each download (default: None) 88 | """ 89 | crawl_queue = [start_url] 90 | # keep track which URL's have seen before 91 | seen = {} 92 | data = [] 93 | if not robots_url: 94 | robots_url = '{}/robots.txt'.format(start_url) 95 | rp = get_robots_parser(robots_url) 96 | throttle = Throttle(delay) 97 | while crawl_queue: 98 | url = crawl_queue.pop() 99 | # check url passes robots.txt restrictions 100 | if rp.can_fetch(user_agent, url): 101 | depth = seen.get(url, 0) 102 | if depth == max_depth: 103 | print('Skipping %s due to depth' % url) 104 | continue 105 | throttle.wait(url) 106 | html = download(url, user_agent=user_agent, proxy=proxy) 107 | if not html: 108 | continue 109 | if scrape_callback: 110 | data.extend(scrape_callback(url, html) or []) 111 | # filter for links matching our regular expression 112 | for link in get_links(html): 113 | if re.match(link_regex, link): 114 | abs_link = urljoin(start_url, link) 115 | if abs_link not in seen: 116 | seen[abs_link] = depth + 1 117 | crawl_queue.append(abs_link) 118 | else: 119 | print('Blocked by robots.txt:', url) 120 | -------------------------------------------------------------------------------- /code/chp4/threaded_crawler.py: -------------------------------------------------------------------------------- 1 | import re 2 | import socket 3 | import threading 4 | import time 5 | from urllib import robotparser 6 | from urllib.parse import urljoin, urlparse 7 | from chp3.downloader import Downloader 8 | 9 | SLEEP_TIME = 1 10 | socket.setdefaulttimeout(60) 11 | 12 | 13 | def get_robots_parser(robots_url): 14 | " Return the robots parser object using the robots_url " 15 | try: 16 | rp = robotparser.RobotFileParser() 17 | rp.set_url(robots_url) 18 | rp.read() 19 | return rp 20 | except Exception as e: 21 | print('Error finding robots_url:', robots_url, e) 22 | 23 | 24 | def get_links(html): 25 | " Return a list of links (using simple regex matching) from the html content " 26 | # a regular expression to extract all links from the webpage 27 | webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE) 28 | # list of all links from the webpage 29 | return webpage_regex.findall(html) 30 | 31 | 32 | def threaded_crawler(start_url, link_regex, user_agent='wswp', proxies=None, 33 | delay=3, max_depth=4, num_retries=2, cache={}, max_threads=10, scraper_callback=None): 34 | """ Crawl from the given start URLs following links matched by link_regex. In this 35 | implementation, we do not actually scrape any information.
36 | 37 | args: 38 | start_url (str or list of strs): web site(s) to start crawl 39 | link_regex (str): regex to match for links 40 | kwargs: 41 | user_agent (str): user agent (default: wswp) 42 | proxies (list of dicts): a list of possible dicts for http / https proxies 43 | For formatting, see the requests library 44 | delay (int): seconds to throttle between requests to one domain (default: 3) 45 | max_depth (int): maximum crawl depth (to avoid traps) (default: 4) 46 | num_retries (int): # of retries when 5xx error (default: 2) 47 | cache (dict): cache dict with urls as keys and dicts for responses (default: {}) 48 | scraper_callback: function to be called on url and html content 49 | """ 50 | if isinstance(start_url, list): 51 | crawl_queue = start_url 52 | else: 53 | crawl_queue = [start_url] 54 | # keep track which URL's have seen before 55 | seen, robots = {}, {} 56 | D = Downloader(delay=delay, user_agent=user_agent, proxies=proxies, cache=cache) 57 | 58 | def process_queue(): 59 | while crawl_queue: 60 | url = crawl_queue.pop() 61 | no_robots = False 62 | if not url or 'http' not in url: 63 | continue 64 | domain = '{}://{}'.format(urlparse(url).scheme, urlparse(url).netloc) 65 | rp = robots.get(domain) 66 | if not rp and domain not in robots: 67 | robots_url = '{}/robots.txt'.format(domain) 68 | rp = get_robots_parser(robots_url) 69 | if not rp: 70 | # issue finding robots.txt, still crawl 71 | no_robots = True 72 | robots[domain] = rp 73 | elif domain in robots: 74 | no_robots = True 75 | # check url passes robots.txt restrictions 76 | if no_robots or rp.can_fetch(user_agent, url): 77 | depth = seen.get(url, 0) 78 | if depth == max_depth: 79 | print('Skipping %s due to depth' % url) 80 | continue 81 | html = D(url, num_retries=num_retries) 82 | if not html: 83 | continue 84 | if scraper_callback: 85 | links = scraper_callback(url, html) or [] 86 | else: 87 | links = [] 88 | # filter for links matching our regular expression 89 | for link in get_links(html) + links: 90 | if re.match(link_regex, link): 91 | if 'http' not in link: 92 | if link.startswith('//'): 93 | link = '{}:{}'.format(urlparse(url).scheme, link) 94 | elif link.startswith('://'): 95 | link = '{}{}'.format(urlparse(url).scheme, link) 96 | else: 97 | link = urljoin(domain, link) 98 | if link not in seen: 99 | seen[link] = depth + 1 100 | crawl_queue.append(link) 101 | else: 102 | print('Blocked by robots.txt:', url) 103 | 104 | # wait for all download threads to finish 105 | threads = [] 106 | print(max_threads) 107 | while threads or crawl_queue: 108 | for thread in threads: 109 | if not thread.is_alive(): 110 | threads.remove(thread) 111 | while len(threads) < max_threads and crawl_queue: 112 | # can start some more threads 113 | thread = threading.Thread(target=process_queue) 114 | thread.setDaemon(True) # set daemon so main thread can exit w/ ctrl-c 115 | thread.start() 116 | threads.append(thread) 117 | print(threads) 118 | for thread in threads: 119 | thread.join() 120 | 121 | time.sleep(SLEEP_TIME) 122 | 123 | 124 | if __name__ == '__main__': 125 | from chp4.alexa_callback import AlexaCallback 126 | from chp3.rediscache import RedisCache 127 | import argparse 128 | 129 | parser = argparse.ArgumentParser(description='Threaded link crawler') 130 | parser.add_argument('max_threads', type=int, help='maximum number of threads', 131 | nargs='?', default=5) 132 | parser.add_argument('url_pattern', type=str, help='regex pattern for url matching', 133 | nargs='?', default='$^') 134 | par_args = parser.parse_args() 
135 | AC = AlexaCallback() 136 | AC() 137 | start_time = time.time() 138 | threaded_crawler(AC.urls, par_args.url_pattern, cache=RedisCache(), 139 | max_threads=par_args.max_threads) 140 | print('Total time: %ss' % (time.time() - start_time)) 141 | -------------------------------------------------------------------------------- /code/chp4/threaded_crawler_with_queue.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import re 3 | import socket 4 | import threading 5 | import time 6 | from urllib import robotparser 7 | from urllib.parse import urljoin, urlparse 8 | from chp3.downloader import Downloader 9 | from chp4.redis_queue import RedisQueue 10 | 11 | 12 | SLEEP_TIME = 1 13 | socket.setdefaulttimeout(60) 14 | 15 | 16 | def get_robots_parser(robots_url): 17 | " Return the robots parser object using the robots_url " 18 | try: 19 | rp = robotparser.RobotFileParser() 20 | rp.set_url(robots_url) 21 | rp.read() 22 | return rp 23 | except Exception as e: 24 | print('Error finding robots_url:', robots_url, e) 25 | 26 | 27 | def clean_link(url, domain, link): 28 | if link.startswith('//'): 29 | link = '{}:{}'.format(urlparse(url).scheme, link) 30 | elif link.startswith('://'): 31 | link = '{}{}'.format(urlparse(url).scheme, link) 32 | else: 33 | link = urljoin(domain, link) 34 | return link 35 | 36 | 37 | def get_links(html, link_regex): 38 | " Return a list of links (using simple regex matching) from the html content " 39 | # a regular expression to extract all links from the webpage 40 | webpage_regex = re.compile("""<a[^>]+href=["'](.*?)["']""", re.IGNORECASE) 41 | # list of all links from the webpage 42 | links = webpage_regex.findall(html) 43 | links = (link for link in links if re.match(link_regex, link)) 44 | return links 45 | 46 | 47 | def threaded_crawler_rq(start_url, link_regex, user_agent='wswp', proxies=None, 48 | delay=3, max_depth=4, num_retries=2, cache={}, max_threads=10, scraper_callback=None): 49 | """ Crawl from the given start URLs following links matched by link_regex. In this 50 | implementation, we do not actually scrape any information.
51 | 52 | args: 53 | start_url (str or list of strs): web site(s) to start crawl 54 | link_regex (str): regex to match for links 55 | kwargs: 56 | user_agent (str): user agent (default: wswp) 57 | proxies (list of dicts): a list of possible dicts 58 | for http / https proxies 59 | For formatting, see the requests library 60 | delay (int): seconds to throttle between requests to one domain 61 | (default: 3) 62 | max_depth (int): maximum crawl depth (to avoid traps) (default: 4) 63 | num_retries (int): # of retries when 5xx error (default: 2) 64 | cache (dict): cache dict with urls as keys 65 | and dicts for responses (default: {}) 66 | scraper_callback: function to be called on url and html content 67 | """ 68 | crawl_queue = RedisQueue() 69 | crawl_queue.push(start_url) 70 | # keep track which URL's have seen before 71 | robots = {} 72 | D = Downloader(delay=delay, user_agent=user_agent, 73 | proxies=proxies, cache=cache) 74 | 75 | def process_queue(): 76 | while len(crawl_queue): 77 | url = crawl_queue.pop() 78 | no_robots = False 79 | if not url or 'http' not in url: 80 | continue 81 | domain = '{}://{}'.format(urlparse(url).scheme, 82 | urlparse(url).netloc) 83 | rp = robots.get(domain) 84 | if not rp and domain not in robots: 85 | robots_url = '{}/robots.txt'.format(domain) 86 | rp = get_robots_parser(robots_url) 87 | if not rp: 88 | # issue finding robots.txt, still crawl 89 | no_robots = True 90 | robots[domain] = rp 91 | elif domain in robots: 92 | no_robots = True 93 | # check url passes robots.txt restrictions 94 | if no_robots or rp.can_fetch(user_agent, url): 95 | depth = crawl_queue.get_depth(url) 96 | if depth == max_depth: 97 | print('Skipping %s due to depth' % url) 98 | continue 99 | html = D(url, num_retries=num_retries) 100 | if not html: 101 | continue 102 | if scraper_callback: 103 | links = scraper_callback(url, html) or [] 104 | else: 105 | links = [] 106 | # filter for links matching our regular expression 107 | for link in list(get_links(html, link_regex)) + links: 108 | if 'http' not in link: 109 | link = clean_link(url, domain, link) 110 | crawl_queue.push(link) 111 | crawl_queue.set_depth(link, depth + 1) 112 | else: 113 | print('Blocked by robots.txt:', url) 114 | 115 | # wait for all download threads to finish 116 | threads = [] 117 | while threads or len(crawl_queue): 118 | for thread in threads: 119 | if not thread.is_alive(): 120 | threads.remove(thread) 121 | while len(threads) < max_threads and crawl_queue: 122 | # can start some more threads 123 | thread = threading.Thread(target=process_queue) 124 | thread.setDaemon(True) # set daemon so main thread can exit w/ ctrl-c 125 | thread.start() 126 | threads.append(thread) 127 | 128 | for thread in threads: 129 | thread.join() 130 | 131 | time.sleep(SLEEP_TIME) 132 | 133 | 134 | def mp_threaded_crawler(*args, **kwargs): 135 | """ create a multiprocessing threaded crawler """ 136 | processes = [] 137 | num_procs = kwargs.pop('num_procs') 138 | if not num_procs: 139 | num_procs = multiprocessing.cpu_count() 140 | for _ in range(num_procs): 141 | proc = multiprocessing.Process(target=threaded_crawler_rq, 142 | args=args, kwargs=kwargs) 143 | proc.start() 144 | processes.append(proc) 145 | # wait for processes to complete 146 | for proc in processes: 147 | proc.join() 148 | 149 | 150 | if __name__ == '__main__': 151 | from chp4.alexa_callback import AlexaCallback 152 | from chp3.rediscache import RedisCache 153 | import argparse 154 | 155 | parser = argparse.ArgumentParser(description='Multiprocessing threaded 
link crawler') 156 | parser.add_argument('max_threads', type=int, help='maximum number of threads', 157 | nargs='?', default=5) 158 | parser.add_argument('num_procs', type=int, help='number of processes', 159 | nargs='?', default=None) 160 | parser.add_argument('url_pattern', type=str, help='regex pattern for url matching', 161 | nargs='?', default='$^') 162 | par_args = parser.parse_args() 163 | 164 | AC = AlexaCallback() 165 | AC() 166 | start_time = time.time() 167 | 168 | mp_threaded_crawler(AC.urls, par_args.url_pattern, cache=RedisCache(), 169 | num_procs=par_args.num_procs, max_threads=par_args.max_threads) 170 | print('Total time: %ss' % (time.time() - start_time)) 171 | --------------------------------------------------------------------------------
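
Usage sketch (not part of the repository): one possible way to drive the chapter 1 requests-based crawler shown above, assuming the code/ directory is on PYTHONPATH and a reachable target site; the start URL and the link pattern below are placeholders that would need to be adapted to the site being crawled.

# hypothetical driver script; the URL and the link regex are placeholders
from chp1.advanced_link_crawler_using_requests import link_crawler

if __name__ == '__main__':
    # follow only hrefs matching the pattern, throttled to one request per
    # domain every 3 seconds and limited to a shallow crawl depth
    link_crawler('http://example.com', r'/(index|view)/',
                 user_agent='wswp', delay=3, max_depth=2)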