├── LICENSE ├── README.md ├── chapter1 ├── code │ ├── algorithmia │ │ ├── algorithmia_analyze_url.py │ │ └── algorithmia_sitemap.py │ ├── mechanical-soup │ │ ├── bing_search.py │ │ ├── github_links.py │ │ ├── google_search.py │ │ └── twitter_login.py │ ├── metadata │ │ ├── extract_articles.py │ │ └── extract_site_metadata.py │ ├── parsel │ │ ├── extract_links_css.py │ │ └── extract_links_xpath.py │ ├── requests │ │ ├── request_response.py │ │ └── urllib_request.py │ ├── robobrowser │ │ ├── bing_search.py │ │ ├── download_file.py │ │ ├── get_emails_links_from_url.py │ │ ├── twitter_login_form.py │ │ └── website_parsing.py │ └── web_technologies │ │ └── web_technologies_builtwith.py └── images │ ├── algorithmia_analyze_url.png │ ├── algorithmia_analyze_url2.png │ ├── algorithmia_sitemap.png │ ├── algorithmia_sitemap2.png │ ├── bing_search.png │ ├── bing_search_output.png │ ├── bing_search_output_mechanical_soup.png │ ├── builtwith.png │ ├── builtwith_script.png │ ├── google_search_mechanical_soup.png │ ├── robobrowser_links.png │ └── wappalyzer.png ├── chapter12.zip ├── chapter2 ├── code │ ├── bs4 │ │ ├── BeautifulSoup-getLinks_csv.py │ │ ├── bs4_objects.py │ │ ├── demo_detail_book.py │ │ ├── download_images_from_url.py │ │ ├── getExternal_internal_links.py │ │ ├── get_offers_bs4.py │ │ └── wikipedia_links.py │ └── requests │ │ ├── crawler_urls.py │ │ ├── depth_search_extract_links.py │ │ ├── download_file_requests.py │ │ ├── extract_links_images_re.py │ │ ├── get_emails_from_url.py │ │ ├── get_html_requests.py │ │ ├── link_crawler_search.py │ │ ├── requests_post.py │ │ └── requests_user_agent.py └── images │ ├── download_images.png │ ├── download_images2.png │ ├── external_inernal_links.png │ ├── link_extractor.png │ ├── objects.png │ ├── packt_books.png │ ├── packtpub_links.png │ ├── packtpub_links2.png │ ├── packtpub_links_csv.png │ ├── packtpub_links_deep_search.png │ ├── requests_extract_links.png │ ├── requests_headers.png │ └── requests_post.png ├── chapter3 ├── code │ ├── books_scraping │ │ ├── bookList.csv │ │ ├── requests_bs4_initial.py │ │ └── requests_bs4_with_pages.py │ ├── chromedriver.exe │ ├── dolar-euro_converter.py │ ├── google_translate.py │ ├── interacting_with_form.py │ ├── phantomjs │ │ ├── phantomjs.exe │ │ ├── phantomjs_example1.py │ │ ├── phantomjs_example2.py │ │ └── phantomjs_example3.py │ ├── scraping_book_details_requests.py │ ├── selenium_list_book.py │ └── stack_overflow_tags.py └── images │ ├── ajax_image.png │ ├── book_info.png │ ├── book_packit.png │ ├── books_details.png │ ├── books_packit.png │ ├── converter.png │ ├── google_translate.png │ ├── selenium_methods.png │ └── xpath.png └── chapter4 ├── BooksSpider-multipage-details ├── books_crawler │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── BooksSpider.py │ │ └── __init__.py ├── output.son └── scrapy.cfg ├── BooksSpider-urls ├── books_crawler │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── BooksSpider.py │ │ └── __init__.py ├── books_links.json └── scrapy.cfg ├── BooksSpider-urls_download_images ├── books_crawler │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── BooksSpider.py │ │ └── __init__.py ├── output.son └── scrapy.cfg ├── europython ├── europython │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── items.cpython-37.pyc │ │ ├── pipelines.cpython-37.pyc │ │ └── settings.cpython-37.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── 
settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ └── europython_spider.cpython-37.pyc │ │ └── europython_spider.py ├── europython_items.csv ├── europython_items.json ├── europython_items.xml ├── scrapinghub.yml ├── scrapy.cfg └── setup.py ├── images ├── book_details.png ├── books_images.png ├── books_images_output.png ├── europython_talk.png ├── next_page.png ├── scrapy_books.png ├── scrapy_books_links.png ├── scrapy_options.png ├── scrapy_project.png ├── scrapy_shell.png └── scrapy_shell2.png ├── output.json └── spider_books.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Advanced-Web-Scraping-with-Python 2 | Advanced Web Scraping with Python, Published by Packt 3 | -------------------------------------------------------------------------------- /chapter1/code/algorithmia/algorithmia_analyze_url.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import Algorithmia 5 | import json 6 | 7 | input = [ "https://www.packtpub.com/iot-hardware/single-board-computers"] 8 | output = [] 9 | 10 | API_KEY ='simU+xQFB6Ts4O306dxEhZreKBA1' 11 | 12 | client = Algorithmia.client(API_KEY) 13 | 14 | algorithmia = client.algo('web/AnalyzeURL/0.2.17').pipe(input[0]) 15 | print(algorithmia.result) 16 | output.append(algorithmia.result) 17 | print(json.dumps(output, indent=4)) -------------------------------------------------------------------------------- /chapter1/code/algorithmia/algorithmia_sitemap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import Algorithmia 5 | 6 | input = [ "http://packtpub.com",1] 7 | 8 | API_KEY ='simU+xQFB6Ts4O306dxEhZreKBA1' 9 | 10 | client = Algorithmia.client(API_KEY) 11 | response = client.algo('web/SiteMap/0.1.7').pipe(input) 12 | siteMap = response.result 13 | print(siteMap) -------------------------------------------------------------------------------- /chapter1/code/mechanical-soup/bing_search.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import mechanicalsoup 5 | 6 | # Connect to bing search engine 7 | browser = mechanicalsoup.StatefulBrowser() 8 | browser.open("http://bing.com/") 9 | 10 | # Fill-in the search form 11 | browser.select_form('#sb_form') 12 | browser["q"] = "MechanicalSoup" 13 | browser.submit_selected() 14 | 15 | # Display the results 16 | for link in browser.links(): 17 | print(link.text, '->', link.attrs['href']) 18 | -------------------------------------------------------------------------------- /chapter1/code/mechanical-soup/github_links.py: -------------------------------------------------------------------------------- 1 | """Example app to login to GitHub using the StatefulBrowser class.""" 2 | #!/usr/bin/env python 3 | # -*- coding: utf-8 -*- 4 | 5 | from __future__ import print_function 6 | import argparse 7 | import mechanicalsoup 8 | from getpass import getpass 9 | 10 | parser = argparse.ArgumentParser(description="Login to GitHub.") 11 | parser.add_argument("username") 12 | args = parser.parse_args() 13 | 14 | args.password = getpass("Please enter your GitHub password: ") 15 | 16 | browser = mechanicalsoup.StatefulBrowser( 17 | soup_config={'features': 'lxml'}, 18 | raise_on_404=True, 19 | user_agent='MyBot/0.1: mysite.example.com/bot_info', 20 | ) 21 | # Uncomment for a more verbose output: 22 | browser.set_verbose(2) 23 | 24 | browser.open("https://github.com") 25 | browser.follow_link("login") 26 | browser.select_form('#login form') 27 | browser["login"] = args.username 28 | browser["password"] = args.password 29 | resp = browser.submit_selected() 30 | 31 | # Uncomment to launch a web browser on the current page: 32 | browser.launch_browser() 33 | 34 | # verify we are now logged in 35 | page = browser.get_current_page() 36 | 37 | for 
link in browser.links(): 38 | target = link.attrs['href'] 39 | print(target) 40 | 41 | messages = page.find("div", class_="flash-messages") 42 | if messages: 43 | print(messages.text) 44 | assert page.select(".logout-form") 45 | 46 | #print(page.title.text) 47 | #print(page) 48 | 49 | # verify we remain logged in (thanks to cookies) as we browse the rest of 50 | # the site 51 | page3 = browser.open("https://github.com/MechanicalSoup/MechanicalSoup") 52 | assert page3.soup.select(".logout-form") 53 | -------------------------------------------------------------------------------- /chapter1/code/mechanical-soup/google_search.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | import mechanicalsoup 6 | 7 | # Connect to Google 8 | browser = mechanicalsoup.StatefulBrowser() 9 | browser.open("https://www.google.com/") 10 | 11 | # Fill-in the form 12 | browser.select_form('form[action="/search"]') 13 | browser["q"] = "MechanicalSoup" 14 | 15 | # Note: the button name is btnK in the content served to actual 16 | # browsers, but btnG for bots. 17 | browser.submit_selected(btnName="btnG") 18 | print(browser.get_current_page()) 19 | 20 | # Display links 21 | for link in browser.links(): 22 | target = link.attrs['href'] 23 | # Filter-out unrelated links and extract actual URL from Google's 24 | # click-tracking. 25 | if (target.startswith('/url?') and not 26 | target.startswith("/url?q=http://webcache.googleusercontent.com")): 27 | target = re.sub(r"^/url\?q=([^&]*)&.*", r"\1", target) 28 | print(target) 29 | -------------------------------------------------------------------------------- /chapter1/code/mechanical-soup/twitter_login.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import mechanicalsoup 5 | import getpass 6 | 7 | URL = "https://twitter.com/login" 8 | 9 | username = input ("Username: ") 10 | password = getpass.getpass() 11 | 12 | # Create a browser object 13 | browser = mechanicalsoup.Browser() 14 | 15 | # request Twitter login page 16 | login_page = browser.get(URL) 17 | 18 | # we grab the login form 19 | login_form = login_page.soup.find("form", {"class":"t1-form clearfix signin js-signin"}) 20 | 21 | # find login and password inputs 22 | login_form.find("input", {"name": "session[username_or_email]"})["value"] = username 23 | login_form.find("input", {"name": "session[password]"})["value"] = password 24 | 25 | # submit form 26 | browser.submit(login_form, login_page.url) -------------------------------------------------------------------------------- /chapter1/code/metadata/extract_articles.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import newspaper 5 | 6 | cnn_paper = newspaper.build('http://cnn.com') 7 | 8 | print('*****************************category urls************************************\n') 9 | for category in cnn_paper.category_urls(): 10 | print(category) 11 | 12 | print('*****************************url articles************************************\n') 13 | 14 | for article in cnn_paper.articles: 15 | print(article.url) 16 | 17 | print('*****************************download first article************************************\n') 18 | cnn_article = cnn_paper.articles[0] 19 | cnn_article.download() 20 | cnn_article.parse() 21 | 22 | #print(cnn_article.html) 23 | 
print(cnn_article.text)

# nlp() must be called after parse() so that keywords and summary are populated
cnn_article.nlp()

print(cnn_article.keywords)
print(cnn_article.summary)
print(cnn_article.authors)
print(cnn_article.publish_date)
--------------------------------------------------------------------------------
/chapter1/code/metadata/extract_site_metadata.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import extruct
import requests
import pprint
from w3lib.html import get_base_url


pp = pprint.PrettyPrinter(indent=2)
r = requests.get('https://www.packtpub.com')
base_url = get_base_url(r.text, r.url)
data = extruct.extract(r.text, base_url=base_url)

pp.pprint(data)
--------------------------------------------------------------------------------
/chapter1/code/parsel/extract_links_css.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import requests
from parsel import Selector

# GET request to the packtpub site
response = requests.get('https://www.packtpub.com')

# "response.text" contains the whole web page content
selector = Selector(response.text)

# Extracting the href attribute from anchor tags
href_links = selector.css('a::attr(href)').extract()

# Extracting the src attribute from img tags
image_links = selector.css('img::attr(src)').extract()

print('*****************************href_links************************************\n')
print(href_links)


print('*****************************image_links************************************\n')
print(image_links)
--------------------------------------------------------------------------------
/chapter1/code/parsel/extract_links_xpath.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import requests
from parsel import Selector

# GET request to the packtpub site
response = requests.get('https://www.packtpub.com')

# "response.text" contains the whole web page content
selector = Selector(response.text)

# Extracting the href attribute from anchor tags
href_links = selector.xpath('//a/@href').getall()

# Extracting the src attribute from img tags
image_links = selector.xpath('//img/@src').getall()

print('*****href_links******\n')
print(href_links)


print('*****image_links*****\n')
print(image_links)
--------------------------------------------------------------------------------
/chapter1/code/requests/request_response.py:
--------------------------------------------------------------------------------
import requests

url = "http://www.packtpub.com"
# Package the request, send it and catch the response
response = requests.get(url)
# Store the response body in the html variable
html = response.text
# Print the html
print(html)
--------------------------------------------------------------------------------
/chapter1/code/requests/urllib_request.py:
--------------------------------------------------------------------------------
from urllib.request import urlopen, Request

# Specify the url
url = "http://www.packtpub.com"
# This packages the request
request = Request(url)
# Sends the request and catches the response: response
response = urlopen(request)
# Extract the
response using read() 10 | html = response.read() 11 | # Print the html 12 | print(html) 13 | # Closing the response 14 | response.close() -------------------------------------------------------------------------------- /chapter1/code/robobrowser/bing_search.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from robobrowser import RoboBrowser 5 | 6 | browser = RoboBrowser(history=True,parser="html.parser") 7 | browser.open("http://bing.com") 8 | #print(browser.parsed) 9 | 10 | #Find the element by id,action or css class in the html 11 | #form = browser.get_form(id = "sb_form") 12 | form = browser.get_form(action="/search") 13 | #form = browser.get_form(class_='sw_box hassbi') 14 | 15 | print(form) 16 | 17 | form.fields['q'].value = "python" 18 | #form["q"].value = "python" 19 | 20 | browser.submit_form(form) 21 | 22 | print('*****browser.find_all("a")******\n') 23 | 24 | links = browser.find_all("a") 25 | for link in links: 26 | try: 27 | print(link['href']) 28 | except Exception as exception: 29 | pass -------------------------------------------------------------------------------- /chapter1/code/robobrowser/download_file.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from robobrowser import RoboBrowser 5 | 6 | browser = RoboBrowser(history=True) 7 | 8 | url = "https://www.cse.unsw.edu.au/~en1811/python-docs/python-3.6.4-docs-pdf/tutorial.pdf" 9 | pdf_file_path = "tutorial.pdf" 10 | 11 | # get browser session 12 | request = browser.session.get(url, stream=True) 13 | 14 | with open(pdf_file_path, "wb") as pdf_file: 15 | pdf_file.write(request.content) -------------------------------------------------------------------------------- /chapter1/code/robobrowser/get_emails_links_from_url.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from robobrowser import RoboBrowser 5 | import re 6 | import argparse 7 | 8 | browser = RoboBrowser(history=True,parser="html.parser") 9 | 10 | def get_emails(domain): 11 | 12 | domain="http://"+domain 13 | browser.open(domain) 14 | contents = browser.find_all("a",href=re.compile("[-a-zA-Z0-9._]+@[-a-zA-Z0-9_]+.[a-zA-Z0-9_.]+")) 15 | for content in contents: 16 | print(content['href']) 17 | 18 | def get_links(domain): 19 | 20 | domain="http://"+domain 21 | browser.open(domain) 22 | 23 | print('*****browser.find_all("a")******\n') 24 | contents = browser.find_all("a") 25 | for content in contents: 26 | try: 27 | print(content['href']) 28 | except Exception as exception: 29 | pass 30 | 31 | print('*****browser.get_links()******\n') 32 | links = browser.get_links() 33 | for link in links: 34 | try: 35 | print(link['href']) 36 | except Exception as exception: 37 | pass 38 | 39 | if __name__ == "__main__": 40 | parser = argparse.ArgumentParser(description='gets emails from domain.', prog='get_emails_links_from_url.py', epilog="", add_help=False) 41 | parser.add_argument('-d', '--domain', metavar='', action='store', help='domain to be resolved.',required=True) 42 | args = parser.parse_args() 43 | get_emails(args.domain) 44 | get_links(args.domain) -------------------------------------------------------------------------------- /chapter1/code/robobrowser/twitter_login_form.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env 
python 2 | # -*- coding: utf-8 -*- 3 | 4 | from robobrowser import RoboBrowser 5 | 6 | browser = RoboBrowser(history=True,parser="html.parser") 7 | browser.open('http://twitter.com/login') 8 | print(browser.parsed) 9 | 10 | # Get the signup form by action or css class 11 | signup_form = browser.get_form(action="https://twitter.com/sessions") 12 | signup_form = browser.get_form(class_='t1-form clearfix signin js-signin') 13 | print(signup_form) 14 | 15 | # Inspect authenticity_token value 16 | print(signup_form['authenticity_token'].value) 17 | 18 | # Fill it out 19 | signup_form['session[username_or_email]'].value = 'username' 20 | signup_form['session[password]'].value = 'password' 21 | 22 | print(signup_form.serialize()) 23 | 24 | # Submit the form 25 | browser.submit_form(signup_form) -------------------------------------------------------------------------------- /chapter1/code/robobrowser/website_parsing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from robobrowser import RoboBrowser 5 | import requests 6 | 7 | url = "http://www.packtpub.com" 8 | browser = RoboBrowser(history=True,parser="html.parser") 9 | 10 | headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 11 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 12 | 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3', 13 | 'Accept-Encoding': 'none', 14 | 'Accept-Language': 'en-US,en;q=0.8', 15 | 'Connection': 'keep-alive'} 16 | 17 | session = requests.Session() 18 | session.headers = headers 19 | browser = RoboBrowser(session=session) 20 | 21 | browser.open(url) 22 | print(browser.parsed) -------------------------------------------------------------------------------- /chapter1/code/web_technologies/web_technologies_builtwith.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import requests 5 | import argparse 6 | import builtwith 7 | 8 | class BuiltWith(): 9 | 10 | def __init__(self): 11 | 12 | self.key = '1fb25d4e-31b7-468c-8793-4ecebc3467be' 13 | self.url ='http://api.builtwith.com/free1/api.json' 14 | 15 | def module_run(self, domain): 16 | print("\nDomain "+domain +"\n") 17 | print(builtwith.parse("http://"+domain)) 18 | payload = {'key': self.key, 'lookup': domain} 19 | response = requests.get(self.url, params=payload) 20 | json=response.json() 21 | print(json) 22 | 23 | 24 | if __name__ == '__main__': 25 | 26 | parser = argparse.ArgumentParser(description='BuiltWith') 27 | parser.add_argument('--domain', action="store", dest="domain",required=True) 28 | given_args = parser.parse_args() 29 | domain = given_args.domain 30 | builtWith = BuiltWith(); 31 | builtWith.module_run(domain); -------------------------------------------------------------------------------- /chapter1/images/algorithmia_analyze_url.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/algorithmia_analyze_url.png -------------------------------------------------------------------------------- /chapter1/images/algorithmia_analyze_url2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/algorithmia_analyze_url2.png -------------------------------------------------------------------------------- /chapter1/images/algorithmia_sitemap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/algorithmia_sitemap.png -------------------------------------------------------------------------------- /chapter1/images/algorithmia_sitemap2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/algorithmia_sitemap2.png -------------------------------------------------------------------------------- /chapter1/images/bing_search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/bing_search.png -------------------------------------------------------------------------------- /chapter1/images/bing_search_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/bing_search_output.png -------------------------------------------------------------------------------- /chapter1/images/bing_search_output_mechanical_soup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/bing_search_output_mechanical_soup.png -------------------------------------------------------------------------------- /chapter1/images/builtwith.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/builtwith.png -------------------------------------------------------------------------------- /chapter1/images/builtwith_script.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/builtwith_script.png -------------------------------------------------------------------------------- /chapter1/images/google_search_mechanical_soup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/google_search_mechanical_soup.png -------------------------------------------------------------------------------- /chapter1/images/robobrowser_links.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/robobrowser_links.png 
-------------------------------------------------------------------------------- /chapter1/images/wappalyzer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/wappalyzer.png -------------------------------------------------------------------------------- /chapter12.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter12.zip -------------------------------------------------------------------------------- /chapter2/code/bs4/BeautifulSoup-getLinks_csv.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import requests 3 | import csv 4 | 5 | url = "http://packtpub.com" 6 | 7 | csv_file = csv.writer(open("data_links.csv", "w")) 8 | csv_file.writerow(["Section" , "Link"]) 9 | 10 | # Getting the webpage, creating a Response object. 11 | response = requests.get(url) 12 | 13 | # Extracting the source code of the page. 14 | data = response.text 15 | 16 | # Passing the source code to Beautiful Soup to create a BeautifulSoup object for it. 17 | soup = BeautifulSoup(data, 'html.parser') 18 | 19 | # use the 'find_all' function to bring back all instances of the 'a' tag in the HTML and store in 'tags' variable 20 | # Extracting all the tags into a list. 21 | tags = soup.find_all('a') 22 | tags = soup.find_all('a', {'class': 'nav-anchor'}) # only for url = "http://packtpub.com" 23 | 24 | # Extracting URLs from the attribute href in the tags. 25 | for tag in tags: 26 | print(tag.get('href')) 27 | link = tag.get('href') 28 | text = tag.get_text() 29 | csv_file.writerow([text, link]) 30 | 31 | -------------------------------------------------------------------------------- /chapter2/code/bs4/bs4_objects.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import requests 5 | from bs4 import BeautifulSoup 6 | from fake_useragent import UserAgent 7 | 8 | ua = UserAgent() 9 | header = {'user-agent':ua.chrome} 10 | google_page = requests.get('http://www.packtpub.com',headers=header) 11 | 12 | soup = BeautifulSoup(google_page.content,'lxml') 13 | 14 | #find parent 15 | print("Parent of the form with id='search_mini_form':") 16 | parent_form = soup.find("form",{"id":"search_mini_form"}).parent 17 | print(parent_form) 18 | 19 | #get children form a specific element,in this case we are getting child elements of the form with id="search_mini_form" 20 | print("Children of the form with id='search_mini_form:'") 21 | for child in soup.find("form",{"id":"search_mini_form"}).children: 22 | print(child) 23 | 24 | #find next_siblings 25 | print("Siblings of the form with id='search_mini_form:'") 26 | for sibling in soup.find("form",{"id":"search_mini_form"}).input.next_siblings: 27 | print(sibling) -------------------------------------------------------------------------------- /chapter2/code/bs4/demo_detail_book.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from bs4 import BeautifulSoup 5 | import requests 6 | 7 | response = 
requests.get('https://www.packtpub.com/application-development/learn-python-programming-second-edition')
soup = BeautifulSoup(response.text,'lxml')

title = soup.find('span', attrs={'data-ui-id':'page-title-wrapper'}).text
author = soup.find('div', attrs={'class':'authors inline'}).text

print(title)
print(author)
--------------------------------------------------------------------------------
/chapter2/code/bs4/download_images_from_url.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import os, sys
import requests
from fake_useragent import UserAgent

def getAllImages(url):

    ua = UserAgent()
    header = {'user-agent':ua.chrome}
    schedule_page = requests.get(url,headers=header)

    # create a directory to save the images
    os.system("mkdir images_packtpub")

    bs = BeautifulSoup(schedule_page.text,"lxml")
    for image in bs.findAll("img"):
        print("found image")

        # Extract the location of the image. We also need to split the URL to get the image name, so let's do that with '.split()'
        src = image.get('src')
        print(src)

        parts_image = src.split("/")
        image_name = parts_image[len(parts_image)-1]

        # Save the image
        with open("images_packtpub/"+image_name,"wb") as f:
            f.write(requests.get(src).content)

getAllImages("http://www.packtpub.com")
--------------------------------------------------------------------------------
/chapter2/code/bs4/getExternal_internal_links.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-


from bs4 import BeautifulSoup
import re
import requests
import argparse

internalLinks = []
externalLinks = []

# Get a list of internal links that start with a "/"
def getInternalLinks(url,beautifulSoup):
    url = url.replace("http://", "").split("/")[0]
    for link in beautifulSoup.findAll("a", href=re.compile("^(/|.*"+url+")")):
        if link.attrs['href'] is not None:
            internalLinks.append(link.attrs['href'])
    return internalLinks


# Get all links that start with "http" or "www" and do not contain the current URL
def getExternalLinks(url,beautifulSoup):
    url = url.replace("http://", "").split("/")[0]
    for link in beautifulSoup.findAll("a", href=re.compile("^(http|www)((?!"+url+").)*$")):
        if link.attrs['href'] is not None:
            externalLinks.append(link.attrs['href'])
    return externalLinks


def crawlExternalLinks(website):
    html = requests.get(website)
    beautifulSoup = BeautifulSoup(html.text,"lxml")
    externalLinks = getExternalLinks(website, beautifulSoup)
    return externalLinks

def crawlInternalLinks(website):
    html = requests.get(website)
    beautifulSoup = BeautifulSoup(html.text,"lxml")
    internalLinks = getInternalLinks(website,beautifulSoup)
    return internalLinks

def getExternalInternalLinks(website):
    externalLinks = crawlExternalLinks(website)
    internalLinks = crawlInternalLinks(website)
    print("\nExternal links")
    print("-------------------")

    for external in externalLinks:
        print(external)

    print("\nInternal links")
    print("-------------------")
    for internal in internalLinks:
        print(internal)


if __name__== "__main__":

    # parse the command line arguments
    ap = argparse.ArgumentParser()
    ap.add_argument("-d","--domain",required=True,help="The domain to target ie. packtpub.com")
    args = vars(ap.parse_args())

    domain = args['domain']

    if domain.startswith("http://") == True:
        target = domain
    else:
        target = "http://" + domain

    getExternalInternalLinks(target)
--------------------------------------------------------------------------------
/chapter2/code/bs4/get_offers_bs4.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import requests

def getOffers(url):
    # We make the request to the page
    req = requests.get(url)
    # We verify that the request returns a Status Code = 200 (200 = Ok)
    statusCode = req.status_code
    if statusCode == 200:
        # We pass the HTML content of the web to a BeautifulSoup object
        html = BeautifulSoup(req.text, "html.parser")
        # We get all the div elements with class "offer-box"
        elements = html.find_all('div', {'class': 'offer-box'})
        # We go through all the entries to extract the title, description and link
        for item in elements:
            title = item.find('h3').getText()
            description = item.find('p').getText()
            link = item.find('a').get('href')

            # Print title, link and description
            print("Title....: " + title)
            print("Link:.....: " + link)
            print("Description:.....: " + description)
            print("**********************************")
    else:
        # If the page does not exist we show the error
        print("The url " + url + " gives an error %d" % statusCode)

getOffers("https://www.packtpub.com/offers")
--------------------------------------------------------------------------------
/chapter2/code/bs4/wikipedia_links.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup
import re

def getLinks(url):
    html = requests.get("http://en.wikipedia.org"+url).text
    bs = BeautifulSoup(html, "html.parser")
    return bs.find("div", {"id":"bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))

print("Main links from http://en.wikipedia.org/wiki/Python_(programming_language)")
links_level1 = getLinks("/wiki/Python_(programming_language)")

index = 0

for link in links_level1:

    # print the absolute URL of each first-level link
    print("http://en.wikipedia.org"+link.get('href'))

    newLink = links_level1[index].attrs["href"]

    links_level2 = getLinks(newLink)

    print("Links from http://en.wikipedia.org"+ newLink)

    for link in links_level2:
        print("http://en.wikipedia.org"+link.get('href'))

    index = index + 1
--------------------------------------------------------------------------------
/chapter2/code/requests/crawler_urls.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import requests

web = input("Url: ")
response = requests.get('http://'+web).text
urls = []

pattern = re.compile('''href=["'](.[^"']+)["']''')
search = re.findall(pattern, response)

for url in search:
    try:
        urls.append(url)
        d1 = str(url)
        urlList = open('crawler_urls.txt','a+')
        urlList.write(d1+"\n")
        urlList.close()
        print(url)
        # request each extracted url and collect its links as well
        response2 = requests.get(url).text
        search2 = re.findall(pattern, response2)
        for e in search2:
            urls.append(e)
            d2 = str(e)
            urlList = open('crawler_urls.txt','a+')
            urlList.write(d2+"\n")
            urlList.close()

    except Exception as e:
        pass

print("URLs saved in file crawler_urls.txt")
--------------------------------------------------------------------------------
/chapter2/code/requests/depth_search_extract_links.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from urllib.request import urljoin
from urllib.parse import urlparse
import re
import requests
from collections import deque

def download_page(url):
    try:
        return requests.get(url).text
    except:
        print('error in the url', url)

def extract_links(page):
    if not page:
        return []
    # match the href attribute of <a> tags
    link_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return [urljoin(page, link) for link in link_regex.findall(page)]

def get_links(page_url):
    host = urlparse(page_url)[1]
    page = download_page(page_url)
    links = extract_links(page)
    return [link for link in links if urlparse(link)[1] == host]

def depth_search(start_url):
    visited = set()
    queue = deque()
    queue.append(start_url)
    while queue:
        url = queue.popleft()
        if url in visited:
            continue
        visited.add(url)
        for link in get_links(url):
            queue.appendleft(link)
        print(url)

if __name__ == '__main__':

    print('Depth search extracting links ')
    print('----------------------------- ')
    depth_search('https://www.packtpub.com')
--------------------------------------------------------------------------------
/chapter2/code/requests/download_file_requests.py:
--------------------------------------------------------------------------------

import requests

def downloadFile(fileName):
    # extract the filename
    filename = fileName.split("/")[-1]
    # download the file using GET
    image = requests.get(fileName, stream=True)
    # save the content received into the file
    with open(filename, 'wb') as fileDescriptor:
        i = 0
        for chunk in image.iter_content(chunk_size=1024):
            i = i + 1
            fileDescriptor.write(chunk)
    return


downloadFile("https://www.packtpub.com/media/logo/stores/1/logo.png")
downloadFile("https://media.readthedocs.org/pdf/python-guide/latest/python-guide.pdf")
downloadFile("https://docs.python.org/3/archives/python-3.7.4-docs-pdf-letter.zip")
--------------------------------------------------------------------------------
/chapter2/code/requests/extract_links_images_re.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from urllib.request import urljoin
import re
import requests

def download_page(url):
    return requests.get(url).text

def extract_links(page):
    # match the href attribute of <a> tags
    link_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return link_regex.findall(page)

def extract_image_locations(page):
    # match the src attribute of <img> tags
    img_regex = re.compile('<img[^>]+src=["\'](.*?)["\']', re.IGNORECASE)
    return img_regex.findall(page)


if __name__ == '__main__':
    target_url = 'http://www.packtpub.com'
    packtpub = download_page(target_url)
    links =
extract_links(packtpub) 24 | 25 | for link in links: 26 | print(urljoin(target_url, link)) 27 | 28 | image_locations = extract_image_locations(packtpub) 29 | 30 | for src in image_locations: 31 | print(urljoin(target_url, src)) 32 | -------------------------------------------------------------------------------- /chapter2/code/requests/get_emails_from_url.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import requests 5 | import re 6 | import argparse 7 | 8 | def get_emails(domain): 9 | 10 | if not domain.startswith("http://") == True: 11 | domain="http://"+domain 12 | 13 | response = requests.get(domain) 14 | pattern = re.compile("[-a-zA-Z0-9._]+@[-a-zA-Z0-9_]+.[a-zA-Z0-9_.]+") 15 | mails = re.findall(pattern,response.text) 16 | emails = str(mails) 17 | 18 | print(emails) 19 | 20 | if __name__ == "__main__": 21 | parser = argparse.ArgumentParser(description='gets emails from domain.', prog='get_emails_from_url.py', epilog="", add_help=False) 22 | # Adding the argument 23 | parser.add_argument('-d', '--domain', metavar='', action='store', help='domain to be resolved.',required=True) 24 | args = parser.parse_args() 25 | 26 | get_emails(args.domain) -------------------------------------------------------------------------------- /chapter2/code/requests/get_html_requests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import requests 5 | from fake_useragent import UserAgent 6 | 7 | url = 'https://www.packtpub.com' 8 | file_name = 'packtpub.com.txt' 9 | 10 | user_agent = UserAgent() 11 | page = requests.get(url,headers={'user-agent':user_agent.chrome}) 12 | print(page.content) 13 | with open(file_name,'w') as file: 14 | file.write(page.content.decode('utf-8')) -------------------------------------------------------------------------------- /chapter2/code/requests/link_crawler_search.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | import requests 7 | import re 8 | processed = [] 9 | 10 | def search_links(url, depth, search): 11 | # Process http links that are not processed yet 12 | url_is_processed = (url in processed) 13 | if (url.startswith("http://") and (not url_is_processed)): 14 | processed.append(url) 15 | path = "/" 16 | urlparts = url.split("/") 17 | if (len(urlparts) > 1): 18 | host = urlparts[0] 19 | path = url.replace(host, "", 1) 20 | 21 | # Start crawling 22 | print("Crawling URL path:%s%s " %(host, path)) 23 | req = requests.get(host+path) 24 | 25 | # find the links 26 | contents = req.text 27 | all_links = re.findall('href="(.*?)"', contents) 28 | if (search in contents): 29 | print("Found " + search + " at " + url) 30 | print("-----------------------------------") 31 | print(" ==> %s: processing %s links" %(str(depth),str(len(all_links)))) 32 | 33 | for href in all_links: 34 | # Find relative urls 35 | print('link found '+href) 36 | # Recurse links 37 | if (depth > 0): 38 | search_links(href, depth-1, search) 39 | else: 40 | print("Skipping link: %s ..." 
%url) 41 | 42 | if __name__ == '__main__': 43 | parser = argparse.ArgumentParser(description='Webpage link crawler') 44 | parser.add_argument('--url', action="store", dest="url",required=True,type=str) 45 | parser.add_argument('--query', action="store", dest="query",required=True) 46 | parser.add_argument('--depth', action="store", dest="depth",default=1) 47 | given_args = parser.parse_args() 48 | try: 49 | if given_args.url.startswith("http://") == True: 50 | target = given_args.url 51 | else: 52 | target = "http://" + given_args.url 53 | search_links(target,given_args.depth,given_args.query) 54 | except KeyboardInterrupt: 55 | print("Aborting search by user request.") -------------------------------------------------------------------------------- /chapter2/code/requests/requests_post.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import requests 5 | data_dictionary = {'name': 'username','password': '123456','email': 'user@domain.com'} 6 | response = requests.post("http://httpbin.org/post",data=data_dictionary) 7 | 8 | if response.status_code == 200: 9 | print(response.text) -------------------------------------------------------------------------------- /chapter2/code/requests/requests_user_agent.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import requests, json 5 | from fake_useragent import UserAgent 6 | 7 | ua = UserAgent() 8 | header = {'user-agent':ua.chrome} 9 | 10 | responseGet = requests.get("https://www.packtpub.com",headers=header) 11 | print(responseGet.text.encode('utf-8')) 12 | print(responseGet.json) 13 | print(responseGet.encoding) 14 | print(responseGet.content) 15 | print("Status code: "+str(responseGet.status_code)) 16 | 17 | print("Headers response: ") 18 | for header, value in responseGet.headers.items(): 19 | print(header, '-->', value) 20 | 21 | print("Headers request : ") 22 | for header, value in responseGet.request.headers.items(): 23 | print(header, '-->', value) -------------------------------------------------------------------------------- /chapter2/images/download_images.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/download_images.png -------------------------------------------------------------------------------- /chapter2/images/download_images2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/download_images2.png -------------------------------------------------------------------------------- /chapter2/images/external_inernal_links.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/external_inernal_links.png -------------------------------------------------------------------------------- /chapter2/images/link_extractor.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/link_extractor.png -------------------------------------------------------------------------------- /chapter2/images/objects.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/objects.png -------------------------------------------------------------------------------- /chapter2/images/packt_books.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/packt_books.png -------------------------------------------------------------------------------- /chapter2/images/packtpub_links.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/packtpub_links.png -------------------------------------------------------------------------------- /chapter2/images/packtpub_links2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/packtpub_links2.png -------------------------------------------------------------------------------- /chapter2/images/packtpub_links_csv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/packtpub_links_csv.png -------------------------------------------------------------------------------- /chapter2/images/packtpub_links_deep_search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/packtpub_links_deep_search.png -------------------------------------------------------------------------------- /chapter2/images/requests_extract_links.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/requests_extract_links.png -------------------------------------------------------------------------------- /chapter2/images/requests_headers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/requests_headers.png -------------------------------------------------------------------------------- /chapter2/images/requests_post.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/requests_post.png -------------------------------------------------------------------------------- 
/chapter3/code/books_scraping/requests_bs4_initial.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import requests 3 | 4 | def processUrl(url): 5 | """ 6 | Upload and process the content of a URL using request. 7 | Show an error message if you cannot load the page 8 | """ 9 | # http request 10 | req = requests.get(url) 11 | 12 | # We verify the request returns a Status Code = 200 13 | statusCode = req.status_code 14 | if statusCode == 200: 15 | 16 | # We pass the HTML content of the web to a BeautifulSoup() object 17 | html = BeautifulSoup(req.text,"lxml") 18 | 19 | # We process the downloaded HTML 20 | return processHTML(html,url) 21 | 22 | else: 23 | print ("ERROR {}".format(statusCode)) 24 | 25 | def processHTML(html, url=""): 26 | """ 27 | Process the HTML content of a web page 28 | html is a BS4 object 29 | url is the URL of the page contained in html_doc 30 | """ 31 | # Decide here what you want to do with the content 32 | return 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /chapter3/code/books_scraping/requests_bs4_with_pages.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import requests 3 | import pandas as pd 4 | 5 | # Class names representing product ratings 6 | star = ["One", "Two", "Three", "Four", "Five"] 7 | 8 | bookList = [] 9 | url_page = "http://books.toscrape.com/catalogue/page-{}.html" 10 | url = "http://books.toscrape.com/catalogue/" 11 | 12 | def starToInt (rating): 13 | """ 14 | Convert a textual rating to a numerical rating 15 | Returns the equivalent number, or 0, if the rating is not valid 16 | """ 17 | try: 18 | return star.index(rating) + 1 19 | except: 20 | return 0 21 | 22 | 23 | def processUrl(url): 24 | """ 25 | Upload and process the content of a URL using request. 26 | Show an error message if you cannot load the page 27 | """ 28 | # http request 29 | req = requests.get(url) 30 | 31 | # We verify the request returns a Status Code = 200 32 | statusCode = req.status_code 33 | if statusCode == 200: 34 | 35 | # We pass the HTML content of the web to a BeautifulSoup () object 36 | html = BeautifulSoup(req.text,"lxml") 37 | 38 | # We process the downloaded HTML 39 | return processHTML(html,url) 40 | 41 | else: 42 | print ("ERROR {}".format(statusCode)) 43 | 44 | def processHTML(html, url=""): 45 | """ 46 | Process the HTML content of a web page 47 | html is a BS4 object 48 | url is the URL of the page contained in html_doc 49 | """ 50 | book = {} 51 | 52 | productMain = html.select_one(".product_main") 53 | 54 | # Title 55 | title = productMain.select_one("h1").text 56 | book['title'] = title 57 | 58 | # Price 59 | price = productMain.select_one("p.price_color").text 60 | book['price'] = price[2:] 61 | 62 | # Assessment 63 | # 1. Get class 64 | ratingClasses = productMain.select_one("p.star-rating")["class"] 65 | 66 | # 2. We get with the intersection 67 | ratingText = list(set(ratingClasses).intersection(set(star))) 68 | 69 | # 3. We convert it to a numerical value 70 | if (len(ratingText)==1): 71 | book['assessment'] = starToInt(ratingText[0]) 72 | else: 73 | book['assessment'] = 0 74 | 75 | # Processing the description makes us look for the sibling of an element 76 | # Product description 77 | # 1. We look for the element that takes product product description 78 | productDescription = html.find(id="product_description") 79 | 80 | # 2. 
We are looking for the next sibling with tag p 81 | if productDescription is None: 82 | book['descripcion'] = "" 83 | else: 84 | book['descripcion'] = productDescription.find_next_sibling('p').text 85 | 86 | print(book) 87 | 88 | return book 89 | 90 | 91 | def processCatalog(url, prefix): 92 | """ 93 | Returns False if we have reached the end of the catalog, True otherwise 94 | """ 95 | # We make the request to the web 96 | response = requests.get(url) 97 | 98 | # We verify that the request returns a Status Code = 200 99 | statusCode = response.status_code 100 | if statusCode == 200: 101 | 102 | # We pass the HTML content of the web to a BeautifulSoup () object 103 | html = BeautifulSoup(response.text,"lxml") 104 | 105 | # We process the downloaded HTML 106 | books = html.select('article.product_pod') 107 | for prod in books: 108 | link = prod.select_one('h3 > a') 109 | book = processUrl(prefix+link['href']) 110 | book['link'] = prefix+link['href'] 111 | bookList.append(book) 112 | return True 113 | 114 | if statusCode == 404: 115 | return False 116 | 117 | if __name__ == "__main__": 118 | 119 | processUrl("http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html") 120 | 121 | for i in range(1,5): 122 | processCatalog(url_page.format(i), url) 123 | 124 | for book in bookList: 125 | print(book) 126 | 127 | #Finally we will load all the data in a panda dataframe to process it, extract information and save it to a CSV 128 | 129 | df = pd.DataFrame(bookList) 130 | df.to_csv("bookList.csv", sep=";", index=False) 131 | 132 | 133 | 134 | -------------------------------------------------------------------------------- /chapter3/code/chromedriver.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/code/chromedriver.exe -------------------------------------------------------------------------------- /chapter3/code/dolar-euro_converter.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | import time 3 | 4 | def get_currency_values(): 5 | browser = webdriver.Chrome("chromedriver.exe") 6 | browser.get('http://www.xe.com/en/currencyconverter/convert/?Amount=1&From=USD&To=EUR') 7 | time.sleep(5) 8 | value = browser.find_element_by_xpath("//*[@id='converterResult']/div/div/div[2]/span[1]") 9 | one_dollar = value.text 10 | print('The dollar at this time has a value of: €{} EUROS'.format(one_dollar)) 11 | browser.get('http://www.xe.com/en/currencyconverter/convert/?Amount=1&From=EUR&To=USD') 12 | time.sleep(5) 13 | value = browser.find_element_by_xpath("//*[@id='converterResult']/div/div/div[2]/span[1]") 14 | one_euro = value.text 15 | print('The euro at this time has a value of: ${} dollars'.format(one_euro)) 16 | one_dollar_float = float(one_dollar) 17 | one_euro_float = float(one_euro) 18 | operate(one_dollar_float, one_euro_float) 19 | 20 | 21 | def operate(one_dollar_float, one_euro_float): 22 | 23 | while True: 24 | command = str(input('''Selet currency conversion: 25 | [1]Dollars to euros 26 | [2]Euros to dollars 27 | [e]exit''')) 28 | 29 | if command == '1': 30 | dollar_to_euro(one_dollar_float) 31 | elif command == '2': 32 | euro_to_dollar(one_euro_float) 33 | else: 34 | break 35 | 36 | def dollar_to_euro(one_dollar_float): 37 | dollar_amount = float(input('Dollars amount: ')) 38 | result = one_dollar_float * dollar_amount 39 | print('${} Dollars 
are ${} Euros'.format(dollar_amount, result)) 40 | 41 | def euro_to_dollar(one_euro_float): 42 | euros_amount = float(input('Euros amount: ')) 43 | result = one_euro_float * euros_amount 44 | print('€{} Euros are ${} Dollars'.format(euros_amount, result)) 45 | 46 | 47 | if __name__ == '__main__': 48 | get_currency_values() -------------------------------------------------------------------------------- /chapter3/code/google_translate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from bs4 import BeautifulSoup 5 | import requests 6 | import sys 7 | from selenium import webdriver 8 | import time 9 | 10 | #Example input to enter : en (= english) 11 | convert_from = input("Language to Convert from : ") 12 | 13 | #Example input to enter : es (= spanish) 14 | convert_to = input("Language to Convert to : ") 15 | 16 | text_to_convert = input("Text to translate: ") 17 | 18 | #replace spaces by + symbol 19 | text_to_convert = text_to_convert.replace(' ', '+') 20 | 21 | #call translate service 22 | url = 'https://translate.google.com/?sl=%s&tl=%s&text=%s' % (convert_from, convert_to, text_to_convert) 23 | 24 | browser = webdriver.Chrome("chromedriver.exe") 25 | browser.get(url) 26 | 27 | time.sleep(5) 28 | 29 | translation = browser.find_element_by_class_name("tlid-translation") 30 | translation2 = browser.find_element_by_xpath("/html/body/div[2]/div[1]/div[2]/div[1]/div[1]/div[2]/div[3]/div[1]/div[2]/div/span[1]/span") 31 | 32 | print("Text translated : ", translation2.text) 33 | 34 | browser.get_screenshot_as_file('google_translate.png') 35 | browser.close() 36 | -------------------------------------------------------------------------------- /chapter3/code/interacting_with_form.py: -------------------------------------------------------------------------------- 1 | from selenium.webdriver.support.ui import WebDriverWait 2 | from selenium.common.exceptions import TimeoutException 3 | from selenium import webdriver 4 | import time 5 | 6 | url = "https://websistent.com/tools/htdigest-generator-tool/" 7 | user = "myUser" 8 | 9 | driver = webdriver.Chrome('chromedriver.exe') 10 | driver.get(url) 11 | 12 | element = driver.find_element_by_id("uname") 13 | element.send_keys(user) 14 | 15 | #If we go to the browser we will see that we have completed the first input of the form. 
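# A minimal alternative sketch: the three remaining inputs below could also be
# filled from a dictionary in a single loop, reusing the same element ids and
# test values that appear further down in this script:
#
#   for field_id, value in {"realm": "myRealm", "word1": "mypassword", "word2": "mypassword"}.items():
#       driver.find_element_by_id(field_id).send_keys(value)
#
# The explicit send_keys() calls below do the same thing field by field.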
16 | #Then fill in the rest of inputs 17 | 18 | element = driver.find_element_by_id("realm") 19 | element.send_keys("myRealm") 20 | 21 | element = driver.find_element_by_id("word1") 22 | element.send_keys("mypassword") 23 | 24 | element = driver.find_element_by_id("word2") 25 | element.send_keys("mypassword") 26 | 27 | #Finally, we look for the button and click it 28 | driver.find_element_by_id("generate").click(); 29 | 30 | # We wait 2 seconds before searching for the item 31 | #time.sleep(2) 32 | 33 | try: 34 | # We wait a maximum of 10 seconds while we wait for the "Loading" text to disappear 35 | WebDriverWait(driver, 10).until_not(lambda driver: driver.find_element_by_id("output").text.startswith("Loading")) 36 | 37 | output = driver.find_element_by_id("output").text 38 | print (output[output.find(user):]) 39 | 40 | except TimeoutException: 41 | print("The realm could not be generated or the page has taken too long time to load") 42 | 43 | finally: 44 | driver.quit() -------------------------------------------------------------------------------- /chapter3/code/phantomjs/phantomjs.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/code/phantomjs/phantomjs.exe -------------------------------------------------------------------------------- /chapter3/code/phantomjs/phantomjs_example1.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | 3 | driver = webdriver.PhantomJS("phantomjs.exe") 4 | driver.get("https://protonmail.com/") 5 | print(driver.find_element_by_class_name("homepage-hero-sub-title").text) -------------------------------------------------------------------------------- /chapter3/code/phantomjs/phantomjs_example2.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from bs4 import BeautifulSoup 3 | 4 | browser = webdriver.PhantomJS("phantomjs.exe") 5 | 6 | browser.get("https://protonmail.com/") 7 | page = BeautifulSoup(browser.page_source,"lxml") 8 | images = page.findAll("img") 9 | for image in images: 10 | print(image.get('src')) 11 | browser.close() -------------------------------------------------------------------------------- /chapter3/code/phantomjs/phantomjs_example3.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.common.by import By 3 | from selenium.webdriver.support.ui import WebDriverWait 4 | from selenium.webdriver.support import expected_conditions as EC 5 | 6 | driver = webdriver.PhantomJS("phantomjs.exe") 7 | 8 | driver.get("https://httpbin.org/#/HTTP_Methods/post_post") 9 | 10 | driver.find_element_by_class_name("opblock-summary-description").click() 11 | 12 | try: 13 | element = WebDriverWait(driver, 15).until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, "btn"),"Try it out")) 14 | 15 | finally: 16 | driver.get_screenshot_as_file("image.png") 17 | 18 | driver.close() -------------------------------------------------------------------------------- /chapter3/code/scraping_book_details_requests.py: -------------------------------------------------------------------------------- 1 | from lxml import html 2 | import csv 3 | import json 4 | import requests 5 | 6 | def parse(url): 7 | headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'} 8 | response = requests.get(url, headers=headers) 9 | doc = html.fromstring(response.content) 10 | title_xpath = '//*[@id="maincontent"]/div[3]/div/div[1]/div[1]/h1/span/text()' 11 | author_xpath = '//*[@id="maincontent"]/div[3]/div/div[1]/div[2]/div[2]/text()' 12 | date_xpath = '//*[@id="maincontent"]/div[3]/div/div[1]/div[2]/div[3]/text()' 13 | pages_xpath = '//*[@id="maincontent"]/div[3]/div/div[1]/div[2]/p[1]/text()' 14 | title = doc.xpath(title_xpath)[0] 15 | author = doc.xpath(author_xpath)[0] 16 | date = doc.xpath(date_xpath)[0] 17 | pages = doc.xpath(pages_xpath)[0] 18 | 19 | title = ' '.join(''.join(title).split()) if title else None 20 | author = ' '.join(''.join(author).split()) if author else None 21 | date = ' '.join(''.join(date).split()) if date else None 22 | pages = ' '.join(''.join(pages).split()) if pages else None 23 | 24 | data = {'Title': title,'Author': author,'Date': date,'Pages': pages} 25 | print(data) 26 | 27 | return data 28 | 29 | 30 | 31 | def ScrapingBookData(): 32 | 33 | bookList = ['big-data-and-business-intelligence/machine-learning-opencv', 34 | 'big-data-and-business-intelligence/hands-generative-adversarial-networks-keras'] 35 | 36 | extracted_data = [] 37 | 38 | for i in bookList: 39 | url = "https://www.packtpub.com/" + i 40 | print("Processing: " + url) 41 | # Calling the parser 42 | parsed_data = parse(url) 43 | if parsed_data: 44 | extracted_data.append(parsed_data) 45 | #Save the collected data into a json file. 46 | file_json=open('book_data.json','w') 47 | json.dump(extracted_data,file_json,indent=4) 48 | 49 | # Writing scraped data book to csv file 50 | with open('scraped_book_data.csv', 'w') as csvfile: 51 | fieldnames = ['Title','Author','Date','Pages'] 52 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL) 53 | writer.writeheader() 54 | for data in extracted_data: 55 | writer.writerow(data) 56 | 57 | if __name__ == "__main__": 58 | ScrapingBookData() -------------------------------------------------------------------------------- /chapter3/code/selenium_list_book.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from selenium import webdriver 5 | from bs4 import BeautifulSoup 6 | import requests 7 | import pandas as pd 8 | 9 | driver = webdriver.Chrome("chromedriver.exe") 10 | 11 | driver.get('https://www.packtpub.com/gb/web-development/web-programming') 12 | content = driver.page_source 13 | 14 | soup = BeautifulSoup(content,'lxml') 15 | 16 | books=[] #List to store book titles 17 | authors=[] #List to store authors 18 | dates=[] #List to store dates 19 | 20 | 21 | for element in soup.findAll('div', attrs={'class':'card h-100'}): 22 | title = element.find('h5', attrs={'class':'card-title mt-0'}) 23 | author = element.find('div', attrs={'class':'author-names'}) 24 | meta = element.find('div', attrs={'class':'product-meta'}) 25 | if title is not None: 26 | print(title.contents[0].strip()) 27 | title_text = title.contents[0].strip() 28 | else: 29 | title_text = '' 30 | 31 | if author is not None: 32 | author_text = author.find('p').text 33 | else: 34 | author_text = '' 35 | 36 | if meta is not None: 37 | date_text = meta.findChild().text 38 | else: 39 | date_text = '' 40 | 41 | 42 | books.append(title_text) 43 | authors.append(author_text) 44 | dates.append(date_text) 45 | 46 | df = pd.DataFrame({'Book title':books,'Author':authors,'Date':dates}) 
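# Sketch of a small cleanup step: after the CSV export on the next line the
# Chrome window opened by webdriver.Chrome() at the top of this script is still
# running; calling
#
#   driver.quit()
#
# as the last statement would close the browser and release chromedriver.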
47 | df.to_csv('books.csv', index=False, encoding='utf-8') 48 | -------------------------------------------------------------------------------- /chapter3/code/stack_overflow_tags.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | 3 | driver = webdriver.Chrome("chromedriver.exe") 4 | driver.get("https://stackoverflow.com/tags") 5 | tags = driver.find_elements_by_class_name("post-tag") 6 | for i in range(len(tags)): 7 | print(tags[i].text) -------------------------------------------------------------------------------- /chapter3/images/ajax_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/images/ajax_image.png -------------------------------------------------------------------------------- /chapter3/images/book_info.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/images/book_info.png -------------------------------------------------------------------------------- /chapter3/images/book_packit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/images/book_packit.png -------------------------------------------------------------------------------- /chapter3/images/books_details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/images/books_details.png -------------------------------------------------------------------------------- /chapter3/images/books_packit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/images/books_packit.png -------------------------------------------------------------------------------- /chapter3/images/converter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/images/converter.png -------------------------------------------------------------------------------- /chapter3/images/google_translate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/images/google_translate.png -------------------------------------------------------------------------------- /chapter3/images/selenium_methods.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/images/selenium_methods.png -------------------------------------------------------------------------------- /chapter3/images/xpath.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/images/xpath.png -------------------------------------------------------------------------------- /chapter4/BooksSpider-multipage-details/books_crawler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/BooksSpider-multipage-details/books_crawler/__init__.py -------------------------------------------------------------------------------- /chapter4/BooksSpider-multipage-details/books_crawler/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class BooksCrawlerItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /chapter4/BooksSpider-multipage-details/books_crawler/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class BooksCrawlerPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /chapter4/BooksSpider-multipage-details/books_crawler/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for books_crawler project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'books_crawler' 13 | 14 | SPIDER_MODULES = ['books_crawler.spiders'] 15 | NEWSPIDER_MODULE = 'books_crawler.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'books_crawler (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'books_crawler.middlewares.MyCustomSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'books_crawler.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'books_crawler.pipelines.SomePipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- 
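The settings above leave throttling at Scrapy's defaults and turn off robots.txt checking. If the crawl against books.toscrape.com needs to be slowed down, the commented blocks can simply be enabled; a minimal, illustrative combination (the values are an example, not taken from this project) would be:

DOWNLOAD_DELAY = 0.5
CONCURRENT_REQUESTS_PER_DOMAIN = 8
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5
AUTOTHROTTLE_MAX_DELAY = 60

With AutoThrottle enabled, DOWNLOAD_DELAY acts as the minimum delay and Scrapy adapts the actual request rate to the latency of the remote server.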
/chapter4/BooksSpider-multipage-details/books_crawler/spiders/BooksSpider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy import Spider 3 | from scrapy.http import Request 4 | 5 | 6 | def product_info(response, value): 7 | return response.xpath('//th[text()="' + value + '"]/following-sibling::td/text()').extract_first() 8 | 9 | 10 | class BooksSpider(Spider): 11 | name = 'BooksSpider' 12 | allowed_domains = ['books.toscrape.com'] 13 | start_urls = ['http://books.toscrape.com'] 14 | 15 | def parse(self, response): 16 | books = response.xpath('//h3/a/@href').extract() 17 | for book in books: 18 | absolute_url = response.urljoin(book) 19 | yield Request(absolute_url, callback=self.parse_book) 20 | 21 | # process next page 22 | next_page_url = response.xpath('//a[text()="next"]/@href').extract_first() 23 | absolute_next_page_url = response.urljoin(next_page_url) 24 | yield Request(absolute_next_page_url) 25 | 26 | def parse_book(self, response): 27 | title = response.css('h1::text').extract_first() 28 | price = response.xpath('//*[@class="price_color"]/text()').extract_first() 29 | 30 | image_url = response.xpath('//img/@src').extract_first() 31 | image_url = image_url.replace('../..', 'http://books.toscrape.com/') 32 | 33 | rating = response.xpath('//*[contains(@class, "star-rating")]/@class').extract_first() 34 | rating = rating.replace('star-rating ', '') 35 | 36 | description = response.xpath( 37 | '//*[@id="product_description"]/following-sibling::p/text()').extract_first() 38 | 39 | # book information data 40 | product_type = product_info(response, 'Product Type') 41 | price_without_tax = product_info(response, 'Price (excl. tax)') 42 | price_with_tax = product_info(response, 'Price (incl. tax)') 43 | tax = product_info(response, 'Tax') 44 | availability = product_info(response, 'Availability') 45 | number_of_reviews = product_info(response, 'Number of reviews') 46 | 47 | yield { 48 | 'title': title, 49 | 'price': price, 50 | 'image_url': image_url, 51 | 'rating': rating, 52 | 'description': description, 53 | 'product_type': product_type, 54 | 'price_without_tax': price_without_tax, 55 | 'price_with_tax': price_with_tax, 56 | 'tax': tax, 57 | 'availability': availability, 58 | 'number_of_reviews': number_of_reviews 59 | } -------------------------------------------------------------------------------- /chapter4/BooksSpider-multipage-details/books_crawler/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
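# A short sketch of running the BooksSpider defined above without the command
# line (assuming it is executed from this project's root directory):
#
#   from scrapy.crawler import CrawlerProcess
#   from scrapy.utils.project import get_project_settings
#   from books_crawler.spiders.BooksSpider import BooksSpider
#
#   process = CrawlerProcess(get_project_settings())
#   process.crawl(BooksSpider)
#   process.start()  # blocks until the crawl finishes
#
# This does the same job as "scrapy crawl BooksSpider"; adding -o output.json to
# the command-line version also writes the scraped items to a JSON feed.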
5 | -------------------------------------------------------------------------------- /chapter4/BooksSpider-multipage-details/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = books_crawler.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = books_crawler 12 | -------------------------------------------------------------------------------- /chapter4/BooksSpider-urls/books_crawler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/BooksSpider-urls/books_crawler/__init__.py -------------------------------------------------------------------------------- /chapter4/BooksSpider-urls/books_crawler/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class BooksCrawlerItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /chapter4/BooksSpider-urls/books_crawler/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class BooksCrawlerPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /chapter4/BooksSpider-urls/books_crawler/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for books_crawler project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'books_crawler' 13 | 14 | SPIDER_MODULES = ['books_crawler.spiders'] 15 | NEWSPIDER_MODULE = 'books_crawler.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'books_crawler (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'books_crawler.middlewares.MyCustomSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'books_crawler.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'books_crawler.pipelines.SomePipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /chapter4/BooksSpider-urls/books_crawler/spiders/BooksSpider.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy import Spider 3 | from scrapy.http import Request 4 | 5 | 6 | class BooksSpider(Spider): 7 | name = 'BooksSpider' 8 | allowed_domains = ['books.toscrape.com'] 9 | start_urls = ['http://books.toscrape.com'] 10 | 11 | def parse(self, response): 12 | books = response.xpath('//h3/a/@href').extract() 13 | for book in books: 14 | absolute_url = response.urljoin(book) 15 | yield Request(absolute_url, callback=self.parse_book) 16 | 17 | # process next page 18 | next_page_url = response.xpath('//a[text()="next"]/@href').extract_first() 19 | absolute_next_page_url = response.urljoin(next_page_url) 20 | yield Request(absolute_next_page_url) 21 | 22 | def parse_book(self, response): 23 | yield { 'book_url': response.url} 24 | -------------------------------------------------------------------------------- /chapter4/BooksSpider-urls/books_crawler/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /chapter4/BooksSpider-urls/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = books_crawler.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = books_crawler 12 | -------------------------------------------------------------------------------- /chapter4/BooksSpider-urls_download_images/books_crawler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/BooksSpider-urls_download_images/books_crawler/__init__.py -------------------------------------------------------------------------------- /chapter4/BooksSpider-urls_download_images/books_crawler/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | class BooksCrawlerItem(scrapy.Item): 11 | title = scrapy.Field() 12 | price = scrapy.Field() 13 | 14 | image_urls = scrapy.Field() 15 | images = scrapy.Field() 16 | -------------------------------------------------------------------------------- /chapter4/BooksSpider-urls_download_images/books_crawler/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class BooksCrawlerPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /chapter4/BooksSpider-urls_download_images/books_crawler/settings.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for books_crawler project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'books_crawler' 13 | 14 | SPIDER_MODULES = ['books_crawler.spiders'] 15 | NEWSPIDER_MODULE = 'books_crawler.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'books_crawler (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'books_crawler.middlewares.MyCustomSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'books_crawler.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'scrapy.pipelines.images.ImagesPipeline': 1, 69 | } 70 | IMAGES_STORE = './images_store' 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | 
#HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | -------------------------------------------------------------------------------- /chapter4/BooksSpider-urls_download_images/books_crawler/spiders/BooksSpider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy import Spider 3 | from scrapy.http import Request 4 | from scrapy.loader import ItemLoader 5 | from books_crawler.items import BooksCrawlerItem 6 | 7 | 8 | class BooksSpider(Spider): 9 | name = 'BooksSpider' 10 | allowed_domains = ['books.toscrape.com'] 11 | start_urls = ['http://books.toscrape.com'] 12 | 13 | def parse(self, response): 14 | books = response.xpath('//h3/a/@href').extract() 15 | for book in books: 16 | absolute_url = response.urljoin(book) 17 | yield Request(absolute_url, callback=self.parse_book) 18 | 19 | # process next page 20 | next_page_url = response.xpath('//a[text()="next"]/@href').extract_first() 21 | absolute_next_page_url = response.urljoin(next_page_url) 22 | yield Request(absolute_next_page_url) 23 | 24 | def parse_book(self, response): 25 | item_loader = ItemLoader(item=BooksCrawlerItem(), response=response) 26 | 27 | title = response.css('h1::text').extract_first() 28 | price = response.xpath('//*[@class="price_color"]/text()').extract_first() 29 | 30 | image_urls = response.xpath('//img/@src').extract_first() 31 | image_urls = image_urls.replace('../..', 'http://books.toscrape.com/') 32 | 33 | item_loader.add_value('title', title) 34 | item_loader.add_value('price', price) 35 | item_loader.add_value('image_urls', image_urls) 36 | 37 | return item_loader.load_item() 38 | 39 | -------------------------------------------------------------------------------- /chapter4/BooksSpider-urls_download_images/books_crawler/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
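# Note on the image-downloading variant above: the stock
# scrapy.pipelines.images.ImagesPipeline (enabled in this project's settings.py
# together with IMAGES_STORE = './images_store', and requiring Pillow) expects
# the 'image_urls' field to be a list of absolute URLs; the ItemLoader.add_value()
# call in BooksSpider stores the single cover URL as a one-element list, so the
# pipeline can download it and record the result in the 'images' field.
# A hypothetical sketch of customising where each file ends up (not part of this
# project):
#
#   from scrapy.pipelines.images import ImagesPipeline
#
#   class BookCoverPipeline(ImagesPipeline):
#       def file_path(self, request, response=None, info=None, *, item=None):
#           # keep the original file name instead of the default SHA1-based path
#           return 'covers/' + request.url.split('/')[-1]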
5 | -------------------------------------------------------------------------------- /chapter4/BooksSpider-urls_download_images/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = books_crawler.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = books_crawler 12 | -------------------------------------------------------------------------------- /chapter4/europython/europython/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/europython/europython/__init__.py -------------------------------------------------------------------------------- /chapter4/europython/europython/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/europython/europython/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /chapter4/europython/europython/__pycache__/items.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/europython/europython/__pycache__/items.cpython-37.pyc -------------------------------------------------------------------------------- /chapter4/europython/europython/__pycache__/pipelines.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/europython/europython/__pycache__/pipelines.cpython-37.pyc -------------------------------------------------------------------------------- /chapter4/europython/europython/__pycache__/settings.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/europython/europython/__pycache__/settings.cpython-37.pyc -------------------------------------------------------------------------------- /chapter4/europython/europython/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | from scrapy.loader.processors import Compose, MapCompose, Join 10 | 11 | clean_text = Compose(MapCompose(lambda v: v.strip()), Join()) 12 | 13 | def custom_field(text): 14 | text = clean_text(text) 15 | return text.strip() 16 | 17 | class EuropythonItem(scrapy.Item): 18 | # define the fields for your item here like: 19 | # name = scrapy.Field() 20 | title = scrapy.Field(output_processor=custom_field) 21 | author = scrapy.Field(output_processor=custom_field) 22 | description = scrapy.Field(output_processor=custom_field) 23 | date = 
scrapy.Field(output_processor=custom_field) 24 | tags = scrapy.Field(output_processor=custom_field) 25 | -------------------------------------------------------------------------------- /chapter4/europython/europython/middlewares.py: -------------------------------------------------------------------------------- 1 | # Importing base64 library because we'll need it ONLY in case if the proxy we are going to use requires authentication 2 | import base64 3 | 4 | # Start your middleware class 5 | class ProxyMiddleware(object): 6 | # overwrite process request 7 | def process_request(self, request, spider): 8 | # Set the location of the proxy 9 | request.meta['proxy'] = "proxy_server" 10 | 11 | # Use the following lines if your proxy requires authentication 12 | proxy_user_pass = "user:password" 13 | # setup basic authentication for the proxy 14 | encoded_user_pass = base64.encodestring(proxy_user_pass) 15 | request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass 16 | -------------------------------------------------------------------------------- /chapter4/europython/europython/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import scrapy 9 | from scrapy import signals 10 | from scrapy.exporters import CsvItemExporter 11 | from scrapy.exporters import XmlItemExporter 12 | import codecs 13 | import json 14 | import csv 15 | 16 | class EuropythonJsonExport(object): 17 | def __init__(self): 18 | self.file = codecs.open('europython_items.json', 'w+b', encoding='utf-8') 19 | 20 | def process_item(self, item, spider): 21 | line = json.dumps(dict(item), ensure_ascii=False) + "\n" 22 | self.file.write(line) 23 | return item 24 | 25 | def spider_closed(self, spider): 26 | self.file.close() 27 | 28 | class EuropythonXmlExport(object): 29 | 30 | def __init__(self): 31 | self.files = {} 32 | 33 | @classmethod 34 | def from_crawler(cls, crawler): 35 | pipeline = cls() 36 | crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) 37 | crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) 38 | return pipeline 39 | 40 | def spider_opened(self, spider): 41 | file = open('europython_items.xml', 'w+b') 42 | self.files[spider] = file 43 | self.exporter = XmlItemExporter(file) 44 | self.exporter.start_exporting() 45 | 46 | def spider_closed(self, spider): 47 | self.exporter.finish_exporting() 48 | file = self.files.pop(spider) 49 | file.close() 50 | 51 | def process_item(self, item, spider): 52 | self.exporter.export_item(item) 53 | return item 54 | 55 | class EuropythonCSVExport(object): 56 | 57 | def __init__(self): 58 | self.files = {} 59 | 60 | @classmethod 61 | def from_crawler(cls, crawler): 62 | pipeline = cls() 63 | crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) 64 | crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) 65 | return pipeline 66 | 67 | def spider_opened(self, spider): 68 | file = open('europython_items.csv', 'w+b') 69 | self.files[spider] = file 70 | self.exporter = CsvItemExporter(file) 71 | self.exporter.start_exporting() 72 | 73 | def spider_closed(self, spider): 74 | self.exporter.finish_exporting() 75 | file = self.files.pop(spider) 76 | file.close() 77 | 78 | def process_item(self, item, spider): 79 | self.exporter.export_item(item) 80 | 
return item 81 | -------------------------------------------------------------------------------- /chapter4/europython/europython/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for europython project 4 | # 5 | # For simplicity, this file contains only the most important settings by 6 | # default. All the other settings are documented here: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # 10 | 11 | BOT_NAME = 'europython' 12 | 13 | SPIDER_MODULES = ['europython.spiders'] 14 | NEWSPIDER_MODULE = 'europython.spiders' 15 | 16 | 17 | # Configure item pipelines 18 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 19 | ITEM_PIPELINES = { 20 | 'europython.pipelines.EuropythonJsonExport': 100, 21 | 'europython.pipelines.EuropythonXmlExport': 200, 22 | 'europython.pipelines.EuropythonCSVExport': 300, 23 | } 24 | 25 | DOWNLOADER_MIDDLEWARES = { 26 | 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110, 27 | #'europython.middlewares.ProxyMiddleware': 100, 28 | } 29 | 30 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 31 | #USER_AGENT = 'europython (+http://www.yourdomain.com)' 32 | -------------------------------------------------------------------------------- /chapter4/europython/europython/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /chapter4/europython/europython/spiders/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/europython/europython/spiders/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /chapter4/europython/europython/spiders/__pycache__/europython_spider.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/europython/europython/spiders/__pycache__/europython_spider.cpython-37.pyc -------------------------------------------------------------------------------- /chapter4/europython/europython/spiders/europython_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.spiders import CrawlSpider, Rule 4 | from scrapy.linkextractors import LinkExtractor 5 | from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor 6 | from scrapy.loader import ItemLoader 7 | 8 | from europython.items import EuropythonItem 9 | 10 | 11 | class EuropythonSpider(CrawlSpider): 12 | def __init__(self, year='', *args, **kwargs): 13 | super(EuropythonSpider, self).__init__(*args, **kwargs) 14 | self.year = year 15 | self.start_urls = ['http://ep'+str(self.year)+".europython.eu/en/events/sessions"] 16 | print('start url: '+str(self.start_urls[0])) 17 | 18 | name = "europython_spider" 19 | allowed_domains = ["ep2015.europython.eu","ep2016.europython.eu", 
"ep2017.europython.eu","ep2018.europython.eu","ep2019.europython.eu"] 20 | 21 | # Pattern for entries that match the conference/talks and /talks format 22 | rules = [Rule(LxmlLinkExtractor(allow=['conference/talks']),callback='process_response'), 23 | Rule(LxmlLinkExtractor(allow=['talks']),callback='process_response_europython2019')] 24 | 25 | def process_response(self, response): 26 | itemLoader = ItemLoader(item=EuropythonItem(), response=response) 27 | itemLoader.add_xpath('title', "//div[contains(@class, 'grid-100')]//h1/text()") 28 | itemLoader.add_xpath('author', "//div[contains(@class, 'talk-speakers')]//a[1]/text()") 29 | itemLoader.add_xpath('description', "//div[contains(@class, 'cms')]//p//text()") 30 | itemLoader.add_xpath('date', "//section[contains(@class, 'talk when')]/strong/text()") 31 | itemLoader.add_xpath('tags', "//div[contains(@class, 'all-tags')]/span/text()") 32 | item = itemLoader.load_item() 33 | return item 34 | 35 | def process_response_europython2019(self, response): 36 | item = EuropythonItem() 37 | print(response) 38 | item['title'] = response.xpath("//*[@id='talk_page']/div/div/div[1]/h1/text()").extract() 39 | item['author'] = response.xpath("//*[@id='talk_page']/div/div/div[1]/h5/a/text()").extract() 40 | item['description'] = response.xpath("//*[@id='talk_page']/div/div/div[1]/p[3]/text()").extract() 41 | item['date'] = "July 2019" 42 | item['tags'] = response.xpath("//span[contains(@class, 'badge badge-secondary')]/text()").extract() 43 | 44 | return item -------------------------------------------------------------------------------- /chapter4/europython/scrapinghub.yml: -------------------------------------------------------------------------------- 1 | project: 366126 2 | -------------------------------------------------------------------------------- /chapter4/europython/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = europython.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = europython 12 | 13 | 14 | -------------------------------------------------------------------------------- /chapter4/europython/setup.py: -------------------------------------------------------------------------------- 1 | # Automatically created by: shub deploy 2 | 3 | from setuptools import setup, find_packages 4 | 5 | setup( 6 | name = 'project', 7 | version = '1.0', 8 | packages = find_packages(), 9 | entry_points = {'scrapy': ['settings = europython.settings']}, 10 | ) 11 | -------------------------------------------------------------------------------- /chapter4/images/book_details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/book_details.png -------------------------------------------------------------------------------- /chapter4/images/books_images.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/books_images.png -------------------------------------------------------------------------------- /chapter4/images/books_images_output.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/books_images_output.png -------------------------------------------------------------------------------- /chapter4/images/europython_talk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/europython_talk.png -------------------------------------------------------------------------------- /chapter4/images/next_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/next_page.png -------------------------------------------------------------------------------- /chapter4/images/scrapy_books.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/scrapy_books.png -------------------------------------------------------------------------------- /chapter4/images/scrapy_books_links.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/scrapy_books_links.png -------------------------------------------------------------------------------- /chapter4/images/scrapy_options.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/scrapy_options.png -------------------------------------------------------------------------------- /chapter4/images/scrapy_project.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/scrapy_project.png -------------------------------------------------------------------------------- /chapter4/images/scrapy_shell.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/scrapy_shell.png -------------------------------------------------------------------------------- /chapter4/images/scrapy_shell2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/scrapy_shell2.png -------------------------------------------------------------------------------- /chapter4/output.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"URL": "http://books.toscrape.com/index.html", "image_link": ["media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg", "media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg", "media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg", 
"media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg", "media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg", "media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.jpg", "media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg", "media/cache/3d/54/3d54940e57e662c4dd1f3ff00c78cc64.jpg", "media/cache/66/88/66883b91f6804b2323c8369331cb7dd1.jpg", "media/cache/58/46/5846057e28022268153beff6d352b06c.jpg", "media/cache/be/f4/bef44da28c98f905a3ebec0b87be8530.jpg", "media/cache/10/48/1048f63d3b5061cd2f424d20b3f9b666.jpg", "media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg", "media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg", "media/cache/81/c4/81c4a973364e17d01f217e1188253d5e.jpg", "media/cache/54/60/54607fe8945897cdcced0044103b10b6.jpg", "media/cache/55/33/553310a7162dfbc2c6d19a84da0df9e1.jpg", "media/cache/09/a3/09a3aef48557576e1a85ba7efea8ecb7.jpg", "media/cache/0b/bc/0bbcd0a6f4bcd81ccb1049a52736406e.jpg", "media/cache/27/a5/27a53d0bb95bdd88288eaf66c9230d7e.jpg"]}, 3 | {"URL": "http://books.toscrape.com/catalogue/category/books/travel_2/index.html", "image_link": ["../../../../media/cache/27/a5/27a53d0bb95bdd88288eaf66c9230d7e.jpg", "../../../../media/cache/57/77/57770cac1628f4407636635f4b85e88c.jpg", "../../../../media/cache/9a/7e/9a7e63f12829df4b43b31d110bf3dc2e.jpg", "../../../../media/cache/d5/bf/d5bf0090470b0b8ea46d9c166f7895aa.jpg", "../../../../media/cache/98/c2/98c2e95c5fd1a4e7cd5f2b63c52826cb.jpg", "../../../../media/cache/4e/15/4e15150388702ebca2c5a523ac270539.jpg", "../../../../media/cache/76/de/76de41867f323d7f1f4fbe2fdfc1b2ba.jpg", "../../../../media/cache/db/46/db46159b05faa5d95262112bf9c29ddd.jpg", "../../../../media/cache/e0/4f/e04f8eda2a2fa947aec17640202d9ab0.jpg", "../../../../media/cache/06/81/0681530a7bc301caf5c3257e1b0f0750.jpg", "../../../../media/cache/d7/0f/d70f7edd92705c45a82118c3ff6c299d.jpg"]}, 4 | {"URL": "http://books.toscrape.com/catalogue/category/books/classics_6/index.html", "image_link": ["../../../../media/cache/c5/46/c5465a06182ed6ebfa40d049258a2f58.jpg", "../../../../media/cache/4a/1b/4a1b6e9c1af75db0dc34ae63344f6883.jpg", "../../../../media/cache/45/bb/45bb59d19eb3aa868293d44809078418.jpg", "../../../../media/cache/1f/b0/1fb03cdabe6001c8a2620f65e025cbd5.jpg", "../../../../media/cache/81/f5/81f559ebe403317226fa8b611e35ce8a.jpg", "../../../../media/cache/27/82/2782701b5c877cb063065b9fc14c5b13.jpg", "../../../../media/cache/e3/c4/e3c4aba2409bb769a6488805e3fc4709.jpg", "../../../../media/cache/10/db/10db56354b4550d92270c6f097d9bebc.jpg", "../../../../media/cache/93/4e/934e966c1ddf559d3ac2b5c1407aaf1e.jpg", "../../../../media/cache/a6/72/a67245346daa38c2b23a4fc64c6e7115.jpg", "../../../../media/cache/42/c4/42c48f11b7e70a0f76c5ba9cb5c5018a.jpg", "../../../../media/cache/dd/6e/dd6e7b84e99f3b4b5655ea0db74af2b4.jpg", "../../../../media/cache/21/bf/21bf2eb0bff3134837def8bd40845ba0.jpg", "../../../../media/cache/ab/16/ab16eb035cc58809a73c4699477de9cb.jpg", "../../../../media/cache/c0/78/c078355608dd81c7c5e4f5e1c5f73d23.jpg", "../../../../media/cache/7d/53/7d53e2264b9647ee307259be9f73585d.jpg", "../../../../media/cache/0f/ca/0fca4597765ffacdb7bd529fc5eb88fa.jpg", "../../../../media/cache/09/63/09638baaef52f03827c215029c632a13.jpg", "../../../../media/cache/96/ee/96ee77d71a31b7694dac6855f6affe4e.jpg"]}, 5 | {"URL": "http://books.toscrape.com/catalogue/category/books/philosophy_7/index.html", "image_link": ["../../../../media/cache/65/71/6571919836ec51ed54f0050c31d8a0cd.jpg", "../../../../media/cache/71/df/71df730cf38c232ee58a2e407135f055.jpg", 
"../../../../media/cache/ea/04/ea0476a6f4c318ceccf5e2f2b39f2b15.jpg", "../../../../media/cache/3f/ef/3fef12d9da503693af12997c0ea0897f.jpg", "../../../../media/cache/05/ce/05ce699eaf78c0fae20308497c4f496a.jpg", "../../../../media/cache/de/76/de76d5c473c358bd41c03cf710692bfb.jpg", "../../../../media/cache/12/6e/126ef8f6473b81808ebbb9cff155e883.jpg", "../../../../media/cache/91/e6/91e6190dcdd7d6cdeb94a82b60917ec4.jpg", "../../../../media/cache/f0/aa/f0aa9ae0319b1d6e0706e6053020e696.jpg", "../../../../media/cache/df/c9/dfc9ed72e963572d23233b3a8cb01676.jpg", "../../../../media/cache/ab/45/ab45f300aa15066ad1260d6f1398d03e.jpg"]}, 6 | {"URL": "http://books.toscrape.com/catalogue/category/books/sequential-art_5/index.html", "image_link": ["../../../../media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg", "../../../../media/cache/36/df/36df4caaf1420b1183a8235355d39e69.jpg", "../../../../media/cache/c4/dd/c4ddd9ced89966b0602ec85e00cd5b61.jpg", "../../../../media/cache/f4/79/f479de5f305c2ac0512702cf7155bb74.jpg", "../../../../media/cache/e1/ea/e1ea6cb36e62ae6dc7b805f68ab9a700.jpg", "../../../../media/cache/f3/ef/f3efd43ae0fa85d9b325d5e8783e7af5.jpg", "../../../../media/cache/78/0b/780b2c28122750c2c383846155815bf7.jpg", "../../../../media/cache/c8/2f/c82f629a31b3f47bdb17ac14aa51076d.jpg", "../../../../media/cache/01/72/01726c619a05114dca75bd840095016d.jpg", "../../../../media/cache/cb/00/cb004189f548d75ad430d3ed19e6daa9.jpg", "../../../../media/cache/03/88/03886a8502ca54dbce0d91c2568ab69d.jpg", "../../../../media/cache/d3/15/d3158e8d3546fb90cced3c1d44a92a34.jpg", "../../../../media/cache/7e/a0/7ea062007ef00107e3c16d336b41fab2.jpg", "../../../../media/cache/5f/b1/5fb1bf88dcfda795606745ce35be5975.jpg", "../../../../media/cache/aa/74/aa74004807e97a79aa084b5db329a99b.jpg", "../../../../media/cache/16/d4/16d443437126bf6d536a89312c1995a5.jpg", "../../../../media/cache/90/6f/906f0168b0e155a7077625499b1737b5.jpg", "../../../../media/cache/78/97/7897eea91c4a85aca58d925861d4afec.jpg", "../../../../media/cache/f6/88/f688a9d6a89fdf38e4e88439ee9eda69.jpg", "../../../../media/cache/dd/c9/ddc95df6754df8e71bf969c088056188.jpg"]}, 7 | {"URL": "http://books.toscrape.com/catalogue/category/books/womens-fiction_9/index.html", "image_link": ["../../../../media/cache/5f/72/5f72c8a0d5a7292e2929a354ec8a022f.jpg", "../../../../media/cache/16/e3/16e3ca741956485119251e7442a67e2e.jpg", "../../../../media/cache/ae/ac/aeac003461b89c7ef826251d940b2afc.jpg", "../../../../media/cache/bb/ee/bbeeab4c4ce572c0e9764e3a96c6d4a5.jpg", "../../../../media/cache/39/e3/39e33ebef2d7a35dd6899541eba8306d.jpg", "../../../../media/cache/27/b7/27b7f4ec590965b5acc15dc4b1376684.jpg", "../../../../media/cache/ac/ba/acba5e4e1813b8c1fff4890f1efef3ab.jpg", "../../../../media/cache/72/73/7273ff1bfe3b0a6aab7f54ddf9be7b44.jpg", "../../../../media/cache/a0/fa/a0fa38039f6a674a7c89dfe2be866259.jpg", "../../../../media/cache/13/8f/138f4cf84be250d08e1f5c1db3643dbc.jpg", "../../../../media/cache/63/5f/635fb981e464f7427787824b20a15e71.jpg", "../../../../media/cache/87/d3/87d34d376555dd0cb75030d1059cc144.jpg", "../../../../media/cache/6d/6d/6d6d5799190b4f9ef89f3bbc8b67d60d.jpg", "../../../../media/cache/72/f5/72f5ed312bc82afa386c9cd48d4e36dd.jpg", "../../../../media/cache/b2/df/b2df826432771838819db89c20e20609.jpg", "../../../../media/cache/db/34/db341aa83daa76cd9f9bd2c86ccb5dba.jpg", "../../../../media/cache/0c/32/0c329cbd2adf4e0dc825f892106673b2.jpg"]}, 8 | {"URL": "http://books.toscrape.com/catalogue/category/books/mystery_3/index.html", "image_link": 
["../../../../media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg", "../../../../media/cache/23/85/238570a1c284e730dbc737a7e631ae2b.jpg", "../../../../media/cache/89/b8/89b850edb01851a91f64ba114b96acb6.jpg", "../../../../media/cache/11/aa/11aaad48b5f15e262456ca65294084da.jpg", "../../../../media/cache/29/fe/29fe70b1b2e5a9ba61d4bd331255e19e.jpg", "../../../../media/cache/37/f1/37f118b4a56d866e1e8b563759d6966c.jpg", "../../../../media/cache/44/9e/449ed681142bc336646abee754e96639.jpg", "../../../../media/cache/3c/91/3c91d97266bd6dda322089695fb46daf.jpg", "../../../../media/cache/e8/c0/e8c0ba15066bab950ae161fd60949b9a.jpg", "../../../../media/cache/8f/a4/8fa41d6caa10e427356b8a590eb4d96b.jpg", "../../../../media/cache/23/52/2352718971d5e166fa9541a5a7d716fa.jpg", "../../../../media/cache/c3/8d/c38d65cd155b67ca025f0655bd1bb095.jpg", "../../../../media/cache/8b/bc/8bbc5ab4c3784b4d9b93eb0fd1fb6fd6.jpg", "../../../../media/cache/57/07/5707c3d5d4fd44d943d51730ba7d429a.jpg", "../../../../media/cache/d5/81/d58157866ea8f015a8e4c55b23b8c96f.jpg", "../../../../media/cache/fd/71/fd71fb07247bf911505a351c0670c6dc.jpg", "../../../../media/cache/90/0b/900bd2e60d56b6480a4e8eb2dddb46d6.jpg", "../../../../media/cache/c7/ab/c7abb5e32bd37118a87523dcee0a70a6.jpg", "../../../../media/cache/95/d7/95d7541679fcbd579b8a4f2b47231aaf.jpg", "../../../../media/cache/57/31/5731a5d46c2c1e88977eb5e6d1337a2e.jpg"]}, 9 | {"URL": "http://books.toscrape.com/catalogue/category/books/historical-fiction_4/index.html", "image_link": ["../../../../media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg", "../../../../media/cache/d6/58/d658a1485b130ff26ca5fb0d5975ed2e.jpg", "../../../../media/cache/82/96/8296f92b70fb1dafefecda92c1d51941.jpg", "../../../../media/cache/0d/cb/0dcb33d60b0e79adf8ab9842e697ea2e.jpg", "../../../../media/cache/0e/fe/0efe86960cdff718aed01a5c3f65b1c3.jpg", "../../../../media/cache/0f/c2/0fc21ec3489cb23116778ee84f425eca.jpg", "../../../../media/cache/96/41/964194a317f8ce5ed031bf4c9ceb43ab.jpg", "../../../../media/cache/7a/22/7a224a6e174af91950e9b124afe54e0e.jpg", "../../../../media/cache/16/57/16575316618bd7e922d5b0e0f87de2ca.jpg", "../../../../media/cache/6c/2e/6c2e764e3ea89859b52df8de4f12af7a.jpg", "../../../../media/cache/fc/80/fc80b999ff4b8ef24b7071f62d2bf6d1.jpg", "../../../../media/cache/62/fa/62fa1e72f06f05762db5d9cedf654153.jpg", "../../../../media/cache/be/7c/be7ce6fbc9a8e1a5a5b5c32e73cfd78a.jpg", "../../../../media/cache/6b/82/6b822681c4035131560d40dd3b5a6a2e.jpg", "../../../../media/cache/b7/ad/b7ad37d93d8401c84d7325aa645ff6d5.jpg", "../../../../media/cache/b5/d8/b5d813da01f2ccd7bcfe34e2b875e752.jpg", "../../../../media/cache/b2/8f/b28f211e50e74445ca071d4279d1080d.jpg", "../../../../media/cache/bf/fd/bffd473ab232c5f35e8c81bb927f1624.jpg", "../../../../media/cache/18/f7/18f7bf6366cd7a8b947fd790d808047b.jpg", "../../../../media/cache/bf/7a/bf7a5bc1d1ebac5e9b6fbb147828a123.jpg"]}, 10 | {"URL": "http://books.toscrape.com/catalogue/category/books_1/index.html", "image_link": ["../../../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg", "../../../media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg", "../../../media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg", "../../../media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg", "../../../media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg", "../../../media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.jpg", "../../../media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg", "../../../media/cache/3d/54/3d54940e57e662c4dd1f3ff00c78cc64.jpg", 
"../../../media/cache/66/88/66883b91f6804b2323c8369331cb7dd1.jpg", "../../../media/cache/58/46/5846057e28022268153beff6d352b06c.jpg", "../../../media/cache/be/f4/bef44da28c98f905a3ebec0b87be8530.jpg", "../../../media/cache/10/48/1048f63d3b5061cd2f424d20b3f9b666.jpg", "../../../media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg", "../../../media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg", "../../../media/cache/81/c4/81c4a973364e17d01f217e1188253d5e.jpg", "../../../media/cache/54/60/54607fe8945897cdcced0044103b10b6.jpg", "../../../media/cache/55/33/553310a7162dfbc2c6d19a84da0df9e1.jpg", "../../../media/cache/09/a3/09a3aef48557576e1a85ba7efea8ecb7.jpg", "../../../media/cache/0b/bc/0bbcd0a6f4bcd81ccb1049a52736406e.jpg", "../../../media/cache/27/a5/27a53d0bb95bdd88288eaf66c9230d7e.jpg"]}, 11 | {"URL": "http://books.toscrape.com/catalogue/category/books/romance_8/index.html", "image_link": ["../../../../media/cache/9c/2e/9c2e0eb8866b8e3f3b768994fd3d1c1a.jpg", "../../../../media/cache/44/cc/44ccc99c8f82c33d4f9d2afa4ef25787.jpg", "../../../../media/cache/1e/bb/1ebbbc3e2d3249b111033cfc40763b0b.jpg", "../../../../media/cache/c4/d1/c4d1517cc9370e292366b6132ca9ca36.jpg", "../../../../media/cache/cc/bd/ccbdae9e29b3594301528fa2c876ec29.jpg", "../../../../media/cache/28/99/28992d89f4abf54fba183fc8d074adf3.jpg", "../../../../media/cache/e9/f4/e9f4bc8cf5ffaea1504623c936e90a48.jpg", "../../../../media/cache/59/10/5910fbd8a95e8e9de9c660b71e0694e2.jpg", "../../../../media/cache/e9/25/e9250495a525eb203652ad9da85ccb8e.jpg", "../../../../media/cache/7e/67/7e67addd80caaf8a9f9e9daa9cf66bb2.jpg", "../../../../media/cache/0b/89/0b89c3b317d0f89da48356a0b5959c1e.jpg", "../../../../media/cache/ae/90/ae903f6f6d059954be4e85497dd76bf5.jpg", "../../../../media/cache/a6/4b/a64b3c559f59748bfdbbe75be3e16075.jpg", "../../../../media/cache/1d/78/1d78fe226e1adb9cb591fa21f8a9bf68.jpg", "../../../../media/cache/f0/e0/f0e0db3edcb14293a52b51929cc72979.jpg", "../../../../media/cache/8e/40/8e408552c2e7ee81cd60c03c79f604af.jpg", "../../../../media/cache/f7/a9/f7a90a63f66ac92cc280def001970ed2.jpg", "../../../../media/cache/40/16/4016ffba678f309171d8130135f6eb8e.jpg", "../../../../media/cache/3c/a2/3ca2e61181fc1122658af8f85354bae8.jpg", "../../../../media/cache/57/47/57472d9c6d483bee9c38c90bfa10b3ee.jpg"]}, 12 | {"URL": "http://books.toscrape.com/catalogue/category/books/health_47/index.html", "image_link": ["../../../../media/cache/ee/3e/ee3e219d23e73ba71c79b700f183aaed.jpg", "../../../../media/cache/62/3f/623f8e7f7432ce744f4318aae8166ce4.jpg", "../../../../media/cache/23/c2/23c2108ae81327c7f3fb0721976cba5e.jpg", "../../../../media/cache/4b/d4/4bd43108fb070ad8ebba9cdb00b14069.jpg"]}, 13 | {"URL": "http://books.toscrape.com/catalogue/category/books/novels_46/index.html", "image_link": ["../../../../media/cache/db/cc/dbcc9d63b73ce9058d53f36465dbe2b2.jpg"]}, 14 | {"URL": "http://books.toscrape.com/catalogue/category/books/short-stories_45/index.html", "image_link": ["../../../../media/cache/f4/cb/f4cb1f9c7280bf1fd05fe33d2816080f.jpg"]}, 15 | {"URL": "http://books.toscrape.com/catalogue/category/books/suspense_44/index.html", "image_link": ["../../../../media/cache/bb/1c/bb1c91883579f1f99fe6ebf13b92c1c1.jpg"]}, 16 | {"URL": "http://books.toscrape.com/catalogue/page-2.html", "image_link": ["../media/cache/5d/72/5d72709c6a7a9584a4d1cf07648bfce1.jpg", "../media/cache/5c/c8/5cc8e107246cb478960d4f0aba1e1c8e.jpg", "../media/cache/9f/59/9f59f01fa916a7bb8f0b28a4012179a4.jpg", "../media/cache/9c/2e/9c2e0eb8866b8e3f3b768994fd3d1c1a.jpg", 
"../media/cache/44/cc/44ccc99c8f82c33d4f9d2afa4ef25787.jpg", "../media/cache/af/6e/af6e796160fe63e0cf19d44395c7ddf2.jpg", "../media/cache/ef/0b/ef0bed08de4e083dba5e20fdb98d9c36.jpg", "../media/cache/d6/da/d6da0371958068bbaf39ea9c174275cd.jpg", "../media/cache/2e/98/2e98c332bf8563b584784971541c4445.jpg", "../media/cache/a5/41/a5416b9646aaa7287baa287ec2590270.jpg", "../media/cache/0f/7e/0f7ee69495c0df1d35723f012624a9f8.jpg", "../media/cache/38/c5/38c56fba316c07305643a8065269594e.jpg", "../media/cache/5d/7e/5d7ecde8e81513eba8a64c9fe000744b.jpg", "../media/cache/cf/bb/cfbb5e62715c6d888fd07794c9bab5d6.jpg", "../media/cache/65/71/6571919836ec51ed54f0050c31d8a0cd.jpg", "../media/cache/12/53/1253c21c5ef3c6d075c5fa3f5fecee6a.jpg", "../media/cache/f5/88/f5889d038f5d8e949b494d147c2dcf54.jpg", "../media/cache/23/85/238570a1c284e730dbc737a7e631ae2b.jpg", "../media/cache/e1/5c/e15c289ba58cea38519e1281e859f0c1.jpg", "../media/cache/e9/20/e9203b733126c4a0832a1c7885dc27cf.jpg"]}, 17 | {"URL": "http://books.toscrape.com/catalogue/its-only-the-himalayas_981/index.html", "image_link": ["../../media/cache/6d/41/6d418a73cc7d4ecfd75ca11d854041db.jpg", "../../media/cache/0b/bc/0bbcd0a6f4bcd81ccb1049a52736406e.jpg", "../../media/cache/09/a3/09a3aef48557576e1a85ba7efea8ecb7.jpg", "../../media/cache/55/33/553310a7162dfbc2c6d19a84da0df9e1.jpg", "../../media/cache/54/60/54607fe8945897cdcced0044103b10b6.jpg", "../../media/cache/81/c4/81c4a973364e17d01f217e1188253d5e.jpg", "../../media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg"]}, 18 | {"URL": "http://books.toscrape.com/catalogue/category/books/christian_43/index.html", "image_link": ["../../../../media/cache/cd/db/cddb3eb483ef11a088d519205b7098fb.jpg", "../../../../media/cache/03/f1/03f1e337afadba35687672b5625a9757.jpg", "../../../../media/cache/a2/f5/a2f5b5fd4421d56d37c73a7fb29f5f40.jpg"]}, 19 | {"URL": "http://books.toscrape.com/catalogue/libertarianism-for-beginners_982/index.html", "image_link": ["../../media/cache/91/a4/91a46253e165d144ef5938f2d456b88f.jpg", "../../media/cache/09/a3/09a3aef48557576e1a85ba7efea8ecb7.jpg", "../../media/cache/55/33/553310a7162dfbc2c6d19a84da0df9e1.jpg", "../../media/cache/54/60/54607fe8945897cdcced0044103b10b6.jpg", "../../media/cache/81/c4/81c4a973364e17d01f217e1188253d5e.jpg", "../../media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg", "../../media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg"]}, 20 | {"URL": "http://books.toscrape.com/catalogue/category/books/historical_42/index.html", "image_link": ["../../../../media/cache/41/c3/41c37f7f0e03ee1144dd6fa89483b5d9.jpg", "../../../../media/cache/d0/b6/d0b6d59c0662dcbd15d47add40af1ebd.jpg"]}, 21 | {"URL": "http://books.toscrape.com/catalogue/mesaerion-the-best-science-fiction-stories-1800-1849_983/index.html", "image_link": ["../../media/cache/e8/1f/e81f850db9b9622c65619c9f15748de7.jpg", "../../media/cache/55/33/553310a7162dfbc2c6d19a84da0df9e1.jpg", "../../media/cache/54/60/54607fe8945897cdcced0044103b10b6.jpg", "../../media/cache/81/c4/81c4a973364e17d01f217e1188253d5e.jpg", "../../media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg", "../../media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg", "../../media/cache/10/48/1048f63d3b5061cd2f424d20b3f9b666.jpg"]}, 22 | {"URL": "http://books.toscrape.com/catalogue/olio_984/index.html", "image_link": ["../../media/cache/b1/0e/b10eabab1e1c811a6d47969904fd5755.jpg", "../../media/cache/54/60/54607fe8945897cdcced0044103b10b6.jpg", "../../media/cache/81/c4/81c4a973364e17d01f217e1188253d5e.jpg", 
"../../media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg", "../../media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg", "../../media/cache/10/48/1048f63d3b5061cd2f424d20b3f9b666.jpg", "../../media/cache/be/f4/bef44da28c98f905a3ebec0b87be8530.jpg"]}, 23 | {"URL": "http://books.toscrape.com/catalogue/our-band-could-be-your-life-scenes-from-the-american-indie-underground-1981-1991_985/index.html", "image_link": ["../../media/cache/ad/96/ad96e9c9f1664cbcb0e9627b007fb6f9.jpg", "../../media/cache/81/c4/81c4a973364e17d01f217e1188253d5e.jpg", "../../media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg", "../../media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg", "../../media/cache/10/48/1048f63d3b5061cd2f424d20b3f9b666.jpg", "../../media/cache/be/f4/bef44da28c98f905a3ebec0b87be8530.jpg", "../../media/cache/58/46/5846057e28022268153beff6d352b06c.jpg"]}, 24 | {"URL": "http://books.toscrape.com/catalogue/rip-it-up-and-start-again_986/index.html", "image_link": ["../../media/cache/81/7f/817f5089c0e6e62738dce2931e7323d3.jpg", "../../media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg", "../../media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg", "../../media/cache/10/48/1048f63d3b5061cd2f424d20b3f9b666.jpg", "../../media/cache/be/f4/bef44da28c98f905a3ebec0b87be8530.jpg", "../../media/cache/58/46/5846057e28022268153beff6d352b06c.jpg", "../../media/cache/66/88/66883b91f6804b2323c8369331cb7dd1.jpg"]}, 25 | {"URL": "http://books.toscrape.com/catalogue/scott-pilgrims-precious-little-life-scott-pilgrim-1_987/index.html", "image_link": ["../../media/cache/97/27/97275841c81e66d53bf9313cba06f23e.jpg", "../../media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg", "../../media/cache/10/48/1048f63d3b5061cd2f424d20b3f9b666.jpg", "../../media/cache/be/f4/bef44da28c98f905a3ebec0b87be8530.jpg", "../../media/cache/58/46/5846057e28022268153beff6d352b06c.jpg", "../../media/cache/66/88/66883b91f6804b2323c8369331cb7dd1.jpg", "../../media/cache/3d/54/3d54940e57e662c4dd1f3ff00c78cc64.jpg"]}, 26 | {"URL": "http://books.toscrape.com/catalogue/set-me-free_988/index.html", "image_link": ["../../media/cache/b8/e9/b8e91bd2fc74c3954118999238abb4b8.jpg", "../../media/cache/10/48/1048f63d3b5061cd2f424d20b3f9b666.jpg", "../../media/cache/be/f4/bef44da28c98f905a3ebec0b87be8530.jpg", "../../media/cache/58/46/5846057e28022268153beff6d352b06c.jpg", "../../media/cache/66/88/66883b91f6804b2323c8369331cb7dd1.jpg", "../../media/cache/3d/54/3d54940e57e662c4dd1f3ff00c78cc64.jpg", "../../media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg"]}, 27 | {"URL": "http://books.toscrape.com/catalogue/shakespeares-sonnets_989/index.html", "image_link": ["../../media/cache/4d/7a/4d7a79a8be80a529b277ed5c4d8ba482.jpg", "../../media/cache/be/f4/bef44da28c98f905a3ebec0b87be8530.jpg", "../../media/cache/58/46/5846057e28022268153beff6d352b06c.jpg", "../../media/cache/66/88/66883b91f6804b2323c8369331cb7dd1.jpg", "../../media/cache/3d/54/3d54940e57e662c4dd1f3ff00c78cc64.jpg", "../../media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg", "../../media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.jpg"]}, 28 | {"URL": "http://books.toscrape.com/catalogue/starving-hearts-triangular-trade-trilogy-1_990/index.html", "image_link": ["../../media/cache/a0/7e/a07ed8f1c23f7b4baf7102722680bd30.jpg", "../../media/cache/58/46/5846057e28022268153beff6d352b06c.jpg", "../../media/cache/66/88/66883b91f6804b2323c8369331cb7dd1.jpg", "../../media/cache/3d/54/3d54940e57e662c4dd1f3ff00c78cc64.jpg", "../../media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg", 
"../../media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.jpg", "../../media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg"]}, 29 | {"URL": "http://books.toscrape.com/catalogue/the-black-maria_991/index.html", "image_link": ["../../media/cache/d1/7a/d17a3e313e52e1be5651719e4fba1d16.jpg", "../../media/cache/66/88/66883b91f6804b2323c8369331cb7dd1.jpg", "../../media/cache/3d/54/3d54940e57e662c4dd1f3ff00c78cc64.jpg", "../../media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg", "../../media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.jpg", "../../media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg", "../../media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg"]}, 30 | {"URL": "http://books.toscrape.com/catalogue/the-dirty-little-secrets-of-getting-your-dream-job_994/index.html", "image_link": ["../../media/cache/e1/1b/e11bea016d0ae1d7e2dd46fb3cb870b7.jpg", "../../media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.jpg", "../../media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg", "../../media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg", "../../media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg", "../../media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg", "../../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"]}, 31 | {"URL": "http://books.toscrape.com/catalogue/the-requiem-red_995/index.html", "image_link": ["../../media/cache/6b/07/6b07b77236b7c80f42bd90bf325e69f6.jpg", "../../media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg", "../../media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg", "../../media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg", "../../media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg", "../../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"]}, 32 | {"URL": "http://books.toscrape.com/catalogue/the-boys-in-the-boat-nine-americans-and-their-epic-quest-for-gold-at-the-1936-berlin-olympics_992/index.html", "image_link": ["../../media/cache/d1/2d/d12d26739b5369a6b5b3024e4d08f907.jpg", "../../media/cache/3d/54/3d54940e57e662c4dd1f3ff00c78cc64.jpg", "../../media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg", "../../media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.jpg", "../../media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg", "../../media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg", "../../media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg"]}, 33 | {"URL": "http://books.toscrape.com/catalogue/the-coming-woman-a-novel-based-on-the-life-of-the-infamous-feminist-victoria-woodhull_993/index.html", "image_link": ["../../media/cache/97/36/9736132a43b8e6e3989932218ef309ed.jpg", "../../media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg", "../../media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.jpg", "../../media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg", "../../media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg", "../../media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg", "../../media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg"]}, 34 | {"URL": "http://books.toscrape.com/catalogue/sapiens-a-brief-history-of-humankind_996/index.html", "image_link": ["../../media/cache/ce/5f/ce5f052c65cc963cf4422be096e915c9.jpg", "../../media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg", "../../media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg", "../../media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg", "../../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"]}, 35 | {"URL": "http://books.toscrape.com/catalogue/sharp-objects_997/index.html", "image_link": 
["../../media/cache/c0/59/c05972805aa7201171b8fc71a5b00292.jpg", "../../media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg", "../../media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg", "../../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"]}, 36 | {"URL": "http://books.toscrape.com/catalogue/soumission_998/index.html", "image_link": ["../../media/cache/ee/cf/eecfe998905e455df12064dba399c075.jpg", "../../media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg", "../../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"]}, 37 | {"URL": "http://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html", "image_link": ["../../media/cache/08/e9/08e94f3731d7d6b760dfbfbc02ca5c62.jpg", "../../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"]}, 38 | {"URL": "http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html", "image_link": ["../../media/cache/fe/72/fe72f0532301ec28892ae79a629a293c.jpg"]}, 39 | {"URL": "http://books.toscrape.com/catalogue/category/books/crime_51/index.html", "image_link": ["../../../../media/cache/f2/e5/f2e51dd2b26600459f8eaeb6b9eecaa7.jpg"]}, 40 | {"URL": "http://books.toscrape.com/catalogue/category/books/erotica_50/index.html", "image_link": ["../../../../media/cache/6e/4e/6e4e8f4f4abd94356a9be840e4681e65.jpg"]}, 41 | {"URL": "http://books.toscrape.com/catalogue/category/books/cultural_49/index.html", "image_link": ["../../../../media/cache/52/46/524655fade1d9fe1475395a3eaff827a.jpg"]}, 42 | {"URL": "http://books.toscrape.com/catalogue/category/books/politics_48/index.html", "image_link": ["../../../../media/cache/0b/bc/0bbcd0a6f4bcd81ccb1049a52736406e.jpg", "../../../../media/cache/db/1b/db1babd3c09b84da800b0e9897fe0097.jpg", "../../../../media/cache/00/11/001153d2a22d889837efac1703e10a5e.jpg"]}, 43 | {"URL": "http://books.toscrape.com/catalogue/category/books/academic_40/index.html", "image_link": ["../../../../media/cache/d9/4e/d94e6206c2decd3acd9a61b2cbac7eaf.jpg"]}, 44 | {"URL": "http://books.toscrape.com/catalogue/category/books/self-help_41/index.html", "image_link": ["../../../../media/cache/ea/9b/ea9b2cb8abbb317402e618445bade1e1.jpg", "../../../../media/cache/da/8b/da8bc9b824dd3f446ef63e438ddbfc85.jpg", "../../../../media/cache/9c/da/9cda4893c7fce0c1c8eaa34fb092aa04.jpg", "../../../../media/cache/9e/15/9e15d7add5090ff2a17bd71ac96aa55a.jpg", "../../../../media/cache/4f/08/4f08f7948770912e4e340e10caa604cb.jpg"]}, 45 | {"URL": "http://books.toscrape.com/catalogue/category/books/spirituality_39/index.html", "image_link": ["../../../../media/cache/0f/7e/0f7ee69495c0df1d35723f012624a9f8.jpg", "../../../../media/cache/96/db/96db61bb53930c560fb4c1c62b583816.jpg", "../../../../media/cache/b7/6a/b76a73640d26b09c4a6f373b09050bed.jpg", "../../../../media/cache/87/fe/87fe3f7f3f62c1b1b81890578c9cf294.jpg", "../../../../media/cache/8b/10/8b102daec94d1ea9c6fc36dd3ec1c1fe.jpg", "../../../../media/cache/83/c8/83c834b3779be4e577c37ead6d2acf65.jpg"]}, 46 | {"URL": "http://books.toscrape.com/catalogue/category/books/contemporary_38/index.html", "image_link": ["../../../../media/cache/08/04/08044269fc197645268a6197c57e6173.jpg", "../../../../media/cache/e3/d0/e3d05227f3fc24f0e0c84ccebe108fb0.jpg", "../../../../media/cache/4d/18/4d1891e435c6692c864331c585e0d014.jpg"]}, 47 | {"URL": "http://books.toscrape.com/catalogue/category/books/thriller_37/index.html", "image_link": ["../../../../media/cache/5d/72/5d72709c6a7a9584a4d1cf07648bfce1.jpg", "../../../../media/cache/5d/7e/5d7ecde8e81513eba8a64c9fe000744b.jpg", 
"../../../../media/cache/e1/5c/e15c289ba58cea38519e1281e859f0c1.jpg", "../../../../media/cache/d6/97/d697268540fa982f4dce39f61ed3a342.jpg", "../../../../media/cache/76/de/76deee06ffe45e646c0113af01f4f401.jpg", "../../../../media/cache/d9/1a/d91aae72af6c1cb2c63163acabe7895c.jpg", "../../../../media/cache/8b/7c/8b7c73e075cc687b6890dc0dca9fcbcc.jpg", "../../../../media/cache/eb/e9/ebe9f06ccebf83d9853a846052b58fff.jpg", "../../../../media/cache/ee/d4/eed4d5d63d13f0aa86575c90f8ccacb7.jpg", "../../../../media/cache/87/54/8754267f27581996f93e8d94d3c04bf9.jpg", "../../../../media/cache/2a/a8/2aa8afd15f97617ab75f616766161cda.jpg"]}, 48 | {"URL": "http://books.toscrape.com/catalogue/category/books/biography_36/index.html", "image_link": ["../../../../media/cache/6f/d9/6fd92e5143cbd5bb8bcf034e5f007dde.jpg", "../../../../media/cache/8b/c4/8bc43a6b42d0283ab4bf611f1b497126.jpg", "../../../../media/cache/cc/a4/cca4e6a4cd5c207e7ce7d992ff464c3b.jpg", "../../../../media/cache/25/f8/25f869fa75340fca0fc2a68e8a0412a1.jpg", "../../../../media/cache/ff/d4/ffd45d95f314555e20c923d3522adea7.jpg"]}, 49 | {"URL": "http://books.toscrape.com/catalogue/category/books/business_35/index.html", "image_link": ["../../../../media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg", "../../../../media/cache/d0/77/d077a30042df6b916bfc8d257345c69e.jpg", "../../../../media/cache/82/93/82939ca78da0b724f16ec814849514fd.jpg", "../../../../media/cache/19/aa/19aa1184a3565b1dae6092146018e109.jpg", "../../../../media/cache/e2/2e/e22e4a82d97f9f0689d5295a98f5dcff.jpg", "../../../../media/cache/2d/fd/2dfdc52bcdbd82dee50372bc46c83e15.jpg", "../../../../media/cache/b3/7b/b37be83183f1dcb759d92bda8f8998a4.jpg", "../../../../media/cache/aa/67/aa677a97ecdcbbde7471f1c90ed0cf6f.jpg", "../../../../media/cache/11/2c/112c55a6bcd401c3bd603f5ddb2e6b82.jpg", "../../../../media/cache/18/f4/18f45d31e3892fee589e23f15d759ee3.jpg", "../../../../media/cache/39/f1/39f167dff90d7f84f5c8dc5e05d4051b.jpg", "../../../../media/cache/54/10/5410a58193e2373c04b3021ade78a82b.jpg"]}, 50 | {"URL": "http://books.toscrape.com/catalogue/category/books/christian-fiction_34/index.html", "image_link": ["../../../../media/cache/21/21/2121ba78e26194d92c334fde3850f840.jpg", "../../../../media/cache/fa/f6/faf6d69a42f477e1da80a71f05a4dc25.jpg", "../../../../media/cache/93/e0/93e0ec623673a8f83598c9aa7b6c94ec.jpg", "../../../../media/cache/17/e2/17e264d978942f73b859fa1c1d2cf827.jpg", "../../../../media/cache/32/2c/322c1f6cce6d5a69a7d2321779195a0c.jpg", "../../../../media/cache/c3/d0/c3d0f2fb5cacbca64639a679b962e1b9.jpg"]}, 51 | {"URL": "http://books.toscrape.com/catalogue/category/books/food-and-drink_33/index.html", "image_link": ["../../../../media/cache/9f/59/9f59f01fa916a7bb8f0b28a4012179a4.jpg", "../../../../media/cache/b7/f4/b7f4843dbe062d44be1ffcfa16b2faa4.jpg", "../../../../media/cache/f5/65/f565af3d9dd20a1ad72a1e7c4157387d.jpg", "../../../../media/cache/10/c6/10c61093002db1fec4089d8076678624.jpg", "../../../../media/cache/98/d1/98d1c979c4bac9e147a6718946578b0f.jpg", "../../../../media/cache/61/bd/61bdfe3950643c47d70c37c4123530f3.jpg", "../../../../media/cache/0d/1f/0d1f3f934460f5a50aaa8c366641234c.jpg", "../../../../media/cache/54/89/54899b4584e941ceced511d81092c88a.jpg", "../../../../media/cache/20/f2/20f28657b49f8cb24ed2ec6448bb6df3.jpg", "../../../../media/cache/c4/dc/c4dcec6f513eaca3f0f3c748d834c46d.jpg", "../../../../media/cache/fe/67/fe67c381d6a0c4c00a7c191d16939554.jpg", "../../../../media/cache/b8/38/b838b65e0e1ac3a9b498dfb1bf004420.jpg", 
"../../../../media/cache/74/aa/74aa29b1ba4147eaf5b46671bf235861.jpg", "../../../../media/cache/76/a1/76a1516c8d9c3e620626f30840013a85.jpg", "../../../../media/cache/5a/64/5a6499d41ccaad4c4f7eeaa90e16345a.jpg", "../../../../media/cache/98/19/9819ff3a8290dc6ab8797d00de5ec554.jpg", "../../../../media/cache/ae/5c/ae5ca435fb095e374d2c2aa9f7b6f380.jpg", "../../../../media/cache/d4/53/d453cfb6c08dbf76d200ffa858bc9979.jpg", "../../../../media/cache/1d/1f/1d1fbd89f0290275b9166877663ee9f5.jpg", "../../../../media/cache/e6/b6/e6b66353f9325518994dd8b564290fd7.jpg"]}, 52 | {"URL": "http://books.toscrape.com/catalogue/category/books/history_32/index.html", "image_link": ["../../../../media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg", "../../../../media/cache/4a/3b/4a3b055f9e378a95fedbef55e7bab7ce.jpg", "../../../../media/cache/2d/4e/2d4e358712e6c9f1d3bdd78d1a16e5a8.jpg", "../../../../media/cache/64/44/6444dacdcb9edaadbbd691524622aeb8.jpg", "../../../../media/cache/97/47/974709d437b08e74649b5744471bf472.jpg", "../../../../media/cache/3d/60/3d6003fc37b842a07c2dbe28e47448e1.jpg", "../../../../media/cache/41/d5/41d5fa6a81cdbcbe6b0b15757a4c9144.jpg", "../../../../media/cache/88/75/8875f384ce9103281b7f6e86a2b8204d.jpg", "../../../../media/cache/56/cb/56cb66d73fb438d64af14dce8bd8b22b.jpg", "../../../../media/cache/11/af/11af7fbd6aec06a75fe207fae92b17e0.jpg", "../../../../media/cache/3c/f6/3cf646523ff7fb8647c500d6325cfcaf.jpg", "../../../../media/cache/e1/02/e102cefae5bb523bc67eb6b49bc18b5d.jpg", "../../../../media/cache/72/f1/72f13b8f069d3a018d2c378be5a1de20.jpg", "../../../../media/cache/f2/64/f26457d65a03b2636c4bcc7c318f7346.jpg", "../../../../media/cache/cf/18/cf187c1dc5575fcbbf49c58024146c4b.jpg", "../../../../media/cache/eb/17/eb178eceef1e9290591cabd5155571a3.jpg", "../../../../media/cache/06/c8/06c897070611b78b80a37333cbb7851c.jpg", "../../../../media/cache/43/fd/43fda1db93163d67705264dcfa98aaa5.jpg"]}, 53 | {"URL": "http://books.toscrape.com/catalogue/category/books/horror_31/index.html", "image_link": ["../../../../media/cache/da/df/dadfac66a89774b46b10225362724c83.jpg", "../../../../media/cache/a7/4b/a74b35375ce874153fd352e33bc7bac9.jpg", "../../../../media/cache/6d/10/6d10387a0175701d4ff456a0c7eee67b.jpg", "../../../../media/cache/7a/72/7a72465b21dbf998323e37b31f9a3f4a.jpg", "../../../../media/cache/55/bf/55bfc858c1cb19867e41415532ae43c6.jpg", "../../../../media/cache/02/5c/025c30a378e2a4190e84f1429e81b803.jpg", "../../../../media/cache/0b/2f/0b2f432cc27132f688fcdf29618521e0.jpg", "../../../../media/cache/30/66/3066f8bcd2e2ed6b45084355ff084a61.jpg", "../../../../media/cache/13/ff/13fffcde653948339d3427184b7bd0b5.jpg", "../../../../media/cache/c0/02/c0029d48c2588e6d2a6a31c9f96088ba.jpg", "../../../../media/cache/2d/e0/2de0eff716ca13d12cf5420e88e1a8b3.jpg", "../../../../media/cache/7c/93/7c9302e392e128881e926d19f761da33.jpg", "../../../../media/cache/f7/b7/f7b73392b12909a1e8261ef3f96c5fd1.jpg", "../../../../media/cache/ee/d3/eed3afc5e444e3da5eec34e2b0036ec7.jpg", "../../../../media/cache/3a/7c/3a7c2393061031e7911d7b533b723391.jpg", "../../../../media/cache/41/c7/41c74d82b853606fe98182c417b4669c.jpg", "../../../../media/cache/14/25/142563ccee483bc07632f9c083a68326.jpg"]}, 54 | {"URL": "http://books.toscrape.com/catalogue/category/books/humor_30/index.html", "image_link": ["../../../../media/cache/46/bd/46bdee520b8136972262fd040533772d.jpg", "../../../../media/cache/df/5d/df5d172abe87deda6d533e3e908d27d8.jpg", "../../../../media/cache/ea/7b/ea7bcac4b27a5bf6d4f8125bb7af3361.jpg", 
"../../../../media/cache/b5/a9/b5a90d1c36a96513942f006345ace3d2.jpg", "../../../../media/cache/df/14/df1418baa09e00b877be35066084c9dc.jpg", "../../../../media/cache/4c/30/4c3041def6f29659e009f61e45e492b0.jpg", "../../../../media/cache/73/36/733662595aede2dff1a5be1e76a3b936.jpg", "../../../../media/cache/e7/12/e71268a559d73826aa64151d47357a12.jpg", "../../../../media/cache/a1/03/a10370da29e4ba78c7a75a14041eae0e.jpg", "../../../../media/cache/4b/9a/4b9a2a6d4c995e12fe216f6173a582be.jpg"]}, 55 | {"URL": "http://books.toscrape.com/catalogue/category/books/adult-fiction_29/index.html", "image_link": ["../../../../media/cache/18/d8/18d8e02c75c2ef23556c9746fae57e43.jpg"]}, 56 | {"URL": "http://books.toscrape.com/catalogue/category/books/parenting_28/index.html", "image_link": ["../../../../media/cache/7d/0b/7d0bb832760e81c281d8d283ba6a2b09.jpg"]}, 57 | {"URL": "http://books.toscrape.com/catalogue/category/books/autobiography_27/index.html", "image_link": ["../../../../media/cache/0a/15/0a1567cd04a6582d333db71337b4e2a6.jpg", "../../../../media/cache/d6/e8/d6e8258cee98f80727e99f7ac5aa1b88.jpg", "../../../../media/cache/e9/72/e972f8b4abaaa6f8f449479cd9d87be3.jpg", "../../../../media/cache/17/aa/17aacb738eace89a635a4eb47a94c11d.jpg", "../../../../media/cache/66/c7/66c7a1537c8901e1e4ec217d1956bae8.jpg", "../../../../media/cache/98/9f/989fe700e9e6bdec4fc3217daa5b7df3.jpg", "../../../../media/cache/61/ba/61ba5bc1ee3d8cb3dd350120ffa3f31e.jpg", "../../../../media/cache/80/b3/80b3e38be4204b3b64cdbe8c80dcf1f9.jpg", "../../../../media/cache/7a/58/7a587c5814f33c0c54e8bfa0ef66d690.jpg"]}, 58 | {"URL": "http://books.toscrape.com/catalogue/category/books/psychology_26/index.html", "image_link": ["../../../../media/cache/a6/c8/a6c8256b123493472591c5855c7de704.jpg", "../../../../media/cache/dc/4d/dc4d070e33813a07a4e02f069e6d482f.jpg", "../../../../media/cache/ee/a9/eea9e831f8964b4dc0190c84a1f9a1f6.jpg", "../../../../media/cache/00/29/002924b764dc367dcaa3486fa4c0aa0b.jpg", "../../../../media/cache/b4/a5/b4a56663d56f1e84ee1b15bd819563cc.jpg", "../../../../media/cache/4d/a6/4da6939a6bbd895a5acdeabad46d1f9f.jpg", "../../../../media/cache/b8/44/b844a77409f1d53cbb66148820abc217.jpg"]}, 59 | {"URL": "http://books.toscrape.com/catalogue/category/books/art_25/index.html", "image_link": ["../../../../media/cache/a5/41/a5416b9646aaa7287baa287ec2590270.jpg", "../../../../media/cache/f2/ee/f2ee668cf593ff13a9560c2801e9c2a2.jpg", "../../../../media/cache/ef/80/ef80e6100214c486562a73ce76444826.jpg", "../../../../media/cache/6a/55/6a55ccd4bc2383f5fe915fbef8bd5a23.jpg", "../../../../media/cache/58/a6/58a634c3231b5380544cc330536cb5ea.jpg", "../../../../media/cache/bb/36/bb364a10868756d1c0877c928b43b533.jpg", "../../../../media/cache/99/51/99511f4da1a4a2114e2ed12e6ba17b65.jpg", "../../../../media/cache/a8/3a/a83a4d31d30dc3cb26a29899a5c3b91d.jpg"]}, 60 | {"URL": "http://books.toscrape.com/catalogue/category/books/paranormal_24/index.html", "image_link": ["../../../../media/cache/4b/97/4b972f89c11900ac0e84726d1f07bfcc.jpg"]}, 61 | {"URL": "http://books.toscrape.com/catalogue/category/books/poetry_23/index.html", "image_link": ["../../../../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg", "../../../../media/cache/58/46/5846057e28022268153beff6d352b06c.jpg", "../../../../media/cache/10/48/1048f63d3b5061cd2f424d20b3f9b666.jpg", "../../../../media/cache/55/33/553310a7162dfbc2c6d19a84da0df9e1.jpg", "../../../../media/cache/e9/20/e9203b733126c4a0832a1c7885dc27cf.jpg", 
"../../../../media/cache/72/41/72417db983862010ef0c1a25de98c7d7.jpg", "../../../../media/cache/f9/3b/f93b4a650f03a5d21f2436d7813f42c2.jpg", "../../../../media/cache/38/64/386468a8c3e6b880664bf7885bf6f726.jpg", "../../../../media/cache/25/54/2554431c797ec725eea50b3f8a83758c.jpg", "../../../../media/cache/3f/41/3f4160ada0b16e3c64cd2d0dffe781c8.jpg", "../../../../media/cache/c8/f2/c8f297fab080ddd02b3ed5c17b83af85.jpg", "../../../../media/cache/93/d5/93d5c64abfad9ed6a0cb2e26f19f1a1e.jpg", "../../../../media/cache/36/5b/365b3ab7ab72a6258873716aef6d5c1a.jpg", "../../../../media/cache/b7/29/b7293f602efb0c17e305077f8175888a.jpg", "../../../../media/cache/31/c7/31c7c5ce7b04d227aa36ecb250b9dad5.jpg", "../../../../media/cache/7e/93/7e934132cd03486649fb492fe702f704.jpg", "../../../../media/cache/9f/35/9f351ca1978128c60a3b7f85987075b3.jpg", "../../../../media/cache/8f/46/8f46bb13feb3a4440a27dfcf688fbaa6.jpg", "../../../../media/cache/df/ab/dfab1d94f9190df7c13b63a093a6d16e.jpg"]}, 62 | {"URL": "http://books.toscrape.com/catalogue/category/books/science_22/index.html", "image_link": ["../../../../media/cache/d4/8d/d48d5122a15347e9fe2b15ad354d69bf.jpg", "../../../../media/cache/26/1c/261c4eaf957ae4aacf2229b482e76dbe.jpg", "../../../../media/cache/68/ca/68caaf9ac41964d5167a3eb67c638393.jpg", "../../../../media/cache/56/97/5697f2f8f628129df01c5790985ffd9b.jpg", "../../../../media/cache/5e/7f/5e7f7d9913d4c95d33904770c518d537.jpg", "../../../../media/cache/33/4f/334fd0ebdf0c0192baf5914d199c53b5.jpg", "../../../../media/cache/da/0d/da0d13699a090516502257a4d7da623f.jpg", "../../../../media/cache/08/a9/08a957eb34f8047862e225774c3bdde2.jpg", "../../../../media/cache/83/ab/83ab65f938b24fa1a9cb47235be49b57.jpg", "../../../../media/cache/69/c8/69c83860995cde393dbe6690ec3f1d4f.jpg", "../../../../media/cache/f9/69/f969969428b505970a46272fdcea00d3.jpg", "../../../../media/cache/f8/bc/f8bcd489d33473e0819beaecccd5ebac.jpg", "../../../../media/cache/c8/63/c863c222c130a1bc8685a1242dd2523d.jpg", "../../../../media/cache/08/14/0814f26516fb72b7391d0a742b5928a2.jpg"]}, 63 | {"URL": "http://books.toscrape.com/catalogue/category/books/young-adult_21/index.html", "image_link": ["../../../../media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.jpg", "../../../../media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg", "../../../../media/cache/5d/7f/5d7f496cdf5e5962a73ecdcc1505c1d5.jpg", "../../../../media/cache/fc/72/fc72f158554b4b4164701e1dfa1153c7.jpg", "../../../../media/cache/26/95/269507c7bb35d2cec9b61a03d1c28e67.jpg", "../../../../media/cache/12/f1/12f1963957f27fa83d51f76b183ef490.jpg", "../../../../media/cache/0f/d3/0fd306891f8fd3196653022fd67d6c87.jpg", "../../../../media/cache/18/08/18086e581ad354aa65f945c2b5c51350.jpg", "../../../../media/cache/f8/54/f85417465a73e33604624205ba8306cc.jpg", "../../../../media/cache/71/76/7176317f1915fa0658bb2fe400441207.jpg", "../../../../media/cache/bb/72/bb723ad463531c602ad8bcb244253bf3.jpg", "../../../../media/cache/19/cf/19cf50aea5bf0e8f4bc016f3745b3dfe.jpg", "../../../../media/cache/18/0b/180bfe1902cb3c0eb77d7c712efa2a96.jpg", "../../../../media/cache/1d/3c/1d3c05b772ab846c111970232360d2c5.jpg", "../../../../media/cache/46/6e/466e9636819aad1126ac6cefb5313ba8.jpg", "../../../../media/cache/b2/df/b2df2ea409c5cf28538b67aff424b11f.jpg", "../../../../media/cache/ad/ac/adac97366586d261feab30bf5220756e.jpg", "../../../../media/cache/61/1a/611aba0ef5b859ba1977ef30677b0194.jpg", "../../../../media/cache/87/cd/87cd652c35e2a78535c83becae33cff2.jpg", 
"../../../../media/cache/fd/5b/fd5b14399052ab552e240ed18ab03c6d.jpg"]}, 64 | {"URL": "http://books.toscrape.com/catalogue/category/books/new-adult_20/index.html", "image_link": ["../../../../media/cache/24/e2/24e2f5c9d325c4004d8190c054da86dd.jpg", "../../../../media/cache/a5/43/a543b100a8c1861c1bf5374ca6b576fe.jpg", "../../../../media/cache/84/ac/84acb0606c96e55dc729a9d6572a08fb.jpg", "../../../../media/cache/38/f1/38f1543cd2d51c2728678f5ecc128958.jpg", "../../../../media/cache/a2/19/a2198abf12e3287f84997b35f4e1050e.jpg", "../../../../media/cache/03/ed/03ed67ea504353b91b035151d8e80db2.jpg"]}, 65 | {"URL": "http://books.toscrape.com/catalogue/category/books/fantasy_19/index.html", "image_link": ["../../../../media/cache/76/8e/768ea5924ac1ef6297c2be9959c796c2.jpg", "../../../../media/cache/43/ae/43aee83ebb31e2122a7215e413770e5c.jpg", "../../../../media/cache/b7/e8/b7e84b78be3d9bb79b71156a5e5d4e42.jpg", "../../../../media/cache/ff/e8/ffe81bf98f8386ef29e193abfb6f9c1e.jpg", "../../../../media/cache/66/25/6625e3bbb050de3e42a0c302c0d69f1f.jpg", "../../../../media/cache/06/18/061811c5845d0e13bc04b2a755f0830f.jpg", "../../../../media/cache/c0/88/c08816960890396213a423941af65b8f.jpg", "../../../../media/cache/32/d6/32d6aa560e8ddf2a4da1526b95d4c7ab.jpg", "../../../../media/cache/3e/0b/3e0b16851bec08b6cbf78d5f64af9114.jpg", "../../../../media/cache/e2/60/e260b008b7ea7970562295b7bc64b0cb.jpg", "../../../../media/cache/53/5e/535e2be0b423797c2cdc7d98882c820a.jpg", "../../../../media/cache/b4/67/b467a4f01ca6ae8464b9425a156c7c32.jpg", "../../../../media/cache/9a/33/9a333c4a06ce187c5c9d2f5969ddcac2.jpg", "../../../../media/cache/75/b9/75b99691594fde72ccb1831624cfeff6.jpg", "../../../../media/cache/8f/80/8f8074d9f035c2a0ef8595ad89f7bcc8.jpg", "../../../../media/cache/00/08/0008e65aa431ed3625ad3a4352f8e90d.jpg", "../../../../media/cache/3b/04/3b045fe0394dc192950a0ec9e3812fe4.jpg", "../../../../media/cache/d3/0d/d30dd8b6be6f9fcfd17178e8083238b6.jpg", "../../../../media/cache/27/64/27649cb5da52970f4bb2fc5234a48578.jpg", "../../../../media/cache/3e/2d/3e2d526ee062008ab1cbf54f90a5abb2.jpg"]}, 66 | {"URL": "http://books.toscrape.com/catalogue/category/books/add-a-comment_18/index.html", "image_link": ["../../../../media/cache/33/e5/33e507172541628acfd421503196b578.jpg", "../../../../media/cache/f8/6d/f86d08178e3788563ac17be5aefd29f0.jpg", "../../../../media/cache/70/fa/70fa6c0437d9c97dbeada6bd32bf9d2c.jpg", "../../../../media/cache/a1/14/a114d70e7babf110ba42a389078e9a45.jpg", "../../../../media/cache/5f/52/5f52b1bc6d45daab2e330c744feb0359.jpg", "../../../../media/cache/ae/0c/ae0ccc307568b6d7699786411f3cbcc4.jpg", "../../../../media/cache/28/78/2878538a1039d9c4649110499a1393fb.jpg", "../../../../media/cache/72/d8/72d861617b6d3aababe6e61e8d3c1056.jpg", "../../../../media/cache/66/f7/66f79b76d6c6b64fcc8110515c454e09.jpg", "../../../../media/cache/94/ac/94ac87da7b40853013093f08356efa3b.jpg", "../../../../media/cache/8f/3f/8f3f4d67e30a8129577ccc4664998345.jpg", "../../../../media/cache/3f/e7/3fe7073a5caac81929524d2d9488f928.jpg", "../../../../media/cache/f5/58/f55886d1bf600529a35e1bd932c78ca0.jpg", "../../../../media/cache/0b/97/0b97282ed82b771ed328e05386a84adb.jpg", "../../../../media/cache/50/0e/500eeb810e940424827580574e46852c.jpg", "../../../../media/cache/9b/20/9b2076ce7414103a093ce2459d089969.jpg", "../../../../media/cache/75/20/75200336c141156746000f7055df344a.jpg", "../../../../media/cache/4d/16/4d163d43cb4aa624e599330a39abace5.jpg", "../../../../media/cache/55/33/5533595a623c3bb947c4a5171fc2df08.jpg", 
"../../../../media/cache/97/3a/973a2c3462a18fc90d3b9662d959df37.jpg"]}, 67 | {"URL": "http://books.toscrape.com/catalogue/category/books/sports-and-games_17/index.html", "image_link": ["../../../../media/cache/61/2c/612caeb0b2acb35c100629f0f52a40d7.jpg", "../../../../media/cache/7d/cf/7dcf6c3b419bf7e7e3b3b8162b177869.jpg", "../../../../media/cache/c3/a9/c3a90a5baa833a37c29c4b03a444737c.jpg", "../../../../media/cache/9b/4e/9b4ece2ab5a6335c8594c878e2f22df1.jpg", "../../../../media/cache/8d/1e/8d1e285bf672b2ea66879490cc5f6904.jpg"]}, 68 | {"URL": "http://books.toscrape.com/catalogue/category/books/science-fiction_16/index.html", "image_link": ["../../../../media/cache/09/a3/09a3aef48557576e1a85ba7efea8ecb7.jpg", "../../../../media/cache/93/63/9363f0065fbad5689f44fcf6e203eef3.jpg", "../../../../media/cache/02/37/0237b445efc18c5562355a5a2c40889c.jpg", "../../../../media/cache/10/6e/106e2fc7160712edf8e2ff996dc8cd6c.jpg", "../../../../media/cache/f0/06/f0060c756556b855184fa32f66280961.jpg", "../../../../media/cache/c0/72/c072c1ef144d571abd25fe9cc18cceba.jpg", "../../../../media/cache/51/88/518810d182843244a404f2a2a614a93b.jpg", "../../../../media/cache/8b/92/8b9267df86378b6973974ae7e1924ffe.jpg", "../../../../media/cache/b8/b2/b8b2956acc758a381beef87339c0a52f.jpg", "../../../../media/cache/51/34/513418bd1c6114f3ea1fd703278e20ef.jpg", "../../../../media/cache/ef/8b/ef8bc5adcd3bea8e8ba97be76d07a32a.jpg", "../../../../media/cache/7a/bc/7abccb865ecf9b0f676800b10c71cfd6.jpg", "../../../../media/cache/fa/65/fa653fbe3a4c69227c9b79d471cee576.jpg", "../../../../media/cache/c7/21/c721943edf481cad5ab32505e2ad3865.jpg", "../../../../media/cache/da/47/da4746e620f8ccd7cf20628d1a5e535a.jpg", "../../../../media/cache/f4/83/f4835e9f3fdd8b8107bbb39a391654f0.jpg"]}, 69 | {"URL": "http://books.toscrape.com/catalogue/category/books/default_15/index.html", "image_link": ["../../../../media/cache/3d/54/3d54940e57e662c4dd1f3ff00c78cc64.jpg", "../../../../media/cache/66/88/66883b91f6804b2323c8369331cb7dd1.jpg", "../../../../media/cache/be/f4/bef44da28c98f905a3ebec0b87be8530.jpg", "../../../../media/cache/ef/0b/ef0bed08de4e083dba5e20fdb98d9c36.jpg", "../../../../media/cache/d6/da/d6da0371958068bbaf39ea9c174275cd.jpg", "../../../../media/cache/12/53/1253c21c5ef3c6d075c5fa3f5fecee6a.jpg", "../../../../media/cache/f5/88/f5889d038f5d8e949b494d147c2dcf54.jpg", "../../../../media/cache/75/dc/75dce2f5949b407161f37f0af249b018.jpg", "../../../../media/cache/69/85/69852567cf97264a1442cbc882c84903.jpg", "../../../../media/cache/27/d2/27d20361745ec2f7be668b18a4da29da.jpg", "../../../../media/cache/78/2e/782e315667ec50759b8603527ee33dec.jpg", "../../../../media/cache/08/89/088995e862aac86c88c608d763f6390e.jpg", "../../../../media/cache/06/a6/06a6cfcf89afd1601cbba1a16cda57fb.jpg", "../../../../media/cache/8a/83/8a83b6ce350f01bab21f85e6ba539316.jpg", "../../../../media/cache/4e/0f/4e0f05ae01d8fb6bd0d3901edd06de16.jpg", "../../../../media/cache/34/f5/34f5f8e513c5f048241f5695e61b2483.jpg", "../../../../media/cache/58/9d/589d73503d9a23d224de836134fae553.jpg", "../../../../media/cache/25/6c/256c946dd0962095f66c6de3b15ab300.jpg", "../../../../media/cache/81/58/81586cd0bf8743e1f5ed80b6a0e1fabe.jpg", "../../../../media/cache/fe/b7/feb764b2afa54991cfdbbffdf501b333.jpg"]}, 70 | {"URL": "http://books.toscrape.com/catalogue/category/books/music_14/index.html", "image_link": ["../../../../media/cache/81/c4/81c4a973364e17d01f217e1188253d5e.jpg", "../../../../media/cache/54/60/54607fe8945897cdcced0044103b10b6.jpg", 
"../../../../media/cache/5c/c8/5cc8e107246cb478960d4f0aba1e1c8e.jpg", "../../../../media/cache/a2/6d/a26d8449abb3381e09126eda5f4e8151.jpg", "../../../../media/cache/06/f1/06f185c0be2ad6e2fe059464c03f1b47.jpg", "../../../../media/cache/85/42/8542841f5644a6daf433504f1e106e97.jpg", "../../../../media/cache/11/fc/11fc94453c4dc0d68543971d7843afb0.jpg", "../../../../media/cache/35/a4/35a4a7c6c76c4e82186753078e441654.jpg", "../../../../media/cache/15/de/15de75548ee9a4c6be1420ee309c03e0.jpg", "../../../../media/cache/7a/7e/7a7eb52e7075a5305522948375c1316e.jpg", "../../../../media/cache/99/97/9997eda658c2fe50e724171f9c2a2b0b.jpg", "../../../../media/cache/7e/94/7e947f3dd04f178175b85123829467a9.jpg", "../../../../media/cache/7f/b0/7fb03a053c270000667a50dd8d594843.jpg"]}, 71 | {"URL": "http://books.toscrape.com/catalogue/category/books/nonfiction_13/index.html", "image_link": ["../../../../media/cache/2e/98/2e98c332bf8563b584784971541c4445.jpg", "../../../../media/cache/38/c5/38c56fba316c07305643a8065269594e.jpg", "../../../../media/cache/cb/bd/cbbdb0222ee8a0f6ab61657412a15794.jpg", "../../../../media/cache/9c/46/9c463c7631c82401160fd3b554b8f0e1.jpg", "../../../../media/cache/41/a2/41a20f35adf0caea24f208dc01ad7681.jpg", "../../../../media/cache/03/86/038650c9e7517b4baf2a423cd8eed38f.jpg", "../../../../media/cache/95/64/95647d6a526bf54120b9445e124794e1.jpg", "../../../../media/cache/64/15/641570cd7e7aded53c7d33d78a9629f1.jpg", "../../../../media/cache/2e/23/2e236e23ad52aa74505f224f6552eda8.jpg", "../../../../media/cache/f3/4f/f34ffb24cc21c9f9f52dad4fd8f3ac21.jpg", "../../../../media/cache/97/f8/97f8debeeaaece9603267653076e760f.jpg", "../../../../media/cache/fe/ea/feeafd2ad7b3077f8e74cbb1da9e3c7d.jpg", "../../../../media/cache/64/94/6494bf61176ca73b61255909230030be.jpg", "../../../../media/cache/88/9e/889e0bac4c7c0e7178f0165b8d3b4617.jpg", "../../../../media/cache/23/b4/23b42e094c02d52b14b11a960d49610e.jpg", "../../../../media/cache/03/38/0338682e76bad3216cd4c6c28b2b625a.jpg", "../../../../media/cache/14/f3/14f3d525e2a114cd71e27201a16af188.jpg", "../../../../media/cache/13/57/1357c6aa40c9e63d2f931927fbf81f3f.jpg", "../../../../media/cache/0e/6d/0e6dc2484322c5b9e7854ced66fdf62d.jpg", "../../../../media/cache/6e/d4/6ed4991d97f60db29ec7b421e61a2cf3.jpg"]}, 72 | {"URL": "http://books.toscrape.com/catalogue/category/books/religion_12/index.html", "image_link": ["../../../../media/cache/95/30/953013d044aa313cc162dec414f3969a.jpg", "../../../../media/cache/6b/70/6b70f2cdb17d9ab7551240a88b9211fe.jpg", "../../../../media/cache/1f/db/1fdb125bcb8cee71f3404b4dc293348c.jpg", "../../../../media/cache/83/db/83dbf86eb0fed1d99de2148eac4eb064.jpg", "../../../../media/cache/71/91/7191a7d76eb6c3a18259541e2c038ae3.jpg", "../../../../media/cache/4e/69/4e69dacc99de838814d0f65c94e67f6c.jpg", "../../../../media/cache/df/ab/dfabeab158046237ddb6b713b794909f.jpg"]}, 73 | {"URL": "http://books.toscrape.com/catalogue/category/books/childrens_11/index.html", "image_link": ["../../../../media/cache/af/6e/af6e796160fe63e0cf19d44395c7ddf2.jpg", "../../../../media/cache/cf/bb/cfbb5e62715c6d888fd07794c9bab5d6.jpg", "../../../../media/cache/c4/a2/c4a2a1a026c67bcceb5a411c724d7d0c.jpg", "../../../../media/cache/26/32/2632a1e12f2c085fabbe022ae4cd6933.jpg", "../../../../media/cache/80/25/8025b80a40178f2a6dd4f99ad88e0fba.jpg", "../../../../media/cache/28/50/2850439c2ba103fb69dba9cd2dd9f0c2.jpg", "../../../../media/cache/2b/38/2b380f77723c797c0389f978afa6db58.jpg", "../../../../media/cache/bb/e2/bbe26db72b8a32117bfe4981b7cc8147.jpg", 
"../../../../media/cache/97/12/971212afa8e4ff49d92f678bc889d8b7.jpg", "../../../../media/cache/85/e7/85e75d5a9309da5807c82decf3d90263.jpg", "../../../../media/cache/27/1f/271faa1d7561473974d12803feb1f0a1.jpg", "../../../../media/cache/6c/18/6c18ea03e294bfcfe07cf531c6c5f5b3.jpg", "../../../../media/cache/4f/1e/4f1ece2500f8dbacecca42d57befca03.jpg", "../../../../media/cache/8f/66/8f66ec46e671d6fca79649c10c7c8f8a.jpg", "../../../../media/cache/1c/eb/1cebdf525ebe970a1dc3c5a8c50eae6b.jpg", "../../../../media/cache/c0/bb/c0bb6e42743b9c1aaf9b754501100a5d.jpg", "../../../../media/cache/bf/db/bfdbf9726621276fc7821d705690dbae.jpg", "../../../../media/cache/e0/90/e090748ce5a567207aed9185c97eb34b.jpg", "../../../../media/cache/21/bd/21bdf7ae21476b1debf4aa3eefe6f29d.jpg", "../../../../media/cache/ec/08/ec08efebaa33a403e54080b48c139794.jpg"]}, 74 | {"URL": "http://books.toscrape.com/catalogue/category/books/fiction_10/index.html", "image_link": ["../../../../media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg", "../../../../media/cache/9d/05/9d0533bae1578846d728a82913b95c26.jpg", "../../../../media/cache/5f/15/5f152afdbc42356ecba02f61058a7e5b.jpg", "../../../../media/cache/c4/0a/c40a64f59e7487b1a80a049f6ceb2ba5.jpg", "../../../../media/cache/dc/44/dc44f8e2aebac48ca8553814d9b021a8.jpg", "../../../../media/cache/6b/da/6bdae061cb92c32b0b83cda8dd10275d.jpg", "../../../../media/cache/37/25/372578cc073efae80cf284b56040a488.jpg", "../../../../media/cache/f8/31/f8314c7fdaa79fb7191a583e9a852db8.jpg", "../../../../media/cache/6a/81/6a81103b1c01a3f6c56e5718a838a4c8.jpg", "../../../../media/cache/8f/f8/8ff8680dde59ea739d6978a01e4d7fe5.jpg", "../../../../media/cache/83/05/8305154438c91a02cefacf4ec8b53393.jpg", "../../../../media/cache/38/34/3834572e651cdc14b18d348fa4875aa9.jpg", "../../../../media/cache/d8/a4/d8a44eda7cbe7bd1207f868e9adc06f3.jpg", "../../../../media/cache/8e/c7/8ec7f310b74ddd7ec3c859e9b0da7389.jpg", "../../../../media/cache/03/16/0316bb6f4785ac69c0643109201bad5d.jpg", "../../../../media/cache/ca/b1/cab150e556b5fab663a9fec00ed97943.jpg", "../../../../media/cache/e0/79/e07906c1e507055da9a2260a74f58273.jpg", "../../../../media/cache/a7/f0/a7f092a7b79f848df0226f808fed489b.jpg", "../../../../media/cache/ed/07/ed07c9e7c53d4f33a6eb7d41eb0e6d4a.jpg", "../../../../media/cache/26/3b/263bf5d128bf18553ea8da8bb19e9a0c.jpg"]} 75 | ] -------------------------------------------------------------------------------- /chapter4/spider_books.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | 4 | class BooksSpider(scrapy.Spider): 5 | name = 'bookLinks' 6 | 7 | start_urls = ['http://books.toscrape.com'] 8 | images_data = {} 9 | 10 | def parse(self, response): 11 | # follow links to author pages 12 | for img in response.css('a::attr(href)'): 13 | yield response.follow(img, self.parse_images) 14 | 15 | def parse_images(self, response): 16 | print ("URL: " + response.request.url) 17 | def extract_with_css(query): 18 | return response.css(query).extract() 19 | yield { 20 | 'URL': response.request.url, 21 | 'image_link': extract_with_css('img::attr(src)') 22 | } --------------------------------------------------------------------------------