├── LICENSE ├── README.md ├── chapter1 ├── code │ ├── algorithmia │ │ ├── algorithmia_analyze_url.py │ │ └── algorithmia_sitemap.py │ ├── mechanical-soup │ │ ├── bing_search.py │ │ ├── github_links.py │ │ ├── google_search.py │ │ └── twitter_login.py │ ├── metadata │ │ ├── extract_articles.py │ │ └── extract_site_metadata.py │ ├── parsel │ │ ├── extract_links_css.py │ │ └── extract_links_xpath.py │ ├── requests │ │ ├── request_response.py │ │ └── urllib_request.py │ ├── robobrowser │ │ ├── bing_search.py │ │ ├── download_file.py │ │ ├── get_emails_links_from_url.py │ │ ├── twitter_login_form.py │ │ └── website_parsing.py │ └── web_technologies │ │ └── web_technologies_builtwith.py └── images │ ├── algorithmia_analyze_url.png │ ├── algorithmia_analyze_url2.png │ ├── algorithmia_sitemap.png │ ├── algorithmia_sitemap2.png │ ├── bing_search.png │ ├── bing_search_output.png │ ├── bing_search_output_mechanical_soup.png │ ├── builtwith.png │ ├── builtwith_script.png │ ├── google_search_mechanical_soup.png │ ├── robobrowser_links.png │ └── wappalyzer.png ├── chapter12.zip ├── chapter2 ├── code │ ├── bs4 │ │ ├── BeautifulSoup-getLinks_csv.py │ │ ├── bs4_objects.py │ │ ├── demo_detail_book.py │ │ ├── download_images_from_url.py │ │ ├── getExternal_internal_links.py │ │ ├── get_offers_bs4.py │ │ └── wikipedia_links.py │ └── requests │ │ ├── crawler_urls.py │ │ ├── depth_search_extract_links.py │ │ ├── download_file_requests.py │ │ ├── extract_links_images_re.py │ │ ├── get_emails_from_url.py │ │ ├── get_html_requests.py │ │ ├── link_crawler_search.py │ │ ├── requests_post.py │ │ └── requests_user_agent.py └── images │ ├── download_images.png │ ├── download_images2.png │ ├── external_inernal_links.png │ ├── link_extractor.png │ ├── objects.png │ ├── packt_books.png │ ├── packtpub_links.png │ ├── packtpub_links2.png │ ├── packtpub_links_csv.png │ ├── packtpub_links_deep_search.png │ ├── requests_extract_links.png │ ├── requests_headers.png │ └── requests_post.png ├── chapter3 ├── code │ ├── books_scraping │ │ ├── bookList.csv │ │ ├── requests_bs4_initial.py │ │ └── requests_bs4_with_pages.py │ ├── chromedriver.exe │ ├── dolar-euro_converter.py │ ├── google_translate.py │ ├── interacting_with_form.py │ ├── phantomjs │ │ ├── phantomjs.exe │ │ ├── phantomjs_example1.py │ │ ├── phantomjs_example2.py │ │ └── phantomjs_example3.py │ ├── scraping_book_details_requests.py │ ├── selenium_list_book.py │ └── stack_overflow_tags.py └── images │ ├── ajax_image.png │ ├── book_info.png │ ├── book_packit.png │ ├── books_details.png │ ├── books_packit.png │ ├── converter.png │ ├── google_translate.png │ ├── selenium_methods.png │ └── xpath.png └── chapter4 ├── BooksSpider-multipage-details ├── books_crawler │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── BooksSpider.py │ │ └── __init__.py ├── output.son └── scrapy.cfg ├── BooksSpider-urls ├── books_crawler │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── BooksSpider.py │ │ └── __init__.py ├── books_links.json └── scrapy.cfg ├── BooksSpider-urls_download_images ├── books_crawler │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── BooksSpider.py │ │ └── __init__.py ├── output.son └── scrapy.cfg ├── europython ├── europython │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── items.cpython-37.pyc │ │ ├── pipelines.cpython-37.pyc │ │ └── settings.cpython-37.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── 
settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ └── europython_spider.cpython-37.pyc │ │ └── europython_spider.py ├── europython_items.csv ├── europython_items.json ├── europython_items.xml ├── scrapinghub.yml ├── scrapy.cfg └── setup.py ├── images ├── book_details.png ├── books_images.png ├── books_images_output.png ├── europython_talk.png ├── next_page.png ├── scrapy_books.png ├── scrapy_books_links.png ├── scrapy_options.png ├── scrapy_project.png ├── scrapy_shell.png └── scrapy_shell2.png ├── output.json └── spider_books.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Advanced-Web-Scraping-with-Python 2 | Advanced Web Scraping with Python, Published by Packt 3 | -------------------------------------------------------------------------------- /chapter1/code/algorithmia/algorithmia_analyze_url.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import Algorithmia 5 | import json 6 | 7 | input = [ "https://www.packtpub.com/iot-hardware/single-board-computers"] 8 | output = [] 9 | 10 | API_KEY ='simU+xQFB6Ts4O306dxEhZreKBA1' 11 | 12 | client = Algorithmia.client(API_KEY) 13 | 14 | algorithmia = client.algo('web/AnalyzeURL/0.2.17').pipe(input[0]) 15 | print(algorithmia.result) 16 | output.append(algorithmia.result) 17 | print(json.dumps(output, indent=4)) -------------------------------------------------------------------------------- /chapter1/code/algorithmia/algorithmia_sitemap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import Algorithmia 5 | 6 | input = [ "http://packtpub.com",1] 7 | 8 | API_KEY ='simU+xQFB6Ts4O306dxEhZreKBA1' 9 | 10 | client = Algorithmia.client(API_KEY) 11 | response = client.algo('web/SiteMap/0.1.7').pipe(input) 12 | siteMap = response.result 13 | print(siteMap) -------------------------------------------------------------------------------- /chapter1/code/mechanical-soup/bing_search.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import mechanicalsoup 5 | 6 | # Connect to bing search engine 7 | browser = mechanicalsoup.StatefulBrowser() 8 | browser.open("http://bing.com/") 9 | 10 | # Fill-in the search form 11 | browser.select_form('#sb_form') 12 | browser["q"] = "MechanicalSoup" 13 | browser.submit_selected() 14 | 15 | # Display the results 16 | for link in browser.links(): 17 | print(link.text, '->', link.attrs['href']) 18 | -------------------------------------------------------------------------------- /chapter1/code/mechanical-soup/github_links.py: -------------------------------------------------------------------------------- 1 | """Example app to login to GitHub using the StatefulBrowser class.""" 2 | #!/usr/bin/env python 3 | # -*- coding: utf-8 -*- 4 | 5 | from __future__ import print_function 6 | import argparse 7 | import mechanicalsoup 8 | from getpass import getpass 9 | 10 | parser = argparse.ArgumentParser(description="Login to GitHub.") 11 | parser.add_argument("username") 12 | args = parser.parse_args() 13 | 14 | args.password = getpass("Please enter your GitHub password: ") 15 | 16 | browser = mechanicalsoup.StatefulBrowser( 17 | soup_config={'features': 'lxml'}, 18 | raise_on_404=True, 19 | user_agent='MyBot/0.1: mysite.example.com/bot_info', 20 | ) 21 | # Uncomment for a more verbose output: 22 | browser.set_verbose(2) 23 | 24 | browser.open("https://github.com") 25 | browser.follow_link("login") 26 | browser.select_form('#login form') 27 | browser["login"] = args.username 28 | browser["password"] = args.password 29 | resp = browser.submit_selected() 30 | 31 | # Uncomment to launch a web browser on the current page: 32 | browser.launch_browser() 33 | 34 | # verify we are now logged in 35 | page = browser.get_current_page() 36 | 37 | for 
link in browser.links(): 38 | target = link.attrs['href'] 39 | print(target) 40 | 41 | messages = page.find("div", class_="flash-messages") 42 | if messages: 43 | print(messages.text) 44 | assert page.select(".logout-form") 45 | 46 | #print(page.title.text) 47 | #print(page) 48 | 49 | # verify we remain logged in (thanks to cookies) as we browse the rest of 50 | # the site 51 | page3 = browser.open("https://github.com/MechanicalSoup/MechanicalSoup") 52 | assert page3.soup.select(".logout-form") 53 | -------------------------------------------------------------------------------- /chapter1/code/mechanical-soup/google_search.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | import mechanicalsoup 6 | 7 | # Connect to Google 8 | browser = mechanicalsoup.StatefulBrowser() 9 | browser.open("https://www.google.com/") 10 | 11 | # Fill-in the form 12 | browser.select_form('form[action="/search"]') 13 | browser["q"] = "MechanicalSoup" 14 | 15 | # Note: the button name is btnK in the content served to actual 16 | # browsers, but btnG for bots. 17 | browser.submit_selected(btnName="btnG") 18 | print(browser.get_current_page()) 19 | 20 | # Display links 21 | for link in browser.links(): 22 | target = link.attrs['href'] 23 | # Filter-out unrelated links and extract actual URL from Google's 24 | # click-tracking. 25 | if (target.startswith('/url?') and not 26 | target.startswith("/url?q=http://webcache.googleusercontent.com")): 27 | target = re.sub(r"^/url\?q=([^&]*)&.*", r"\1", target) 28 | print(target) 29 | -------------------------------------------------------------------------------- /chapter1/code/mechanical-soup/twitter_login.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import mechanicalsoup 5 | import getpass 6 | 7 | URL = "https://twitter.com/login" 8 | 9 | username = input ("Username: ") 10 | password = getpass.getpass() 11 | 12 | # Create a browser object 13 | browser = mechanicalsoup.Browser() 14 | 15 | # request Twitter login page 16 | login_page = browser.get(URL) 17 | 18 | # we grab the login form 19 | login_form = login_page.soup.find("form", {"class":"t1-form clearfix signin js-signin"}) 20 | 21 | # find login and password inputs 22 | login_form.find("input", {"name": "session[username_or_email]"})["value"] = username 23 | login_form.find("input", {"name": "session[password]"})["value"] = password 24 | 25 | # submit form 26 | browser.submit(login_form, login_page.url) -------------------------------------------------------------------------------- /chapter1/code/metadata/extract_articles.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import newspaper 5 | 6 | cnn_paper = newspaper.build('http://cnn.com') 7 | 8 | print('*****************************category urls************************************\n') 9 | for category in cnn_paper.category_urls(): 10 | print(category) 11 | 12 | print('*****************************url articles************************************\n') 13 | 14 | for article in cnn_paper.articles: 15 | print(article.url) 16 | 17 | print('*****************************download first article************************************\n') 18 | cnn_article = cnn_paper.articles[0] 19 | cnn_article.download() 20 | cnn_article.parse() 21 | 22 | #print(cnn_article.html) 23 | 
print(cnn_article.text)

# nlp() must be called after parse() so that keywords and summary are populated
cnn_article.nlp()

print(cnn_article.keywords)
print(cnn_article.summary)
print(cnn_article.authors)
print(cnn_article.publish_date)
--------------------------------------------------------------------------------
/chapter1/code/metadata/extract_site_metadata.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import extruct
import requests
import pprint
from w3lib.html import get_base_url


pp = pprint.PrettyPrinter(indent=2)
r = requests.get('https://www.packtpub.com')
base_url = get_base_url(r.text, r.url)
data = extruct.extract(r.text, base_url=base_url)

pp.pprint(data)
--------------------------------------------------------------------------------
/chapter1/code/parsel/extract_links_css.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import requests
from parsel import Selector

# GET request to the packtpub site
response = requests.get('https://www.packtpub.com')

# "response.text" contains the whole web page content
selector = Selector(response.text)

# Extracting the href attribute from anchor tags
href_links = selector.css('a::attr(href)').extract()

# Extracting the src attribute from img tags
image_links = selector.css('img::attr(src)').extract()

print('*****************************href_links************************************\n')
print(href_links)


print('*****************************image_links************************************\n')
print(image_links)
--------------------------------------------------------------------------------
/chapter1/code/parsel/extract_links_xpath.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import requests
from parsel import Selector

# GET request to the packtpub site
response = requests.get('https://www.packtpub.com')

# "response.text" contains the whole web page content
selector = Selector(response.text)

# Extracting the href attribute from anchor tags
href_links = selector.xpath('//a/@href').getall()

# Extracting the src attribute from img tags
image_links = selector.xpath('//img/@src').getall()

print('*****href_links******\n')
print(href_links)


print('*****image_links*****\n')
print(image_links)
--------------------------------------------------------------------------------
/chapter1/code/requests/request_response.py:
--------------------------------------------------------------------------------
import requests

url = "http://www.packtpub.com"
# Package the request, send it and catch the response
response = requests.get(url)
# Store the response body in the html variable
html = response.text
# Print the html
print(html)
--------------------------------------------------------------------------------
/chapter1/code/requests/urllib_request.py:
--------------------------------------------------------------------------------
from urllib.request import urlopen, Request

# Specify the url
url = "http://www.packtpub.com"
# This packages the request
request = Request(url)
# Sends the request and catches the response: response
response = urlopen(request)
# Extract the
response using read() 10 | html = response.read() 11 | # Print the html 12 | print(html) 13 | # Closing the response 14 | response.close() -------------------------------------------------------------------------------- /chapter1/code/robobrowser/bing_search.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from robobrowser import RoboBrowser 5 | 6 | browser = RoboBrowser(history=True,parser="html.parser") 7 | browser.open("http://bing.com") 8 | #print(browser.parsed) 9 | 10 | #Find the element by id,action or css class in the html 11 | #form = browser.get_form(id = "sb_form") 12 | form = browser.get_form(action="/search") 13 | #form = browser.get_form(class_='sw_box hassbi') 14 | 15 | print(form) 16 | 17 | form.fields['q'].value = "python" 18 | #form["q"].value = "python" 19 | 20 | browser.submit_form(form) 21 | 22 | print('*****browser.find_all("a")******\n') 23 | 24 | links = browser.find_all("a") 25 | for link in links: 26 | try: 27 | print(link['href']) 28 | except Exception as exception: 29 | pass -------------------------------------------------------------------------------- /chapter1/code/robobrowser/download_file.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from robobrowser import RoboBrowser 5 | 6 | browser = RoboBrowser(history=True) 7 | 8 | url = "https://www.cse.unsw.edu.au/~en1811/python-docs/python-3.6.4-docs-pdf/tutorial.pdf" 9 | pdf_file_path = "tutorial.pdf" 10 | 11 | # get browser session 12 | request = browser.session.get(url, stream=True) 13 | 14 | with open(pdf_file_path, "wb") as pdf_file: 15 | pdf_file.write(request.content) -------------------------------------------------------------------------------- /chapter1/code/robobrowser/get_emails_links_from_url.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from robobrowser import RoboBrowser 5 | import re 6 | import argparse 7 | 8 | browser = RoboBrowser(history=True,parser="html.parser") 9 | 10 | def get_emails(domain): 11 | 12 | domain="http://"+domain 13 | browser.open(domain) 14 | contents = browser.find_all("a",href=re.compile("[-a-zA-Z0-9._]+@[-a-zA-Z0-9_]+.[a-zA-Z0-9_.]+")) 15 | for content in contents: 16 | print(content['href']) 17 | 18 | def get_links(domain): 19 | 20 | domain="http://"+domain 21 | browser.open(domain) 22 | 23 | print('*****browser.find_all("a")******\n') 24 | contents = browser.find_all("a") 25 | for content in contents: 26 | try: 27 | print(content['href']) 28 | except Exception as exception: 29 | pass 30 | 31 | print('*****browser.get_links()******\n') 32 | links = browser.get_links() 33 | for link in links: 34 | try: 35 | print(link['href']) 36 | except Exception as exception: 37 | pass 38 | 39 | if __name__ == "__main__": 40 | parser = argparse.ArgumentParser(description='gets emails from domain.', prog='get_emails_links_from_url.py', epilog="", add_help=False) 41 | parser.add_argument('-d', '--domain', metavar='', action='store', help='domain to be resolved.',required=True) 42 | args = parser.parse_args() 43 | get_emails(args.domain) 44 | get_links(args.domain) -------------------------------------------------------------------------------- /chapter1/code/robobrowser/twitter_login_form.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env 
python 2 | # -*- coding: utf-8 -*- 3 | 4 | from robobrowser import RoboBrowser 5 | 6 | browser = RoboBrowser(history=True,parser="html.parser") 7 | browser.open('http://twitter.com/login') 8 | print(browser.parsed) 9 | 10 | # Get the signup form by action or css class 11 | signup_form = browser.get_form(action="https://twitter.com/sessions") 12 | signup_form = browser.get_form(class_='t1-form clearfix signin js-signin') 13 | print(signup_form) 14 | 15 | # Inspect authenticity_token value 16 | print(signup_form['authenticity_token'].value) 17 | 18 | # Fill it out 19 | signup_form['session[username_or_email]'].value = 'username' 20 | signup_form['session[password]'].value = 'password' 21 | 22 | print(signup_form.serialize()) 23 | 24 | # Submit the form 25 | browser.submit_form(signup_form) -------------------------------------------------------------------------------- /chapter1/code/robobrowser/website_parsing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from robobrowser import RoboBrowser 5 | import requests 6 | 7 | url = "http://www.packtpub.com" 8 | browser = RoboBrowser(history=True,parser="html.parser") 9 | 10 | headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11', 11 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 12 | 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3', 13 | 'Accept-Encoding': 'none', 14 | 'Accept-Language': 'en-US,en;q=0.8', 15 | 'Connection': 'keep-alive'} 16 | 17 | session = requests.Session() 18 | session.headers = headers 19 | browser = RoboBrowser(session=session) 20 | 21 | browser.open(url) 22 | print(browser.parsed) -------------------------------------------------------------------------------- /chapter1/code/web_technologies/web_technologies_builtwith.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import requests 5 | import argparse 6 | import builtwith 7 | 8 | class BuiltWith(): 9 | 10 | def __init__(self): 11 | 12 | self.key = '1fb25d4e-31b7-468c-8793-4ecebc3467be' 13 | self.url ='http://api.builtwith.com/free1/api.json' 14 | 15 | def module_run(self, domain): 16 | print("\nDomain "+domain +"\n") 17 | print(builtwith.parse("http://"+domain)) 18 | payload = {'key': self.key, 'lookup': domain} 19 | response = requests.get(self.url, params=payload) 20 | json=response.json() 21 | print(json) 22 | 23 | 24 | if __name__ == '__main__': 25 | 26 | parser = argparse.ArgumentParser(description='BuiltWith') 27 | parser.add_argument('--domain', action="store", dest="domain",required=True) 28 | given_args = parser.parse_args() 29 | domain = given_args.domain 30 | builtWith = BuiltWith(); 31 | builtWith.module_run(domain); -------------------------------------------------------------------------------- /chapter1/images/algorithmia_analyze_url.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/algorithmia_analyze_url.png -------------------------------------------------------------------------------- /chapter1/images/algorithmia_analyze_url2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/algorithmia_analyze_url2.png -------------------------------------------------------------------------------- /chapter1/images/algorithmia_sitemap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/algorithmia_sitemap.png -------------------------------------------------------------------------------- /chapter1/images/algorithmia_sitemap2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/algorithmia_sitemap2.png -------------------------------------------------------------------------------- /chapter1/images/bing_search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/bing_search.png -------------------------------------------------------------------------------- /chapter1/images/bing_search_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/bing_search_output.png -------------------------------------------------------------------------------- /chapter1/images/bing_search_output_mechanical_soup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/bing_search_output_mechanical_soup.png -------------------------------------------------------------------------------- /chapter1/images/builtwith.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/builtwith.png -------------------------------------------------------------------------------- /chapter1/images/builtwith_script.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/builtwith_script.png -------------------------------------------------------------------------------- /chapter1/images/google_search_mechanical_soup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/google_search_mechanical_soup.png -------------------------------------------------------------------------------- /chapter1/images/robobrowser_links.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/robobrowser_links.png 
-------------------------------------------------------------------------------- /chapter1/images/wappalyzer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/wappalyzer.png -------------------------------------------------------------------------------- /chapter12.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter12.zip -------------------------------------------------------------------------------- /chapter2/code/bs4/BeautifulSoup-getLinks_csv.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import requests 3 | import csv 4 | 5 | url = "http://packtpub.com" 6 | 7 | csv_file = csv.writer(open("data_links.csv", "w")) 8 | csv_file.writerow(["Section" , "Link"]) 9 | 10 | # Getting the webpage, creating a Response object. 11 | response = requests.get(url) 12 | 13 | # Extracting the source code of the page. 14 | data = response.text 15 | 16 | # Passing the source code to Beautiful Soup to create a BeautifulSoup object for it. 17 | soup = BeautifulSoup(data, 'html.parser') 18 | 19 | # use the 'find_all' function to bring back all instances of the 'a' tag in the HTML and store in 'tags' variable 20 | # Extracting all the tags into a list. 21 | tags = soup.find_all('a') 22 | tags = soup.find_all('a', {'class': 'nav-anchor'}) # only for url = "http://packtpub.com" 23 | 24 | # Extracting URLs from the attribute href in the tags. 25 | for tag in tags: 26 | print(tag.get('href')) 27 | link = tag.get('href') 28 | text = tag.get_text() 29 | csv_file.writerow([text, link]) 30 | 31 | -------------------------------------------------------------------------------- /chapter2/code/bs4/bs4_objects.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import requests 5 | from bs4 import BeautifulSoup 6 | from fake_useragent import UserAgent 7 | 8 | ua = UserAgent() 9 | header = {'user-agent':ua.chrome} 10 | google_page = requests.get('http://www.packtpub.com',headers=header) 11 | 12 | soup = BeautifulSoup(google_page.content,'lxml') 13 | 14 | #find parent 15 | print("Parent of the form with id='search_mini_form':") 16 | parent_form = soup.find("form",{"id":"search_mini_form"}).parent 17 | print(parent_form) 18 | 19 | #get children form a specific element,in this case we are getting child elements of the form with id="search_mini_form" 20 | print("Children of the form with id='search_mini_form:'") 21 | for child in soup.find("form",{"id":"search_mini_form"}).children: 22 | print(child) 23 | 24 | #find next_siblings 25 | print("Siblings of the form with id='search_mini_form:'") 26 | for sibling in soup.find("form",{"id":"search_mini_form"}).input.next_siblings: 27 | print(sibling) -------------------------------------------------------------------------------- /chapter2/code/bs4/demo_detail_book.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from bs4 import BeautifulSoup 5 | import requests 6 | 7 | response = 
requests.get('https://www.packtpub.com/application-development/learn-python-programming-second-edition')
soup = BeautifulSoup(response.text,'lxml')

title = soup.find('span', attrs={'data-ui-id':'page-title-wrapper'}).text
author = soup.find('div', attrs={'class':'authors inline'}).text

print(title)
print(author)
--------------------------------------------------------------------------------
/chapter2/code/bs4/download_images_from_url.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import os, sys
import requests
from fake_useragent import UserAgent

def getAllImages(url):

    ua = UserAgent()
    header = {'user-agent':ua.chrome}
    schedule_page = requests.get(url,headers=header)

    # create a directory to save the images
    os.system("mkdir images_packtpub")

    bs = BeautifulSoup(schedule_page.text,"lxml")
    for image in bs.findAll("img"):
        print("found image")

        # Extract the location of the image. We also need to split the URL to get the image name, so let's do that with '.split()'
        src = image.get('src')
        print(src)

        parts_image = src.split("/")
        image_name = parts_image[len(parts_image)-1]

        # Save the image
        with open("images_packtpub/"+image_name,"wb") as f:
            f.write(requests.get(src).content)

getAllImages("http://www.packtpub.com")
--------------------------------------------------------------------------------
/chapter2/code/bs4/getExternal_internal_links.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-


from bs4 import BeautifulSoup
import re
import requests
import argparse

internalLinks = []
externalLinks = []

# Get a list of internal links that start with a "/"
def getInternalLinks(url,beautifulSoup):
    url = url.replace("http://", "").split("/")[0]
    for link in beautifulSoup.findAll("a", href=re.compile("^(/|.*"+url+")")):
        if link.attrs['href'] is not None:
            internalLinks.append(link.attrs['href'])
    return internalLinks


# Get all links that start with "http" or "www" and do not contain the current URL
def getExternalLinks(url,beautifulSoup):
    url = url.replace("http://", "").split("/")[0]
    for link in beautifulSoup.findAll("a", href=re.compile("^(http|www)((?!"+url+").)*$")):
        if link.attrs['href'] is not None:
            externalLinks.append(link.attrs['href'])
    return externalLinks


def crawlExternalLinks(website):
    html = requests.get(website)
    beautifulSoup = BeautifulSoup(html.text,"lxml")
    externalLinks = getExternalLinks(website, beautifulSoup)
    return externalLinks

def crawlInternalLinks(website):
    html = requests.get(website)
    beautifulSoup = BeautifulSoup(html.text,"lxml")
    internalLinks = getInternalLinks(website,beautifulSoup)
    return internalLinks

def getExternalInternalLinks(website):
    externalLinks = crawlExternalLinks(website)
    internalLinks = crawlInternalLinks(website)
    print("\nExternal links")
    print("-------------------")

    for external in externalLinks:
        print(external)

    print("\nInternal links")
    print("-------------------")
    for internal in internalLinks:
        print(internal)


if __name__== "__main__":

    # parse the command line arguments
    ap = argparse.ArgumentParser()
    ap.add_argument("-d","--domain",required=True,help="The domain to target ie. packtpub.com")
    args = vars(ap.parse_args())

    domain = args['domain']

    if domain.startswith("http://") == True:
        target = domain
    else:
        target = "http://" + domain

    getExternalInternalLinks(target)
--------------------------------------------------------------------------------
/chapter2/code/bs4/get_offers_bs4.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import requests

def getOffers(url):
    # We make the request to the page
    req = requests.get(url)
    # We verify that the request returns a Status Code = 200 (200 = Ok)
    statusCode = req.status_code
    if statusCode == 200:
        # We pass the HTML content of the web to a BeautifulSoup object
        html = BeautifulSoup(req.text, "html.parser")
        # We get all the div elements with class "offer-box"
        elements = html.find_all('div', {'class': 'offer-box'})
        # We go through all the entries to extract the title, description and link
        for item in elements:
            title = item.find('h3').getText()
            description = item.find('p').getText()
            link = item.find('a').get('href')

            # Print title, link and description
            print("Title....: " + title)
            print("Link:.....: " + link)
            print("Description:.....: " + description)
            print("**********************************")
    else:
        # If the page does not exist we show the error
        print("The url " + url + " gives an error %d" % statusCode)

getOffers("https://www.packtpub.com/offers")
--------------------------------------------------------------------------------
/chapter2/code/bs4/wikipedia_links.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup
import re

def getLinks(url):
    html = requests.get("http://en.wikipedia.org"+url).text
    bs = BeautifulSoup(html, "html.parser")
    return bs.find("div", {"id":"bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))

print("Main links from http://en.wikipedia.org/wiki/Python_(programming_language)")
links_level1 = getLinks("/wiki/Python_(programming_language)")

index = 0

for link in links_level1:

    # print the absolute URL of each first-level link
    print("http://en.wikipedia.org"+link.get('href'))

    newLink = links_level1[index].attrs["href"]

    links_level2 = getLinks(newLink)

    print("Links from http://en.wikipedia.org"+ newLink)

    for link in links_level2:
        print("http://en.wikipedia.org"+link.get('href'))

    index = index + 1
--------------------------------------------------------------------------------
/chapter2/code/requests/crawler_urls.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import requests

web = input("Url: ")
response = requests.get('http://'+web).text
urls = []

pattern = re.compile('''href=["'](.[^"']+)["']''')
search = re.findall(pattern, response)

for url in search:
    try:
        urls.append(url)
        d1 = str(url)
        urlList = open('crawler_urls.txt','a+')
        urlList.write(d1+"\n")
        urlList.close()
        print(url)
        # request each extracted url and collect its links as well
        response2 = requests.get(url).text
        search2 = re.findall(pattern, response2)
        for e in search2:
            urls.append(e)
            d2 = str(e)
            urlList = open('crawler_urls.txt','a+')
            urlList.write(d2+"\n")
            urlList.close()

    except Exception as e:
        pass

print("URLs saved in file crawler_urls.txt")
--------------------------------------------------------------------------------
/chapter2/code/requests/depth_search_extract_links.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from urllib.request import urljoin
from urllib.parse import urlparse
import re
import requests
from collections import deque

def download_page(url):
    try:
        return requests.get(url).text
    except:
        print('error in the url', url)

def extract_links(page):
    if not page:
        return []
    # match the href attribute of <a> tags
    link_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return [urljoin(page, link) for link in link_regex.findall(page)]

def get_links(page_url):
    host = urlparse(page_url)[1]
    page = download_page(page_url)
    links = extract_links(page)
    return [link for link in links if urlparse(link)[1] == host]

def depth_search(start_url):
    visited = set()
    queue = deque()
    queue.append(start_url)
    while queue:
        url = queue.popleft()
        if url in visited:
            continue
        visited.add(url)
        for link in get_links(url):
            queue.appendleft(link)
        print(url)

if __name__ == '__main__':

    print('Depth search extracting links ')
    print('----------------------------- ')
    depth_search('https://www.packtpub.com')
--------------------------------------------------------------------------------
/chapter2/code/requests/download_file_requests.py:
--------------------------------------------------------------------------------

import requests

def downloadFile(fileName):
    # extract the filename
    filename = fileName.split("/")[-1]
    # download the file using GET
    image = requests.get(fileName, stream=True)
    # save the content received into the file
    with open(filename, 'wb') as fileDescriptor:
        i = 0
        for chunk in image.iter_content(chunk_size=1024):
            i = i + 1
            fileDescriptor.write(chunk)
    return


downloadFile("https://www.packtpub.com/media/logo/stores/1/logo.png")
downloadFile("https://media.readthedocs.org/pdf/python-guide/latest/python-guide.pdf")
downloadFile("https://docs.python.org/3/archives/python-3.7.4-docs-pdf-letter.zip")
--------------------------------------------------------------------------------
/chapter2/code/requests/extract_links_images_re.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from urllib.request import urljoin
import re
import requests

def download_page(url):
    return requests.get(url).text

def extract_links(page):
    # match the href attribute of <a> tags
    link_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return link_regex.findall(page)

def extract_image_locations(page):
    # match the src attribute of <img> tags
    img_regex = re.compile('<img[^>]+src=["\'](.*?)["\']', re.IGNORECASE)
    return img_regex.findall(page)


if __name__ == '__main__':
    target_url = 'http://www.packtpub.com'
    packtpub = download_page(target_url)
    links =
extract_links(packtpub) 24 | 25 | for link in links: 26 | print(urljoin(target_url, link)) 27 | 28 | image_locations = extract_image_locations(packtpub) 29 | 30 | for src in image_locations: 31 | print(urljoin(target_url, src)) 32 | -------------------------------------------------------------------------------- /chapter2/code/requests/get_emails_from_url.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import requests 5 | import re 6 | import argparse 7 | 8 | def get_emails(domain): 9 | 10 | if not domain.startswith("http://") == True: 11 | domain="http://"+domain 12 | 13 | response = requests.get(domain) 14 | pattern = re.compile("[-a-zA-Z0-9._]+@[-a-zA-Z0-9_]+.[a-zA-Z0-9_.]+") 15 | mails = re.findall(pattern,response.text) 16 | emails = str(mails) 17 | 18 | print(emails) 19 | 20 | if __name__ == "__main__": 21 | parser = argparse.ArgumentParser(description='gets emails from domain.', prog='get_emails_from_url.py', epilog="", add_help=False) 22 | # Adding the argument 23 | parser.add_argument('-d', '--domain', metavar='', action='store', help='domain to be resolved.',required=True) 24 | args = parser.parse_args() 25 | 26 | get_emails(args.domain) -------------------------------------------------------------------------------- /chapter2/code/requests/get_html_requests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import requests 5 | from fake_useragent import UserAgent 6 | 7 | url = 'https://www.packtpub.com' 8 | file_name = 'packtpub.com.txt' 9 | 10 | user_agent = UserAgent() 11 | page = requests.get(url,headers={'user-agent':user_agent.chrome}) 12 | print(page.content) 13 | with open(file_name,'w') as file: 14 | file.write(page.content.decode('utf-8')) -------------------------------------------------------------------------------- /chapter2/code/requests/link_crawler_search.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import argparse 5 | import sys 6 | import requests 7 | import re 8 | processed = [] 9 | 10 | def search_links(url, depth, search): 11 | # Process http links that are not processed yet 12 | url_is_processed = (url in processed) 13 | if (url.startswith("http://") and (not url_is_processed)): 14 | processed.append(url) 15 | path = "/" 16 | urlparts = url.split("/") 17 | if (len(urlparts) > 1): 18 | host = urlparts[0] 19 | path = url.replace(host, "", 1) 20 | 21 | # Start crawling 22 | print("Crawling URL path:%s%s " %(host, path)) 23 | req = requests.get(host+path) 24 | 25 | # find the links 26 | contents = req.text 27 | all_links = re.findall('href="(.*?)"', contents) 28 | if (search in contents): 29 | print("Found " + search + " at " + url) 30 | print("-----------------------------------") 31 | print(" ==> %s: processing %s links" %(str(depth),str(len(all_links)))) 32 | 33 | for href in all_links: 34 | # Find relative urls 35 | print('link found '+href) 36 | # Recurse links 37 | if (depth > 0): 38 | search_links(href, depth-1, search) 39 | else: 40 | print("Skipping link: %s ..." 
%url) 41 | 42 | if __name__ == '__main__': 43 | parser = argparse.ArgumentParser(description='Webpage link crawler') 44 | parser.add_argument('--url', action="store", dest="url",required=True,type=str) 45 | parser.add_argument('--query', action="store", dest="query",required=True) 46 | parser.add_argument('--depth', action="store", dest="depth",default=1) 47 | given_args = parser.parse_args() 48 | try: 49 | if given_args.url.startswith("http://") == True: 50 | target = given_args.url 51 | else: 52 | target = "http://" + given_args.url 53 | search_links(target,given_args.depth,given_args.query) 54 | except KeyboardInterrupt: 55 | print("Aborting search by user request.") -------------------------------------------------------------------------------- /chapter2/code/requests/requests_post.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import requests 5 | data_dictionary = {'name': 'username','password': '123456','email': 'user@domain.com'} 6 | response = requests.post("http://httpbin.org/post",data=data_dictionary) 7 | 8 | if response.status_code == 200: 9 | print(response.text) -------------------------------------------------------------------------------- /chapter2/code/requests/requests_user_agent.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import requests, json 5 | from fake_useragent import UserAgent 6 | 7 | ua = UserAgent() 8 | header = {'user-agent':ua.chrome} 9 | 10 | responseGet = requests.get("https://www.packtpub.com",headers=header) 11 | print(responseGet.text.encode('utf-8')) 12 | print(responseGet.json) 13 | print(responseGet.encoding) 14 | print(responseGet.content) 15 | print("Status code: "+str(responseGet.status_code)) 16 | 17 | print("Headers response: ") 18 | for header, value in responseGet.headers.items(): 19 | print(header, '-->', value) 20 | 21 | print("Headers request : ") 22 | for header, value in responseGet.request.headers.items(): 23 | print(header, '-->', value) -------------------------------------------------------------------------------- /chapter2/images/download_images.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/download_images.png -------------------------------------------------------------------------------- /chapter2/images/download_images2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/download_images2.png -------------------------------------------------------------------------------- /chapter2/images/external_inernal_links.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/external_inernal_links.png -------------------------------------------------------------------------------- /chapter2/images/link_extractor.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/link_extractor.png -------------------------------------------------------------------------------- /chapter2/images/objects.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/objects.png -------------------------------------------------------------------------------- /chapter2/images/packt_books.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/packt_books.png -------------------------------------------------------------------------------- /chapter2/images/packtpub_links.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/packtpub_links.png -------------------------------------------------------------------------------- /chapter2/images/packtpub_links2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/packtpub_links2.png -------------------------------------------------------------------------------- /chapter2/images/packtpub_links_csv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/packtpub_links_csv.png -------------------------------------------------------------------------------- /chapter2/images/packtpub_links_deep_search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/packtpub_links_deep_search.png -------------------------------------------------------------------------------- /chapter2/images/requests_extract_links.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/requests_extract_links.png -------------------------------------------------------------------------------- /chapter2/images/requests_headers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/requests_headers.png -------------------------------------------------------------------------------- /chapter2/images/requests_post.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/requests_post.png -------------------------------------------------------------------------------- 
/chapter3/code/books_scraping/requests_bs4_initial.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import requests 3 | 4 | def processUrl(url): 5 | """ 6 | Upload and process the content of a URL using request. 7 | Show an error message if you cannot load the page 8 | """ 9 | # http request 10 | req = requests.get(url) 11 | 12 | # We verify the request returns a Status Code = 200 13 | statusCode = req.status_code 14 | if statusCode == 200: 15 | 16 | # We pass the HTML content of the web to a BeautifulSoup() object 17 | html = BeautifulSoup(req.text,"lxml") 18 | 19 | # We process the downloaded HTML 20 | return processHTML(html,url) 21 | 22 | else: 23 | print ("ERROR {}".format(statusCode)) 24 | 25 | def processHTML(html, url=""): 26 | """ 27 | Process the HTML content of a web page 28 | html is a BS4 object 29 | url is the URL of the page contained in html_doc 30 | """ 31 | # Decide here what you want to do with the content 32 | return 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /chapter3/code/books_scraping/requests_bs4_with_pages.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import requests 3 | import pandas as pd 4 | 5 | # Class names representing product ratings 6 | star = ["One", "Two", "Three", "Four", "Five"] 7 | 8 | bookList = [] 9 | url_page = "http://books.toscrape.com/catalogue/page-{}.html" 10 | url = "http://books.toscrape.com/catalogue/" 11 | 12 | def starToInt (rating): 13 | """ 14 | Convert a textual rating to a numerical rating 15 | Returns the equivalent number, or 0, if the rating is not valid 16 | """ 17 | try: 18 | return star.index(rating) + 1 19 | except: 20 | return 0 21 | 22 | 23 | def processUrl(url): 24 | """ 25 | Upload and process the content of a URL using request. 26 | Show an error message if you cannot load the page 27 | """ 28 | # http request 29 | req = requests.get(url) 30 | 31 | # We verify the request returns a Status Code = 200 32 | statusCode = req.status_code 33 | if statusCode == 200: 34 | 35 | # We pass the HTML content of the web to a BeautifulSoup () object 36 | html = BeautifulSoup(req.text,"lxml") 37 | 38 | # We process the downloaded HTML 39 | return processHTML(html,url) 40 | 41 | else: 42 | print ("ERROR {}".format(statusCode)) 43 | 44 | def processHTML(html, url=""): 45 | """ 46 | Process the HTML content of a web page 47 | html is a BS4 object 48 | url is the URL of the page contained in html_doc 49 | """ 50 | book = {} 51 | 52 | productMain = html.select_one(".product_main") 53 | 54 | # Title 55 | title = productMain.select_one("h1").text 56 | book['title'] = title 57 | 58 | # Price 59 | price = productMain.select_one("p.price_color").text 60 | book['price'] = price[2:] 61 | 62 | # Assessment 63 | # 1. Get class 64 | ratingClasses = productMain.select_one("p.star-rating")["class"] 65 | 66 | # 2. We get with the intersection 67 | ratingText = list(set(ratingClasses).intersection(set(star))) 68 | 69 | # 3. We convert it to a numerical value 70 | if (len(ratingText)==1): 71 | book['assessment'] = starToInt(ratingText[0]) 72 | else: 73 | book['assessment'] = 0 74 | 75 | # Processing the description makes us look for the sibling of an element 76 | # Product description 77 | # 1. We look for the element that takes product product description 78 | productDescription = html.find(id="product_description") 79 | 80 | # 2. 
We are looking for the next sibling with tag p 81 | if productDescription is None: 82 | book['descripcion'] = "" 83 | else: 84 | book['descripcion'] = productDescription.find_next_sibling('p').text 85 | 86 | print(book) 87 | 88 | return book 89 | 90 | 91 | def processCatalog(url, prefix): 92 | """ 93 | Returns False if we have reached the end of the catalog, True otherwise 94 | """ 95 | # We make the request to the web 96 | response = requests.get(url) 97 | 98 | # We verify that the request returns a Status Code = 200 99 | statusCode = response.status_code 100 | if statusCode == 200: 101 | 102 | # We pass the HTML content of the web to a BeautifulSoup () object 103 | html = BeautifulSoup(response.text,"lxml") 104 | 105 | # We process the downloaded HTML 106 | books = html.select('article.product_pod') 107 | for prod in books: 108 | link = prod.select_one('h3 > a') 109 | book = processUrl(prefix+link['href']) 110 | book['link'] = prefix+link['href'] 111 | bookList.append(book) 112 | return True 113 | 114 | if statusCode == 404: 115 | return False 116 | 117 | if __name__ == "__main__": 118 | 119 | processUrl("http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html") 120 | 121 | for i in range(1,5): 122 | processCatalog(url_page.format(i), url) 123 | 124 | for book in bookList: 125 | print(book) 126 | 127 | #Finally we will load all the data in a panda dataframe to process it, extract information and save it to a CSV 128 | 129 | df = pd.DataFrame(bookList) 130 | df.to_csv("bookList.csv", sep=";", index=False) 131 | 132 | 133 | 134 | -------------------------------------------------------------------------------- /chapter3/code/chromedriver.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/code/chromedriver.exe -------------------------------------------------------------------------------- /chapter3/code/dolar-euro_converter.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | import time 3 | 4 | def get_currency_values(): 5 | browser = webdriver.Chrome("chromedriver.exe") 6 | browser.get('http://www.xe.com/en/currencyconverter/convert/?Amount=1&From=USD&To=EUR') 7 | time.sleep(5) 8 | value = browser.find_element_by_xpath("//*[@id='converterResult']/div/div/div[2]/span[1]") 9 | one_dollar = value.text 10 | print('The dollar at this time has a value of: €{} EUROS'.format(one_dollar)) 11 | browser.get('http://www.xe.com/en/currencyconverter/convert/?Amount=1&From=EUR&To=USD') 12 | time.sleep(5) 13 | value = browser.find_element_by_xpath("//*[@id='converterResult']/div/div/div[2]/span[1]") 14 | one_euro = value.text 15 | print('The euro at this time has a value of: ${} dollars'.format(one_euro)) 16 | one_dollar_float = float(one_dollar) 17 | one_euro_float = float(one_euro) 18 | operate(one_dollar_float, one_euro_float) 19 | 20 | 21 | def operate(one_dollar_float, one_euro_float): 22 | 23 | while True: 24 | command = str(input('''Selet currency conversion: 25 | [1]Dollars to euros 26 | [2]Euros to dollars 27 | [e]exit''')) 28 | 29 | if command == '1': 30 | dollar_to_euro(one_dollar_float) 31 | elif command == '2': 32 | euro_to_dollar(one_euro_float) 33 | else: 34 | break 35 | 36 | def dollar_to_euro(one_dollar_float): 37 | dollar_amount = float(input('Dollars amount: ')) 38 | result = one_dollar_float * dollar_amount 39 | print('${} Dollars 
are ${} Euros'.format(dollar_amount, result)) 40 | 41 | def euro_to_dollar(one_euro_float): 42 | euros_amount = float(input('Euros amount: ')) 43 | result = one_euro_float * euros_amount 44 | print('€{} Euros are ${} Dollars'.format(euros_amount, result)) 45 | 46 | 47 | if __name__ == '__main__': 48 | get_currency_values() -------------------------------------------------------------------------------- /chapter3/code/google_translate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from bs4 import BeautifulSoup 5 | import requests 6 | import sys 7 | from selenium import webdriver 8 | import time 9 | 10 | #Example input to enter : en (= english) 11 | convert_from = input("Language to Convert from : ") 12 | 13 | #Example input to enter : es (= spanish) 14 | convert_to = input("Language to Convert to : ") 15 | 16 | text_to_convert = input("Text to translate: ") 17 | 18 | #replace spaces by + symbol 19 | text_to_convert = text_to_convert.replace(' ', '+') 20 | 21 | #call translate service 22 | url = 'https://translate.google.com/?sl=%s&tl=%s&text=%s' % (convert_from, convert_to, text_to_convert) 23 | 24 | browser = webdriver.Chrome("chromedriver.exe") 25 | browser.get(url) 26 | 27 | time.sleep(5) 28 | 29 | translation = browser.find_element_by_class_name("tlid-translation") 30 | translation2 = browser.find_element_by_xpath("/html/body/div[2]/div[1]/div[2]/div[1]/div[1]/div[2]/div[3]/div[1]/div[2]/div/span[1]/span") 31 | 32 | print("Text translated : ", translation2.text) 33 | 34 | browser.get_screenshot_as_file('google_translate.png') 35 | browser.close() 36 | -------------------------------------------------------------------------------- /chapter3/code/interacting_with_form.py: -------------------------------------------------------------------------------- 1 | from selenium.webdriver.support.ui import WebDriverWait 2 | from selenium.common.exceptions import TimeoutException 3 | from selenium import webdriver 4 | import time 5 | 6 | url = "https://websistent.com/tools/htdigest-generator-tool/" 7 | user = "myUser" 8 | 9 | driver = webdriver.Chrome('chromedriver.exe') 10 | driver.get(url) 11 | 12 | element = driver.find_element_by_id("uname") 13 | element.send_keys(user) 14 | 15 | #If we go to the browser we will see that we have completed the first input of the form. 
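# A minimal alternative sketch: the three remaining inputs below could also be
# filled from a dictionary in a single loop, reusing the same element ids and
# test values that appear further down in this script:
#
#   for field_id, value in {"realm": "myRealm", "word1": "mypassword", "word2": "mypassword"}.items():
#       driver.find_element_by_id(field_id).send_keys(value)
#
# The explicit send_keys() calls below do the same thing field by field.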
16 | #Then fill in the rest of inputs 17 | 18 | element = driver.find_element_by_id("realm") 19 | element.send_keys("myRealm") 20 | 21 | element = driver.find_element_by_id("word1") 22 | element.send_keys("mypassword") 23 | 24 | element = driver.find_element_by_id("word2") 25 | element.send_keys("mypassword") 26 | 27 | #Finally, we look for the button and click it 28 | driver.find_element_by_id("generate").click(); 29 | 30 | # We wait 2 seconds before searching for the item 31 | #time.sleep(2) 32 | 33 | try: 34 | # We wait a maximum of 10 seconds while we wait for the "Loading" text to disappear 35 | WebDriverWait(driver, 10).until_not(lambda driver: driver.find_element_by_id("output").text.startswith("Loading")) 36 | 37 | output = driver.find_element_by_id("output").text 38 | print (output[output.find(user):]) 39 | 40 | except TimeoutException: 41 | print("The realm could not be generated or the page has taken too long time to load") 42 | 43 | finally: 44 | driver.quit() -------------------------------------------------------------------------------- /chapter3/code/phantomjs/phantomjs.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/code/phantomjs/phantomjs.exe -------------------------------------------------------------------------------- /chapter3/code/phantomjs/phantomjs_example1.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | 3 | driver = webdriver.PhantomJS("phantomjs.exe") 4 | driver.get("https://protonmail.com/") 5 | print(driver.find_element_by_class_name("homepage-hero-sub-title").text) -------------------------------------------------------------------------------- /chapter3/code/phantomjs/phantomjs_example2.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from bs4 import BeautifulSoup 3 | 4 | browser = webdriver.PhantomJS("phantomjs.exe") 5 | 6 | browser.get("https://protonmail.com/") 7 | page = BeautifulSoup(browser.page_source,"lxml") 8 | images = page.findAll("img") 9 | for image in images: 10 | print(image.get('src')) 11 | browser.close() -------------------------------------------------------------------------------- /chapter3/code/phantomjs/phantomjs_example3.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.webdriver.common.by import By 3 | from selenium.webdriver.support.ui import WebDriverWait 4 | from selenium.webdriver.support import expected_conditions as EC 5 | 6 | driver = webdriver.PhantomJS("phantomjs.exe") 7 | 8 | driver.get("https://httpbin.org/#/HTTP_Methods/post_post") 9 | 10 | driver.find_element_by_class_name("opblock-summary-description").click() 11 | 12 | try: 13 | element = WebDriverWait(driver, 15).until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, "btn"),"Try it out")) 14 | 15 | finally: 16 | driver.get_screenshot_as_file("image.png") 17 | 18 | driver.close() -------------------------------------------------------------------------------- /chapter3/code/scraping_book_details_requests.py: -------------------------------------------------------------------------------- 1 | from lxml import html 2 | import csv 3 | import json 4 | import requests 5 | 6 | def parse(url): 7 | headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) 
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'} 8 | response = requests.get(url, headers=headers) 9 | doc = html.fromstring(response.content) 10 | title_xpath = '//*[@id="maincontent"]/div[3]/div/div[1]/div[1]/h1/span/text()' 11 | author_xpath = '//*[@id="maincontent"]/div[3]/div/div[1]/div[2]/div[2]/text()' 12 | date_xpath = '//*[@id="maincontent"]/div[3]/div/div[1]/div[2]/div[3]/text()' 13 | pages_xpath = '//*[@id="maincontent"]/div[3]/div/div[1]/div[2]/p[1]/text()' 14 | title = doc.xpath(title_xpath)[0] 15 | author = doc.xpath(author_xpath)[0] 16 | date = doc.xpath(date_xpath)[0] 17 | pages = doc.xpath(pages_xpath)[0] 18 | 19 | title = ' '.join(''.join(title).split()) if title else None 20 | author = ' '.join(''.join(author).split()) if author else None 21 | date = ' '.join(''.join(date).split()) if date else None 22 | pages = ' '.join(''.join(pages).split()) if pages else None 23 | 24 | data = {'Title': title,'Author': author,'Date': date,'Pages': pages} 25 | print(data) 26 | 27 | return data 28 | 29 | 30 | 31 | def ScrapingBookData(): 32 | 33 | bookList = ['big-data-and-business-intelligence/machine-learning-opencv', 34 | 'big-data-and-business-intelligence/hands-generative-adversarial-networks-keras'] 35 | 36 | extracted_data = [] 37 | 38 | for i in bookList: 39 | url = "https://www.packtpub.com/" + i 40 | print("Processing: " + url) 41 | # Calling the parser 42 | parsed_data = parse(url) 43 | if parsed_data: 44 | extracted_data.append(parsed_data) 45 | #Save the collected data into a json file. 46 | file_json=open('book_data.json','w') 47 | json.dump(extracted_data,file_json,indent=4) 48 | 49 | # Writing scraped data book to csv file 50 | with open('scraped_book_data.csv', 'w') as csvfile: 51 | fieldnames = ['Title','Author','Date','Pages'] 52 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL) 53 | writer.writeheader() 54 | for data in extracted_data: 55 | writer.writerow(data) 56 | 57 | if __name__ == "__main__": 58 | ScrapingBookData() -------------------------------------------------------------------------------- /chapter3/code/selenium_list_book.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from selenium import webdriver 5 | from bs4 import BeautifulSoup 6 | import requests 7 | import pandas as pd 8 | 9 | driver = webdriver.Chrome("chromedriver.exe") 10 | 11 | driver.get('https://www.packtpub.com/gb/web-development/web-programming') 12 | content = driver.page_source 13 | 14 | soup = BeautifulSoup(content,'lxml') 15 | 16 | books=[] #List to store book titles 17 | authors=[] #List to store authors 18 | dates=[] #List to store dates 19 | 20 | 21 | for element in soup.findAll('div', attrs={'class':'card h-100'}): 22 | title = element.find('h5', attrs={'class':'card-title mt-0'}) 23 | author = element.find('div', attrs={'class':'author-names'}) 24 | meta = element.find('div', attrs={'class':'product-meta'}) 25 | if title is not None: 26 | print(title.contents[0].strip()) 27 | title_text = title.contents[0].strip() 28 | else: 29 | title_text = '' 30 | 31 | if author is not None: 32 | author_text = author.find('p').text 33 | else: 34 | author_text = '' 35 | 36 | if meta is not None: 37 | date_text = meta.findChild().text 38 | else: 39 | date_text = '' 40 | 41 | 42 | books.append(title_text) 43 | authors.append(author_text) 44 | dates.append(date_text) 45 | 46 | df = pd.DataFrame({'Book title':books,'Author':authors,'Date':dates}) 
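# Sketch of a small cleanup step: after the CSV export on the next line the
# Chrome window opened by webdriver.Chrome() at the top of this script is still
# running; calling
#
#   driver.quit()
#
# as the last statement would close the browser and release chromedriver.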
47 | df.to_csv('books.csv', index=False, encoding='utf-8') 48 | -------------------------------------------------------------------------------- /chapter3/code/stack_overflow_tags.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | 3 | driver = webdriver.Chrome("chromedriver.exe") 4 | driver.get("https://stackoverflow.com/tags") 5 | tags = driver.find_elements_by_class_name("post-tag") 6 | for i in range(len(tags)): 7 | print(tags[i].text) -------------------------------------------------------------------------------- /chapter3/images/ajax_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/images/ajax_image.png -------------------------------------------------------------------------------- /chapter3/images/book_info.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/images/book_info.png -------------------------------------------------------------------------------- /chapter3/images/book_packit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/images/book_packit.png -------------------------------------------------------------------------------- /chapter3/images/books_details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/images/books_details.png -------------------------------------------------------------------------------- /chapter3/images/books_packit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/images/books_packit.png -------------------------------------------------------------------------------- /chapter3/images/converter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/images/converter.png -------------------------------------------------------------------------------- /chapter3/images/google_translate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/images/google_translate.png -------------------------------------------------------------------------------- /chapter3/images/selenium_methods.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/images/selenium_methods.png -------------------------------------------------------------------------------- /chapter3/images/xpath.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/images/xpath.png -------------------------------------------------------------------------------- /chapter4/BooksSpider-multipage-details/books_crawler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/BooksSpider-multipage-details/books_crawler/__init__.py -------------------------------------------------------------------------------- /chapter4/BooksSpider-multipage-details/books_crawler/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class BooksCrawlerItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /chapter4/BooksSpider-multipage-details/books_crawler/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class BooksCrawlerPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /chapter4/BooksSpider-multipage-details/books_crawler/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for books_crawler project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'books_crawler' 13 | 14 | SPIDER_MODULES = ['books_crawler.spiders'] 15 | NEWSPIDER_MODULE = 'books_crawler.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'books_crawler (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'books_crawler.middlewares.MyCustomSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'books_crawler.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'books_crawler.pipelines.SomePipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- 
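The settings above leave throttling at Scrapy's defaults and turn off robots.txt checking. If the crawl against books.toscrape.com needs to be slowed down, the commented blocks can simply be enabled; a minimal, illustrative combination (the values are an example, not taken from this project) would be:

DOWNLOAD_DELAY = 0.5
CONCURRENT_REQUESTS_PER_DOMAIN = 8
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5
AUTOTHROTTLE_MAX_DELAY = 60

With AutoThrottle enabled, DOWNLOAD_DELAY acts as the minimum delay and Scrapy adapts the actual request rate to the latency of the remote server.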
/chapter4/BooksSpider-multipage-details/books_crawler/spiders/BooksSpider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy import Spider 3 | from scrapy.http import Request 4 | 5 | 6 | def product_info(response, value): 7 | return response.xpath('//th[text()="' + value + '"]/following-sibling::td/text()').extract_first() 8 | 9 | 10 | class BooksSpider(Spider): 11 | name = 'BooksSpider' 12 | allowed_domains = ['books.toscrape.com'] 13 | start_urls = ['http://books.toscrape.com'] 14 | 15 | def parse(self, response): 16 | books = response.xpath('//h3/a/@href').extract() 17 | for book in books: 18 | absolute_url = response.urljoin(book) 19 | yield Request(absolute_url, callback=self.parse_book) 20 | 21 | # process next page 22 | next_page_url = response.xpath('//a[text()="next"]/@href').extract_first() 23 | absolute_next_page_url = response.urljoin(next_page_url) 24 | yield Request(absolute_next_page_url) 25 | 26 | def parse_book(self, response): 27 | title = response.css('h1::text').extract_first() 28 | price = response.xpath('//*[@class="price_color"]/text()').extract_first() 29 | 30 | image_url = response.xpath('//img/@src').extract_first() 31 | image_url = image_url.replace('../..', 'http://books.toscrape.com/') 32 | 33 | rating = response.xpath('//*[contains(@class, "star-rating")]/@class').extract_first() 34 | rating = rating.replace('star-rating ', '') 35 | 36 | description = response.xpath( 37 | '//*[@id="product_description"]/following-sibling::p/text()').extract_first() 38 | 39 | # book information data 40 | product_type = product_info(response, 'Product Type') 41 | price_without_tax = product_info(response, 'Price (excl. tax)') 42 | price_with_tax = product_info(response, 'Price (incl. tax)') 43 | tax = product_info(response, 'Tax') 44 | availability = product_info(response, 'Availability') 45 | number_of_reviews = product_info(response, 'Number of reviews') 46 | 47 | yield { 48 | 'title': title, 49 | 'price': price, 50 | 'image_url': image_url, 51 | 'rating': rating, 52 | 'description': description, 53 | 'product_type': product_type, 54 | 'price_without_tax': price_without_tax, 55 | 'price_with_tax': price_with_tax, 56 | 'tax': tax, 57 | 'availability': availability, 58 | 'number_of_reviews': number_of_reviews 59 | } -------------------------------------------------------------------------------- /chapter4/BooksSpider-multipage-details/books_crawler/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
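# A short sketch of running the BooksSpider defined above without the command
# line (assuming it is executed from this project's root directory):
#
#   from scrapy.crawler import CrawlerProcess
#   from scrapy.utils.project import get_project_settings
#   from books_crawler.spiders.BooksSpider import BooksSpider
#
#   process = CrawlerProcess(get_project_settings())
#   process.crawl(BooksSpider)
#   process.start()  # blocks until the crawl finishes
#
# This does the same job as "scrapy crawl BooksSpider"; adding -o output.json to
# the command-line version also writes the scraped items to a JSON feed.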
5 | -------------------------------------------------------------------------------- /chapter4/BooksSpider-multipage-details/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = books_crawler.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = books_crawler 12 | -------------------------------------------------------------------------------- /chapter4/BooksSpider-urls/books_crawler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/BooksSpider-urls/books_crawler/__init__.py -------------------------------------------------------------------------------- /chapter4/BooksSpider-urls/books_crawler/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class BooksCrawlerItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /chapter4/BooksSpider-urls/books_crawler/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class BooksCrawlerPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /chapter4/BooksSpider-urls/books_crawler/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for books_crawler project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'books_crawler' 13 | 14 | SPIDER_MODULES = ['books_crawler.spiders'] 15 | NEWSPIDER_MODULE = 'books_crawler.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'books_crawler (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'books_crawler.middlewares.MyCustomSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'books_crawler.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | #ITEM_PIPELINES = { 68 | # 'books_crawler.pipelines.SomePipeline': 300, 69 | #} 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /chapter4/BooksSpider-urls/books_crawler/spiders/BooksSpider.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy import Spider 3 | from scrapy.http import Request 4 | 5 | 6 | class BooksSpider(Spider): 7 | name = 'BooksSpider' 8 | allowed_domains = ['books.toscrape.com'] 9 | start_urls = ['http://books.toscrape.com'] 10 | 11 | def parse(self, response): 12 | books = response.xpath('//h3/a/@href').extract() 13 | for book in books: 14 | absolute_url = response.urljoin(book) 15 | yield Request(absolute_url, callback=self.parse_book) 16 | 17 | # process next page 18 | next_page_url = response.xpath('//a[text()="next"]/@href').extract_first() 19 | absolute_next_page_url = response.urljoin(next_page_url) 20 | yield Request(absolute_next_page_url) 21 | 22 | def parse_book(self, response): 23 | yield { 'book_url': response.url} 24 | -------------------------------------------------------------------------------- /chapter4/BooksSpider-urls/books_crawler/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /chapter4/BooksSpider-urls/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = books_crawler.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = books_crawler 12 | -------------------------------------------------------------------------------- /chapter4/BooksSpider-urls_download_images/books_crawler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/BooksSpider-urls_download_images/books_crawler/__init__.py -------------------------------------------------------------------------------- /chapter4/BooksSpider-urls_download_images/books_crawler/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | class BooksCrawlerItem(scrapy.Item): 11 | title = scrapy.Field() 12 | price = scrapy.Field() 13 | 14 | image_urls = scrapy.Field() 15 | images = scrapy.Field() 16 | -------------------------------------------------------------------------------- /chapter4/BooksSpider-urls_download_images/books_crawler/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class BooksCrawlerPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /chapter4/BooksSpider-urls_download_images/books_crawler/settings.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for books_crawler project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'books_crawler' 13 | 14 | SPIDER_MODULES = ['books_crawler.spiders'] 15 | NEWSPIDER_MODULE = 'books_crawler.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'books_crawler (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'books_crawler.middlewares.MyCustomSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'books_crawler.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'scrapy.pipelines.images.ImagesPipeline': 1, 69 | } 70 | IMAGES_STORE = './images_store' 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | #AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | #AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | #AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | #AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | #HTTPCACHE_ENABLED = True 88 | #HTTPCACHE_EXPIRATION_SECS = 0 89 | 
#HTTPCACHE_DIR = 'httpcache' 90 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | -------------------------------------------------------------------------------- /chapter4/BooksSpider-urls_download_images/books_crawler/spiders/BooksSpider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy import Spider 3 | from scrapy.http import Request 4 | from scrapy.loader import ItemLoader 5 | from books_crawler.items import BooksCrawlerItem 6 | 7 | 8 | class BooksSpider(Spider): 9 | name = 'BooksSpider' 10 | allowed_domains = ['books.toscrape.com'] 11 | start_urls = ['http://books.toscrape.com'] 12 | 13 | def parse(self, response): 14 | books = response.xpath('//h3/a/@href').extract() 15 | for book in books: 16 | absolute_url = response.urljoin(book) 17 | yield Request(absolute_url, callback=self.parse_book) 18 | 19 | # process next page 20 | next_page_url = response.xpath('//a[text()="next"]/@href').extract_first() 21 | absolute_next_page_url = response.urljoin(next_page_url) 22 | yield Request(absolute_next_page_url) 23 | 24 | def parse_book(self, response): 25 | item_loader = ItemLoader(item=BooksCrawlerItem(), response=response) 26 | 27 | title = response.css('h1::text').extract_first() 28 | price = response.xpath('//*[@class="price_color"]/text()').extract_first() 29 | 30 | image_urls = response.xpath('//img/@src').extract_first() 31 | image_urls = image_urls.replace('../..', 'http://books.toscrape.com/') 32 | 33 | item_loader.add_value('title', title) 34 | item_loader.add_value('price', price) 35 | item_loader.add_value('image_urls', image_urls) 36 | 37 | return item_loader.load_item() 38 | 39 | -------------------------------------------------------------------------------- /chapter4/BooksSpider-urls_download_images/books_crawler/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
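# Note on the image-downloading variant above: the stock
# scrapy.pipelines.images.ImagesPipeline (enabled in this project's settings.py
# together with IMAGES_STORE = './images_store', and requiring Pillow) expects
# the 'image_urls' field to be a list of absolute URLs; the ItemLoader.add_value()
# call in BooksSpider stores the single cover URL as a one-element list, so the
# pipeline can download it and record the result in the 'images' field.
# A hypothetical sketch of customising where each file ends up (not part of this
# project):
#
#   from scrapy.pipelines.images import ImagesPipeline
#
#   class BookCoverPipeline(ImagesPipeline):
#       def file_path(self, request, response=None, info=None, *, item=None):
#           # keep the original file name instead of the default SHA1-based path
#           return 'covers/' + request.url.split('/')[-1]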
5 | -------------------------------------------------------------------------------- /chapter4/BooksSpider-urls_download_images/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = books_crawler.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = books_crawler 12 | -------------------------------------------------------------------------------- /chapter4/europython/europython/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/europython/europython/__init__.py -------------------------------------------------------------------------------- /chapter4/europython/europython/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/europython/europython/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /chapter4/europython/europython/__pycache__/items.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/europython/europython/__pycache__/items.cpython-37.pyc -------------------------------------------------------------------------------- /chapter4/europython/europython/__pycache__/pipelines.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/europython/europython/__pycache__/pipelines.cpython-37.pyc -------------------------------------------------------------------------------- /chapter4/europython/europython/__pycache__/settings.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/europython/europython/__pycache__/settings.cpython-37.pyc -------------------------------------------------------------------------------- /chapter4/europython/europython/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | from scrapy.loader.processors import Compose, MapCompose, Join 10 | 11 | clean_text = Compose(MapCompose(lambda v: v.strip()), Join()) 12 | 13 | def custom_field(text): 14 | text = clean_text(text) 15 | return text.strip() 16 | 17 | class EuropythonItem(scrapy.Item): 18 | # define the fields for your item here like: 19 | # name = scrapy.Field() 20 | title = scrapy.Field(output_processor=custom_field) 21 | author = scrapy.Field(output_processor=custom_field) 22 | description = scrapy.Field(output_processor=custom_field) 23 | date = 
scrapy.Field(output_processor=custom_field) 24 | tags = scrapy.Field(output_processor=custom_field) 25 | -------------------------------------------------------------------------------- /chapter4/europython/europython/middlewares.py: -------------------------------------------------------------------------------- 1 | # Importing base64 library because we'll need it ONLY in case if the proxy we are going to use requires authentication 2 | import base64 3 | 4 | # Start your middleware class 5 | class ProxyMiddleware(object): 6 | # overwrite process request 7 | def process_request(self, request, spider): 8 | # Set the location of the proxy 9 | request.meta['proxy'] = "proxy_server" 10 | 11 | # Use the following lines if your proxy requires authentication 12 | proxy_user_pass = "user:password" 13 | # setup basic authentication for the proxy 14 | encoded_user_pass = base64.encodestring(proxy_user_pass) 15 | request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass 16 | -------------------------------------------------------------------------------- /chapter4/europython/europython/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import scrapy 9 | from scrapy import signals 10 | from scrapy.exporters import CsvItemExporter 11 | from scrapy.exporters import XmlItemExporter 12 | import codecs 13 | import json 14 | import csv 15 | 16 | class EuropythonJsonExport(object): 17 | def __init__(self): 18 | self.file = codecs.open('europython_items.json', 'w+b', encoding='utf-8') 19 | 20 | def process_item(self, item, spider): 21 | line = json.dumps(dict(item), ensure_ascii=False) + "\n" 22 | self.file.write(line) 23 | return item 24 | 25 | def spider_closed(self, spider): 26 | self.file.close() 27 | 28 | class EuropythonXmlExport(object): 29 | 30 | def __init__(self): 31 | self.files = {} 32 | 33 | @classmethod 34 | def from_crawler(cls, crawler): 35 | pipeline = cls() 36 | crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) 37 | crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) 38 | return pipeline 39 | 40 | def spider_opened(self, spider): 41 | file = open('europython_items.xml', 'w+b') 42 | self.files[spider] = file 43 | self.exporter = XmlItemExporter(file) 44 | self.exporter.start_exporting() 45 | 46 | def spider_closed(self, spider): 47 | self.exporter.finish_exporting() 48 | file = self.files.pop(spider) 49 | file.close() 50 | 51 | def process_item(self, item, spider): 52 | self.exporter.export_item(item) 53 | return item 54 | 55 | class EuropythonCSVExport(object): 56 | 57 | def __init__(self): 58 | self.files = {} 59 | 60 | @classmethod 61 | def from_crawler(cls, crawler): 62 | pipeline = cls() 63 | crawler.signals.connect(pipeline.spider_opened, signals.spider_opened) 64 | crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) 65 | return pipeline 66 | 67 | def spider_opened(self, spider): 68 | file = open('europython_items.csv', 'w+b') 69 | self.files[spider] = file 70 | self.exporter = CsvItemExporter(file) 71 | self.exporter.start_exporting() 72 | 73 | def spider_closed(self, spider): 74 | self.exporter.finish_exporting() 75 | file = self.files.pop(spider) 76 | file.close() 77 | 78 | def process_item(self, item, spider): 79 | self.exporter.export_item(item) 80 | 
return item 81 | -------------------------------------------------------------------------------- /chapter4/europython/europython/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for europython project 4 | # 5 | # For simplicity, this file contains only the most important settings by 6 | # default. All the other settings are documented here: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # 10 | 11 | BOT_NAME = 'europython' 12 | 13 | SPIDER_MODULES = ['europython.spiders'] 14 | NEWSPIDER_MODULE = 'europython.spiders' 15 | 16 | 17 | # Configure item pipelines 18 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 19 | ITEM_PIPELINES = { 20 | 'europython.pipelines.EuropythonJsonExport': 100, 21 | 'europython.pipelines.EuropythonXmlExport': 200, 22 | 'europython.pipelines.EuropythonCSVExport': 300, 23 | } 24 | 25 | DOWNLOADER_MIDDLEWARES = { 26 | 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110, 27 | #'europython.middlewares.ProxyMiddleware': 100, 28 | } 29 | 30 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 31 | #USER_AGENT = 'europython (+http://www.yourdomain.com)' 32 | -------------------------------------------------------------------------------- /chapter4/europython/europython/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /chapter4/europython/europython/spiders/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/europython/europython/spiders/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /chapter4/europython/europython/spiders/__pycache__/europython_spider.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/europython/europython/spiders/__pycache__/europython_spider.cpython-37.pyc -------------------------------------------------------------------------------- /chapter4/europython/europython/spiders/europython_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.spiders import CrawlSpider, Rule 4 | from scrapy.linkextractors import LinkExtractor 5 | from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor 6 | from scrapy.loader import ItemLoader 7 | 8 | from europython.items import EuropythonItem 9 | 10 | 11 | class EuropythonSpider(CrawlSpider): 12 | def __init__(self, year='', *args, **kwargs): 13 | super(EuropythonSpider, self).__init__(*args, **kwargs) 14 | self.year = year 15 | self.start_urls = ['http://ep'+str(self.year)+".europython.eu/en/events/sessions"] 16 | print('start url: '+str(self.start_urls[0])) 17 | 18 | name = "europython_spider" 19 | allowed_domains = ["ep2015.europython.eu","ep2016.europython.eu", 
"ep2017.europython.eu","ep2018.europython.eu","ep2019.europython.eu"] 20 | 21 | # Pattern for entries that match the conference/talks and /talks format 22 | rules = [Rule(LxmlLinkExtractor(allow=['conference/talks']),callback='process_response'), 23 | Rule(LxmlLinkExtractor(allow=['talks']),callback='process_response_europython2019')] 24 | 25 | def process_response(self, response): 26 | itemLoader = ItemLoader(item=EuropythonItem(), response=response) 27 | itemLoader.add_xpath('title', "//div[contains(@class, 'grid-100')]//h1/text()") 28 | itemLoader.add_xpath('author', "//div[contains(@class, 'talk-speakers')]//a[1]/text()") 29 | itemLoader.add_xpath('description', "//div[contains(@class, 'cms')]//p//text()") 30 | itemLoader.add_xpath('date', "//section[contains(@class, 'talk when')]/strong/text()") 31 | itemLoader.add_xpath('tags', "//div[contains(@class, 'all-tags')]/span/text()") 32 | item = itemLoader.load_item() 33 | return item 34 | 35 | def process_response_europython2019(self, response): 36 | item = EuropythonItem() 37 | print(response) 38 | item['title'] = response.xpath("//*[@id='talk_page']/div/div/div[1]/h1/text()").extract() 39 | item['author'] = response.xpath("//*[@id='talk_page']/div/div/div[1]/h5/a/text()").extract() 40 | item['description'] = response.xpath("//*[@id='talk_page']/div/div/div[1]/p[3]/text()").extract() 41 | item['date'] = "July 2019" 42 | item['tags'] = response.xpath("//span[contains(@class, 'badge badge-secondary')]/text()").extract() 43 | 44 | return item -------------------------------------------------------------------------------- /chapter4/europython/scrapinghub.yml: -------------------------------------------------------------------------------- 1 | project: 366126 2 | -------------------------------------------------------------------------------- /chapter4/europython/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = europython.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = europython 12 | 13 | 14 | -------------------------------------------------------------------------------- /chapter4/europython/setup.py: -------------------------------------------------------------------------------- 1 | # Automatically created by: shub deploy 2 | 3 | from setuptools import setup, find_packages 4 | 5 | setup( 6 | name = 'project', 7 | version = '1.0', 8 | packages = find_packages(), 9 | entry_points = {'scrapy': ['settings = europython.settings']}, 10 | ) 11 | -------------------------------------------------------------------------------- /chapter4/images/book_details.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/book_details.png -------------------------------------------------------------------------------- /chapter4/images/books_images.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/books_images.png -------------------------------------------------------------------------------- /chapter4/images/books_images_output.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/books_images_output.png -------------------------------------------------------------------------------- /chapter4/images/europython_talk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/europython_talk.png -------------------------------------------------------------------------------- /chapter4/images/next_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/next_page.png -------------------------------------------------------------------------------- /chapter4/images/scrapy_books.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/scrapy_books.png -------------------------------------------------------------------------------- /chapter4/images/scrapy_books_links.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/scrapy_books_links.png -------------------------------------------------------------------------------- /chapter4/images/scrapy_options.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/scrapy_options.png -------------------------------------------------------------------------------- /chapter4/images/scrapy_project.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/scrapy_project.png -------------------------------------------------------------------------------- /chapter4/images/scrapy_shell.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/scrapy_shell.png -------------------------------------------------------------------------------- /chapter4/images/scrapy_shell2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/scrapy_shell2.png -------------------------------------------------------------------------------- /chapter4/output.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"URL": "http://books.toscrape.com/index.html", "image_link": ["media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg", "media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg", "media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg", 
"media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg", "media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg", "media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.jpg", "media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg", "media/cache/3d/54/3d54940e57e662c4dd1f3ff00c78cc64.jpg", "media/cache/66/88/66883b91f6804b2323c8369331cb7dd1.jpg", "media/cache/58/46/5846057e28022268153beff6d352b06c.jpg", "media/cache/be/f4/bef44da28c98f905a3ebec0b87be8530.jpg", "media/cache/10/48/1048f63d3b5061cd2f424d20b3f9b666.jpg", "media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg", "media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg", "media/cache/81/c4/81c4a973364e17d01f217e1188253d5e.jpg", "media/cache/54/60/54607fe8945897cdcced0044103b10b6.jpg", "media/cache/55/33/553310a7162dfbc2c6d19a84da0df9e1.jpg", "media/cache/09/a3/09a3aef48557576e1a85ba7efea8ecb7.jpg", "media/cache/0b/bc/0bbcd0a6f4bcd81ccb1049a52736406e.jpg", "media/cache/27/a5/27a53d0bb95bdd88288eaf66c9230d7e.jpg"]}, 3 | {"URL": "http://books.toscrape.com/catalogue/category/books/travel_2/index.html", "image_link": ["../../../../media/cache/27/a5/27a53d0bb95bdd88288eaf66c9230d7e.jpg", "../../../../media/cache/57/77/57770cac1628f4407636635f4b85e88c.jpg", "../../../../media/cache/9a/7e/9a7e63f12829df4b43b31d110bf3dc2e.jpg", "../../../../media/cache/d5/bf/d5bf0090470b0b8ea46d9c166f7895aa.jpg", "../../../../media/cache/98/c2/98c2e95c5fd1a4e7cd5f2b63c52826cb.jpg", "../../../../media/cache/4e/15/4e15150388702ebca2c5a523ac270539.jpg", "../../../../media/cache/76/de/76de41867f323d7f1f4fbe2fdfc1b2ba.jpg", "../../../../media/cache/db/46/db46159b05faa5d95262112bf9c29ddd.jpg", "../../../../media/cache/e0/4f/e04f8eda2a2fa947aec17640202d9ab0.jpg", "../../../../media/cache/06/81/0681530a7bc301caf5c3257e1b0f0750.jpg", "../../../../media/cache/d7/0f/d70f7edd92705c45a82118c3ff6c299d.jpg"]}, 4 | {"URL": "http://books.toscrape.com/catalogue/category/books/classics_6/index.html", "image_link": ["../../../../media/cache/c5/46/c5465a06182ed6ebfa40d049258a2f58.jpg", "../../../../media/cache/4a/1b/4a1b6e9c1af75db0dc34ae63344f6883.jpg", "../../../../media/cache/45/bb/45bb59d19eb3aa868293d44809078418.jpg", "../../../../media/cache/1f/b0/1fb03cdabe6001c8a2620f65e025cbd5.jpg", "../../../../media/cache/81/f5/81f559ebe403317226fa8b611e35ce8a.jpg", "../../../../media/cache/27/82/2782701b5c877cb063065b9fc14c5b13.jpg", "../../../../media/cache/e3/c4/e3c4aba2409bb769a6488805e3fc4709.jpg", "../../../../media/cache/10/db/10db56354b4550d92270c6f097d9bebc.jpg", "../../../../media/cache/93/4e/934e966c1ddf559d3ac2b5c1407aaf1e.jpg", "../../../../media/cache/a6/72/a67245346daa38c2b23a4fc64c6e7115.jpg", "../../../../media/cache/42/c4/42c48f11b7e70a0f76c5ba9cb5c5018a.jpg", "../../../../media/cache/dd/6e/dd6e7b84e99f3b4b5655ea0db74af2b4.jpg", "../../../../media/cache/21/bf/21bf2eb0bff3134837def8bd40845ba0.jpg", "../../../../media/cache/ab/16/ab16eb035cc58809a73c4699477de9cb.jpg", "../../../../media/cache/c0/78/c078355608dd81c7c5e4f5e1c5f73d23.jpg", "../../../../media/cache/7d/53/7d53e2264b9647ee307259be9f73585d.jpg", "../../../../media/cache/0f/ca/0fca4597765ffacdb7bd529fc5eb88fa.jpg", "../../../../media/cache/09/63/09638baaef52f03827c215029c632a13.jpg", "../../../../media/cache/96/ee/96ee77d71a31b7694dac6855f6affe4e.jpg"]}, 5 | {"URL": "http://books.toscrape.com/catalogue/category/books/philosophy_7/index.html", "image_link": ["../../../../media/cache/65/71/6571919836ec51ed54f0050c31d8a0cd.jpg", "../../../../media/cache/71/df/71df730cf38c232ee58a2e407135f055.jpg", 
"../../../../media/cache/ea/04/ea0476a6f4c318ceccf5e2f2b39f2b15.jpg", "../../../../media/cache/3f/ef/3fef12d9da503693af12997c0ea0897f.jpg", "../../../../media/cache/05/ce/05ce699eaf78c0fae20308497c4f496a.jpg", "../../../../media/cache/de/76/de76d5c473c358bd41c03cf710692bfb.jpg", "../../../../media/cache/12/6e/126ef8f6473b81808ebbb9cff155e883.jpg", "../../../../media/cache/91/e6/91e6190dcdd7d6cdeb94a82b60917ec4.jpg", "../../../../media/cache/f0/aa/f0aa9ae0319b1d6e0706e6053020e696.jpg", "../../../../media/cache/df/c9/dfc9ed72e963572d23233b3a8cb01676.jpg", "../../../../media/cache/ab/45/ab45f300aa15066ad1260d6f1398d03e.jpg"]}, 6 | {"URL": "http://books.toscrape.com/catalogue/category/books/sequential-art_5/index.html", "image_link": ["../../../../media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg", "../../../../media/cache/36/df/36df4caaf1420b1183a8235355d39e69.jpg", "../../../../media/cache/c4/dd/c4ddd9ced89966b0602ec85e00cd5b61.jpg", "../../../../media/cache/f4/79/f479de5f305c2ac0512702cf7155bb74.jpg", "../../../../media/cache/e1/ea/e1ea6cb36e62ae6dc7b805f68ab9a700.jpg", "../../../../media/cache/f3/ef/f3efd43ae0fa85d9b325d5e8783e7af5.jpg", "../../../../media/cache/78/0b/780b2c28122750c2c383846155815bf7.jpg", "../../../../media/cache/c8/2f/c82f629a31b3f47bdb17ac14aa51076d.jpg", "../../../../media/cache/01/72/01726c619a05114dca75bd840095016d.jpg", "../../../../media/cache/cb/00/cb004189f548d75ad430d3ed19e6daa9.jpg", "../../../../media/cache/03/88/03886a8502ca54dbce0d91c2568ab69d.jpg", "../../../../media/cache/d3/15/d3158e8d3546fb90cced3c1d44a92a34.jpg", "../../../../media/cache/7e/a0/7ea062007ef00107e3c16d336b41fab2.jpg", "../../../../media/cache/5f/b1/5fb1bf88dcfda795606745ce35be5975.jpg", "../../../../media/cache/aa/74/aa74004807e97a79aa084b5db329a99b.jpg", "../../../../media/cache/16/d4/16d443437126bf6d536a89312c1995a5.jpg", "../../../../media/cache/90/6f/906f0168b0e155a7077625499b1737b5.jpg", "../../../../media/cache/78/97/7897eea91c4a85aca58d925861d4afec.jpg", "../../../../media/cache/f6/88/f688a9d6a89fdf38e4e88439ee9eda69.jpg", "../../../../media/cache/dd/c9/ddc95df6754df8e71bf969c088056188.jpg"]}, 7 | {"URL": "http://books.toscrape.com/catalogue/category/books/womens-fiction_9/index.html", "image_link": ["../../../../media/cache/5f/72/5f72c8a0d5a7292e2929a354ec8a022f.jpg", "../../../../media/cache/16/e3/16e3ca741956485119251e7442a67e2e.jpg", "../../../../media/cache/ae/ac/aeac003461b89c7ef826251d940b2afc.jpg", "../../../../media/cache/bb/ee/bbeeab4c4ce572c0e9764e3a96c6d4a5.jpg", "../../../../media/cache/39/e3/39e33ebef2d7a35dd6899541eba8306d.jpg", "../../../../media/cache/27/b7/27b7f4ec590965b5acc15dc4b1376684.jpg", "../../../../media/cache/ac/ba/acba5e4e1813b8c1fff4890f1efef3ab.jpg", "../../../../media/cache/72/73/7273ff1bfe3b0a6aab7f54ddf9be7b44.jpg", "../../../../media/cache/a0/fa/a0fa38039f6a674a7c89dfe2be866259.jpg", "../../../../media/cache/13/8f/138f4cf84be250d08e1f5c1db3643dbc.jpg", "../../../../media/cache/63/5f/635fb981e464f7427787824b20a15e71.jpg", "../../../../media/cache/87/d3/87d34d376555dd0cb75030d1059cc144.jpg", "../../../../media/cache/6d/6d/6d6d5799190b4f9ef89f3bbc8b67d60d.jpg", "../../../../media/cache/72/f5/72f5ed312bc82afa386c9cd48d4e36dd.jpg", "../../../../media/cache/b2/df/b2df826432771838819db89c20e20609.jpg", "../../../../media/cache/db/34/db341aa83daa76cd9f9bd2c86ccb5dba.jpg", "../../../../media/cache/0c/32/0c329cbd2adf4e0dc825f892106673b2.jpg"]}, 8 | {"URL": "http://books.toscrape.com/catalogue/category/books/mystery_3/index.html", "image_link": 
["../../../../media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg", "../../../../media/cache/23/85/238570a1c284e730dbc737a7e631ae2b.jpg", "../../../../media/cache/89/b8/89b850edb01851a91f64ba114b96acb6.jpg", "../../../../media/cache/11/aa/11aaad48b5f15e262456ca65294084da.jpg", "../../../../media/cache/29/fe/29fe70b1b2e5a9ba61d4bd331255e19e.jpg", "../../../../media/cache/37/f1/37f118b4a56d866e1e8b563759d6966c.jpg", "../../../../media/cache/44/9e/449ed681142bc336646abee754e96639.jpg", "../../../../media/cache/3c/91/3c91d97266bd6dda322089695fb46daf.jpg", "../../../../media/cache/e8/c0/e8c0ba15066bab950ae161fd60949b9a.jpg", "../../../../media/cache/8f/a4/8fa41d6caa10e427356b8a590eb4d96b.jpg", "../../../../media/cache/23/52/2352718971d5e166fa9541a5a7d716fa.jpg", "../../../../media/cache/c3/8d/c38d65cd155b67ca025f0655bd1bb095.jpg", "../../../../media/cache/8b/bc/8bbc5ab4c3784b4d9b93eb0fd1fb6fd6.jpg", "../../../../media/cache/57/07/5707c3d5d4fd44d943d51730ba7d429a.jpg", "../../../../media/cache/d5/81/d58157866ea8f015a8e4c55b23b8c96f.jpg", "../../../../media/cache/fd/71/fd71fb07247bf911505a351c0670c6dc.jpg", "../../../../media/cache/90/0b/900bd2e60d56b6480a4e8eb2dddb46d6.jpg", "../../../../media/cache/c7/ab/c7abb5e32bd37118a87523dcee0a70a6.jpg", "../../../../media/cache/95/d7/95d7541679fcbd579b8a4f2b47231aaf.jpg", "../../../../media/cache/57/31/5731a5d46c2c1e88977eb5e6d1337a2e.jpg"]}, 9 | {"URL": "http://books.toscrape.com/catalogue/category/books/historical-fiction_4/index.html", "image_link": ["../../../../media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg", "../../../../media/cache/d6/58/d658a1485b130ff26ca5fb0d5975ed2e.jpg", "../../../../media/cache/82/96/8296f92b70fb1dafefecda92c1d51941.jpg", "../../../../media/cache/0d/cb/0dcb33d60b0e79adf8ab9842e697ea2e.jpg", "../../../../media/cache/0e/fe/0efe86960cdff718aed01a5c3f65b1c3.jpg", "../../../../media/cache/0f/c2/0fc21ec3489cb23116778ee84f425eca.jpg", "../../../../media/cache/96/41/964194a317f8ce5ed031bf4c9ceb43ab.jpg", "../../../../media/cache/7a/22/7a224a6e174af91950e9b124afe54e0e.jpg", "../../../../media/cache/16/57/16575316618bd7e922d5b0e0f87de2ca.jpg", "../../../../media/cache/6c/2e/6c2e764e3ea89859b52df8de4f12af7a.jpg", "../../../../media/cache/fc/80/fc80b999ff4b8ef24b7071f62d2bf6d1.jpg", "../../../../media/cache/62/fa/62fa1e72f06f05762db5d9cedf654153.jpg", "../../../../media/cache/be/7c/be7ce6fbc9a8e1a5a5b5c32e73cfd78a.jpg", "../../../../media/cache/6b/82/6b822681c4035131560d40dd3b5a6a2e.jpg", "../../../../media/cache/b7/ad/b7ad37d93d8401c84d7325aa645ff6d5.jpg", "../../../../media/cache/b5/d8/b5d813da01f2ccd7bcfe34e2b875e752.jpg", "../../../../media/cache/b2/8f/b28f211e50e74445ca071d4279d1080d.jpg", "../../../../media/cache/bf/fd/bffd473ab232c5f35e8c81bb927f1624.jpg", "../../../../media/cache/18/f7/18f7bf6366cd7a8b947fd790d808047b.jpg", "../../../../media/cache/bf/7a/bf7a5bc1d1ebac5e9b6fbb147828a123.jpg"]}, 10 | {"URL": "http://books.toscrape.com/catalogue/category/books_1/index.html", "image_link": ["../../../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg", "../../../media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg", "../../../media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg", "../../../media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg", "../../../media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg", "../../../media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.jpg", "../../../media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg", "../../../media/cache/3d/54/3d54940e57e662c4dd1f3ff00c78cc64.jpg", 
"../../../media/cache/66/88/66883b91f6804b2323c8369331cb7dd1.jpg", "../../../media/cache/58/46/5846057e28022268153beff6d352b06c.jpg", "../../../media/cache/be/f4/bef44da28c98f905a3ebec0b87be8530.jpg", "../../../media/cache/10/48/1048f63d3b5061cd2f424d20b3f9b666.jpg", "../../../media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg", "../../../media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg", "../../../media/cache/81/c4/81c4a973364e17d01f217e1188253d5e.jpg", "../../../media/cache/54/60/54607fe8945897cdcced0044103b10b6.jpg", "../../../media/cache/55/33/553310a7162dfbc2c6d19a84da0df9e1.jpg", "../../../media/cache/09/a3/09a3aef48557576e1a85ba7efea8ecb7.jpg", "../../../media/cache/0b/bc/0bbcd0a6f4bcd81ccb1049a52736406e.jpg", "../../../media/cache/27/a5/27a53d0bb95bdd88288eaf66c9230d7e.jpg"]}, 11 | {"URL": "http://books.toscrape.com/catalogue/category/books/romance_8/index.html", "image_link": ["../../../../media/cache/9c/2e/9c2e0eb8866b8e3f3b768994fd3d1c1a.jpg", "../../../../media/cache/44/cc/44ccc99c8f82c33d4f9d2afa4ef25787.jpg", "../../../../media/cache/1e/bb/1ebbbc3e2d3249b111033cfc40763b0b.jpg", "../../../../media/cache/c4/d1/c4d1517cc9370e292366b6132ca9ca36.jpg", "../../../../media/cache/cc/bd/ccbdae9e29b3594301528fa2c876ec29.jpg", "../../../../media/cache/28/99/28992d89f4abf54fba183fc8d074adf3.jpg", "../../../../media/cache/e9/f4/e9f4bc8cf5ffaea1504623c936e90a48.jpg", "../../../../media/cache/59/10/5910fbd8a95e8e9de9c660b71e0694e2.jpg", "../../../../media/cache/e9/25/e9250495a525eb203652ad9da85ccb8e.jpg", "../../../../media/cache/7e/67/7e67addd80caaf8a9f9e9daa9cf66bb2.jpg", "../../../../media/cache/0b/89/0b89c3b317d0f89da48356a0b5959c1e.jpg", "../../../../media/cache/ae/90/ae903f6f6d059954be4e85497dd76bf5.jpg", "../../../../media/cache/a6/4b/a64b3c559f59748bfdbbe75be3e16075.jpg", "../../../../media/cache/1d/78/1d78fe226e1adb9cb591fa21f8a9bf68.jpg", "../../../../media/cache/f0/e0/f0e0db3edcb14293a52b51929cc72979.jpg", "../../../../media/cache/8e/40/8e408552c2e7ee81cd60c03c79f604af.jpg", "../../../../media/cache/f7/a9/f7a90a63f66ac92cc280def001970ed2.jpg", "../../../../media/cache/40/16/4016ffba678f309171d8130135f6eb8e.jpg", "../../../../media/cache/3c/a2/3ca2e61181fc1122658af8f85354bae8.jpg", "../../../../media/cache/57/47/57472d9c6d483bee9c38c90bfa10b3ee.jpg"]}, 12 | {"URL": "http://books.toscrape.com/catalogue/category/books/health_47/index.html", "image_link": ["../../../../media/cache/ee/3e/ee3e219d23e73ba71c79b700f183aaed.jpg", "../../../../media/cache/62/3f/623f8e7f7432ce744f4318aae8166ce4.jpg", "../../../../media/cache/23/c2/23c2108ae81327c7f3fb0721976cba5e.jpg", "../../../../media/cache/4b/d4/4bd43108fb070ad8ebba9cdb00b14069.jpg"]}, 13 | {"URL": "http://books.toscrape.com/catalogue/category/books/novels_46/index.html", "image_link": ["../../../../media/cache/db/cc/dbcc9d63b73ce9058d53f36465dbe2b2.jpg"]}, 14 | {"URL": "http://books.toscrape.com/catalogue/category/books/short-stories_45/index.html", "image_link": ["../../../../media/cache/f4/cb/f4cb1f9c7280bf1fd05fe33d2816080f.jpg"]}, 15 | {"URL": "http://books.toscrape.com/catalogue/category/books/suspense_44/index.html", "image_link": ["../../../../media/cache/bb/1c/bb1c91883579f1f99fe6ebf13b92c1c1.jpg"]}, 16 | {"URL": "http://books.toscrape.com/catalogue/page-2.html", "image_link": ["../media/cache/5d/72/5d72709c6a7a9584a4d1cf07648bfce1.jpg", "../media/cache/5c/c8/5cc8e107246cb478960d4f0aba1e1c8e.jpg", "../media/cache/9f/59/9f59f01fa916a7bb8f0b28a4012179a4.jpg", "../media/cache/9c/2e/9c2e0eb8866b8e3f3b768994fd3d1c1a.jpg", 
"../media/cache/44/cc/44ccc99c8f82c33d4f9d2afa4ef25787.jpg", "../media/cache/af/6e/af6e796160fe63e0cf19d44395c7ddf2.jpg", "../media/cache/ef/0b/ef0bed08de4e083dba5e20fdb98d9c36.jpg", "../media/cache/d6/da/d6da0371958068bbaf39ea9c174275cd.jpg", "../media/cache/2e/98/2e98c332bf8563b584784971541c4445.jpg", "../media/cache/a5/41/a5416b9646aaa7287baa287ec2590270.jpg", "../media/cache/0f/7e/0f7ee69495c0df1d35723f012624a9f8.jpg", "../media/cache/38/c5/38c56fba316c07305643a8065269594e.jpg", "../media/cache/5d/7e/5d7ecde8e81513eba8a64c9fe000744b.jpg", "../media/cache/cf/bb/cfbb5e62715c6d888fd07794c9bab5d6.jpg", "../media/cache/65/71/6571919836ec51ed54f0050c31d8a0cd.jpg", "../media/cache/12/53/1253c21c5ef3c6d075c5fa3f5fecee6a.jpg", "../media/cache/f5/88/f5889d038f5d8e949b494d147c2dcf54.jpg", "../media/cache/23/85/238570a1c284e730dbc737a7e631ae2b.jpg", "../media/cache/e1/5c/e15c289ba58cea38519e1281e859f0c1.jpg", "../media/cache/e9/20/e9203b733126c4a0832a1c7885dc27cf.jpg"]}, 17 | {"URL": "http://books.toscrape.com/catalogue/its-only-the-himalayas_981/index.html", "image_link": ["../../media/cache/6d/41/6d418a73cc7d4ecfd75ca11d854041db.jpg", "../../media/cache/0b/bc/0bbcd0a6f4bcd81ccb1049a52736406e.jpg", "../../media/cache/09/a3/09a3aef48557576e1a85ba7efea8ecb7.jpg", "../../media/cache/55/33/553310a7162dfbc2c6d19a84da0df9e1.jpg", "../../media/cache/54/60/54607fe8945897cdcced0044103b10b6.jpg", "../../media/cache/81/c4/81c4a973364e17d01f217e1188253d5e.jpg", "../../media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg"]}, 18 | {"URL": "http://books.toscrape.com/catalogue/category/books/christian_43/index.html", "image_link": ["../../../../media/cache/cd/db/cddb3eb483ef11a088d519205b7098fb.jpg", "../../../../media/cache/03/f1/03f1e337afadba35687672b5625a9757.jpg", "../../../../media/cache/a2/f5/a2f5b5fd4421d56d37c73a7fb29f5f40.jpg"]}, 19 | {"URL": "http://books.toscrape.com/catalogue/libertarianism-for-beginners_982/index.html", "image_link": ["../../media/cache/91/a4/91a46253e165d144ef5938f2d456b88f.jpg", "../../media/cache/09/a3/09a3aef48557576e1a85ba7efea8ecb7.jpg", "../../media/cache/55/33/553310a7162dfbc2c6d19a84da0df9e1.jpg", "../../media/cache/54/60/54607fe8945897cdcced0044103b10b6.jpg", "../../media/cache/81/c4/81c4a973364e17d01f217e1188253d5e.jpg", "../../media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg", "../../media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg"]}, 20 | {"URL": "http://books.toscrape.com/catalogue/category/books/historical_42/index.html", "image_link": ["../../../../media/cache/41/c3/41c37f7f0e03ee1144dd6fa89483b5d9.jpg", "../../../../media/cache/d0/b6/d0b6d59c0662dcbd15d47add40af1ebd.jpg"]}, 21 | {"URL": "http://books.toscrape.com/catalogue/mesaerion-the-best-science-fiction-stories-1800-1849_983/index.html", "image_link": ["../../media/cache/e8/1f/e81f850db9b9622c65619c9f15748de7.jpg", "../../media/cache/55/33/553310a7162dfbc2c6d19a84da0df9e1.jpg", "../../media/cache/54/60/54607fe8945897cdcced0044103b10b6.jpg", "../../media/cache/81/c4/81c4a973364e17d01f217e1188253d5e.jpg", "../../media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg", "../../media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg", "../../media/cache/10/48/1048f63d3b5061cd2f424d20b3f9b666.jpg"]}, 22 | {"URL": "http://books.toscrape.com/catalogue/olio_984/index.html", "image_link": ["../../media/cache/b1/0e/b10eabab1e1c811a6d47969904fd5755.jpg", "../../media/cache/54/60/54607fe8945897cdcced0044103b10b6.jpg", "../../media/cache/81/c4/81c4a973364e17d01f217e1188253d5e.jpg", 
"../../media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg", "../../media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg", "../../media/cache/10/48/1048f63d3b5061cd2f424d20b3f9b666.jpg", "../../media/cache/be/f4/bef44da28c98f905a3ebec0b87be8530.jpg"]}, 23 | {"URL": "http://books.toscrape.com/catalogue/our-band-could-be-your-life-scenes-from-the-american-indie-underground-1981-1991_985/index.html", "image_link": ["../../media/cache/ad/96/ad96e9c9f1664cbcb0e9627b007fb6f9.jpg", "../../media/cache/81/c4/81c4a973364e17d01f217e1188253d5e.jpg", "../../media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg", "../../media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg", "../../media/cache/10/48/1048f63d3b5061cd2f424d20b3f9b666.jpg", "../../media/cache/be/f4/bef44da28c98f905a3ebec0b87be8530.jpg", "../../media/cache/58/46/5846057e28022268153beff6d352b06c.jpg"]}, 24 | {"URL": "http://books.toscrape.com/catalogue/rip-it-up-and-start-again_986/index.html", "image_link": ["../../media/cache/81/7f/817f5089c0e6e62738dce2931e7323d3.jpg", "../../media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg", "../../media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg", "../../media/cache/10/48/1048f63d3b5061cd2f424d20b3f9b666.jpg", "../../media/cache/be/f4/bef44da28c98f905a3ebec0b87be8530.jpg", "../../media/cache/58/46/5846057e28022268153beff6d352b06c.jpg", "../../media/cache/66/88/66883b91f6804b2323c8369331cb7dd1.jpg"]}, 25 | {"URL": "http://books.toscrape.com/catalogue/scott-pilgrims-precious-little-life-scott-pilgrim-1_987/index.html", "image_link": ["../../media/cache/97/27/97275841c81e66d53bf9313cba06f23e.jpg", "../../media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg", "../../media/cache/10/48/1048f63d3b5061cd2f424d20b3f9b666.jpg", "../../media/cache/be/f4/bef44da28c98f905a3ebec0b87be8530.jpg", "../../media/cache/58/46/5846057e28022268153beff6d352b06c.jpg", "../../media/cache/66/88/66883b91f6804b2323c8369331cb7dd1.jpg", "../../media/cache/3d/54/3d54940e57e662c4dd1f3ff00c78cc64.jpg"]}, 26 | {"URL": "http://books.toscrape.com/catalogue/set-me-free_988/index.html", "image_link": ["../../media/cache/b8/e9/b8e91bd2fc74c3954118999238abb4b8.jpg", "../../media/cache/10/48/1048f63d3b5061cd2f424d20b3f9b666.jpg", "../../media/cache/be/f4/bef44da28c98f905a3ebec0b87be8530.jpg", "../../media/cache/58/46/5846057e28022268153beff6d352b06c.jpg", "../../media/cache/66/88/66883b91f6804b2323c8369331cb7dd1.jpg", "../../media/cache/3d/54/3d54940e57e662c4dd1f3ff00c78cc64.jpg", "../../media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg"]}, 27 | {"URL": "http://books.toscrape.com/catalogue/shakespeares-sonnets_989/index.html", "image_link": ["../../media/cache/4d/7a/4d7a79a8be80a529b277ed5c4d8ba482.jpg", "../../media/cache/be/f4/bef44da28c98f905a3ebec0b87be8530.jpg", "../../media/cache/58/46/5846057e28022268153beff6d352b06c.jpg", "../../media/cache/66/88/66883b91f6804b2323c8369331cb7dd1.jpg", "../../media/cache/3d/54/3d54940e57e662c4dd1f3ff00c78cc64.jpg", "../../media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg", "../../media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.jpg"]}, 28 | {"URL": "http://books.toscrape.com/catalogue/starving-hearts-triangular-trade-trilogy-1_990/index.html", "image_link": ["../../media/cache/a0/7e/a07ed8f1c23f7b4baf7102722680bd30.jpg", "../../media/cache/58/46/5846057e28022268153beff6d352b06c.jpg", "../../media/cache/66/88/66883b91f6804b2323c8369331cb7dd1.jpg", "../../media/cache/3d/54/3d54940e57e662c4dd1f3ff00c78cc64.jpg", "../../media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg", 
"../../media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.jpg", "../../media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg"]}, 29 | {"URL": "http://books.toscrape.com/catalogue/the-black-maria_991/index.html", "image_link": ["../../media/cache/d1/7a/d17a3e313e52e1be5651719e4fba1d16.jpg", "../../media/cache/66/88/66883b91f6804b2323c8369331cb7dd1.jpg", "../../media/cache/3d/54/3d54940e57e662c4dd1f3ff00c78cc64.jpg", "../../media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg", "../../media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.jpg", "../../media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg", "../../media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg"]}, 30 | {"URL": "http://books.toscrape.com/catalogue/the-dirty-little-secrets-of-getting-your-dream-job_994/index.html", "image_link": ["../../media/cache/e1/1b/e11bea016d0ae1d7e2dd46fb3cb870b7.jpg", "../../media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.jpg", "../../media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg", "../../media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg", "../../media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg", "../../media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg", "../../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"]}, 31 | {"URL": "http://books.toscrape.com/catalogue/the-requiem-red_995/index.html", "image_link": ["../../media/cache/6b/07/6b07b77236b7c80f42bd90bf325e69f6.jpg", "../../media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg", "../../media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg", "../../media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg", "../../media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg", "../../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"]}, 32 | {"URL": "http://books.toscrape.com/catalogue/the-boys-in-the-boat-nine-americans-and-their-epic-quest-for-gold-at-the-1936-berlin-olympics_992/index.html", "image_link": ["../../media/cache/d1/2d/d12d26739b5369a6b5b3024e4d08f907.jpg", "../../media/cache/3d/54/3d54940e57e662c4dd1f3ff00c78cc64.jpg", "../../media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg", "../../media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.jpg", "../../media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg", "../../media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg", "../../media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg"]}, 33 | {"URL": "http://books.toscrape.com/catalogue/the-coming-woman-a-novel-based-on-the-life-of-the-infamous-feminist-victoria-woodhull_993/index.html", "image_link": ["../../media/cache/97/36/9736132a43b8e6e3989932218ef309ed.jpg", "../../media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg", "../../media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.jpg", "../../media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg", "../../media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg", "../../media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg", "../../media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg"]}, 34 | {"URL": "http://books.toscrape.com/catalogue/sapiens-a-brief-history-of-humankind_996/index.html", "image_link": ["../../media/cache/ce/5f/ce5f052c65cc963cf4422be096e915c9.jpg", "../../media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg", "../../media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg", "../../media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg", "../../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"]}, 35 | {"URL": "http://books.toscrape.com/catalogue/sharp-objects_997/index.html", "image_link": 
["../../media/cache/c0/59/c05972805aa7201171b8fc71a5b00292.jpg", "../../media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg", "../../media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg", "../../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"]}, 36 | {"URL": "http://books.toscrape.com/catalogue/soumission_998/index.html", "image_link": ["../../media/cache/ee/cf/eecfe998905e455df12064dba399c075.jpg", "../../media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg", "../../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"]}, 37 | {"URL": "http://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html", "image_link": ["../../media/cache/08/e9/08e94f3731d7d6b760dfbfbc02ca5c62.jpg", "../../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"]}, 38 | {"URL": "http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html", "image_link": ["../../media/cache/fe/72/fe72f0532301ec28892ae79a629a293c.jpg"]}, 39 | {"URL": "http://books.toscrape.com/catalogue/category/books/crime_51/index.html", "image_link": ["../../../../media/cache/f2/e5/f2e51dd2b26600459f8eaeb6b9eecaa7.jpg"]}, 40 | {"URL": "http://books.toscrape.com/catalogue/category/books/erotica_50/index.html", "image_link": ["../../../../media/cache/6e/4e/6e4e8f4f4abd94356a9be840e4681e65.jpg"]}, 41 | {"URL": "http://books.toscrape.com/catalogue/category/books/cultural_49/index.html", "image_link": ["../../../../media/cache/52/46/524655fade1d9fe1475395a3eaff827a.jpg"]}, 42 | {"URL": "http://books.toscrape.com/catalogue/category/books/politics_48/index.html", "image_link": ["../../../../media/cache/0b/bc/0bbcd0a6f4bcd81ccb1049a52736406e.jpg", "../../../../media/cache/db/1b/db1babd3c09b84da800b0e9897fe0097.jpg", "../../../../media/cache/00/11/001153d2a22d889837efac1703e10a5e.jpg"]}, 43 | {"URL": "http://books.toscrape.com/catalogue/category/books/academic_40/index.html", "image_link": ["../../../../media/cache/d9/4e/d94e6206c2decd3acd9a61b2cbac7eaf.jpg"]}, 44 | {"URL": "http://books.toscrape.com/catalogue/category/books/self-help_41/index.html", "image_link": ["../../../../media/cache/ea/9b/ea9b2cb8abbb317402e618445bade1e1.jpg", "../../../../media/cache/da/8b/da8bc9b824dd3f446ef63e438ddbfc85.jpg", "../../../../media/cache/9c/da/9cda4893c7fce0c1c8eaa34fb092aa04.jpg", "../../../../media/cache/9e/15/9e15d7add5090ff2a17bd71ac96aa55a.jpg", "../../../../media/cache/4f/08/4f08f7948770912e4e340e10caa604cb.jpg"]}, 45 | {"URL": "http://books.toscrape.com/catalogue/category/books/spirituality_39/index.html", "image_link": ["../../../../media/cache/0f/7e/0f7ee69495c0df1d35723f012624a9f8.jpg", "../../../../media/cache/96/db/96db61bb53930c560fb4c1c62b583816.jpg", "../../../../media/cache/b7/6a/b76a73640d26b09c4a6f373b09050bed.jpg", "../../../../media/cache/87/fe/87fe3f7f3f62c1b1b81890578c9cf294.jpg", "../../../../media/cache/8b/10/8b102daec94d1ea9c6fc36dd3ec1c1fe.jpg", "../../../../media/cache/83/c8/83c834b3779be4e577c37ead6d2acf65.jpg"]}, 46 | {"URL": "http://books.toscrape.com/catalogue/category/books/contemporary_38/index.html", "image_link": ["../../../../media/cache/08/04/08044269fc197645268a6197c57e6173.jpg", "../../../../media/cache/e3/d0/e3d05227f3fc24f0e0c84ccebe108fb0.jpg", "../../../../media/cache/4d/18/4d1891e435c6692c864331c585e0d014.jpg"]}, 47 | {"URL": "http://books.toscrape.com/catalogue/category/books/thriller_37/index.html", "image_link": ["../../../../media/cache/5d/72/5d72709c6a7a9584a4d1cf07648bfce1.jpg", "../../../../media/cache/5d/7e/5d7ecde8e81513eba8a64c9fe000744b.jpg", 
"../../../../media/cache/e1/5c/e15c289ba58cea38519e1281e859f0c1.jpg", "../../../../media/cache/d6/97/d697268540fa982f4dce39f61ed3a342.jpg", "../../../../media/cache/76/de/76deee06ffe45e646c0113af01f4f401.jpg", "../../../../media/cache/d9/1a/d91aae72af6c1cb2c63163acabe7895c.jpg", "../../../../media/cache/8b/7c/8b7c73e075cc687b6890dc0dca9fcbcc.jpg", "../../../../media/cache/eb/e9/ebe9f06ccebf83d9853a846052b58fff.jpg", "../../../../media/cache/ee/d4/eed4d5d63d13f0aa86575c90f8ccacb7.jpg", "../../../../media/cache/87/54/8754267f27581996f93e8d94d3c04bf9.jpg", "../../../../media/cache/2a/a8/2aa8afd15f97617ab75f616766161cda.jpg"]}, 48 | {"URL": "http://books.toscrape.com/catalogue/category/books/biography_36/index.html", "image_link": ["../../../../media/cache/6f/d9/6fd92e5143cbd5bb8bcf034e5f007dde.jpg", "../../../../media/cache/8b/c4/8bc43a6b42d0283ab4bf611f1b497126.jpg", "../../../../media/cache/cc/a4/cca4e6a4cd5c207e7ce7d992ff464c3b.jpg", "../../../../media/cache/25/f8/25f869fa75340fca0fc2a68e8a0412a1.jpg", "../../../../media/cache/ff/d4/ffd45d95f314555e20c923d3522adea7.jpg"]}, 49 | {"URL": "http://books.toscrape.com/catalogue/category/books/business_35/index.html", "image_link": ["../../../../media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg", "../../../../media/cache/d0/77/d077a30042df6b916bfc8d257345c69e.jpg", "../../../../media/cache/82/93/82939ca78da0b724f16ec814849514fd.jpg", "../../../../media/cache/19/aa/19aa1184a3565b1dae6092146018e109.jpg", "../../../../media/cache/e2/2e/e22e4a82d97f9f0689d5295a98f5dcff.jpg", "../../../../media/cache/2d/fd/2dfdc52bcdbd82dee50372bc46c83e15.jpg", "../../../../media/cache/b3/7b/b37be83183f1dcb759d92bda8f8998a4.jpg", "../../../../media/cache/aa/67/aa677a97ecdcbbde7471f1c90ed0cf6f.jpg", "../../../../media/cache/11/2c/112c55a6bcd401c3bd603f5ddb2e6b82.jpg", "../../../../media/cache/18/f4/18f45d31e3892fee589e23f15d759ee3.jpg", "../../../../media/cache/39/f1/39f167dff90d7f84f5c8dc5e05d4051b.jpg", "../../../../media/cache/54/10/5410a58193e2373c04b3021ade78a82b.jpg"]}, 50 | {"URL": "http://books.toscrape.com/catalogue/category/books/christian-fiction_34/index.html", "image_link": ["../../../../media/cache/21/21/2121ba78e26194d92c334fde3850f840.jpg", "../../../../media/cache/fa/f6/faf6d69a42f477e1da80a71f05a4dc25.jpg", "../../../../media/cache/93/e0/93e0ec623673a8f83598c9aa7b6c94ec.jpg", "../../../../media/cache/17/e2/17e264d978942f73b859fa1c1d2cf827.jpg", "../../../../media/cache/32/2c/322c1f6cce6d5a69a7d2321779195a0c.jpg", "../../../../media/cache/c3/d0/c3d0f2fb5cacbca64639a679b962e1b9.jpg"]}, 51 | {"URL": "http://books.toscrape.com/catalogue/category/books/food-and-drink_33/index.html", "image_link": ["../../../../media/cache/9f/59/9f59f01fa916a7bb8f0b28a4012179a4.jpg", "../../../../media/cache/b7/f4/b7f4843dbe062d44be1ffcfa16b2faa4.jpg", "../../../../media/cache/f5/65/f565af3d9dd20a1ad72a1e7c4157387d.jpg", "../../../../media/cache/10/c6/10c61093002db1fec4089d8076678624.jpg", "../../../../media/cache/98/d1/98d1c979c4bac9e147a6718946578b0f.jpg", "../../../../media/cache/61/bd/61bdfe3950643c47d70c37c4123530f3.jpg", "../../../../media/cache/0d/1f/0d1f3f934460f5a50aaa8c366641234c.jpg", "../../../../media/cache/54/89/54899b4584e941ceced511d81092c88a.jpg", "../../../../media/cache/20/f2/20f28657b49f8cb24ed2ec6448bb6df3.jpg", "../../../../media/cache/c4/dc/c4dcec6f513eaca3f0f3c748d834c46d.jpg", "../../../../media/cache/fe/67/fe67c381d6a0c4c00a7c191d16939554.jpg", "../../../../media/cache/b8/38/b838b65e0e1ac3a9b498dfb1bf004420.jpg", 
"../../../../media/cache/74/aa/74aa29b1ba4147eaf5b46671bf235861.jpg", "../../../../media/cache/76/a1/76a1516c8d9c3e620626f30840013a85.jpg", "../../../../media/cache/5a/64/5a6499d41ccaad4c4f7eeaa90e16345a.jpg", "../../../../media/cache/98/19/9819ff3a8290dc6ab8797d00de5ec554.jpg", "../../../../media/cache/ae/5c/ae5ca435fb095e374d2c2aa9f7b6f380.jpg", "../../../../media/cache/d4/53/d453cfb6c08dbf76d200ffa858bc9979.jpg", "../../../../media/cache/1d/1f/1d1fbd89f0290275b9166877663ee9f5.jpg", "../../../../media/cache/e6/b6/e6b66353f9325518994dd8b564290fd7.jpg"]}, 52 | {"URL": "http://books.toscrape.com/catalogue/category/books/history_32/index.html", "image_link": ["../../../../media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg", "../../../../media/cache/4a/3b/4a3b055f9e378a95fedbef55e7bab7ce.jpg", "../../../../media/cache/2d/4e/2d4e358712e6c9f1d3bdd78d1a16e5a8.jpg", "../../../../media/cache/64/44/6444dacdcb9edaadbbd691524622aeb8.jpg", "../../../../media/cache/97/47/974709d437b08e74649b5744471bf472.jpg", "../../../../media/cache/3d/60/3d6003fc37b842a07c2dbe28e47448e1.jpg", "../../../../media/cache/41/d5/41d5fa6a81cdbcbe6b0b15757a4c9144.jpg", "../../../../media/cache/88/75/8875f384ce9103281b7f6e86a2b8204d.jpg", "../../../../media/cache/56/cb/56cb66d73fb438d64af14dce8bd8b22b.jpg", "../../../../media/cache/11/af/11af7fbd6aec06a75fe207fae92b17e0.jpg", "../../../../media/cache/3c/f6/3cf646523ff7fb8647c500d6325cfcaf.jpg", "../../../../media/cache/e1/02/e102cefae5bb523bc67eb6b49bc18b5d.jpg", "../../../../media/cache/72/f1/72f13b8f069d3a018d2c378be5a1de20.jpg", "../../../../media/cache/f2/64/f26457d65a03b2636c4bcc7c318f7346.jpg", "../../../../media/cache/cf/18/cf187c1dc5575fcbbf49c58024146c4b.jpg", "../../../../media/cache/eb/17/eb178eceef1e9290591cabd5155571a3.jpg", "../../../../media/cache/06/c8/06c897070611b78b80a37333cbb7851c.jpg", "../../../../media/cache/43/fd/43fda1db93163d67705264dcfa98aaa5.jpg"]}, 53 | {"URL": "http://books.toscrape.com/catalogue/category/books/horror_31/index.html", "image_link": ["../../../../media/cache/da/df/dadfac66a89774b46b10225362724c83.jpg", "../../../../media/cache/a7/4b/a74b35375ce874153fd352e33bc7bac9.jpg", "../../../../media/cache/6d/10/6d10387a0175701d4ff456a0c7eee67b.jpg", "../../../../media/cache/7a/72/7a72465b21dbf998323e37b31f9a3f4a.jpg", "../../../../media/cache/55/bf/55bfc858c1cb19867e41415532ae43c6.jpg", "../../../../media/cache/02/5c/025c30a378e2a4190e84f1429e81b803.jpg", "../../../../media/cache/0b/2f/0b2f432cc27132f688fcdf29618521e0.jpg", "../../../../media/cache/30/66/3066f8bcd2e2ed6b45084355ff084a61.jpg", "../../../../media/cache/13/ff/13fffcde653948339d3427184b7bd0b5.jpg", "../../../../media/cache/c0/02/c0029d48c2588e6d2a6a31c9f96088ba.jpg", "../../../../media/cache/2d/e0/2de0eff716ca13d12cf5420e88e1a8b3.jpg", "../../../../media/cache/7c/93/7c9302e392e128881e926d19f761da33.jpg", "../../../../media/cache/f7/b7/f7b73392b12909a1e8261ef3f96c5fd1.jpg", "../../../../media/cache/ee/d3/eed3afc5e444e3da5eec34e2b0036ec7.jpg", "../../../../media/cache/3a/7c/3a7c2393061031e7911d7b533b723391.jpg", "../../../../media/cache/41/c7/41c74d82b853606fe98182c417b4669c.jpg", "../../../../media/cache/14/25/142563ccee483bc07632f9c083a68326.jpg"]}, 54 | {"URL": "http://books.toscrape.com/catalogue/category/books/humor_30/index.html", "image_link": ["../../../../media/cache/46/bd/46bdee520b8136972262fd040533772d.jpg", "../../../../media/cache/df/5d/df5d172abe87deda6d533e3e908d27d8.jpg", "../../../../media/cache/ea/7b/ea7bcac4b27a5bf6d4f8125bb7af3361.jpg", 
"../../../../media/cache/b5/a9/b5a90d1c36a96513942f006345ace3d2.jpg", "../../../../media/cache/df/14/df1418baa09e00b877be35066084c9dc.jpg", "../../../../media/cache/4c/30/4c3041def6f29659e009f61e45e492b0.jpg", "../../../../media/cache/73/36/733662595aede2dff1a5be1e76a3b936.jpg", "../../../../media/cache/e7/12/e71268a559d73826aa64151d47357a12.jpg", "../../../../media/cache/a1/03/a10370da29e4ba78c7a75a14041eae0e.jpg", "../../../../media/cache/4b/9a/4b9a2a6d4c995e12fe216f6173a582be.jpg"]}, 55 | {"URL": "http://books.toscrape.com/catalogue/category/books/adult-fiction_29/index.html", "image_link": ["../../../../media/cache/18/d8/18d8e02c75c2ef23556c9746fae57e43.jpg"]}, 56 | {"URL": "http://books.toscrape.com/catalogue/category/books/parenting_28/index.html", "image_link": ["../../../../media/cache/7d/0b/7d0bb832760e81c281d8d283ba6a2b09.jpg"]}, 57 | {"URL": "http://books.toscrape.com/catalogue/category/books/autobiography_27/index.html", "image_link": ["../../../../media/cache/0a/15/0a1567cd04a6582d333db71337b4e2a6.jpg", "../../../../media/cache/d6/e8/d6e8258cee98f80727e99f7ac5aa1b88.jpg", "../../../../media/cache/e9/72/e972f8b4abaaa6f8f449479cd9d87be3.jpg", "../../../../media/cache/17/aa/17aacb738eace89a635a4eb47a94c11d.jpg", "../../../../media/cache/66/c7/66c7a1537c8901e1e4ec217d1956bae8.jpg", "../../../../media/cache/98/9f/989fe700e9e6bdec4fc3217daa5b7df3.jpg", "../../../../media/cache/61/ba/61ba5bc1ee3d8cb3dd350120ffa3f31e.jpg", "../../../../media/cache/80/b3/80b3e38be4204b3b64cdbe8c80dcf1f9.jpg", "../../../../media/cache/7a/58/7a587c5814f33c0c54e8bfa0ef66d690.jpg"]}, 58 | {"URL": "http://books.toscrape.com/catalogue/category/books/psychology_26/index.html", "image_link": ["../../../../media/cache/a6/c8/a6c8256b123493472591c5855c7de704.jpg", "../../../../media/cache/dc/4d/dc4d070e33813a07a4e02f069e6d482f.jpg", "../../../../media/cache/ee/a9/eea9e831f8964b4dc0190c84a1f9a1f6.jpg", "../../../../media/cache/00/29/002924b764dc367dcaa3486fa4c0aa0b.jpg", "../../../../media/cache/b4/a5/b4a56663d56f1e84ee1b15bd819563cc.jpg", "../../../../media/cache/4d/a6/4da6939a6bbd895a5acdeabad46d1f9f.jpg", "../../../../media/cache/b8/44/b844a77409f1d53cbb66148820abc217.jpg"]}, 59 | {"URL": "http://books.toscrape.com/catalogue/category/books/art_25/index.html", "image_link": ["../../../../media/cache/a5/41/a5416b9646aaa7287baa287ec2590270.jpg", "../../../../media/cache/f2/ee/f2ee668cf593ff13a9560c2801e9c2a2.jpg", "../../../../media/cache/ef/80/ef80e6100214c486562a73ce76444826.jpg", "../../../../media/cache/6a/55/6a55ccd4bc2383f5fe915fbef8bd5a23.jpg", "../../../../media/cache/58/a6/58a634c3231b5380544cc330536cb5ea.jpg", "../../../../media/cache/bb/36/bb364a10868756d1c0877c928b43b533.jpg", "../../../../media/cache/99/51/99511f4da1a4a2114e2ed12e6ba17b65.jpg", "../../../../media/cache/a8/3a/a83a4d31d30dc3cb26a29899a5c3b91d.jpg"]}, 60 | {"URL": "http://books.toscrape.com/catalogue/category/books/paranormal_24/index.html", "image_link": ["../../../../media/cache/4b/97/4b972f89c11900ac0e84726d1f07bfcc.jpg"]}, 61 | {"URL": "http://books.toscrape.com/catalogue/category/books/poetry_23/index.html", "image_link": ["../../../../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg", "../../../../media/cache/58/46/5846057e28022268153beff6d352b06c.jpg", "../../../../media/cache/10/48/1048f63d3b5061cd2f424d20b3f9b666.jpg", "../../../../media/cache/55/33/553310a7162dfbc2c6d19a84da0df9e1.jpg", "../../../../media/cache/e9/20/e9203b733126c4a0832a1c7885dc27cf.jpg", 
"../../../../media/cache/72/41/72417db983862010ef0c1a25de98c7d7.jpg", "../../../../media/cache/f9/3b/f93b4a650f03a5d21f2436d7813f42c2.jpg", "../../../../media/cache/38/64/386468a8c3e6b880664bf7885bf6f726.jpg", "../../../../media/cache/25/54/2554431c797ec725eea50b3f8a83758c.jpg", "../../../../media/cache/3f/41/3f4160ada0b16e3c64cd2d0dffe781c8.jpg", "../../../../media/cache/c8/f2/c8f297fab080ddd02b3ed5c17b83af85.jpg", "../../../../media/cache/93/d5/93d5c64abfad9ed6a0cb2e26f19f1a1e.jpg", "../../../../media/cache/36/5b/365b3ab7ab72a6258873716aef6d5c1a.jpg", "../../../../media/cache/b7/29/b7293f602efb0c17e305077f8175888a.jpg", "../../../../media/cache/31/c7/31c7c5ce7b04d227aa36ecb250b9dad5.jpg", "../../../../media/cache/7e/93/7e934132cd03486649fb492fe702f704.jpg", "../../../../media/cache/9f/35/9f351ca1978128c60a3b7f85987075b3.jpg", "../../../../media/cache/8f/46/8f46bb13feb3a4440a27dfcf688fbaa6.jpg", "../../../../media/cache/df/ab/dfab1d94f9190df7c13b63a093a6d16e.jpg"]}, 62 | {"URL": "http://books.toscrape.com/catalogue/category/books/science_22/index.html", "image_link": ["../../../../media/cache/d4/8d/d48d5122a15347e9fe2b15ad354d69bf.jpg", "../../../../media/cache/26/1c/261c4eaf957ae4aacf2229b482e76dbe.jpg", "../../../../media/cache/68/ca/68caaf9ac41964d5167a3eb67c638393.jpg", "../../../../media/cache/56/97/5697f2f8f628129df01c5790985ffd9b.jpg", "../../../../media/cache/5e/7f/5e7f7d9913d4c95d33904770c518d537.jpg", "../../../../media/cache/33/4f/334fd0ebdf0c0192baf5914d199c53b5.jpg", "../../../../media/cache/da/0d/da0d13699a090516502257a4d7da623f.jpg", "../../../../media/cache/08/a9/08a957eb34f8047862e225774c3bdde2.jpg", "../../../../media/cache/83/ab/83ab65f938b24fa1a9cb47235be49b57.jpg", "../../../../media/cache/69/c8/69c83860995cde393dbe6690ec3f1d4f.jpg", "../../../../media/cache/f9/69/f969969428b505970a46272fdcea00d3.jpg", "../../../../media/cache/f8/bc/f8bcd489d33473e0819beaecccd5ebac.jpg", "../../../../media/cache/c8/63/c863c222c130a1bc8685a1242dd2523d.jpg", "../../../../media/cache/08/14/0814f26516fb72b7391d0a742b5928a2.jpg"]}, 63 | {"URL": "http://books.toscrape.com/catalogue/category/books/young-adult_21/index.html", "image_link": ["../../../../media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.jpg", "../../../../media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg", "../../../../media/cache/5d/7f/5d7f496cdf5e5962a73ecdcc1505c1d5.jpg", "../../../../media/cache/fc/72/fc72f158554b4b4164701e1dfa1153c7.jpg", "../../../../media/cache/26/95/269507c7bb35d2cec9b61a03d1c28e67.jpg", "../../../../media/cache/12/f1/12f1963957f27fa83d51f76b183ef490.jpg", "../../../../media/cache/0f/d3/0fd306891f8fd3196653022fd67d6c87.jpg", "../../../../media/cache/18/08/18086e581ad354aa65f945c2b5c51350.jpg", "../../../../media/cache/f8/54/f85417465a73e33604624205ba8306cc.jpg", "../../../../media/cache/71/76/7176317f1915fa0658bb2fe400441207.jpg", "../../../../media/cache/bb/72/bb723ad463531c602ad8bcb244253bf3.jpg", "../../../../media/cache/19/cf/19cf50aea5bf0e8f4bc016f3745b3dfe.jpg", "../../../../media/cache/18/0b/180bfe1902cb3c0eb77d7c712efa2a96.jpg", "../../../../media/cache/1d/3c/1d3c05b772ab846c111970232360d2c5.jpg", "../../../../media/cache/46/6e/466e9636819aad1126ac6cefb5313ba8.jpg", "../../../../media/cache/b2/df/b2df2ea409c5cf28538b67aff424b11f.jpg", "../../../../media/cache/ad/ac/adac97366586d261feab30bf5220756e.jpg", "../../../../media/cache/61/1a/611aba0ef5b859ba1977ef30677b0194.jpg", "../../../../media/cache/87/cd/87cd652c35e2a78535c83becae33cff2.jpg", 
"../../../../media/cache/fd/5b/fd5b14399052ab552e240ed18ab03c6d.jpg"]}, 64 | {"URL": "http://books.toscrape.com/catalogue/category/books/new-adult_20/index.html", "image_link": ["../../../../media/cache/24/e2/24e2f5c9d325c4004d8190c054da86dd.jpg", "../../../../media/cache/a5/43/a543b100a8c1861c1bf5374ca6b576fe.jpg", "../../../../media/cache/84/ac/84acb0606c96e55dc729a9d6572a08fb.jpg", "../../../../media/cache/38/f1/38f1543cd2d51c2728678f5ecc128958.jpg", "../../../../media/cache/a2/19/a2198abf12e3287f84997b35f4e1050e.jpg", "../../../../media/cache/03/ed/03ed67ea504353b91b035151d8e80db2.jpg"]}, 65 | {"URL": "http://books.toscrape.com/catalogue/category/books/fantasy_19/index.html", "image_link": ["../../../../media/cache/76/8e/768ea5924ac1ef6297c2be9959c796c2.jpg", "../../../../media/cache/43/ae/43aee83ebb31e2122a7215e413770e5c.jpg", "../../../../media/cache/b7/e8/b7e84b78be3d9bb79b71156a5e5d4e42.jpg", "../../../../media/cache/ff/e8/ffe81bf98f8386ef29e193abfb6f9c1e.jpg", "../../../../media/cache/66/25/6625e3bbb050de3e42a0c302c0d69f1f.jpg", "../../../../media/cache/06/18/061811c5845d0e13bc04b2a755f0830f.jpg", "../../../../media/cache/c0/88/c08816960890396213a423941af65b8f.jpg", "../../../../media/cache/32/d6/32d6aa560e8ddf2a4da1526b95d4c7ab.jpg", "../../../../media/cache/3e/0b/3e0b16851bec08b6cbf78d5f64af9114.jpg", "../../../../media/cache/e2/60/e260b008b7ea7970562295b7bc64b0cb.jpg", "../../../../media/cache/53/5e/535e2be0b423797c2cdc7d98882c820a.jpg", "../../../../media/cache/b4/67/b467a4f01ca6ae8464b9425a156c7c32.jpg", "../../../../media/cache/9a/33/9a333c4a06ce187c5c9d2f5969ddcac2.jpg", "../../../../media/cache/75/b9/75b99691594fde72ccb1831624cfeff6.jpg", "../../../../media/cache/8f/80/8f8074d9f035c2a0ef8595ad89f7bcc8.jpg", "../../../../media/cache/00/08/0008e65aa431ed3625ad3a4352f8e90d.jpg", "../../../../media/cache/3b/04/3b045fe0394dc192950a0ec9e3812fe4.jpg", "../../../../media/cache/d3/0d/d30dd8b6be6f9fcfd17178e8083238b6.jpg", "../../../../media/cache/27/64/27649cb5da52970f4bb2fc5234a48578.jpg", "../../../../media/cache/3e/2d/3e2d526ee062008ab1cbf54f90a5abb2.jpg"]}, 66 | {"URL": "http://books.toscrape.com/catalogue/category/books/add-a-comment_18/index.html", "image_link": ["../../../../media/cache/33/e5/33e507172541628acfd421503196b578.jpg", "../../../../media/cache/f8/6d/f86d08178e3788563ac17be5aefd29f0.jpg", "../../../../media/cache/70/fa/70fa6c0437d9c97dbeada6bd32bf9d2c.jpg", "../../../../media/cache/a1/14/a114d70e7babf110ba42a389078e9a45.jpg", "../../../../media/cache/5f/52/5f52b1bc6d45daab2e330c744feb0359.jpg", "../../../../media/cache/ae/0c/ae0ccc307568b6d7699786411f3cbcc4.jpg", "../../../../media/cache/28/78/2878538a1039d9c4649110499a1393fb.jpg", "../../../../media/cache/72/d8/72d861617b6d3aababe6e61e8d3c1056.jpg", "../../../../media/cache/66/f7/66f79b76d6c6b64fcc8110515c454e09.jpg", "../../../../media/cache/94/ac/94ac87da7b40853013093f08356efa3b.jpg", "../../../../media/cache/8f/3f/8f3f4d67e30a8129577ccc4664998345.jpg", "../../../../media/cache/3f/e7/3fe7073a5caac81929524d2d9488f928.jpg", "../../../../media/cache/f5/58/f55886d1bf600529a35e1bd932c78ca0.jpg", "../../../../media/cache/0b/97/0b97282ed82b771ed328e05386a84adb.jpg", "../../../../media/cache/50/0e/500eeb810e940424827580574e46852c.jpg", "../../../../media/cache/9b/20/9b2076ce7414103a093ce2459d089969.jpg", "../../../../media/cache/75/20/75200336c141156746000f7055df344a.jpg", "../../../../media/cache/4d/16/4d163d43cb4aa624e599330a39abace5.jpg", "../../../../media/cache/55/33/5533595a623c3bb947c4a5171fc2df08.jpg", 
"../../../../media/cache/97/3a/973a2c3462a18fc90d3b9662d959df37.jpg"]}, 67 | {"URL": "http://books.toscrape.com/catalogue/category/books/sports-and-games_17/index.html", "image_link": ["../../../../media/cache/61/2c/612caeb0b2acb35c100629f0f52a40d7.jpg", "../../../../media/cache/7d/cf/7dcf6c3b419bf7e7e3b3b8162b177869.jpg", "../../../../media/cache/c3/a9/c3a90a5baa833a37c29c4b03a444737c.jpg", "../../../../media/cache/9b/4e/9b4ece2ab5a6335c8594c878e2f22df1.jpg", "../../../../media/cache/8d/1e/8d1e285bf672b2ea66879490cc5f6904.jpg"]}, 68 | {"URL": "http://books.toscrape.com/catalogue/category/books/science-fiction_16/index.html", "image_link": ["../../../../media/cache/09/a3/09a3aef48557576e1a85ba7efea8ecb7.jpg", "../../../../media/cache/93/63/9363f0065fbad5689f44fcf6e203eef3.jpg", "../../../../media/cache/02/37/0237b445efc18c5562355a5a2c40889c.jpg", "../../../../media/cache/10/6e/106e2fc7160712edf8e2ff996dc8cd6c.jpg", "../../../../media/cache/f0/06/f0060c756556b855184fa32f66280961.jpg", "../../../../media/cache/c0/72/c072c1ef144d571abd25fe9cc18cceba.jpg", "../../../../media/cache/51/88/518810d182843244a404f2a2a614a93b.jpg", "../../../../media/cache/8b/92/8b9267df86378b6973974ae7e1924ffe.jpg", "../../../../media/cache/b8/b2/b8b2956acc758a381beef87339c0a52f.jpg", "../../../../media/cache/51/34/513418bd1c6114f3ea1fd703278e20ef.jpg", "../../../../media/cache/ef/8b/ef8bc5adcd3bea8e8ba97be76d07a32a.jpg", "../../../../media/cache/7a/bc/7abccb865ecf9b0f676800b10c71cfd6.jpg", "../../../../media/cache/fa/65/fa653fbe3a4c69227c9b79d471cee576.jpg", "../../../../media/cache/c7/21/c721943edf481cad5ab32505e2ad3865.jpg", "../../../../media/cache/da/47/da4746e620f8ccd7cf20628d1a5e535a.jpg", "../../../../media/cache/f4/83/f4835e9f3fdd8b8107bbb39a391654f0.jpg"]}, 69 | {"URL": "http://books.toscrape.com/catalogue/category/books/default_15/index.html", "image_link": ["../../../../media/cache/3d/54/3d54940e57e662c4dd1f3ff00c78cc64.jpg", "../../../../media/cache/66/88/66883b91f6804b2323c8369331cb7dd1.jpg", "../../../../media/cache/be/f4/bef44da28c98f905a3ebec0b87be8530.jpg", "../../../../media/cache/ef/0b/ef0bed08de4e083dba5e20fdb98d9c36.jpg", "../../../../media/cache/d6/da/d6da0371958068bbaf39ea9c174275cd.jpg", "../../../../media/cache/12/53/1253c21c5ef3c6d075c5fa3f5fecee6a.jpg", "../../../../media/cache/f5/88/f5889d038f5d8e949b494d147c2dcf54.jpg", "../../../../media/cache/75/dc/75dce2f5949b407161f37f0af249b018.jpg", "../../../../media/cache/69/85/69852567cf97264a1442cbc882c84903.jpg", "../../../../media/cache/27/d2/27d20361745ec2f7be668b18a4da29da.jpg", "../../../../media/cache/78/2e/782e315667ec50759b8603527ee33dec.jpg", "../../../../media/cache/08/89/088995e862aac86c88c608d763f6390e.jpg", "../../../../media/cache/06/a6/06a6cfcf89afd1601cbba1a16cda57fb.jpg", "../../../../media/cache/8a/83/8a83b6ce350f01bab21f85e6ba539316.jpg", "../../../../media/cache/4e/0f/4e0f05ae01d8fb6bd0d3901edd06de16.jpg", "../../../../media/cache/34/f5/34f5f8e513c5f048241f5695e61b2483.jpg", "../../../../media/cache/58/9d/589d73503d9a23d224de836134fae553.jpg", "../../../../media/cache/25/6c/256c946dd0962095f66c6de3b15ab300.jpg", "../../../../media/cache/81/58/81586cd0bf8743e1f5ed80b6a0e1fabe.jpg", "../../../../media/cache/fe/b7/feb764b2afa54991cfdbbffdf501b333.jpg"]}, 70 | {"URL": "http://books.toscrape.com/catalogue/category/books/music_14/index.html", "image_link": ["../../../../media/cache/81/c4/81c4a973364e17d01f217e1188253d5e.jpg", "../../../../media/cache/54/60/54607fe8945897cdcced0044103b10b6.jpg", 
"../../../../media/cache/5c/c8/5cc8e107246cb478960d4f0aba1e1c8e.jpg", "../../../../media/cache/a2/6d/a26d8449abb3381e09126eda5f4e8151.jpg", "../../../../media/cache/06/f1/06f185c0be2ad6e2fe059464c03f1b47.jpg", "../../../../media/cache/85/42/8542841f5644a6daf433504f1e106e97.jpg", "../../../../media/cache/11/fc/11fc94453c4dc0d68543971d7843afb0.jpg", "../../../../media/cache/35/a4/35a4a7c6c76c4e82186753078e441654.jpg", "../../../../media/cache/15/de/15de75548ee9a4c6be1420ee309c03e0.jpg", "../../../../media/cache/7a/7e/7a7eb52e7075a5305522948375c1316e.jpg", "../../../../media/cache/99/97/9997eda658c2fe50e724171f9c2a2b0b.jpg", "../../../../media/cache/7e/94/7e947f3dd04f178175b85123829467a9.jpg", "../../../../media/cache/7f/b0/7fb03a053c270000667a50dd8d594843.jpg"]}, 71 | {"URL": "http://books.toscrape.com/catalogue/category/books/nonfiction_13/index.html", "image_link": ["../../../../media/cache/2e/98/2e98c332bf8563b584784971541c4445.jpg", "../../../../media/cache/38/c5/38c56fba316c07305643a8065269594e.jpg", "../../../../media/cache/cb/bd/cbbdb0222ee8a0f6ab61657412a15794.jpg", "../../../../media/cache/9c/46/9c463c7631c82401160fd3b554b8f0e1.jpg", "../../../../media/cache/41/a2/41a20f35adf0caea24f208dc01ad7681.jpg", "../../../../media/cache/03/86/038650c9e7517b4baf2a423cd8eed38f.jpg", "../../../../media/cache/95/64/95647d6a526bf54120b9445e124794e1.jpg", "../../../../media/cache/64/15/641570cd7e7aded53c7d33d78a9629f1.jpg", "../../../../media/cache/2e/23/2e236e23ad52aa74505f224f6552eda8.jpg", "../../../../media/cache/f3/4f/f34ffb24cc21c9f9f52dad4fd8f3ac21.jpg", "../../../../media/cache/97/f8/97f8debeeaaece9603267653076e760f.jpg", "../../../../media/cache/fe/ea/feeafd2ad7b3077f8e74cbb1da9e3c7d.jpg", "../../../../media/cache/64/94/6494bf61176ca73b61255909230030be.jpg", "../../../../media/cache/88/9e/889e0bac4c7c0e7178f0165b8d3b4617.jpg", "../../../../media/cache/23/b4/23b42e094c02d52b14b11a960d49610e.jpg", "../../../../media/cache/03/38/0338682e76bad3216cd4c6c28b2b625a.jpg", "../../../../media/cache/14/f3/14f3d525e2a114cd71e27201a16af188.jpg", "../../../../media/cache/13/57/1357c6aa40c9e63d2f931927fbf81f3f.jpg", "../../../../media/cache/0e/6d/0e6dc2484322c5b9e7854ced66fdf62d.jpg", "../../../../media/cache/6e/d4/6ed4991d97f60db29ec7b421e61a2cf3.jpg"]}, 72 | {"URL": "http://books.toscrape.com/catalogue/category/books/religion_12/index.html", "image_link": ["../../../../media/cache/95/30/953013d044aa313cc162dec414f3969a.jpg", "../../../../media/cache/6b/70/6b70f2cdb17d9ab7551240a88b9211fe.jpg", "../../../../media/cache/1f/db/1fdb125bcb8cee71f3404b4dc293348c.jpg", "../../../../media/cache/83/db/83dbf86eb0fed1d99de2148eac4eb064.jpg", "../../../../media/cache/71/91/7191a7d76eb6c3a18259541e2c038ae3.jpg", "../../../../media/cache/4e/69/4e69dacc99de838814d0f65c94e67f6c.jpg", "../../../../media/cache/df/ab/dfabeab158046237ddb6b713b794909f.jpg"]}, 73 | {"URL": "http://books.toscrape.com/catalogue/category/books/childrens_11/index.html", "image_link": ["../../../../media/cache/af/6e/af6e796160fe63e0cf19d44395c7ddf2.jpg", "../../../../media/cache/cf/bb/cfbb5e62715c6d888fd07794c9bab5d6.jpg", "../../../../media/cache/c4/a2/c4a2a1a026c67bcceb5a411c724d7d0c.jpg", "../../../../media/cache/26/32/2632a1e12f2c085fabbe022ae4cd6933.jpg", "../../../../media/cache/80/25/8025b80a40178f2a6dd4f99ad88e0fba.jpg", "../../../../media/cache/28/50/2850439c2ba103fb69dba9cd2dd9f0c2.jpg", "../../../../media/cache/2b/38/2b380f77723c797c0389f978afa6db58.jpg", "../../../../media/cache/bb/e2/bbe26db72b8a32117bfe4981b7cc8147.jpg", 
"../../../../media/cache/97/12/971212afa8e4ff49d92f678bc889d8b7.jpg", "../../../../media/cache/85/e7/85e75d5a9309da5807c82decf3d90263.jpg", "../../../../media/cache/27/1f/271faa1d7561473974d12803feb1f0a1.jpg", "../../../../media/cache/6c/18/6c18ea03e294bfcfe07cf531c6c5f5b3.jpg", "../../../../media/cache/4f/1e/4f1ece2500f8dbacecca42d57befca03.jpg", "../../../../media/cache/8f/66/8f66ec46e671d6fca79649c10c7c8f8a.jpg", "../../../../media/cache/1c/eb/1cebdf525ebe970a1dc3c5a8c50eae6b.jpg", "../../../../media/cache/c0/bb/c0bb6e42743b9c1aaf9b754501100a5d.jpg", "../../../../media/cache/bf/db/bfdbf9726621276fc7821d705690dbae.jpg", "../../../../media/cache/e0/90/e090748ce5a567207aed9185c97eb34b.jpg", "../../../../media/cache/21/bd/21bdf7ae21476b1debf4aa3eefe6f29d.jpg", "../../../../media/cache/ec/08/ec08efebaa33a403e54080b48c139794.jpg"]}, 74 | {"URL": "http://books.toscrape.com/catalogue/category/books/fiction_10/index.html", "image_link": ["../../../../media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg", "../../../../media/cache/9d/05/9d0533bae1578846d728a82913b95c26.jpg", "../../../../media/cache/5f/15/5f152afdbc42356ecba02f61058a7e5b.jpg", "../../../../media/cache/c4/0a/c40a64f59e7487b1a80a049f6ceb2ba5.jpg", "../../../../media/cache/dc/44/dc44f8e2aebac48ca8553814d9b021a8.jpg", "../../../../media/cache/6b/da/6bdae061cb92c32b0b83cda8dd10275d.jpg", "../../../../media/cache/37/25/372578cc073efae80cf284b56040a488.jpg", "../../../../media/cache/f8/31/f8314c7fdaa79fb7191a583e9a852db8.jpg", "../../../../media/cache/6a/81/6a81103b1c01a3f6c56e5718a838a4c8.jpg", "../../../../media/cache/8f/f8/8ff8680dde59ea739d6978a01e4d7fe5.jpg", "../../../../media/cache/83/05/8305154438c91a02cefacf4ec8b53393.jpg", "../../../../media/cache/38/34/3834572e651cdc14b18d348fa4875aa9.jpg", "../../../../media/cache/d8/a4/d8a44eda7cbe7bd1207f868e9adc06f3.jpg", "../../../../media/cache/8e/c7/8ec7f310b74ddd7ec3c859e9b0da7389.jpg", "../../../../media/cache/03/16/0316bb6f4785ac69c0643109201bad5d.jpg", "../../../../media/cache/ca/b1/cab150e556b5fab663a9fec00ed97943.jpg", "../../../../media/cache/e0/79/e07906c1e507055da9a2260a74f58273.jpg", "../../../../media/cache/a7/f0/a7f092a7b79f848df0226f808fed489b.jpg", "../../../../media/cache/ed/07/ed07c9e7c53d4f33a6eb7d41eb0e6d4a.jpg", "../../../../media/cache/26/3b/263bf5d128bf18553ea8da8bb19e9a0c.jpg"]} 75 | ] -------------------------------------------------------------------------------- /chapter4/spider_books.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | 4 | class BooksSpider(scrapy.Spider): 5 | name = 'bookLinks' 6 | 7 | start_urls = ['http://books.toscrape.com'] 8 | images_data = {} 9 | 10 | def parse(self, response): 11 | # follow links to author pages 12 | for img in response.css('a::attr(href)'): 13 | yield response.follow(img, self.parse_images) 14 | 15 | def parse_images(self, response): 16 | print ("URL: " + response.request.url) 17 | def extract_with_css(query): 18 | return response.css(query).extract() 19 | yield { 20 | 'URL': response.request.url, 21 | 'image_link': extract_with_css('img::attr(src)') 22 | } --------------------------------------------------------------------------------