├── LICENSE
├── README.md
├── chapter1
│   ├── code
│   │   ├── algorithmia
│   │   │   ├── algorithmia_analyze_url.py
│   │   │   └── algorithmia_sitemap.py
│   │   ├── mechanical-soup
│   │   │   ├── bing_search.py
│   │   │   ├── github_links.py
│   │   │   ├── google_search.py
│   │   │   └── twitter_login.py
│   │   ├── metadata
│   │   │   ├── extract_articles.py
│   │   │   └── extract_site_metadata.py
│   │   ├── parsel
│   │   │   ├── extract_links_css.py
│   │   │   └── extract_links_xpath.py
│   │   ├── requests
│   │   │   ├── request_response.py
│   │   │   └── urllib_request.py
│   │   ├── robobrowser
│   │   │   ├── bing_search.py
│   │   │   ├── download_file.py
│   │   │   ├── get_emails_links_from_url.py
│   │   │   ├── twitter_login_form.py
│   │   │   └── website_parsing.py
│   │   └── web_technologies
│   │       └── web_technologies_builtwith.py
│   └── images
│       ├── algorithmia_analyze_url.png
│       ├── algorithmia_analyze_url2.png
│       ├── algorithmia_sitemap.png
│       ├── algorithmia_sitemap2.png
│       ├── bing_search.png
│       ├── bing_search_output.png
│       ├── bing_search_output_mechanical_soup.png
│       ├── builtwith.png
│       ├── builtwith_script.png
│       ├── google_search_mechanical_soup.png
│       ├── robobrowser_links.png
│       └── wappalyzer.png
├── chapter12.zip
├── chapter2
│   ├── code
│   │   ├── bs4
│   │   │   ├── BeautifulSoup-getLinks_csv.py
│   │   │   ├── bs4_objects.py
│   │   │   ├── demo_detail_book.py
│   │   │   ├── download_images_from_url.py
│   │   │   ├── getExternal_internal_links.py
│   │   │   ├── get_offers_bs4.py
│   │   │   └── wikipedia_links.py
│   │   └── requests
│   │       ├── crawler_urls.py
│   │       ├── depth_search_extract_links.py
│   │       ├── download_file_requests.py
│   │       ├── extract_links_images_re.py
│   │       ├── get_emails_from_url.py
│   │       ├── get_html_requests.py
│   │       ├── link_crawler_search.py
│   │       ├── requests_post.py
│   │       └── requests_user_agent.py
│   └── images
│       ├── download_images.png
│       ├── download_images2.png
│       ├── external_inernal_links.png
│       ├── link_extractor.png
│       ├── objects.png
│       ├── packt_books.png
│       ├── packtpub_links.png
│       ├── packtpub_links2.png
│       ├── packtpub_links_csv.png
│       ├── packtpub_links_deep_search.png
│       ├── requests_extract_links.png
│       ├── requests_headers.png
│       └── requests_post.png
├── chapter3
│   ├── code
│   │   ├── books_scraping
│   │   │   ├── bookList.csv
│   │   │   ├── requests_bs4_initial.py
│   │   │   └── requests_bs4_with_pages.py
│   │   ├── chromedriver.exe
│   │   ├── dolar-euro_converter.py
│   │   ├── google_translate.py
│   │   ├── interacting_with_form.py
│   │   ├── phantomjs
│   │   │   ├── phantomjs.exe
│   │   │   ├── phantomjs_example1.py
│   │   │   ├── phantomjs_example2.py
│   │   │   └── phantomjs_example3.py
│   │   ├── scraping_book_details_requests.py
│   │   ├── selenium_list_book.py
│   │   └── stack_overflow_tags.py
│   └── images
│       ├── ajax_image.png
│       ├── book_info.png
│       ├── book_packit.png
│       ├── books_details.png
│       ├── books_packit.png
│       ├── converter.png
│       ├── google_translate.png
│       ├── selenium_methods.png
│       └── xpath.png
└── chapter4
    ├── BooksSpider-multipage-details
    │   ├── books_crawler
    │   │   ├── __init__.py
    │   │   ├── items.py
    │   │   ├── pipelines.py
    │   │   ├── settings.py
    │   │   └── spiders
    │   │       ├── BooksSpider.py
    │   │       └── __init__.py
    │   ├── output.son
    │   └── scrapy.cfg
    ├── BooksSpider-urls
    │   ├── books_crawler
    │   │   ├── __init__.py
    │   │   ├── items.py
    │   │   ├── pipelines.py
    │   │   ├── settings.py
    │   │   └── spiders
    │   │       ├── BooksSpider.py
    │   │       └── __init__.py
    │   ├── books_links.json
    │   └── scrapy.cfg
    ├── BooksSpider-urls_download_images
    │   ├── books_crawler
    │   │   ├── __init__.py
    │   │   ├── items.py
    │   │   ├── pipelines.py
    │   │   ├── settings.py
    │   │   └── spiders
    │   │       ├── BooksSpider.py
    │   │       └── __init__.py
    │   ├── output.son
    │   └── scrapy.cfg
    ├── europython
    │   ├── europython
    │   │   ├── __init__.py
    │   │   ├── __pycache__
    │   │   │   ├── __init__.cpython-37.pyc
    │   │   │   ├── items.cpython-37.pyc
    │   │   │   ├── pipelines.cpython-37.pyc
    │   │   │   └── settings.cpython-37.pyc
    │   │   ├── items.py
    │   │   ├── middlewares.py
    │   │   ├── pipelines.py
    │   │   ├── settings.py
    │   │   └── spiders
    │   │       ├── __init__.py
    │   │       ├── __pycache__
    │   │       │   ├── __init__.cpython-37.pyc
    │   │       │   └── europython_spider.cpython-37.pyc
    │   │       └── europython_spider.py
    │   ├── europython_items.csv
    │   ├── europython_items.json
    │   ├── europython_items.xml
    │   ├── scrapinghub.yml
    │   ├── scrapy.cfg
    │   └── setup.py
    ├── images
    │   ├── book_details.png
    │   ├── books_images.png
    │   ├── books_images_output.png
    │   ├── europython_talk.png
    │   ├── next_page.png
    │   ├── scrapy_books.png
    │   ├── scrapy_books_links.png
    │   ├── scrapy_options.png
    │   ├── scrapy_project.png
    │   ├── scrapy_shell.png
    │   └── scrapy_shell2.png
    ├── output.json
    └── spider_books.py
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Packt
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Advanced-Web-Scraping-with-Python
2 | Advanced Web Scraping with Python, Published by Packt
3 |
--------------------------------------------------------------------------------
/chapter1/code/algorithmia/algorithmia_analyze_url.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import Algorithmia
5 | import json
6 |
7 | input = [ "https://www.packtpub.com/iot-hardware/single-board-computers"]
8 | output = []
9 |
10 | API_KEY ='simU+xQFB6Ts4O306dxEhZreKBA1'
11 |
12 | client = Algorithmia.client(API_KEY)
13 |
14 | algorithmia = client.algo('web/AnalyzeURL/0.2.17').pipe(input[0])
15 | print(algorithmia.result)
16 | output.append(algorithmia.result)
17 | print(json.dumps(output, indent=4))
--------------------------------------------------------------------------------
/chapter1/code/algorithmia/algorithmia_sitemap.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import Algorithmia
5 |
6 | input = [ "http://packtpub.com",1]
7 |
8 | API_KEY ='simU+xQFB6Ts4O306dxEhZreKBA1'
9 |
10 | client = Algorithmia.client(API_KEY)
11 | response = client.algo('web/SiteMap/0.1.7').pipe(input)
12 | siteMap = response.result
13 | print(siteMap)
--------------------------------------------------------------------------------
/chapter1/code/mechanical-soup/bing_search.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import mechanicalsoup
5 |
6 | # Connect to bing search engine
7 | browser = mechanicalsoup.StatefulBrowser()
8 | browser.open("http://bing.com/")
9 |
10 | # Fill-in the search form
11 | browser.select_form('#sb_form')
12 | browser["q"] = "MechanicalSoup"
13 | browser.submit_selected()
14 |
15 | # Display the results
16 | for link in browser.links():
17 | print(link.text, '->', link.attrs['href'])
18 |
--------------------------------------------------------------------------------
/chapter1/code/mechanical-soup/github_links.py:
--------------------------------------------------------------------------------
1 | """Example app to login to GitHub using the StatefulBrowser class."""
2 | #!/usr/bin/env python
3 | # -*- coding: utf-8 -*-
4 |
5 | from __future__ import print_function
6 | import argparse
7 | import mechanicalsoup
8 | from getpass import getpass
9 |
10 | parser = argparse.ArgumentParser(description="Login to GitHub.")
11 | parser.add_argument("username")
12 | args = parser.parse_args()
13 |
14 | args.password = getpass("Please enter your GitHub password: ")
15 |
16 | browser = mechanicalsoup.StatefulBrowser(
17 | soup_config={'features': 'lxml'},
18 | raise_on_404=True,
19 | user_agent='MyBot/0.1: mysite.example.com/bot_info',
20 | )
21 | # Enable more verbose output:
22 | browser.set_verbose(2)
23 |
24 | browser.open("https://github.com")
25 | browser.follow_link("login")
26 | browser.select_form('#login form')
27 | browser["login"] = args.username
28 | browser["password"] = args.password
29 | resp = browser.submit_selected()
30 |
31 | # Launch a web browser on the current page:
32 | browser.launch_browser()
33 |
34 | # verify we are now logged in
35 | page = browser.get_current_page()
36 |
37 | for link in browser.links():
38 | target = link.attrs['href']
39 | print(target)
40 |
41 | messages = page.find("div", class_="flash-messages")
42 | if messages:
43 | print(messages.text)
44 | assert page.select(".logout-form")
45 |
46 | #print(page.title.text)
47 | #print(page)
48 |
49 | # verify we remain logged in (thanks to cookies) as we browse the rest of
50 | # the site
51 | page3 = browser.open("https://github.com/MechanicalSoup/MechanicalSoup")
52 | assert page3.soup.select(".logout-form")
53 |
--------------------------------------------------------------------------------
/chapter1/code/mechanical-soup/google_search.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 | import mechanicalsoup
6 |
7 | # Connect to Google
8 | browser = mechanicalsoup.StatefulBrowser()
9 | browser.open("https://www.google.com/")
10 |
11 | # Fill-in the form
12 | browser.select_form('form[action="/search"]')
13 | browser["q"] = "MechanicalSoup"
14 |
15 | # Note: the button name is btnK in the content served to actual
16 | # browsers, but btnG for bots.
17 | browser.submit_selected(btnName="btnG")
18 | print(browser.get_current_page())
19 |
20 | # Display links
21 | for link in browser.links():
22 | target = link.attrs['href']
23 | # Filter-out unrelated links and extract actual URL from Google's
24 | # click-tracking.
25 | if (target.startswith('/url?') and not
26 | target.startswith("/url?q=http://webcache.googleusercontent.com")):
27 | target = re.sub(r"^/url\?q=([^&]*)&.*", r"\1", target)
28 | print(target)
29 |
--------------------------------------------------------------------------------
/chapter1/code/mechanical-soup/twitter_login.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import mechanicalsoup
5 | import getpass
6 |
7 | URL = "https://twitter.com/login"
8 |
9 | username = input ("Username: ")
10 | password = getpass.getpass()
11 |
12 | # Create a browser object
13 | browser = mechanicalsoup.Browser()
14 |
15 | # request Twitter login page
16 | login_page = browser.get(URL)
17 |
18 | # we grab the login form
19 | login_form = login_page.soup.find("form", {"class":"t1-form clearfix signin js-signin"})
20 |
21 | # find login and password inputs
22 | login_form.find("input", {"name": "session[username_or_email]"})["value"] = username
23 | login_form.find("input", {"name": "session[password]"})["value"] = password
24 |
25 | # submit form
26 | browser.submit(login_form, login_page.url)
--------------------------------------------------------------------------------
/chapter1/code/metadata/extract_articles.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import newspaper
5 |
6 | cnn_paper = newspaper.build('http://cnn.com')
7 |
8 | print('*****************************category urls************************************\n')
9 | for category in cnn_paper.category_urls():
10 | print(category)
11 |
12 | print('*****************************url articles************************************\n')
13 |
14 | for article in cnn_paper.articles:
15 | print(article.url)
16 |
17 | print('*****************************download first article************************************\n')
18 | cnn_article = cnn_paper.articles[0]
19 | cnn_article.download()
20 | cnn_article.parse()
21 | cnn_article.nlp()  # nlp() populates keywords and summary
22 | #print(cnn_article.html)
23 | print(cnn_article.text)
24 | print(cnn_article.keywords)
25 | print(cnn_article.summary)
26 | print(cnn_article.authors)
27 | print(cnn_article.publish_date)
--------------------------------------------------------------------------------
/chapter1/code/metadata/extract_site_metadata.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import extruct
5 | import requests
6 | import pprint
7 | from w3lib.html import get_base_url
8 |
9 |
10 | pp = pprint.PrettyPrinter(indent=2)
11 | r = requests.get('https://www.packtpub.com')
12 | base_url = get_base_url(r.text, r.url)
13 | data = extruct.extract(r.text, base_url=base_url)
14 |
15 | pp.pprint(data)
--------------------------------------------------------------------------------
/chapter1/code/parsel/extract_links_css.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import requests
5 | from parsel import Selector
6 |
7 | # GET request to packtpub site
8 | response = requests.get('https://www.packtpub.com')
9 |
10 | # "response.txt" contain all web page content
11 | selector = Selector(response.text)
12 |
13 | # Extracting href attribute from anchor tag
14 | href_links = selector.css('a::attr(href)').extract()
15 |
16 | #Extracting src attribute from img tag
17 | image_links = selector.css('img::attr(src)').extract()
18 |
19 | print('*****************************href_links************************************\n')
20 | print(href_links)
21 |
22 |
23 | print('*****************************image_links************************************\n')
24 | print(image_links)
25 |
--------------------------------------------------------------------------------
/chapter1/code/parsel/extract_links_xpath.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import requests
5 | from parsel import Selector
6 |
7 | # GET request to packtpub site
8 | response = requests.get('https://www.packtpub.com')
9 |
10 | # "response.txt" contain all web page content
11 | selector = Selector(response.text)
12 |
13 | # Extracting href attribute from anchor tag
14 | href_links = selector.xpath('//a/@href').getall()
15 |
16 | #Extracting src attribute from img tag
17 | image_links = selector.xpath('//img/@src').getall()
18 |
19 | print('*****href_links******\n')
20 | print(href_links)
21 |
22 |
23 | print('*****image_links*****\n')
24 | print(image_links)
25 |
--------------------------------------------------------------------------------
/chapter1/code/requests/request_response.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 | url = "http://www.packtpub.com"
4 | # Packages the request, send the request and catch the response
5 | response = requests.get(url)
6 | # Store the response in html variable
7 | html = response.text
8 | # Print the html
9 | print(html)
--------------------------------------------------------------------------------
/chapter1/code/requests/urllib_request.py:
--------------------------------------------------------------------------------
1 | from urllib.request import urlopen, Request
2 |
3 | # Specify the url
4 | url = "http://www.packtpub.com"
5 | # This packages the request
6 | request = Request(url)
7 | # Sends the request and catches the response: response
8 | response = urlopen(request)
9 | # Extract the response using read()
10 | html = response.read()
11 | # Print the html
12 | print(html)
13 | # Closing the response
14 | response.close()
--------------------------------------------------------------------------------
/chapter1/code/robobrowser/bing_search.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from robobrowser import RoboBrowser
5 |
6 | browser = RoboBrowser(history=True,parser="html.parser")
7 | browser.open("http://bing.com")
8 | #print(browser.parsed)
9 |
10 | #Find the element by id,action or css class in the html
11 | #form = browser.get_form(id = "sb_form")
12 | form = browser.get_form(action="/search")
13 | #form = browser.get_form(class_='sw_box hassbi')
14 |
15 | print(form)
16 |
17 | form.fields['q'].value = "python"
18 | #form["q"].value = "python"
19 |
20 | browser.submit_form(form)
21 |
22 | print('*****browser.find_all("a")******\n')
23 |
24 | links = browser.find_all("a")
25 | for link in links:
26 | try:
27 | print(link['href'])
28 | except Exception as exception:
29 | pass
--------------------------------------------------------------------------------
/chapter1/code/robobrowser/download_file.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from robobrowser import RoboBrowser
5 |
6 | browser = RoboBrowser(history=True)
7 |
8 | url = "https://www.cse.unsw.edu.au/~en1811/python-docs/python-3.6.4-docs-pdf/tutorial.pdf"
9 | pdf_file_path = "tutorial.pdf"
10 |
11 | # get browser session
12 | request = browser.session.get(url, stream=True)
13 |
14 | with open(pdf_file_path, "wb") as pdf_file:
15 | pdf_file.write(request.content)
--------------------------------------------------------------------------------
/chapter1/code/robobrowser/get_emails_links_from_url.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from robobrowser import RoboBrowser
5 | import re
6 | import argparse
7 |
8 | browser = RoboBrowser(history=True,parser="html.parser")
9 |
10 | def get_emails(domain):
11 |
12 | domain="http://"+domain
13 | browser.open(domain)
14 | contents = browser.find_all("a",href=re.compile("[-a-zA-Z0-9._]+@[-a-zA-Z0-9_]+.[a-zA-Z0-9_.]+"))
15 | for content in contents:
16 | print(content['href'])
17 |
18 | def get_links(domain):
19 |
20 | domain="http://"+domain
21 | browser.open(domain)
22 |
23 | print('*****browser.find_all("a")******\n')
24 | contents = browser.find_all("a")
25 | for content in contents:
26 | try:
27 | print(content['href'])
28 | except Exception as exception:
29 | pass
30 |
31 | print('*****browser.get_links()******\n')
32 | links = browser.get_links()
33 | for link in links:
34 | try:
35 | print(link['href'])
36 | except Exception as exception:
37 | pass
38 |
39 | if __name__ == "__main__":
40 | parser = argparse.ArgumentParser(description='gets emails from domain.', prog='get_emails_links_from_url.py', epilog="", add_help=False)
41 | parser.add_argument('-d', '--domain', metavar='', action='store', help='domain to be resolved.',required=True)
42 | args = parser.parse_args()
43 | get_emails(args.domain)
44 | get_links(args.domain)
--------------------------------------------------------------------------------
/chapter1/code/robobrowser/twitter_login_form.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from robobrowser import RoboBrowser
5 |
6 | browser = RoboBrowser(history=True,parser="html.parser")
7 | browser.open('http://twitter.com/login')
8 | print(browser.parsed)
9 |
10 | # Get the signup form by action or css class
11 | signup_form = browser.get_form(action="https://twitter.com/sessions")
12 | signup_form = browser.get_form(class_='t1-form clearfix signin js-signin')
13 | print(signup_form)
14 |
15 | # Inspect authenticity_token value
16 | print(signup_form['authenticity_token'].value)
17 |
18 | # Fill it out
19 | signup_form['session[username_or_email]'].value = 'username'
20 | signup_form['session[password]'].value = 'password'
21 |
22 | print(signup_form.serialize())
23 |
24 | # Submit the form
25 | browser.submit_form(signup_form)
--------------------------------------------------------------------------------
/chapter1/code/robobrowser/website_parsing.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from robobrowser import RoboBrowser
5 | import requests
6 |
7 | url = "http://www.packtpub.com"
8 | browser = RoboBrowser(history=True,parser="html.parser")
9 |
10 | headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
11 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
12 | 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
13 | 'Accept-Encoding': 'none',
14 | 'Accept-Language': 'en-US,en;q=0.8',
15 | 'Connection': 'keep-alive'}
16 |
17 | session = requests.Session()
18 | session.headers = headers
19 | browser = RoboBrowser(session=session)
20 |
21 | browser.open(url)
22 | print(browser.parsed)
--------------------------------------------------------------------------------
/chapter1/code/web_technologies/web_technologies_builtwith.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import requests
5 | import argparse
6 | import builtwith
7 |
8 | class BuiltWith():
9 |
10 | def __init__(self):
11 |
12 | self.key = '1fb25d4e-31b7-468c-8793-4ecebc3467be'
13 | self.url ='http://api.builtwith.com/free1/api.json'
14 |
15 | def module_run(self, domain):
16 | print("\nDomain "+domain +"\n")
17 | print(builtwith.parse("http://"+domain))
18 | payload = {'key': self.key, 'lookup': domain}
19 | response = requests.get(self.url, params=payload)
20 | json=response.json()
21 | print(json)
22 |
23 |
24 | if __name__ == '__main__':
25 |
26 | parser = argparse.ArgumentParser(description='BuiltWith')
27 | parser.add_argument('--domain', action="store", dest="domain",required=True)
28 | given_args = parser.parse_args()
29 | domain = given_args.domain
30 | builtWith = BuiltWith();
31 | builtWith.module_run(domain);
--------------------------------------------------------------------------------
/chapter1/images/algorithmia_analyze_url.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/algorithmia_analyze_url.png
--------------------------------------------------------------------------------
/chapter1/images/algorithmia_analyze_url2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/algorithmia_analyze_url2.png
--------------------------------------------------------------------------------
/chapter1/images/algorithmia_sitemap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/algorithmia_sitemap.png
--------------------------------------------------------------------------------
/chapter1/images/algorithmia_sitemap2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/algorithmia_sitemap2.png
--------------------------------------------------------------------------------
/chapter1/images/bing_search.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/bing_search.png
--------------------------------------------------------------------------------
/chapter1/images/bing_search_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/bing_search_output.png
--------------------------------------------------------------------------------
/chapter1/images/bing_search_output_mechanical_soup.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/bing_search_output_mechanical_soup.png
--------------------------------------------------------------------------------
/chapter1/images/builtwith.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/builtwith.png
--------------------------------------------------------------------------------
/chapter1/images/builtwith_script.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/builtwith_script.png
--------------------------------------------------------------------------------
/chapter1/images/google_search_mechanical_soup.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/google_search_mechanical_soup.png
--------------------------------------------------------------------------------
/chapter1/images/robobrowser_links.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/robobrowser_links.png
--------------------------------------------------------------------------------
/chapter1/images/wappalyzer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter1/images/wappalyzer.png
--------------------------------------------------------------------------------
/chapter12.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter12.zip
--------------------------------------------------------------------------------
/chapter2/code/bs4/BeautifulSoup-getLinks_csv.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | import requests
3 | import csv
4 |
5 | url = "http://packtpub.com"
6 |
7 | csv_file = csv.writer(open("data_links.csv", "w"))
8 | csv_file.writerow(["Section" , "Link"])
9 |
10 | # Getting the webpage, creating a Response object.
11 | response = requests.get(url)
12 |
13 | # Extracting the source code of the page.
14 | data = response.text
15 |
16 | # Passing the source code to Beautiful Soup to create a BeautifulSoup object for it.
17 | soup = BeautifulSoup(data, 'html.parser')
18 |
19 | # use the 'find_all' function to bring back all instances of the 'a' tag in the HTML and store in 'tags' variable
20 | # Extracting all the tags into a list.
21 | tags = soup.find_all('a')
22 | tags = soup.find_all('a', {'class': 'nav-anchor'}) # only for url = "http://packtpub.com"
23 |
24 | # Extracting URLs from the attribute href in the tags.
25 | for tag in tags:
26 | print(tag.get('href'))
27 | link = tag.get('href')
28 | text = tag.get_text()
29 | csv_file.writerow([text, link])
30 |
31 |
--------------------------------------------------------------------------------
/chapter2/code/bs4/bs4_objects.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import requests
5 | from bs4 import BeautifulSoup
6 | from fake_useragent import UserAgent
7 |
8 | ua = UserAgent()
9 | header = {'user-agent':ua.chrome}
10 | packtpub_page = requests.get('http://www.packtpub.com',headers=header)
11 |
12 | soup = BeautifulSoup(packtpub_page.content,'lxml')
13 |
14 | #find parent
15 | print("Parent of the form with id='search_mini_form':")
16 | parent_form = soup.find("form",{"id":"search_mini_form"}).parent
17 | print(parent_form)
18 |
19 | #get children form a specific element,in this case we are getting child elements of the form with id="search_mini_form"
20 | print("Children of the form with id='search_mini_form:'")
21 | for child in soup.find("form",{"id":"search_mini_form"}).children:
22 | print(child)
23 |
24 | #find next_siblings
25 | print("Siblings of the form with id='search_mini_form:'")
26 | for sibling in soup.find("form",{"id":"search_mini_form"}).input.next_siblings:
27 | print(sibling)
--------------------------------------------------------------------------------
/chapter2/code/bs4/demo_detail_book.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from bs4 import BeautifulSoup
5 | import requests
6 |
7 | response = requests.get('https://www.packtpub.com/application-development/learn-python-programming-second-edition')
8 | soup = BeautifulSoup(response.text,'lxml')
9 |
10 | title = soup.find('span', attrs={'data-ui-id':'page-title-wrapper'}).text
11 | author = soup.find('div', attrs={'class':'authors inline'}).text
12 |
13 | print(title)
14 | print(author)
--------------------------------------------------------------------------------
/chapter2/code/bs4/download_images_from_url.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from bs4 import BeautifulSoup
5 | import os
6 | import requests
7 | from fake_useragent import UserAgent
8 | from urllib.parse import urljoin
9 | def getAllImages(url):
10 |
11 | ua = UserAgent()
12 | header = {'user-agent':ua.chrome}
13 | schedule_page = requests.get(url,headers=header)
14 |
15 | # Create a directory to save the images into
16 | os.makedirs("images_packtpub", exist_ok=True)
17 |
18 | bs = BeautifulSoup(schedule_page.text,"lxml")
19 | for image in bs.findAll("img"):
20 | print("found image")
21 |
22 | # Extract the location of the image and split on '/' to get the image file name
23 | src = image.get('src')
24 | print(src)
25 |
26 | parts_image = src.split("/")
27 | image_name = parts_image[len(parts_image)-1]
28 |
29 | #Save the image
30 | with open("images_packtpub/"+image_name,"wb") as f:
31 | f.write(requests.get(urljoin(url, src)).content)
32 |
33 | getAllImages("http://www.packtpub.com")
34 |
--------------------------------------------------------------------------------
/chapter2/code/bs4/getExternal_internal_links.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 |
5 | from bs4 import BeautifulSoup
6 | import re
7 | import requests
8 | import argparse
9 |
10 | internalLinks = []
11 | externalLinks = []
12 |
13 | #Get a list of internal links that start with a "/"
14 | def getInternalLinks(url,beautifulSoup):
15 | url = url.replace("http://", "").split("/")[0]
16 | for link in beautifulSoup.findAll("a", href=re.compile("^(/|.*"+url+")")):
17 | if link.attrs['href'] is not None:
18 | internalLinks.append(link.attrs['href'])
19 | return internalLinks
20 |
21 |
22 | #Get all links that start with "http" or "www" and not contain the current URL
23 | def getExternalLinks(url,beautifulSoup):
24 | url = url.replace("http://", "").split("/")[0]
25 | for link in beautifulSoup.findAll("a", href=re.compile("^(http|www)((?!"+url+").)*$")):
26 | if link.attrs['href'] is not None:
27 | externalLinks.append(link.attrs['href'])
28 | return externalLinks
29 |
30 |
31 | def crawlExternalLinks(website):
32 | html = requests.get(website)
33 | beautifulSoup = BeautifulSoup(html.text,"lxml")
34 | externalLinks = getExternalLinks(website, beautifulSoup)
35 | return externalLinks
36 |
37 | def crawlInternalLinks(website):
38 | html = requests.get(website)
39 | beautifulSoup = BeautifulSoup(html.text,"lxml")
40 | internalLinks = getInternalLinks(website,beautifulSoup)
41 | return internalLinks
42 |
43 | def getExternalInternalLinks(website):
44 | externalLinks = crawlExternalLinks(website)
45 | internalLinks = crawlInternalLinks(website)
46 | print("\nExternal links")
47 | print("-------------------")
48 |
49 | for external in externalLinks:
50 | print(external)
51 |
52 | print("\nInternal links")
53 | print("-------------------")
54 | for internal in internalLinks:
55 | print(internal)
56 |
57 |
58 | if __name__== "__main__":
59 |
60 | # parse the command line arguments
61 | ap = argparse.ArgumentParser()
62 | ap.add_argument("-d","--domain",required=True,help="The domain to target ie. packtpub.com")
63 | args = vars(ap.parse_args())
64 |
65 | domain = args['domain']
66 |
67 | if domain.startswith("http://"):
68 | target = domain
69 | else:
70 | target = "http://" + domain
71 |
72 | getExternalInternalLinks(target)
--------------------------------------------------------------------------------
/chapter2/code/bs4/get_offers_bs4.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | import requests
3 |
4 | def getOffers(url):
5 | # We make the request to the page
6 | req = requests.get(url)
7 | # We verify that the request returns a Status Code = 200 (200 = Ok)
8 | statusCode = req.status_code
9 | if statusCode == 200:
10 | # We pass the HTML content of the web to a BeautifulSoup object
11 | html = BeautifulSoup(req.text, "html.parser")
12 | # We get all the div elements with class "offer-box"
13 | elements = html.find_all('div', {'class': 'offer-box'})
14 | # We go through all the entries to extract the title, description and link
15 | for item in elements:
16 | title = item.find('h3').getText()
17 | description = item.find('p').getText()
18 | link = item.find('a').get('href')
19 |
20 | # Print title,link and description
21 | print("Title....: " + title)
22 | print("Link:.....: " + link)
23 | print("Description:.....: " + description)
24 | print("**********************************")
25 | else:
26 | # If the page does not exist we show the error
27 | print("The url " + url + " gives an error %d" % statusCode)
28 |
29 | getOffers("https://www.packtpub.com/offers")
--------------------------------------------------------------------------------
/chapter2/code/bs4/wikipedia_links.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import requests
5 | from bs4 import BeautifulSoup
6 | import re
7 |
8 | def getLinks(url):
9 | html = requests.get("http://en.wikipedia.org"+url).text
10 | bs = BeautifulSoup(html, "html.parser")
11 | return bs.find("div", {"id":"bodyContent"}).findAll("a", href=re.compile("^(/wiki/)((?!:).)*$"))
12 |
13 | print("Main links from http://en.wikipedia.org//wiki/Python_(programming_language)")
14 | links_level1 = getLinks("/wiki/Python_(programming_language)")
15 |
16 | index =0
17 |
18 | for link in links_level1:
19 |
20 | print("http://en.wikipedia.org"+link.get('href').encode('utf-8'))
21 |
22 | newLink= links_level1[index].attrs["href"]
23 |
24 | links_level2 = getLinks(newLink)
25 |
26 | print("Links from http://en.wikipedia.org"+ newLink)
27 |
28 | for link in links_level2:
29 | print("http://en.wikipedia.org"+link.get('href').encode('utf-8'))
30 |
31 | index = index +1
32 |
--------------------------------------------------------------------------------
/chapter2/code/requests/crawler_urls.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import re
5 | import requests
6 |
7 | web = input("Url: ")
8 | response = requests.get('http://'+web).text
9 | urls = []
10 |
11 | pattern= re.compile('''href=["'](.[^"']+)["']''')
12 | search = re.findall(pattern, response)
13 |
14 | for url in search:
15 | try:
16 | urls.append(url)
17 | d1 = str(url)
18 | urlList = open('crawler_urls.txt','a+')
19 | urlList.write(d1+"\n")
20 | urlList.close()
21 | print(url)
22 | response2 = requests.get(url).text
23 | search2 = re.findall(pattern, response2)
24 | for e in search2:
25 | urls.append(e)
26 | d2 = str(e)
27 | urlList = open('crawler_urls.txt','a+')
28 | urlList.write(d2+"\n")
29 | urlList.close()
30 |
31 | except Exception as e:
32 | pass
33 |
34 | print("URls saved in file crawler_urls.txt")
--------------------------------------------------------------------------------
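
Note that crawler_urls.py above requests each discovered href a second time, but many hrefs are relative paths that requests cannot fetch on their own. Below is a minimal sketch of resolving such links against the page they came from with urllib.parse.urljoin before re-requesting them; the function and variable names are illustrative and not part of the repository.

from urllib.parse import urljoin
import re
import requests

href_pattern = re.compile('''href=["'](.[^"']+)["']''')

def fetch_second_level(base_url):
    # Download the start page and collect the href values it contains
    html = requests.get(base_url).text
    resolved = []
    for href in href_pattern.findall(html):
        # urljoin turns relative paths such as "/books" into absolute URLs
        # and leaves already-absolute URLs untouched
        resolved.append(urljoin(base_url, href))
    return resolved

if __name__ == '__main__':
    for link in fetch_second_level('http://www.packtpub.com'):
        print(link)
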
/chapter2/code/requests/depth_search_extract_links.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from urllib.parse import urljoin
5 | from urllib.parse import urlparse
6 | import re
7 | import requests
8 | from collections import deque
9 |
10 | def download_page(url):
11 | try:
12 | return requests.get(url).text
13 | except:
14 | print('error in the url', url)
15 |
16 | def extract_links(page_url, page):
17 | if not page:
18 | return []
19 | link_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
20 | return [urljoin(page_url, link) for link in link_regex.findall(page)]
21 |
22 | def get_links(page_url):
23 | host = urlparse(page_url)[1]
24 | page = download_page(page_url)
25 | links = extract_links(page_url, page)
26 | return [link for link in links if urlparse(link)[1] == host]
27 |
28 | def depth_search(start_url):
29 | visited = set()
30 | queue = deque()
31 | queue.append(start_url)
32 | while queue:
33 | url = queue.popleft()
34 | if url in visited:
35 | continue
36 | visited.add(url)
37 | for link in get_links(url):
38 | queue.appendleft(link)
39 | print(url)
40 |
41 | if __name__ == '__main__':
42 |
43 | print('Depth search extracting links ')
44 | print('----------------------------- ')
45 | depth_search('https://www.packtpub.com')
46 |
--------------------------------------------------------------------------------
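
The depth_search loop above only stops when it runs out of same-host links, so on a large site it can run for a very long time. Below is a minimal sketch of the same deque-based traversal with a page limit added; max_pages and the get_links parameter are illustrative assumptions, with get_links standing in for the helper defined in the script above.

from collections import deque

def bounded_search(start_url, get_links, max_pages=50):
    # Same visited-set and deque bookkeeping as depth_search,
    # but stop once max_pages URLs have been visited
    visited = set()
    queue = deque([start_url])
    while queue and len(visited) < max_pages:
        url = queue.popleft()
        if url in visited:
            continue
        visited.add(url)
        for link in get_links(url):
            queue.appendleft(link)
        print(url)
    return visited

Called as bounded_search('https://www.packtpub.com', get_links), it prints at most fifty pages and returns the set of visited URLs.
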
/chapter2/code/requests/download_file_requests.py:
--------------------------------------------------------------------------------
1 |
2 | import requests
3 |
4 | def downloadFile(fileName):
5 | # extract the filename
6 | filename = fileName.split("/")[-1]
7 | # download image using GET
8 | image = requests.get(fileName, stream=True)
9 | # save the image received into the file
10 | with open(filename, 'wb') as fileDescryptor:
11 | i=0
12 | for chunk in image.iter_content(chunk_size=1024):
13 | i=i+1
14 | fileDescryptor.write(chunk)
15 | return
16 |
17 |
18 | downloadFile("https://www.packtpub.com/media/logo/stores/1/logo.png")
19 | downloadFile("https://media.readthedocs.org/pdf/python-guide/latest/python-guide.pdf")
20 | downloadFile("https://docs.python.org/3/archives/python-3.7.4-docs-pdf-letter.zip")
21 |
--------------------------------------------------------------------------------
/chapter2/code/requests/extract_links_images_re.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from urllib.request import urljoin
5 | import re
6 | import requests
7 |
8 | def download_page(url):
9 | return requests.get(url).text
10 |
11 | def extract_links(page):
12 | link_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
13 | return link_regex.findall(page)
14 |
15 | def extract_image_locations(page):
16 | img_regex = re.compile('<img[^>]+src=["\'](.*?)["\']', re.IGNORECASE)
17 | return img_regex.findall(page)
18 |
19 |
20 | if __name__ == '__main__':
21 | target_url = 'http://www.packtpub.com'
22 | packtpub = download_page(target_url)
23 | links = extract_links(packtpub)
24 |
25 | for link in links:
26 | print(urljoin(target_url, link))
27 |
28 | image_locations = extract_image_locations(packtpub)
29 |
30 | for src in image_locations:
31 | print(urljoin(target_url, src))
32 |
--------------------------------------------------------------------------------
/chapter2/code/requests/get_emails_from_url.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import requests
5 | import re
6 | import argparse
7 |
8 | def get_emails(domain):
9 |
10 | if not domain.startswith("http://"):
11 | domain="http://"+domain
12 |
13 | response = requests.get(domain)
14 | pattern = re.compile("[-a-zA-Z0-9._]+@[-a-zA-Z0-9_]+.[a-zA-Z0-9_.]+")
15 | mails = re.findall(pattern,response.text)
16 | emails = str(mails)
17 |
18 | print(emails)
19 |
20 | if __name__ == "__main__":
21 | parser = argparse.ArgumentParser(description='gets emails from domain.', prog='get_emails_from_url.py', epilog="", add_help=False)
22 | # Adding the argument
23 | parser.add_argument('-d', '--domain', metavar='', action='store', help='domain to be resolved.',required=True)
24 | args = parser.parse_args()
25 |
26 | get_emails(args.domain)
--------------------------------------------------------------------------------
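
The email pattern in get_emails_from_url.py leaves the dot before the top-level domain unescaped, so it also matches any character in that position. Below is a minimal sketch of a slightly stricter variant, kept deliberately simple; the pattern is an illustrative approximation, not a full RFC 5322 validator.

import re

# Escape the dot so it only matches a literal "." between domain labels
EMAIL_RE = re.compile(r"[-a-zA-Z0-9._]+@[-a-zA-Z0-9_]+(?:\.[a-zA-Z0-9_-]+)+")

sample = "Contact sales@example.com or support@mail.example.org for help."
print(EMAIL_RE.findall(sample))
# ['sales@example.com', 'support@mail.example.org']
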
/chapter2/code/requests/get_html_requests.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import requests
5 | from fake_useragent import UserAgent
6 |
7 | url = 'https://www.packtpub.com'
8 | file_name = 'packtpub.com.txt'
9 |
10 | user_agent = UserAgent()
11 | page = requests.get(url,headers={'user-agent':user_agent.chrome})
12 | print(page.content)
13 | with open(file_name,'w') as file:
14 | file.write(page.content.decode('utf-8'))
--------------------------------------------------------------------------------
/chapter2/code/requests/link_crawler_search.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import argparse
5 | import sys
6 | import requests
7 | import re
8 | processed = []
9 |
10 | def search_links(url, depth, search):
11 | # Process http links that are not processed yet
12 | url_is_processed = (url in processed)
13 | if (url.startswith("http://") and (not url_is_processed)):
14 | processed.append(url)
15 | path = "/"
16 | urlparts = url.split("/")
17 | if (len(urlparts) > 1):
18 | host = urlparts[0]
19 | path = url.replace(host, "", 1)
20 |
21 | # Start crawling
22 | print("Crawling URL path:%s%s " %(host, path))
23 | req = requests.get(host+path)
24 |
25 | # find the links
26 | contents = req.text
27 | all_links = re.findall('href="(.*?)"', contents)
28 | if (search in contents):
29 | print("Found " + search + " at " + url)
30 | print("-----------------------------------")
31 | print(" ==> %s: processing %s links" %(str(depth),str(len(all_links))))
32 |
33 | for href in all_links:
34 | # Find relative urls
35 | print('link found '+href)
36 | # Recurse links
37 | if (depth > 0):
38 | search_links(href, depth-1, search)
39 | else:
40 | print("Skipping link: %s ..." %url)
41 |
42 | if __name__ == '__main__':
43 | parser = argparse.ArgumentParser(description='Webpage link crawler')
44 | parser.add_argument('--url', action="store", dest="url",required=True,type=str)
45 | parser.add_argument('--query', action="store", dest="query",required=True)
46 | parser.add_argument('--depth', action="store", dest="depth", default=1, type=int)
47 | given_args = parser.parse_args()
48 | try:
49 | if given_args.url.startswith("http://"):
50 | target = given_args.url
51 | else:
52 | target = "http://" + given_args.url
53 | search_links(target,given_args.depth,given_args.query)
54 | except KeyboardInterrupt:
55 | print("Aborting search by user request.")
--------------------------------------------------------------------------------
/chapter2/code/requests/requests_post.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import requests
5 | data_dictionary = {'name': 'username','password': '123456','email': 'user@domain.com'}
6 | response = requests.post("http://httpbin.org/post",data=data_dictionary)
7 |
8 | if response.status_code == 200:
9 | print(response.text)
--------------------------------------------------------------------------------
/chapter2/code/requests/requests_user_agent.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | import requests, json
5 | from fake_useragent import UserAgent
6 |
7 | ua = UserAgent()
8 | header = {'user-agent':ua.chrome}
9 |
10 | responseGet = requests.get("https://www.packtpub.com",headers=header)
11 | print(responseGet.text.encode('utf-8'))
12 | # Note: responseGet.json() would fail here because the response body is HTML, not JSON
13 | print(responseGet.encoding)
14 | print(responseGet.content)
15 | print("Status code: "+str(responseGet.status_code))
16 |
17 | print("Headers response: ")
18 | for header, value in responseGet.headers.items():
19 | print(header, '-->', value)
20 |
21 | print("Headers request : ")
22 | for header, value in responseGet.request.headers.items():
23 | print(header, '-->', value)
--------------------------------------------------------------------------------
/chapter2/images/download_images.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/download_images.png
--------------------------------------------------------------------------------
/chapter2/images/download_images2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/download_images2.png
--------------------------------------------------------------------------------
/chapter2/images/external_inernal_links.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/external_inernal_links.png
--------------------------------------------------------------------------------
/chapter2/images/link_extractor.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/link_extractor.png
--------------------------------------------------------------------------------
/chapter2/images/objects.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/objects.png
--------------------------------------------------------------------------------
/chapter2/images/packt_books.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/packt_books.png
--------------------------------------------------------------------------------
/chapter2/images/packtpub_links.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/packtpub_links.png
--------------------------------------------------------------------------------
/chapter2/images/packtpub_links2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/packtpub_links2.png
--------------------------------------------------------------------------------
/chapter2/images/packtpub_links_csv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/packtpub_links_csv.png
--------------------------------------------------------------------------------
/chapter2/images/packtpub_links_deep_search.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/packtpub_links_deep_search.png
--------------------------------------------------------------------------------
/chapter2/images/requests_extract_links.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/requests_extract_links.png
--------------------------------------------------------------------------------
/chapter2/images/requests_headers.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/requests_headers.png
--------------------------------------------------------------------------------
/chapter2/images/requests_post.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter2/images/requests_post.png
--------------------------------------------------------------------------------
/chapter3/code/books_scraping/requests_bs4_initial.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | import requests
3 |
4 | def processUrl(url):
5 | """
6 | Download and process the content of a URL using requests.
7 | Show an error message if the page cannot be loaded.
8 | """
9 | # http request
10 | req = requests.get(url)
11 |
12 | # We verify the request returns a Status Code = 200
13 | statusCode = req.status_code
14 | if statusCode == 200:
15 |
16 | # We pass the HTML content of the web to a BeautifulSoup() object
17 | html = BeautifulSoup(req.text,"lxml")
18 |
19 | # We process the downloaded HTML
20 | return processHTML(html,url)
21 |
22 | else:
23 | print ("ERROR {}".format(statusCode))
24 |
25 | def processHTML(html, url=""):
26 | """
27 | Process the HTML content of a web page
28 | html is a BS4 object
29 | url is the URL of the page contained in html_doc
30 | """
31 | # Decide here what you want to do with the content
32 | return
33 |
34 |
35 |
36 |
--------------------------------------------------------------------------------
/chapter3/code/books_scraping/requests_bs4_with_pages.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | import requests
3 | import pandas as pd
4 |
5 | # Class names representing product ratings
6 | star = ["One", "Two", "Three", "Four", "Five"]
7 |
8 | bookList = []
9 | url_page = "http://books.toscrape.com/catalogue/page-{}.html"
10 | url = "http://books.toscrape.com/catalogue/"
11 |
12 | def starToInt (rating):
13 | """
14 | Convert a textual rating to a numerical rating
15 | Returns the equivalent number, or 0, if the rating is not valid
16 | """
17 | try:
18 | return star.index(rating) + 1
19 | except:
20 | return 0
21 |
22 |
23 | def processUrl(url):
24 | """
25 | Download and process the content of a URL using requests.
26 | Show an error message if the page cannot be loaded.
27 | """
28 | # http request
29 | req = requests.get(url)
30 |
31 | # We verify the request returns a Status Code = 200
32 | statusCode = req.status_code
33 | if statusCode == 200:
34 |
35 | # We pass the HTML content of the web to a BeautifulSoup () object
36 | html = BeautifulSoup(req.text,"lxml")
37 |
38 | # We process the downloaded HTML
39 | return processHTML(html,url)
40 |
41 | else:
42 | print ("ERROR {}".format(statusCode))
43 |
44 | def processHTML(html, url=""):
45 | """
46 | Process the HTML content of a web page
47 | html is a BS4 object
48 | url is the URL of the page contained in html_doc
49 | """
50 | book = {}
51 |
52 | productMain = html.select_one(".product_main")
53 |
54 | # Title
55 | title = productMain.select_one("h1").text
56 | book['title'] = title
57 |
58 | # Price
59 | price = productMain.select_one("p.price_color").text
60 | book['price'] = price[2:]
61 |
62 | # Assessment
63 | # 1. Get class
64 | ratingClasses = productMain.select_one("p.star-rating")["class"]
65 |
66 | # 2. We get the rating class via intersection with the star list
67 | ratingText = list(set(ratingClasses).intersection(set(star)))
68 |
69 | # 3. We convert it to a numerical value
70 | if (len(ratingText)==1):
71 | book['assessment'] = starToInt(ratingText[0])
72 | else:
73 | book['assessment'] = 0
74 |
75 | # Processing the description requires looking for the sibling of an element
76 | # Product description
77 | # 1. We look for the element that contains the product description
78 | productDescription = html.find(id="product_description")
79 |
80 | # 2. We are looking for the next sibling with tag p
81 | if productDescription is None:
82 | book['descripcion'] = ""
83 | else:
84 | book['descripcion'] = productDescription.find_next_sibling('p').text
85 |
86 | print(book)
87 |
88 | return book
89 |
90 |
91 | def processCatalog(url, prefix):
92 | """
93 | Returns False if we have reached the end of the catalog, True otherwise
94 | """
95 | # We make the request to the web
96 | response = requests.get(url)
97 |
98 | # We verify that the request returns a Status Code = 200
99 | statusCode = response.status_code
100 | if statusCode == 200:
101 |
102 | # We pass the HTML content of the web to a BeautifulSoup () object
103 | html = BeautifulSoup(response.text,"lxml")
104 |
105 | # We process the downloaded HTML
106 | books = html.select('article.product_pod')
107 | for prod in books:
108 | link = prod.select_one('h3 > a')
109 | book = processUrl(prefix+link['href'])
110 | book['link'] = prefix+link['href']
111 | bookList.append(book)
112 | return True
113 |
114 | if statusCode == 404:
115 | return False
116 |
117 | if __name__ == "__main__":
118 |
119 | processUrl("http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html")
120 |
121 | for i in range(1,5):
122 | processCatalog(url_page.format(i), url)
123 |
124 | for book in bookList:
125 | print(book)
126 |
127 | #Finally we will load all the data in a panda dataframe to process it, extract information and save it to a CSV
128 |
129 | df = pd.DataFrame(bookList)
130 | df.to_csv("bookList.csv", sep=";", index=False)
131 |
132 |
133 |
134 |
--------------------------------------------------------------------------------
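
requests_bs4_with_pages.py finishes by writing bookList.csv with a semicolon separator. Below is a minimal sketch of reading that file back with pandas to sanity-check the scrape; it assumes the title, price and assessment columns produced by the script above, and the price summary only works for rows whose values parse as numbers.

import pandas as pd

# Read the semicolon-separated CSV written by requests_bs4_with_pages.py
df = pd.read_csv("bookList.csv", sep=";")

# Quick sanity checks on the scraped data
print(df.shape)                         # number of books and columns
print(df["title"].head())               # first few titles
print(df["assessment"].value_counts())  # distribution of star ratings
print(pd.to_numeric(df["price"], errors="coerce").describe())  # price summary
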
/chapter3/code/chromedriver.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/code/chromedriver.exe
--------------------------------------------------------------------------------
/chapter3/code/dolar-euro_converter.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | import time
3 |
4 | def get_currency_values():
5 | browser = webdriver.Chrome("chromedriver.exe")
6 | browser.get('http://www.xe.com/en/currencyconverter/convert/?Amount=1&From=USD&To=EUR')
7 | time.sleep(5)
8 | value = browser.find_element_by_xpath("//*[@id='converterResult']/div/div/div[2]/span[1]")
9 | one_dollar = value.text
10 | print('The dollar at this time has a value of: €{} EUROS'.format(one_dollar))
11 | browser.get('http://www.xe.com/en/currencyconverter/convert/?Amount=1&From=EUR&To=USD')
12 | time.sleep(5)
13 | value = browser.find_element_by_xpath("//*[@id='converterResult']/div/div/div[2]/span[1]")
14 | one_euro = value.text
15 | print('The euro currently has a value of ${}'.format(one_euro))
16 | one_dollar_float = float(one_dollar)
17 | one_euro_float = float(one_euro)
18 | operate(one_dollar_float, one_euro_float)
19 |
20 |
21 | def operate(one_dollar_float, one_euro_float):
22 |
23 | while True:
24 | command = str(input('''Select currency conversion:
25 | [1]Dollars to euros
26 | [2]Euros to dollars
27 | [e]exit'''))
28 |
29 | if command == '1':
30 | dollar_to_euro(one_dollar_float)
31 | elif command == '2':
32 | euro_to_dollar(one_euro_float)
33 | else:
34 | break
35 |
36 | def dollar_to_euro(one_dollar_float):
37 | dollar_amount = float(input('Dollars amount: '))
38 | result = one_dollar_float * dollar_amount
39 | print('${} Dollars are €{} Euros'.format(dollar_amount, result))
40 |
41 | def euro_to_dollar(one_euro_float):
42 | euros_amount = float(input('Euros amount: '))
43 | result = one_euro_float * euros_amount
44 | print('€{} Euros are ${} Dollars'.format(euros_amount, result))
45 |
46 |
47 | if __name__ == '__main__':
48 | get_currency_values()
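The script above relies on fixed time.sleep() calls and on the find_element_by_* helpers that were removed in Selenium 4. A minimal sketch of the same lookup with the Selenium 4 API and an explicit wait; the XPath is the one used above, so it may stop matching if xe.com changes its markup:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()  # Selenium 4 can resolve the driver binary itself
driver.get('http://www.xe.com/en/currencyconverter/convert/?Amount=1&From=USD&To=EUR')
# Wait up to 10 seconds for the result element instead of sleeping a fixed time
value = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located(
        (By.XPATH, "//*[@id='converterResult']/div/div/div[2]/span[1]")))
print(value.text)
driver.quit()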
--------------------------------------------------------------------------------
/chapter3/code/google_translate.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from bs4 import BeautifulSoup
5 | import requests
6 | import sys
7 | from selenium import webdriver
8 | import time
9 |
10 | #Example input to enter : en (= english)
11 | convert_from = input("Language to Convert from : ")
12 |
13 | #Example input to enter : es (= spanish)
14 | convert_to = input("Language to Convert to : ")
15 |
16 | text_to_convert = input("Text to translate: ")
17 |
18 | #replace spaces by + symbol
19 | text_to_convert = text_to_convert.replace(' ', '+')
20 |
21 | #call translate service
22 | url = 'https://translate.google.com/?sl=%s&tl=%s&text=%s' % (convert_from, convert_to, text_to_convert)
23 |
24 | browser = webdriver.Chrome("chromedriver.exe")
25 | browser.get(url)
26 |
27 | time.sleep(5)
28 |
29 | translation = browser.find_element_by_class_name("tlid-translation")
30 | translation2 = browser.find_element_by_xpath("/html/body/div[2]/div[1]/div[2]/div[1]/div[1]/div[2]/div[3]/div[1]/div[2]/div/span[1]/span")
31 |
32 | print("Text translated : ", translation2.text)
33 |
34 | browser.get_screenshot_as_file('google_translate.png')
35 | browser.close()
36 |
--------------------------------------------------------------------------------
/chapter3/code/interacting_with_form.py:
--------------------------------------------------------------------------------
1 | from selenium.webdriver.support.ui import WebDriverWait
2 | from selenium.common.exceptions import TimeoutException
3 | from selenium import webdriver
4 | import time
5 |
6 | url = "https://websistent.com/tools/htdigest-generator-tool/"
7 | user = "myUser"
8 |
9 | driver = webdriver.Chrome('chromedriver.exe')
10 | driver.get(url)
11 |
12 | element = driver.find_element_by_id("uname")
13 | element.send_keys(user)
14 |
15 | #If we go to the browser we will see that we have completed the first input of the form.
16 | #Then fill in the rest of inputs
17 |
18 | element = driver.find_element_by_id("realm")
19 | element.send_keys("myRealm")
20 |
21 | element = driver.find_element_by_id("word1")
22 | element.send_keys("mypassword")
23 |
24 | element = driver.find_element_by_id("word2")
25 | element.send_keys("mypassword")
26 |
27 | #Finally, we look for the button and click it
28 | driver.find_element_by_id("generate").click()
29 |
30 | # We wait 2 seconds before searching for the item
31 | #time.sleep(2)
32 |
33 | try:
34 | # We wait a maximum of 10 seconds while we wait for the "Loading" text to disappear
35 | WebDriverWait(driver, 10).until_not(lambda driver: driver.find_element_by_id("output").text.startswith("Loading"))
36 |
37 | output = driver.find_element_by_id("output").text
38 | print (output[output.find(user):])
39 |
40 | except TimeoutException:
41 | print("The realm could not be generated or the page has taken too long time to load")
42 |
43 | finally:
44 | driver.quit()
--------------------------------------------------------------------------------
/chapter3/code/phantomjs/phantomjs.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/code/phantomjs/phantomjs.exe
--------------------------------------------------------------------------------
/chapter3/code/phantomjs/phantomjs_example1.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 |
3 | driver = webdriver.PhantomJS("phantomjs.exe")
4 | driver.get("https://protonmail.com/")
5 | print(driver.find_element_by_class_name("homepage-hero-sub-title").text)
--------------------------------------------------------------------------------
/chapter3/code/phantomjs/phantomjs_example2.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from bs4 import BeautifulSoup
3 |
4 | browser = webdriver.PhantomJS("phantomjs.exe")
5 |
6 | browser.get("https://protonmail.com/")
7 | page = BeautifulSoup(browser.page_source,"lxml")
8 | images = page.findAll("img")
9 | for image in images:
10 | print(image.get('src'))
11 | browser.close()
--------------------------------------------------------------------------------
/chapter3/code/phantomjs/phantomjs_example3.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from selenium.webdriver.common.by import By
3 | from selenium.webdriver.support.ui import WebDriverWait
4 | from selenium.webdriver.support import expected_conditions as EC
5 |
6 | driver = webdriver.PhantomJS("phantomjs.exe")
7 |
8 | driver.get("https://httpbin.org/#/HTTP_Methods/post_post")
9 |
10 | driver.find_element_by_class_name("opblock-summary-description").click()
11 |
12 | try:
13 | element = WebDriverWait(driver, 15).until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, ".btn"), "Try it out"))
14 |
15 | finally:
16 | driver.get_screenshot_as_file("image.png")
17 |
18 | driver.close()
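PhantomJS development has been suspended and recent Selenium releases no longer support it, so these examples may fail on a current installation. The same headless behaviour can be reproduced with headless Chrome; a minimal sketch of phantomjs_example1.py under that assumption (the class name is the one used above and may change if the site is redesigned):

from selenium import webdriver
from selenium.webdriver.common.by import By

options = webdriver.ChromeOptions()
options.add_argument('--headless')  # run Chrome without opening a window
driver = webdriver.Chrome(options=options)
driver.get("https://protonmail.com/")
print(driver.find_element(By.CLASS_NAME, "homepage-hero-sub-title").text)
driver.quit()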
--------------------------------------------------------------------------------
/chapter3/code/scraping_book_details_requests.py:
--------------------------------------------------------------------------------
1 | from lxml import html
2 | import csv
3 | import json
4 | import requests
5 |
6 | def parse(url):
7 | headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}
8 | response = requests.get(url, headers=headers)
9 | doc = html.fromstring(response.content)
10 | title_xpath = '//*[@id="maincontent"]/div[3]/div/div[1]/div[1]/h1/span/text()'
11 | author_xpath = '//*[@id="maincontent"]/div[3]/div/div[1]/div[2]/div[2]/text()'
12 | date_xpath = '//*[@id="maincontent"]/div[3]/div/div[1]/div[2]/div[3]/text()'
13 | pages_xpath = '//*[@id="maincontent"]/div[3]/div/div[1]/div[2]/p[1]/text()'
14 | title = doc.xpath(title_xpath)    # keep the result lists so missing fields fall back to None below
15 | author = doc.xpath(author_xpath)
16 | date = doc.xpath(date_xpath)
17 | pages = doc.xpath(pages_xpath)
18 |
19 | title = ' '.join(''.join(title).split()) if title else None
20 | author = ' '.join(''.join(author).split()) if author else None
21 | date = ' '.join(''.join(date).split()) if date else None
22 | pages = ' '.join(''.join(pages).split()) if pages else None
23 |
24 | data = {'Title': title,'Author': author,'Date': date,'Pages': pages}
25 | print(data)
26 |
27 | return data
28 |
29 |
30 |
31 | def ScrapingBookData():
32 |
33 | bookList = ['big-data-and-business-intelligence/machine-learning-opencv',
34 | 'big-data-and-business-intelligence/hands-generative-adversarial-networks-keras']
35 |
36 | extracted_data = []
37 |
38 | for i in bookList:
39 | url = "https://www.packtpub.com/" + i
40 | print("Processing: " + url)
41 | # Calling the parser
42 | parsed_data = parse(url)
43 | if parsed_data:
44 | extracted_data.append(parsed_data)
45 | #Save the collected data into a json file.
46 | with open('book_data.json', 'w') as file_json:
47 |     json.dump(extracted_data, file_json, indent=4)
48 |
49 | # Writing scraped data book to csv file
50 | with open('scraped_book_data.csv', 'w') as csvfile:
51 | fieldnames = ['Title','Author','Date','Pages']
52 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
53 | writer.writeheader()
54 | for data in extracted_data:
55 | writer.writerow(data)
56 |
57 | if __name__ == "__main__":
58 | ScrapingBookData()
--------------------------------------------------------------------------------
/chapter3/code/selenium_list_book.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | from selenium import webdriver
5 | from bs4 import BeautifulSoup
6 | import requests
7 | import pandas as pd
8 |
9 | driver = webdriver.Chrome("chromedriver.exe")
10 |
11 | driver.get('https://www.packtpub.com/gb/web-development/web-programming')
12 | content = driver.page_source
13 |
14 | soup = BeautifulSoup(content,'lxml')
15 |
16 | books=[] #List to store book titles
17 | authors=[] #List to store authors
18 | dates=[] #List to store dates
19 |
20 |
21 | for element in soup.findAll('div', attrs={'class':'card h-100'}):
22 | title = element.find('h5', attrs={'class':'card-title mt-0'})
23 | author = element.find('div', attrs={'class':'author-names'})
24 | meta = element.find('div', attrs={'class':'product-meta'})
25 | if title is not None:
26 | print(title.contents[0].strip())
27 | title_text = title.contents[0].strip()
28 | else:
29 | title_text = ''
30 |
31 | if author is not None:
32 | author_text = author.find('p').text
33 | else:
34 | author_text = ''
35 |
36 | if meta is not None:
37 | date_text = meta.findChild().text
38 | else:
39 | date_text = ''
40 |
41 |
42 | books.append(title_text)
43 | authors.append(author_text)
44 | dates.append(date_text)
45 |
46 | df = pd.DataFrame({'Book title':books,'Author':authors,'Date':dates})
47 | df.to_csv('books.csv', index=False, encoding='utf-8')
48 |
--------------------------------------------------------------------------------
/chapter3/code/stack_overflow_tags.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 |
3 | driver = webdriver.Chrome("chromedriver.exe")
4 | driver.get("https://stackoverflow.com/tags")
5 | tags = driver.find_elements_by_class_name("post-tag")
6 | for tag in tags:
7 |     print(tag.text)
--------------------------------------------------------------------------------
/chapter3/images/ajax_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/images/ajax_image.png
--------------------------------------------------------------------------------
/chapter3/images/book_info.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/images/book_info.png
--------------------------------------------------------------------------------
/chapter3/images/book_packit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/images/book_packit.png
--------------------------------------------------------------------------------
/chapter3/images/books_details.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/images/books_details.png
--------------------------------------------------------------------------------
/chapter3/images/books_packit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/images/books_packit.png
--------------------------------------------------------------------------------
/chapter3/images/converter.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/images/converter.png
--------------------------------------------------------------------------------
/chapter3/images/google_translate.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/images/google_translate.png
--------------------------------------------------------------------------------
/chapter3/images/selenium_methods.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/images/selenium_methods.png
--------------------------------------------------------------------------------
/chapter3/images/xpath.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter3/images/xpath.png
--------------------------------------------------------------------------------
/chapter4/BooksSpider-multipage-details/books_crawler/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/BooksSpider-multipage-details/books_crawler/__init__.py
--------------------------------------------------------------------------------
/chapter4/BooksSpider-multipage-details/books_crawler/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class BooksCrawlerItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | pass
15 |
--------------------------------------------------------------------------------
/chapter4/BooksSpider-multipage-details/books_crawler/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class BooksCrawlerPipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
--------------------------------------------------------------------------------
/chapter4/BooksSpider-multipage-details/books_crawler/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for books_crawler project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'books_crawler'
13 |
14 | SPIDER_MODULES = ['books_crawler.spiders']
15 | NEWSPIDER_MODULE = 'books_crawler.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'books_crawler (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'books_crawler.middlewares.MyCustomSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'books_crawler.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | #ITEM_PIPELINES = {
68 | # 'books_crawler.pipelines.SomePipeline': 300,
69 | #}
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/chapter4/BooksSpider-multipage-details/books_crawler/spiders/BooksSpider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from scrapy import Spider
3 | from scrapy.http import Request
4 |
5 |
6 | def product_info(response, value):
7 | return response.xpath('//th[text()="' + value + '"]/following-sibling::td/text()').extract_first()
8 |
9 |
10 | class BooksSpider(Spider):
11 | name = 'BooksSpider'
12 | allowed_domains = ['books.toscrape.com']
13 | start_urls = ['http://books.toscrape.com']
14 |
15 | def parse(self, response):
16 | books = response.xpath('//h3/a/@href').extract()
17 | for book in books:
18 | absolute_url = response.urljoin(book)
19 | yield Request(absolute_url, callback=self.parse_book)
20 |
21 | # process next page
22 | next_page_url = response.xpath('//a[text()="next"]/@href').extract_first()
23 | if next_page_url:  # the last catalogue page has no "next" link
24 |     yield Request(response.urljoin(next_page_url))
25 |
26 | def parse_book(self, response):
27 | title = response.css('h1::text').extract_first()
28 | price = response.xpath('//*[@class="price_color"]/text()').extract_first()
29 |
30 | image_url = response.xpath('//img/@src').extract_first()
31 | image_url = image_url.replace('../..', 'http://books.toscrape.com/')
32 |
33 | rating = response.xpath('//*[contains(@class, "star-rating")]/@class').extract_first()
34 | rating = rating.replace('star-rating ', '')
35 |
36 | description = response.xpath(
37 | '//*[@id="product_description"]/following-sibling::p/text()').extract_first()
38 |
39 | # book information data
40 | product_type = product_info(response, 'Product Type')
41 | price_without_tax = product_info(response, 'Price (excl. tax)')
42 | price_with_tax = product_info(response, 'Price (incl. tax)')
43 | tax = product_info(response, 'Tax')
44 | availability = product_info(response, 'Availability')
45 | number_of_reviews = product_info(response, 'Number of reviews')
46 |
47 | yield {
48 | 'title': title,
49 | 'price': price,
50 | 'image_url': image_url,
51 | 'rating': rating,
52 | 'description': description,
53 | 'product_type': product_type,
54 | 'price_without_tax': price_without_tax,
55 | 'price_with_tax': price_with_tax,
56 | 'tax': tax,
57 | 'availability': availability,
58 | 'number_of_reviews': number_of_reviews
59 | }
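The usual way to run this spider is scrapy crawl BooksSpider -o output.json from the project root. It can also be launched from a plain Python script with CrawlerProcess; a minimal sketch, assuming it is executed from the project root so that books_crawler is importable and that the installed Scrapy supports the FEEDS setting (2.1+):

from scrapy.crawler import CrawlerProcess
from books_crawler.spiders.BooksSpider import BooksSpider

process = CrawlerProcess(settings={
    # export every scraped item to a JSON file (FEEDS requires Scrapy >= 2.1)
    "FEEDS": {"output.json": {"format": "json"}},
})
process.crawl(BooksSpider)
process.start()  # blocks until the crawl finishes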
--------------------------------------------------------------------------------
/chapter4/BooksSpider-multipage-details/books_crawler/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/chapter4/BooksSpider-multipage-details/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = books_crawler.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = books_crawler
12 |
--------------------------------------------------------------------------------
/chapter4/BooksSpider-urls/books_crawler/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/BooksSpider-urls/books_crawler/__init__.py
--------------------------------------------------------------------------------
/chapter4/BooksSpider-urls/books_crawler/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class BooksCrawlerItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | pass
15 |
--------------------------------------------------------------------------------
/chapter4/BooksSpider-urls/books_crawler/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class BooksCrawlerPipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
--------------------------------------------------------------------------------
/chapter4/BooksSpider-urls/books_crawler/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for books_crawler project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'books_crawler'
13 |
14 | SPIDER_MODULES = ['books_crawler.spiders']
15 | NEWSPIDER_MODULE = 'books_crawler.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'books_crawler (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'books_crawler.middlewares.MyCustomSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'books_crawler.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | #ITEM_PIPELINES = {
68 | # 'books_crawler.pipelines.SomePipeline': 300,
69 | #}
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/chapter4/BooksSpider-urls/books_crawler/spiders/BooksSpider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from scrapy import Spider
3 | from scrapy.http import Request
4 |
5 |
6 | class BooksSpider(Spider):
7 | name = 'BooksSpider'
8 | allowed_domains = ['books.toscrape.com']
9 | start_urls = ['http://books.toscrape.com']
10 |
11 | def parse(self, response):
12 | books = response.xpath('//h3/a/@href').extract()
13 | for book in books:
14 | absolute_url = response.urljoin(book)
15 | yield Request(absolute_url, callback=self.parse_book)
16 |
17 | # process next page
18 | next_page_url = response.xpath('//a[text()="next"]/@href').extract_first()
19 | if next_page_url:  # the last catalogue page has no "next" link
20 |     yield Request(response.urljoin(next_page_url))
21 |
22 | def parse_book(self, response):
23 | yield { 'book_url': response.url}
24 |
--------------------------------------------------------------------------------
/chapter4/BooksSpider-urls/books_crawler/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/chapter4/BooksSpider-urls/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = books_crawler.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = books_crawler
12 |
--------------------------------------------------------------------------------
/chapter4/BooksSpider-urls_download_images/books_crawler/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/BooksSpider-urls_download_images/books_crawler/__init__.py
--------------------------------------------------------------------------------
/chapter4/BooksSpider-urls_download_images/books_crawler/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 | class BooksCrawlerItem(scrapy.Item):
11 | title = scrapy.Field()
12 | price = scrapy.Field()
13 |
14 | image_urls = scrapy.Field()
15 | images = scrapy.Field()
16 |
--------------------------------------------------------------------------------
/chapter4/BooksSpider-urls_download_images/books_crawler/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class BooksCrawlerPipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
--------------------------------------------------------------------------------
/chapter4/BooksSpider-urls_download_images/books_crawler/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for books_crawler project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'books_crawler'
13 |
14 | SPIDER_MODULES = ['books_crawler.spiders']
15 | NEWSPIDER_MODULE = 'books_crawler.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'books_crawler (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'books_crawler.middlewares.MyCustomSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'books_crawler.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'scrapy.pipelines.images.ImagesPipeline': 1,
69 | }
70 | IMAGES_STORE = './images_store'
71 |
72 | # Enable and configure the AutoThrottle extension (disabled by default)
73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
74 | #AUTOTHROTTLE_ENABLED = True
75 | # The initial download delay
76 | #AUTOTHROTTLE_START_DELAY = 5
77 | # The maximum download delay to be set in case of high latencies
78 | #AUTOTHROTTLE_MAX_DELAY = 60
79 | # The average number of requests Scrapy should be sending in parallel to
80 | # each remote server
81 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
82 | # Enable showing throttling stats for every response received:
83 | #AUTOTHROTTLE_DEBUG = False
84 |
85 | # Enable and configure HTTP caching (disabled by default)
86 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
87 | #HTTPCACHE_ENABLED = True
88 | #HTTPCACHE_EXPIRATION_SECS = 0
89 | #HTTPCACHE_DIR = 'httpcache'
90 | #HTTPCACHE_IGNORE_HTTP_CODES = []
91 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
92 |
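Two practical notes on the pipeline enabled above: scrapy.pipelines.images.ImagesPipeline needs the Pillow library installed, and by default it stores each file under IMAGES_STORE/full/ using a SHA1 hash of the URL as the file name. If friendlier names are wanted, a custom subclass can override file_path; a sketch assuming Scrapy 2.4+, where file_path receives the scraped item (BookCoverPipeline is an illustrative name and would replace the ImagesPipeline entry in ITEM_PIPELINES):

from scrapy.pipelines.images import ImagesPipeline


class BookCoverPipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        # the spider stores 'title' as a list; fall back to the default hash-based name
        title = item.get('title') if item else None
        if title:
            return 'full/{}.jpg'.format(title[0].replace('/', '_'))
        return super().file_path(request, response=response, info=info, item=item)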
--------------------------------------------------------------------------------
/chapter4/BooksSpider-urls_download_images/books_crawler/spiders/BooksSpider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from scrapy import Spider
3 | from scrapy.http import Request
4 | from scrapy.loader import ItemLoader
5 | from books_crawler.items import BooksCrawlerItem
6 |
7 |
8 | class BooksSpider(Spider):
9 | name = 'BooksSpider'
10 | allowed_domains = ['books.toscrape.com']
11 | start_urls = ['http://books.toscrape.com']
12 |
13 | def parse(self, response):
14 | books = response.xpath('//h3/a/@href').extract()
15 | for book in books:
16 | absolute_url = response.urljoin(book)
17 | yield Request(absolute_url, callback=self.parse_book)
18 |
19 | # process next page
20 | next_page_url = response.xpath('//a[text()="next"]/@href').extract_first()
21 | if next_page_url:  # the last catalogue page has no "next" link
22 |     yield Request(response.urljoin(next_page_url))
23 |
24 | def parse_book(self, response):
25 | item_loader = ItemLoader(item=BooksCrawlerItem(), response=response)
26 |
27 | title = response.css('h1::text').extract_first()
28 | price = response.xpath('//*[@class="price_color"]/text()').extract_first()
29 |
30 | image_urls = response.xpath('//img/@src').extract_first()
31 | image_urls = image_urls.replace('../..', 'http://books.toscrape.com/')
32 |
33 | item_loader.add_value('title', title)
34 | item_loader.add_value('price', price)
35 | item_loader.add_value('image_urls', image_urls)
36 |
37 | return item_loader.load_item()
38 |
39 |
--------------------------------------------------------------------------------
/chapter4/BooksSpider-urls_download_images/books_crawler/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/chapter4/BooksSpider-urls_download_images/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = books_crawler.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = books_crawler
12 |
--------------------------------------------------------------------------------
/chapter4/europython/europython/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/europython/europython/__init__.py
--------------------------------------------------------------------------------
/chapter4/europython/europython/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/europython/europython/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/chapter4/europython/europython/__pycache__/items.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/europython/europython/__pycache__/items.cpython-37.pyc
--------------------------------------------------------------------------------
/chapter4/europython/europython/__pycache__/pipelines.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/europython/europython/__pycache__/pipelines.cpython-37.pyc
--------------------------------------------------------------------------------
/chapter4/europython/europython/__pycache__/settings.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/europython/europython/__pycache__/settings.cpython-37.pyc
--------------------------------------------------------------------------------
/chapter4/europython/europython/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 | from scrapy.loader.processors import Compose, MapCompose, Join
10 |
11 | clean_text = Compose(MapCompose(lambda v: v.strip()), Join())
12 |
13 | def custom_field(text):
14 | text = clean_text(text)
15 | return text.strip()
16 |
17 | class EuropythonItem(scrapy.Item):
18 | # define the fields for your item here like:
19 | # name = scrapy.Field()
20 | title = scrapy.Field(output_processor=custom_field)
21 | author = scrapy.Field(output_processor=custom_field)
22 | description = scrapy.Field(output_processor=custom_field)
23 | date = scrapy.Field(output_processor=custom_field)
24 | tags = scrapy.Field(output_processor=custom_field)
25 |
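The Compose/MapCompose/Join chain used as the output processor strips each extracted text fragment and then joins the fragments with spaces. A minimal standalone illustration of that behaviour (in recent Scrapy versions the same processors are also exposed in the itemloaders.processors module):

from scrapy.loader.processors import Compose, MapCompose, Join

clean_text = Compose(MapCompose(lambda v: v.strip()), Join())

# MapCompose strips every fragment, Join glues them together with spaces
print(clean_text(['  Web scraping ', ' with Scrapy\n']))   # -> 'Web scraping with Scrapy'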
--------------------------------------------------------------------------------
/chapter4/europython/europython/middlewares.py:
--------------------------------------------------------------------------------
1 | # Import the base64 library; it is only needed if the proxy we are going to use requires authentication
2 | import base64
3 |
4 | # Start your middleware class
5 | class ProxyMiddleware(object):
6 | # overwrite process request
7 | def process_request(self, request, spider):
8 | # Set the location of the proxy (e.g. "http://host:port")
9 | request.meta['proxy'] = "proxy_server"
10 |
11 | # Use the following lines if your proxy requires authentication
12 | proxy_user_pass = "user:password"
13 | # setup basic authentication for the proxy
14 | encoded_user_pass = base64.b64encode(proxy_user_pass.encode()).decode()
15 | request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
16 |
--------------------------------------------------------------------------------
/chapter4/europython/europython/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | import scrapy
9 | from scrapy import signals
10 | from scrapy.exporters import CsvItemExporter
11 | from scrapy.exporters import XmlItemExporter
12 | import codecs
13 | import json
14 | import csv
15 |
16 | class EuropythonJsonExport(object):
17 | def __init__(self):
18 | self.file = codecs.open('europython_items.json', 'w+b', encoding='utf-8')
19 |
20 | def process_item(self, item, spider):
21 | line = json.dumps(dict(item), ensure_ascii=False) + "\n"
22 | self.file.write(line)
23 | return item
24 |
25 | def spider_closed(self, spider):
26 | self.file.close()
27 |
28 | class EuropythonXmlExport(object):
29 |
30 | def __init__(self):
31 | self.files = {}
32 |
33 | @classmethod
34 | def from_crawler(cls, crawler):
35 | pipeline = cls()
36 | crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
37 | crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
38 | return pipeline
39 |
40 | def spider_opened(self, spider):
41 | file = open('europython_items.xml', 'w+b')
42 | self.files[spider] = file
43 | self.exporter = XmlItemExporter(file)
44 | self.exporter.start_exporting()
45 |
46 | def spider_closed(self, spider):
47 | self.exporter.finish_exporting()
48 | file = self.files.pop(spider)
49 | file.close()
50 |
51 | def process_item(self, item, spider):
52 | self.exporter.export_item(item)
53 | return item
54 |
55 | class EuropythonCSVExport(object):
56 |
57 | def __init__(self):
58 | self.files = {}
59 |
60 | @classmethod
61 | def from_crawler(cls, crawler):
62 | pipeline = cls()
63 | crawler.signals.connect(pipeline.spider_opened, signals.spider_opened)
64 | crawler.signals.connect(pipeline.spider_closed, signals.spider_closed)
65 | return pipeline
66 |
67 | def spider_opened(self, spider):
68 | file = open('europython_items.csv', 'w+b')
69 | self.files[spider] = file
70 | self.exporter = CsvItemExporter(file)
71 | self.exporter.start_exporting()
72 |
73 | def spider_closed(self, spider):
74 | self.exporter.finish_exporting()
75 | file = self.files.pop(spider)
76 | file.close()
77 |
78 | def process_item(self, item, spider):
79 | self.exporter.export_item(item)
80 | return item
81 |
--------------------------------------------------------------------------------
/chapter4/europython/europython/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for europython project
4 | #
5 | # For simplicity, this file contains only the most important settings by
6 | # default. All the other settings are documented here:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | #
10 |
11 | BOT_NAME = 'europython'
12 |
13 | SPIDER_MODULES = ['europython.spiders']
14 | NEWSPIDER_MODULE = 'europython.spiders'
15 |
16 |
17 | # Configure item pipelines
18 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
19 | ITEM_PIPELINES = {
20 | 'europython.pipelines.EuropythonJsonExport': 100,
21 | 'europython.pipelines.EuropythonXmlExport': 200,
22 | 'europython.pipelines.EuropythonCSVExport': 300,
23 | }
24 |
25 | DOWNLOADER_MIDDLEWARES = {
26 | 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 110,
27 | #'europython.middlewares.ProxyMiddleware': 100,
28 | }
29 |
30 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
31 | #USER_AGENT = 'europython (+http://www.yourdomain.com)'
32 |
--------------------------------------------------------------------------------
/chapter4/europython/europython/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/chapter4/europython/europython/spiders/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/europython/europython/spiders/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/chapter4/europython/europython/spiders/__pycache__/europython_spider.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/europython/europython/spiders/__pycache__/europython_spider.cpython-37.pyc
--------------------------------------------------------------------------------
/chapter4/europython/europython/spiders/europython_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from scrapy.spiders import CrawlSpider, Rule
4 | from scrapy.linkextractors import LinkExtractor
5 | from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
6 | from scrapy.loader import ItemLoader
7 |
8 | from europython.items import EuropythonItem
9 |
10 |
11 | class EuropythonSpider(CrawlSpider):
12 | def __init__(self, year='', *args, **kwargs):
13 | super(EuropythonSpider, self).__init__(*args, **kwargs)
14 | self.year = year
15 | self.start_urls = ['http://ep'+str(self.year)+".europython.eu/en/events/sessions"]
16 | print('start url: '+str(self.start_urls[0]))
17 |
18 | name = "europython_spider"
19 | allowed_domains = ["ep2015.europython.eu","ep2016.europython.eu", "ep2017.europython.eu","ep2018.europython.eu","ep2019.europython.eu"]
20 |
21 | # Patterns for entries that match the conference/talks and /talks URL formats
22 | rules = [Rule(LxmlLinkExtractor(allow=['conference/talks']),callback='process_response'),
23 | Rule(LxmlLinkExtractor(allow=['talks']),callback='process_response_europython2019')]
24 |
25 | def process_response(self, response):
26 | itemLoader = ItemLoader(item=EuropythonItem(), response=response)
27 | itemLoader.add_xpath('title', "//div[contains(@class, 'grid-100')]//h1/text()")
28 | itemLoader.add_xpath('author', "//div[contains(@class, 'talk-speakers')]//a[1]/text()")
29 | itemLoader.add_xpath('description', "//div[contains(@class, 'cms')]//p//text()")
30 | itemLoader.add_xpath('date', "//section[contains(@class, 'talk when')]/strong/text()")
31 | itemLoader.add_xpath('tags', "//div[contains(@class, 'all-tags')]/span/text()")
32 | item = itemLoader.load_item()
33 | return item
34 |
35 | def process_response_europython2019(self, response):
36 | item = EuropythonItem()
37 | print(response)
38 | item['title'] = response.xpath("//*[@id='talk_page']/div/div/div[1]/h1/text()").extract()
39 | item['author'] = response.xpath("//*[@id='talk_page']/div/div/div[1]/h5/a/text()").extract()
40 | item['description'] = response.xpath("//*[@id='talk_page']/div/div/div[1]/p[3]/text()").extract()
41 | item['date'] = "July 2019"
42 | item['tags'] = response.xpath("//span[contains(@class, 'badge badge-secondary')]/text()").extract()
43 |
44 | return item
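The year parameter accepted by __init__ is a regular Scrapy spider argument, so it can be supplied on the command line with scrapy crawl europython_spider -a year=2019. A minimal sketch of the programmatic equivalent, assuming it runs from the project root so that the project settings (and therefore the export pipelines) are picked up:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from europython.spiders.europython_spider import EuropythonSpider

process = CrawlerProcess(get_project_settings())
process.crawl(EuropythonSpider, year='2019')  # keyword arguments reach __init__
process.start()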
--------------------------------------------------------------------------------
/chapter4/europython/scrapinghub.yml:
--------------------------------------------------------------------------------
1 | project: 366126
2 |
--------------------------------------------------------------------------------
/chapter4/europython/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html
5 |
6 | [settings]
7 | default = europython.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = europython
12 |
13 |
14 |
--------------------------------------------------------------------------------
/chapter4/europython/setup.py:
--------------------------------------------------------------------------------
1 | # Automatically created by: shub deploy
2 |
3 | from setuptools import setup, find_packages
4 |
5 | setup(
6 | name = 'project',
7 | version = '1.0',
8 | packages = find_packages(),
9 | entry_points = {'scrapy': ['settings = europython.settings']},
10 | )
11 |
--------------------------------------------------------------------------------
/chapter4/images/book_details.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/book_details.png
--------------------------------------------------------------------------------
/chapter4/images/books_images.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/books_images.png
--------------------------------------------------------------------------------
/chapter4/images/books_images_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/books_images_output.png
--------------------------------------------------------------------------------
/chapter4/images/europython_talk.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/europython_talk.png
--------------------------------------------------------------------------------
/chapter4/images/next_page.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/next_page.png
--------------------------------------------------------------------------------
/chapter4/images/scrapy_books.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/scrapy_books.png
--------------------------------------------------------------------------------
/chapter4/images/scrapy_books_links.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/scrapy_books_links.png
--------------------------------------------------------------------------------
/chapter4/images/scrapy_options.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/scrapy_options.png
--------------------------------------------------------------------------------
/chapter4/images/scrapy_project.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/scrapy_project.png
--------------------------------------------------------------------------------
/chapter4/images/scrapy_shell.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/scrapy_shell.png
--------------------------------------------------------------------------------
/chapter4/images/scrapy_shell2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Advanced-Web-Scraping-with-Python/6624b71b2889a6fcfa3f080a6e15b979e582cce6/chapter4/images/scrapy_shell2.png
--------------------------------------------------------------------------------
/chapter4/output.json:
--------------------------------------------------------------------------------
1 | [
2 | {"URL": "http://books.toscrape.com/index.html", "image_link": ["media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg", "media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg", "media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg", "media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg", "media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg", "media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.jpg", "media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg", "media/cache/3d/54/3d54940e57e662c4dd1f3ff00c78cc64.jpg", "media/cache/66/88/66883b91f6804b2323c8369331cb7dd1.jpg", "media/cache/58/46/5846057e28022268153beff6d352b06c.jpg", "media/cache/be/f4/bef44da28c98f905a3ebec0b87be8530.jpg", "media/cache/10/48/1048f63d3b5061cd2f424d20b3f9b666.jpg", "media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg", "media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg", "media/cache/81/c4/81c4a973364e17d01f217e1188253d5e.jpg", "media/cache/54/60/54607fe8945897cdcced0044103b10b6.jpg", "media/cache/55/33/553310a7162dfbc2c6d19a84da0df9e1.jpg", "media/cache/09/a3/09a3aef48557576e1a85ba7efea8ecb7.jpg", "media/cache/0b/bc/0bbcd0a6f4bcd81ccb1049a52736406e.jpg", "media/cache/27/a5/27a53d0bb95bdd88288eaf66c9230d7e.jpg"]},
3 | {"URL": "http://books.toscrape.com/catalogue/category/books/travel_2/index.html", "image_link": ["../../../../media/cache/27/a5/27a53d0bb95bdd88288eaf66c9230d7e.jpg", "../../../../media/cache/57/77/57770cac1628f4407636635f4b85e88c.jpg", "../../../../media/cache/9a/7e/9a7e63f12829df4b43b31d110bf3dc2e.jpg", "../../../../media/cache/d5/bf/d5bf0090470b0b8ea46d9c166f7895aa.jpg", "../../../../media/cache/98/c2/98c2e95c5fd1a4e7cd5f2b63c52826cb.jpg", "../../../../media/cache/4e/15/4e15150388702ebca2c5a523ac270539.jpg", "../../../../media/cache/76/de/76de41867f323d7f1f4fbe2fdfc1b2ba.jpg", "../../../../media/cache/db/46/db46159b05faa5d95262112bf9c29ddd.jpg", "../../../../media/cache/e0/4f/e04f8eda2a2fa947aec17640202d9ab0.jpg", "../../../../media/cache/06/81/0681530a7bc301caf5c3257e1b0f0750.jpg", "../../../../media/cache/d7/0f/d70f7edd92705c45a82118c3ff6c299d.jpg"]},
4 | {"URL": "http://books.toscrape.com/catalogue/category/books/classics_6/index.html", "image_link": ["../../../../media/cache/c5/46/c5465a06182ed6ebfa40d049258a2f58.jpg", "../../../../media/cache/4a/1b/4a1b6e9c1af75db0dc34ae63344f6883.jpg", "../../../../media/cache/45/bb/45bb59d19eb3aa868293d44809078418.jpg", "../../../../media/cache/1f/b0/1fb03cdabe6001c8a2620f65e025cbd5.jpg", "../../../../media/cache/81/f5/81f559ebe403317226fa8b611e35ce8a.jpg", "../../../../media/cache/27/82/2782701b5c877cb063065b9fc14c5b13.jpg", "../../../../media/cache/e3/c4/e3c4aba2409bb769a6488805e3fc4709.jpg", "../../../../media/cache/10/db/10db56354b4550d92270c6f097d9bebc.jpg", "../../../../media/cache/93/4e/934e966c1ddf559d3ac2b5c1407aaf1e.jpg", "../../../../media/cache/a6/72/a67245346daa38c2b23a4fc64c6e7115.jpg", "../../../../media/cache/42/c4/42c48f11b7e70a0f76c5ba9cb5c5018a.jpg", "../../../../media/cache/dd/6e/dd6e7b84e99f3b4b5655ea0db74af2b4.jpg", "../../../../media/cache/21/bf/21bf2eb0bff3134837def8bd40845ba0.jpg", "../../../../media/cache/ab/16/ab16eb035cc58809a73c4699477de9cb.jpg", "../../../../media/cache/c0/78/c078355608dd81c7c5e4f5e1c5f73d23.jpg", "../../../../media/cache/7d/53/7d53e2264b9647ee307259be9f73585d.jpg", "../../../../media/cache/0f/ca/0fca4597765ffacdb7bd529fc5eb88fa.jpg", "../../../../media/cache/09/63/09638baaef52f03827c215029c632a13.jpg", "../../../../media/cache/96/ee/96ee77d71a31b7694dac6855f6affe4e.jpg"]},
5 | {"URL": "http://books.toscrape.com/catalogue/category/books/philosophy_7/index.html", "image_link": ["../../../../media/cache/65/71/6571919836ec51ed54f0050c31d8a0cd.jpg", "../../../../media/cache/71/df/71df730cf38c232ee58a2e407135f055.jpg", "../../../../media/cache/ea/04/ea0476a6f4c318ceccf5e2f2b39f2b15.jpg", "../../../../media/cache/3f/ef/3fef12d9da503693af12997c0ea0897f.jpg", "../../../../media/cache/05/ce/05ce699eaf78c0fae20308497c4f496a.jpg", "../../../../media/cache/de/76/de76d5c473c358bd41c03cf710692bfb.jpg", "../../../../media/cache/12/6e/126ef8f6473b81808ebbb9cff155e883.jpg", "../../../../media/cache/91/e6/91e6190dcdd7d6cdeb94a82b60917ec4.jpg", "../../../../media/cache/f0/aa/f0aa9ae0319b1d6e0706e6053020e696.jpg", "../../../../media/cache/df/c9/dfc9ed72e963572d23233b3a8cb01676.jpg", "../../../../media/cache/ab/45/ab45f300aa15066ad1260d6f1398d03e.jpg"]},
6 | {"URL": "http://books.toscrape.com/catalogue/category/books/sequential-art_5/index.html", "image_link": ["../../../../media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg", "../../../../media/cache/36/df/36df4caaf1420b1183a8235355d39e69.jpg", "../../../../media/cache/c4/dd/c4ddd9ced89966b0602ec85e00cd5b61.jpg", "../../../../media/cache/f4/79/f479de5f305c2ac0512702cf7155bb74.jpg", "../../../../media/cache/e1/ea/e1ea6cb36e62ae6dc7b805f68ab9a700.jpg", "../../../../media/cache/f3/ef/f3efd43ae0fa85d9b325d5e8783e7af5.jpg", "../../../../media/cache/78/0b/780b2c28122750c2c383846155815bf7.jpg", "../../../../media/cache/c8/2f/c82f629a31b3f47bdb17ac14aa51076d.jpg", "../../../../media/cache/01/72/01726c619a05114dca75bd840095016d.jpg", "../../../../media/cache/cb/00/cb004189f548d75ad430d3ed19e6daa9.jpg", "../../../../media/cache/03/88/03886a8502ca54dbce0d91c2568ab69d.jpg", "../../../../media/cache/d3/15/d3158e8d3546fb90cced3c1d44a92a34.jpg", "../../../../media/cache/7e/a0/7ea062007ef00107e3c16d336b41fab2.jpg", "../../../../media/cache/5f/b1/5fb1bf88dcfda795606745ce35be5975.jpg", "../../../../media/cache/aa/74/aa74004807e97a79aa084b5db329a99b.jpg", "../../../../media/cache/16/d4/16d443437126bf6d536a89312c1995a5.jpg", "../../../../media/cache/90/6f/906f0168b0e155a7077625499b1737b5.jpg", "../../../../media/cache/78/97/7897eea91c4a85aca58d925861d4afec.jpg", "../../../../media/cache/f6/88/f688a9d6a89fdf38e4e88439ee9eda69.jpg", "../../../../media/cache/dd/c9/ddc95df6754df8e71bf969c088056188.jpg"]},
7 | {"URL": "http://books.toscrape.com/catalogue/category/books/womens-fiction_9/index.html", "image_link": ["../../../../media/cache/5f/72/5f72c8a0d5a7292e2929a354ec8a022f.jpg", "../../../../media/cache/16/e3/16e3ca741956485119251e7442a67e2e.jpg", "../../../../media/cache/ae/ac/aeac003461b89c7ef826251d940b2afc.jpg", "../../../../media/cache/bb/ee/bbeeab4c4ce572c0e9764e3a96c6d4a5.jpg", "../../../../media/cache/39/e3/39e33ebef2d7a35dd6899541eba8306d.jpg", "../../../../media/cache/27/b7/27b7f4ec590965b5acc15dc4b1376684.jpg", "../../../../media/cache/ac/ba/acba5e4e1813b8c1fff4890f1efef3ab.jpg", "../../../../media/cache/72/73/7273ff1bfe3b0a6aab7f54ddf9be7b44.jpg", "../../../../media/cache/a0/fa/a0fa38039f6a674a7c89dfe2be866259.jpg", "../../../../media/cache/13/8f/138f4cf84be250d08e1f5c1db3643dbc.jpg", "../../../../media/cache/63/5f/635fb981e464f7427787824b20a15e71.jpg", "../../../../media/cache/87/d3/87d34d376555dd0cb75030d1059cc144.jpg", "../../../../media/cache/6d/6d/6d6d5799190b4f9ef89f3bbc8b67d60d.jpg", "../../../../media/cache/72/f5/72f5ed312bc82afa386c9cd48d4e36dd.jpg", "../../../../media/cache/b2/df/b2df826432771838819db89c20e20609.jpg", "../../../../media/cache/db/34/db341aa83daa76cd9f9bd2c86ccb5dba.jpg", "../../../../media/cache/0c/32/0c329cbd2adf4e0dc825f892106673b2.jpg"]},
8 | {"URL": "http://books.toscrape.com/catalogue/category/books/mystery_3/index.html", "image_link": ["../../../../media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg", "../../../../media/cache/23/85/238570a1c284e730dbc737a7e631ae2b.jpg", "../../../../media/cache/89/b8/89b850edb01851a91f64ba114b96acb6.jpg", "../../../../media/cache/11/aa/11aaad48b5f15e262456ca65294084da.jpg", "../../../../media/cache/29/fe/29fe70b1b2e5a9ba61d4bd331255e19e.jpg", "../../../../media/cache/37/f1/37f118b4a56d866e1e8b563759d6966c.jpg", "../../../../media/cache/44/9e/449ed681142bc336646abee754e96639.jpg", "../../../../media/cache/3c/91/3c91d97266bd6dda322089695fb46daf.jpg", "../../../../media/cache/e8/c0/e8c0ba15066bab950ae161fd60949b9a.jpg", "../../../../media/cache/8f/a4/8fa41d6caa10e427356b8a590eb4d96b.jpg", "../../../../media/cache/23/52/2352718971d5e166fa9541a5a7d716fa.jpg", "../../../../media/cache/c3/8d/c38d65cd155b67ca025f0655bd1bb095.jpg", "../../../../media/cache/8b/bc/8bbc5ab4c3784b4d9b93eb0fd1fb6fd6.jpg", "../../../../media/cache/57/07/5707c3d5d4fd44d943d51730ba7d429a.jpg", "../../../../media/cache/d5/81/d58157866ea8f015a8e4c55b23b8c96f.jpg", "../../../../media/cache/fd/71/fd71fb07247bf911505a351c0670c6dc.jpg", "../../../../media/cache/90/0b/900bd2e60d56b6480a4e8eb2dddb46d6.jpg", "../../../../media/cache/c7/ab/c7abb5e32bd37118a87523dcee0a70a6.jpg", "../../../../media/cache/95/d7/95d7541679fcbd579b8a4f2b47231aaf.jpg", "../../../../media/cache/57/31/5731a5d46c2c1e88977eb5e6d1337a2e.jpg"]},
9 | {"URL": "http://books.toscrape.com/catalogue/category/books/historical-fiction_4/index.html", "image_link": ["../../../../media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg", "../../../../media/cache/d6/58/d658a1485b130ff26ca5fb0d5975ed2e.jpg", "../../../../media/cache/82/96/8296f92b70fb1dafefecda92c1d51941.jpg", "../../../../media/cache/0d/cb/0dcb33d60b0e79adf8ab9842e697ea2e.jpg", "../../../../media/cache/0e/fe/0efe86960cdff718aed01a5c3f65b1c3.jpg", "../../../../media/cache/0f/c2/0fc21ec3489cb23116778ee84f425eca.jpg", "../../../../media/cache/96/41/964194a317f8ce5ed031bf4c9ceb43ab.jpg", "../../../../media/cache/7a/22/7a224a6e174af91950e9b124afe54e0e.jpg", "../../../../media/cache/16/57/16575316618bd7e922d5b0e0f87de2ca.jpg", "../../../../media/cache/6c/2e/6c2e764e3ea89859b52df8de4f12af7a.jpg", "../../../../media/cache/fc/80/fc80b999ff4b8ef24b7071f62d2bf6d1.jpg", "../../../../media/cache/62/fa/62fa1e72f06f05762db5d9cedf654153.jpg", "../../../../media/cache/be/7c/be7ce6fbc9a8e1a5a5b5c32e73cfd78a.jpg", "../../../../media/cache/6b/82/6b822681c4035131560d40dd3b5a6a2e.jpg", "../../../../media/cache/b7/ad/b7ad37d93d8401c84d7325aa645ff6d5.jpg", "../../../../media/cache/b5/d8/b5d813da01f2ccd7bcfe34e2b875e752.jpg", "../../../../media/cache/b2/8f/b28f211e50e74445ca071d4279d1080d.jpg", "../../../../media/cache/bf/fd/bffd473ab232c5f35e8c81bb927f1624.jpg", "../../../../media/cache/18/f7/18f7bf6366cd7a8b947fd790d808047b.jpg", "../../../../media/cache/bf/7a/bf7a5bc1d1ebac5e9b6fbb147828a123.jpg"]},
10 | {"URL": "http://books.toscrape.com/catalogue/category/books_1/index.html", "image_link": ["../../../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg", "../../../media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg", "../../../media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg", "../../../media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg", "../../../media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg", "../../../media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.jpg", "../../../media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg", "../../../media/cache/3d/54/3d54940e57e662c4dd1f3ff00c78cc64.jpg", "../../../media/cache/66/88/66883b91f6804b2323c8369331cb7dd1.jpg", "../../../media/cache/58/46/5846057e28022268153beff6d352b06c.jpg", "../../../media/cache/be/f4/bef44da28c98f905a3ebec0b87be8530.jpg", "../../../media/cache/10/48/1048f63d3b5061cd2f424d20b3f9b666.jpg", "../../../media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg", "../../../media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg", "../../../media/cache/81/c4/81c4a973364e17d01f217e1188253d5e.jpg", "../../../media/cache/54/60/54607fe8945897cdcced0044103b10b6.jpg", "../../../media/cache/55/33/553310a7162dfbc2c6d19a84da0df9e1.jpg", "../../../media/cache/09/a3/09a3aef48557576e1a85ba7efea8ecb7.jpg", "../../../media/cache/0b/bc/0bbcd0a6f4bcd81ccb1049a52736406e.jpg", "../../../media/cache/27/a5/27a53d0bb95bdd88288eaf66c9230d7e.jpg"]},
11 | {"URL": "http://books.toscrape.com/catalogue/category/books/romance_8/index.html", "image_link": ["../../../../media/cache/9c/2e/9c2e0eb8866b8e3f3b768994fd3d1c1a.jpg", "../../../../media/cache/44/cc/44ccc99c8f82c33d4f9d2afa4ef25787.jpg", "../../../../media/cache/1e/bb/1ebbbc3e2d3249b111033cfc40763b0b.jpg", "../../../../media/cache/c4/d1/c4d1517cc9370e292366b6132ca9ca36.jpg", "../../../../media/cache/cc/bd/ccbdae9e29b3594301528fa2c876ec29.jpg", "../../../../media/cache/28/99/28992d89f4abf54fba183fc8d074adf3.jpg", "../../../../media/cache/e9/f4/e9f4bc8cf5ffaea1504623c936e90a48.jpg", "../../../../media/cache/59/10/5910fbd8a95e8e9de9c660b71e0694e2.jpg", "../../../../media/cache/e9/25/e9250495a525eb203652ad9da85ccb8e.jpg", "../../../../media/cache/7e/67/7e67addd80caaf8a9f9e9daa9cf66bb2.jpg", "../../../../media/cache/0b/89/0b89c3b317d0f89da48356a0b5959c1e.jpg", "../../../../media/cache/ae/90/ae903f6f6d059954be4e85497dd76bf5.jpg", "../../../../media/cache/a6/4b/a64b3c559f59748bfdbbe75be3e16075.jpg", "../../../../media/cache/1d/78/1d78fe226e1adb9cb591fa21f8a9bf68.jpg", "../../../../media/cache/f0/e0/f0e0db3edcb14293a52b51929cc72979.jpg", "../../../../media/cache/8e/40/8e408552c2e7ee81cd60c03c79f604af.jpg", "../../../../media/cache/f7/a9/f7a90a63f66ac92cc280def001970ed2.jpg", "../../../../media/cache/40/16/4016ffba678f309171d8130135f6eb8e.jpg", "../../../../media/cache/3c/a2/3ca2e61181fc1122658af8f85354bae8.jpg", "../../../../media/cache/57/47/57472d9c6d483bee9c38c90bfa10b3ee.jpg"]},
12 | {"URL": "http://books.toscrape.com/catalogue/category/books/health_47/index.html", "image_link": ["../../../../media/cache/ee/3e/ee3e219d23e73ba71c79b700f183aaed.jpg", "../../../../media/cache/62/3f/623f8e7f7432ce744f4318aae8166ce4.jpg", "../../../../media/cache/23/c2/23c2108ae81327c7f3fb0721976cba5e.jpg", "../../../../media/cache/4b/d4/4bd43108fb070ad8ebba9cdb00b14069.jpg"]},
13 | {"URL": "http://books.toscrape.com/catalogue/category/books/novels_46/index.html", "image_link": ["../../../../media/cache/db/cc/dbcc9d63b73ce9058d53f36465dbe2b2.jpg"]},
14 | {"URL": "http://books.toscrape.com/catalogue/category/books/short-stories_45/index.html", "image_link": ["../../../../media/cache/f4/cb/f4cb1f9c7280bf1fd05fe33d2816080f.jpg"]},
15 | {"URL": "http://books.toscrape.com/catalogue/category/books/suspense_44/index.html", "image_link": ["../../../../media/cache/bb/1c/bb1c91883579f1f99fe6ebf13b92c1c1.jpg"]},
16 | {"URL": "http://books.toscrape.com/catalogue/page-2.html", "image_link": ["../media/cache/5d/72/5d72709c6a7a9584a4d1cf07648bfce1.jpg", "../media/cache/5c/c8/5cc8e107246cb478960d4f0aba1e1c8e.jpg", "../media/cache/9f/59/9f59f01fa916a7bb8f0b28a4012179a4.jpg", "../media/cache/9c/2e/9c2e0eb8866b8e3f3b768994fd3d1c1a.jpg", "../media/cache/44/cc/44ccc99c8f82c33d4f9d2afa4ef25787.jpg", "../media/cache/af/6e/af6e796160fe63e0cf19d44395c7ddf2.jpg", "../media/cache/ef/0b/ef0bed08de4e083dba5e20fdb98d9c36.jpg", "../media/cache/d6/da/d6da0371958068bbaf39ea9c174275cd.jpg", "../media/cache/2e/98/2e98c332bf8563b584784971541c4445.jpg", "../media/cache/a5/41/a5416b9646aaa7287baa287ec2590270.jpg", "../media/cache/0f/7e/0f7ee69495c0df1d35723f012624a9f8.jpg", "../media/cache/38/c5/38c56fba316c07305643a8065269594e.jpg", "../media/cache/5d/7e/5d7ecde8e81513eba8a64c9fe000744b.jpg", "../media/cache/cf/bb/cfbb5e62715c6d888fd07794c9bab5d6.jpg", "../media/cache/65/71/6571919836ec51ed54f0050c31d8a0cd.jpg", "../media/cache/12/53/1253c21c5ef3c6d075c5fa3f5fecee6a.jpg", "../media/cache/f5/88/f5889d038f5d8e949b494d147c2dcf54.jpg", "../media/cache/23/85/238570a1c284e730dbc737a7e631ae2b.jpg", "../media/cache/e1/5c/e15c289ba58cea38519e1281e859f0c1.jpg", "../media/cache/e9/20/e9203b733126c4a0832a1c7885dc27cf.jpg"]},
17 | {"URL": "http://books.toscrape.com/catalogue/its-only-the-himalayas_981/index.html", "image_link": ["../../media/cache/6d/41/6d418a73cc7d4ecfd75ca11d854041db.jpg", "../../media/cache/0b/bc/0bbcd0a6f4bcd81ccb1049a52736406e.jpg", "../../media/cache/09/a3/09a3aef48557576e1a85ba7efea8ecb7.jpg", "../../media/cache/55/33/553310a7162dfbc2c6d19a84da0df9e1.jpg", "../../media/cache/54/60/54607fe8945897cdcced0044103b10b6.jpg", "../../media/cache/81/c4/81c4a973364e17d01f217e1188253d5e.jpg", "../../media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg"]},
18 | {"URL": "http://books.toscrape.com/catalogue/category/books/christian_43/index.html", "image_link": ["../../../../media/cache/cd/db/cddb3eb483ef11a088d519205b7098fb.jpg", "../../../../media/cache/03/f1/03f1e337afadba35687672b5625a9757.jpg", "../../../../media/cache/a2/f5/a2f5b5fd4421d56d37c73a7fb29f5f40.jpg"]},
19 | {"URL": "http://books.toscrape.com/catalogue/libertarianism-for-beginners_982/index.html", "image_link": ["../../media/cache/91/a4/91a46253e165d144ef5938f2d456b88f.jpg", "../../media/cache/09/a3/09a3aef48557576e1a85ba7efea8ecb7.jpg", "../../media/cache/55/33/553310a7162dfbc2c6d19a84da0df9e1.jpg", "../../media/cache/54/60/54607fe8945897cdcced0044103b10b6.jpg", "../../media/cache/81/c4/81c4a973364e17d01f217e1188253d5e.jpg", "../../media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg", "../../media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg"]},
20 | {"URL": "http://books.toscrape.com/catalogue/category/books/historical_42/index.html", "image_link": ["../../../../media/cache/41/c3/41c37f7f0e03ee1144dd6fa89483b5d9.jpg", "../../../../media/cache/d0/b6/d0b6d59c0662dcbd15d47add40af1ebd.jpg"]},
21 | {"URL": "http://books.toscrape.com/catalogue/mesaerion-the-best-science-fiction-stories-1800-1849_983/index.html", "image_link": ["../../media/cache/e8/1f/e81f850db9b9622c65619c9f15748de7.jpg", "../../media/cache/55/33/553310a7162dfbc2c6d19a84da0df9e1.jpg", "../../media/cache/54/60/54607fe8945897cdcced0044103b10b6.jpg", "../../media/cache/81/c4/81c4a973364e17d01f217e1188253d5e.jpg", "../../media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg", "../../media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg", "../../media/cache/10/48/1048f63d3b5061cd2f424d20b3f9b666.jpg"]},
22 | {"URL": "http://books.toscrape.com/catalogue/olio_984/index.html", "image_link": ["../../media/cache/b1/0e/b10eabab1e1c811a6d47969904fd5755.jpg", "../../media/cache/54/60/54607fe8945897cdcced0044103b10b6.jpg", "../../media/cache/81/c4/81c4a973364e17d01f217e1188253d5e.jpg", "../../media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg", "../../media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg", "../../media/cache/10/48/1048f63d3b5061cd2f424d20b3f9b666.jpg", "../../media/cache/be/f4/bef44da28c98f905a3ebec0b87be8530.jpg"]},
23 | {"URL": "http://books.toscrape.com/catalogue/our-band-could-be-your-life-scenes-from-the-american-indie-underground-1981-1991_985/index.html", "image_link": ["../../media/cache/ad/96/ad96e9c9f1664cbcb0e9627b007fb6f9.jpg", "../../media/cache/81/c4/81c4a973364e17d01f217e1188253d5e.jpg", "../../media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg", "../../media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg", "../../media/cache/10/48/1048f63d3b5061cd2f424d20b3f9b666.jpg", "../../media/cache/be/f4/bef44da28c98f905a3ebec0b87be8530.jpg", "../../media/cache/58/46/5846057e28022268153beff6d352b06c.jpg"]},
24 | {"URL": "http://books.toscrape.com/catalogue/rip-it-up-and-start-again_986/index.html", "image_link": ["../../media/cache/81/7f/817f5089c0e6e62738dce2931e7323d3.jpg", "../../media/cache/94/b1/94b1b8b244bce9677c2f29ccc890d4d2.jpg", "../../media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg", "../../media/cache/10/48/1048f63d3b5061cd2f424d20b3f9b666.jpg", "../../media/cache/be/f4/bef44da28c98f905a3ebec0b87be8530.jpg", "../../media/cache/58/46/5846057e28022268153beff6d352b06c.jpg", "../../media/cache/66/88/66883b91f6804b2323c8369331cb7dd1.jpg"]},
25 | {"URL": "http://books.toscrape.com/catalogue/scott-pilgrims-precious-little-life-scott-pilgrim-1_987/index.html", "image_link": ["../../media/cache/97/27/97275841c81e66d53bf9313cba06f23e.jpg", "../../media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg", "../../media/cache/10/48/1048f63d3b5061cd2f424d20b3f9b666.jpg", "../../media/cache/be/f4/bef44da28c98f905a3ebec0b87be8530.jpg", "../../media/cache/58/46/5846057e28022268153beff6d352b06c.jpg", "../../media/cache/66/88/66883b91f6804b2323c8369331cb7dd1.jpg", "../../media/cache/3d/54/3d54940e57e662c4dd1f3ff00c78cc64.jpg"]},
26 | {"URL": "http://books.toscrape.com/catalogue/set-me-free_988/index.html", "image_link": ["../../media/cache/b8/e9/b8e91bd2fc74c3954118999238abb4b8.jpg", "../../media/cache/10/48/1048f63d3b5061cd2f424d20b3f9b666.jpg", "../../media/cache/be/f4/bef44da28c98f905a3ebec0b87be8530.jpg", "../../media/cache/58/46/5846057e28022268153beff6d352b06c.jpg", "../../media/cache/66/88/66883b91f6804b2323c8369331cb7dd1.jpg", "../../media/cache/3d/54/3d54940e57e662c4dd1f3ff00c78cc64.jpg", "../../media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg"]},
27 | {"URL": "http://books.toscrape.com/catalogue/shakespeares-sonnets_989/index.html", "image_link": ["../../media/cache/4d/7a/4d7a79a8be80a529b277ed5c4d8ba482.jpg", "../../media/cache/be/f4/bef44da28c98f905a3ebec0b87be8530.jpg", "../../media/cache/58/46/5846057e28022268153beff6d352b06c.jpg", "../../media/cache/66/88/66883b91f6804b2323c8369331cb7dd1.jpg", "../../media/cache/3d/54/3d54940e57e662c4dd1f3ff00c78cc64.jpg", "../../media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg", "../../media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.jpg"]},
28 | {"URL": "http://books.toscrape.com/catalogue/starving-hearts-triangular-trade-trilogy-1_990/index.html", "image_link": ["../../media/cache/a0/7e/a07ed8f1c23f7b4baf7102722680bd30.jpg", "../../media/cache/58/46/5846057e28022268153beff6d352b06c.jpg", "../../media/cache/66/88/66883b91f6804b2323c8369331cb7dd1.jpg", "../../media/cache/3d/54/3d54940e57e662c4dd1f3ff00c78cc64.jpg", "../../media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg", "../../media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.jpg", "../../media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg"]},
29 | {"URL": "http://books.toscrape.com/catalogue/the-black-maria_991/index.html", "image_link": ["../../media/cache/d1/7a/d17a3e313e52e1be5651719e4fba1d16.jpg", "../../media/cache/66/88/66883b91f6804b2323c8369331cb7dd1.jpg", "../../media/cache/3d/54/3d54940e57e662c4dd1f3ff00c78cc64.jpg", "../../media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg", "../../media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.jpg", "../../media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg", "../../media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg"]},
30 | {"URL": "http://books.toscrape.com/catalogue/the-dirty-little-secrets-of-getting-your-dream-job_994/index.html", "image_link": ["../../media/cache/e1/1b/e11bea016d0ae1d7e2dd46fb3cb870b7.jpg", "../../media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.jpg", "../../media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg", "../../media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg", "../../media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg", "../../media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg", "../../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"]},
31 | {"URL": "http://books.toscrape.com/catalogue/the-requiem-red_995/index.html", "image_link": ["../../media/cache/6b/07/6b07b77236b7c80f42bd90bf325e69f6.jpg", "../../media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg", "../../media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg", "../../media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg", "../../media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg", "../../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"]},
32 | {"URL": "http://books.toscrape.com/catalogue/the-boys-in-the-boat-nine-americans-and-their-epic-quest-for-gold-at-the-1936-berlin-olympics_992/index.html", "image_link": ["../../media/cache/d1/2d/d12d26739b5369a6b5b3024e4d08f907.jpg", "../../media/cache/3d/54/3d54940e57e662c4dd1f3ff00c78cc64.jpg", "../../media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg", "../../media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.jpg", "../../media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg", "../../media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg", "../../media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg"]},
33 | {"URL": "http://books.toscrape.com/catalogue/the-coming-woman-a-novel-based-on-the-life-of-the-infamous-feminist-victoria-woodhull_993/index.html", "image_link": ["../../media/cache/97/36/9736132a43b8e6e3989932218ef309ed.jpg", "../../media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg", "../../media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.jpg", "../../media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg", "../../media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg", "../../media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg", "../../media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg"]},
34 | {"URL": "http://books.toscrape.com/catalogue/sapiens-a-brief-history-of-humankind_996/index.html", "image_link": ["../../media/cache/ce/5f/ce5f052c65cc963cf4422be096e915c9.jpg", "../../media/cache/32/51/3251cf3a3412f53f339e42cac2134093.jpg", "../../media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg", "../../media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg", "../../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"]},
35 | {"URL": "http://books.toscrape.com/catalogue/sharp-objects_997/index.html", "image_link": ["../../media/cache/c0/59/c05972805aa7201171b8fc71a5b00292.jpg", "../../media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg", "../../media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg", "../../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"]},
36 | {"URL": "http://books.toscrape.com/catalogue/soumission_998/index.html", "image_link": ["../../media/cache/ee/cf/eecfe998905e455df12064dba399c075.jpg", "../../media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg", "../../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"]},
37 | {"URL": "http://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html", "image_link": ["../../media/cache/08/e9/08e94f3731d7d6b760dfbfbc02ca5c62.jpg", "../../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg"]},
38 | {"URL": "http://books.toscrape.com/catalogue/a-light-in-the-attic_1000/index.html", "image_link": ["../../media/cache/fe/72/fe72f0532301ec28892ae79a629a293c.jpg"]},
39 | {"URL": "http://books.toscrape.com/catalogue/category/books/crime_51/index.html", "image_link": ["../../../../media/cache/f2/e5/f2e51dd2b26600459f8eaeb6b9eecaa7.jpg"]},
40 | {"URL": "http://books.toscrape.com/catalogue/category/books/erotica_50/index.html", "image_link": ["../../../../media/cache/6e/4e/6e4e8f4f4abd94356a9be840e4681e65.jpg"]},
41 | {"URL": "http://books.toscrape.com/catalogue/category/books/cultural_49/index.html", "image_link": ["../../../../media/cache/52/46/524655fade1d9fe1475395a3eaff827a.jpg"]},
42 | {"URL": "http://books.toscrape.com/catalogue/category/books/politics_48/index.html", "image_link": ["../../../../media/cache/0b/bc/0bbcd0a6f4bcd81ccb1049a52736406e.jpg", "../../../../media/cache/db/1b/db1babd3c09b84da800b0e9897fe0097.jpg", "../../../../media/cache/00/11/001153d2a22d889837efac1703e10a5e.jpg"]},
43 | {"URL": "http://books.toscrape.com/catalogue/category/books/academic_40/index.html", "image_link": ["../../../../media/cache/d9/4e/d94e6206c2decd3acd9a61b2cbac7eaf.jpg"]},
44 | {"URL": "http://books.toscrape.com/catalogue/category/books/self-help_41/index.html", "image_link": ["../../../../media/cache/ea/9b/ea9b2cb8abbb317402e618445bade1e1.jpg", "../../../../media/cache/da/8b/da8bc9b824dd3f446ef63e438ddbfc85.jpg", "../../../../media/cache/9c/da/9cda4893c7fce0c1c8eaa34fb092aa04.jpg", "../../../../media/cache/9e/15/9e15d7add5090ff2a17bd71ac96aa55a.jpg", "../../../../media/cache/4f/08/4f08f7948770912e4e340e10caa604cb.jpg"]},
45 | {"URL": "http://books.toscrape.com/catalogue/category/books/spirituality_39/index.html", "image_link": ["../../../../media/cache/0f/7e/0f7ee69495c0df1d35723f012624a9f8.jpg", "../../../../media/cache/96/db/96db61bb53930c560fb4c1c62b583816.jpg", "../../../../media/cache/b7/6a/b76a73640d26b09c4a6f373b09050bed.jpg", "../../../../media/cache/87/fe/87fe3f7f3f62c1b1b81890578c9cf294.jpg", "../../../../media/cache/8b/10/8b102daec94d1ea9c6fc36dd3ec1c1fe.jpg", "../../../../media/cache/83/c8/83c834b3779be4e577c37ead6d2acf65.jpg"]},
46 | {"URL": "http://books.toscrape.com/catalogue/category/books/contemporary_38/index.html", "image_link": ["../../../../media/cache/08/04/08044269fc197645268a6197c57e6173.jpg", "../../../../media/cache/e3/d0/e3d05227f3fc24f0e0c84ccebe108fb0.jpg", "../../../../media/cache/4d/18/4d1891e435c6692c864331c585e0d014.jpg"]},
47 | {"URL": "http://books.toscrape.com/catalogue/category/books/thriller_37/index.html", "image_link": ["../../../../media/cache/5d/72/5d72709c6a7a9584a4d1cf07648bfce1.jpg", "../../../../media/cache/5d/7e/5d7ecde8e81513eba8a64c9fe000744b.jpg", "../../../../media/cache/e1/5c/e15c289ba58cea38519e1281e859f0c1.jpg", "../../../../media/cache/d6/97/d697268540fa982f4dce39f61ed3a342.jpg", "../../../../media/cache/76/de/76deee06ffe45e646c0113af01f4f401.jpg", "../../../../media/cache/d9/1a/d91aae72af6c1cb2c63163acabe7895c.jpg", "../../../../media/cache/8b/7c/8b7c73e075cc687b6890dc0dca9fcbcc.jpg", "../../../../media/cache/eb/e9/ebe9f06ccebf83d9853a846052b58fff.jpg", "../../../../media/cache/ee/d4/eed4d5d63d13f0aa86575c90f8ccacb7.jpg", "../../../../media/cache/87/54/8754267f27581996f93e8d94d3c04bf9.jpg", "../../../../media/cache/2a/a8/2aa8afd15f97617ab75f616766161cda.jpg"]},
48 | {"URL": "http://books.toscrape.com/catalogue/category/books/biography_36/index.html", "image_link": ["../../../../media/cache/6f/d9/6fd92e5143cbd5bb8bcf034e5f007dde.jpg", "../../../../media/cache/8b/c4/8bc43a6b42d0283ab4bf611f1b497126.jpg", "../../../../media/cache/cc/a4/cca4e6a4cd5c207e7ce7d992ff464c3b.jpg", "../../../../media/cache/25/f8/25f869fa75340fca0fc2a68e8a0412a1.jpg", "../../../../media/cache/ff/d4/ffd45d95f314555e20c923d3522adea7.jpg"]},
49 | {"URL": "http://books.toscrape.com/catalogue/category/books/business_35/index.html", "image_link": ["../../../../media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg", "../../../../media/cache/d0/77/d077a30042df6b916bfc8d257345c69e.jpg", "../../../../media/cache/82/93/82939ca78da0b724f16ec814849514fd.jpg", "../../../../media/cache/19/aa/19aa1184a3565b1dae6092146018e109.jpg", "../../../../media/cache/e2/2e/e22e4a82d97f9f0689d5295a98f5dcff.jpg", "../../../../media/cache/2d/fd/2dfdc52bcdbd82dee50372bc46c83e15.jpg", "../../../../media/cache/b3/7b/b37be83183f1dcb759d92bda8f8998a4.jpg", "../../../../media/cache/aa/67/aa677a97ecdcbbde7471f1c90ed0cf6f.jpg", "../../../../media/cache/11/2c/112c55a6bcd401c3bd603f5ddb2e6b82.jpg", "../../../../media/cache/18/f4/18f45d31e3892fee589e23f15d759ee3.jpg", "../../../../media/cache/39/f1/39f167dff90d7f84f5c8dc5e05d4051b.jpg", "../../../../media/cache/54/10/5410a58193e2373c04b3021ade78a82b.jpg"]},
50 | {"URL": "http://books.toscrape.com/catalogue/category/books/christian-fiction_34/index.html", "image_link": ["../../../../media/cache/21/21/2121ba78e26194d92c334fde3850f840.jpg", "../../../../media/cache/fa/f6/faf6d69a42f477e1da80a71f05a4dc25.jpg", "../../../../media/cache/93/e0/93e0ec623673a8f83598c9aa7b6c94ec.jpg", "../../../../media/cache/17/e2/17e264d978942f73b859fa1c1d2cf827.jpg", "../../../../media/cache/32/2c/322c1f6cce6d5a69a7d2321779195a0c.jpg", "../../../../media/cache/c3/d0/c3d0f2fb5cacbca64639a679b962e1b9.jpg"]},
51 | {"URL": "http://books.toscrape.com/catalogue/category/books/food-and-drink_33/index.html", "image_link": ["../../../../media/cache/9f/59/9f59f01fa916a7bb8f0b28a4012179a4.jpg", "../../../../media/cache/b7/f4/b7f4843dbe062d44be1ffcfa16b2faa4.jpg", "../../../../media/cache/f5/65/f565af3d9dd20a1ad72a1e7c4157387d.jpg", "../../../../media/cache/10/c6/10c61093002db1fec4089d8076678624.jpg", "../../../../media/cache/98/d1/98d1c979c4bac9e147a6718946578b0f.jpg", "../../../../media/cache/61/bd/61bdfe3950643c47d70c37c4123530f3.jpg", "../../../../media/cache/0d/1f/0d1f3f934460f5a50aaa8c366641234c.jpg", "../../../../media/cache/54/89/54899b4584e941ceced511d81092c88a.jpg", "../../../../media/cache/20/f2/20f28657b49f8cb24ed2ec6448bb6df3.jpg", "../../../../media/cache/c4/dc/c4dcec6f513eaca3f0f3c748d834c46d.jpg", "../../../../media/cache/fe/67/fe67c381d6a0c4c00a7c191d16939554.jpg", "../../../../media/cache/b8/38/b838b65e0e1ac3a9b498dfb1bf004420.jpg", "../../../../media/cache/74/aa/74aa29b1ba4147eaf5b46671bf235861.jpg", "../../../../media/cache/76/a1/76a1516c8d9c3e620626f30840013a85.jpg", "../../../../media/cache/5a/64/5a6499d41ccaad4c4f7eeaa90e16345a.jpg", "../../../../media/cache/98/19/9819ff3a8290dc6ab8797d00de5ec554.jpg", "../../../../media/cache/ae/5c/ae5ca435fb095e374d2c2aa9f7b6f380.jpg", "../../../../media/cache/d4/53/d453cfb6c08dbf76d200ffa858bc9979.jpg", "../../../../media/cache/1d/1f/1d1fbd89f0290275b9166877663ee9f5.jpg", "../../../../media/cache/e6/b6/e6b66353f9325518994dd8b564290fd7.jpg"]},
52 | {"URL": "http://books.toscrape.com/catalogue/category/books/history_32/index.html", "image_link": ["../../../../media/cache/be/a5/bea5697f2534a2f86a3ef27b5a8c12a6.jpg", "../../../../media/cache/4a/3b/4a3b055f9e378a95fedbef55e7bab7ce.jpg", "../../../../media/cache/2d/4e/2d4e358712e6c9f1d3bdd78d1a16e5a8.jpg", "../../../../media/cache/64/44/6444dacdcb9edaadbbd691524622aeb8.jpg", "../../../../media/cache/97/47/974709d437b08e74649b5744471bf472.jpg", "../../../../media/cache/3d/60/3d6003fc37b842a07c2dbe28e47448e1.jpg", "../../../../media/cache/41/d5/41d5fa6a81cdbcbe6b0b15757a4c9144.jpg", "../../../../media/cache/88/75/8875f384ce9103281b7f6e86a2b8204d.jpg", "../../../../media/cache/56/cb/56cb66d73fb438d64af14dce8bd8b22b.jpg", "../../../../media/cache/11/af/11af7fbd6aec06a75fe207fae92b17e0.jpg", "../../../../media/cache/3c/f6/3cf646523ff7fb8647c500d6325cfcaf.jpg", "../../../../media/cache/e1/02/e102cefae5bb523bc67eb6b49bc18b5d.jpg", "../../../../media/cache/72/f1/72f13b8f069d3a018d2c378be5a1de20.jpg", "../../../../media/cache/f2/64/f26457d65a03b2636c4bcc7c318f7346.jpg", "../../../../media/cache/cf/18/cf187c1dc5575fcbbf49c58024146c4b.jpg", "../../../../media/cache/eb/17/eb178eceef1e9290591cabd5155571a3.jpg", "../../../../media/cache/06/c8/06c897070611b78b80a37333cbb7851c.jpg", "../../../../media/cache/43/fd/43fda1db93163d67705264dcfa98aaa5.jpg"]},
53 | {"URL": "http://books.toscrape.com/catalogue/category/books/horror_31/index.html", "image_link": ["../../../../media/cache/da/df/dadfac66a89774b46b10225362724c83.jpg", "../../../../media/cache/a7/4b/a74b35375ce874153fd352e33bc7bac9.jpg", "../../../../media/cache/6d/10/6d10387a0175701d4ff456a0c7eee67b.jpg", "../../../../media/cache/7a/72/7a72465b21dbf998323e37b31f9a3f4a.jpg", "../../../../media/cache/55/bf/55bfc858c1cb19867e41415532ae43c6.jpg", "../../../../media/cache/02/5c/025c30a378e2a4190e84f1429e81b803.jpg", "../../../../media/cache/0b/2f/0b2f432cc27132f688fcdf29618521e0.jpg", "../../../../media/cache/30/66/3066f8bcd2e2ed6b45084355ff084a61.jpg", "../../../../media/cache/13/ff/13fffcde653948339d3427184b7bd0b5.jpg", "../../../../media/cache/c0/02/c0029d48c2588e6d2a6a31c9f96088ba.jpg", "../../../../media/cache/2d/e0/2de0eff716ca13d12cf5420e88e1a8b3.jpg", "../../../../media/cache/7c/93/7c9302e392e128881e926d19f761da33.jpg", "../../../../media/cache/f7/b7/f7b73392b12909a1e8261ef3f96c5fd1.jpg", "../../../../media/cache/ee/d3/eed3afc5e444e3da5eec34e2b0036ec7.jpg", "../../../../media/cache/3a/7c/3a7c2393061031e7911d7b533b723391.jpg", "../../../../media/cache/41/c7/41c74d82b853606fe98182c417b4669c.jpg", "../../../../media/cache/14/25/142563ccee483bc07632f9c083a68326.jpg"]},
54 | {"URL": "http://books.toscrape.com/catalogue/category/books/humor_30/index.html", "image_link": ["../../../../media/cache/46/bd/46bdee520b8136972262fd040533772d.jpg", "../../../../media/cache/df/5d/df5d172abe87deda6d533e3e908d27d8.jpg", "../../../../media/cache/ea/7b/ea7bcac4b27a5bf6d4f8125bb7af3361.jpg", "../../../../media/cache/b5/a9/b5a90d1c36a96513942f006345ace3d2.jpg", "../../../../media/cache/df/14/df1418baa09e00b877be35066084c9dc.jpg", "../../../../media/cache/4c/30/4c3041def6f29659e009f61e45e492b0.jpg", "../../../../media/cache/73/36/733662595aede2dff1a5be1e76a3b936.jpg", "../../../../media/cache/e7/12/e71268a559d73826aa64151d47357a12.jpg", "../../../../media/cache/a1/03/a10370da29e4ba78c7a75a14041eae0e.jpg", "../../../../media/cache/4b/9a/4b9a2a6d4c995e12fe216f6173a582be.jpg"]},
55 | {"URL": "http://books.toscrape.com/catalogue/category/books/adult-fiction_29/index.html", "image_link": ["../../../../media/cache/18/d8/18d8e02c75c2ef23556c9746fae57e43.jpg"]},
56 | {"URL": "http://books.toscrape.com/catalogue/category/books/parenting_28/index.html", "image_link": ["../../../../media/cache/7d/0b/7d0bb832760e81c281d8d283ba6a2b09.jpg"]},
57 | {"URL": "http://books.toscrape.com/catalogue/category/books/autobiography_27/index.html", "image_link": ["../../../../media/cache/0a/15/0a1567cd04a6582d333db71337b4e2a6.jpg", "../../../../media/cache/d6/e8/d6e8258cee98f80727e99f7ac5aa1b88.jpg", "../../../../media/cache/e9/72/e972f8b4abaaa6f8f449479cd9d87be3.jpg", "../../../../media/cache/17/aa/17aacb738eace89a635a4eb47a94c11d.jpg", "../../../../media/cache/66/c7/66c7a1537c8901e1e4ec217d1956bae8.jpg", "../../../../media/cache/98/9f/989fe700e9e6bdec4fc3217daa5b7df3.jpg", "../../../../media/cache/61/ba/61ba5bc1ee3d8cb3dd350120ffa3f31e.jpg", "../../../../media/cache/80/b3/80b3e38be4204b3b64cdbe8c80dcf1f9.jpg", "../../../../media/cache/7a/58/7a587c5814f33c0c54e8bfa0ef66d690.jpg"]},
58 | {"URL": "http://books.toscrape.com/catalogue/category/books/psychology_26/index.html", "image_link": ["../../../../media/cache/a6/c8/a6c8256b123493472591c5855c7de704.jpg", "../../../../media/cache/dc/4d/dc4d070e33813a07a4e02f069e6d482f.jpg", "../../../../media/cache/ee/a9/eea9e831f8964b4dc0190c84a1f9a1f6.jpg", "../../../../media/cache/00/29/002924b764dc367dcaa3486fa4c0aa0b.jpg", "../../../../media/cache/b4/a5/b4a56663d56f1e84ee1b15bd819563cc.jpg", "../../../../media/cache/4d/a6/4da6939a6bbd895a5acdeabad46d1f9f.jpg", "../../../../media/cache/b8/44/b844a77409f1d53cbb66148820abc217.jpg"]},
59 | {"URL": "http://books.toscrape.com/catalogue/category/books/art_25/index.html", "image_link": ["../../../../media/cache/a5/41/a5416b9646aaa7287baa287ec2590270.jpg", "../../../../media/cache/f2/ee/f2ee668cf593ff13a9560c2801e9c2a2.jpg", "../../../../media/cache/ef/80/ef80e6100214c486562a73ce76444826.jpg", "../../../../media/cache/6a/55/6a55ccd4bc2383f5fe915fbef8bd5a23.jpg", "../../../../media/cache/58/a6/58a634c3231b5380544cc330536cb5ea.jpg", "../../../../media/cache/bb/36/bb364a10868756d1c0877c928b43b533.jpg", "../../../../media/cache/99/51/99511f4da1a4a2114e2ed12e6ba17b65.jpg", "../../../../media/cache/a8/3a/a83a4d31d30dc3cb26a29899a5c3b91d.jpg"]},
60 | {"URL": "http://books.toscrape.com/catalogue/category/books/paranormal_24/index.html", "image_link": ["../../../../media/cache/4b/97/4b972f89c11900ac0e84726d1f07bfcc.jpg"]},
61 | {"URL": "http://books.toscrape.com/catalogue/category/books/poetry_23/index.html", "image_link": ["../../../../media/cache/2c/da/2cdad67c44b002e7ead0cc35693c0e8b.jpg", "../../../../media/cache/58/46/5846057e28022268153beff6d352b06c.jpg", "../../../../media/cache/10/48/1048f63d3b5061cd2f424d20b3f9b666.jpg", "../../../../media/cache/55/33/553310a7162dfbc2c6d19a84da0df9e1.jpg", "../../../../media/cache/e9/20/e9203b733126c4a0832a1c7885dc27cf.jpg", "../../../../media/cache/72/41/72417db983862010ef0c1a25de98c7d7.jpg", "../../../../media/cache/f9/3b/f93b4a650f03a5d21f2436d7813f42c2.jpg", "../../../../media/cache/38/64/386468a8c3e6b880664bf7885bf6f726.jpg", "../../../../media/cache/25/54/2554431c797ec725eea50b3f8a83758c.jpg", "../../../../media/cache/3f/41/3f4160ada0b16e3c64cd2d0dffe781c8.jpg", "../../../../media/cache/c8/f2/c8f297fab080ddd02b3ed5c17b83af85.jpg", "../../../../media/cache/93/d5/93d5c64abfad9ed6a0cb2e26f19f1a1e.jpg", "../../../../media/cache/36/5b/365b3ab7ab72a6258873716aef6d5c1a.jpg", "../../../../media/cache/b7/29/b7293f602efb0c17e305077f8175888a.jpg", "../../../../media/cache/31/c7/31c7c5ce7b04d227aa36ecb250b9dad5.jpg", "../../../../media/cache/7e/93/7e934132cd03486649fb492fe702f704.jpg", "../../../../media/cache/9f/35/9f351ca1978128c60a3b7f85987075b3.jpg", "../../../../media/cache/8f/46/8f46bb13feb3a4440a27dfcf688fbaa6.jpg", "../../../../media/cache/df/ab/dfab1d94f9190df7c13b63a093a6d16e.jpg"]},
62 | {"URL": "http://books.toscrape.com/catalogue/category/books/science_22/index.html", "image_link": ["../../../../media/cache/d4/8d/d48d5122a15347e9fe2b15ad354d69bf.jpg", "../../../../media/cache/26/1c/261c4eaf957ae4aacf2229b482e76dbe.jpg", "../../../../media/cache/68/ca/68caaf9ac41964d5167a3eb67c638393.jpg", "../../../../media/cache/56/97/5697f2f8f628129df01c5790985ffd9b.jpg", "../../../../media/cache/5e/7f/5e7f7d9913d4c95d33904770c518d537.jpg", "../../../../media/cache/33/4f/334fd0ebdf0c0192baf5914d199c53b5.jpg", "../../../../media/cache/da/0d/da0d13699a090516502257a4d7da623f.jpg", "../../../../media/cache/08/a9/08a957eb34f8047862e225774c3bdde2.jpg", "../../../../media/cache/83/ab/83ab65f938b24fa1a9cb47235be49b57.jpg", "../../../../media/cache/69/c8/69c83860995cde393dbe6690ec3f1d4f.jpg", "../../../../media/cache/f9/69/f969969428b505970a46272fdcea00d3.jpg", "../../../../media/cache/f8/bc/f8bcd489d33473e0819beaecccd5ebac.jpg", "../../../../media/cache/c8/63/c863c222c130a1bc8685a1242dd2523d.jpg", "../../../../media/cache/08/14/0814f26516fb72b7391d0a742b5928a2.jpg"]},
63 | {"URL": "http://books.toscrape.com/catalogue/category/books/young-adult_21/index.html", "image_link": ["../../../../media/cache/68/33/68339b4c9bc034267e1da611ab3b34f8.jpg", "../../../../media/cache/5b/88/5b88c52633f53cacf162c15f4f823153.jpg", "../../../../media/cache/5d/7f/5d7f496cdf5e5962a73ecdcc1505c1d5.jpg", "../../../../media/cache/fc/72/fc72f158554b4b4164701e1dfa1153c7.jpg", "../../../../media/cache/26/95/269507c7bb35d2cec9b61a03d1c28e67.jpg", "../../../../media/cache/12/f1/12f1963957f27fa83d51f76b183ef490.jpg", "../../../../media/cache/0f/d3/0fd306891f8fd3196653022fd67d6c87.jpg", "../../../../media/cache/18/08/18086e581ad354aa65f945c2b5c51350.jpg", "../../../../media/cache/f8/54/f85417465a73e33604624205ba8306cc.jpg", "../../../../media/cache/71/76/7176317f1915fa0658bb2fe400441207.jpg", "../../../../media/cache/bb/72/bb723ad463531c602ad8bcb244253bf3.jpg", "../../../../media/cache/19/cf/19cf50aea5bf0e8f4bc016f3745b3dfe.jpg", "../../../../media/cache/18/0b/180bfe1902cb3c0eb77d7c712efa2a96.jpg", "../../../../media/cache/1d/3c/1d3c05b772ab846c111970232360d2c5.jpg", "../../../../media/cache/46/6e/466e9636819aad1126ac6cefb5313ba8.jpg", "../../../../media/cache/b2/df/b2df2ea409c5cf28538b67aff424b11f.jpg", "../../../../media/cache/ad/ac/adac97366586d261feab30bf5220756e.jpg", "../../../../media/cache/61/1a/611aba0ef5b859ba1977ef30677b0194.jpg", "../../../../media/cache/87/cd/87cd652c35e2a78535c83becae33cff2.jpg", "../../../../media/cache/fd/5b/fd5b14399052ab552e240ed18ab03c6d.jpg"]},
64 | {"URL": "http://books.toscrape.com/catalogue/category/books/new-adult_20/index.html", "image_link": ["../../../../media/cache/24/e2/24e2f5c9d325c4004d8190c054da86dd.jpg", "../../../../media/cache/a5/43/a543b100a8c1861c1bf5374ca6b576fe.jpg", "../../../../media/cache/84/ac/84acb0606c96e55dc729a9d6572a08fb.jpg", "../../../../media/cache/38/f1/38f1543cd2d51c2728678f5ecc128958.jpg", "../../../../media/cache/a2/19/a2198abf12e3287f84997b35f4e1050e.jpg", "../../../../media/cache/03/ed/03ed67ea504353b91b035151d8e80db2.jpg"]},
65 | {"URL": "http://books.toscrape.com/catalogue/category/books/fantasy_19/index.html", "image_link": ["../../../../media/cache/76/8e/768ea5924ac1ef6297c2be9959c796c2.jpg", "../../../../media/cache/43/ae/43aee83ebb31e2122a7215e413770e5c.jpg", "../../../../media/cache/b7/e8/b7e84b78be3d9bb79b71156a5e5d4e42.jpg", "../../../../media/cache/ff/e8/ffe81bf98f8386ef29e193abfb6f9c1e.jpg", "../../../../media/cache/66/25/6625e3bbb050de3e42a0c302c0d69f1f.jpg", "../../../../media/cache/06/18/061811c5845d0e13bc04b2a755f0830f.jpg", "../../../../media/cache/c0/88/c08816960890396213a423941af65b8f.jpg", "../../../../media/cache/32/d6/32d6aa560e8ddf2a4da1526b95d4c7ab.jpg", "../../../../media/cache/3e/0b/3e0b16851bec08b6cbf78d5f64af9114.jpg", "../../../../media/cache/e2/60/e260b008b7ea7970562295b7bc64b0cb.jpg", "../../../../media/cache/53/5e/535e2be0b423797c2cdc7d98882c820a.jpg", "../../../../media/cache/b4/67/b467a4f01ca6ae8464b9425a156c7c32.jpg", "../../../../media/cache/9a/33/9a333c4a06ce187c5c9d2f5969ddcac2.jpg", "../../../../media/cache/75/b9/75b99691594fde72ccb1831624cfeff6.jpg", "../../../../media/cache/8f/80/8f8074d9f035c2a0ef8595ad89f7bcc8.jpg", "../../../../media/cache/00/08/0008e65aa431ed3625ad3a4352f8e90d.jpg", "../../../../media/cache/3b/04/3b045fe0394dc192950a0ec9e3812fe4.jpg", "../../../../media/cache/d3/0d/d30dd8b6be6f9fcfd17178e8083238b6.jpg", "../../../../media/cache/27/64/27649cb5da52970f4bb2fc5234a48578.jpg", "../../../../media/cache/3e/2d/3e2d526ee062008ab1cbf54f90a5abb2.jpg"]},
66 | {"URL": "http://books.toscrape.com/catalogue/category/books/add-a-comment_18/index.html", "image_link": ["../../../../media/cache/33/e5/33e507172541628acfd421503196b578.jpg", "../../../../media/cache/f8/6d/f86d08178e3788563ac17be5aefd29f0.jpg", "../../../../media/cache/70/fa/70fa6c0437d9c97dbeada6bd32bf9d2c.jpg", "../../../../media/cache/a1/14/a114d70e7babf110ba42a389078e9a45.jpg", "../../../../media/cache/5f/52/5f52b1bc6d45daab2e330c744feb0359.jpg", "../../../../media/cache/ae/0c/ae0ccc307568b6d7699786411f3cbcc4.jpg", "../../../../media/cache/28/78/2878538a1039d9c4649110499a1393fb.jpg", "../../../../media/cache/72/d8/72d861617b6d3aababe6e61e8d3c1056.jpg", "../../../../media/cache/66/f7/66f79b76d6c6b64fcc8110515c454e09.jpg", "../../../../media/cache/94/ac/94ac87da7b40853013093f08356efa3b.jpg", "../../../../media/cache/8f/3f/8f3f4d67e30a8129577ccc4664998345.jpg", "../../../../media/cache/3f/e7/3fe7073a5caac81929524d2d9488f928.jpg", "../../../../media/cache/f5/58/f55886d1bf600529a35e1bd932c78ca0.jpg", "../../../../media/cache/0b/97/0b97282ed82b771ed328e05386a84adb.jpg", "../../../../media/cache/50/0e/500eeb810e940424827580574e46852c.jpg", "../../../../media/cache/9b/20/9b2076ce7414103a093ce2459d089969.jpg", "../../../../media/cache/75/20/75200336c141156746000f7055df344a.jpg", "../../../../media/cache/4d/16/4d163d43cb4aa624e599330a39abace5.jpg", "../../../../media/cache/55/33/5533595a623c3bb947c4a5171fc2df08.jpg", "../../../../media/cache/97/3a/973a2c3462a18fc90d3b9662d959df37.jpg"]},
67 | {"URL": "http://books.toscrape.com/catalogue/category/books/sports-and-games_17/index.html", "image_link": ["../../../../media/cache/61/2c/612caeb0b2acb35c100629f0f52a40d7.jpg", "../../../../media/cache/7d/cf/7dcf6c3b419bf7e7e3b3b8162b177869.jpg", "../../../../media/cache/c3/a9/c3a90a5baa833a37c29c4b03a444737c.jpg", "../../../../media/cache/9b/4e/9b4ece2ab5a6335c8594c878e2f22df1.jpg", "../../../../media/cache/8d/1e/8d1e285bf672b2ea66879490cc5f6904.jpg"]},
68 | {"URL": "http://books.toscrape.com/catalogue/category/books/science-fiction_16/index.html", "image_link": ["../../../../media/cache/09/a3/09a3aef48557576e1a85ba7efea8ecb7.jpg", "../../../../media/cache/93/63/9363f0065fbad5689f44fcf6e203eef3.jpg", "../../../../media/cache/02/37/0237b445efc18c5562355a5a2c40889c.jpg", "../../../../media/cache/10/6e/106e2fc7160712edf8e2ff996dc8cd6c.jpg", "../../../../media/cache/f0/06/f0060c756556b855184fa32f66280961.jpg", "../../../../media/cache/c0/72/c072c1ef144d571abd25fe9cc18cceba.jpg", "../../../../media/cache/51/88/518810d182843244a404f2a2a614a93b.jpg", "../../../../media/cache/8b/92/8b9267df86378b6973974ae7e1924ffe.jpg", "../../../../media/cache/b8/b2/b8b2956acc758a381beef87339c0a52f.jpg", "../../../../media/cache/51/34/513418bd1c6114f3ea1fd703278e20ef.jpg", "../../../../media/cache/ef/8b/ef8bc5adcd3bea8e8ba97be76d07a32a.jpg", "../../../../media/cache/7a/bc/7abccb865ecf9b0f676800b10c71cfd6.jpg", "../../../../media/cache/fa/65/fa653fbe3a4c69227c9b79d471cee576.jpg", "../../../../media/cache/c7/21/c721943edf481cad5ab32505e2ad3865.jpg", "../../../../media/cache/da/47/da4746e620f8ccd7cf20628d1a5e535a.jpg", "../../../../media/cache/f4/83/f4835e9f3fdd8b8107bbb39a391654f0.jpg"]},
69 | {"URL": "http://books.toscrape.com/catalogue/category/books/default_15/index.html", "image_link": ["../../../../media/cache/3d/54/3d54940e57e662c4dd1f3ff00c78cc64.jpg", "../../../../media/cache/66/88/66883b91f6804b2323c8369331cb7dd1.jpg", "../../../../media/cache/be/f4/bef44da28c98f905a3ebec0b87be8530.jpg", "../../../../media/cache/ef/0b/ef0bed08de4e083dba5e20fdb98d9c36.jpg", "../../../../media/cache/d6/da/d6da0371958068bbaf39ea9c174275cd.jpg", "../../../../media/cache/12/53/1253c21c5ef3c6d075c5fa3f5fecee6a.jpg", "../../../../media/cache/f5/88/f5889d038f5d8e949b494d147c2dcf54.jpg", "../../../../media/cache/75/dc/75dce2f5949b407161f37f0af249b018.jpg", "../../../../media/cache/69/85/69852567cf97264a1442cbc882c84903.jpg", "../../../../media/cache/27/d2/27d20361745ec2f7be668b18a4da29da.jpg", "../../../../media/cache/78/2e/782e315667ec50759b8603527ee33dec.jpg", "../../../../media/cache/08/89/088995e862aac86c88c608d763f6390e.jpg", "../../../../media/cache/06/a6/06a6cfcf89afd1601cbba1a16cda57fb.jpg", "../../../../media/cache/8a/83/8a83b6ce350f01bab21f85e6ba539316.jpg", "../../../../media/cache/4e/0f/4e0f05ae01d8fb6bd0d3901edd06de16.jpg", "../../../../media/cache/34/f5/34f5f8e513c5f048241f5695e61b2483.jpg", "../../../../media/cache/58/9d/589d73503d9a23d224de836134fae553.jpg", "../../../../media/cache/25/6c/256c946dd0962095f66c6de3b15ab300.jpg", "../../../../media/cache/81/58/81586cd0bf8743e1f5ed80b6a0e1fabe.jpg", "../../../../media/cache/fe/b7/feb764b2afa54991cfdbbffdf501b333.jpg"]},
70 | {"URL": "http://books.toscrape.com/catalogue/category/books/music_14/index.html", "image_link": ["../../../../media/cache/81/c4/81c4a973364e17d01f217e1188253d5e.jpg", "../../../../media/cache/54/60/54607fe8945897cdcced0044103b10b6.jpg", "../../../../media/cache/5c/c8/5cc8e107246cb478960d4f0aba1e1c8e.jpg", "../../../../media/cache/a2/6d/a26d8449abb3381e09126eda5f4e8151.jpg", "../../../../media/cache/06/f1/06f185c0be2ad6e2fe059464c03f1b47.jpg", "../../../../media/cache/85/42/8542841f5644a6daf433504f1e106e97.jpg", "../../../../media/cache/11/fc/11fc94453c4dc0d68543971d7843afb0.jpg", "../../../../media/cache/35/a4/35a4a7c6c76c4e82186753078e441654.jpg", "../../../../media/cache/15/de/15de75548ee9a4c6be1420ee309c03e0.jpg", "../../../../media/cache/7a/7e/7a7eb52e7075a5305522948375c1316e.jpg", "../../../../media/cache/99/97/9997eda658c2fe50e724171f9c2a2b0b.jpg", "../../../../media/cache/7e/94/7e947f3dd04f178175b85123829467a9.jpg", "../../../../media/cache/7f/b0/7fb03a053c270000667a50dd8d594843.jpg"]},
71 | {"URL": "http://books.toscrape.com/catalogue/category/books/nonfiction_13/index.html", "image_link": ["../../../../media/cache/2e/98/2e98c332bf8563b584784971541c4445.jpg", "../../../../media/cache/38/c5/38c56fba316c07305643a8065269594e.jpg", "../../../../media/cache/cb/bd/cbbdb0222ee8a0f6ab61657412a15794.jpg", "../../../../media/cache/9c/46/9c463c7631c82401160fd3b554b8f0e1.jpg", "../../../../media/cache/41/a2/41a20f35adf0caea24f208dc01ad7681.jpg", "../../../../media/cache/03/86/038650c9e7517b4baf2a423cd8eed38f.jpg", "../../../../media/cache/95/64/95647d6a526bf54120b9445e124794e1.jpg", "../../../../media/cache/64/15/641570cd7e7aded53c7d33d78a9629f1.jpg", "../../../../media/cache/2e/23/2e236e23ad52aa74505f224f6552eda8.jpg", "../../../../media/cache/f3/4f/f34ffb24cc21c9f9f52dad4fd8f3ac21.jpg", "../../../../media/cache/97/f8/97f8debeeaaece9603267653076e760f.jpg", "../../../../media/cache/fe/ea/feeafd2ad7b3077f8e74cbb1da9e3c7d.jpg", "../../../../media/cache/64/94/6494bf61176ca73b61255909230030be.jpg", "../../../../media/cache/88/9e/889e0bac4c7c0e7178f0165b8d3b4617.jpg", "../../../../media/cache/23/b4/23b42e094c02d52b14b11a960d49610e.jpg", "../../../../media/cache/03/38/0338682e76bad3216cd4c6c28b2b625a.jpg", "../../../../media/cache/14/f3/14f3d525e2a114cd71e27201a16af188.jpg", "../../../../media/cache/13/57/1357c6aa40c9e63d2f931927fbf81f3f.jpg", "../../../../media/cache/0e/6d/0e6dc2484322c5b9e7854ced66fdf62d.jpg", "../../../../media/cache/6e/d4/6ed4991d97f60db29ec7b421e61a2cf3.jpg"]},
72 | {"URL": "http://books.toscrape.com/catalogue/category/books/religion_12/index.html", "image_link": ["../../../../media/cache/95/30/953013d044aa313cc162dec414f3969a.jpg", "../../../../media/cache/6b/70/6b70f2cdb17d9ab7551240a88b9211fe.jpg", "../../../../media/cache/1f/db/1fdb125bcb8cee71f3404b4dc293348c.jpg", "../../../../media/cache/83/db/83dbf86eb0fed1d99de2148eac4eb064.jpg", "../../../../media/cache/71/91/7191a7d76eb6c3a18259541e2c038ae3.jpg", "../../../../media/cache/4e/69/4e69dacc99de838814d0f65c94e67f6c.jpg", "../../../../media/cache/df/ab/dfabeab158046237ddb6b713b794909f.jpg"]},
73 | {"URL": "http://books.toscrape.com/catalogue/category/books/childrens_11/index.html", "image_link": ["../../../../media/cache/af/6e/af6e796160fe63e0cf19d44395c7ddf2.jpg", "../../../../media/cache/cf/bb/cfbb5e62715c6d888fd07794c9bab5d6.jpg", "../../../../media/cache/c4/a2/c4a2a1a026c67bcceb5a411c724d7d0c.jpg", "../../../../media/cache/26/32/2632a1e12f2c085fabbe022ae4cd6933.jpg", "../../../../media/cache/80/25/8025b80a40178f2a6dd4f99ad88e0fba.jpg", "../../../../media/cache/28/50/2850439c2ba103fb69dba9cd2dd9f0c2.jpg", "../../../../media/cache/2b/38/2b380f77723c797c0389f978afa6db58.jpg", "../../../../media/cache/bb/e2/bbe26db72b8a32117bfe4981b7cc8147.jpg", "../../../../media/cache/97/12/971212afa8e4ff49d92f678bc889d8b7.jpg", "../../../../media/cache/85/e7/85e75d5a9309da5807c82decf3d90263.jpg", "../../../../media/cache/27/1f/271faa1d7561473974d12803feb1f0a1.jpg", "../../../../media/cache/6c/18/6c18ea03e294bfcfe07cf531c6c5f5b3.jpg", "../../../../media/cache/4f/1e/4f1ece2500f8dbacecca42d57befca03.jpg", "../../../../media/cache/8f/66/8f66ec46e671d6fca79649c10c7c8f8a.jpg", "../../../../media/cache/1c/eb/1cebdf525ebe970a1dc3c5a8c50eae6b.jpg", "../../../../media/cache/c0/bb/c0bb6e42743b9c1aaf9b754501100a5d.jpg", "../../../../media/cache/bf/db/bfdbf9726621276fc7821d705690dbae.jpg", "../../../../media/cache/e0/90/e090748ce5a567207aed9185c97eb34b.jpg", "../../../../media/cache/21/bd/21bdf7ae21476b1debf4aa3eefe6f29d.jpg", "../../../../media/cache/ec/08/ec08efebaa33a403e54080b48c139794.jpg"]},
74 | {"URL": "http://books.toscrape.com/catalogue/category/books/fiction_10/index.html", "image_link": ["../../../../media/cache/3e/ef/3eef99c9d9adef34639f510662022830.jpg", "../../../../media/cache/9d/05/9d0533bae1578846d728a82913b95c26.jpg", "../../../../media/cache/5f/15/5f152afdbc42356ecba02f61058a7e5b.jpg", "../../../../media/cache/c4/0a/c40a64f59e7487b1a80a049f6ceb2ba5.jpg", "../../../../media/cache/dc/44/dc44f8e2aebac48ca8553814d9b021a8.jpg", "../../../../media/cache/6b/da/6bdae061cb92c32b0b83cda8dd10275d.jpg", "../../../../media/cache/37/25/372578cc073efae80cf284b56040a488.jpg", "../../../../media/cache/f8/31/f8314c7fdaa79fb7191a583e9a852db8.jpg", "../../../../media/cache/6a/81/6a81103b1c01a3f6c56e5718a838a4c8.jpg", "../../../../media/cache/8f/f8/8ff8680dde59ea739d6978a01e4d7fe5.jpg", "../../../../media/cache/83/05/8305154438c91a02cefacf4ec8b53393.jpg", "../../../../media/cache/38/34/3834572e651cdc14b18d348fa4875aa9.jpg", "../../../../media/cache/d8/a4/d8a44eda7cbe7bd1207f868e9adc06f3.jpg", "../../../../media/cache/8e/c7/8ec7f310b74ddd7ec3c859e9b0da7389.jpg", "../../../../media/cache/03/16/0316bb6f4785ac69c0643109201bad5d.jpg", "../../../../media/cache/ca/b1/cab150e556b5fab663a9fec00ed97943.jpg", "../../../../media/cache/e0/79/e07906c1e507055da9a2260a74f58273.jpg", "../../../../media/cache/a7/f0/a7f092a7b79f848df0226f808fed489b.jpg", "../../../../media/cache/ed/07/ed07c9e7c53d4f33a6eb7d41eb0e6d4a.jpg", "../../../../media/cache/26/3b/263bf5d128bf18553ea8da8bb19e9a0c.jpg"]}
75 | ]
--------------------------------------------------------------------------------
/chapter4/spider_books.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 |
3 |
4 | class BooksSpider(scrapy.Spider):
5 |     name = 'bookLinks'
6 |
7 |     start_urls = ['http://books.toscrape.com']
8 |     images_data = {}
9 |
10 |     def parse(self, response):
11 |         # follow every link found on the start page and scrape the images on each target page
12 |         for href in response.css('a::attr(href)'):
13 |             yield response.follow(href, self.parse_images)
14 |
15 |     def parse_images(self, response):
16 |         print("URL: " + response.request.url)
17 |         def extract_with_css(query):
18 |             return response.css(query).extract()
19 |         yield {
20 |             'URL': response.request.url,
21 |             'image_link': extract_with_css('img::attr(src)')
22 |         }
--------------------------------------------------------------------------------
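Note: a minimal sketch of how this standalone spider could be run to produce a feed like /chapter4/output.json, using Scrapy's CrawlerProcess and the FEEDS setting; the module name spider_books and the output path are assumptions for illustration, not taken from the repository:

    # run_spider_books.py -- hedged example, assuming spider_books.py is importable
    from scrapy.crawler import CrawlerProcess

    from spider_books import BooksSpider

    process = CrawlerProcess(settings={
        # export every item yielded by the spider to a JSON feed
        "FEEDS": {"output.json": {"format": "json"}},
    })
    process.crawl(BooksSpider)
    process.start()  # blocks until the crawl finishes

The image_link values in output.json stay relative (for example "../../media/cache/...") because the spider yields the raw img::attr(src) values; wrapping each extracted value in response.urljoin(...) inside parse_images would be one way to turn them into absolute URLs.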