├── Chapter05 ├── Quotes │ ├── Quotes │ │ ├── __init__.py │ │ ├── scrapinghub.yml │ │ ├── items.pyc │ │ ├── __init__.pyc │ │ ├── settings.pyc │ │ ├── spiders │ │ │ ├── quotes.pyc │ │ │ ├── __init__.pyc │ │ │ ├── __init__.py │ │ │ └── quotes.py │ │ ├── pipelines.py │ │ ├── items.py │ │ └── settings.py │ └── scrapy.cfg ├── scrapinghub_blogs.py ├── scrapinghub.yml ├── scrapy.cfg ├── toscrape_quotes.py ├── bs4_exploring.py └── quotes.csv ├── Chapter09 ├── regex2.py ├── regex_worldpopulation.py ├── regex1.py ├── regexHTML.html ├── regex_xml.py ├── regexHTML.py ├── godfreysfeed.py ├── regex.py └── sitemap.xml ├── Chapter07 ├── usgsEarthquake.py ├── githubAPI.py ├── twitter200.py ├── githubevent.py ├── sunrisesunset.py └── universities.py ├── Chapter04 ├── example3_company_address.py ├── example1_ibm_announcements.py ├── example3_AHL.py ├── example2_quotes_authors.py └── test.html ├── Chapter02 ├── urlerror.py ├── urllib_http_headers.py ├── wikipedia_content.py ├── githubevents.py ├── wikipedia_content_urllib.py ├── httpbin_postrequest.py ├── requeststest.py ├── urllib_test.py └── urllibrobotserror.py ├── Chapter03 ├── lxmlParse.py ├── etreeFromString.py ├── lxmlXML.py ├── lxmlXMLFile.py ├── scrapelxml.py ├── scrapelxmlcss.py ├── food.xml └── scrapeXPathLoop.py ├── README.md ├── Chapter10 ├── bookdetails.csv ├── listToCSV.py ├── analysis.py └── bookdetails.json ├── LICENSE ├── Chapter08 ├── seleniumBrowser.py ├── seleniumLocator.py ├── seleniumBooks.py └── seleniumProducts.py └── Chapter06 ├── testingGroundCookie.py ├── toScrapeSessionCookie.py └── toScrapeViewstate.py /Chapter05/Quotes/Quotes/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Chapter09/regex2.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Removed 3 | ''' 4 | -------------------------------------------------------------------------------- /Chapter05/scrapinghub_blogs.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Removed 3 | ''' -------------------------------------------------------------------------------- /Chapter07/usgsEarthquake.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Removed 3 | ''' 4 | -------------------------------------------------------------------------------- /Chapter09/regex_worldpopulation.py: -------------------------------------------------------------------------------- 1 | ''' 2 | removed 3 | ''' 4 | -------------------------------------------------------------------------------- /Chapter04/example3_company_address.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Removed 3 | ''' 4 | -------------------------------------------------------------------------------- /Chapter09/regex1.py: -------------------------------------------------------------------------------- 1 | import re 2 | import requests 3 | ''' 4 | Content Removed 5 | ''' -------------------------------------------------------------------------------- /Chapter05/scrapinghub.yml: -------------------------------------------------------------------------------- 1 | projects: 2 | default: 385731 3 | stacks: 4 | default: scrapy:1.3-py3 -------------------------------------------------------------------------------- /Chapter05/Quotes/Quotes/scrapinghub.yml: 
-------------------------------------------------------------------------------- 1 | projects: 2 | default: 385731 3 | stacks: 4 | default: scrapy:1.3-py3 -------------------------------------------------------------------------------- /Chapter05/Quotes/Quotes/items.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Supraav/Hands-On-Web-Scraping-with-Python/HEAD/Chapter05/Quotes/Quotes/items.pyc -------------------------------------------------------------------------------- /Chapter05/Quotes/Quotes/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Supraav/Hands-On-Web-Scraping-with-Python/HEAD/Chapter05/Quotes/Quotes/__init__.pyc -------------------------------------------------------------------------------- /Chapter05/Quotes/Quotes/settings.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Supraav/Hands-On-Web-Scraping-with-Python/HEAD/Chapter05/Quotes/Quotes/settings.pyc -------------------------------------------------------------------------------- /Chapter05/Quotes/Quotes/spiders/quotes.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Supraav/Hands-On-Web-Scraping-with-Python/HEAD/Chapter05/Quotes/Quotes/spiders/quotes.pyc -------------------------------------------------------------------------------- /Chapter05/Quotes/Quotes/spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Supraav/Hands-On-Web-Scraping-with-Python/HEAD/Chapter05/Quotes/Quotes/spiders/__init__.pyc -------------------------------------------------------------------------------- /Chapter05/Quotes/Quotes/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Chapter02/urlerror.py: -------------------------------------------------------------------------------- 1 | import urllib.request as request 2 | import urllib.error as error 3 | try: 4 | request.urlopen("https://www.python.ogr") 5 | except error.URLError as e: 6 | print("Error Occurred: ",e.reason) 7 | -------------------------------------------------------------------------------- /Chapter02/urllib_http_headers.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | 3 | url='https://www.samsclub.com/sitemap.xml' 4 | someRequest = urllib.request.urlopen(url)#loads provided URL 5 | someRequest.getheaders() #Lists all HTTP headers. 
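# Note: getheaders() above returns the response headers as a list of (name, value) tuples, and
# getheader('Content-Type') below returns only that header's value (or None if it is absent).
# A minimal sketch for printing them, assuming the same 'someRequest' response object:
# for name, value in someRequest.getheaders():
#     print(name, ':', value)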
6 | someRequest.getheader("Content-Type") #return value of header 'Content-Type' 7 | -------------------------------------------------------------------------------- /Chapter05/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = Blog.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = Blog 12 | -------------------------------------------------------------------------------- /Chapter05/Quotes/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = Quotes.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = Quotes 12 | -------------------------------------------------------------------------------- /Chapter05/Quotes/Quotes/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class QuotesPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /Chapter03/lxmlParse.py: -------------------------------------------------------------------------------- 1 | from lxml import etree 2 | tree = etree.parse("food.xml") 3 | 4 | #iter through selected name found in Tree 5 | for element in tree.iter('name'): 6 | print(element.text) 7 | 8 | #iter through selected elements found in Tree 9 | for element in tree.iter('name','rating','feedback'): 10 | print("{} - {}".format(element.tag, element.text)) 11 | 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## [Get this title for $10 on Packt's Spring Sale](https://www.packt.com/B11487?utm_source=github&utm_medium=packt-github-repo&utm_campaign=spring_10_dollar_2022) 2 | ----- 3 | For a limited period, all eBooks and Videos are only $10. 
All the practical content you need \- by developers, for developers 4 | 5 | # Hands-On-Web-Scraping-with-Python 6 | Hands-On Web Scraping with Python, published by Packt 7 | -------------------------------------------------------------------------------- /Chapter02/wikipedia_content.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import os 3 | link = "https://en.wikipedia.org/wiki/List_of_most_popular_websites" 4 | response = requests.get(link) 5 | print(type(response)) 6 | content = response.content 7 | #print(content) 8 | #Create a html file with the content received as 'content' 9 | file = open(os.getcwd()+os.sep+"tests"+os.sep+"wikicontent.html","wb") 10 | file.write(content) 11 | file.close() 12 | #print(content) 13 | -------------------------------------------------------------------------------- /Chapter02/githubevents.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import os 3 | import json 4 | link = "https://feeds.citibikenyc.com/stations/stations.json" 5 | # link = "https://api.github.com/events" 6 | response = requests.get(link).json() 7 | print(response['stationBeanList'][0]) 8 | # jsonData = json.dumps(response) 9 | # print(type(jsonData)) 10 | # print(response[0]) 11 | 12 | # file = open(os.getcwd()+os.sep+"tests"+os.sep+"github_event.json","w") 13 | # file.write(jsonData) 14 | # file.close() -------------------------------------------------------------------------------- /Chapter02/wikipedia_content_urllib.py: -------------------------------------------------------------------------------- 1 | import urllib.request as req 2 | import os 3 | link = "https://en.wikipedia.org/wiki/List_of_most_popular_websites" 4 | response = req.urlopen(link) 5 | print(type(response)) 6 | #print(response.read()) 7 | 8 | content = response.read() 9 | print(content) 10 | #Create a html file with the content received as 'content' 11 | file = open(os.getcwd()+os.sep+"tests"+os.sep+"wikipopular.html","wb") 12 | file.write(content) 13 | file.close() 14 | #print(content) 15 | -------------------------------------------------------------------------------- /Chapter03/etreeFromString.py: -------------------------------------------------------------------------------- 1 | from lxml import html 2 | import requests 3 | response = requests.get('http://httpbin.org/forms/post') 4 | print(type(response.text)) 5 | # build the DOM Tree 6 | tree = html.fromstring(response.text) 7 | print(type(tree)) 8 | for element in tree.iter('input'): 9 | print("Element: %s \n\tvalues(): %s \n\tattrib: %s \n\titems(): %s \n\tkeys(): %s"% 10 | (element.tag, element.values(),element.attrib,element.items(),element.keys())) 11 | print("\n") 12 | 13 | -------------------------------------------------------------------------------- /Chapter05/Quotes/Quotes/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | class QuotesItem(scrapy.Item): 11 | # define the fields for your item here like: 12 | # name = scrapy.Field() 13 | 14 | tags = scrapy.Field() 15 | author = scrapy.Field() 16 | quote = scrapy.Field() 17 | author_link = scrapy.Field() 18 | 19 | pass 20 | -------------------------------------------------------------------------------- /Chapter03/lxmlXML.py: 
-------------------------------------------------------------------------------- 1 | import requests 2 | from lxml import etree 3 | url="https://www.w3schools.com/xml/simple.xml" 4 | response = requests.get(url).content 5 | tree = etree.XML(response) 6 | print(tree) 7 | print(type(tree)) 8 | #iter through all elements found in Tree 9 | for element in tree.iter(): 10 | print("%s - %s" % (element.tag, element.text)) 11 | 12 | #iter through selected elements found in Tree 13 | for element in tree.iter('calories','name'): 14 | print("%s - %s" % (element.tag, element.text)) 15 | -------------------------------------------------------------------------------- /Chapter02/httpbin_postrequest.py: -------------------------------------------------------------------------------- 1 | import requests 2 | params = {'custname':'Mr. ABC','custtel':'','custemail':'abc@somedomain.com', 3 | 'size':'small','topping':['cheese','mushroom'],'delivery':'13:00','comments':'None'} 4 | headers={ 5 | 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 6 | 'Content-Type':'application/x-www-form-urlencoded', 7 | 'Referer':'http://httpbin.org/forms/post' 8 | } 9 | response = requests.post('http://httpbin.org/post',data=params,headers=headers).json() 10 | print(response) -------------------------------------------------------------------------------- /Chapter07/githubAPI.py: -------------------------------------------------------------------------------- 1 | import requests 2 | url = 'https://api.github.com' 3 | 4 | results = requests.get(url) 5 | print("Type Results", type(results)) 6 | print("Status Code: ", results.status_code) 7 | print("Headers: Content-Type: ", results.headers['Content-Type']) 8 | print("Headers: ", results.headers) 9 | 10 | etag = results.headers['ETag'] 11 | print("ETag: ",etag) 12 | results = requests.get(url, headers={'If-None-Match': etag}) 13 | print("Type Results", type(results)) 14 | print("Status Code: ", results.status_code) 15 | print("Headers: Content-Type: ", results.headers['Content-Type']) 16 | -------------------------------------------------------------------------------- /Chapter10/bookdetails.csv: -------------------------------------------------------------------------------- 1 | Title,Price,Stock,Rating 2 | Rip it Up and ...,35.02,In stock,5 3 | Our Band Could Be ...,57.25,In stock,4 4 | How Music Works,37.32,In stock,2 5 | Love Is a Mix ...,18.03,Out of stock,1 6 | Please Kill Me: The ...,31.19,In stock,4 7 | Kill 'Em and Leave: ...,45.0,In stock,5 8 | "Chronicles, Vol. 
1",52.6,Out of stock,2 9 | This Is Your Brain ...,38.4,In stock,1 10 | Orchestra of Exiles: The ...,12.36,In stock,3 11 | No One Here Gets ...,20.02,In stock,5 12 | Life,31.58,In stock,5 13 | Old Records Never Die: ...,55.66,Out of Stock,2 14 | Forever Rockers (The Rocker ...,28.8,In stock,3 15 | -------------------------------------------------------------------------------- /Chapter03/lxmlXMLFile.py: -------------------------------------------------------------------------------- 1 | from lxml import etree 2 | xml = open("food.xml","rb").read() 3 | #tree = etree.fromstring(xml) 4 | #tree = etree.parse(xml) 5 | tree = etree.XML(xml) 6 | 7 | print(tree) 8 | print(type(tree)) 9 | 10 | #iter through all elements found in Tree 11 | for element in tree.iter(): 12 | print("%s - %s" % (element.tag, element.text)) 13 | 14 | #iter through selected elements found in Tree 15 | for element in tree.iter('price','name'): 16 | print("%s - %s" % (element.tag, element.text)) 17 | 18 | #iter through description 19 | for element in tree.iter('description'): 20 | print("%s - %s" % (element.tag, element.text)) 21 | 22 | -------------------------------------------------------------------------------- /Chapter07/twitter200.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | 4 | url = 'https://api.twitter.com/1.1/search/tweets.json?q=' 5 | 6 | results = requests.get(url) 7 | print("Type Results",type(results)) 8 | print("Status Code: ", results.status_code) 9 | print("Headers: Content-Type: ", results.headers['Content-Type']) 10 | 11 | #jsonResult = results.json() 12 | jsonResult = results.content 13 | print("Type JSON Results",type(jsonResult)) 14 | print(jsonResult) 15 | 16 | jsonFinal = json.loads(jsonResult.decode()) 17 | print(jsonFinal) 18 | #print(json.loads(requests.get(url).content.decode())) 19 | 20 | if results.status_code==400: 21 | print(jsonFinal['errors'][0]['message']) 22 | else: 23 | pass 24 | -------------------------------------------------------------------------------- /Chapter09/regexHTML.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Welcome to Web Scraping: Example 4 | 7 | 8 | 9 |

<h1 style="color:orange;">Welcome to Web Scraping</h1>
10 | Links:
11 | <a href="https://www.google.com" style="color:red;">Google</a>
12 | <a class="classOne" href="https://www.yahoo.com">Yahoo</a>
13 | <a id="idOne" href="https://www.wikipedia.org" style="color:blue;">Wikipedia</a>
14 | <div>
15 | <p id="mainContent" class="content">
16 | <i>Paragraph contents</i>
17 | <img src="mylogo.png" id="pageLogo" class="logo"/>
18 | </p>
19 | <p class="content" id="subContent">
20 | <i style="color:red">Sub paragraph content</i>
21 | <h1 itemprop="subheading">Sub heading Content!</h1>
22 | </p>
23 | </div>
24 | 25 | 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Chapter03/scrapelxml.py: -------------------------------------------------------------------------------- 1 | import lxml.html 2 | 3 | musicUrl= "http://books.toscrape.com/catalogue/category/books/music_14/index.html" 4 | doc = lxml.html.parse(musicUrl) 5 | 6 | #base element 7 | articles = doc.xpath("//*[@id='default']/div/div/div/div/section/div[2]/ol/li[1]/article")[0] 8 | 9 | #individual element inside base 10 | title = articles.xpath("//h3/a/text()") 11 | price = articles.xpath("//div[2]/p[contains(@class,'price_color')]/text()") 12 | availability = articles.xpath("//div[2]/p[2][contains(@class,'availability')]/text()[normalize-space()]") 13 | imageUrl = articles.xpath("//div[1][contains(@class,'image_container')]/a/img/@src") 14 | starRating = articles.xpath("//p[contains(@class,'star-rating')]/@class") 15 | 16 | #cleaning and formatting 17 | stock = list(map(lambda stock:stock.strip(),availability)) 18 | images = list(map(lambda img:img.replace('../../../..','http://books.toscrape.com'),imageUrl)) 19 | rating = list(map(lambda rating:rating.replace('star-rating ',''),starRating)) 20 | 21 | print(title) 22 | print(price) 23 | print(stock) 24 | print(images) 25 | print(rating) 26 | 27 | #Merging all 28 | dataset = zip(title,price,stock,images,rating) 29 | print(list(dataset)) 30 | -------------------------------------------------------------------------------- /Chapter03/scrapelxmlcss.py: -------------------------------------------------------------------------------- 1 | from lxml import html 2 | import requests 3 | from lxml.cssselect import CSSSelector 4 | 5 | url = 'https://developer.ibm.com/announcements/category/data-science/?fa=date%3ADESC&fb=' 6 | url_get = requests.get(url) 7 | tree = html.document_fromstring(url_get.content) 8 | print(type(tree)) 9 | 10 | announcements=[] 11 | articles = tree.cssselect('.ibm--card > a.ibm--card__block_link') 12 | for article in articles: 13 | 14 | link = article.get('href') 15 | atype = article.cssselect('div.ibm--card__body > h5')[0].text.strip() 16 | adate = article.cssselect('div.ibm--card__body > h5 > .ibm--card__date')[0].text 17 | title 
= article.cssselect('div.ibm--card__body > h3.ibm--card__title')[0].text_content() 18 | excerpt= article.cssselect(' div.ibm--card__body > p.ibm--card__excerpt')[0].text 19 | category= article.cssselect('div.ibm--card__bottom > p.cpt-byline__categories span') 20 | #only two available on block: except '+' 21 | 22 | #announcements.append([link,atype,adate,title,excerpt,[category[0].text,category[1].text]]) 23 | announcements.append([link,atype,adate,title,excerpt,[span.text for span in category if span.text!='+']]) 24 | 25 | print(announcements) 26 | -------------------------------------------------------------------------------- /Chapter08/seleniumBrowser.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | import re 3 | chrome_path='chromedriver' 4 | driver = webdriver.Chrome(executable_path=chrome_path) 5 | print(type(driver)) 6 | 7 | driver.get('https://www.python.org') 8 | 9 | print("Title: ",driver.title) 10 | print("Current Page URL: ",driver.current_url) 11 | if re.search(r'python.org',driver.current_url): 12 | driver.save_screenshot("pythonorg.png") 13 | print("Python Screenshot Saved!") 14 | 15 | cookies = driver.get_cookies() 16 | print("Cookies obtained from python.org") 17 | print(cookies) 18 | 19 | print(driver.page_source) 20 | driver.refresh() 21 | 22 | driver.get('https://www.google.com') 23 | print("Title: ",driver.title) 24 | print("Current Page URL: ",driver.current_url) 25 | if re.search(r'google.com',driver.current_url): 26 | driver.save_screenshot("google.png") 27 | print("Google Screenshot Saved!") 28 | 29 | cookies = driver.get_cookies() 30 | print("Cookies obtained from google.com") 31 | print(cookies) 32 | 33 | print("Current Page URL: ",driver.current_url) 34 | driver.back() 35 | print("Page URL (Back): ",driver.current_url) 36 | driver.forward() 37 | print("Page URL (Forward): ",driver.current_url) 38 | 39 | driver.close() 40 | driver.quit() 41 | -------------------------------------------------------------------------------- /Chapter07/githubevent.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | from collections import Counter 4 | dataSet = [] 5 | 6 | url = 'https://api.github.com/' 7 | 8 | 9 | def readUrl(search): 10 | results = requests.get(url + search) 11 | print("Status Code: ", results.status_code) 12 | print("Headers: Content-Type: ", results.headers['Content-Type']) 13 | return results.json() 14 | 15 | 16 | if __name__ == "__main__": 17 | eventTypes=[] 18 | #IssueCommentEvent,WatchEvent,PullRequestReviewCommentEvent,CreateEvent 19 | for page in range(1, 4): 20 | events = readUrl('events?page=' + str(page)) 21 | # print(jsonResult) 22 | for event in events: 23 | id = event['id'] 24 | type = event['type'] 25 | actor = event['actor']['display_login'] 26 | repoUrl = event['repo']['url'] 27 | createdAt = event['created_at'] 28 | eventTypes.append(type) 29 | dataSet.append([id, type, createdAt, repoUrl, actor]) 30 | 31 | eventInfo = dict(Counter(eventTypes)) 32 | print("Individual Event Counts:", eventInfo) 33 | print("CreateEvent Counts:", eventInfo['CreateEvent']) 34 | print("DeleteEvent Counts:", eventInfo['DeleteEvent']) 35 | 36 | print("Total Events Found: ", len(dataSet)) 37 | print(dataSet) 38 | -------------------------------------------------------------------------------- /Chapter08/seleniumLocator.py: -------------------------------------------------------------------------------- 1 | from selenium import 
webdriver 2 | chrome_path='chromedriver' 3 | driver = webdriver.Chrome(executable_path=chrome_path) 4 | driver.get('http://automationpractice.com') 5 | print("Current Page URL: ",driver.current_url) 6 | 7 | searchBox = driver.find_element_by_id('search_query_top') 8 | print("Type :",type(searchBox)) 9 | print("Attribute Value :",searchBox.get_attribute("value")) 10 | print("Attribute Class :",searchBox.get_attribute("class")) 11 | print("Tag Name :",searchBox.tag_name) 12 | 13 | searchBox.clear() 14 | searchBox.send_keys("Dress") 15 | 16 | submitButton = driver.find_element_by_name("submit_search") 17 | submitButton.click() 18 | 19 | resultsShowing = driver.find_element_by_class_name("product-count") 20 | print("Results Showing: ",resultsShowing.text) 21 | 22 | resultsFound = driver.find_element_by_xpath('//*[@id="center_column"]//span[@class="heading-counter"]') 23 | print("Results Found: ",resultsFound.text) 24 | 25 | products = driver.find_elements_by_xpath('//*[@id="center_column"]//a[@class="product-name"]') 26 | #products = driver.find_elements_by_css_selector('ul.product_list li.ajax_block_product a.product-name') 27 | 28 | foundProducts=[] 29 | for product in products: 30 | foundProducts.append([product.text,product.get_attribute("href")]) 31 | 32 | print(foundProducts) 33 | 34 | driver.close() 35 | driver.quit() 36 | -------------------------------------------------------------------------------- /Chapter06/testingGroundCookie.py: -------------------------------------------------------------------------------- 1 | from pyquery import PyQuery as pq 2 | import requests 3 | mainUrl = "http://testing-ground.scraping.pro" 4 | loginUrl = "http://testing-ground.scraping.pro/login" 5 | postUrl="http://testing-ground.scraping.pro/login?mode=login" 6 | logoutUrl = "http://testing-ground.scraping.pro/login?mode=logout" 7 | 8 | def responseCookies(response): 9 | headers = response.headers 10 | cookies = response.cookies 11 | print("Headers: ", headers) 12 | print("Cookies: ", cookies) 13 | 14 | def processParams(params): 15 | response = requests.post(postUrl, data=params) 16 | responseB = pq(response.text) 17 | message = responseB.find('div#case_login h3').text() 18 | print("Confirm Login : ",message) 19 | 20 | if __name__ == '__main__': 21 | requests.get(logoutUrl) 22 | response = requests.get(mainUrl) 23 | responseCookies(response) 24 | 25 | response = requests.get(loginUrl) 26 | responseCookies(response) 27 | 28 | responseA = pq(response.text) 29 | username = responseA.find('input[id="usr"]').attr('name') 30 | password = responseA.find('input[id="pwd"]').attr('name') 31 | 32 | #Welcome : Success 33 | paramsCorrect = {username: 'admin', password: '12345'} #Success 34 | print(paramsCorrect) 35 | processParams(paramsCorrect) 36 | 37 | paramsIncorrect = {username: 'admin', password: '123456'} #Access Denied 38 | print(paramsIncorrect) 39 | processParams(paramsIncorrect) 40 | -------------------------------------------------------------------------------- /Chapter03/food.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Butter Milk with Vanilla 5 | $3.99 6 | Rich tangy buttermilk with vanilla essence 7 | 5.0 8 | 6 9 | 10 | 11 | Fish and Chips 12 | $4.99 13 | Crispy fried Chips and Fish served with lemon and malt vinegar 14 | 5.0 15 | 10 16 | 17 | 18 | Egg Roll 19 | $3.99 20 | Fresh egg rolls filled with ground chicken, carrot, cabbage 21 | 4.0 22 | 8 23 | 24 | 25 | Pineapple Cake 26 | $3.99 27 | Crushed Pineapple mixed with vanilla, eggs and 
lemon juice 28 | 5.0 29 | 9 30 | 31 | 32 | Eggs and Bacon 33 | $5.50 34 | Served with rice and fresh fruit 35 | 4.5 36 | 4 37 | 38 | 39 | Orange Juice 40 | $2.99 41 | Fresh Orange juice served 42 | 4.9 43 | 10 44 | 45 | 46 | -------------------------------------------------------------------------------- /Chapter07/sunrisesunset.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | 4 | # location: Kathmandu, Nepal 5 | # lat = 27.717245 , lng=85.323959 6 | url = 'https://api.sunrise-sunset.org/json?lat=27.717245&lng=85.323959&date=2019-03-04' 7 | 8 | results = requests.get(url) 9 | print("Type Results",type(results)) 10 | print("Status Code: ", results.status_code) 11 | print("Headers: Content-Type: ", results.headers['Content-Type']) 12 | print("Headers: ", results.headers) 13 | 14 | jsonResult = results.json() 15 | print("Type JSON Results",type(jsonResult)) 16 | print(jsonResult) 17 | print("SunRise & Sunset: ",jsonResult['results']['sunrise']," & ",jsonResult['results']['sunset']) 18 | 19 | 20 | # Type Results 21 | # Status Code: 200 22 | # Headers-ContentType: application/json 23 | # Headers: {'Access-Control-Allow-Origin': '*', 'Content-Type': 'application/json', 'Vary': 'Accept-Encoding', 'Server': 'nginx', 'Connection': 'keep-alive', 'Content-Encoding': 'gzip', 'Transfer-Encoding': 'chunked', 'Date': 'Mon, 04 Mar 2019 07:48:29 GMT'} 24 | # Type JSON Results 25 | # {'status': 'OK', 'results': {'civil_twilight_end': '12:44:16 PM', 'astronomical_twilight_end': '1:38:31 PM', 'civil_twilight_begin': '12:16:32 AM', 'sunrise': '12:39:54 AM', 'nautical_twilight_begin': '11:49:24 PM', 'astronomical_twilight_begin': '11:22:17 PM', 'nautical_twilight_end': '1:11:24 PM', 'sunset': '12:20:54 PM', 'solar_noon': '6:30:24 AM', 'day_length': '11:41:00'}} 26 | # SunRise & Sunset: 12:39:54 AM & 12:20:54 PM 27 | -------------------------------------------------------------------------------- /Chapter07/universities.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | dataSet = [] 4 | 5 | url = 'http://universities.hipolabs.com/search?name=' 6 | 7 | def readUrl(search): 8 | results = requests.get(url+search) 9 | print("Status Code: ", results.status_code) 10 | print("Headers: Content-Type: ", results.headers['Content-Type']) 11 | # print("Headers: ", results.headers) 12 | return results.json() 13 | 14 | if __name__=="__main__": 15 | jsonResult = readUrl('Wales') 16 | # print(jsonResult) 17 | for university in jsonResult: 18 | name = university['name'] 19 | url = university['web_pages'][0] 20 | dataSet.append([name,url]) 21 | 22 | print("Total Universities Found: ",len(dataSet)) 23 | print(dataSet) 24 | 25 | ''' 26 | Status Code: 200 27 | Headers: Content-Type: application/json 28 | Total Universities Found: 10 29 | [['University of Wales', 'http://www.wales.ac.uk/'], 30 | ['University of Wales Institute, Cardiff', 'http://www.uwic.ac.uk/'], 31 | ['University of Wales College of Medicine', 'http://www.uwcm.ac.uk/'], 32 | ['Johnson & Wales University', 'http://www.jwu.edu/'], 33 | ['University of New South Wales', 'http://www.unsw.edu.au/'], 34 | ['University of Wales, Newport', 'http://www.newport.ac.uk/'], 35 | ['University of Wales, Swansea', 'http://www.swan.ac.uk/'], 36 | ['University of Wales, Aberystwyth', 'http://www.aber.ac.uk/'], 37 | ['University of Wales, Lampeter', 'http://www.lamp.ac.uk/'], 38 | ['University of Wales, Bangor', 
'http://www.bangor.ac.uk/']] 39 | ''' 40 | -------------------------------------------------------------------------------- /Chapter10/listToCSV.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | 4 | colNames = ['Title','Price','Stock','Rating'] 5 | dataSet = [ 6 | ['Rip it Up and ...', 35.02, 'In stock', 5], 7 | ['Our Band Could Be ...', 57.25, 'In stock', 4], 8 | ['How Music Works', 37.32, 'In stock', 2], 9 | ['Love Is a Mix ...', 18.03, 'Out of stock',1], 10 | ['Please Kill Me: The ...', 31.19, 'In stock', 4], 11 | ["Kill 'Em and Leave: ...", 45.0, 'In stock',5], 12 | ['Chronicles, Vol. 1', 52.60, 'Out of stock',2], 13 | ['This Is Your Brain ...', 38.4, 'In stock',1], 14 | ['Orchestra of Exiles: The ...', 12.36, 'In stock',3], 15 | ['No One Here Gets ...', 20.02, 'In stock',5], 16 | ['Life', 31.58, 'In stock',5], 17 | ['Old Records Never Die: ...', 55.66, 'Out of Stock',2], 18 | ['Forever Rockers (The Rocker ...', 28.80, 'In stock',3] 19 | ] 20 | 21 | print(dataSet) 22 | 23 | fileCsv = open('bookdetails.csv', 'w', newline='', encoding='utf-8') 24 | writer = csv.writer(fileCsv) 25 | writer.writerow(colNames) 26 | for data in dataSet: 27 | writer.writerow(data) 28 | fileCsv.close() 29 | 30 | 31 | finalDataSet=list() #empty Dataset 32 | for data in dataSet: 33 | print(dict(zip(colNames,data))) 34 | finalDataSet.append(dict(zip(colNames,data))) 35 | print(finalDataSet) 36 | 37 | with open('bookdetails.json', 'w') as jsonfile: 38 | json.dump(finalDataSet,jsonfile) 39 | 40 | 41 | with open('bookdetails.json', 'r+') as jsonfile: 42 | data = json.load(jsonfile) 43 | print(data) 44 | print(data[0]) 45 | print(data[0]['id']) 46 | print(data[0]['price']) 47 | print(data[0:2]) 48 | -------------------------------------------------------------------------------- /Chapter09/regex_xml.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | filename = 'sitemap.xml' 4 | 5 | # collect Blog title information from URLs except not link to any category 6 | dataSetBlog = [] 7 | dataSetBlogURL = [] # collects Blog URLs 8 | dataSetCategory = [] # collect Category title 9 | dataSetCategoryURL = [] # collect Category URLs 10 | 11 | page = open(filename, 'r').read() 12 | pattern = r"loc>(.*) 0 and not re.match('(category)', blogTitle[0]): 24 | dataSetBlog.append(blogTitle[0]) 25 | 26 | if re.match(r'.*category', url): #Category Related 27 | dataSetCategoryURL.append(url) 28 | categoryTitle = re.findall(r'category/([\w\-\s]+)', url) 29 | dataSetCategory.append(categoryTitle[0]) 30 | 31 | 32 | print("Blogs URL: ", len(dataSetBlogURL)) 33 | print(dataSetBlogURL) 34 | 35 | print("Blogs Title: ", len(dataSetBlog)) 36 | print(dataSetBlog) 37 | 38 | print("Unique Blog Count: ", len(set(dataSetBlog))) 39 | print(set(dataSetBlog)) 40 | 41 | print("Category URL Count: ", len(dataSetCategoryURL)) 42 | print(dataSetCategoryURL) 43 | 44 | print("Category Title Count: ", len(dataSetCategory)) 45 | print(dataSetCategory) 46 | 47 | print("Unique Category Count: ", len(set(dataSetCategory))) 48 | print(set(dataSetCategory)) 49 | 50 | 51 | -------------------------------------------------------------------------------- /Chapter10/analysis.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | 4 | dataSet = pd.read_csv('bookdetails.csv') 5 | 6 | print(type(dataSet)) 7 | print(dataSet) 8 | print(dataSet.describe()) 9 | 
print(dataSet.columns) 10 | print(sum(dataSet['Price'])) 11 | print(sum(dataSet['Rating'])) 12 | print(dataSet[['Price','Rating']]) 13 | print(dataSet['Price']) 14 | print(dataSet[dataSet.Stock.str.contains(r'Out')]['Price']) 15 | print(dataSet[dataSet['Rating']>=4.0][['Title','Price']]) 16 | print(dataSet[dataSet.Rating.between(3.5,4.5)]['Title']) 17 | 18 | 19 | #Chart1 20 | price_group = dataSet[['Price']] 21 | print(price_group) 22 | bar_plot = price_group.plot() 23 | bar_plot.set_xlabel("No of Books") 24 | bar_plot.set_ylabel("Price") 25 | plt.show() 26 | 27 | #Chart2 28 | price_group = dataSet[['Price']] 29 | bar_plot = price_group.plot(kind='bar') 30 | bar_plot.set_xlabel("No of Books") 31 | bar_plot.set_ylabel("Price") 32 | plt.show() 33 | 34 | #Chart3 35 | price_group = dataSet[['Price','Rating']] 36 | bar_plot = price_group.plot(kind='bar',title="Book Price and Rating") 37 | bar_plot.set_xlabel("No of Books") 38 | bar_plot.set_ylabel("Price") 39 | plt.show() 40 | 41 | #Chart4 42 | labels = dataSet[['Stock']] 43 | print(labels) 44 | price_group = dataSet[['Price','Rating']] 45 | bar_plot = price_group.plot(kind='bar',title="Book Price and Rating") 46 | bar_plot.set_xlabel("No of Books") 47 | bar_plot.set_xticklabels(labels) 48 | bar_plot.set_ylabel("Price") 49 | plt.show() 50 | 51 | #Chart5 - PieChart 52 | prices = dataSet['Price'][0:6] #Price from first 6 items 53 | labels = dataSet['Title'][0:6] #Book Titles from first 6 items 54 | legends,ax1 = plt.pie(prices, labels=labels, shadow=True, startangle=45) 55 | plt.legend(legends, prices, loc="best") 56 | plt.show() 57 | -------------------------------------------------------------------------------- /Chapter10/bookdetails.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "Price": 35.02, 4 | "Stock": "In stock", 5 | "Title": "Rip it Up and ...", 6 | "Rating": 5 7 | }, 8 | { 9 | "Price": 57.25, 10 | "Stock": "In stock", 11 | "Title": "Our Band Could Be ...", 12 | "Rating": 4 13 | }, 14 | { 15 | "Price": 37.32, 16 | "Stock": "In stock", 17 | "Title": "How Music Works", 18 | "Rating": 2 19 | }, 20 | { 21 | "Price": 18.03, 22 | "Stock": "Out of stock", 23 | "Title": "Love Is a Mix ...", 24 | "Rating": 1 25 | }, 26 | { 27 | "Price": 31.19, 28 | "Stock": "In stock", 29 | "Title": "Please Kill Me: The ...", 30 | "Rating": 4 31 | }, 32 | { 33 | "Price": 45.0, 34 | "Stock": "In stock", 35 | "Title": "Kill 'Em and Leave: ...", 36 | "Rating": 5 37 | }, 38 | { 39 | "Price": 52.6, 40 | "Stock": "Out of stock", 41 | "Title": "Chronicles, Vol. 
1", 42 | "Rating": 2 43 | }, 44 | { 45 | "Price": 38.4, 46 | "Stock": "In stock", 47 | "Title": "This Is Your Brain ...", 48 | "Rating": 1 49 | }, 50 | { 51 | "Price": 12.36, 52 | "Stock": "In stock", 53 | "Title": "Orchestra of Exiles: The ...", 54 | "Rating": 3 55 | }, 56 | { 57 | "Price": 20.02, 58 | "Stock": "In stock", 59 | "Title": "No One Here Gets ...", 60 | "Rating": 5 61 | }, 62 | { 63 | "Price": 31.58, 64 | "Stock": "In stock", 65 | "Title": "Life", 66 | "Rating": 5 67 | }, 68 | { 69 | "Price": 55.66, 70 | "Stock": "Out of Stock", 71 | "Title": "Old Records Never Die: ...", 72 | "Rating": 2 73 | }, 74 | { 75 | "Price": 28.8, 76 | "Stock": "In stock", 77 | "Title": "Forever Rockers (The Rocker ...", 78 | "Rating": 3 79 | } 80 | ] 81 | -------------------------------------------------------------------------------- /Chapter04/example1_ibm_announcements.py: -------------------------------------------------------------------------------- 1 | from pyquery import PyQuery as pq 2 | import requests 3 | 4 | sourceUrl='https://developer.ibm.com/announcements/' 5 | dataSet = list() 6 | 7 | def read_url(url): 8 | """Read given Url , Returns pyquery object for page content""" 9 | pageSource = requests.get(url).content 10 | return pq(pageSource) 11 | 12 | def get_details(page): 13 | """read 'page' url and append list of queried items to dataSet""" 14 | response = read_url(page) 15 | 16 | articles = response.find('.ibm--card > a.ibm--card__block_link') 17 | print("\nTotal articles found :", articles.__len__(), ' in Page: ', page) 18 | for article in articles.items(): 19 | link = article.attr('href') 20 | articlebody = article.find('div.ibm--card__body') 21 | adate = articlebody.find('h5 > .ibm--card__date').text() 22 | articlebody.find('h5 > .ibm--card__date').remove() 23 | atype = articlebody.find('h5').text().strip() 24 | title = articlebody.find('h3.ibm--card__title').text().encode('utf-8') 25 | excerpt = articlebody.find('p.ibm--card__excerpt').text().encode('utf-8') 26 | category = article.find('div.ibm--card__bottom > p.cpt-byline__categories span') 27 | if link: 28 | link = str(link).replace('/announcements/', sourceUrl) 29 | categories = [span.text for span in category if span.text != '+'] 30 | dataSet.append([link, atype, adate, title, excerpt,",".join(categories)]) 31 | 32 | if __name__ == '__main__': 33 | pageUrl = sourceUrl+"category/data-science/?fa=date:DESC&fb=" 34 | 35 | pageUrls = [ 36 | sourceUrl+"category/data-science/page/%(page)s?fa=date:DESC&fb=" % {'page': page} 37 | for page in range(1, 3)] 38 | 39 | for pages in pageUrls: 40 | get_details(pages) 41 | 42 | print("\nTotal articles collected: ", len(dataSet)) 43 | print(dataSet) 44 | -------------------------------------------------------------------------------- /Chapter04/example3_AHL.py: -------------------------------------------------------------------------------- 1 | from pyquery import PyQuery as pq 2 | import re 3 | 4 | sourceUrl = 'http://www.flyershistory.com/cgi-bin/ml-poffs.cgi' 5 | dataSet = list() 6 | keys = ['year','month','day','game_date','team1', 'team1_score', 'team2', 'team2_score', 'game_status'] 7 | 8 | def read_url(url): 9 | """Read given Url , Returns pyquery object for page content""" 10 | pageSource = pq(url) 11 | return pq(pageSource) 12 | 13 | 14 | if __name__ == '__main__': 15 | page = read_url(sourceUrl) 16 | 17 | tableRows = page.find("h1:contains('AHL Playoff Results') + table tr") 18 | print("\nTotal rows found :", tableRows.__len__()) 19 | 20 | for tr in tableRows.items(): 21 | team1 = 
tr.find('td').eq(1).text() 22 | if team1 != '': 23 | game_date = tr.find('td').eq(0).text() 24 | dates = re.search(r'(.*)-(.*)-(.*)',game_date) 25 | 26 | team1_score = tr.find('td').eq(2).text() 27 | team2 = tr.find('td').eq(4).text() 28 | team2_score = tr.find('td').eq(5).text() 29 | 30 | #check Game Status should be either 'W' or 'L' 31 | game_status = tr.find('td').eq(6).text() 32 | if not re.match(r'[WL]',game_status): 33 | game_status = tr.find('td').eq(7).text() 34 | 35 | #breaking down date in year,month and day 36 | year = dates.group(3) 37 | month = dates.group(2) 38 | day = dates.group(1) 39 | if len(year)==2 and int(year)>=68: 40 | year = '19'+year 41 | elif len(year)==2 and int(year) <68: 42 | year = '20'+year 43 | else: 44 | pass 45 | 46 | #appending individual data list to the dataSet 47 | dataSet.append([year,month,day,game_date,team1,team1_score,team2,team2_score,game_status]) 48 | 49 | print("\nTotal Game Status, found :", len(dataSet)) 50 | print(dataSet) 51 | -------------------------------------------------------------------------------- /Chapter08/seleniumBooks.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.common.exceptions import NoSuchElementException 3 | 4 | chrome_path = 'chromedriver' 5 | driver = webdriver.Chrome(executable_path=chrome_path) 6 | driver.get('http://books.toscrape.com/index.html') 7 | 8 | dataSet = [] 9 | # select: Food and Drink 10 | driver.find_element_by_link_text("Food and Drink").click() 11 | print("Current Page URL: ", driver.current_url) 12 | totalBooks = driver.find_element_by_xpath("//*[@id='default']//form/strong[1]") 13 | print("Found: ", totalBooks.text) 14 | 15 | page = True 16 | while page: 17 | listings = driver.find_elements_by_xpath("//*[@id='default']//ol/li[position()>0]") 18 | for listing in listings: 19 | url = listing.find_element_by_xpath(".//article[contains(@class,'product_pod')]/h3/a").get_attribute('href') 20 | title = listing.find_element_by_xpath(".//article[contains(@class,'product_pod')]/h3/a").text 21 | titleLarge = listing.find_element_by_xpath(".//article[contains(@class,'product_pod')]/h3/a").get_attribute( 22 | 'title') 23 | price = listing.find_element_by_xpath(".//article/div[2]/p[contains(@class,'price_color')]").text 24 | stock = listing.find_element_by_xpath(".//article/div[2]/p[2][contains(@class,'availability')]").text 25 | image = listing.find_element_by_xpath( 26 | ".//article/div[1][contains(@class,'image_container')]/a/img").get_attribute('src') 27 | starRating = listing.find_element_by_xpath(".//article/p[contains(@class,'star-rating')]").get_attribute( 28 | 'class') 29 | dataSet.append([titleLarge, title, price, stock, image, starRating.replace('star-rating ', ''), url]) 30 | 31 | try: 32 | #Check for Pagination with text 'next' 33 | driver.find_element_by_link_text('next').click() 34 | continue 35 | except NoSuchElementException: 36 | page = False 37 | 38 | print("Completed") 39 | 40 | print(dataSet) 41 | 42 | driver.close() 43 | driver.quit() 44 | -------------------------------------------------------------------------------- /Chapter06/toScrapeSessionCookie.py: -------------------------------------------------------------------------------- 1 | from pyquery import PyQuery as pq 2 | import requests 3 | mainUrl = "http://toscrape.com/" 4 | loginUrl = "http://quotes.toscrape.com/login" 5 | quoteUrl = "http://quotes.toscrape.com/" 6 | 7 | def getCustomHeaders(cookieHeader): 8 | return { 9 | 'Host': 
'quotes.toscrape.com', 10 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0', 11 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 12 | 'Referer': 'http://quotes.toscrape.com/login', 13 | 'Content-Type': 'application/x-www-form-urlencoded', 14 | 'Cookie': cookieHeader, 15 | } 16 | 17 | def responseCookies(response): 18 | headers = response.headers 19 | cookies = response.cookies 20 | print("Headers: ", headers) 21 | print("Cookies: ", cookies) 22 | return headers['Set-Cookie'] 23 | 24 | if __name__ == '__main__': 25 | requests.get(mainUrl) 26 | response = requests.get(loginUrl) 27 | setCookie = responseCookies(response) 28 | print("Set-Cookie: ",setCookie) 29 | 30 | responseA = pq(response.text) 31 | csrf_token = responseA.find('input[name="csrf_token"]').attr('value') 32 | username = responseA.find('input[id="username"]').attr('name') 33 | password = responseA.find('input[id="password"]').attr('name') 34 | params = {username: 'test', password: 'test','csrf_token': csrf_token} 35 | print(params) 36 | 37 | customheaders = getCustomHeaders(setCookie) 38 | response = requests.post(loginUrl, data=params, headers=customheaders) 39 | # response = requests.post(loginUrl, data=params, headers={}) 40 | setCookie = responseCookies(response) 41 | #print("Set-Cookie: ",setCookie) 42 | 43 | responseB = pq(response.text) 44 | logoutText = responseB.find('a[href*="logout"]').text() 45 | logoutLink = responseB.find('a[href*="logout"]').attr('href') 46 | print("Current Page : ",response.url) 47 | print("Confirm Login : ", responseB.find('.row h2').text()) 48 | print("Logout Info : ", logoutText," & ",logoutLink) 49 | -------------------------------------------------------------------------------- /Chapter04/example2_quotes_authors.py: -------------------------------------------------------------------------------- 1 | from pyquery import PyQuery as pq 2 | 3 | sourceUrl = 'http://quotes.toscrape.com/tag/books/' 4 | dataSet = list() 5 | keys = ['quote_tags','author_url','author_name','born_date','born_location','quote_title'] 6 | 7 | def read_url(url): 8 | """Read given Url , Returns pyquery object for page content""" 9 | pageSource = pq(url) 10 | return pq(pageSource) 11 | 12 | 13 | def get_details(page): 14 | """read 'page' url and append list of queried items to dataSet""" 15 | nextPage = True 16 | pageNo = 1 17 | while (nextPage): 18 | response = read_url(page + 'page/' + str(pageNo)) 19 | if response.find("ul.pager:has('li.next')"): 20 | nextPage = True 21 | else: 22 | nextPage = False 23 | 24 | quotes = response.find('.quote') 25 | print("\nTotal Quotes found :", quotes.__len__(), ' in Page: ', pageNo) 26 | for quote in quotes.items(): 27 | title = quote.find('[itemprop="text"]:first').text() 28 | author = quote.find('[itemprop="author"]:first').text() 29 | authorLink = quote.find('a[href*="/author/"]:first').attr('href') 30 | tags = quote.find('.tags [itemprop="keywords"]').attr('content') 31 | 32 | if authorLink: 33 | authorLink = 'http://quotes.toscrape.com' + authorLink 34 | linkDetail = read_url(authorLink) 35 | born_date = linkDetail.find('.author-born-date').text() 36 | born_location = linkDetail.find('.author-born-location').text() 37 | if born_location.startswith('in'): 38 | born_location = born_location.replace('in ','') 39 | dataSet.append(dict(zip(keys,[tags,authorLink,author,born_date,born_location,title[0:50]]))) 40 | pageNo += 1 41 | 42 | if __name__ == '__main__': 43 | get_details(sourceUrl) 44 | 
print("\nTotal Quotes collected: ", len(dataSet)) 45 | print(dataSet) 46 | for info in dataSet: 47 | print(info['author_name'],' born on ',info['born_date'], ' in ',info['born_location']) 48 | 49 | -------------------------------------------------------------------------------- /Chapter08/seleniumProducts.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | chrome_path='chromedriver' 3 | driver = webdriver.Chrome(executable_path=chrome_path) 4 | driver.get('http://automationpractice.com') 5 | print("Current Page URL: ",driver.current_url) 6 | 7 | searchBox = driver.find_element_by_id('search_query_top') 8 | print("Type :",type(searchBox)) 9 | print("Attribute Value :",searchBox.get_attribute("value")) 10 | print("Attribute Class :",searchBox.get_attribute("class")) 11 | print("Tag Name :",searchBox.tag_name) 12 | 13 | searchBox.clear() 14 | searchBox.send_keys("Dress") 15 | 16 | submitButton = driver.find_element_by_name("submit_search") 17 | submitButton.click() 18 | 19 | resultsShowing = driver.find_element_by_class_name("product-count") 20 | print("Results Showing: ",resultsShowing.text) 21 | 22 | resultsFound = driver.find_element_by_xpath('//*[@id="center_column"]//span[@class="heading-counter"]') 23 | print("Results Found: ",resultsFound.text) 24 | 25 | products = driver.find_elements_by_xpath('//*[@id="center_column"]//a[@class="product-name"]') 26 | #products = driver.find_elements_by_css_selector('ul.product_list li.ajax_block_product a.product-name') 27 | 28 | foundProducts=[] 29 | dataSet=[] 30 | for product in products: 31 | foundProducts.append([product.text,product.get_attribute("href")]) 32 | 33 | print(foundProducts) 34 | 35 | dataSet=[] 36 | if len(foundProducts)>0: 37 | for foundProduct in foundProducts: 38 | driver.get(foundProduct[1]) 39 | product_url = driver.current_url 40 | product_name = driver.find_element_by_xpath('//*[@id="center_column"]//h1[@itemprop="name"]').text 41 | short_description = driver.find_element_by_xpath('//*[@id="short_description_content"]').text 42 | product_price = driver.find_element_by_xpath('//*[@id="our_price_display"]').text 43 | image_url = driver.find_element_by_xpath('//*[@id="bigpic"]').get_attribute('src') 44 | condition = driver.find_element_by_xpath('//*[@id="product_condition"]/span').text 45 | dataSet.append([product_name,product_price,condition,short_description,image_url,product_url]) 46 | 47 | 48 | print(dataSet) 49 | 50 | driver.close() 51 | driver.quit() 52 | -------------------------------------------------------------------------------- /Chapter03/scrapeXPathLoop.py: -------------------------------------------------------------------------------- 1 | import lxml.html 2 | from lxml.etree import XPath 3 | 4 | baseUrl = "http://books.toscrape.com/" 5 | bookUrl = "http://books.toscrape.com/catalogue/category/books/food-and-drink_33/index.html" 6 | pageUrl = "http://books.toscrape.com/catalogue/category/books/food-and-drink_33/page-" 7 | 8 | dataSet = [] 9 | page=1 10 | totalPages=1 11 | while(page<=totalPages): 12 | print("Rows in Dataset: "+str(len(dataSet))) 13 | 14 | if(page==1): 15 | doc = lxml.html.parse(pageUrl+str(page)+".html").getroot() 16 | perPageArticles = doc.xpath("//*[@id=\"default\"]//form/strong[3]/text()") 17 | totalArticles = doc.xpath("//*[@id=\"default\"]//form/strong[1]/text()") 18 | totalPages = round(int(totalArticles[0])/int(perPageArticles[0])) 19 | print(str(totalArticles[0])+" Results, showing "+str(perPageArticles[0])+" Articles 
per page") 20 | else: 21 | doc = lxml.html.parse(pageUrl+str(page)+".html").getroot() 22 | 23 | #used to find page url pattern 24 | nextPage = doc.xpath("//*[@id=\"default\"]//ul[contains(@class,'pager')]/li[2][contains(@class,'next')]/a/@href") 25 | if len(nextPage)>0: 26 | print("Scraping Page "+str(page)+" of "+str(totalPages)+". NextPage > "+str(nextPage[0])) 27 | else: 28 | print("Scraping Page "+str(page)+" of "+str(totalPages)) 29 | 30 | articles = XPath("//*[@id='default']//ol/li[position()>0]") 31 | titlePath = XPath(".//article[contains(@class,'product_pod')]/h3/a/text()") 32 | pricePath = XPath(".//article/div[2]/p[contains(@class,'price_color')]/text()") 33 | stockPath = XPath(".//article/div[2]/p[2][contains(@class,'availability')]/text()[normalize-space()]") 34 | imagePath = XPath(".//article/div[1][contains(@class,'image_container')]/a/img/@src") 35 | starRating = XPath(".//article/p[contains(@class,'star-rating')]/@class") 36 | 37 | for row in articles(doc): 38 | title = titlePath(row)[0] 39 | price = pricePath(row)[0] 40 | availability = stockPath(row)[0].strip() 41 | image = imagePath(row)[0] 42 | rating = starRating(row)[0] 43 | 44 | dataSet.append([title,price,availability,image.replace('../../../..',baseUrl),rating.replace('star-rating ','')]) 45 | 46 | page+=1 47 | 48 | print(dataSet) 49 | -------------------------------------------------------------------------------- /Chapter05/toscrape_quotes.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Listing Quotes from first 5 or less pages found 3 | from 'http://quotes.toscrape.com/' 4 | ''' 5 | 6 | import requests 7 | import re 8 | from bs4 import BeautifulSoup 9 | import csv 10 | 11 | sourceUrl = 'http://quotes.toscrape.com/' 12 | keys = ['quote_tags','author_url','author_name','born_date','born_location','quote_title'] 13 | 14 | 15 | def read_url(url): 16 | """Read given Url , Returns requests object for page content""" 17 | response = requests.get(url) 18 | return response.text 19 | 20 | 21 | def get_details(page, dataWriter): 22 | """Get 'response' for first 5 pages, parse it and collect data for 'keys' headers""" 23 | nextPage = True 24 | pageNo = 1 25 | while (nextPage and pageNo <= 5): 26 | response = read_url(page + 'page/' + str(pageNo)) 27 | soup = BeautifulSoup(response, 'lxml') 28 | 29 | rows = soup.find_all('div', 'quote') 30 | if (len(rows) > 0): 31 | print("Page ",pageNo," Total Quotes Found ",len(rows)) 32 | for row in rows: 33 | if row.find('span',attrs={'itemprop':'text'}): 34 | 35 | title = row.find(attrs={'itemprop':'text'}).text.strip() 36 | author = row.find(attrs={'itemprop':'author'}).text.strip() 37 | authorLink = row.find('a',href=re.compile(r'/author/')).get('href') 38 | tags = row.find('div','tags').find(itemprop="keywords").get('content') 39 | print(title, ' : ', author,' : ',authorLink, ' : ',tags) 40 | 41 | if authorLink: 42 | authorLink = 'http://quotes.toscrape.com' + authorLink 43 | linkDetail = read_url(authorLink) 44 | soupInner = BeautifulSoup(linkDetail, 'lxml') 45 | 46 | born_date = soupInner.find('span','author-born-date').text.strip() 47 | born_location = soupInner.find('span','author-born-location').text.strip() 48 | 49 | # Write a list of values in file 50 | dataWriter.writerow([tags,authorLink,author,born_date,born_location.replace('in ',''),title]) 51 | 52 | nextPage = True 53 | pageNo += 1 54 | else: 55 | print("Quotes Not Listed!") 56 | 57 | 58 | 59 | if __name__ == '__main__': 60 | dataSet = open('quotes.csv', 'w', newline='', 
encoding='utf-8') 61 | dataWriter = csv.writer(dataSet) 62 | # Write a Header or Column_names to CSV 63 | dataWriter.writerow(keys) 64 | get_details(sourceUrl, dataWriter) 65 | # get_details(sourceUrl) 66 | dataSet.close() 67 | -------------------------------------------------------------------------------- /Chapter09/regexHTML.py: -------------------------------------------------------------------------------- 1 | ''' 2 | In this code we will be using Regex to find the listed information from the HTML content: 3 | -HTML elements, 4 | -Elements attributes ('key' and 'values') and 5 | -Elements content. 6 | ''' 7 | 8 | import re 9 | from bs4 import BeautifulSoup 10 | 11 | 12 | def read_file(): 13 | ''' 14 | Read and return content from file (.html). 15 | ''' 16 | content = open("regexHTML.html", "r") 17 | pageSource = content.read() 18 | return pageSource 19 | 20 | 21 | def applyPattern(pattern): 22 | tags = re.findall(pattern, page) 23 | print("Pattern r'{}' ,Found total: {}".format(pattern, len(tags))) 24 | print(tags) 25 | return 26 | 27 | 28 | if __name__ == "__main__": 29 | page = read_file() # .decode('utf-8') 30 | soup = BeautifulSoup(page, 'lxml') 31 | print([tag.name for tag in soup.find_all()]) 32 | # ['html', 'head', 'title', 'style', 'body', 'h1', 'a', 'a', 'a', 'div', 'p', 'i', 'img', 'p', 'i', 'h1'] 33 | 34 | applyPattern(r'<(\w+)>') # Finding Elements without attributes 35 | # Pattern r'<(\w+)>' ,Found total: 6 36 | # ['html', 'head', 'title', 'body', 'div', 'i'] 37 | 38 | applyPattern(r'<(\w+)\s') # Finding Elements with attributes 39 | # Pattern r'<(\w+)\s' ,Found total: 10 40 | # ['style', 'h1', 'a', 'a', 'a', 'p', 'img', 'p', 'i', 'h1'] 41 | 42 | applyPattern(r'<(\w+)\s?') # Finding all HTML element 43 | # Pattern r'<(\w+)\s?' ,Found total: 16 44 | # ['html', 'head', 'title', 'style', 'body', 'h1', 'a', 'a', 'a', 'div', 'p', 'i', 'img', 'p', 'i', 'h1'] 45 | 46 | applyPattern(r'<\w+\s+(.*?)=') # Finding attributes name 47 | # Pattern r'<\w+\s+(.*?)=' ,Found total: 10 48 | # ['type', 'style', 'href', 'class', 'id', 'id', 'src', 'class', 'style', 'itemprop'] 49 | 50 | applyPattern(r'(\w+)=') # Finding names of all attributes 51 | # Pattern r'(\w+)=' ,Found total: 18 52 | # ['type', 'style', 'href', 'style', 'class', 'href', 'id', 'href', 'style', 'id', 'class', 'src', 'id', 'class', 'class', 'id', 'style', 'itemprop'] 53 | 54 | applyPattern(r'=\"(\w+)\"') 55 | # Pattern r'=\"(\w+)\"' ,Found total: 9 56 | # ['classOne', 'idOne', 'mainContent', 'content', 'pageLogo', 'logo', 'content', 'subContent', 'subheading'] 57 | 58 | applyPattern(r'=\"([\w\S]+)\"') 59 | # Pattern r'=\"([\w\S]+)\"' ,Found total: 18 60 | # ['text/css', 'color:orange;', 'https://www.google.com', 'color:red;', 'classOne', 'https://www.yahoo.com', 'idOne', 'https://www.wikipedia.org', 'color:blue;', 'mainContent', 'content', 'mylogo.png', 'pageLogo', 'logo', 'content', 'subContent', 'color:red', 'subheading'] 61 | 62 | applyPattern(r'\>(.*)\<') 63 | # Pattern r'\>(.*)\<' ,Found total: 8 64 | # ['Welcome to Web Scraping: Example', 'Welcome to Web Scraping', 'Google', 'Yahoo', 'Wikipedia', 'Paragraph contents', 'Sub paragraph content', 'Sub heading Content!'] 65 | -------------------------------------------------------------------------------- /Chapter05/Quotes/Quotes/spiders/quotes.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from Quotes.items import QuotesItem 4 | 5 | class QuotesSpider(scrapy.Spider): 6 | name = 
"quotes" 7 | allowed_domains = ["quotes.toscrape.com"] 8 | 9 | #To be used for pagination purpose. 10 | 11 | start_urls = ( 12 | 'http://quotes.toscrape.com/', 13 | ) 14 | ''' 15 | #or 16 | start_urls = ( 17 | 'http://quotes.toscrape.com/', 18 | 'http://quotes.toscrape.com/page/1/', 19 | 'http://quotes.toscrape.com/page/2/', 20 | ) 21 | or 22 | start_urls = ['http://quotes.toscrape.com/page/%s' % page for page in xrange(1, 5)] 23 | ''' 24 | 25 | '''Using XPath''' 26 | def parse(self, response): 27 | print("Response Type >>> ", type(response)) 28 | rows = response.xpath("//div[@class='quote']") 29 | 30 | print("Quotes Count >> ", rows.__len__()) 31 | for row in rows: 32 | item = QuotesItem() 33 | 34 | item['tags'] = row.xpath('div[@class="tags"]/meta[@itemprop="keywords"]/@content').extract_first() 35 | item['author'] = row.xpath('//span/small[@itemprop="author"]/text()').extract_first() 36 | item['quote'] = row.xpath('span[@itemprop="text"]/text()').extract_first() 37 | item['author_link'] = row.xpath('//a[contains(@href,"/author/")]/@href').extract_first() 38 | if len(item['author_link'])>0: 39 | item['author_link'] = 'http://quotes.toscrape.com'+item['author_link'] 40 | 41 | yield item 42 | 43 | nextPage = response.xpath("//ul[@class='pager']//li[@class='next']/a/@href").extract_first() 44 | if nextPage: 45 | print("Next Page URL: ",nextPage) 46 | #nextPage obtained from either XPath or CSS can be used. 47 | yield scrapy.Request('http://quotes.toscrape.com'+nextPage,callback=self.parse) 48 | 49 | print('Completed') 50 | 51 | 52 | 53 | 54 | '''Using CSS Selectors''' 55 | ''' 56 | def parse(self, response): 57 | print("Response Type >>> ", type(response)) 58 | rows = response.css("div.quote") 59 | 60 | for row in rows: 61 | item = QuotesItem() 62 | item['tags'] = row.css('div.tags > meta[itemprop="keywords"]::attr("content")').extract_first() 63 | item['author'] = row.css('small[itemprop="author"]::text').extract_first() 64 | item['quote'] = row.css('span[itemprop="text"]::text').extract_first() 65 | item['author_link'] = row.css('a:contains("(about)")::attr(href)').extract_first() 66 | if len(item['author_link'])>0: 67 | item['author_link'] = 'http://quotes.toscrape.com'+item['author_link'] 68 | 69 | yield item 70 | 71 | nextPage = response.css("ul.pager > li.next > a::attr(href)").extract_first() 72 | if nextPage: 73 | print("Next Page URL: ",nextPage) 74 | #nextPage obtained from either XPath or CSS can be used. 75 | yield scrapy.Request('http://quotes.toscrape.com'+nextPage,callback=self.parse) 76 | 77 | print('Completed') 78 | ''' 79 | -------------------------------------------------------------------------------- /Chapter06/toScrapeViewstate.py: -------------------------------------------------------------------------------- 1 | from pyquery import PyQuery as pq 2 | import requests 3 | 4 | mainurl = "http://toscrape.com/" 5 | searchurl = "http://quotes.toscrape.com/search.aspx" 6 | filterurl = "http://quotes.toscrape.com/filter.aspx" 7 | quoteurl = "http://quotes.toscrape.com/" 8 | authorTags = [('Albert Einstein', 'success'), ('Thomas A. 
Edison', 'inspirational')] 9 | 10 | def processRequests(url, params={}, customheaders={}): 11 | if len(params) > 0: 12 | response = requests.post(url, data=params, headers=customheaders) 13 | else: 14 | response = requests.get(url) 15 | #headers = response.headers # print(headers) 16 | #cookies = response.cookies # print(cookies) 17 | return pq(response.text) 18 | 19 | if __name__ == '__main__': 20 | for authorTag in authorTags: 21 | authorName,tagName= authorTag 22 | 23 | #Step 1: load searchURL 24 | searchResponse = processRequests(searchurl) 25 | author = searchResponse.find('select#author option:contains("' + authorName + '")').attr('value') 26 | viewstate = searchResponse.find('input#__VIEWSTATE').attr('value') 27 | tag = searchResponse.find('select#tag option').text() 28 | 29 | print("Author: ", author) 30 | print("ViewState: ", viewstate) 31 | print("Tag: ", tag) 32 | 33 | #Step 2: load filterurl with author and default tag 34 | params = {'author': author, 'tag': tag, '__VIEWSTATE': viewstate} 35 | customheaders = { 36 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 37 | 'Content-Type': 'application/x-www-form-urlencoded', 38 | 'Referer': searchurl 39 | } 40 | filterResponse = processRequests(filterurl,params,customheaders) 41 | viewstate = filterResponse.find('input#__VIEWSTATE').attr('value') 42 | tagSuccess = filterResponse.find('select#tag option:contains("' + tagName + '")').attr('value') 43 | submitButton = filterResponse.find('input[name="submit_button"]').attr('value') 44 | print("Author: ", author) 45 | print("ViewState: ", viewstate) 46 | print("Tag: ", tagSuccess) 47 | print("Submit: ", submitButton) 48 | 49 | #Step 3: load filterurl with author and defined tag 50 | params = {'author': author, 'tag': tagSuccess, 'submit_button': submitButton, '__VIEWSTATE': viewstate} 51 | # params = {'author': author, 'tag': tagSuccess, 'submit_button': submitButton}#, '__VIEWSTATE': viewstate} # test 52 | customheaders = { 53 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 54 | 'Content-Type': 'application/x-www-form-urlencoded', 55 | 'Referer': filterurl 56 | } 57 | finalResponse = processRequests(filterurl,params, customheaders) 58 | 59 | #Step 4: Extract results 60 | quote = finalResponse.find('div.quote span.content').text() 61 | quoteAuthor = finalResponse.find('div.quote span.author').text() 62 | message = finalResponse.find('div.quote span.tag').text() 63 | print("Quote: ", quote, "\nAuthor: ", quoteAuthor, "\nMessage: ", message) 64 | -------------------------------------------------------------------------------- /Chapter05/Quotes/Quotes/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for Blog project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'Quotes' 13 | 14 | SPIDER_MODULES = ['Quotes.spiders'] 15 | NEWSPIDER_MODULE = 'Quotes.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'Blog (+http://www.yourdomain.com)' 20 | #ROBOTSTXT_OBEY = False 21 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 22 | CONCURRENT_REQUESTS=16 23 | 24 | # Configure a delay for requests for the same website (default: 0) 25 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 26 | # See also autothrottle settings and docs 27 | DOWNLOAD_DELAY=3 28 | # The download delay setting will honor only one of: 29 | #CONCURRENT_REQUESTS_PER_DOMAIN=16 30 | #CONCURRENT_REQUESTS_PER_IP=16 31 | 32 | # Disable cookies (enabled by default) 33 | #COOKIES_ENABLED=False 34 | 35 | # Disable Telnet Console (enabled by default) 36 | #TELNETCONSOLE_ENABLED=False 37 | 38 | # Override the default request headers: 39 | #DEFAULT_REQUEST_HEADERS = { 40 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', 41 | # 'Accept-Language': 'en', 42 | # 'upgrade-insecure-requests': 1, 43 | # 'accept-encoding': 'gzip, deflate, br', 44 | # 'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36' 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'Blog.middlewares.MyCustomSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'Blog.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'Quotes.pipelines.QuotesPipeline': 300, 69 | } 70 | FEED_EXPORT_ENCODING = 'utf-8' 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay 75 | AUTOTHROTTLE_ENABLED=True 76 | # The initial download delay 77 | AUTOTHROTTLE_START_DELAY=5 78 | # The maximum download delay to be set in case of high latencies 79 | AUTOTHROTTLE_MAX_DELAY=60 80 | # Enable showing throttling stats for every response received: 81 | #AUTOTHROTTLE_DEBUG=False 82 | DOWNLOAD_HANDLERS = {'s3': None,} 83 | # Enable and configure HTTP caching (disabled by default) 84 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 85 | HTTPCACHE_ENABLED=True 86 | #HTTPCACHE_EXPIRATION_SECS=0 87 | #HTTPCACHE_DIR='httpcache' 88 | #HTTPCACHE_IGNORE_HTTP_CODES=[] 89 | #HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage' 90 | 
-------------------------------------------------------------------------------- /Chapter09/godfreysfeed.py: -------------------------------------------------------------------------------- 1 | import re 2 | import requests 3 | 4 | def read_url(url): 5 | pageSource = requests.get(url).text 6 | return pageSource 7 | 8 | 9 | if __name__ == "__main__": 10 | 11 | dataSet=list() 12 | sourceUrl = 'http://godfreysfeed.com/dealersandlocations.php' 13 | page = read_url(sourceUrl) 14 | 15 | pLatLng= r'var latLng = new google.maps.LatLng\((?P<lat>.*)\,\s*(?P<lng>.*)\)\;' 16 | latlngs = re.findall(pLatLng,page) 17 | print("Findall found total LatLngs: ", len(latlngs)) 18 | 19 | pDealers = r'infoWindowContent = infoWindowContent\+\s*\"(.*?)\"\;' 20 | dealers = re.findall(pDealers, page) 21 | print("Findall found total Address: ", len(dealers)) 22 | 23 | d=0 24 | for dealer in dealers: 25 | dealerInfo = re.split(r'<br>',re.sub(r'<br><br>
','',dealer)) 26 | name = re.findall(r'\'>(.*?)(.*)<',dealerInfo[1])[0] 28 | city = re.findall(r'>(.*),\s*(.*)<',dealerInfo[2])[0][0] 29 | state = re.findall(r'>(.*),\s*(.*)<',dealerInfo[2])[0][1] 30 | zip = re.findall(r'>(.*)<',dealerInfo[3])[0] 31 | lat = latlngs[d][0] 32 | lng = latlngs[d][1] 33 | d+=1 34 | dataSet.append([name,address,city,state,zip,lat,lng]) 35 | 36 | print(dataSet) #[[name,address, city, state, zip, lat,lng],] 37 | 38 | 39 | #Findall found total LatLngs: 55 40 | #Findall found total Address: 55 41 | #[['Akins Feed & Seed', '206 N Hill Street', 'Griffin', 'GA', '30223', '33.2509855', '-84.2633946'], ['Alf's Farm and Garden', '101 East 1st Street', 'Donalsonville', 'GA', '39845', '31.0426107', '-84.8821949'], ['American Cowboy Shop', '513 D Murphy Hwy', 'Blairsville', 'GA', '30512', '34.8761989', '-83.9582412'], ['Anderson's General Store', '23736 US Hwy 80 E', 'Statesboro', 'GA', '30458', '32.43158', '-81.749293'], ['Bar G Horse & Cattle Supply', '1060 Astondale Road', 'Bishop', 'GA', '30621', '33.8192864', '-83.4387722'], ['Beggs Farm Supply', '5845 Royston Hwy', 'Canon', 'GA', '30520', '34.2959968', '-83.0062267'], ['Big Creek Feed', '218 Hwy 49 N', 'Byron', 'GA', '31025', '32.6537561', '-83.7596295'], ['Blue Ribbon Show Supply', '9416 Lucy Moore Road', 'Nichols', 'GA', '31554', '31.462497', '-82.5866503'], ['Burdette Mill', '216 Depot Street', 'Washington', 'GA', '30673', '33.7340136', '-82.7472304'], ['Burke Feed', '369 Hwy 56 N', 'Waynesboro', 'GA', '30830', '33.1064245', '-81.9852452'], ['Candler Feed and Seed', '1275 Smokey Park Hwy', 'Candler', 'NC', '28715', '35.5401542', '-82.7570303'], ['Cash & Carry Feed', '135 N McGriff St.', 'Whigham', 'GA', '39897', '30.8848506', '-84.3248931'], ['Cherokee Feed and Seed', '869 Grove St', 'Gainesville', 'GA', '30501', '34.289323', '-83.8219858'], ['Cherokee Feed and Seed', '2370 Hightower Rd', 'Ball Ground', 'GA', '30107', '34.3372664', '-84.3779515'], ['Claxton Family Cattle', '240 Old Douglas Road', 'Hazelhurst', 'GA', '31539', '31.836371', '-82.6232915'], ['D&D Irringation', '51 S Rentz St', 'Lenox', 'GA', '31637', '31.2713852', '-83.4629421'], ['Double D Stables and Tack', '4111 Logan Rd', 'Rocky Face', 'GA', '30740', '34.805079', '-85.0274471'], ['Eatonton Co-op', '504 S Jefferson Ave', 'Eatonton', 'GA', '31024', '33.3267997', '-83.3884961'], ['Edenfields Feed and Seed', '709 Hwy 25N', 'Millen', 'GA', '30442', '32.8088128', '-81.9491768'], ['Family Feed', '6424 COLUMBUS HWY 80', 'Box Springs', 'GA', '31801', '32.5580349', '-84.6513774'], ['Farm & Garden Inc.', '646 Clarksville Street', 'Cornelia', 'GA', '30531', '34.5114883', '-83.5271166'], ['Farmer Seed Company', '800 W Broad St', 'Doerun', 'GA', '31744', '31.3200669', '-83.9234872'], ['Farmers Feed', '204 N West St', 'Greensboro', 'GA', '30642', '33.5781281', '-83.1845358'], ['Feed South', '2623 Knight Avenue', 'Waycross', 'GA', '31503', '31.2028754', '-82.316785'], ['Forsyth Feed & Seed', '45 W Jefferson Street', 'Forsyth', 'GA', '31029', '33.035097', '-83.940067'], ['Georgia Deer Farm', '850 Hwy 27 N', 'Roopville', 'GA', '30170', '33.476202', '-85.1082285'], ['H&M Trailers and Feed', '6446 JFH Pkwy', 'Adairsville', 'GA', '30103', '34.3924623', '-84.9333769'], ['Hill Farm Supply', '12700 Augusta Hwy', 'Sparta', 'GA', '31087', '33.2791285', '-82.9646478'], ['Ijon Webb', '1130 Stillwell Rd', 'Springfield', 'GA', '31329', '32.369773', '-81.266672'], ['Jesup Milling', '601 SW Broad Street', 'Jesup', 'GA', '31545', '31.5990992', '-81.8905051'], ['Jump N Run Farm', 
'1569 Liberty Church Grove Rd', 'Wrightsville', 'GA', '31096', '32.6481899', '-82.6139868'], ['L & C Farm and Garden', '1143 East Fairplay Road', 'Fairplay', 'SC', '29643', '34.5101355', '-82.9602795'], ['Maddox Feed', '1915 Winder Hwy', 'Jefferson', 'GA', '30549', '34.1001367', '-83.5969643'], ['Miller Farm Supply', '2001 Bob Culvern Rd', 'Louisville', 'GA', '30434', '32.9859964', '-82.3913739'], ['North Fulton Feed', '12950 Hwy 9 N', 'Alpharetta', 'GA', '30004', '34.096767', '-84.2735144'], ['North Georgia Co-Op', '951 Progress Rd', 'Ellijay', 'GA', '30540', '34.6739981', '-84.4902665'], ['Oglethorpe Feed and Farm Supply', '900 Athens Road', 'Crawford', 'GA', '30648', '33.8898662', '-83.1358665'], ['Owens Farm Supply', '6414 Mize Road', 'Toccoa', 'GA', '30577', '34.4855944', '-83.3394454'], ['Patricks', '10285 Covington Bypass', 'Covington', 'GA', '30014', '33.5770654', '-83.8354943'], ['Perry Feed and Tack', '309 Kellwood Drive', 'Perry', 'GA', '31069', '32.4443895', '-83.7439432'], ['Pine Ridge Outdoor Supply', '4999 HWY 114', 'Lyerly', 'GA', '30730', '34.4166444', '-85.3925577'], ['Reeves Hardware', '95 BO James St', 'Clayton', 'GA', '30525', '34.8686254', '-83.4026817'], ['Roberts Milling Company', '116 West Albany Ave', 'Pearson', 'GA', '31642', '31.2987063', '-82.8577173'], ['Roche Farm and Garden', '803 E Jackson St', 'Dublin', 'GA', '31040', '32.5444125', '-82.8945945'], ['Roche Farm and Garden', '781 East Court Street', 'Wrightsville', 'GA', '31040', '32.7302168', '-82.7117232'], ['Rodgers Fertilizer', '409 N Main St', 'Saluda', 'SC', '29138', '34.0082425', '-81.7729772'], ['Rogers Feed', '1041 Easley Hwy', 'Pelzer', 'SC', '29669', '34.6639864', '-82.5126743'], ['Ronnie Spivey', '654 Mary Richardson Road', 'Wray', 'GA', '31796', '31.525261', '-83.06603'], ['Shirley Feed & Seed Inc', '2439 North Elm Street', 'Commerce', 'GA', '30529', '34.2068698', '-83.4689814'], ['Southern Home and Farm LLC', '3127 Hamilton Road', 'Lagrange', 'GA', '30241', '32.9765932', '-84.98978'], ['Southland Power Fence', '752 E 5th Ave', 'Colbert', 'GA', '30628', '34.0412765', '-83.2001394'], ['Town & Country General Store', '59 Hwy 212 West', 'Monticello', 'GA', '31064', '33.3066615', '-83.6976187'], ['Twisted Fitterz', '10329 Nashville Enigma Rd', 'Alapaha', 'GA', '31622', '31.3441482', '-83.3002373'], ['Westside Feed II', '230 SE 7th Avenue', 'Lake Butler', 'FL', '32054', '30.02116', '-82.329495'], ['White Co. Farmers Exchange', '951 S Main St', 'Cleveland', 'GA', '30528', '34.58403', '-83.760829']] 42 | -------------------------------------------------------------------------------- /Chapter09/regex.py: -------------------------------------------------------------------------------- 1 | """ 2 | Python Regular Expressions: re 3 | https://regexone.com/references/python 4 | http://www.regular-expressions.info/python.html 5 | https://developers.google.com/edu/python/regular-expressions 6 | # Anchors: ^ begining of Line, $ end of line 7 | # re.search(pattern,str,re.I|re.MULTILINE|re.M) 8 | """ 9 | import re 10 | 11 | sentence = """The course assumes a working knowledge of key data science topics 12 | (statistics, machine learning, and general data analytic methods). 13 | Programming experience in some language (such as R, MATLAB, SAS, Mathematica, Java, C, C++, VB, or FORTRAN) 14 | is expected. In particular, participants need to be comfortable with general programming concepts like 15 | variables, loops, and functions. 
Experience with Python is helpful (but not required).""" 16 | #source: https://www.enthought.com/training/course/python-for-data-science/#/syllabus 17 | splitSentence=sentence.split() 18 | 19 | print("Length of Sentence: ",len(sentence), '& splitSentence: ',len(splitSentence)) 20 | print(splitSentence) 21 | 22 | #Findall 23 | matches = re.findall(r"([A-Z+]+)\,",sentence) 24 | print("Findall found total ",len(matches)," Matches >> ",matches) 25 | #Findall found total 6 Matches >> ['R', 'MATLAB', 'SAS', 'C', 'C++', 'VB'] 26 | 27 | matches = re.findall(r"([A-Z]+)\,",sentence) 28 | print("Findall found total ",len(matches)," Matches >> ",matches) 29 | #Findall found total 5 Matches >> ['R', 'MATLAB', 'SAS', 'C', 'VB'] 30 | 31 | matches = re.findall(r"\s*([\sorA-Z+]+)\)",sentence) #r'\s*([A-Z]+)\)' matches 'FORTRAN' 32 | print("Findall found total ",len(matches)," Matches >> ",matches) 33 | #Findall found total 1 Matches >> ['or FORTRAN'] 34 | 35 | 36 | #re.match 37 | fortran = matches[0] # 'or FORTRAN' 38 | if re.match(r'or',fortran): 39 | fortran = re.sub(r'or\s*','',fortran) 40 | print(fortran) 41 | #FORTRAN 42 | 43 | #re.search 44 | if re.search(r'^F.*N$',fortran): 45 | print("True") 46 | #True 47 | 48 | matches = re.findall(r'\s(MAT.*?)\,',sentence,flags=re.IGNORECASE) 49 | print("(MAT.*?)\,: ",matches) #r'(?i)\s(MAT.*?)\,' can also be used 50 | #(MAT.*?)\,: ['MATLAB', 'Mathematica'] 51 | 52 | matches = re.findall(r'\s(MAT.*?)\,',sentence) 53 | print("(MAT.*?)\,: ",matches) 54 | #(MAT.*?)\,: ['MATLAB'] 55 | 56 | matches = re.findall(r'\s(C.*?)\,',sentence) 57 | print("\s(C.*?)\,: ",matches) 58 | #\s(C.*?)\,: ['C', 'C++'] 59 | 60 | 61 | #re.split 62 | matchesOne = re.split(r"\W+",sentence) #\w (word characters, \W - nonword) 63 | print("Regular Split '\W+' found total: ",len(matchesOne ),"\n",matchesOne) 64 | #Regular Split '\W+' found total: 63 65 | #['The', 'course', 'assumes', 'a', 'working', 'knowledge', 'of', 'key', 'data', 'science', 'topics', 'statistics', ......, 'such', 'as', 'R', 'MATLAB', 'SAS', 'Mathematica', 'Java', 'C', 'C', 'VB', 'or', 'FORTRAN', 'is', 'expected', .........., 'and', 'functions', 'Experience', 'with', 'Python', 'is', 'helpful', 'but', 'not', 'required', ''] 66 | 67 | matchesTwo = re.split(r"\s",sentence) 68 | print("Regular Split '\s' found total: ",len(matchesTwo),"\n", matchesTwo) 69 | #Regular Split '\s' found total: 63 : 70 | #['The', 'course', 'assumes', 'a', 'working', 'knowledge', 'of', 'key', 'data', 'science', 'topics', '(statistics,', ........., '(such', 'as', 'R,', 'MATLAB,', 'SAS,', 'Mathematica,', 'Java,', 'C,', 'C++,', 'VB,', 'or', 'FORTRAN)', 'is', ......., 'and', 'functions.', 'Experience', 'with', 'Python', 'is', 'helpful', '(but', 'not', 'required).'] 71 | 72 | 73 | timeDate= ''' 74 | 75 | 76 | 77 | 78 | ''' 79 | 80 | pattern = r'(20\d+)([-]+)(0[1-9]|1[012])([-]+)(0[1-9]|[12][0-9]|3[01])' 81 | recompiled = re.compile(pattern) # 82 | dateMatches = recompiled.search(timeDate) 83 | 84 | 85 | print("Group : ",dateMatches.group()) 86 | #Group : 2019-02-11 87 | 88 | print("Groups : ",dateMatches.groups()) 89 | #Groups : ('2019', '-', '02', '-', '11') 90 | 91 | print("Group 1 : ",dateMatches.group(1)) 92 | #Group 1 : 2019 93 | 94 | print("Group 5 : ",dateMatches.group(5)) 95 | #Group 5 : 11 96 | 97 | 98 | for match in re.finditer(pattern, timeDate): # 99 | #for match in re.finditer(recompiled, timeDate): 100 | s = match.start() 101 | e = match.end() 102 | l = match.lastindex 103 | g = match.groups() 104 | print('Found {} at {}:{}, groups{} 
lastindex:{}'.format(timeDate[s:e], s, e,g,l)) 105 | 106 | 107 | # Found 2019-02-11 at 16:26, groups('2019', '-', '02', '-', '11') lastindex:5 108 | # Found 2018-02-11 at 67:77, groups('2018', '-', '02', '-', '11') lastindex:5 109 | # Found 2019-02-06 at 118:128, groups('2019', '-', '02', '-', '06') lastindex:5 110 | # Found 2019-02-05 at 176:186, groups('2019', '-', '02', '-', '05') lastindex:5 111 | # Found 2019-02-04 at 234:244, groups('2019', '-', '02', '-', '04') lastindex:5 112 | 113 | 114 | pDate = r'(?P[0-9]{4})(?P[-])(?P0[1-9]|1[012])-(?P0[1-9]|[12][0-9]|3[01])' 115 | recompiled = re.compile(pDate) 116 | for match in re.finditer(recompiled,timeDate): 117 | s = match.start() 118 | e = match.end() 119 | l = match.lastindex 120 | print("Group ALL or 0: ",match.groups(0)) #or match.groups() 121 | print("Group Year: ",match.group('year')) 122 | print("Group Delimiter: ",match.group('sep')) 123 | print('Found {} at {}:{}, lastindex: {}'.format(timeDate[s:e], s, e,l)) 124 | print('year :',match.groupdict()['year']) 125 | print('day :',match.groupdict()['day']) 126 | print('lastgroup :',match.lastgroup) 127 | 128 | 129 | # Group ALL or 0: ('2019', '-', '02', '11') 130 | # Group Year: 2019 131 | # Group Month: 02 132 | # Group Day: 11 133 | # Group Delimiter: - 134 | # Found 2019-02-11 at 16:26, lastindex: 4 135 | # year : 2019 136 | # day : 11 137 | # lastgroup : day 138 | 139 | 140 | pTime = r'(?P[0-9]{2})(?P[:])(?P[0-9]{2}):(?P[0-9.:+]+)' 141 | recompiled = re.compile(pTime) 142 | for match in re.finditer(recompiled,timeDate): 143 | print("Group String: ",match.group()) 144 | print("Group ALL or 0: ",match.groups()) 145 | print("Group Span: ",match.span()) 146 | print("Group Span 1: ",match.span(1)) 147 | print("Group Span 4: ",match.span(4)) 148 | print('hour :',match.groupdict()['hour']) 149 | print('minute :',match.groupdict()['min']) 150 | print('second :',match.groupdict()['sec_mil']) 151 | print('lastgroup :',match.lastgroup) 152 | 153 | 154 | # Group String: 12:53:00+00:00 155 | # Group ALL or 0: ('12', ':', '53', '00+00:00') 156 | # Group Span: (245, 259) 157 | # Group Span 1: (245, 247) 158 | # Group Span 4: (251, 259) 159 | # hour : 12 160 | # minute : 53 161 | # second : 00+00:00 162 | # lastgroup : sec_mil 163 | -------------------------------------------------------------------------------- /Chapter05/bs4_exploring.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup,SoupStrainer 2 | import re 3 | html_doc = """ 4 | The Dormouse's story 5 | 6 |

<p class="title"><b>The Dormouse's story</b></p> 7 | <p class="story">Once upon a time there were three little sisters; and their names were 8 | <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, 9 | <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and 10 | <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; 11 | and they lived at the bottom of a well.</p> 12 | <p class="story">...</p> 13 | <h1>Secret agents</h1> 14 | <ul> 15 | <li data-id="10784">Jason Walters, 003: Found dead in "A View to a Kill".</li> 16 | <li data-id="97865">Alex Trevelyan, 006: Agent turned terrorist leader; James' nemesis in "Goldeneye".</li> 17 | <li data-id="45732">James Bond, 007: The main man; shaken but not stirred.</li> 18 | </ul>
19 | 20 | 21 | """ 22 | tagsA = SoupStrainer("a") 23 | soupA = BeautifulSoup(html_doc,'lxml',parse_only=tagsA) 24 | soup = BeautifulSoup(html_doc,'lxml') 25 | 26 | print(type(soupA)) 27 | print(soupA) 28 | 29 | print(soupA.prettify()) 30 | 31 | print(soupA.a.has_attr('class')) 32 | 33 | print(soupA.a.has_attr('name')) 34 | 35 | print(soupA.find("a")) #print(soupA.find(name="a")) 36 | 37 | print(soupA.find("a",attrs={'class':'sister'})) 38 | 39 | print(soupA.find("a",attrs={'class':'sister'},text="Lacie")) 40 | 41 | print(soupA.find("a",attrs={'id':'link3'})) 42 | 43 | print(soupA.find('a',id="link2")) 44 | 45 | print(soupA.find_all("a")) 46 | 47 | #find all , but return only 2 of them 48 | print(soupA.find_all("a",limit=2)) #attrs, text 49 | 50 | print(soupA.find("a",text=re.compile(r'cie'))) #import re 51 | 52 | print(soupA.find_all("a",attrs={'id':re.compile(r'3')})) 53 | 54 | print(soupA.find_all(re.compile(r'a'))) 55 | 56 | #soup 57 | soup = BeautifulSoup(html_doc,'lxml') 58 | 59 | print(soup.find_all("p","story")) #class=story 60 | 61 | print(soup.find_all("p","title")) #soup.find_all("p",attrs={'class':"title"}) 62 | 63 | print(soup.find_all("p",attrs={'class':["title","story"]})) 64 | 65 | print(soup.find_all(["p","li"])) 66 | 67 | print(soup.find_all(string="Elsie")) #text="Elsie" 68 | 69 | print(soup.find_all(text=re.compile(r'Elsie'))) #import re 70 | 71 | print(soup.find_all("a",string="Lacie")) #text="Lacie" 72 | 73 | for li in soup.ul.find_all('li'): 74 | print(li.name, ' > ',li.get('data-id'),' > ', li.text) 75 | 76 | print(soupA.a) #tag a 77 | 78 | print(soup.li) #tag li 79 | 80 | print(soup.p) 81 | 82 | print(soup.p.b) #tag p and b 83 | 84 | print(soup.ul.find('li',attrs={'data-id':'45732'})) 85 | 86 | print(soup.ul.find('li',attrs={'data-id':'45732'}).text) 87 | 88 | print(soup.p.text) #get_text() 89 | 90 | print(soup.li.text) 91 | 92 | print(soup.p.string) 93 | 94 | print(list(soup.find('p','story').children)) 95 | 96 | print(list(soup.find('p','story').contents)) 97 | 98 | print(list(soup.find('p','story').descendants)) 99 | 100 | #using List Comprehension Technique 101 | print([a.name for a in soup.find('p','story').children]) 102 | 103 | print([{'tag':a.name,'text':a.text,'class':a.get('class')} for a in soup.find('p','story').children if a.name!=None]) 104 | 105 | print([a.name for a in soup.find('p','story').descendants]) 106 | 107 | print(list(filter(None,[a.name for a in soup.find('p','story').descendants]))) 108 | 109 | print(soup.find('p','story').findChildren()) 110 | 111 | print(soup.find('p','story').findChild()) #soup.find('p','story').find() 112 | 113 | #print parent element of with class=sister 114 | print(soup.find('a','sister').parent) 115 | 116 | #print parent element name of with class=sister 117 | print(soup.find('a','sister').parent.name) 118 | 119 | #print text from parent element of with class=sister 120 | print(soup.find('a','sister').parent.text) 121 | 122 | for element in soup.find('a','sister').parents: 123 | print(element.name) 124 | 125 | #find single Parent for selected with class=sister 126 | print(soup.find('a','sister').findParent()) 127 | 128 | #find Parents for selected with class=sister 129 | print(soup.find('a','sister').findParents()) 130 | 131 | print(soup.find('p','story').next) 132 | 133 | print(soup.find('p','story').next.next) 134 | 135 | print(soup.find('p','story').next_element) 136 | 137 | print(soup.find('p','story').next_element.next_element) 138 | 139 | print(soup.find('p','story').next_element.next_element.next_element) 
140 | 141 | print(soup.find('p','story').previous) #returns empty or new-line. 142 | print(soup.find('p','title').next.next.next) #returns empty or newline similar to code above 143 | 144 | print(soup.find('p','story').previous.previous) 145 | 146 | print(soup.find('p','story').previous_element) #returns empty or new-line. 147 | print(soup.find('p','story').previous_element.previous_element) 148 | 149 | 150 | print(soup.find('p','story').previous_element.previous_element.previous_element) 151 | 152 | print(soup.find('p','title').next.next.previous.previous) 153 | 154 | for element in soup.find('ul').next_elements: 155 | print(element) 156 | 157 | print(soup.find('p','story').next) 158 | 159 | print(soup.find('p','story').next_element) 160 | 161 | print(soup.find('p','story').find_next()) #element after next_element 162 | 163 | print(soup.find('p','story').find_next('h1')) 164 | 165 | print(soup.find('p','story').find_all_next()) 166 | 167 | print(soup.find('p','story').find_all_next('li',limit=2)) 168 | 169 | print(soup.find('ul').previous.previous.previous) 170 | 171 | print(soup.find('ul').find_previous()) 172 | 173 | print(soup.find('ul').find_previous('p','title')) 174 | 175 | print(soup.find('ul').find_all_previous('p')) 176 | 177 | print(soup.find('p','title').next_sibling) #returns empty or new-line 178 | 179 | print(soup.find('p','title').next_sibling.next_sibling) #print(soup.find('p','title').next_sibling.next) 180 | 181 | print(soup.find('ul').previous_sibling) #returns empty or new-line 182 | 183 | print(soup.find('ul').previous_sibling.previous_sibling) 184 | 185 | #using List Comprehension 186 | title = [ele.name for ele in soup.find('p','title').next_siblings] 187 | print(list(filter(None,title))) 188 | 189 | ul = [ele.name for ele in soup.find('ul').previous_siblings] 190 | print(list(filter(None,ul))) 191 | 192 | #find next

<p> siblings for selected <p> with class=title 193 | print(soup.find('p','title').find_next_siblings('p')) 194 | 195 | #find single or next sibling for selected <h1> 196 | print(soup.find('h1').find_next_sibling()) 197 | 198 | #find single or next sibling <li> for selected <h1> 199 | print(soup.find('h1').find_next_sibling('li')) 200 | 201 | #find first previous sibling to