├── Chapter05 ├── Quotes │ ├── Quotes │ │ ├── __init__.py │ │ ├── scrapinghub.yml │ │ ├── items.pyc │ │ ├── __init__.pyc │ │ ├── settings.pyc │ │ ├── spiders │ │ │ ├── quotes.pyc │ │ │ ├── __init__.pyc │ │ │ ├── __init__.py │ │ │ └── quotes.py │ │ ├── pipelines.py │ │ ├── items.py │ │ └── settings.py │ └── scrapy.cfg ├── scrapinghub_blogs.py ├── scrapinghub.yml ├── scrapy.cfg ├── toscrape_quotes.py ├── bs4_exploring.py └── quotes.csv ├── Chapter09 ├── regex2.py ├── regex_worldpopulation.py ├── regex1.py ├── regexHTML.html ├── regex_xml.py ├── regexHTML.py ├── godfreysfeed.py ├── regex.py └── sitemap.xml ├── Chapter07 ├── usgsEarthquake.py ├── githubAPI.py ├── twitter200.py ├── githubevent.py ├── sunrisesunset.py └── universities.py ├── Chapter04 ├── example3_company_address.py ├── example1_ibm_announcements.py ├── example3_AHL.py ├── example2_quotes_authors.py └── test.html ├── Chapter02 ├── urlerror.py ├── urllib_http_headers.py ├── wikipedia_content.py ├── githubevents.py ├── wikipedia_content_urllib.py ├── httpbin_postrequest.py ├── requeststest.py ├── urllib_test.py └── urllibrobotserror.py ├── Chapter03 ├── lxmlParse.py ├── etreeFromString.py ├── lxmlXML.py ├── lxmlXMLFile.py ├── scrapelxml.py ├── scrapelxmlcss.py ├── food.xml └── scrapeXPathLoop.py ├── README.md ├── Chapter10 ├── bookdetails.csv ├── listToCSV.py ├── analysis.py └── bookdetails.json ├── LICENSE ├── Chapter08 ├── seleniumBrowser.py ├── seleniumLocator.py ├── seleniumBooks.py └── seleniumProducts.py └── Chapter06 ├── testingGroundCookie.py ├── toScrapeSessionCookie.py └── toScrapeViewstate.py /Chapter05/Quotes/Quotes/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Chapter09/regex2.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Removed 3 | ''' 4 | -------------------------------------------------------------------------------- /Chapter05/scrapinghub_blogs.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Removed 3 | ''' -------------------------------------------------------------------------------- /Chapter07/usgsEarthquake.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Removed 3 | ''' 4 | -------------------------------------------------------------------------------- /Chapter09/regex_worldpopulation.py: -------------------------------------------------------------------------------- 1 | ''' 2 | removed 3 | ''' 4 | -------------------------------------------------------------------------------- /Chapter04/example3_company_address.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Removed 3 | ''' 4 | -------------------------------------------------------------------------------- /Chapter09/regex1.py: -------------------------------------------------------------------------------- 1 | import re 2 | import requests 3 | ''' 4 | Content Removed 5 | ''' -------------------------------------------------------------------------------- /Chapter05/scrapinghub.yml: -------------------------------------------------------------------------------- 1 | projects: 2 | default: 385731 3 | stacks: 4 | default: scrapy:1.3-py3 -------------------------------------------------------------------------------- /Chapter05/Quotes/Quotes/scrapinghub.yml: 
-------------------------------------------------------------------------------- 1 | projects: 2 | default: 385731 3 | stacks: 4 | default: scrapy:1.3-py3 -------------------------------------------------------------------------------- /Chapter05/Quotes/Quotes/items.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Supraav/Hands-On-Web-Scraping-with-Python/HEAD/Chapter05/Quotes/Quotes/items.pyc -------------------------------------------------------------------------------- /Chapter05/Quotes/Quotes/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Supraav/Hands-On-Web-Scraping-with-Python/HEAD/Chapter05/Quotes/Quotes/__init__.pyc -------------------------------------------------------------------------------- /Chapter05/Quotes/Quotes/settings.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Supraav/Hands-On-Web-Scraping-with-Python/HEAD/Chapter05/Quotes/Quotes/settings.pyc -------------------------------------------------------------------------------- /Chapter05/Quotes/Quotes/spiders/quotes.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Supraav/Hands-On-Web-Scraping-with-Python/HEAD/Chapter05/Quotes/Quotes/spiders/quotes.pyc -------------------------------------------------------------------------------- /Chapter05/Quotes/Quotes/spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Supraav/Hands-On-Web-Scraping-with-Python/HEAD/Chapter05/Quotes/Quotes/spiders/__init__.pyc -------------------------------------------------------------------------------- /Chapter05/Quotes/Quotes/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Chapter02/urlerror.py: -------------------------------------------------------------------------------- 1 | import urllib.request as request 2 | import urllib.error as error 3 | try: 4 | request.urlopen("https://www.python.ogr") 5 | except error.URLError as e: 6 | print("Error Occurred: ",e.reason) 7 | -------------------------------------------------------------------------------- /Chapter02/urllib_http_headers.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | 3 | url='https://www.samsclub.com/sitemap.xml' 4 | someRequest = urllib.request.urlopen(url)#loads provided URL 5 | someRequest.getheaders() #Lists all HTTP headers. 
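# Note: getheaders() above returns the response headers as a list of (name, value) tuples, and
# getheader('Content-Type') below returns only that header's value (or None if it is absent).
# A minimal sketch for printing them, assuming the same 'someRequest' response object:
# for name, value in someRequest.getheaders():
#     print(name, ':', value)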
6 | someRequest.getheader("Content-Type") #return value of header 'Content-Type' 7 | -------------------------------------------------------------------------------- /Chapter05/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = Blog.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = Blog 12 | -------------------------------------------------------------------------------- /Chapter05/Quotes/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = Quotes.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = Quotes 12 | -------------------------------------------------------------------------------- /Chapter05/Quotes/Quotes/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class QuotesPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /Chapter03/lxmlParse.py: -------------------------------------------------------------------------------- 1 | from lxml import etree 2 | tree = etree.parse("food.xml") 3 | 4 | #iter through selected name found in Tree 5 | for element in tree.iter('name'): 6 | print(element.text) 7 | 8 | #iter through selected elements found in Tree 9 | for element in tree.iter('name','rating','feedback'): 10 | print("{} - {}".format(element.tag, element.text)) 11 | 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## [Get this title for $10 on Packt's Spring Sale](https://www.packt.com/B11487?utm_source=github&utm_medium=packt-github-repo&utm_campaign=spring_10_dollar_2022) 2 | ----- 3 | For a limited period, all eBooks and Videos are only $10. 
All the practical content you need \- by developers, for developers 4 | 5 | # Hands-On-Web-Scraping-with-Python 6 | Hands-On Web Scraping with Python, published by Packt 7 | -------------------------------------------------------------------------------- /Chapter02/wikipedia_content.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import os 3 | link = "https://en.wikipedia.org/wiki/List_of_most_popular_websites" 4 | response = requests.get(link) 5 | print(type(response)) 6 | content = response.content 7 | #print(content) 8 | #Create a html file with the content received as 'content' 9 | file = open(os.getcwd()+os.sep+"tests"+os.sep+"wikicontent.html","wb") 10 | file.write(content) 11 | file.close() 12 | #print(content) 13 | -------------------------------------------------------------------------------- /Chapter02/githubevents.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import os 3 | import json 4 | link = "https://feeds.citibikenyc.com/stations/stations.json" 5 | # link = "https://api.github.com/events" 6 | response = requests.get(link).json() 7 | print(response['stationBeanList'][0]) 8 | # jsonData = json.dumps(response) 9 | # print(type(jsonData)) 10 | # print(response[0]) 11 | 12 | # file = open(os.getcwd()+os.sep+"tests"+os.sep+"github_event.json","w") 13 | # file.write(jsonData) 14 | # file.close() -------------------------------------------------------------------------------- /Chapter02/wikipedia_content_urllib.py: -------------------------------------------------------------------------------- 1 | import urllib.request as req 2 | import os 3 | link = "https://en.wikipedia.org/wiki/List_of_most_popular_websites" 4 | response = req.urlopen(link) 5 | print(type(response)) 6 | #print(response.read()) 7 | 8 | content = response.read() 9 | print(content) 10 | #Create a html file with the content received as 'content' 11 | file = open(os.getcwd()+os.sep+"tests"+os.sep+"wikipopular.html","wb") 12 | file.write(content) 13 | file.close() 14 | #print(content) 15 | -------------------------------------------------------------------------------- /Chapter03/etreeFromString.py: -------------------------------------------------------------------------------- 1 | from lxml import html 2 | import requests 3 | response = requests.get('http://httpbin.org/forms/post') 4 | print(type(response.text)) 5 | # build the DOM Tree 6 | tree = html.fromstring(response.text) 7 | print(type(tree)) 8 | for element in tree.iter('input'): 9 | print("Element: %s \n\tvalues(): %s \n\tattrib: %s \n\titems(): %s \n\tkeys(): %s"% 10 | (element.tag, element.values(),element.attrib,element.items(),element.keys())) 11 | print("\n") 12 | 13 | -------------------------------------------------------------------------------- /Chapter05/Quotes/Quotes/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | class QuotesItem(scrapy.Item): 11 | # define the fields for your item here like: 12 | # name = scrapy.Field() 13 | 14 | tags = scrapy.Field() 15 | author = scrapy.Field() 16 | quote = scrapy.Field() 17 | author_link = scrapy.Field() 18 | 19 | pass 20 | -------------------------------------------------------------------------------- /Chapter03/lxmlXML.py: 
-------------------------------------------------------------------------------- 1 | import requests 2 | from lxml import etree 3 | url="https://www.w3schools.com/xml/simple.xml" 4 | response = requests.get(url).content 5 | tree = etree.XML(response) 6 | print(tree) 7 | print(type(tree)) 8 | #iter through all elements found in Tree 9 | for element in tree.iter(): 10 | print("%s - %s" % (element.tag, element.text)) 11 | 12 | #iter through selected elements found in Tree 13 | for element in tree.iter('calories','name'): 14 | print("%s - %s" % (element.tag, element.text)) 15 | -------------------------------------------------------------------------------- /Chapter02/httpbin_postrequest.py: -------------------------------------------------------------------------------- 1 | import requests 2 | params = {'custname':'Mr. ABC','custtel':'','custemail':'abc@somedomain.com', 3 | 'size':'small','topping':['cheese','mushroom'],'delivery':'13:00','comments':'None'} 4 | headers={ 5 | 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 6 | 'Content-Type':'application/x-www-form-urlencoded', 7 | 'Referer':'http://httpbin.org/forms/post' 8 | } 9 | response = requests.post('http://httpbin.org/post',data=params,headers=headers).json() 10 | print(response) -------------------------------------------------------------------------------- /Chapter07/githubAPI.py: -------------------------------------------------------------------------------- 1 | import requests 2 | url = 'https://api.github.com' 3 | 4 | results = requests.get(url) 5 | print("Type Results", type(results)) 6 | print("Status Code: ", results.status_code) 7 | print("Headers: Content-Type: ", results.headers['Content-Type']) 8 | print("Headers: ", results.headers) 9 | 10 | etag = results.headers['ETag'] 11 | print("ETag: ",etag) 12 | results = requests.get(url, headers={'If-None-Match': etag}) 13 | print("Type Results", type(results)) 14 | print("Status Code: ", results.status_code) 15 | print("Headers: Content-Type: ", results.headers['Content-Type']) 16 | -------------------------------------------------------------------------------- /Chapter10/bookdetails.csv: -------------------------------------------------------------------------------- 1 | Title,Price,Stock,Rating 2 | Rip it Up and ...,35.02,In stock,5 3 | Our Band Could Be ...,57.25,In stock,4 4 | How Music Works,37.32,In stock,2 5 | Love Is a Mix ...,18.03,Out of stock,1 6 | Please Kill Me: The ...,31.19,In stock,4 7 | Kill 'Em and Leave: ...,45.0,In stock,5 8 | "Chronicles, Vol. 
1",52.6,Out of stock,2 9 | This Is Your Brain ...,38.4,In stock,1 10 | Orchestra of Exiles: The ...,12.36,In stock,3 11 | No One Here Gets ...,20.02,In stock,5 12 | Life,31.58,In stock,5 13 | Old Records Never Die: ...,55.66,Out of Stock,2 14 | Forever Rockers (The Rocker ...,28.8,In stock,3 15 | -------------------------------------------------------------------------------- /Chapter03/lxmlXMLFile.py: -------------------------------------------------------------------------------- 1 | from lxml import etree 2 | xml = open("food.xml","rb").read() 3 | #tree = etree.fromstring(xml) 4 | #tree = etree.parse(xml) 5 | tree = etree.XML(xml) 6 | 7 | print(tree) 8 | print(type(tree)) 9 | 10 | #iter through all elements found in Tree 11 | for element in tree.iter(): 12 | print("%s - %s" % (element.tag, element.text)) 13 | 14 | #iter through selected elements found in Tree 15 | for element in tree.iter('price','name'): 16 | print("%s - %s" % (element.tag, element.text)) 17 | 18 | #iter through description 19 | for element in tree.iter('description'): 20 | print("%s - %s" % (element.tag, element.text)) 21 | 22 | -------------------------------------------------------------------------------- /Chapter07/twitter200.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | 4 | url = 'https://api.twitter.com/1.1/search/tweets.json?q=' 5 | 6 | results = requests.get(url) 7 | print("Type Results",type(results)) 8 | print("Status Code: ", results.status_code) 9 | print("Headers: Content-Type: ", results.headers['Content-Type']) 10 | 11 | #jsonResult = results.json() 12 | jsonResult = results.content 13 | print("Type JSON Results",type(jsonResult)) 14 | print(jsonResult) 15 | 16 | jsonFinal = json.loads(jsonResult.decode()) 17 | print(jsonFinal) 18 | #print(json.loads(requests.get(url).content.decode())) 19 | 20 | if results.status_code==400: 21 | print(jsonFinal['errors'][0]['message']) 22 | else: 23 | pass 24 | -------------------------------------------------------------------------------- /Chapter09/regexHTML.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Welcome to Web Scraping: Example 4 | 7 | 8 | 9 |

<h1 style="color:orange;">Welcome to Web Scraping</h1>
10 | Links:
11 | <a href="https://www.google.com" style="color:red;">Google</a>
12 | <a class="classOne" href="https://www.yahoo.com">Yahoo</a>
13 | <a id="idOne" href="https://www.wikipedia.org" style="color:blue;">Wikipedia</a>
14 | <div>
15 | <p id="mainContent" class="content">
16 | <i>Paragraph contents</i>
17 | <img src="mylogo.png" id="pageLogo" class="logo"/>
18 | </p>
19 | <p class="content" id="subContent">
20 | <i style="color:red">Sub paragraph content</i>
21 | <h1 itemprop="subheading">Sub heading Content!</h1>
22 | </p>
23 | </div>
24 | 25 | 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Chapter03/scrapelxml.py: -------------------------------------------------------------------------------- 1 | import lxml.html 2 | 3 | musicUrl= "http://books.toscrape.com/catalogue/category/books/music_14/index.html" 4 | doc = lxml.html.parse(musicUrl) 5 | 6 | #base element 7 | articles = doc.xpath("//*[@id='default']/div/div/div/div/section/div[2]/ol/li[1]/article")[0] 8 | 9 | #individual element inside base 10 | title = articles.xpath("//h3/a/text()") 11 | price = articles.xpath("//div[2]/p[contains(@class,'price_color')]/text()") 12 | availability = articles.xpath("//div[2]/p[2][contains(@class,'availability')]/text()[normalize-space()]") 13 | imageUrl = articles.xpath("//div[1][contains(@class,'image_container')]/a/img/@src") 14 | starRating = articles.xpath("//p[contains(@class,'star-rating')]/@class") 15 | 16 | #cleaning and formatting 17 | stock = list(map(lambda stock:stock.strip(),availability)) 18 | images = list(map(lambda img:img.replace('../../../..','http://books.toscrape.com'),imageUrl)) 19 | rating = list(map(lambda rating:rating.replace('star-rating ',''),starRating)) 20 | 21 | print(title) 22 | print(price) 23 | print(stock) 24 | print(images) 25 | print(rating) 26 | 27 | #Merging all 28 | dataset = zip(title,price,stock,images,rating) 29 | print(list(dataset)) 30 | -------------------------------------------------------------------------------- /Chapter03/scrapelxmlcss.py: -------------------------------------------------------------------------------- 1 | from lxml import html 2 | import requests 3 | from lxml.cssselect import CSSSelector 4 | 5 | url = 'https://developer.ibm.com/announcements/category/data-science/?fa=date%3ADESC&fb=' 6 | url_get = requests.get(url) 7 | tree = html.document_fromstring(url_get.content) 8 | print(type(tree)) 9 | 10 | announcements=[] 11 | articles = tree.cssselect('.ibm--card > a.ibm--card__block_link') 12 | for article in articles: 13 | 14 | link = article.get('href') 15 | atype = article.cssselect('div.ibm--card__body > h5')[0].text.strip() 16 | adate = article.cssselect('div.ibm--card__body > h5 > .ibm--card__date')[0].text 17 | title 
= article.cssselect('div.ibm--card__body > h3.ibm--card__title')[0].text_content() 18 | excerpt= article.cssselect(' div.ibm--card__body > p.ibm--card__excerpt')[0].text 19 | category= article.cssselect('div.ibm--card__bottom > p.cpt-byline__categories span') 20 | #only two available on block: except '+' 21 | 22 | #announcements.append([link,atype,adate,title,excerpt,[category[0].text,category[1].text]]) 23 | announcements.append([link,atype,adate,title,excerpt,[span.text for span in category if span.text!='+']]) 24 | 25 | print(announcements) 26 | -------------------------------------------------------------------------------- /Chapter08/seleniumBrowser.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | import re 3 | chrome_path='chromedriver' 4 | driver = webdriver.Chrome(executable_path=chrome_path) 5 | print(type(driver)) 6 | 7 | driver.get('https://www.python.org') 8 | 9 | print("Title: ",driver.title) 10 | print("Current Page URL: ",driver.current_url) 11 | if re.search(r'python.org',driver.current_url): 12 | driver.save_screenshot("pythonorg.png") 13 | print("Python Screenshot Saved!") 14 | 15 | cookies = driver.get_cookies() 16 | print("Cookies obtained from python.org") 17 | print(cookies) 18 | 19 | print(driver.page_source) 20 | driver.refresh() 21 | 22 | driver.get('https://www.google.com') 23 | print("Title: ",driver.title) 24 | print("Current Page URL: ",driver.current_url) 25 | if re.search(r'google.com',driver.current_url): 26 | driver.save_screenshot("google.png") 27 | print("Google Screenshot Saved!") 28 | 29 | cookies = driver.get_cookies() 30 | print("Cookies obtained from google.com") 31 | print(cookies) 32 | 33 | print("Current Page URL: ",driver.current_url) 34 | driver.back() 35 | print("Page URL (Back): ",driver.current_url) 36 | driver.forward() 37 | print("Page URL (Forward): ",driver.current_url) 38 | 39 | driver.close() 40 | driver.quit() 41 | -------------------------------------------------------------------------------- /Chapter07/githubevent.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | from collections import Counter 4 | dataSet = [] 5 | 6 | url = 'https://api.github.com/' 7 | 8 | 9 | def readUrl(search): 10 | results = requests.get(url + search) 11 | print("Status Code: ", results.status_code) 12 | print("Headers: Content-Type: ", results.headers['Content-Type']) 13 | return results.json() 14 | 15 | 16 | if __name__ == "__main__": 17 | eventTypes=[] 18 | #IssueCommentEvent,WatchEvent,PullRequestReviewCommentEvent,CreateEvent 19 | for page in range(1, 4): 20 | events = readUrl('events?page=' + str(page)) 21 | # print(jsonResult) 22 | for event in events: 23 | id = event['id'] 24 | type = event['type'] 25 | actor = event['actor']['display_login'] 26 | repoUrl = event['repo']['url'] 27 | createdAt = event['created_at'] 28 | eventTypes.append(type) 29 | dataSet.append([id, type, createdAt, repoUrl, actor]) 30 | 31 | eventInfo = dict(Counter(eventTypes)) 32 | print("Individual Event Counts:", eventInfo) 33 | print("CreateEvent Counts:", eventInfo['CreateEvent']) 34 | print("DeleteEvent Counts:", eventInfo['DeleteEvent']) 35 | 36 | print("Total Events Found: ", len(dataSet)) 37 | print(dataSet) 38 | -------------------------------------------------------------------------------- /Chapter08/seleniumLocator.py: -------------------------------------------------------------------------------- 1 | from selenium import 
webdriver 2 | chrome_path='chromedriver' 3 | driver = webdriver.Chrome(executable_path=chrome_path) 4 | driver.get('http://automationpractice.com') 5 | print("Current Page URL: ",driver.current_url) 6 | 7 | searchBox = driver.find_element_by_id('search_query_top') 8 | print("Type :",type(searchBox)) 9 | print("Attribute Value :",searchBox.get_attribute("value")) 10 | print("Attribute Class :",searchBox.get_attribute("class")) 11 | print("Tag Name :",searchBox.tag_name) 12 | 13 | searchBox.clear() 14 | searchBox.send_keys("Dress") 15 | 16 | submitButton = driver.find_element_by_name("submit_search") 17 | submitButton.click() 18 | 19 | resultsShowing = driver.find_element_by_class_name("product-count") 20 | print("Results Showing: ",resultsShowing.text) 21 | 22 | resultsFound = driver.find_element_by_xpath('//*[@id="center_column"]//span[@class="heading-counter"]') 23 | print("Results Found: ",resultsFound.text) 24 | 25 | products = driver.find_elements_by_xpath('//*[@id="center_column"]//a[@class="product-name"]') 26 | #products = driver.find_elements_by_css_selector('ul.product_list li.ajax_block_product a.product-name') 27 | 28 | foundProducts=[] 29 | for product in products: 30 | foundProducts.append([product.text,product.get_attribute("href")]) 31 | 32 | print(foundProducts) 33 | 34 | driver.close() 35 | driver.quit() 36 | -------------------------------------------------------------------------------- /Chapter06/testingGroundCookie.py: -------------------------------------------------------------------------------- 1 | from pyquery import PyQuery as pq 2 | import requests 3 | mainUrl = "http://testing-ground.scraping.pro" 4 | loginUrl = "http://testing-ground.scraping.pro/login" 5 | postUrl="http://testing-ground.scraping.pro/login?mode=login" 6 | logoutUrl = "http://testing-ground.scraping.pro/login?mode=logout" 7 | 8 | def responseCookies(response): 9 | headers = response.headers 10 | cookies = response.cookies 11 | print("Headers: ", headers) 12 | print("Cookies: ", cookies) 13 | 14 | def processParams(params): 15 | response = requests.post(postUrl, data=params) 16 | responseB = pq(response.text) 17 | message = responseB.find('div#case_login h3').text() 18 | print("Confirm Login : ",message) 19 | 20 | if __name__ == '__main__': 21 | requests.get(logoutUrl) 22 | response = requests.get(mainUrl) 23 | responseCookies(response) 24 | 25 | response = requests.get(loginUrl) 26 | responseCookies(response) 27 | 28 | responseA = pq(response.text) 29 | username = responseA.find('input[id="usr"]').attr('name') 30 | password = responseA.find('input[id="pwd"]').attr('name') 31 | 32 | #Welcome : Success 33 | paramsCorrect = {username: 'admin', password: '12345'} #Success 34 | print(paramsCorrect) 35 | processParams(paramsCorrect) 36 | 37 | paramsIncorrect = {username: 'admin', password: '123456'} #Access Denied 38 | print(paramsIncorrect) 39 | processParams(paramsIncorrect) 40 | -------------------------------------------------------------------------------- /Chapter03/food.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Butter Milk with Vanilla 5 | $3.99 6 | Rich tangy buttermilk with vanilla essence 7 | 5.0 8 | 6 9 | 10 | 11 | Fish and Chips 12 | $4.99 13 | Crispy fried Chips and Fish served with lemon and malt vinegar 14 | 5.0 15 | 10 16 | 17 | 18 | Egg Roll 19 | $3.99 20 | Fresh egg rolls filled with ground chicken, carrot, cabbage 21 | 4.0 22 | 8 23 | 24 | 25 | Pineapple Cake 26 | $3.99 27 | Crushed Pineapple mixed with vanilla, eggs and 
lemon juice 28 | 5.0 29 | 9 30 | 31 | 32 | Eggs and Bacon 33 | $5.50 34 | Served with rice and fresh fruit 35 | 4.5 36 | 4 37 | 38 | 39 | Orange Juice 40 | $2.99 41 | Fresh Orange juice served 42 | 4.9 43 | 10 44 | 45 | 46 | -------------------------------------------------------------------------------- /Chapter07/sunrisesunset.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | 4 | # location: Kathmandu, Nepal 5 | # lat = 27.717245 , lng=85.323959 6 | url = 'https://api.sunrise-sunset.org/json?lat=27.717245&lng=85.323959&date=2019-03-04' 7 | 8 | results = requests.get(url) 9 | print("Type Results",type(results)) 10 | print("Status Code: ", results.status_code) 11 | print("Headers: Content-Type: ", results.headers['Content-Type']) 12 | print("Headers: ", results.headers) 13 | 14 | jsonResult = results.json() 15 | print("Type JSON Results",type(jsonResult)) 16 | print(jsonResult) 17 | print("SunRise & Sunset: ",jsonResult['results']['sunrise']," & ",jsonResult['results']['sunset']) 18 | 19 | 20 | # Type Results 21 | # Status Code: 200 22 | # Headers-ContentType: application/json 23 | # Headers: {'Access-Control-Allow-Origin': '*', 'Content-Type': 'application/json', 'Vary': 'Accept-Encoding', 'Server': 'nginx', 'Connection': 'keep-alive', 'Content-Encoding': 'gzip', 'Transfer-Encoding': 'chunked', 'Date': 'Mon, 04 Mar 2019 07:48:29 GMT'} 24 | # Type JSON Results 25 | # {'status': 'OK', 'results': {'civil_twilight_end': '12:44:16 PM', 'astronomical_twilight_end': '1:38:31 PM', 'civil_twilight_begin': '12:16:32 AM', 'sunrise': '12:39:54 AM', 'nautical_twilight_begin': '11:49:24 PM', 'astronomical_twilight_begin': '11:22:17 PM', 'nautical_twilight_end': '1:11:24 PM', 'sunset': '12:20:54 PM', 'solar_noon': '6:30:24 AM', 'day_length': '11:41:00'}} 26 | # SunRise & Sunset: 12:39:54 AM & 12:20:54 PM 27 | -------------------------------------------------------------------------------- /Chapter07/universities.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | dataSet = [] 4 | 5 | url = 'http://universities.hipolabs.com/search?name=' 6 | 7 | def readUrl(search): 8 | results = requests.get(url+search) 9 | print("Status Code: ", results.status_code) 10 | print("Headers: Content-Type: ", results.headers['Content-Type']) 11 | # print("Headers: ", results.headers) 12 | return results.json() 13 | 14 | if __name__=="__main__": 15 | jsonResult = readUrl('Wales') 16 | # print(jsonResult) 17 | for university in jsonResult: 18 | name = university['name'] 19 | url = university['web_pages'][0] 20 | dataSet.append([name,url]) 21 | 22 | print("Total Universities Found: ",len(dataSet)) 23 | print(dataSet) 24 | 25 | ''' 26 | Status Code: 200 27 | Headers: Content-Type: application/json 28 | Total Universities Found: 10 29 | [['University of Wales', 'http://www.wales.ac.uk/'], 30 | ['University of Wales Institute, Cardiff', 'http://www.uwic.ac.uk/'], 31 | ['University of Wales College of Medicine', 'http://www.uwcm.ac.uk/'], 32 | ['Johnson & Wales University', 'http://www.jwu.edu/'], 33 | ['University of New South Wales', 'http://www.unsw.edu.au/'], 34 | ['University of Wales, Newport', 'http://www.newport.ac.uk/'], 35 | ['University of Wales, Swansea', 'http://www.swan.ac.uk/'], 36 | ['University of Wales, Aberystwyth', 'http://www.aber.ac.uk/'], 37 | ['University of Wales, Lampeter', 'http://www.lamp.ac.uk/'], 38 | ['University of Wales, Bangor', 
'http://www.bangor.ac.uk/']] 39 | ''' 40 | -------------------------------------------------------------------------------- /Chapter10/listToCSV.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import json 3 | 4 | colNames = ['Title','Price','Stock','Rating'] 5 | dataSet = [ 6 | ['Rip it Up and ...', 35.02, 'In stock', 5], 7 | ['Our Band Could Be ...', 57.25, 'In stock', 4], 8 | ['How Music Works', 37.32, 'In stock', 2], 9 | ['Love Is a Mix ...', 18.03, 'Out of stock',1], 10 | ['Please Kill Me: The ...', 31.19, 'In stock', 4], 11 | ["Kill 'Em and Leave: ...", 45.0, 'In stock',5], 12 | ['Chronicles, Vol. 1', 52.60, 'Out of stock',2], 13 | ['This Is Your Brain ...', 38.4, 'In stock',1], 14 | ['Orchestra of Exiles: The ...', 12.36, 'In stock',3], 15 | ['No One Here Gets ...', 20.02, 'In stock',5], 16 | ['Life', 31.58, 'In stock',5], 17 | ['Old Records Never Die: ...', 55.66, 'Out of Stock',2], 18 | ['Forever Rockers (The Rocker ...', 28.80, 'In stock',3] 19 | ] 20 | 21 | print(dataSet) 22 | 23 | fileCsv = open('bookdetails.csv', 'w', newline='', encoding='utf-8') 24 | writer = csv.writer(fileCsv) 25 | writer.writerow(colNames) 26 | for data in dataSet: 27 | writer.writerow(data) 28 | fileCsv.close() 29 | 30 | 31 | finalDataSet=list() #empty Dataset 32 | for data in dataSet: 33 | print(dict(zip(colNames,data))) 34 | finalDataSet.append(dict(zip(colNames,data))) 35 | print(finalDataSet) 36 | 37 | with open('bookdetails.json', 'w') as jsonfile: 38 | json.dump(finalDataSet,jsonfile) 39 | 40 | 41 | with open('bookdetails.json', 'r+') as jsonfile: 42 | data = json.load(jsonfile) 43 | print(data) 44 | print(data[0]) 45 | print(data[0]['id']) 46 | print(data[0]['price']) 47 | print(data[0:2]) 48 | -------------------------------------------------------------------------------- /Chapter09/regex_xml.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | filename = 'sitemap.xml' 4 | 5 | # collect Blog title information from URLs except not link to any category 6 | dataSetBlog = [] 7 | dataSetBlogURL = [] # collects Blog URLs 8 | dataSetCategory = [] # collect Category title 9 | dataSetCategoryURL = [] # collect Category URLs 10 | 11 | page = open(filename, 'r').read() 12 | pattern = r"loc>(.*) 0 and not re.match('(category)', blogTitle[0]): 24 | dataSetBlog.append(blogTitle[0]) 25 | 26 | if re.match(r'.*category', url): #Category Related 27 | dataSetCategoryURL.append(url) 28 | categoryTitle = re.findall(r'category/([\w\-\s]+)', url) 29 | dataSetCategory.append(categoryTitle[0]) 30 | 31 | 32 | print("Blogs URL: ", len(dataSetBlogURL)) 33 | print(dataSetBlogURL) 34 | 35 | print("Blogs Title: ", len(dataSetBlog)) 36 | print(dataSetBlog) 37 | 38 | print("Unique Blog Count: ", len(set(dataSetBlog))) 39 | print(set(dataSetBlog)) 40 | 41 | print("Category URL Count: ", len(dataSetCategoryURL)) 42 | print(dataSetCategoryURL) 43 | 44 | print("Category Title Count: ", len(dataSetCategory)) 45 | print(dataSetCategory) 46 | 47 | print("Unique Category Count: ", len(set(dataSetCategory))) 48 | print(set(dataSetCategory)) 49 | 50 | 51 | -------------------------------------------------------------------------------- /Chapter10/analysis.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | 4 | dataSet = pd.read_csv('bookdetails.csv') 5 | 6 | print(type(dataSet)) 7 | print(dataSet) 8 | print(dataSet.describe()) 9 | 
print(dataSet.columns) 10 | print(sum(dataSet['Price'])) 11 | print(sum(dataSet['Rating'])) 12 | print(dataSet[['Price','Rating']]) 13 | print(dataSet['Price']) 14 | print(dataSet[dataSet.Stock.str.contains(r'Out')]['Price']) 15 | print(dataSet[dataSet['Rating']>=4.0][['Title','Price']]) 16 | print(dataSet[dataSet.Rating.between(3.5,4.5)]['Title']) 17 | 18 | 19 | #Chart1 20 | price_group = dataSet[['Price']] 21 | print(price_group) 22 | bar_plot = price_group.plot() 23 | bar_plot.set_xlabel("No of Books") 24 | bar_plot.set_ylabel("Price") 25 | plt.show() 26 | 27 | #Chart2 28 | price_group = dataSet[['Price']] 29 | bar_plot = price_group.plot(kind='bar') 30 | bar_plot.set_xlabel("No of Books") 31 | bar_plot.set_ylabel("Price") 32 | plt.show() 33 | 34 | #Chart3 35 | price_group = dataSet[['Price','Rating']] 36 | bar_plot = price_group.plot(kind='bar',title="Book Price and Rating") 37 | bar_plot.set_xlabel("No of Books") 38 | bar_plot.set_ylabel("Price") 39 | plt.show() 40 | 41 | #Chart4 42 | labels = dataSet[['Stock']] 43 | print(labels) 44 | price_group = dataSet[['Price','Rating']] 45 | bar_plot = price_group.plot(kind='bar',title="Book Price and Rating") 46 | bar_plot.set_xlabel("No of Books") 47 | bar_plot.set_xticklabels(labels) 48 | bar_plot.set_ylabel("Price") 49 | plt.show() 50 | 51 | #Chart5 - PieChart 52 | prices = dataSet['Price'][0:6] #Price from first 6 items 53 | labels = dataSet['Title'][0:6] #Book Titles from first 6 items 54 | legends,ax1 = plt.pie(prices, labels=labels, shadow=True, startangle=45) 55 | plt.legend(legends, prices, loc="best") 56 | plt.show() 57 | -------------------------------------------------------------------------------- /Chapter10/bookdetails.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "Price": 35.02, 4 | "Stock": "In stock", 5 | "Title": "Rip it Up and ...", 6 | "Rating": 5 7 | }, 8 | { 9 | "Price": 57.25, 10 | "Stock": "In stock", 11 | "Title": "Our Band Could Be ...", 12 | "Rating": 4 13 | }, 14 | { 15 | "Price": 37.32, 16 | "Stock": "In stock", 17 | "Title": "How Music Works", 18 | "Rating": 2 19 | }, 20 | { 21 | "Price": 18.03, 22 | "Stock": "Out of stock", 23 | "Title": "Love Is a Mix ...", 24 | "Rating": 1 25 | }, 26 | { 27 | "Price": 31.19, 28 | "Stock": "In stock", 29 | "Title": "Please Kill Me: The ...", 30 | "Rating": 4 31 | }, 32 | { 33 | "Price": 45.0, 34 | "Stock": "In stock", 35 | "Title": "Kill 'Em and Leave: ...", 36 | "Rating": 5 37 | }, 38 | { 39 | "Price": 52.6, 40 | "Stock": "Out of stock", 41 | "Title": "Chronicles, Vol. 
1", 42 | "Rating": 2 43 | }, 44 | { 45 | "Price": 38.4, 46 | "Stock": "In stock", 47 | "Title": "This Is Your Brain ...", 48 | "Rating": 1 49 | }, 50 | { 51 | "Price": 12.36, 52 | "Stock": "In stock", 53 | "Title": "Orchestra of Exiles: The ...", 54 | "Rating": 3 55 | }, 56 | { 57 | "Price": 20.02, 58 | "Stock": "In stock", 59 | "Title": "No One Here Gets ...", 60 | "Rating": 5 61 | }, 62 | { 63 | "Price": 31.58, 64 | "Stock": "In stock", 65 | "Title": "Life", 66 | "Rating": 5 67 | }, 68 | { 69 | "Price": 55.66, 70 | "Stock": "Out of Stock", 71 | "Title": "Old Records Never Die: ...", 72 | "Rating": 2 73 | }, 74 | { 75 | "Price": 28.8, 76 | "Stock": "In stock", 77 | "Title": "Forever Rockers (The Rocker ...", 78 | "Rating": 3 79 | } 80 | ] 81 | -------------------------------------------------------------------------------- /Chapter04/example1_ibm_announcements.py: -------------------------------------------------------------------------------- 1 | from pyquery import PyQuery as pq 2 | import requests 3 | 4 | sourceUrl='https://developer.ibm.com/announcements/' 5 | dataSet = list() 6 | 7 | def read_url(url): 8 | """Read given Url , Returns pyquery object for page content""" 9 | pageSource = requests.get(url).content 10 | return pq(pageSource) 11 | 12 | def get_details(page): 13 | """read 'page' url and append list of queried items to dataSet""" 14 | response = read_url(page) 15 | 16 | articles = response.find('.ibm--card > a.ibm--card__block_link') 17 | print("\nTotal articles found :", articles.__len__(), ' in Page: ', page) 18 | for article in articles.items(): 19 | link = article.attr('href') 20 | articlebody = article.find('div.ibm--card__body') 21 | adate = articlebody.find('h5 > .ibm--card__date').text() 22 | articlebody.find('h5 > .ibm--card__date').remove() 23 | atype = articlebody.find('h5').text().strip() 24 | title = articlebody.find('h3.ibm--card__title').text().encode('utf-8') 25 | excerpt = articlebody.find('p.ibm--card__excerpt').text().encode('utf-8') 26 | category = article.find('div.ibm--card__bottom > p.cpt-byline__categories span') 27 | if link: 28 | link = str(link).replace('/announcements/', sourceUrl) 29 | categories = [span.text for span in category if span.text != '+'] 30 | dataSet.append([link, atype, adate, title, excerpt,",".join(categories)]) 31 | 32 | if __name__ == '__main__': 33 | pageUrl = sourceUrl+"category/data-science/?fa=date:DESC&fb=" 34 | 35 | pageUrls = [ 36 | sourceUrl+"category/data-science/page/%(page)s?fa=date:DESC&fb=" % {'page': page} 37 | for page in range(1, 3)] 38 | 39 | for pages in pageUrls: 40 | get_details(pages) 41 | 42 | print("\nTotal articles collected: ", len(dataSet)) 43 | print(dataSet) 44 | -------------------------------------------------------------------------------- /Chapter04/example3_AHL.py: -------------------------------------------------------------------------------- 1 | from pyquery import PyQuery as pq 2 | import re 3 | 4 | sourceUrl = 'http://www.flyershistory.com/cgi-bin/ml-poffs.cgi' 5 | dataSet = list() 6 | keys = ['year','month','day','game_date','team1', 'team1_score', 'team2', 'team2_score', 'game_status'] 7 | 8 | def read_url(url): 9 | """Read given Url , Returns pyquery object for page content""" 10 | pageSource = pq(url) 11 | return pq(pageSource) 12 | 13 | 14 | if __name__ == '__main__': 15 | page = read_url(sourceUrl) 16 | 17 | tableRows = page.find("h1:contains('AHL Playoff Results') + table tr") 18 | print("\nTotal rows found :", tableRows.__len__()) 19 | 20 | for tr in tableRows.items(): 21 | team1 = 
tr.find('td').eq(1).text() 22 | if team1 != '': 23 | game_date = tr.find('td').eq(0).text() 24 | dates = re.search(r'(.*)-(.*)-(.*)',game_date) 25 | 26 | team1_score = tr.find('td').eq(2).text() 27 | team2 = tr.find('td').eq(4).text() 28 | team2_score = tr.find('td').eq(5).text() 29 | 30 | #check Game Status should be either 'W' or 'L' 31 | game_status = tr.find('td').eq(6).text() 32 | if not re.match(r'[WL]',game_status): 33 | game_status = tr.find('td').eq(7).text() 34 | 35 | #breaking down date in year,month and day 36 | year = dates.group(3) 37 | month = dates.group(2) 38 | day = dates.group(1) 39 | if len(year)==2 and int(year)>=68: 40 | year = '19'+year 41 | elif len(year)==2 and int(year) <68: 42 | year = '20'+year 43 | else: 44 | pass 45 | 46 | #appending individual data list to the dataSet 47 | dataSet.append([year,month,day,game_date,team1,team1_score,team2,team2_score,game_status]) 48 | 49 | print("\nTotal Game Status, found :", len(dataSet)) 50 | print(dataSet) 51 | -------------------------------------------------------------------------------- /Chapter08/seleniumBooks.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from selenium.common.exceptions import NoSuchElementException 3 | 4 | chrome_path = 'chromedriver' 5 | driver = webdriver.Chrome(executable_path=chrome_path) 6 | driver.get('http://books.toscrape.com/index.html') 7 | 8 | dataSet = [] 9 | # select: Food and Drink 10 | driver.find_element_by_link_text("Food and Drink").click() 11 | print("Current Page URL: ", driver.current_url) 12 | totalBooks = driver.find_element_by_xpath("//*[@id='default']//form/strong[1]") 13 | print("Found: ", totalBooks.text) 14 | 15 | page = True 16 | while page: 17 | listings = driver.find_elements_by_xpath("//*[@id='default']//ol/li[position()>0]") 18 | for listing in listings: 19 | url = listing.find_element_by_xpath(".//article[contains(@class,'product_pod')]/h3/a").get_attribute('href') 20 | title = listing.find_element_by_xpath(".//article[contains(@class,'product_pod')]/h3/a").text 21 | titleLarge = listing.find_element_by_xpath(".//article[contains(@class,'product_pod')]/h3/a").get_attribute( 22 | 'title') 23 | price = listing.find_element_by_xpath(".//article/div[2]/p[contains(@class,'price_color')]").text 24 | stock = listing.find_element_by_xpath(".//article/div[2]/p[2][contains(@class,'availability')]").text 25 | image = listing.find_element_by_xpath( 26 | ".//article/div[1][contains(@class,'image_container')]/a/img").get_attribute('src') 27 | starRating = listing.find_element_by_xpath(".//article/p[contains(@class,'star-rating')]").get_attribute( 28 | 'class') 29 | dataSet.append([titleLarge, title, price, stock, image, starRating.replace('star-rating ', ''), url]) 30 | 31 | try: 32 | #Check for Pagination with text 'next' 33 | driver.find_element_by_link_text('next').click() 34 | continue 35 | except NoSuchElementException: 36 | page = False 37 | 38 | print("Completed") 39 | 40 | print(dataSet) 41 | 42 | driver.close() 43 | driver.quit() 44 | -------------------------------------------------------------------------------- /Chapter06/toScrapeSessionCookie.py: -------------------------------------------------------------------------------- 1 | from pyquery import PyQuery as pq 2 | import requests 3 | mainUrl = "http://toscrape.com/" 4 | loginUrl = "http://quotes.toscrape.com/login" 5 | quoteUrl = "http://quotes.toscrape.com/" 6 | 7 | def getCustomHeaders(cookieHeader): 8 | return { 9 | 'Host': 
'quotes.toscrape.com', 10 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0', 11 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 12 | 'Referer': 'http://quotes.toscrape.com/login', 13 | 'Content-Type': 'application/x-www-form-urlencoded', 14 | 'Cookie': cookieHeader, 15 | } 16 | 17 | def responseCookies(response): 18 | headers = response.headers 19 | cookies = response.cookies 20 | print("Headers: ", headers) 21 | print("Cookies: ", cookies) 22 | return headers['Set-Cookie'] 23 | 24 | if __name__ == '__main__': 25 | requests.get(mainUrl) 26 | response = requests.get(loginUrl) 27 | setCookie = responseCookies(response) 28 | print("Set-Cookie: ",setCookie) 29 | 30 | responseA = pq(response.text) 31 | csrf_token = responseA.find('input[name="csrf_token"]').attr('value') 32 | username = responseA.find('input[id="username"]').attr('name') 33 | password = responseA.find('input[id="password"]').attr('name') 34 | params = {username: 'test', password: 'test','csrf_token': csrf_token} 35 | print(params) 36 | 37 | customheaders = getCustomHeaders(setCookie) 38 | response = requests.post(loginUrl, data=params, headers=customheaders) 39 | # response = requests.post(loginUrl, data=params, headers={}) 40 | setCookie = responseCookies(response) 41 | #print("Set-Cookie: ",setCookie) 42 | 43 | responseB = pq(response.text) 44 | logoutText = responseB.find('a[href*="logout"]').text() 45 | logoutLink = responseB.find('a[href*="logout"]').attr('href') 46 | print("Current Page : ",response.url) 47 | print("Confirm Login : ", responseB.find('.row h2').text()) 48 | print("Logout Info : ", logoutText," & ",logoutLink) 49 | -------------------------------------------------------------------------------- /Chapter04/example2_quotes_authors.py: -------------------------------------------------------------------------------- 1 | from pyquery import PyQuery as pq 2 | 3 | sourceUrl = 'http://quotes.toscrape.com/tag/books/' 4 | dataSet = list() 5 | keys = ['quote_tags','author_url','author_name','born_date','born_location','quote_title'] 6 | 7 | def read_url(url): 8 | """Read given Url , Returns pyquery object for page content""" 9 | pageSource = pq(url) 10 | return pq(pageSource) 11 | 12 | 13 | def get_details(page): 14 | """read 'page' url and append list of queried items to dataSet""" 15 | nextPage = True 16 | pageNo = 1 17 | while (nextPage): 18 | response = read_url(page + 'page/' + str(pageNo)) 19 | if response.find("ul.pager:has('li.next')"): 20 | nextPage = True 21 | else: 22 | nextPage = False 23 | 24 | quotes = response.find('.quote') 25 | print("\nTotal Quotes found :", quotes.__len__(), ' in Page: ', pageNo) 26 | for quote in quotes.items(): 27 | title = quote.find('[itemprop="text"]:first').text() 28 | author = quote.find('[itemprop="author"]:first').text() 29 | authorLink = quote.find('a[href*="/author/"]:first').attr('href') 30 | tags = quote.find('.tags [itemprop="keywords"]').attr('content') 31 | 32 | if authorLink: 33 | authorLink = 'http://quotes.toscrape.com' + authorLink 34 | linkDetail = read_url(authorLink) 35 | born_date = linkDetail.find('.author-born-date').text() 36 | born_location = linkDetail.find('.author-born-location').text() 37 | if born_location.startswith('in'): 38 | born_location = born_location.replace('in ','') 39 | dataSet.append(dict(zip(keys,[tags,authorLink,author,born_date,born_location,title[0:50]]))) 40 | pageNo += 1 41 | 42 | if __name__ == '__main__': 43 | get_details(sourceUrl) 44 | 
print("\nTotal Quotes collected: ", len(dataSet)) 45 | print(dataSet) 46 | for info in dataSet: 47 | print(info['author_name'],' born on ',info['born_date'], ' in ',info['born_location']) 48 | 49 | -------------------------------------------------------------------------------- /Chapter08/seleniumProducts.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | chrome_path='chromedriver' 3 | driver = webdriver.Chrome(executable_path=chrome_path) 4 | driver.get('http://automationpractice.com') 5 | print("Current Page URL: ",driver.current_url) 6 | 7 | searchBox = driver.find_element_by_id('search_query_top') 8 | print("Type :",type(searchBox)) 9 | print("Attribute Value :",searchBox.get_attribute("value")) 10 | print("Attribute Class :",searchBox.get_attribute("class")) 11 | print("Tag Name :",searchBox.tag_name) 12 | 13 | searchBox.clear() 14 | searchBox.send_keys("Dress") 15 | 16 | submitButton = driver.find_element_by_name("submit_search") 17 | submitButton.click() 18 | 19 | resultsShowing = driver.find_element_by_class_name("product-count") 20 | print("Results Showing: ",resultsShowing.text) 21 | 22 | resultsFound = driver.find_element_by_xpath('//*[@id="center_column"]//span[@class="heading-counter"]') 23 | print("Results Found: ",resultsFound.text) 24 | 25 | products = driver.find_elements_by_xpath('//*[@id="center_column"]//a[@class="product-name"]') 26 | #products = driver.find_elements_by_css_selector('ul.product_list li.ajax_block_product a.product-name') 27 | 28 | foundProducts=[] 29 | dataSet=[] 30 | for product in products: 31 | foundProducts.append([product.text,product.get_attribute("href")]) 32 | 33 | print(foundProducts) 34 | 35 | dataSet=[] 36 | if len(foundProducts)>0: 37 | for foundProduct in foundProducts: 38 | driver.get(foundProduct[1]) 39 | product_url = driver.current_url 40 | product_name = driver.find_element_by_xpath('//*[@id="center_column"]//h1[@itemprop="name"]').text 41 | short_description = driver.find_element_by_xpath('//*[@id="short_description_content"]').text 42 | product_price = driver.find_element_by_xpath('//*[@id="our_price_display"]').text 43 | image_url = driver.find_element_by_xpath('//*[@id="bigpic"]').get_attribute('src') 44 | condition = driver.find_element_by_xpath('//*[@id="product_condition"]/span').text 45 | dataSet.append([product_name,product_price,condition,short_description,image_url,product_url]) 46 | 47 | 48 | print(dataSet) 49 | 50 | driver.close() 51 | driver.quit() 52 | -------------------------------------------------------------------------------- /Chapter03/scrapeXPathLoop.py: -------------------------------------------------------------------------------- 1 | import lxml.html 2 | from lxml.etree import XPath 3 | 4 | baseUrl = "http://books.toscrape.com/" 5 | bookUrl = "http://books.toscrape.com/catalogue/category/books/food-and-drink_33/index.html" 6 | pageUrl = "http://books.toscrape.com/catalogue/category/books/food-and-drink_33/page-" 7 | 8 | dataSet = [] 9 | page=1 10 | totalPages=1 11 | while(page<=totalPages): 12 | print("Rows in Dataset: "+str(len(dataSet))) 13 | 14 | if(page==1): 15 | doc = lxml.html.parse(pageUrl+str(page)+".html").getroot() 16 | perPageArticles = doc.xpath("//*[@id=\"default\"]//form/strong[3]/text()") 17 | totalArticles = doc.xpath("//*[@id=\"default\"]//form/strong[1]/text()") 18 | totalPages = round(int(totalArticles[0])/int(perPageArticles[0])) 19 | print(str(totalArticles[0])+" Results, showing "+str(perPageArticles[0])+" Articles 
per page") 20 | else: 21 | doc = lxml.html.parse(pageUrl+str(page)+".html").getroot() 22 | 23 | #used to find page url pattern 24 | nextPage = doc.xpath("//*[@id=\"default\"]//ul[contains(@class,'pager')]/li[2][contains(@class,'next')]/a/@href") 25 | if len(nextPage)>0: 26 | print("Scraping Page "+str(page)+" of "+str(totalPages)+". NextPage > "+str(nextPage[0])) 27 | else: 28 | print("Scraping Page "+str(page)+" of "+str(totalPages)) 29 | 30 | articles = XPath("//*[@id='default']//ol/li[position()>0]") 31 | titlePath = XPath(".//article[contains(@class,'product_pod')]/h3/a/text()") 32 | pricePath = XPath(".//article/div[2]/p[contains(@class,'price_color')]/text()") 33 | stockPath = XPath(".//article/div[2]/p[2][contains(@class,'availability')]/text()[normalize-space()]") 34 | imagePath = XPath(".//article/div[1][contains(@class,'image_container')]/a/img/@src") 35 | starRating = XPath(".//article/p[contains(@class,'star-rating')]/@class") 36 | 37 | for row in articles(doc): 38 | title = titlePath(row)[0] 39 | price = pricePath(row)[0] 40 | availability = stockPath(row)[0].strip() 41 | image = imagePath(row)[0] 42 | rating = starRating(row)[0] 43 | 44 | dataSet.append([title,price,availability,image.replace('../../../..',baseUrl),rating.replace('star-rating ','')]) 45 | 46 | page+=1 47 | 48 | print(dataSet) 49 | -------------------------------------------------------------------------------- /Chapter05/toscrape_quotes.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Listing Quotes from first 5 or less pages found 3 | from 'http://quotes.toscrape.com/' 4 | ''' 5 | 6 | import requests 7 | import re 8 | from bs4 import BeautifulSoup 9 | import csv 10 | 11 | sourceUrl = 'http://quotes.toscrape.com/' 12 | keys = ['quote_tags','author_url','author_name','born_date','born_location','quote_title'] 13 | 14 | 15 | def read_url(url): 16 | """Read given Url , Returns requests object for page content""" 17 | response = requests.get(url) 18 | return response.text 19 | 20 | 21 | def get_details(page, dataWriter): 22 | """Get 'response' for first 5 pages, parse it and collect data for 'keys' headers""" 23 | nextPage = True 24 | pageNo = 1 25 | while (nextPage and pageNo <= 5): 26 | response = read_url(page + 'page/' + str(pageNo)) 27 | soup = BeautifulSoup(response, 'lxml') 28 | 29 | rows = soup.find_all('div', 'quote') 30 | if (len(rows) > 0): 31 | print("Page ",pageNo," Total Quotes Found ",len(rows)) 32 | for row in rows: 33 | if row.find('span',attrs={'itemprop':'text'}): 34 | 35 | title = row.find(attrs={'itemprop':'text'}).text.strip() 36 | author = row.find(attrs={'itemprop':'author'}).text.strip() 37 | authorLink = row.find('a',href=re.compile(r'/author/')).get('href') 38 | tags = row.find('div','tags').find(itemprop="keywords").get('content') 39 | print(title, ' : ', author,' : ',authorLink, ' : ',tags) 40 | 41 | if authorLink: 42 | authorLink = 'http://quotes.toscrape.com' + authorLink 43 | linkDetail = read_url(authorLink) 44 | soupInner = BeautifulSoup(linkDetail, 'lxml') 45 | 46 | born_date = soupInner.find('span','author-born-date').text.strip() 47 | born_location = soupInner.find('span','author-born-location').text.strip() 48 | 49 | # Write a list of values in file 50 | dataWriter.writerow([tags,authorLink,author,born_date,born_location.replace('in ',''),title]) 51 | 52 | nextPage = True 53 | pageNo += 1 54 | else: 55 | print("Quotes Not Listed!") 56 | 57 | 58 | 59 | if __name__ == '__main__': 60 | dataSet = open('quotes.csv', 'w', newline='', 
encoding='utf-8') 61 | dataWriter = csv.writer(dataSet) 62 | # Write a Header or Column_names to CSV 63 | dataWriter.writerow(keys) 64 | get_details(sourceUrl, dataWriter) 65 | # get_details(sourceUrl) 66 | dataSet.close() 67 | -------------------------------------------------------------------------------- /Chapter09/regexHTML.py: -------------------------------------------------------------------------------- 1 | ''' 2 | In this code we will be using Regex to find the listed information from the HTML content: 3 | -HTML elements, 4 | -Elements attributes ('key' and 'values') and 5 | -Elements content. 6 | ''' 7 | 8 | import re 9 | from bs4 import BeautifulSoup 10 | 11 | 12 | def read_file(): 13 | ''' 14 | Read and return content from file (.html). 15 | ''' 16 | content = open("regexHTML.html", "r") 17 | pageSource = content.read() 18 | return pageSource 19 | 20 | 21 | def applyPattern(pattern): 22 | tags = re.findall(pattern, page) 23 | print("Pattern r'{}' ,Found total: {}".format(pattern, len(tags))) 24 | print(tags) 25 | return 26 | 27 | 28 | if __name__ == "__main__": 29 | page = read_file() # .decode('utf-8') 30 | soup = BeautifulSoup(page, 'lxml') 31 | print([tag.name for tag in soup.find_all()]) 32 | # ['html', 'head', 'title', 'style', 'body', 'h1', 'a', 'a', 'a', 'div', 'p', 'i', 'img', 'p', 'i', 'h1'] 33 | 34 | applyPattern(r'<(\w+)>') # Finding Elements without attributes 35 | # Pattern r'<(\w+)>' ,Found total: 6 36 | # ['html', 'head', 'title', 'body', 'div', 'i'] 37 | 38 | applyPattern(r'<(\w+)\s') # Finding Elements with attributes 39 | # Pattern r'<(\w+)\s' ,Found total: 10 40 | # ['style', 'h1', 'a', 'a', 'a', 'p', 'img', 'p', 'i', 'h1'] 41 | 42 | applyPattern(r'<(\w+)\s?') # Finding all HTML element 43 | # Pattern r'<(\w+)\s?' ,Found total: 16 44 | # ['html', 'head', 'title', 'style', 'body', 'h1', 'a', 'a', 'a', 'div', 'p', 'i', 'img', 'p', 'i', 'h1'] 45 | 46 | applyPattern(r'<\w+\s+(.*?)=') # Finding attributes name 47 | # Pattern r'<\w+\s+(.*?)=' ,Found total: 10 48 | # ['type', 'style', 'href', 'class', 'id', 'id', 'src', 'class', 'style', 'itemprop'] 49 | 50 | applyPattern(r'(\w+)=') # Finding names of all attributes 51 | # Pattern r'(\w+)=' ,Found total: 18 52 | # ['type', 'style', 'href', 'style', 'class', 'href', 'id', 'href', 'style', 'id', 'class', 'src', 'id', 'class', 'class', 'id', 'style', 'itemprop'] 53 | 54 | applyPattern(r'=\"(\w+)\"') 55 | # Pattern r'=\"(\w+)\"' ,Found total: 9 56 | # ['classOne', 'idOne', 'mainContent', 'content', 'pageLogo', 'logo', 'content', 'subContent', 'subheading'] 57 | 58 | applyPattern(r'=\"([\w\S]+)\"') 59 | # Pattern r'=\"([\w\S]+)\"' ,Found total: 18 60 | # ['text/css', 'color:orange;', 'https://www.google.com', 'color:red;', 'classOne', 'https://www.yahoo.com', 'idOne', 'https://www.wikipedia.org', 'color:blue;', 'mainContent', 'content', 'mylogo.png', 'pageLogo', 'logo', 'content', 'subContent', 'color:red', 'subheading'] 61 | 62 | applyPattern(r'\>(.*)\<') 63 | # Pattern r'\>(.*)\<' ,Found total: 8 64 | # ['Welcome to Web Scraping: Example', 'Welcome to Web Scraping', 'Google', 'Yahoo', 'Wikipedia', 'Paragraph contents', 'Sub paragraph content', 'Sub heading Content!'] 65 | -------------------------------------------------------------------------------- /Chapter05/Quotes/Quotes/spiders/quotes.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from Quotes.items import QuotesItem 4 | 5 | class QuotesSpider(scrapy.Spider): 6 | name = 
"quotes" 7 | allowed_domains = ["quotes.toscrape.com"] 8 | 9 | #To be used for pagination purpose. 10 | 11 | start_urls = ( 12 | 'http://quotes.toscrape.com/', 13 | ) 14 | ''' 15 | #or 16 | start_urls = ( 17 | 'http://quotes.toscrape.com/', 18 | 'http://quotes.toscrape.com/page/1/', 19 | 'http://quotes.toscrape.com/page/2/', 20 | ) 21 | or 22 | start_urls = ['http://quotes.toscrape.com/page/%s' % page for page in xrange(1, 5)] 23 | ''' 24 | 25 | '''Using XPath''' 26 | def parse(self, response): 27 | print("Response Type >>> ", type(response)) 28 | rows = response.xpath("//div[@class='quote']") 29 | 30 | print("Quotes Count >> ", rows.__len__()) 31 | for row in rows: 32 | item = QuotesItem() 33 | 34 | item['tags'] = row.xpath('div[@class="tags"]/meta[@itemprop="keywords"]/@content').extract_first() 35 | item['author'] = row.xpath('//span/small[@itemprop="author"]/text()').extract_first() 36 | item['quote'] = row.xpath('span[@itemprop="text"]/text()').extract_first() 37 | item['author_link'] = row.xpath('//a[contains(@href,"/author/")]/@href').extract_first() 38 | if len(item['author_link'])>0: 39 | item['author_link'] = 'http://quotes.toscrape.com'+item['author_link'] 40 | 41 | yield item 42 | 43 | nextPage = response.xpath("//ul[@class='pager']//li[@class='next']/a/@href").extract_first() 44 | if nextPage: 45 | print("Next Page URL: ",nextPage) 46 | #nextPage obtained from either XPath or CSS can be used. 47 | yield scrapy.Request('http://quotes.toscrape.com'+nextPage,callback=self.parse) 48 | 49 | print('Completed') 50 | 51 | 52 | 53 | 54 | '''Using CSS Selectors''' 55 | ''' 56 | def parse(self, response): 57 | print("Response Type >>> ", type(response)) 58 | rows = response.css("div.quote") 59 | 60 | for row in rows: 61 | item = QuotesItem() 62 | item['tags'] = row.css('div.tags > meta[itemprop="keywords"]::attr("content")').extract_first() 63 | item['author'] = row.css('small[itemprop="author"]::text').extract_first() 64 | item['quote'] = row.css('span[itemprop="text"]::text').extract_first() 65 | item['author_link'] = row.css('a:contains("(about)")::attr(href)').extract_first() 66 | if len(item['author_link'])>0: 67 | item['author_link'] = 'http://quotes.toscrape.com'+item['author_link'] 68 | 69 | yield item 70 | 71 | nextPage = response.css("ul.pager > li.next > a::attr(href)").extract_first() 72 | if nextPage: 73 | print("Next Page URL: ",nextPage) 74 | #nextPage obtained from either XPath or CSS can be used. 75 | yield scrapy.Request('http://quotes.toscrape.com'+nextPage,callback=self.parse) 76 | 77 | print('Completed') 78 | ''' 79 | -------------------------------------------------------------------------------- /Chapter06/toScrapeViewstate.py: -------------------------------------------------------------------------------- 1 | from pyquery import PyQuery as pq 2 | import requests 3 | 4 | mainurl = "http://toscrape.com/" 5 | searchurl = "http://quotes.toscrape.com/search.aspx" 6 | filterurl = "http://quotes.toscrape.com/filter.aspx" 7 | quoteurl = "http://quotes.toscrape.com/" 8 | authorTags = [('Albert Einstein', 'success'), ('Thomas A. 
Edison', 'inspirational')] 9 | 10 | def processRequests(url, params={}, customheaders={}): 11 | if len(params) > 0: 12 | response = requests.post(url, data=params, headers=customheaders) 13 | else: 14 | response = requests.get(url) 15 | #headers = response.headers # print(headers) 16 | #cookies = response.cookies # print(cookies) 17 | return pq(response.text) 18 | 19 | if __name__ == '__main__': 20 | for authorTag in authorTags: 21 | authorName,tagName= authorTag 22 | 23 | #Step 1: load searchURL 24 | searchResponse = processRequests(searchurl) 25 | author = searchResponse.find('select#author option:contains("' + authorName + '")').attr('value') 26 | viewstate = searchResponse.find('input#__VIEWSTATE').attr('value') 27 | tag = searchResponse.find('select#tag option').text() 28 | 29 | print("Author: ", author) 30 | print("ViewState: ", viewstate) 31 | print("Tag: ", tag) 32 | 33 | #Step 2: load filterurl with author and default tag 34 | params = {'author': author, 'tag': tag, '__VIEWSTATE': viewstate} 35 | customheaders = { 36 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 37 | 'Content-Type': 'application/x-www-form-urlencoded', 38 | 'Referer': searchurl 39 | } 40 | filterResponse = processRequests(filterurl,params,customheaders) 41 | viewstate = filterResponse.find('input#__VIEWSTATE').attr('value') 42 | tagSuccess = filterResponse.find('select#tag option:contains("' + tagName + '")').attr('value') 43 | submitButton = filterResponse.find('input[name="submit_button"]').attr('value') 44 | print("Author: ", author) 45 | print("ViewState: ", viewstate) 46 | print("Tag: ", tagSuccess) 47 | print("Submit: ", submitButton) 48 | 49 | #Step 3: load filterurl with author and defined tag 50 | params = {'author': author, 'tag': tagSuccess, 'submit_button': submitButton, '__VIEWSTATE': viewstate} 51 | # params = {'author': author, 'tag': tagSuccess, 'submit_button': submitButton}#, '__VIEWSTATE': viewstate} # test 52 | customheaders = { 53 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 54 | 'Content-Type': 'application/x-www-form-urlencoded', 55 | 'Referer': filterurl 56 | } 57 | finalResponse = processRequests(filterurl,params, customheaders) 58 | 59 | #Step 4: Extract results 60 | quote = finalResponse.find('div.quote span.content').text() 61 | quoteAuthor = finalResponse.find('div.quote span.author').text() 62 | message = finalResponse.find('div.quote span.tag').text() 63 | print("Quote: ", quote, "\nAuthor: ", quoteAuthor, "\nMessage: ", message) 64 | -------------------------------------------------------------------------------- /Chapter05/Quotes/Quotes/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for Blog project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'Quotes' 13 | 14 | SPIDER_MODULES = ['Quotes.spiders'] 15 | NEWSPIDER_MODULE = 'Quotes.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'Blog (+http://www.yourdomain.com)' 20 | #ROBOTSTXT_OBEY = False 21 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 22 | CONCURRENT_REQUESTS=16 23 | 24 | # Configure a delay for requests for the same website (default: 0) 25 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 26 | # See also autothrottle settings and docs 27 | DOWNLOAD_DELAY=3 28 | # The download delay setting will honor only one of: 29 | #CONCURRENT_REQUESTS_PER_DOMAIN=16 30 | #CONCURRENT_REQUESTS_PER_IP=16 31 | 32 | # Disable cookies (enabled by default) 33 | #COOKIES_ENABLED=False 34 | 35 | # Disable Telnet Console (enabled by default) 36 | #TELNETCONSOLE_ENABLED=False 37 | 38 | # Override the default request headers: 39 | #DEFAULT_REQUEST_HEADERS = { 40 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', 41 | # 'Accept-Language': 'en', 42 | # 'upgrade-insecure-requests': 1, 43 | # 'accept-encoding': 'gzip, deflate, br', 44 | # 'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36' 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'Blog.middlewares.MyCustomSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'Blog.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'Quotes.pipelines.QuotesPipeline': 300, 69 | } 70 | FEED_EXPORT_ENCODING = 'utf-8' 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay 75 | AUTOTHROTTLE_ENABLED=True 76 | # The initial download delay 77 | AUTOTHROTTLE_START_DELAY=5 78 | # The maximum download delay to be set in case of high latencies 79 | AUTOTHROTTLE_MAX_DELAY=60 80 | # Enable showing throttling stats for every response received: 81 | #AUTOTHROTTLE_DEBUG=False 82 | DOWNLOAD_HANDLERS = {'s3': None,} 83 | # Enable and configure HTTP caching (disabled by default) 84 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 85 | HTTPCACHE_ENABLED=True 86 | #HTTPCACHE_EXPIRATION_SECS=0 87 | #HTTPCACHE_DIR='httpcache' 88 | #HTTPCACHE_IGNORE_HTTP_CODES=[] 89 | #HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage' 90 | 
-------------------------------------------------------------------------------- /Chapter09/godfreysfeed.py: -------------------------------------------------------------------------------- 1 | import re 2 | import requests 3 | 4 | def read_url(url): 5 | pageSource = requests.get(url).text 6 | return pageSource 7 | 8 | 9 | if __name__ == "__main__": 10 | 11 | dataSet=list() 12 | sourceUrl = 'http://godfreysfeed.com/dealersandlocations.php' 13 | page = read_url(sourceUrl) 14 | 15 | pLatLng= r'var latLng = new google.maps.LatLng\((?P<lat>.*)\,\s*(?P<lng>.*)\)\;' 16 | latlngs = re.findall(pLatLng,page) 17 | print("Findall found total LatLngs: ", len(latlngs)) 18 | 19 | pDealers = r'infoWindowContent = infoWindowContent\+\s*\"(.*?)\"\;' 20 | dealers = re.findall(pDealers, page) 21 | print("Findall found total Address: ", len(dealers)) 22 | 23 | d=0 24 | for dealer in dealers: 25 | dealerInfo = re.split(r'<br>',re.sub(r'<br><br>
','',dealer)) 26 | name = re.findall(r'\'>(.*?)(.*)<',dealerInfo[1])[0] 28 | city = re.findall(r'>(.*),\s*(.*)<',dealerInfo[2])[0][0] 29 | state = re.findall(r'>(.*),\s*(.*)<',dealerInfo[2])[0][1] 30 | zip = re.findall(r'>(.*)<',dealerInfo[3])[0] 31 | lat = latlngs[d][0] 32 | lng = latlngs[d][1] 33 | d+=1 34 | dataSet.append([name,address,city,state,zip,lat,lng]) 35 | 36 | print(dataSet) #[[name,address, city, state, zip, lat,lng],] 37 | 38 | 39 | #Findall found total LatLngs: 55 40 | #Findall found total Address: 55 41 | #[['Akins Feed & Seed', '206 N Hill Street', 'Griffin', 'GA', '30223', '33.2509855', '-84.2633946'], ['Alf's Farm and Garden', '101 East 1st Street', 'Donalsonville', 'GA', '39845', '31.0426107', '-84.8821949'], ['American Cowboy Shop', '513 D Murphy Hwy', 'Blairsville', 'GA', '30512', '34.8761989', '-83.9582412'], ['Anderson's General Store', '23736 US Hwy 80 E', 'Statesboro', 'GA', '30458', '32.43158', '-81.749293'], ['Bar G Horse & Cattle Supply', '1060 Astondale Road', 'Bishop', 'GA', '30621', '33.8192864', '-83.4387722'], ['Beggs Farm Supply', '5845 Royston Hwy', 'Canon', 'GA', '30520', '34.2959968', '-83.0062267'], ['Big Creek Feed', '218 Hwy 49 N', 'Byron', 'GA', '31025', '32.6537561', '-83.7596295'], ['Blue Ribbon Show Supply', '9416 Lucy Moore Road', 'Nichols', 'GA', '31554', '31.462497', '-82.5866503'], ['Burdette Mill', '216 Depot Street', 'Washington', 'GA', '30673', '33.7340136', '-82.7472304'], ['Burke Feed', '369 Hwy 56 N', 'Waynesboro', 'GA', '30830', '33.1064245', '-81.9852452'], ['Candler Feed and Seed', '1275 Smokey Park Hwy', 'Candler', 'NC', '28715', '35.5401542', '-82.7570303'], ['Cash & Carry Feed', '135 N McGriff St.', 'Whigham', 'GA', '39897', '30.8848506', '-84.3248931'], ['Cherokee Feed and Seed', '869 Grove St', 'Gainesville', 'GA', '30501', '34.289323', '-83.8219858'], ['Cherokee Feed and Seed', '2370 Hightower Rd', 'Ball Ground', 'GA', '30107', '34.3372664', '-84.3779515'], ['Claxton Family Cattle', '240 Old Douglas Road', 'Hazelhurst', 'GA', '31539', '31.836371', '-82.6232915'], ['D&D Irringation', '51 S Rentz St', 'Lenox', 'GA', '31637', '31.2713852', '-83.4629421'], ['Double D Stables and Tack', '4111 Logan Rd', 'Rocky Face', 'GA', '30740', '34.805079', '-85.0274471'], ['Eatonton Co-op', '504 S Jefferson Ave', 'Eatonton', 'GA', '31024', '33.3267997', '-83.3884961'], ['Edenfields Feed and Seed', '709 Hwy 25N', 'Millen', 'GA', '30442', '32.8088128', '-81.9491768'], ['Family Feed', '6424 COLUMBUS HWY 80', 'Box Springs', 'GA', '31801', '32.5580349', '-84.6513774'], ['Farm & Garden Inc.', '646 Clarksville Street', 'Cornelia', 'GA', '30531', '34.5114883', '-83.5271166'], ['Farmer Seed Company', '800 W Broad St', 'Doerun', 'GA', '31744', '31.3200669', '-83.9234872'], ['Farmers Feed', '204 N West St', 'Greensboro', 'GA', '30642', '33.5781281', '-83.1845358'], ['Feed South', '2623 Knight Avenue', 'Waycross', 'GA', '31503', '31.2028754', '-82.316785'], ['Forsyth Feed & Seed', '45 W Jefferson Street', 'Forsyth', 'GA', '31029', '33.035097', '-83.940067'], ['Georgia Deer Farm', '850 Hwy 27 N', 'Roopville', 'GA', '30170', '33.476202', '-85.1082285'], ['H&M Trailers and Feed', '6446 JFH Pkwy', 'Adairsville', 'GA', '30103', '34.3924623', '-84.9333769'], ['Hill Farm Supply', '12700 Augusta Hwy', 'Sparta', 'GA', '31087', '33.2791285', '-82.9646478'], ['Ijon Webb', '1130 Stillwell Rd', 'Springfield', 'GA', '31329', '32.369773', '-81.266672'], ['Jesup Milling', '601 SW Broad Street', 'Jesup', 'GA', '31545', '31.5990992', '-81.8905051'], ['Jump N Run Farm', 
'1569 Liberty Church Grove Rd', 'Wrightsville', 'GA', '31096', '32.6481899', '-82.6139868'], ['L & C Farm and Garden', '1143 East Fairplay Road', 'Fairplay', 'SC', '29643', '34.5101355', '-82.9602795'], ['Maddox Feed', '1915 Winder Hwy', 'Jefferson', 'GA', '30549', '34.1001367', '-83.5969643'], ['Miller Farm Supply', '2001 Bob Culvern Rd', 'Louisville', 'GA', '30434', '32.9859964', '-82.3913739'], ['North Fulton Feed', '12950 Hwy 9 N', 'Alpharetta', 'GA', '30004', '34.096767', '-84.2735144'], ['North Georgia Co-Op', '951 Progress Rd', 'Ellijay', 'GA', '30540', '34.6739981', '-84.4902665'], ['Oglethorpe Feed and Farm Supply', '900 Athens Road', 'Crawford', 'GA', '30648', '33.8898662', '-83.1358665'], ['Owens Farm Supply', '6414 Mize Road', 'Toccoa', 'GA', '30577', '34.4855944', '-83.3394454'], ['Patricks', '10285 Covington Bypass', 'Covington', 'GA', '30014', '33.5770654', '-83.8354943'], ['Perry Feed and Tack', '309 Kellwood Drive', 'Perry', 'GA', '31069', '32.4443895', '-83.7439432'], ['Pine Ridge Outdoor Supply', '4999 HWY 114', 'Lyerly', 'GA', '30730', '34.4166444', '-85.3925577'], ['Reeves Hardware', '95 BO James St', 'Clayton', 'GA', '30525', '34.8686254', '-83.4026817'], ['Roberts Milling Company', '116 West Albany Ave', 'Pearson', 'GA', '31642', '31.2987063', '-82.8577173'], ['Roche Farm and Garden', '803 E Jackson St', 'Dublin', 'GA', '31040', '32.5444125', '-82.8945945'], ['Roche Farm and Garden', '781 East Court Street', 'Wrightsville', 'GA', '31040', '32.7302168', '-82.7117232'], ['Rodgers Fertilizer', '409 N Main St', 'Saluda', 'SC', '29138', '34.0082425', '-81.7729772'], ['Rogers Feed', '1041 Easley Hwy', 'Pelzer', 'SC', '29669', '34.6639864', '-82.5126743'], ['Ronnie Spivey', '654 Mary Richardson Road', 'Wray', 'GA', '31796', '31.525261', '-83.06603'], ['Shirley Feed & Seed Inc', '2439 North Elm Street', 'Commerce', 'GA', '30529', '34.2068698', '-83.4689814'], ['Southern Home and Farm LLC', '3127 Hamilton Road', 'Lagrange', 'GA', '30241', '32.9765932', '-84.98978'], ['Southland Power Fence', '752 E 5th Ave', 'Colbert', 'GA', '30628', '34.0412765', '-83.2001394'], ['Town & Country General Store', '59 Hwy 212 West', 'Monticello', 'GA', '31064', '33.3066615', '-83.6976187'], ['Twisted Fitterz', '10329 Nashville Enigma Rd', 'Alapaha', 'GA', '31622', '31.3441482', '-83.3002373'], ['Westside Feed II', '230 SE 7th Avenue', 'Lake Butler', 'FL', '32054', '30.02116', '-82.329495'], ['White Co. Farmers Exchange', '951 S Main St', 'Cleveland', 'GA', '30528', '34.58403', '-83.760829']] 42 | -------------------------------------------------------------------------------- /Chapter09/regex.py: -------------------------------------------------------------------------------- 1 | """ 2 | Python Regular Expressions: re 3 | https://regexone.com/references/python 4 | http://www.regular-expressions.info/python.html 5 | https://developers.google.com/edu/python/regular-expressions 6 | # Anchors: ^ begining of Line, $ end of line 7 | # re.search(pattern,str,re.I|re.MULTILINE|re.M) 8 | """ 9 | import re 10 | 11 | sentence = """The course assumes a working knowledge of key data science topics 12 | (statistics, machine learning, and general data analytic methods). 13 | Programming experience in some language (such as R, MATLAB, SAS, Mathematica, Java, C, C++, VB, or FORTRAN) 14 | is expected. In particular, participants need to be comfortable with general programming concepts like 15 | variables, loops, and functions. 
Experience with Python is helpful (but not required).""" 16 | #source: https://www.enthought.com/training/course/python-for-data-science/#/syllabus 17 | splitSentence=sentence.split() 18 | 19 | print("Length of Sentence: ",len(sentence), '& splitSentence: ',len(splitSentence)) 20 | print(splitSentence) 21 | 22 | #Findall 23 | matches = re.findall(r"([A-Z+]+)\,",sentence) 24 | print("Findall found total ",len(matches)," Matches >> ",matches) 25 | #Findall found total 6 Matches >> ['R', 'MATLAB', 'SAS', 'C', 'C++', 'VB'] 26 | 27 | matches = re.findall(r"([A-Z]+)\,",sentence) 28 | print("Findall found total ",len(matches)," Matches >> ",matches) 29 | #Findall found total 5 Matches >> ['R', 'MATLAB', 'SAS', 'C', 'VB'] 30 | 31 | matches = re.findall(r"\s*([\sorA-Z+]+)\)",sentence) #r'\s*([A-Z]+)\)' matches 'FORTRAN' 32 | print("Findall found total ",len(matches)," Matches >> ",matches) 33 | #Findall found total 1 Matches >> ['or FORTRAN'] 34 | 35 | 36 | #re.match 37 | fortran = matches[0] # 'or FORTRAN' 38 | if re.match(r'or',fortran): 39 | fortran = re.sub(r'or\s*','',fortran) 40 | print(fortran) 41 | #FORTRAN 42 | 43 | #re.search 44 | if re.search(r'^F.*N$',fortran): 45 | print("True") 46 | #True 47 | 48 | matches = re.findall(r'\s(MAT.*?)\,',sentence,flags=re.IGNORECASE) 49 | print("(MAT.*?)\,: ",matches) #r'(?i)\s(MAT.*?)\,' can also be used 50 | #(MAT.*?)\,: ['MATLAB', 'Mathematica'] 51 | 52 | matches = re.findall(r'\s(MAT.*?)\,',sentence) 53 | print("(MAT.*?)\,: ",matches) 54 | #(MAT.*?)\,: ['MATLAB'] 55 | 56 | matches = re.findall(r'\s(C.*?)\,',sentence) 57 | print("\s(C.*?)\,: ",matches) 58 | #\s(C.*?)\,: ['C', 'C++'] 59 | 60 | 61 | #re.split 62 | matchesOne = re.split(r"\W+",sentence) #\w (word characters, \W - nonword) 63 | print("Regular Split '\W+' found total: ",len(matchesOne ),"\n",matchesOne) 64 | #Regular Split '\W+' found total: 63 65 | #['The', 'course', 'assumes', 'a', 'working', 'knowledge', 'of', 'key', 'data', 'science', 'topics', 'statistics', ......, 'such', 'as', 'R', 'MATLAB', 'SAS', 'Mathematica', 'Java', 'C', 'C', 'VB', 'or', 'FORTRAN', 'is', 'expected', .........., 'and', 'functions', 'Experience', 'with', 'Python', 'is', 'helpful', 'but', 'not', 'required', ''] 66 | 67 | matchesTwo = re.split(r"\s",sentence) 68 | print("Regular Split '\s' found total: ",len(matchesTwo),"\n", matchesTwo) 69 | #Regular Split '\s' found total: 63 : 70 | #['The', 'course', 'assumes', 'a', 'working', 'knowledge', 'of', 'key', 'data', 'science', 'topics', '(statistics,', ........., '(such', 'as', 'R,', 'MATLAB,', 'SAS,', 'Mathematica,', 'Java,', 'C,', 'C++,', 'VB,', 'or', 'FORTRAN)', 'is', ......., 'and', 'functions.', 'Experience', 'with', 'Python', 'is', 'helpful', '(but', 'not', 'required).'] 71 | 72 | 73 | timeDate= ''' 74 | 75 | 76 | 77 | 78 | ''' 79 | 80 | pattern = r'(20\d+)([-]+)(0[1-9]|1[012])([-]+)(0[1-9]|[12][0-9]|3[01])' 81 | recompiled = re.compile(pattern) # 82 | dateMatches = recompiled.search(timeDate) 83 | 84 | 85 | print("Group : ",dateMatches.group()) 86 | #Group : 2019-02-11 87 | 88 | print("Groups : ",dateMatches.groups()) 89 | #Groups : ('2019', '-', '02', '-', '11') 90 | 91 | print("Group 1 : ",dateMatches.group(1)) 92 | #Group 1 : 2019 93 | 94 | print("Group 5 : ",dateMatches.group(5)) 95 | #Group 5 : 11 96 | 97 | 98 | for match in re.finditer(pattern, timeDate): # 99 | #for match in re.finditer(recompiled, timeDate): 100 | s = match.start() 101 | e = match.end() 102 | l = match.lastindex 103 | g = match.groups() 104 | print('Found {} at {}:{}, groups{} 
lastindex:{}'.format(timeDate[s:e], s, e,g,l)) 105 | 106 | 107 | # Found 2019-02-11 at 16:26, groups('2019', '-', '02', '-', '11') lastindex:5 108 | # Found 2018-02-11 at 67:77, groups('2018', '-', '02', '-', '11') lastindex:5 109 | # Found 2019-02-06 at 118:128, groups('2019', '-', '02', '-', '06') lastindex:5 110 | # Found 2019-02-05 at 176:186, groups('2019', '-', '02', '-', '05') lastindex:5 111 | # Found 2019-02-04 at 234:244, groups('2019', '-', '02', '-', '04') lastindex:5 112 | 113 | 114 | pDate = r'(?P[0-9]{4})(?P[-])(?P0[1-9]|1[012])-(?P0[1-9]|[12][0-9]|3[01])' 115 | recompiled = re.compile(pDate) 116 | for match in re.finditer(recompiled,timeDate): 117 | s = match.start() 118 | e = match.end() 119 | l = match.lastindex 120 | print("Group ALL or 0: ",match.groups(0)) #or match.groups() 121 | print("Group Year: ",match.group('year')) 122 | print("Group Delimiter: ",match.group('sep')) 123 | print('Found {} at {}:{}, lastindex: {}'.format(timeDate[s:e], s, e,l)) 124 | print('year :',match.groupdict()['year']) 125 | print('day :',match.groupdict()['day']) 126 | print('lastgroup :',match.lastgroup) 127 | 128 | 129 | # Group ALL or 0: ('2019', '-', '02', '11') 130 | # Group Year: 2019 131 | # Group Month: 02 132 | # Group Day: 11 133 | # Group Delimiter: - 134 | # Found 2019-02-11 at 16:26, lastindex: 4 135 | # year : 2019 136 | # day : 11 137 | # lastgroup : day 138 | 139 | 140 | pTime = r'(?P[0-9]{2})(?P[:])(?P[0-9]{2}):(?P[0-9.:+]+)' 141 | recompiled = re.compile(pTime) 142 | for match in re.finditer(recompiled,timeDate): 143 | print("Group String: ",match.group()) 144 | print("Group ALL or 0: ",match.groups()) 145 | print("Group Span: ",match.span()) 146 | print("Group Span 1: ",match.span(1)) 147 | print("Group Span 4: ",match.span(4)) 148 | print('hour :',match.groupdict()['hour']) 149 | print('minute :',match.groupdict()['min']) 150 | print('second :',match.groupdict()['sec_mil']) 151 | print('lastgroup :',match.lastgroup) 152 | 153 | 154 | # Group String: 12:53:00+00:00 155 | # Group ALL or 0: ('12', ':', '53', '00+00:00') 156 | # Group Span: (245, 259) 157 | # Group Span 1: (245, 247) 158 | # Group Span 4: (251, 259) 159 | # hour : 12 160 | # minute : 53 161 | # second : 00+00:00 162 | # lastgroup : sec_mil 163 | -------------------------------------------------------------------------------- /Chapter05/bs4_exploring.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup,SoupStrainer 2 | import re 3 | html_doc = """ 4 | The Dormouse's story 5 | 6 |

<p class="title"><b>The Dormouse's story</b></p> 7 | <p class="story">Once upon a time there were three little sisters; and their names were 8 | <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, 9 | <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and 10 | <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; 11 | and they lived at the bottom of a well.</p> 12 | <p class="story">...</p> 13 | <h1>Secret agents</h1> 14 | <ul> 15 | <li data-id="10784">Jason Walters, 003: Found dead in "A View to a Kill".</li> 16 | <li data-id="97865">Alex Trevelyan, 006: Agent turned terrorist leader; James' nemesis in "Goldeneye".</li> 17 | <li data-id="45732">James Bond, 007: The main man; shaken but not stirred.</li> 18 | </ul>
19 | 20 | 21 | """ 22 | tagsA = SoupStrainer("a") 23 | soupA = BeautifulSoup(html_doc,'lxml',parse_only=tagsA) 24 | soup = BeautifulSoup(html_doc,'lxml') 25 | 26 | print(type(soupA)) 27 | print(soupA) 28 | 29 | print(soupA.prettify()) 30 | 31 | print(soupA.a.has_attr('class')) 32 | 33 | print(soupA.a.has_attr('name')) 34 | 35 | print(soupA.find("a")) #print(soupA.find(name="a")) 36 | 37 | print(soupA.find("a",attrs={'class':'sister'})) 38 | 39 | print(soupA.find("a",attrs={'class':'sister'},text="Lacie")) 40 | 41 | print(soupA.find("a",attrs={'id':'link3'})) 42 | 43 | print(soupA.find('a',id="link2")) 44 | 45 | print(soupA.find_all("a")) 46 | 47 | #find all , but return only 2 of them 48 | print(soupA.find_all("a",limit=2)) #attrs, text 49 | 50 | print(soupA.find("a",text=re.compile(r'cie'))) #import re 51 | 52 | print(soupA.find_all("a",attrs={'id':re.compile(r'3')})) 53 | 54 | print(soupA.find_all(re.compile(r'a'))) 55 | 56 | #soup 57 | soup = BeautifulSoup(html_doc,'lxml') 58 | 59 | print(soup.find_all("p","story")) #class=story 60 | 61 | print(soup.find_all("p","title")) #soup.find_all("p",attrs={'class':"title"}) 62 | 63 | print(soup.find_all("p",attrs={'class':["title","story"]})) 64 | 65 | print(soup.find_all(["p","li"])) 66 | 67 | print(soup.find_all(string="Elsie")) #text="Elsie" 68 | 69 | print(soup.find_all(text=re.compile(r'Elsie'))) #import re 70 | 71 | print(soup.find_all("a",string="Lacie")) #text="Lacie" 72 | 73 | for li in soup.ul.find_all('li'): 74 | print(li.name, ' > ',li.get('data-id'),' > ', li.text) 75 | 76 | print(soupA.a) #tag a 77 | 78 | print(soup.li) #tag li 79 | 80 | print(soup.p) 81 | 82 | print(soup.p.b) #tag p and b 83 | 84 | print(soup.ul.find('li',attrs={'data-id':'45732'})) 85 | 86 | print(soup.ul.find('li',attrs={'data-id':'45732'}).text) 87 | 88 | print(soup.p.text) #get_text() 89 | 90 | print(soup.li.text) 91 | 92 | print(soup.p.string) 93 | 94 | print(list(soup.find('p','story').children)) 95 | 96 | print(list(soup.find('p','story').contents)) 97 | 98 | print(list(soup.find('p','story').descendants)) 99 | 100 | #using List Comprehension Technique 101 | print([a.name for a in soup.find('p','story').children]) 102 | 103 | print([{'tag':a.name,'text':a.text,'class':a.get('class')} for a in soup.find('p','story').children if a.name!=None]) 104 | 105 | print([a.name for a in soup.find('p','story').descendants]) 106 | 107 | print(list(filter(None,[a.name for a in soup.find('p','story').descendants]))) 108 | 109 | print(soup.find('p','story').findChildren()) 110 | 111 | print(soup.find('p','story').findChild()) #soup.find('p','story').find() 112 | 113 | #print parent element of with class=sister 114 | print(soup.find('a','sister').parent) 115 | 116 | #print parent element name of with class=sister 117 | print(soup.find('a','sister').parent.name) 118 | 119 | #print text from parent element of with class=sister 120 | print(soup.find('a','sister').parent.text) 121 | 122 | for element in soup.find('a','sister').parents: 123 | print(element.name) 124 | 125 | #find single Parent for selected with class=sister 126 | print(soup.find('a','sister').findParent()) 127 | 128 | #find Parents for selected with class=sister 129 | print(soup.find('a','sister').findParents()) 130 | 131 | print(soup.find('p','story').next) 132 | 133 | print(soup.find('p','story').next.next) 134 | 135 | print(soup.find('p','story').next_element) 136 | 137 | print(soup.find('p','story').next_element.next_element) 138 | 139 | print(soup.find('p','story').next_element.next_element.next_element) 
140 | 141 | print(soup.find('p','story').previous) #returns empty or new-line. 142 | print(soup.find('p','title').next.next.next) #returns empty or newline similar to code above 143 | 144 | print(soup.find('p','story').previous.previous) 145 | 146 | print(soup.find('p','story').previous_element) #returns empty or new-line. 147 | print(soup.find('p','story').previous_element.previous_element) 148 | 149 | 150 | print(soup.find('p','story').previous_element.previous_element.previous_element) 151 | 152 | print(soup.find('p','title').next.next.previous.previous) 153 | 154 | for element in soup.find('ul').next_elements: 155 | print(element) 156 | 157 | print(soup.find('p','story').next) 158 | 159 | print(soup.find('p','story').next_element) 160 | 161 | print(soup.find('p','story').find_next()) #element after next_element 162 | 163 | print(soup.find('p','story').find_next('h1')) 164 | 165 | print(soup.find('p','story').find_all_next()) 166 | 167 | print(soup.find('p','story').find_all_next('li',limit=2)) 168 | 169 | print(soup.find('ul').previous.previous.previous) 170 | 171 | print(soup.find('ul').find_previous()) 172 | 173 | print(soup.find('ul').find_previous('p','title')) 174 | 175 | print(soup.find('ul').find_all_previous('p')) 176 | 177 | print(soup.find('p','title').next_sibling) #returns empty or new-line 178 | 179 | print(soup.find('p','title').next_sibling.next_sibling) #print(soup.find('p','title').next_sibling.next) 180 | 181 | print(soup.find('ul').previous_sibling) #returns empty or new-line 182 | 183 | print(soup.find('ul').previous_sibling.previous_sibling) 184 | 185 | #using List Comprehension 186 | title = [ele.name for ele in soup.find('p','title').next_siblings] 187 | print(list(filter(None,title))) 188 | 189 | ul = [ele.name for ele in soup.find('ul').previous_siblings] 190 | print(list(filter(None,ul))) 191 | 192 | #find next

<p> siblings for selected <p> with class=title 193 | print(soup.find('p','title').find_next_siblings('p')) 194 | 195 | #find single or next sibling for selected <h1> 196 | print(soup.find('h1').find_next_sibling()) 197 | 198 | #find single or next sibling <li> for selected <h1> 199 | print(soup.find('h1').find_next_sibling('li')) 200 | 201 | #find first previous sibling to