├── How to web scrape in 8 minutes
├── README.md
├── air.py
├── bay.py
└── nbaa.py

/How to web scrape in 8 minutes:
--------------------------------------------------------------------------------
import requests
from bs4 import BeautifulSoup
import pandas as pd

# fetch the page
url = 'https://www.worldometers.info/world-population/'
page = requests.get(url)

# parse the HTML with the lxml parser
soup = BeautifulSoup(page.text, 'lxml')
print(soup)  # optional sanity check that the page was downloaded and parsed

# grab the population table by its class attribute
table_data = soup.find('table', class_ = 'table table-striped table-bordered table-hover table-condensed table-list')

# collect the column headers from the <th> cells
headers = []
for i in table_data.find_all('th'):
    title = i.text
    headers.append(title)

df = pd.DataFrame(columns = headers)

# fill the DataFrame one table row at a time, skipping the header row
for j in table_data.find_all('tr')[1:]:
    td = j.find_all('td')
    row = [tr.text for tr in td]
    length = len(df)
    df.loc[length] = row


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Web-Scraping-Code
This is the code that was used for *How To Make Money, Using Web Scraping*.

The web scraping framework used here was Scrapy, which is a more advanced library for web scraping. It involves this code, but the spiders are also run from the Anaconda prompt/console.

If you're starting web scraping for the first time, I recommend using a library called Beautiful Soup; it's typically easier to start with.
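
To run the spiders in this repo from the console, they need to sit inside a Scrapy project (with scrapy-selenium configured in the settings for `air.py`). The commands would then look roughly like the following; the output file names are only examples:

```
scrapy crawl air -o airbnb_listings.csv
scrapy crawl bay -o bay_sale_items.csv
scrapy crawl nbaa -o nba_game_logs.csv
```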

--------------------------------------------------------------------------------
/air.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector
from scrapy_selenium import SeleniumRequest
from selenium.webdriver.common.keys import Keys


### INPUT THE LOCATION YOU WANT, WITH STAYS AT THE END ###
n = 1
city = 'italy, rome'

class AirSpider(scrapy.Spider):
    name = 'air'
    #allowed_domains = ['www.airbnb.ca']

    # request the search results page for the chosen city through Selenium
    def start_requests(self):
        global city
        yield SeleniumRequest(url = "https://www.airbnb.ca/s/"+city+"/homes?refinement_paths%5B%5D=%2Ffor_you&search_type=search_query",
                              wait_time = 1,
                              callback = self.parse)

    # type the city you entered into the search box on the website and submit it
    def parse(self, response):
        global n
        global city
        #response.setHeader("Set-Cookie", "HttpOnly;Secure;SameSite=Strict")
        driver = response.meta['driver']
        search_input = driver.find_element_by_xpath("//input[@aria-label]")

        search_input.send_keys(city)
        search_input.send_keys(Keys.ENTER)

        driver.save_screenshot('after_filling_input.png')

        # re-parse the page source rendered by Selenium
        html = driver.page_source
        response_obj = Selector(text = html)

        house = response_obj.xpath("//div[@itemprop = 'itemListElement']/div/div[@class]")
        i = 2
        ii = str(i)

        # loop through all the listings, collecting data for each one
        for houses in house:
            name = houses.xpath(".//a/@aria-label").get()
            link = houses.xpath(".//a/@href").get()
            rating = houses.xpath("(((//div[@itemprop = 'itemListElement']/div/div[@class]/div[@class])["+ii+"]/div/span)[2]/span/span[@class])[3]/text()").get()
            guests = houses.xpath("((//div[@itemprop = 'itemListElement']/div/div[@class]/div[@class])["+ii+"]/div[@class])[3]/text()").get()
            bedrooms = houses.xpath("(((//div[@itemprop = 'itemListElement']/div/div[@class]/div[@class])["+ii+"]/div[@class])[3]/text())[3]").get()
            beds = houses.xpath("(((//div[@itemprop = 'itemListElement']/div/div[@class]/div[@class])["+ii+"]/div[@class])[3]/text())[5]").get()
            baths = houses.xpath("(((//div[@itemprop = 'itemListElement']/div/div[@class]/div[@class])["+ii+"]/div[@class])[3]/text())[7]").get()
            price = houses.xpath("((//div[@itemprop = 'itemListElement']/div/div[@class]/div[@class])["+ii+"]/div[@class])[5]/div/div/span/span[@class]/text()").get()
            i = i + 2
            ii = str(i)
            if link:
                yield {'link': 'https://airbnb.ca'+link, 'name': name, 'rating': rating, 'guests': guests,
                       'bedrooms': bedrooms, 'beds': beds, 'baths': baths, 'price': price}

        # pick which pagination link to follow next (the counter skips index 3 and caps at 6)
        if n < 6:
            n += 1
            if n == 3:
                n = 4
            nn = str(n)
        else:
            n = 6
            nn = str(n)
        print(n)

        # if at the end of the page, go to the next available page
        next_page = response.xpath("(//nav/ul/li)["+nn+"]/a/@href").get()
        if next_page:
            yield SeleniumRequest(url = 'https://www.airbnb.ca'+next_page, callback = self.parse, wait_time = 3)

--------------------------------------------------------------------------------
/bay.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy

n = 3

# spider for the sale listings on thebay.com
class BaySpider(scrapy.Spider):
    name = 'bay'
    allowed_domains = ['www.thebay.com']
    start_urls = ['https://www.thebay.com/search/EndecaSearch.jsp?N=1553+302023689']

    # loop through every product on the page, grabbing the data we want
    def parse(self, response):
        global n
        i = 3
        ii = str(i)
        box = response.xpath("//div[@id = 'product-container']/div[@id]/div[@class = 'product-text']")
        for boxes in box:
            link = boxes.xpath("(//div[@id = 'product-container']/div[@id]/div)["+ii+"]/a/@href").get()
            price = boxes.xpath("(//div[@id = 'product-container']/div[@id]/div)["+ii+"]/a/span[@class = 'product-price line-through']/text()").get()
            sale = boxes.xpath("((//div[@id = 'product-container']/div[@id]/div)["+ii+"]/a/span)[2]/span[@class = 'product-sale-price']/text()").get()
            brand = boxes.xpath("(//div[@id = 'product-container']/div[@id]/div)["+ii+"]/a/p/span/text()").get()
            clothing = boxes.xpath("((//div[@id = 'product-container']/div[@id]/div)["+ii+"]/a/p)[2]/text()").get()
            i = i + 3
            ii = str(i)

            # only compute a discount when both prices were found and look like short "$123.45" strings
            if price and sale and len(price) < 10 and len(sale) < 10:
                price_int = float(price.replace('$', ''))
                sale_int = float(sale.replace('$', ''))
                discount = str(((price_int - sale_int)/price_int)*100) + '%'
            else:
                discount = 'Unknown'

            yield {'link': link, 'Brand': brand, 'clothing': clothing, 'Original Price': price, 'Sale Price': sale,
                   'discount': discount}

        n += 1
        nn = str(n)
        print(n)
        next_page = response.xpath("((//ol)[2]/li)["+nn+"]/a/@href").get()
        print(next_page)
        # go to the next page when done with the current one
        if next_page:
            yield scrapy.Request(url = 'https://thebay.com'+next_page, callback = self.parse, dont_filter = True)


# leftover scratch work for checking the price-string conversion and length check above
b = '2800.00'
c = float(b)
len(b)
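
# A small sketch (not part of the original spider) of how the discount math in
# parse() could be wrapped in a helper that tolerates missing or oddly
# formatted price strings. The sample values below are made up.
def percent_discount(price_text, sale_text):
    """Return the discount as a percentage string, or 'Unknown' if it cannot be computed."""
    try:
        price_val = float(price_text.replace('$', '').replace(',', '').strip())
        sale_val = float(sale_text.replace('$', '').replace(',', '').strip())
        return str(round((price_val - sale_val) / price_val * 100, 2)) + '%'
    except (AttributeError, ValueError, ZeroDivisionError):
        return 'Unknown'

# Example: percent_discount('$2,800.00', '$2,100.00') returns '25.0%'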

--------------------------------------------------------------------------------
/nbaa.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy

n = 0

# spider for the player game-log finder on basketball-reference.com (2020 season, sorted by points)
class NbaaSpider(scrapy.Spider):
    name = 'nbaa'
    allowed_domains = ['www.basketball-reference.com']
    start_urls = ['https://www.basketball-reference.com/play-index/pgl_finder.cgi?request=1&match=game&year_min=2020&year_max=2020&is_playoffs=N&age_min=0&age_max=99&season_start=1&season_end=-1&pos_is_g=Y&pos_is_gf=Y&pos_is_f=Y&pos_is_fg=Y&pos_is_fc=Y&pos_is_c=Y&pos_is_cf=Y&order_by=pts']

    def parse(self, response):
        global n
        # each table row is one player's stat line for a single game
        log = response.xpath("//tbody/tr")
        for logs in log:
            name = logs.xpath(".//td[@data-stat = 'player']/a/text()").get()
            pos = logs.xpath(".//td[@data-stat = 'pos']/text()").get()
            date = logs.xpath(".//td[@data-stat = 'date_game']/a/text()").get()
            team = logs.xpath(".//td[@data-stat = 'team_id']/a/text()").get()
            opp = logs.xpath(".//td[@data-stat = 'opp_id']/a/text()").get()
            game_result = logs.xpath(".//td[@data-stat = 'game_result']/text()").get()
            MP = logs.xpath(".//td[@data-stat = 'mp']/text()").get()
            FG = logs.xpath(".//td[@data-stat = 'fg']/text()").get()
            FGA = logs.xpath(".//td[@data-stat = 'fga']/text()").get()
            FG_pct = logs.xpath(".//td[@data-stat = 'fg_pct']/text()").get()
            twoP = logs.xpath(".//td[@data-stat = 'fg2']/text()").get()
            twoPA = logs.xpath(".//td[@data-stat = 'fg2a']/text()").get()
            twoP_pct = logs.xpath(".//td[@data-stat = 'fg2_pct']/text()").get()
            threeP = logs.xpath(".//td[@data-stat = 'fg3']/text()").get()
            threePA = logs.xpath(".//td[@data-stat = 'fg3a']/text()").get()
            threeP_pct = logs.xpath(".//td[@data-stat = 'fg3_pct']/text()").get()
            FT = logs.xpath(".//td[@data-stat = 'ft']/text()").get()
            FTA = logs.xpath(".//td[@data-stat = 'fta']/text()").get()
            FT_pct = logs.xpath(".//td[@data-stat = 'ft_pct']/text()").get()
            ORB = logs.xpath(".//td[@data-stat = 'orb']/text()").get()
            DRB = logs.xpath(".//td[@data-stat = 'drb']/text()").get()
            TRB = logs.xpath(".//td[@data-stat = 'trb']/text()").get()
            AST = logs.xpath(".//td[@data-stat = 'ast']/text()").get()
            STL = logs.xpath(".//td[@data-stat = 'stl']/text()").get()
            BLK = logs.xpath(".//td[@data-stat = 'blk']/text()").get()
            TOV = logs.xpath(".//td[@data-stat = 'tov']/text()").get()
            PF = logs.xpath(".//td[@data-stat = 'pf']/text()").get()
            PTS = logs.xpath(".//td[@data-stat = 'pts']/text()").get()

            # skip spacer/header rows that have no player name
            if name:
                yield {'Athlete': name, 'Pos': pos, 'Date': date, 'Team': team, 'Opp': opp, 'Game Result': game_result,
                       'MP': MP, 'FG': FG, 'FGA': FGA, 'FG%': FG_pct, '2P': twoP, '2PA': twoPA, '2P%': twoP_pct, '3P': threeP, '3PA': threePA,
                       '3P%': threeP_pct, 'FT': FT, 'FTA': FTA, 'FT%': FT_pct, 'ORB': ORB, 'DRB': DRB, 'TRB': TRB, 'AST': AST, 'STL': STL,
                       'BLK': BLK, 'TOV': TOV, 'PF': PF, 'PTS': PTS}

        # pick which pagination link to follow: the 1st link on the first page, the 2nd afterwards
        if n <= 1:
            n += 1
        else:
            n = 2
        nn = str(n)
        next_page = response.xpath('((//p)[7]/a/@href)['+nn+']').get()
        if next_page:
            yield scrapy.Request(url = 'https://www.basketball-reference.com'+next_page, callback = self.parse)
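
# Not part of the original spider: a minimal sketch of running it with
# `python nbaa.py` instead of the scrapy console command, writing the results
# to a CSV file. The output filename is just an example.
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={'FEEDS': {'nba_game_logs.csv': {'format': 'csv'}}})
    process.crawl(NbaaSpider)
    process.start()

--------------------------------------------------------------------------------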