├── How to web scrape in 8 minutes
├── README.md
├── air.py
├── bay.py
└── nbaa.py

/How to web scrape in 8 minutes:
--------------------------------------------------------------------------------
import requests
from bs4 import BeautifulSoup
import pandas as pd

# fetch the page
url = 'https://www.worldometers.info/world-population/'
page = requests.get(url)

# parse the HTML with the lxml parser
soup = BeautifulSoup(page.text, 'lxml')
print(soup)  # optional sanity check that the page was downloaded and parsed

# grab the population table by its class attribute
table_data = soup.find('table', class_ = 'table table-striped table-bordered table-hover table-condensed table-list')

# collect the column headers from the <th> cells
headers = []
for i in table_data.find_all('th'):
    title = i.text
    headers.append(title)

df = pd.DataFrame(columns = headers)

# fill the DataFrame one table row at a time, skipping the header row
for j in table_data.find_all('tr')[1:]:
    td = j.find_all('td')
    row = [tr.text for tr in td]
    length = len(df)
    df.loc[length] = row


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Web-Scraping-Code
This is the code that was used for *How To Make Money, Using Web Scraping*.

The web scraping framework used here was Scrapy, which is a more advanced library for web scraping. It involves this code, but the spiders are also run from the Anaconda prompt/console.

If you're starting web scraping for the first time, I recommend using a library called Beautiful Soup; it's typically easier to start with.
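
To run the spiders in this repo from the console, they need to sit inside a Scrapy project (with scrapy-selenium configured in the settings for `air.py`). The commands would then look roughly like the following; the output file names are only examples:

```
scrapy crawl air -o airbnb_listings.csv
scrapy crawl bay -o bay_sale_items.csv
scrapy crawl nbaa -o nba_game_logs.csv
```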

--------------------------------------------------------------------------------
/air.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy
from scrapy.selector import Selector
from scrapy_selenium import SeleniumRequest
from selenium.webdriver.common.keys import Keys


### INPUT THE LOCATION YOU WANT, WITH STAYS AT THE END ###
n = 1
city = 'italy, rome'

class AirSpider(scrapy.Spider):
    name = 'air'
    #allowed_domains = ['www.airbnb.ca']

    # request the search results page for the chosen city through Selenium
    def start_requests(self):
        global city
        yield SeleniumRequest(url = "https://www.airbnb.ca/s/"+city+"/homes?refinement_paths%5B%5D=%2Ffor_you&search_type=search_query",
                              wait_time = 1,
                              callback = self.parse)

    # type the city you entered into the search box on the website and submit it
    def parse(self, response):
        global n
        global city
        #response.setHeader("Set-Cookie", "HttpOnly;Secure;SameSite=Strict")
        driver = response.meta['driver']
        search_input = driver.find_element_by_xpath("//input[@aria-label]")

        search_input.send_keys(city)
        search_input.send_keys(Keys.ENTER)

        driver.save_screenshot('after_filling_input.png')

        # re-parse the page source rendered by Selenium
        html = driver.page_source
        response_obj = Selector(text = html)

        house = response_obj.xpath("//div[@itemprop = 'itemListElement']/div/div[@class]")
        i = 2
        ii = str(i)

        # loop through all the listings, collecting data for each one
        for houses in house:
            name = houses.xpath(".//a/@aria-label").get()
            link = houses.xpath(".//a/@href").get()
            rating = houses.xpath("(((//div[@itemprop = 'itemListElement']/div/div[@class]/div[@class])["+ii+"]/div/span)[2]/span/span[@class])[3]/text()").get()
            guests = houses.xpath("((//div[@itemprop = 'itemListElement']/div/div[@class]/div[@class])["+ii+"]/div[@class])[3]/text()").get()
            bedrooms = houses.xpath("(((//div[@itemprop = 'itemListElement']/div/div[@class]/div[@class])["+ii+"]/div[@class])[3]/text())[3]").get()
            beds = houses.xpath("(((//div[@itemprop = 'itemListElement']/div/div[@class]/div[@class])["+ii+"]/div[@class])[3]/text())[5]").get()
            baths = houses.xpath("(((//div[@itemprop = 'itemListElement']/div/div[@class]/div[@class])["+ii+"]/div[@class])[3]/text())[7]").get()
            price = houses.xpath("((//div[@itemprop = 'itemListElement']/div/div[@class]/div[@class])["+ii+"]/div[@class])[5]/div/div/span/span[@class]/text()").get()
            i = i + 2
            ii = str(i)
            if link:
                yield {'link': 'https://airbnb.ca'+link, 'name': name, 'rating': rating, 'guests': guests,
                       'bedrooms': bedrooms, 'beds': beds, 'baths': baths, 'price': price}

        # pick which pagination link to follow next (the counter skips index 3 and caps at 6)
        if n < 6:
            n += 1
            if n == 3:
                n = 4
            nn = str(n)
        else:
            n = 6
            nn = str(n)
        print(n)

        # if at the end of the page, go to the next available page
        next_page = response.xpath("(//nav/ul/li)["+nn+"]/a/@href").get()
        if next_page:
            yield SeleniumRequest(url = 'https://www.airbnb.ca'+next_page, callback = self.parse, wait_time = 3)

--------------------------------------------------------------------------------
/bay.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy

n = 3

# spider for the sale listings on thebay.com
class BaySpider(scrapy.Spider):
    name = 'bay'
    allowed_domains = ['www.thebay.com']
    start_urls = ['https://www.thebay.com/search/EndecaSearch.jsp?N=1553+302023689']

    # loop through every product on the page, grabbing the data we want
    def parse(self, response):
        global n
        i = 3
        ii = str(i)
        box = response.xpath("//div[@id = 'product-container']/div[@id]/div[@class = 'product-text']")
        for boxes in box:
            link = boxes.xpath("(//div[@id = 'product-container']/div[@id]/div)["+ii+"]/a/@href").get()
            price = boxes.xpath("(//div[@id = 'product-container']/div[@id]/div)["+ii+"]/a/span[@class = 'product-price line-through']/text()").get()
            sale = boxes.xpath("((//div[@id = 'product-container']/div[@id]/div)["+ii+"]/a/span)[2]/span[@class = 'product-sale-price']/text()").get()
            brand = boxes.xpath("(//div[@id = 'product-container']/div[@id]/div)["+ii+"]/a/p/span/text()").get()
            clothing = boxes.xpath("((//div[@id = 'product-container']/div[@id]/div)["+ii+"]/a/p)[2]/text()").get()
            i = i + 3
            ii = str(i)

            # only compute a discount when both prices were found and look like short "$123.45" strings
            if price and sale and len(price) < 10 and len(sale) < 10:
                price_int = float(price.replace('$', ''))
                sale_int = float(sale.replace('$', ''))
                discount = str(((price_int - sale_int)/price_int)*100) + '%'
            else:
                discount = 'Unknown'

            yield {'link': link, 'Brand': brand, 'clothing': clothing, 'Original Price': price, 'Sale Price': sale,
                   'discount': discount}

        n += 1
        nn = str(n)
        print(n)
        next_page = response.xpath("((//ol)[2]/li)["+nn+"]/a/@href").get()
        print(next_page)
        # go to the next page when done with the current one
        if next_page:
            yield scrapy.Request(url = 'https://thebay.com'+next_page, callback = self.parse, dont_filter = True)


# leftover scratch work for checking the price-string conversion and length check above
b = '2800.00'
c = float(b)
len(b)
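
# A small sketch (not part of the original spider) of how the discount math in
# parse() could be wrapped in a helper that tolerates missing or oddly
# formatted price strings. The sample values below are made up.
def percent_discount(price_text, sale_text):
    """Return the discount as a percentage string, or 'Unknown' if it cannot be computed."""
    try:
        price_val = float(price_text.replace('$', '').replace(',', '').strip())
        sale_val = float(sale_text.replace('$', '').replace(',', '').strip())
        return str(round((price_val - sale_val) / price_val * 100, 2)) + '%'
    except (AttributeError, ValueError, ZeroDivisionError):
        return 'Unknown'

# Example: percent_discount('$2,800.00', '$2,100.00') returns '25.0%'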

--------------------------------------------------------------------------------
/nbaa.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import scrapy

n = 0

# spider for the player game-log finder on basketball-reference.com (2020 season, sorted by points)
class NbaaSpider(scrapy.Spider):
    name = 'nbaa'
    allowed_domains = ['www.basketball-reference.com']
    start_urls = ['https://www.basketball-reference.com/play-index/pgl_finder.cgi?request=1&match=game&year_min=2020&year_max=2020&is_playoffs=N&age_min=0&age_max=99&season_start=1&season_end=-1&pos_is_g=Y&pos_is_gf=Y&pos_is_f=Y&pos_is_fg=Y&pos_is_fc=Y&pos_is_c=Y&pos_is_cf=Y&order_by=pts']

    def parse(self, response):
        global n
        # each table row is one player's stat line for a single game
        log = response.xpath("//tbody/tr")
        for logs in log:
            name = logs.xpath(".//td[@data-stat = 'player']/a/text()").get()
            pos = logs.xpath(".//td[@data-stat = 'pos']/text()").get()
            date = logs.xpath(".//td[@data-stat = 'date_game']/a/text()").get()
            team = logs.xpath(".//td[@data-stat = 'team_id']/a/text()").get()
            opp = logs.xpath(".//td[@data-stat = 'opp_id']/a/text()").get()
            game_result = logs.xpath(".//td[@data-stat = 'game_result']/text()").get()
            MP = logs.xpath(".//td[@data-stat = 'mp']/text()").get()
            FG = logs.xpath(".//td[@data-stat = 'fg']/text()").get()
            FGA = logs.xpath(".//td[@data-stat = 'fga']/text()").get()
            FG_pct = logs.xpath(".//td[@data-stat = 'fg_pct']/text()").get()
            twoP = logs.xpath(".//td[@data-stat = 'fg2']/text()").get()
            twoPA = logs.xpath(".//td[@data-stat = 'fg2a']/text()").get()
            twoP_pct = logs.xpath(".//td[@data-stat = 'fg2_pct']/text()").get()
            threeP = logs.xpath(".//td[@data-stat = 'fg3']/text()").get()
            threePA = logs.xpath(".//td[@data-stat = 'fg3a']/text()").get()
            threeP_pct = logs.xpath(".//td[@data-stat = 'fg3_pct']/text()").get()
            FT = logs.xpath(".//td[@data-stat = 'ft']/text()").get()
            FTA = logs.xpath(".//td[@data-stat = 'fta']/text()").get()
            FT_pct = logs.xpath(".//td[@data-stat = 'ft_pct']/text()").get()
            ORB = logs.xpath(".//td[@data-stat = 'orb']/text()").get()
            DRB = logs.xpath(".//td[@data-stat = 'drb']/text()").get()
            TRB = logs.xpath(".//td[@data-stat = 'trb']/text()").get()
            AST = logs.xpath(".//td[@data-stat = 'ast']/text()").get()
            STL = logs.xpath(".//td[@data-stat = 'stl']/text()").get()
            BLK = logs.xpath(".//td[@data-stat = 'blk']/text()").get()
            TOV = logs.xpath(".//td[@data-stat = 'tov']/text()").get()
            PF = logs.xpath(".//td[@data-stat = 'pf']/text()").get()
            PTS = logs.xpath(".//td[@data-stat = 'pts']/text()").get()

            # skip spacer/header rows that have no player name
            if name:
                yield {'Athlete': name, 'Pos': pos, 'Date': date, 'Team': team, 'Opp': opp, 'Game Result': game_result,
                       'MP': MP, 'FG': FG, 'FGA': FGA, 'FG%': FG_pct, '2P': twoP, '2PA': twoPA, '2P%': twoP_pct, '3P': threeP, '3PA': threePA,
                       '3P%': threeP_pct, 'FT': FT, 'FTA': FTA, 'FT%': FT_pct, 'ORB': ORB, 'DRB': DRB, 'TRB': TRB, 'AST': AST, 'STL': STL,
                       'BLK': BLK, 'TOV': TOV, 'PF': PF, 'PTS': PTS}

        # pick which pagination link to follow: the 1st link on the first page, the 2nd afterwards
        if n <= 1:
            n += 1
        else:
            n = 2
        nn = str(n)
        next_page = response.xpath('((//p)[7]/a/@href)['+nn+']').get()
        if next_page:
            yield scrapy.Request(url = 'https://www.basketball-reference.com'+next_page, callback = self.parse)
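
# Not part of the original spider: a minimal sketch of running it with
# `python nbaa.py` instead of the scrapy console command, writing the results
# to a CSV file. The output filename is just an example.
if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={'FEEDS': {'nba_game_logs.csv': {'format': 'csv'}}})
    process.crawl(NbaaSpider)
    process.start()

--------------------------------------------------------------------------------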