"""Scrape Amazon customer reviews through a local Splash rendering service.

scrape-amazon-reviews: code for the YouTube video tutorial. Learn how to
scrape Amazon reviews using Python and Splash.

Repository contents: README.md and review-scraper.py (this script).

Running the script fetches the review pages of one product page by page,
collects every review into ``reviewlist`` and writes the result to
``sony-headphones.xlsx``.
"""

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Endpoint of the Splash instance that renders the JavaScript-driven page.
SPLASH_RENDER_URL = 'http://localhost:8050/render.html'

# Review listing URL; ``{page}`` is replaced by the 1-based page number.
REVIEWS_URL = (
    'https://www.amazon.co.uk/product-reviews/B07WD58H6R/'
    'ref=cm_cr_arp_d_paging_btm_next_2'
    '?ie=UTF8&reviewerType=all_reviews&pageNumber={page}'
)

# Highest page number that will ever be requested (safety cap for the loop).
MAX_PAGES = 998

# Seconds to wait for Splash before giving up; generous compared with the
# 2-second render wait requested below.
REQUEST_TIMEOUT = 60

# Accumulates one dict per scraped review across all pages.
reviewlist = []


def get_soup(url):
    """Render *url* through Splash and return the parsed HTML.

    Args:
        url: Address of the page Splash should load and render.

    Returns:
        A ``BeautifulSoup`` document built from the rendered HTML.

    Raises:
        requests.RequestException: If Splash cannot be reached, does not
            answer within ``REQUEST_TIMEOUT`` seconds, or replies with an
            HTTP error status.
    """
    response = requests.get(
        SPLASH_RENDER_URL,
        params={'url': url, 'wait': 2},
        timeout=REQUEST_TIMEOUT,
    )
    # Without this check an error body would be parsed as if it were the
    # review page and silently yield zero reviews.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


def get_reviews(soup):
    """Extract every review found in *soup* and append it to ``reviewlist``.

    Each review becomes a dict with the keys ``product``, ``title``,
    ``rating`` (float) and ``body``. A review that lacks one of the expected
    elements, or whose rating cannot be converted to a number, is reported
    and skipped; the remaining reviews on the page are still collected.

    Args:
        soup: Parsed review page as returned by ``get_soup``.

    Returns:
        The number of reviews appended to ``reviewlist`` for this page.
    """
    # The product name is the same for every review on the page, so derive
    # it once. A page without a <title> yields an empty product name.
    title_tag = soup.title
    product = ''
    if title_tag is not None:
        product = title_tag.text.replace(
            'Amazon.co.uk:Customer reviews:', ''
        ).strip()

    added = 0
    for item in soup.find_all('div', {'data-hook': 'review'}):
        try:
            review = {
                'product': product,
                'title': item.find(
                    'a', {'data-hook': 'review-title'}
                ).text.strip(),
                'rating': float(
                    item.find('i', {'data-hook': 'review-star-rating'})
                    .text.replace('out of 5 stars', '')
                    .strip()
                ),
                'body': item.find(
                    'span', {'data-hook': 'review-body'}
                ).text.strip(),
            }
        except (AttributeError, ValueError) as exc:
            # AttributeError: an expected element is missing (find() gave
            # None). ValueError: the rating text is not a number.
            print(f'Skipping malformed review: {exc}')
            continue
        reviewlist.append(review)
        added += 1
    return added


def main():
    """Scrape all review pages and export the reviews to an Excel file."""
    for page in range(1, MAX_PAGES + 1):
        print(f'Getting page: {page}')
        try:
            soup = get_soup(REVIEWS_URL.format(page=page))
        except requests.RequestException as exc:
            # Keep what has been collected so far instead of crashing.
            print(f'Stopping: could not fetch page {page}: {exc}')
            break

        added = get_reviews(soup)
        print(len(reviewlist))

        if added == 0:
            # An empty page means we ran past the end or were served
            # something other than a review listing; requesting further
            # pages would only repeat that.
            print(f'Stopping: no reviews found on page {page}')
            break
        # The "next" pagination button is rendered disabled on the last page.
        if soup.find('li', {'class': 'a-disabled a-last'}):
            break

    df = pd.DataFrame(reviewlist)
    df.to_excel('sony-headphones.xlsx', index=False)
    print('Fin.')


if __name__ == '__main__':
    main()