├── README.md
└── review-scraper.py


/README.md:
--------------------------------------------------------------------------------
1 | # scrape-amazon-reviews
2 | 
3 | Code for the YouTube video tutorial.
4 | 
5 | Learn how to scrape Amazon reviews using Python and Splash.
6 | 


--------------------------------------------------------------------------------
/review-scraper.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | from bs4 import BeautifulSoup
 3 | import pandas as pd
 4 | 
# Module-level accumulator: get_reviews() appends one dict per parsed review,
# and the script later builds a DataFrame from the whole list.
reviewlist = []
 6 | 
 7 | def get_soup(url):
 8 |     r = requests.get('http://localhost:8050/render.html', params={'url': url, 'wait': 2})
 9 |     soup = BeautifulSoup(r.text, 'html.parser')
10 |     return soup
11 | 
12 | 
def get_reviews(soup):
    """Parse every review on a page and append each to ``reviewlist``.

    Each review becomes a dict with the keys ``product``, ``title``,
    ``rating`` (float) and ``body``. A review that lacks one of the expected
    elements, or whose rating text is not numeric, is skipped on its own so
    the remaining reviews on the page are still collected.

    Args:
        soup: Parsed page (``BeautifulSoup``) to search for
            ``div[data-hook="review"]`` elements.

    Returns:
        None. Results are appended to the module-level ``reviewlist``.
    """
    # The page <title> is the same for every review, so resolve it once.
    # A page without a <title> yields an empty product name rather than
    # discarding all of its reviews.
    title_tag = soup.title
    product = ''
    if title_tag is not None:
        product = title_tag.text.replace(
            'Amazon.co.uk:Customer reviews:', ''
        ).strip()

    for item in soup.find_all('div', {'data-hook': 'review'}):
        try:
            review = {
                'product': product,
                'title': item.find('a', {'data-hook': 'review-title'}).text.strip(),
                'rating': float(
                    item.find('i', {'data-hook': 'review-star-rating'})
                    .text.replace('out of 5 stars', '')
                    .strip()
                ),
                'body': item.find('span', {'data-hook': 'review-body'}).text.strip(),
            }
        except (AttributeError, ValueError):
            # AttributeError: find() returned None because the element is
            # missing. ValueError: the rating text could not be converted
            # to float. Skip only this review.
            continue
        reviewlist.append(review)
26 | 
# Fetch review pages in order, collecting reviews until the stop marker shows.
for page in range(1, 999):
    page_url = f'https://www.amazon.co.uk/product-reviews/B07WD58H6R/ref=cm_cr_arp_d_paging_btm_next_2?ie=UTF8&reviewerType=all_reviews&pageNumber={page}'
    soup = get_soup(page_url)
    print(f'Getting page: {page}')
    get_reviews(soup)
    # Running total of reviews collected so far.
    print(len(reviewlist))
    # Stop once the page contains an <li class="a-disabled a-last"> element;
    # presumably this is the greyed-out "next page" control - confirm.
    if soup.find('li', {'class': 'a-disabled a-last'}):
        break

# Dump everything that was collected into a spreadsheet.
df = pd.DataFrame(reviewlist)
df.to_excel('sony-headphones.xlsx', index=False)
print('Fin.')
40 | 


--------------------------------------------------------------------------------