├── LICENSE.md
├── README.md
├── restaurants_scraper.py
└── things_to_do_scraper.py

/LICENSE.md:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2022 Giuseppe Gambino

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Scraping TripAdvisor with Python 2020 *

Python implementation of TripAdvisor web scraping with Selenium, targeting the new 2020 version of the website.

There are two scripts:
- "restaurants_scraper.py" to scrape restaurants
- "things_to_do_scraper.py" to scrape hotels, attractions and monuments.

The Python code is commented; write to me if you have any doubts.
If you have a slow connection and run into problems with the code, try increasing the number of seconds passed to the time.sleep() calls.

Features implemented:
- A click on the "More" button to expand each review
- A click on the pagination button to move to the next page
- A CSV file with the date, the score, the title and the full text of every review (see the example below)!
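
The rows written by both scripts have no header line and follow the column order date, rating, title, review. As an illustration (not part of the original scripts), here is a minimal sketch for reading the output back with the standard csv module, assuming the file is saved as reviews.csv:

    import csv

    # illustrative path: use whatever path you passed to the scraper
    path_to_file = "reviews.csv"

    with open(path_to_file, newline="", encoding="utf-8") as f:
        # each row was written as [date, rating, title, review]
        for date, rating, title, review in csv.reader(f):
            print(date, rating, title, review[:60])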

How to use:
- First approach: download the Python file, open it and edit the default fields (CSV file path, number of pages, TripAdvisor URL).

- Second approach: download the file and launch it directly from the terminal, passing:
  - the path of the CSV file where the reviews will be stored
  - the number of pages that you want to scrape
  - the URL of the TripAdvisor page that you want to scrape

Command to paste into the terminal: python3 path_to_downloaded_script/things_to_do_scraper.py desktop/reviews.csv 50 https://www.tripadvisor.com/Attraction_Review-g187791-d192285-Reviews-Colosseum-Rome_Lazio.html

What I used:
- Python 3.8.2
- Selenium 3.141.0
- Safari 14.0.1
- Visual Studio Code 1.51.1
- MacBook Pro 13" M1 2020 with macOS Big Sur 11.0.1

*This activity has been supported by a grant from the Project IDEHA - PON "Ricerca e Innovazione" 2014-2020 - Innovation for Data Elaboration in Heritage Areas, Azione II.
--------------------------------------------------------------------------------
/restaurants_scraper.py:
--------------------------------------------------------------------------------
import sys
import csv
import time

from selenium import webdriver

# default path to the file where the data will be stored
path_to_file = "/Users/gius/Desktop/reviews.csv"

# default number of pages to scrape
num_page = 10

# default TripAdvisor URL of a restaurant
url = "https://www.tripadvisor.com/Restaurant_Review-g60763-d802686-Reviews-Hard_Rock_Cafe-New_York_City_New_York.html"

# override the defaults if the inputs are passed on the command line
if len(sys.argv) == 4:
    path_to_file = sys.argv[1]
    num_page = int(sys.argv[2])
    url = sys.argv[3]

# start the webdriver and load the page
driver = webdriver.Safari()
driver.get(url)

# open the file where the reviews will be saved
csvFile = open(path_to_file, 'a', encoding="utf-8")
csvWriter = csv.writer(csvFile)

# change num_page to save more or fewer reviews
for i in range(0, num_page):

    # wait for the page to load, then expand the truncated reviews
    time.sleep(2)
    driver.find_element_by_xpath("//span[@class='taLnk ulBlueLinks']").click()

    container = driver.find_elements_by_xpath(".//div[@class='review-container']")

    for j in range(len(container)):

        title = container[j].find_element_by_xpath(".//span[@class='noQuotes']").text
        date = container[j].find_element_by_xpath(".//span[contains(@class, 'ratingDate')]").get_attribute("title")
        # the rating is the numeric suffix of the bubble class, e.g. "bubble_50" -> "50"
        rating = container[j].find_element_by_xpath(".//span[contains(@class, 'ui_bubble_rating bubble_')]").get_attribute("class").split("_")[3]
        review = container[j].find_element_by_xpath(".//p[@class='partial_entry']").text.replace("\n", " ")

        csvWriter.writerow([date, rating, title, review])

    # move to the next page of reviews
    driver.find_element_by_xpath('.//a[@class="nav next ui_button primary"]').click()

csvFile.close()
driver.close()
--------------------------------------------------------------------------------
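
A note that is not part of the original repository: the scripts target Selenium 3.141.0 (as listed in the README), whose find_element_by_xpath / find_elements_by_xpath helpers were removed in Selenium 4. If you run the code under a Selenium 4 installation, the lookups need the By-based form; a minimal sketch with the same XPath expressions used above:

from selenium import webdriver
from selenium.webdriver.common.by import By

url = "https://www.tripadvisor.com/Restaurant_Review-g60763-d802686-Reviews-Hard_Rock_Cafe-New_York_City_New_York.html"

driver = webdriver.Safari()
driver.get(url)

# Selenium 4 equivalents of the lookups used in restaurants_scraper.py
expand_button = driver.find_element(By.XPATH, "//span[@class='taLnk ulBlueLinks']")
containers = driver.find_elements(By.XPATH, ".//div[@class='review-container']")

driver.quit()
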
/things_to_do_scraper.py:
--------------------------------------------------------------------------------
import sys
import csv
import time

from selenium import webdriver

# default path to the file where the data will be stored
path_to_file = "/Users/gius/Desktop/reviews.csv"

# default number of pages to scrape
num_page = 10

# default TripAdvisor URL of a hotel or of a thing to do (attraction/monument)
url = "https://www.tripadvisor.com/Hotel_Review-g60763-d1218720-Reviews-The_Standard_High_Line-New_York_City_New_York.html"
#url = "https://www.tripadvisor.com/Attraction_Review-g187791-d192285-Reviews-Colosseum-Rome_Lazio.html"

# override the defaults if the inputs are passed on the command line
if len(sys.argv) == 4:
    path_to_file = sys.argv[1]
    num_page = int(sys.argv[2])
    url = sys.argv[3]

# start the webdriver and load the page
driver = webdriver.Safari()
driver.get(url)

# open the file where the reviews will be saved
csvFile = open(path_to_file, 'a', encoding="utf-8")
csvWriter = csv.writer(csvFile)

# change num_page to save more or fewer reviews
for i in range(0, num_page):

    # wait for the page to load, then expand the truncated reviews
    time.sleep(2)
    driver.find_element_by_xpath(".//div[contains(@data-test-target, 'expand-review')]").click()

    container = driver.find_elements_by_xpath("//div[@data-reviewid]")
    dates = driver.find_elements_by_xpath(".//div[@class='_2fxQ4TOx']")

    for j in range(len(container)):

        # the rating is the numeric suffix of the bubble class, e.g. "bubble_50" -> "50"
        rating = container[j].find_element_by_xpath(".//span[contains(@class, 'ui_bubble_rating bubble_')]").get_attribute("class").split("_")[3]
        title = container[j].find_element_by_xpath(".//div[contains(@data-test-target, 'review-title')]").text
        review = container[j].find_element_by_xpath(".//q[@class='IRsGHoPm']").text.replace("\n", " ")
        # the date is taken as the last two words of this element's text
        date = " ".join(dates[j].text.split(" ")[-2:])

        csvWriter.writerow([date, rating, title, review])

    # move to the next page of reviews
    driver.find_element_by_xpath('.//a[@class="ui_button nav next primary "]').click()

csvFile.close()
driver.quit()
--------------------------------------------------------------------------------
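
A closing note that is not part of the original repository: both scripts start the browser with webdriver.Safari(), which works only on macOS with "Allow Remote Automation" enabled in Safari's Develop menu. On other platforms that single line can be swapped for a different driver; a minimal sketch, assuming the matching driver binary is installed and on the PATH:

from selenium import webdriver

# pick whichever browser you have a matching driver for; only one line is needed
driver = webdriver.Chrome()      # requires chromedriver on the PATH
# driver = webdriver.Firefox()   # requires geckodriver on the PATH

driver.get("https://www.tripadvisor.com")
print(driver.title)
driver.quit()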