├── .gitignore
├── LICENSE
├── README.md
├── myscraps
│   ├── __init__.py
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── tripadvisorspider.py
├── requirements.txt
└── scrapy.cfg

/.gitignore:
--------------------------------------------------------------------------------

# Created by https://www.gitignore.io/api/python

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

# End of https://www.gitignore.io/api/python
--------------------------------------------------------------------------------

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Igor Santos

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# Scrapy-tripadvisor-reviews
Using Scrapy to scrape TripAdvisor in order to get users' reviews.

The code in this repository was used to scrape and gather data from TripAdvisor about the attractions of some Brazilian cities. The data were used to train the sentiment analysis classifier behind https://github.com/igorbpf/Twitter-Sentiment (https://twisentiment.herokuapp.com/).

# Usage
In the project's root folder, type:

    scrapy crawl tripadvisor -o tripadvisor_reviews.csv

The reviews will be stored in a CSV file named tripadvisor_reviews.csv.
--------------------------------------------------------------------------------

/myscraps/__init__.py:
--------------------------------------------------------------------------------
(empty file)
--------------------------------------------------------------------------------

/myscraps/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html


import scrapy


class ReviewItem(scrapy.Item):
    # Fields collected for each review
    rating = scrapy.Field()
    review = scrapy.Field()
--------------------------------------------------------------------------------

/myscraps/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


#class MyscrapsPipeline(object):
#    def process_item(self, item, spider):
#        return item
--------------------------------------------------------------------------------
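The project ships with pipelines.py left as the commented-out template above. As a minimal sketch of what a pipeline here could look like (the class name `CleanReviewPipeline` is hypothetical, not part of the repository): it strips the `<p>` tags that the spider leaves in the review text and drops items with no text.

    # -*- coding: utf-8 -*-

    # Hypothetical example pipeline -- not part of the original project.
    # Cleans the HTML markup out of the review text and discards empty items.

    import re

    from scrapy.exceptions import DropItem


    class CleanReviewPipeline(object):
        TAG_RE = re.compile(r'<[^>]+>')

        def process_item(self, item, spider):
            if not item.get('review'):
                raise DropItem('empty review')
            # Remove markup such as the surrounding <p>...</p> tags.
            item['review'] = self.TAG_RE.sub('', item['review']).strip()
            return item

To enable such a pipeline it would have to be registered in settings.py, e.g. `ITEM_PIPELINES = {'myscraps.pipelines.CleanReviewPipeline': 300}`.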
/myscraps/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for myscraps project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings by consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'myscraps'

SPIDER_MODULES = ['myscraps.spiders']
NEWSPIDER_MODULE = 'myscraps.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'myscraps (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 16

# Configure a delay for requests to the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#    'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'myscraps.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'myscraps.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'myscraps.pipelines.SomePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------

/myscraps/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------

/myscraps/spiders/tripadvisorspider.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import scrapy
from myscraps.items import ReviewItem


class TripAdvisorReview(scrapy.Spider):
    name = "tripadvisor"

    # Attraction listings for: Recife, Porto Alegre, Salvador, Brasilia,
    # Fortaleza, Curitiba, Belo Horizonte, Vitoria, Florianopolis, Natal
    # and Goiania.
    start_urls = [
        "https://www.tripadvisor.com.br/Attractions-g304560-Activities-Recife_State_of_Pernambuco.html",
        "https://www.tripadvisor.com.br/Attractions-g303546-Activities-Porto_Alegre_State_of_Rio_Grande_do_Sul.html",
        "https://www.tripadvisor.com.br/Attractions-g303272-Activities-Salvador_State_of_Bahia.html",
        "https://www.tripadvisor.com.br/Attractions-g303322-Activities-Brasilia_Federal_District.html",
        "https://www.tripadvisor.com.br/Attractions-g303293-Activities-Fortaleza_State_of_Ceara.html",
        "https://www.tripadvisor.com.br/Attractions-g303441-Activities-Curitiba_State_of_Parana.html",
        "https://www.tripadvisor.com.br/Attractions-g303374-Activities-Belo_Horizonte_State_of_Minas_Gerais.html",
        "https://www.tripadvisor.com.br/Attractions-g303320-Activities-Vitoria_State_of_Espirito_Santo.html",
        "https://www.tripadvisor.com.br/Attractions-g303576-Activities-Florianopolis_State_of_Santa_Catarina.html",
        "https://www.tripadvisor.com.br/Attractions-g303518-Activities-Natal_State_of_Rio_Grande_do_Norte.html",
        "https://www.tripadvisor.com.br/Attractions-g303324-Activities-Goiania_State_of_Goias.html",
    ]

    def parse(self, response):
        """Follow every attraction on a city listing page, then paginate."""
        urls = []
        for href in response.xpath('//div[@class="property_title"]/a/@href').extract():
            url = response.urljoin(href)
            if url not in urls:
                urls.append(url)
                yield scrapy.Request(url, callback=self.parse_page)

        # The last link in the pagination block points to the next listing page.
        next_page = response.xpath('//div[@class="unified pagination "]/a/@href').extract()
        if next_page:
            url = response.urljoin(next_page[-1])
            self.logger.info(url)
            yield scrapy.Request(url, callback=self.parse)

    def parse_page(self, response):
        """Follow every review teaser on an attraction page, then paginate."""
        for href in response.xpath('//div[@class="wrap"]/div/a/@href').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_review)

        next_page = response.xpath('//div[@class="unified pagination "]/a/@href').extract()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page[-1]), callback=self.parse_page)

    def parse_review(self, response):
        """Extract a single review's text and its 1-5 bubble rating."""
        contents = response.xpath('//div[@class="entry"]/p').extract()
        ratings = response.xpath('//span[@class="rate sprite-rating_s rating_s"]/img/@alt').extract()
        if not contents or not ratings:
            return  # page layout changed or the review is missing pieces

        item = ReviewItem()
        # The alt text reads e.g. "4 de 5 estrelas"; keep only the leading digit.
        item['rating'] = ratings[0][0]
        item['review'] = contents[0].encode("utf-8")  # project targets Python 2
        yield item
--------------------------------------------------------------------------------
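Besides the `scrapy crawl` command shown in the README, the spider can also be launched from a plain Python script via Scrapy's CrawlerProcess API. A minimal sketch (the script itself is not part of the repository; the feed settings mirror the `-o tripadvisor_reviews.csv` option):

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-

    # Illustrative launcher -- not part of the original project.
    # Runs the spider programmatically with the project's own settings
    # and writes the scraped items to a CSV feed.

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()
    settings.set('FEED_FORMAT', 'csv')
    settings.set('FEED_URI', 'tripadvisor_reviews.csv')

    process = CrawlerProcess(settings)
    process.crawl('tripadvisor')  # the spider's `name` attribute
    process.start()               # blocks until the crawl finishes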
/requirements.txt:
--------------------------------------------------------------------------------
Automat==0.5.0
PyDispatcher==2.0.5
Scrapy==1.3.2
Twisted==17.1.0
argparse==1.2.1
attrs==16.3.0
cffi==1.9.1
constantly==15.1.0
cryptography==1.7.2
cssselect==1.0.1
enum34==1.1.6
idna==2.2
incremental==16.10.1
ipaddress==1.0.18
lxml==3.7.3
parsel==1.1.0
pyOpenSSL==16.2.0
pyasn1==0.2.2
pyasn1-modules==0.0.8
pycparser==2.17
queuelib==1.4.2
service-identity==16.0.0
six==1.10.0
w3lib==1.17.0
wsgiref==0.1.2
zope.interface==4.3.3
--------------------------------------------------------------------------------

/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = myscraps.settings

[deploy]
#url = http://localhost:6800/
project = myscraps
--------------------------------------------------------------------------------
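Since the README mentions that the scraped data fed a sentiment analysis classifier, a quick way to inspect the resulting feed is with pandas. Illustrative only -- pandas is not in requirements.txt, and the column names come from the two ReviewItem fields:

    # -*- coding: utf-8 -*-

    # Illustrative inspection script -- not part of the original project.
    # Loads the scraped feed and summarizes the rating distribution.

    import pandas as pd

    df = pd.read_csv('tripadvisor_reviews.csv')  # columns: rating, review
    # Note: `review` still contains the surrounding <p> tags unless a
    # cleaning pipeline (like the sketch above) is enabled.
    df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
    print(df['rating'].value_counts())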