├── .gitignore
├── LICENSE
├── README.md
├── myscraps
│   ├── __init__.py
│   ├── items.py
│   ├── pipelines.py
│   ├── settings.py
│   └── spiders
│       ├── __init__.py
│       └── tripadvisorspider.py
├── requirements.txt
└── scrapy.cfg

/.gitignore:
--------------------------------------------------------------------------------

# Created by https://www.gitignore.io/api/python

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject

# Rope project settings
.ropeproject

# End of https://www.gitignore.io/api/python
--------------------------------------------------------------------------------

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2017 Igor Santos

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# Scrapy-tripadvisor-reviews
Using Scrapy to scrape TripAdvisor in order to get users' reviews.

The code in this repository was used to scrape and gather data from TripAdvisor about the attractions of some Brazilian cities. The data were used to train the sentiment analysis classifier behind https://github.com/igorbpf/Twitter-Sentiment (https://twisentiment.herokuapp.com/).

# Usage
In the project's root folder, type:

    scrapy crawl tripadvisor -o tripadvisor_reviews.csv

The reviews will be stored in a CSV file named tripadvisor_reviews.csv.
--------------------------------------------------------------------------------

/myscraps/__init__.py:
--------------------------------------------------------------------------------
(empty file)
--------------------------------------------------------------------------------

/myscraps/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html


import scrapy


class ReviewItem(scrapy.Item):
    # Fields collected for each review
    rating = scrapy.Field()
    review = scrapy.Field()
--------------------------------------------------------------------------------

/myscraps/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


#class MyscrapsPipeline(object):
#    def process_item(self, item, spider):
#        return item
--------------------------------------------------------------------------------
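The project ships with pipelines.py left as the commented-out template above. As a minimal sketch of what a pipeline here could look like (the class name `CleanReviewPipeline` is hypothetical, not part of the repository): it strips the `<p>` tags that the spider leaves in the review text and drops items with no text.

    # -*- coding: utf-8 -*-

    # Hypothetical example pipeline -- not part of the original project.
    # Cleans the HTML markup out of the review text and discards empty items.

    import re

    from scrapy.exceptions import DropItem


    class CleanReviewPipeline(object):
        TAG_RE = re.compile(r'<[^>]+>')

        def process_item(self, item, spider):
            if not item.get('review'):
                raise DropItem('empty review')
            # Remove markup such as the surrounding <p>...</p> tags.
            item['review'] = self.TAG_RE.sub('', item['review']).strip()
            return item

To enable such a pipeline it would have to be registered in settings.py, e.g. `ITEM_PIPELINES = {'myscraps.pipelines.CleanReviewPipeline': 300}`.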
/myscraps/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for myscraps project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings by consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'myscraps'

SPIDER_MODULES = ['myscraps.spiders']
NEWSPIDER_MODULE = 'myscraps.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'myscraps (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 16

# Configure a delay for requests to the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#    'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'myscraps.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'myscraps.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'myscraps.pipelines.SomePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------

/myscraps/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------

/myscraps/spiders/tripadvisorspider.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import scrapy
from myscraps.items import ReviewItem


class TripAdvisorReview(scrapy.Spider):
    name = "tripadvisor"

    # Attraction listings for: Recife, Porto Alegre, Salvador, Brasilia,
    # Fortaleza, Curitiba, Belo Horizonte, Vitoria, Florianopolis, Natal
    # and Goiania.
    start_urls = [
        "https://www.tripadvisor.com.br/Attractions-g304560-Activities-Recife_State_of_Pernambuco.html",
        "https://www.tripadvisor.com.br/Attractions-g303546-Activities-Porto_Alegre_State_of_Rio_Grande_do_Sul.html",
        "https://www.tripadvisor.com.br/Attractions-g303272-Activities-Salvador_State_of_Bahia.html",
        "https://www.tripadvisor.com.br/Attractions-g303322-Activities-Brasilia_Federal_District.html",
        "https://www.tripadvisor.com.br/Attractions-g303293-Activities-Fortaleza_State_of_Ceara.html",
        "https://www.tripadvisor.com.br/Attractions-g303441-Activities-Curitiba_State_of_Parana.html",
        "https://www.tripadvisor.com.br/Attractions-g303374-Activities-Belo_Horizonte_State_of_Minas_Gerais.html",
        "https://www.tripadvisor.com.br/Attractions-g303320-Activities-Vitoria_State_of_Espirito_Santo.html",
        "https://www.tripadvisor.com.br/Attractions-g303576-Activities-Florianopolis_State_of_Santa_Catarina.html",
        "https://www.tripadvisor.com.br/Attractions-g303518-Activities-Natal_State_of_Rio_Grande_do_Norte.html",
        "https://www.tripadvisor.com.br/Attractions-g303324-Activities-Goiania_State_of_Goias.html",
    ]

    def parse(self, response):
        """Follow every attraction on a city listing page, then paginate."""
        urls = []
        for href in response.xpath('//div[@class="property_title"]/a/@href').extract():
            url = response.urljoin(href)
            if url not in urls:
                urls.append(url)
                yield scrapy.Request(url, callback=self.parse_page)

        # The last link in the pagination block points to the next listing page.
        next_page = response.xpath('//div[@class="unified pagination "]/a/@href').extract()
        if next_page:
            url = response.urljoin(next_page[-1])
            self.logger.info(url)
            yield scrapy.Request(url, callback=self.parse)

    def parse_page(self, response):
        """Follow every review teaser on an attraction page, then paginate."""
        for href in response.xpath('//div[@class="wrap"]/div/a/@href').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_review)

        next_page = response.xpath('//div[@class="unified pagination "]/a/@href').extract()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page[-1]), callback=self.parse_page)

    def parse_review(self, response):
        """Extract a single review's text and its 1-5 bubble rating."""
        contents = response.xpath('//div[@class="entry"]/p').extract()
        ratings = response.xpath('//span[@class="rate sprite-rating_s rating_s"]/img/@alt').extract()
        if not contents or not ratings:
            return  # page layout changed or the review is missing pieces

        item = ReviewItem()
        # The alt text reads e.g. "4 de 5 estrelas"; keep only the leading digit.
        item['rating'] = ratings[0][0]
        item['review'] = contents[0].encode("utf-8")  # project targets Python 2
        yield item
--------------------------------------------------------------------------------
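Besides the `scrapy crawl` command shown in the README, the spider can also be launched from a plain Python script via Scrapy's CrawlerProcess API. A minimal sketch (the script itself is not part of the repository; the feed settings mirror the `-o tripadvisor_reviews.csv` option):

    #!/usr/bin/env python
    # -*- coding: utf-8 -*-

    # Illustrative launcher -- not part of the original project.
    # Runs the spider programmatically with the project's own settings
    # and writes the scraped items to a CSV feed.

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()
    settings.set('FEED_FORMAT', 'csv')
    settings.set('FEED_URI', 'tripadvisor_reviews.csv')

    process = CrawlerProcess(settings)
    process.crawl('tripadvisor')  # the spider's `name` attribute
    process.start()               # blocks until the crawl finishes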
/requirements.txt:
--------------------------------------------------------------------------------
Automat==0.5.0
PyDispatcher==2.0.5
Scrapy==1.3.2
Twisted==17.1.0
argparse==1.2.1
attrs==16.3.0
cffi==1.9.1
constantly==15.1.0
cryptography==1.7.2
cssselect==1.0.1
enum34==1.1.6
idna==2.2
incremental==16.10.1
ipaddress==1.0.18
lxml==3.7.3
parsel==1.1.0
pyOpenSSL==16.2.0
pyasn1==0.2.2
pyasn1-modules==0.0.8
pycparser==2.17
queuelib==1.4.2
service-identity==16.0.0
six==1.10.0
w3lib==1.17.0
wsgiref==0.1.2
zope.interface==4.3.3
--------------------------------------------------------------------------------

/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = myscraps.settings

[deploy]
#url = http://localhost:6800/
project = myscraps
--------------------------------------------------------------------------------
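Since the README mentions that the scraped data fed a sentiment analysis classifier, a quick way to inspect the resulting feed is with pandas. Illustrative only -- pandas is not in requirements.txt, and the column names come from the two ReviewItem fields:

    # -*- coding: utf-8 -*-

    # Illustrative inspection script -- not part of the original project.
    # Loads the scraped feed and summarizes the rating distribution.

    import pandas as pd

    df = pd.read_csv('tripadvisor_reviews.csv')  # columns: rating, review
    # Note: `review` still contains the surrounding <p> tags unless a
    # cleaning pipeline (like the sketch above) is enabled.
    df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
    print(df['rating'].value_counts())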