├── .gitignore
├── Amazon
│   ├── Amazon
│   │   ├── __init__.py
│   │   ├── images
│   │   │   └── default.jpg
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       └── amazon.py
│   └── scrapy.cfg
├── LICENSE
├── README.md
├── requirements.txt
└── scpture.jpg
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
/.vagrant
/scrapy.iml
*.pyc
_trial_temp*
dropin.cache
docs/build
*egg-info
.tox
venv
build
dist
.idea
htmlcov/
.coverage
.coverage.*
.cache/

# Windows
Thumbs.db
--------------------------------------------------------------------------------
/Amazon/Amazon/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OFZFZS/scrapy-amazon/3befb32289ccd548fa74cead2e359be848b01fbc/Amazon/Amazon/__init__.py
--------------------------------------------------------------------------------
/Amazon/Amazon/images/default.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OFZFZS/scrapy-amazon/3befb32289ccd548fa74cead2e359be848b01fbc/Amazon/Amazon/images/default.jpg
--------------------------------------------------------------------------------
/Amazon/Amazon/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import scrapy


class AmazonItem(scrapy.Item):
    # Define the fields for your item here:
    title = scrapy.Field()
    image_url = scrapy.Field()
    asin = scrapy.Field()
    price = scrapy.Field()
    url = scrapy.Field()
    description = scrapy.Field()
    features = scrapy.Field()
    # Positive reviews
    review_good_titles = scrapy.Field()
    review_good_contents = scrapy.Field()

    # Negative reviews
    review_bad_titles = scrapy.Field()
    review_bad_contents = scrapy.Field()
--------------------------------------------------------------------------------
/Amazon/Amazon/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html


from scrapy import signals
from Amazon.settings import USER_AGENT


class AmazonSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class RandomUserAgent(object):
    """Attach the User-Agent header from settings to every request.

    Despite the name, this always sends the single USER_AGENT string;
    a randomizing variant is sketched after this file.
    """

    def process_request(self, request, spider):
        request.headers['User-Agent'] = USER_AGENT


class RandomProxyMiddleware(object):
    """Route requests through a proxy."""
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    def process_request(self, request, spider):
        request.meta['proxy'] = "http://10.122.141.184:16816"
--------------------------------------------------------------------------------
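Note on middlewares.py above: the README recommends rotating User-Agent strings, but RandomUserAgent always sends the single USER_AGENT value from settings. A minimal sketch of a randomizing variant (the class name is illustrative, and the list simply reuses the two User-Agent strings that already appear in this project):

import random


class RotatingUserAgentMiddleware(object):
    """Downloader middleware that picks a User-Agent at random per request."""

    # Illustrative list; supply your own strings.
    USER_AGENTS = [
        'Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00) '
        'AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 '
        'MQQBrowser/8.9 Mobile Safari/537.36',
        'Mozilla/5.0 (Linux; Android 7.0; SM-A520F Build/NRD90M; wv) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Version/4.0 Chrome/65.0.3325.109 Mobile Safari/537.36',
    ]

    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(self.USER_AGENTS)

To use it, register 'Amazon.middlewares.RotatingUserAgentMiddleware' in DOWNLOADER_MIDDLEWARES instead of (or alongside) RandomUserAgent.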
/Amazon/Amazon/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from Amazon.items import AmazonItem

from pymongo import MongoClient


class AmazonGoodsPipeline(object):
    """Save product details to MongoDB."""

    def open_spider(self, spider):
        self.client = MongoClient(host="127.0.0.1", port=27017)
        self.collection = self.client.Amazon.Pipa

    def process_item(self, item, spider):
        if isinstance(item, AmazonItem):
            _item = dict(item)
            _item['_id'] = _item['asin']
            try:
                # The ASIN is used as the primary key, so inserting a
                # duplicate raises an error, which is swallowed to skip it.
                self.collection.insert_one(_item)
            except Exception:
                pass

        return item
--------------------------------------------------------------------------------
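An alternative to the insert-and-ignore-duplicates approach in AmazonGoodsPipeline above is to express the dedup-by-ASIN behaviour as an upsert, so re-crawling a product updates the stored document instead of being silently dropped. A minimal sketch, assuming the same local MongoDB instance and Amazon.Pipa collection:

from pymongo import MongoClient

collection = MongoClient(host='127.0.0.1', port=27017).Amazon.Pipa


def save_product(product):
    # product is the dict(item) built in process_item, keyed by ASIN.
    collection.update_one(
        {'_id': product['asin']},
        {'$set': product},
        upsert=True,
    )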
/Amazon/Amazon/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

BOT_NAME = 'Amazon'

SPIDER_MODULES = ['Amazon.spiders']
NEWSPIDER_MODULE = 'Amazon.spiders'

USER_AGENT = "Mozilla/5.0 (Linux; U; Android 8.1.0; zh-cn; BLA-AL00 Build/HUAWEIBLA-AL00) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/57.0.2987.132 MQQBrowser/8.9 Mobile Safari/537.36"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 1

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,'
              '*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': USER_AGENT,
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'Amazon.middlewares.AmazonSpiderMiddleware': 543,
#     'Amazon.middlewares.RandomUserAgentMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'Amazon.middlewares.RandomUserAgent': 543,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'Amazon.pipelines.AmazonGoodsPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Cookies intended for Amazon requests; note that this dict is defined here
# but is not imported by any other module in this repository.
COOKIES = {
    'session-id-time': '2082787201l',
    'i18n-prefs': 'USD',
    'session-id': '142-9723185-7096359',
    'csm-hit': 'tb:s-TBNBSGGG2DTDACWFT2JG|1572104186431&t:1572104193942&adb:adblk_yes',
    'sp-cdn': '"L5Z9:CN"',
    'session-token': 'NtXSk4TNeLL1ywfKV+TvuhmxatgSa0yrUMVDxOzt0g6CAMeI6LkpgnQrcoU1asoE+pKF7ldrZnErq1dycNPGtszkRh03Wmo07Omhxs4OsROir2zQn4T5AtJAkn+RqVL8XB6izSJHsI0OWrp6to8bsr9AAw/4tLCFpEsnIh7nzYE0aDnZRQdyKCRbZbIxQTZg42jrFYHQH21c0ePPk9d0oC3feWEYOqh5KmCr5RWv8+xnCTX7kqpCELI9Qbsz1VKR',
    'ubid-main': '135-5055030-7258235',
    'x-wl-uid': '1iBt/JjYEoFYF+hGe2aCjWjyE0SGZ8B4QyX2KaTJl47LFamTRWYPbh4mcm/D2kLypor/oEsLBxqI',
    'lc-main': 'zh_CN',
}
--------------------------------------------------------------------------------
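Note on settings.py above: RandomProxyMiddleware (defined in middlewares.py) and the COOKIES dict are present but not wired into any request. A sketch of how they could be hooked up; the priority value 544 is illustrative, and the proxy URL hard-coded in middlewares.py is a placeholder you would replace with your own:

# settings.py: enable the proxy middleware alongside the User-Agent one.
DOWNLOADER_MIDDLEWARES = {
    'Amazon.middlewares.RandomUserAgent': 543,
    'Amazon.middlewares.RandomProxyMiddleware': 544,
}

# In the spider, the cookies have to be attached explicitly, e.g.:
#   from Amazon.settings import COOKIES
#   yield scrapy.Request(url, headers=self.headers, cookies=COOKIES,
#                        callback=self.parse)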
/Amazon/Amazon/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/Amazon/Amazon/spiders/amazon.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

import scrapy
import requests

from Amazon.items import AmazonItem
from Amazon.settings import DEFAULT_REQUEST_HEADERS

from bs4 import BeautifulSoup

BASE_URL = 'https://www.amazon.com'


class AmazonSpider(scrapy.Spider):
    name = 'amazon'
    allowed_domains = ['amazon.com']
    page = 1
    lost_item = 0
    keyword = 'Pipa'
    rh = 'n%3A11091801'
    # Note: this cookies dict is never attached to any request below.
    cookies = {
        "anonymid": "j7wsz80ibwp8x3",
        "_r01_": "1",
        "ln_uact": "mr_mao_hacker@163.com",
        "_de": "BF09EE3A28DED52E6B65F6A4705D973F1383380866D39FF5",
        "depovince": "GW",
        "jebecookies": "2fb888d1-e16c-4e95-9e59-66e4a6ce1eae|||||",
        "ick_login": "1c2c11f1-50ce-4f8c-83ef-c1e03ae47add",
        "p": "158304820d08f48402be01f0545f406d9",
        "first_login_flag": "1",
        "ln_hurl": "http://hdn.xnimg.cn/photos/hdn521/20180711/2125/main_SDYi_ae9c0000bf9e1986.jpg",
        "t": "adb2270257904fff59f082494aa7f27b9",
        "societyguester": "adb2270257904fff59f082494aa7f27b9",
        "id": "327550029",
        "xnsid": "4a536121",
        "loginfrom": "syshome",
        "wp_fold": "0"
    }

    headers = {
        'Host': 'www.amazon.com',
        'User-Agent': 'Mozilla/5.0 (Linux; Android 7.0; '
                      'SM-A520F Build/NRD90M; wv) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Version/4.0 '
                      'Chrome/65.0.3325.109 Mobile Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,'
                  'application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    }

    def start_requests(self):
        """
        start_requests is the spider's entry point. Override it to build the
        first batch of requests and, if needed, attach headers, cookies or
        dont_filter=True.
        """
        start_urls = [
            'https://www.amazon.com/s?k=' + self.keyword + '&page=' + str(
                self.page) + '&rh=' + self.rh,
            # 'https://www.amazon.com/s?k=' + keyword + '&page=' + str(page) + '&rh=n%3A1055398'
        ]

        for url in start_urls:
            yield scrapy.Request(url, headers=self.headers,
                                 callback=self.parse)

    def parse(self, response):
        url_list = response.xpath('//a[@title="status-badge"]/@href').extract()
        last = response.xpath('//li[@class="a-last"]').extract()

        product_url_list = [BASE_URL + x for x in url_list]

        # Stop if this is the last results page (or the page-5 cap is hit).
        if not last or self.page >= 5:
            print('Pagination finished. Current page: %s, products without '
                  'description/features: %s' % (self.page, self.lost_item))
            return

        for product_url in product_url_list:
            yield scrapy.Request(url=product_url,
                                 callback=self._get_product_details,
                                 headers=DEFAULT_REQUEST_HEADERS)
        self.page += 1
        yield scrapy.Request(
            url='https://www.amazon.com/s?k=' + self.keyword + '&page=' +
                str(self.page) + '&rh=' + self.rh + '&ref=is_pn_' +
                str(self.page - 1),
            callback=self.parse)

    def _get_product_details(self, response):
        # Work around Amazon's anti-crawl text in the response body.
        # The marker strings to strip are empty here, so these replace()
        # calls are currently no-ops; fill them in as needed.
        res_body = response.text
        _res = res_body.replace('', '').replace('', '')
        response = response.replace(body=_res)

        title = response.xpath('//span[@id="title"]/text()').extract_first()
        if not title:
            print('Your IP has been rate-limited by Amazon; '
                  'switch IP and retry.')
            return
        title = title.replace('\n', '')
        # Product image URL
        image_url = response.xpath(
            '//img[@data-fling-refmarker="detail_main_image_block"]/@data-midres-replacement').extract_first()  # noqa: E501
        # Unique product identifier (ASIN)
        asin = response.xpath(
            '//div[@id="cerberus-data-metrics"]/@data-asin').extract_first()
        # Price
        price = response.xpath(
            '//div[@id="cerberus-data-metrics"]/@data-asin-price').extract_first()  # noqa: E501
        # Description
        description = response.xpath(
            '//*[@id="productDescription_fullView"]').extract_first()
        if description:
            # Strip the HTML tags
            description = BeautifulSoup(description, 'lxml').get_text()
        # Feature bullets
        features = response.xpath(
            '//div[@id="feature-bullets"]//span[@class="a-list-item"]/text()') \
            .extract()

        # If neither a description nor features were extracted, drop the item
        if not description and not features:
            self.lost_item += 1
            print('No description and no features, skipping. '
                  '%s items filtered so far.' % self.lost_item)
            return

        item = AmazonItem()
        item['title'] = title
        item['asin'] = asin
        item['image_url'] = image_url
        item['url'] = response.url
        item['price'] = price
        item['description'] = description
        item['features'] = features

        # Save the product image locally
        try:
            self.save_image(image_url, asin)
        except Exception:
            print('Failed to download or save the image.')

        comments_url = 'https://www.amazon.com/kinery-Concentrator-Generator' \
                       '-Adjustable-Humidifiers/product-reviews/%s/ref=cm_cr' \
                       '_unknown?ie=UTF8&reviewerType=all_reviews&filterBy' \
                       'Star=five_star&pageNumber=1' % asin
        yield scrapy.Request(
            url=comments_url, callback=self._get_good_comments,
            meta={"item": item})

    def save_image(self, img_url, img_name):
        response = requests.get(img_url)
        # The response content is the raw image bytes
        img = response.content
        # Write the bytes to disk ('wb' = write binary); the path is
        # relative to the process working directory.
        path = '../images/%s.jpg' % img_name
        with open(path, 'wb') as f:
            f.write(img)

    def _get_good_comments(self, response):
        """Collect positive reviews: one page of five-star reviews only."""
        review_titles = response.xpath(
            '//span[@data-hook="review-title"]/span/text()').extract()
        review_contents = response.xpath(
            '//div[@aria-expanded="false"]/span/text()').extract()

        item = response.meta["item"]
        item["review_good_titles"] = review_titles
        item["review_good_contents"] = review_contents

        comments_url = 'https://www.amazon.com/kinery-Concentrator-' \
                       'Generator-Adjustable-Humidifiers/product-reviews/%s' \
                       '/ref=cm_cr_unknown?ie=UTF8&reviewerType=all_reviews' \
                       '&filterByStar=one_star&pageNumber=1' % item.get('asin')
        yield scrapy.Request(
            url=comments_url, callback=self._get_bad_comments,
            meta={"item": item})

    def _get_bad_comments(self, response):
        """Collect negative reviews: one page of one-star reviews only."""
        review_titles = response.xpath(
            '//span[@data-hook="review-title"]/span/text()').extract()
        review_contents = response.xpath(
            '//div[@aria-expanded="false"]/span/text()').extract()

        item = response.meta["item"]
        item["review_bad_titles"] = review_titles
        item["review_bad_contents"] = review_contents

        yield item
--------------------------------------------------------------------------------
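With the spider above in place, the usual way to start a crawl is `scrapy crawl amazon` from the Amazon/ directory that contains scrapy.cfg. A minimal sketch of driving it from a script instead (the file name run.py is illustrative; it assumes the script is executed from that same directory and that MongoDB is running locally for the pipeline):

# run.py -- execute from the directory containing scrapy.cfg
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from Amazon.spiders.amazon import AmazonSpider


if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    process.crawl(AmazonSpider)
    process.start()  # blocks until the crawl finishes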
/Amazon/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = Amazon.settings

[deploy]
#url = http://localhost:6800/
project = Amazon

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
BSD 2-Clause License

Copyright (c) 2018, YaCheng
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.

* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
==========================

# **scrapy-amazon** (Amazon crawler)

An Amazon crawler built on Scrapy.

Targets Python 3 by default; Python 2 is untested.
- Crawls the mobile version of Amazon by default
- Collects every product for a given Amazon search keyword
- Captured fields include product title, URL, image URL, ASIN, product description, reviews, and more
- Scraped data is stored in a MongoDB database

Additional features in the closed-source version:
- Crawl the Amazon site of a chosen country (US Amazon, Japan Amazon, etc.)
- Route requests through specified proxy IPs to reduce the chance of Amazon's Robot Check
- Save crawl and publishing logs to files for easy inspection
- Integration with Baidu, Youdao and Tencent translation APIs to spin content into a chosen language
- One-click cleaning, spinning and publishing of scraped data to WordPress (with featured images)
- Deduplicated publishing to reduce the risk of the site being penalized by search engines

Note: it is recommended to supply your own IP pool and rotate User-Agent strings to avoid being blocked.



Screenshot
=======

![Sample data](https://github.com/OFZFZS/scrapy-amazon/blob/master/scpture.jpg?raw=true)



Contact
-------

QQ: 1498066696 (replies may be slow); feel free to open an issue directly.
--------------------------------------------------------------------------------
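Since the README states that scraped data is stored in MongoDB, a quick way to inspect the result of a crawl, assuming the defaults used by AmazonGoodsPipeline (local MongoDB, database Amazon, collection Pipa):

from pymongo import MongoClient

collection = MongoClient('127.0.0.1', 27017).Amazon.Pipa

# Documents are keyed by ASIN (_id); show a few titles and prices.
for doc in collection.find({}, {'title': 1, 'price': 1}).limit(5):
    print(doc['_id'], doc.get('title'), doc.get('price'))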
/requirements.txt:
--------------------------------------------------------------------------------
beautifulsoup4==4.8.1
lxml==4.4.1
pymongo==3.9.0
requests  # used by the spider's save_image helper
Scrapy==1.7.3
--------------------------------------------------------------------------------
/scpture.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OFZFZS/scrapy-amazon/3befb32289ccd548fa74cead2e359be848b01fbc/scpture.jpg
--------------------------------------------------------------------------------