├── .gitignore ├── README.md ├── amazon ├── amazon │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── helper.cpython-36.pyc │ │ ├── items.cpython-36.pyc │ │ ├── settings.cpython-36.pyc │ │ └── sql.cpython-36.pyc │ ├── helper.py │ ├── items.py │ ├── main.py │ ├── middlewares │ │ ├── AmazonSpiderMiddleware.py │ │ ├── ProxyMiddleware.py │ │ ├── RotateUserAgentMiddleware.py │ │ └── __init__.py │ ├── mysqlpipelines │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ └── pipelines.cpython-36.pyc │ │ ├── pipelines.py │ │ └── sql.py │ ├── pipelines.py │ ├── proxy.json │ ├── settings-demo.py │ ├── settings.py │ ├── spiders │ │ ├── __init__.py │ │ ├── asin_spider.py │ │ ├── cate_spider.py │ │ ├── detail_spider.py │ │ ├── keyword_ranking_spider.py │ │ ├── proxy │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-36.pyc │ │ │ │ ├── fineproxy_spider.cpython-36.pyc │ │ │ │ └── kuaidaili_spider.cpython-36.pyc │ │ │ ├── fineproxy_spider.py │ │ │ ├── kuaidaili_spider.py │ │ │ └── privateproxy_spider.py │ │ ├── reivew_profile_spider.py │ │ ├── review_detail_spider.py │ │ └── sales_ranking_spider.py │ └── sql.py ├── db │ ├── ipricejot.sql │ └── py_salesranking_and_review.sql ├── requirements.txt └── scrapy.cfg └── amazon2 ├── __init__.py ├── amazon2 ├── __init__.py ├── items.py ├── middlewares │ ├── AmazonSpiderMiddleware.py │ ├── RotateUserAgentMiddleware.py │ └── __init__.py ├── pipelines.py ├── settings.py └── spiders │ ├── AmazonBaseSpider.py │ ├── DemoSpider.py │ └── __init__.py ├── requirements.txt └── scrapy.cfg /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | **/__pycache__ 3 | amazon/amazon/*.json 4 | /amazon/amazon/settings.py 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # amazon-scrapy 2 | Scrape the details and lowest prices of Amazon best-seller products with a Python spider 3 | 4 | Still under development, welcome to join! 5 | 6 | Some of this code has been applied to pricejot.com. 7 | https://www.pricejot.com/ 8 | 9 | 10 | 11 | ## How to install 12 | Install Python 3.6 13 | pip install -r requirements.txt 14 | 15 | If you are a contributor, 16 | we recommend installing pipreqs (pip install pipreqs) and 17 | running "pipreqs /path/to/project" whenever you add new packages to the code 18 | 19 | 20 | ## Source url 21 | * list page 22 | https://www.amazon.com/Best-Sellers/zgbs/ updated every two hours. 23 | * price detail page 24 | https://www.amazon.com/gp/offer-listing/B01FCTAEK4 25 | 26 | 27 | ## TODO 28 | 1. Scrape the reviews by ASIN https://www.amazon.com/Libratone-ONE-Click-Caribbean-Green/product-reviews/B00ZU33BBC 29 | 2. Scrape the details from the ASIN pool (50% done) 30 | 31 | 32 | ## Done 33 | 1. Scrape the level-1 categories from https://www.amazon.com/Best-Sellers/zgbs/ and store them in MySQL 34 | 2. Scrape the best-seller ASINs from https://www.amazon.com/Best-Sellers/zgbs/ and store them in MySQL 35 | 3. Scrape the keyword rankings by ASIN 36 | 37 | 38 | ## Reference documents 39 | * python https://bop.molun.net/ 40 | * scrapy https://docs.scrapy.org/en/latest/ 41 | 42 | ## About proxy service 43 | 1. We recommend swiftproxy https://www.swiftproxy.net/ 44 | 2. We fetch a random proxy list from around the world, e.g.: https://www.swiftproxy.net/api/proxy/get_proxy_ip?num=100&regions=GLOBAL&protocol=http&return_type=txt&lb=1&sb= 45 | 3. Note that you need to log in and set the whitelist IP (white_ip) before the proxy can be used.
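## How to run (example)

The spiders are launched one at a time with Scrapy's crawl command; amazon/amazon/main.py shows the Python equivalent. The snippet below is only an illustrative sketch: the spider names come from the classes under amazon/amazon/spiders/, the file name run_example.py is hypothetical, and the ASIN is the sample one from the TODO list above.

```python
# run_example.py (hypothetical), mirroring amazon/amazon/main.py.
# scrapy.cmdline.execute() starts the crawler and does not return, so run one
# spider per process. Run cate -> asin -> detail in that order, because each
# later spider reads what the previous one stored in MySQL.
from scrapy.cmdline import execute

execute("scrapy crawl cate".split())
# Other spider names: asin, detail, keyword_ranking, sales_ranking, or e.g.
# execute("scrapy crawl review_detail -a asin=B00ZU33BBC -a daily=1".split())
```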
46 | 47 | 48 | ## Contact us 49 | Email: huangdingzhi@foxmail.com 50 | Wechat: dzdzhuang 51 | 52 | Based on Python 3.6 53 | 54 | ## License 55 | 56 | The MIT License (http://opensource.org/licenses/MIT) 57 | 58 | Please feel free to use and contribute to the development. 59 | 60 | ## Contribution 61 | 62 | If you have any ideas or suggestions to improve amazon-scrapy, you are welcome to submit an issue or pull request. 63 | 64 | ## Backer 65 | > Your support is our greatest motivation to keep going. 66 | 67 | Donate 68 | Donate 69 | 70 | ### Thanks 71 | 72 | | Backer | Fee | 73 | | --- | --- | 74 | | Jet.Zhang | ¥18.88 | 75 | | Mike | ¥18.88 | 76 | | 飞龙 | ¥5 | 77 | 78 | -------------------------------------------------------------------------------- /amazon/amazon/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dynamohuang/amazon-scrapy/33afe5b482d3d55a065084289e20540ce6d99081/amazon/amazon/__init__.py -------------------------------------------------------------------------------- /amazon/amazon/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dynamohuang/amazon-scrapy/33afe5b482d3d55a065084289e20540ce6d99081/amazon/amazon/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /amazon/amazon/__pycache__/helper.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dynamohuang/amazon-scrapy/33afe5b482d3d55a065084289e20540ce6d99081/amazon/amazon/__pycache__/helper.cpython-36.pyc -------------------------------------------------------------------------------- /amazon/amazon/__pycache__/items.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dynamohuang/amazon-scrapy/33afe5b482d3d55a065084289e20540ce6d99081/amazon/amazon/__pycache__/items.cpython-36.pyc -------------------------------------------------------------------------------- /amazon/amazon/__pycache__/settings.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dynamohuang/amazon-scrapy/33afe5b482d3d55a065084289e20540ce6d99081/amazon/amazon/__pycache__/settings.cpython-36.pyc -------------------------------------------------------------------------------- /amazon/amazon/__pycache__/sql.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dynamohuang/amazon-scrapy/33afe5b482d3d55a065084289e20540ce6d99081/amazon/amazon/__pycache__/sql.cpython-36.pyc -------------------------------------------------------------------------------- /amazon/amazon/helper.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import re 4 | import pytz 5 | 6 | from math import ceil 7 | from random import Random 8 | 9 | from amazon import settings 10 | 11 | 12 | class Helper(object): 13 | tz = pytz.timezone(settings.TIMEZONE) 14 | 15 | @classmethod 16 | def get_num_split_comma(cls, value): 17 | # num = value.split(',') 18 | # ranknum = '' 19 | # if len(num) > 1: 20 | # for n in num: 21 | # ranknum += n 22 | # return ranknum 23 | # else: 24 | # return value 25 | 26 | return value.replace(',', '') 27 | 28 | @classmethod 29 | def 
get_star_split_str(cls, value): 30 | rate = value.split('out of 5 stars') # 分割字符串 31 | return rate[0].strip() 32 | 33 | @classmethod 34 | def get_date_split_str(cls, value): 35 | return value.split('on')[1].strip() 36 | 37 | @classmethod 38 | def convert_date_str(cls, date_str): 39 | return datetime.datetime.strptime(date_str, '%B %d, %Y') 40 | 41 | @classmethod 42 | def delay_forty_days(cls): 43 | now = datetime.datetime.now() 44 | delay14 = now + datetime.timedelta(days=-40) # 计算往前40天之后的时间 45 | return delay14 46 | 47 | @classmethod 48 | def get_rank_classify(cls, spider_str): 49 | result = re.match(r'#([0-9,]+)(?:.*)in (.*) \(.*[Ss]ee [Tt]op.*\)', spider_str) 50 | return result.groups() 51 | 52 | @classmethod 53 | def get_keyword_page_num(cls, rank): 54 | page_num = ceil(int(rank) / 16) 55 | return page_num 56 | 57 | @classmethod 58 | def get_keyword_page_range(cls, page_num): 59 | return range(page_num - 4 if page_num - 4 > 0 else 1, page_num + 4 if page_num + 4 <= 20 else 20) 60 | 61 | @classmethod 62 | def random_str(cls, randomlength): 63 | str = '' 64 | chars = 'AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz0123456789' 65 | length = len(chars) - 1 66 | random = Random() 67 | for i in range(randomlength): 68 | str += chars[random.randint(0, length)] 69 | return str 70 | 71 | @classmethod 72 | def get_now_date(cls): 73 | now = datetime.datetime.now(cls.tz).strftime('%Y-%m-%d %H:%M:%S') 74 | return now 75 | -------------------------------------------------------------------------------- /amazon/amazon/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | 12 | class CateItem(scrapy.Item): 13 | title = scrapy.Field() 14 | link = scrapy.Field() 15 | level = scrapy.Field() 16 | pid = scrapy.Field() 17 | pass 18 | 19 | class AsinBestItem(scrapy.Item): 20 | asin = scrapy.Field() 21 | cid = scrapy.Field() 22 | rank = scrapy.Field() 23 | pass 24 | 25 | class DetailItem(scrapy.Item): 26 | asin = scrapy.Field() 27 | image = scrapy.Field() 28 | title = scrapy.Field() 29 | star = scrapy.Field() 30 | reviews = scrapy.Field() 31 | seller_price = scrapy.Field() 32 | amazon_price = scrapy.Field() 33 | pass 34 | 35 | class ReviewProfileItem(scrapy.Item): 36 | asin = scrapy.Field() 37 | product = scrapy.Field() 38 | brand = scrapy.Field() 39 | seller = scrapy.Field() 40 | image = scrapy.Field() 41 | review_total = scrapy.Field() 42 | review_rate = scrapy.Field() 43 | pct_five = scrapy.Field() 44 | pct_four = scrapy.Field() 45 | pct_three = scrapy.Field() 46 | pct_two = scrapy.Field() 47 | pct_one = scrapy.Field() 48 | pass 49 | 50 | 51 | class ReviewDetailItem(scrapy.Item): 52 | asin = scrapy.Field() 53 | review_id = scrapy.Field() 54 | reviewer = scrapy.Field() 55 | review_url = scrapy.Field() 56 | star = scrapy.Field() 57 | date = scrapy.Field() 58 | title = scrapy.Field() 59 | content = scrapy.Field() 60 | pass 61 | 62 | 63 | class KeywordRankingItem(scrapy.Item): 64 | skwd_id = scrapy.Field() 65 | rank = scrapy.Field() 66 | date = scrapy.Field() 67 | 68 | 69 | class SalesRankingItem(scrapy.Item): 70 | rank = scrapy.Field() 71 | classify = scrapy.Field() 72 | asin = scrapy.Field() 73 | -------------------------------------------------------------------------------- /amazon/amazon/main.py: 
-------------------------------------------------------------------------------- 1 | from scrapy.cmdline import execute 2 | #Just run once 3 | execute("scrapy crawl detail ".split()) 4 | -------------------------------------------------------------------------------- /amazon/amazon/middlewares/AmazonSpiderMiddleware.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | from datetime import datetime 10 | 11 | 12 | class AmazonSpiderMiddleware(object): 13 | # Not all methods need to be defined. If a method is not defined, 14 | # scrapy acts as if the spider middleware does not modify the 15 | # passed objects. 16 | 17 | @classmethod 18 | def from_crawler(cls, crawler): 19 | # This method is used by Scrapy to create your spiders. 20 | s = cls() 21 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 22 | return s 23 | 24 | def process_spider_input(self, response, spider): 25 | # Called for each response that goes through the spider 26 | # middleware and into the spider. 27 | 28 | # Should return None or raise an exception. 29 | return None 30 | 31 | def process_spider_output(self, response, result, spider): 32 | # Called with the results returned from the Spider, after 33 | # it has processed the response. 34 | 35 | # Must return an iterable of Request, dict or Item objects. 36 | for i in result: 37 | yield i 38 | 39 | def process_spider_exception(self, response, exception, spider): 40 | # Called when a spider or process_spider_input() method 41 | # (from other spider middleware) raises an exception. 42 | 43 | # Should return either None or an iterable of Response, dict 44 | # or Item objects. 45 | pass 46 | 47 | def process_start_requests(self, start_requests, spider): 48 | # Called with the start requests of the spider, and works 49 | # similarly to the process_spider_output() method, except 50 | # that it doesn’t have a response associated. 51 | 52 | # Must return only requests (not items). 
53 | for r in start_requests: 54 | yield r 55 | 56 | def spider_opened(self, spider): 57 | spider.started_on = datetime.now() 58 | 59 | 60 | -------------------------------------------------------------------------------- /amazon/amazon/middlewares/ProxyMiddleware.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | import redis 4 | import time 5 | from amazon import settings 6 | 7 | 8 | class ProxyMiddleware(object): 9 | def __init__(self): 10 | with open('proxy.json', 'r') as f: 11 | self.proxies = json.load(f) 12 | self.r = redis.Redis(host=settings.REDIS_HOST, port=settings.REDIS_PORT, db=settings.REDIS_DB, 13 | password=settings.REDIS_PASSWORD) 14 | 15 | def process_request(self, request, spider): 16 | while True: 17 | proxy = random.choice(self.proxies) 18 | if self.proxyReady(proxy): 19 | request.meta['proxy'] = 'http://{}'.format(proxy) 20 | break 21 | 22 | def proxyReady(self, proxy): 23 | key = proxy + settings.BOT_NAME 24 | retult = self.r.exists(key) 25 | if retult: 26 | return False 27 | else: 28 | self.r.setex(key, 1, 15) 29 | return True 30 | -------------------------------------------------------------------------------- /amazon/amazon/middlewares/RotateUserAgentMiddleware.py: -------------------------------------------------------------------------------- 1 | import random 2 | from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware 3 | 4 | 5 | class RotateUserAgentMiddleware(UserAgentMiddleware): 6 | def __init__(self, user_agent=''): 7 | UserAgentMiddleware.__init__(self) 8 | self.user_agent = user_agent 9 | 10 | def process_request(self, request, spider): 11 | ua = random.choice(self.user_agent_list) 12 | if ua: 13 | # print(ua) 14 | request.headers.setdefault('User-Agent', ua) 15 | 16 | # the default user_agent_list composes chrome,I E,firefox,Mozilla,opera,netscape 17 | # for more user agent strings,you can find it in http://www.useragentstring.com/pages/useragentstring.php 18 | user_agent_list = [ 19 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1" \ 20 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", \ 21 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", \ 22 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", \ 23 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", \ 24 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", \ 25 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", \ 26 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \ 27 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \ 28 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \ 29 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", \ 30 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", \ 31 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \ 32 | "Mozilla/5.0 (Windows NT 
6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \ 33 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \ 34 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", \ 35 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", \ 36 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24" 37 | ] 38 | -------------------------------------------------------------------------------- /amazon/amazon/middlewares/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dynamohuang/amazon-scrapy/33afe5b482d3d55a065084289e20540ce6d99081/amazon/amazon/middlewares/__init__.py -------------------------------------------------------------------------------- /amazon/amazon/mysqlpipelines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dynamohuang/amazon-scrapy/33afe5b482d3d55a065084289e20540ce6d99081/amazon/amazon/mysqlpipelines/__init__.py -------------------------------------------------------------------------------- /amazon/amazon/mysqlpipelines/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dynamohuang/amazon-scrapy/33afe5b482d3d55a065084289e20540ce6d99081/amazon/amazon/mysqlpipelines/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /amazon/amazon/mysqlpipelines/__pycache__/pipelines.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dynamohuang/amazon-scrapy/33afe5b482d3d55a065084289e20540ce6d99081/amazon/amazon/mysqlpipelines/__pycache__/pipelines.cpython-36.pyc -------------------------------------------------------------------------------- /amazon/amazon/mysqlpipelines/pipelines.py: -------------------------------------------------------------------------------- 1 | from scrapy.exceptions import DropItem 2 | 3 | from amazon.helper import Helper 4 | from amazon.sql import ReviewSql, RankingSql 5 | from .sql import Sql 6 | from amazon.items import CateItem, ReviewProfileItem, ReviewDetailItem, SalesRankingItem, KeywordRankingItem 7 | from amazon.items import AsinBestItem 8 | from amazon.items import DetailItem 9 | 10 | class AmazonPipeline(object): 11 | def process_item(self,item,spider): 12 | if isinstance(item,CateItem): 13 | Sql.insert_cate_log(item) 14 | print('save category: '+ item['title']) 15 | pass 16 | 17 | if isinstance(item,AsinBestItem): 18 | Sql.cache_best_asin(item) 19 | print('save best seller: '+item['asin']) 20 | pass 21 | 22 | if isinstance(item, ReviewProfileItem): 23 | ReviewSql.insert_profile_item(item) 24 | return item 25 | 26 | if isinstance(item, ReviewDetailItem): 27 | delay_date = Helper.delay_forty_days() # 40天的截止时间 28 | item_date = Helper.convert_date_str(item['date']) 29 | if item_date < delay_date: # 判断是否过了40天限额,如果超出范围 则抛弃此item 30 | raise DropItem('the review_id:[%s] has been expired' % item['review_id']) 31 | else: 32 | item['review_url'] = 'https://www.amazon.com' + item['review_url'] 33 | item['date'] = item_date.strftime('%Y-%m-%d') 34 | ReviewSql.insert_detail_item(item) 35 | 36 | return item 37 | 38 | if isinstance(item, 
SalesRankingItem): 39 | RankingSql.insert_sales_ranking(item) 40 | return item 41 | 42 | if isinstance(item, KeywordRankingItem): 43 | RankingSql.insert_keyword_ranking(item) 44 | return item 45 | 46 | if isinstance(item, DetailItem): 47 | return item 48 | 49 | pass 50 | 51 | 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /amazon/amazon/mysqlpipelines/sql.py: -------------------------------------------------------------------------------- 1 | import pymysql 2 | from amazon import settings 3 | 4 | 5 | db = pymysql.connect(settings.MYSQL_HOST, settings.MYSQL_USER, settings.MYSQL_PASSWORD, settings.MYSQL_DB, charset=settings.MYSQL_CHARSET, cursorclass=pymysql.cursors.DictCursor) 6 | cursor = db.cursor() 7 | 8 | 9 | class Sql: 10 | 11 | asin_pool = [] 12 | 13 | @classmethod 14 | def insert_cate_log(cls, item): 15 | sql = "INSERT INTO py_cates (title,link,level,pid) VALUES ('%s', '%s','%d','%d')" % (item['title'],item['link'],item['level'],item['pid']) 16 | try: 17 | cursor.execute(sql) 18 | db.commit() 19 | except: 20 | db.rollback() 21 | pass 22 | 23 | @classmethod 24 | def clear_cate(cls, level): 25 | sql = "truncate table py_cates" 26 | try: 27 | cursor.execute(sql) 28 | db.commit() 29 | except: 30 | db.rollback() 31 | pass 32 | 33 | @classmethod 34 | def cache_best_asin(cls, item): 35 | cls.asin_pool.append((item['asin'], item['cid'], item['rank'])) 36 | pass 37 | 38 | @classmethod 39 | def store_best_asin(cls): 40 | sql_clear = "truncate table py_asin_best" 41 | sql = "INSERT INTO py_asin_best (asin,cid,rank) VALUES (%s, %s, %s)" 42 | try: 43 | cursor.execute(sql_clear) 44 | cursor.executemany(sql,cls.asin_pool) 45 | db.commit() 46 | except Exception as err: 47 | print(err) 48 | db.rollback() 49 | pass 50 | 51 | @classmethod 52 | def findall_cate_level1(cls): 53 | sql = "SELECT id,link FROM py_cates WHERE level < 2" 54 | cursor.execute(sql) 55 | return cursor.fetchall() 56 | 57 | @classmethod 58 | def findall_asin_level1(cls): 59 | sql = "SELECT distinct(asin), cid FROM py_asin_best limit 0,300" 60 | cursor.execute(sql) 61 | return cursor.fetchall() 62 | 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /amazon/amazon/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class AmazonPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | 13 | 14 | -------------------------------------------------------------------------------- /amazon/amazon/proxy.json: -------------------------------------------------------------------------------- 1 | ["121.40.199.105:80", "210.16.120.244:3128", "139.59.125.112:8080", "128.199.75.94:80", "128.199.138.78:8080", "139.59.243.186:443", "128.199.193.202:3128", "128.199.74.233:8080", "47.52.96.5:80", "128.199.191.123:3128", "139.59.117.11:3128", "54.158.134.115:80", "120.25.211.80:9999", "209.198.197.165:80", "120.24.208.42:9999", "195.222.68.87:3128", "180.254.219.129:80", "104.131.63.78:3128", "104.131.132.131:3128", "165.227.124.179:3128", "119.81.197.124:3128", "188.166.221.165:3128", "36.85.246.175:80", "94.181.196.171:5328", "125.162.153.208:80", "36.83.66.57:80", "139.59.21.143:3128", "139.59.125.112:80", "165.227.92.177:80", "167.114.221.238:3128", 
"94.23.157.1:8081", "149.202.195.236:443", "95.189.123.74:8080", "41.78.25.185:3128", "61.135.155.82:443", "124.47.7.45:80", "125.99.100.200:8080", "178.27.197.105:80", "190.153.210.237:80", "95.110.189.166:80", "91.93.135.47:80", "95.110.189.185:80", "82.165.151.230:80", "82.67.68.28:80", "177.4.173.242:80", "123.59.51.130:8080", "36.85.246.175:80", "196.43.197.25:80", "128.199.169.17:80", "40.114.14.173:80", "64.237.61.242:80", "31.14.40.113:3128", "104.155.189.170:80", "114.215.102.168:8081", "196.43.197.26:80", "31.47.198.61:80", "178.32.213.128:80", "142.4.214.9:88", "199.15.198.7:8080", "199.15.198.9:8080", "199.15.198.10:8080", "94.153.172.75:80", "125.162.153.208:80", "36.83.66.57:80", "104.207.147.8:3256", "64.34.21.84:80", "83.169.17.103:80", "203.74.4.0:80", "203.74.4.5:80", "203.74.4.2:80", "203.74.4.6:80", "203.74.4.3:80", "203.74.4.1:80", "203.74.4.7:80", "203.74.4.4:80", "120.77.255.133:8088", "114.215.103.121:8081", "139.196.104.28:9000", "183.240.87.229:8080", "61.153.67.110:9999", "212.83.164.85:80", "167.114.196.153:80", "212.184.12.11:80", "103.15.251.75:80", "193.108.38.23:80", "46.38.52.36:8081", "177.207.234.14:80", "193.70.3.144:80", "202.78.227.33:80", "61.5.207.102:80", "62.210.249.233:80", "88.198.39.58:80", "35.154.200.203:80", "107.170.214.74:80", "54.233.168.79:80", "54.158.134.115:80", "202.79.36.119:8080", "195.14.242.39:80", "185.141.164.8:8080", "180.254.225.18:80", "168.234.75.142:80", "120.199.64.163:8081", "78.134.212.173:80", "120.77.210.59:80", "120.25.211.80:9999", "121.40.199.105:80", "209.198.197.165:80", "122.192.66.50:808", "120.24.208.42:9999", "119.28.74.189:808", "195.222.68.87:3128", "180.254.219.129:80", "50.203.117.22:80", "209.141.61.84:80", "191.253.67.206:8080", "94.19.39.81:3128", "200.42.45.211:80", "52.41.94.5:80", "168.128.29.75:80", "130.0.24.28:8080", "196.43.197.27:80", "52.65.157.207:80", "57.100.3.252:808", "186.103.239.190:80", "181.221.5.145:80", "124.47.7.38:80", "124.172.191.51:80", "219.91.255.179:80", "193.205.4.176:80", "37.59.36.212:88", "86.102.106.150:8080", "62.210.51.150:80", "138.197.154.98:80", "189.84.213.2:80", "195.55.85.254:80", "91.121.171.104:8888", "82.224.48.173:80", "51.255.161.222:80", "185.44.69.44:3128", "213.108.201.82:80", "82.200.205.49:3128", "51.254.127.194:8081"] -------------------------------------------------------------------------------- /amazon/amazon/settings-demo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for amazon project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'amazon' 13 | 14 | SPIDER_MODULES = ['amazon.spiders'] 15 | NEWSPIDER_MODULE = 'amazon.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | SPIDER_MIDDLEWARES = { 50 | 'amazon.middlewares.AmazonSpiderMiddleware.AmazonSpiderMiddleware': 543, 51 | } 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'amazon.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | DOWNLOADER_MIDDLEWARES = { 59 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, 60 | 'amazon.middlewares.RotateUserAgentMiddleware.RotateUserAgentMiddleware': 543, 61 | 'amazon.middlewares.ProxyMiddleware.ProxyMiddleware': 542, 62 | } 63 | 64 | # Enable or disable extensions 65 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 66 | #EXTENSIONS = { 67 | # 'scrapy.extensions.telnet.TelnetConsole': None, 68 | #} 69 | 70 | # Configure item pipelines 71 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 72 | ITEM_PIPELINES = { 73 | #'amazon.pipelines.AmazonPipeline': 300, 74 | 'amazon.mysqlpipelines.pipelines.AmazonPipeline':1, 75 | } 76 | 77 | # Enable and configure the AutoThrottle extension (disabled by default) 78 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 79 | #AUTOTHROTTLE_ENABLED = True 80 | # The initial download delay 81 | #AUTOTHROTTLE_START_DELAY = 5 82 | # The maximum download delay to be set in case of high latencies 83 | #AUTOTHROTTLE_MAX_DELAY = 60 84 | # The average number of requests Scrapy should be sending in parallel to 85 | # each remote server 86 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 87 | # Enable showing throttling stats for every response received: 88 | #AUTOTHROTTLE_DEBUG = False 89 | 90 | # Enable and configure HTTP caching (disabled by default) 91 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 92 | 
#HTTPCACHE_ENABLED = True 93 | #HTTPCACHE_EXPIRATION_SECS = 0 94 | #HTTPCACHE_DIR = 'httpcache' 95 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 96 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 97 | #LOG_LEVEL = 'ERROR' 98 | #mysql 99 | MYSQL_HOST = '127.0.0.1' 100 | MYSQL_USER = 'dev' 101 | MYSQL_PASSWORD = '123456' 102 | MYSQL_PORT = 3306 103 | MYSQL_DB = 'pricejot_test' 104 | MYSQL_CHARSET = 'utf8mb4' 105 | 106 | MYSQL = { 107 | 'host': MYSQL_HOST, 108 | 'port': MYSQL_PORT, 109 | 'user': MYSQL_USER, 110 | 'password': MYSQL_PASSWORD, 111 | 'charset': MYSQL_CHARSET, 112 | 'database': MYSQL_DB 113 | } 114 | RETRY_TIMES = 30 115 | DOWNLOAD_TIMEOUT = 30 116 | FEED_EXPORT_ENCODING = 'utf-8' 117 | 118 | TIMEZONE = 'America/Los_Angeles' -------------------------------------------------------------------------------- /amazon/amazon/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for amazon project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'amazon' 13 | 14 | SPIDER_MODULES = ['amazon.spiders'] 15 | NEWSPIDER_MODULE = 'amazon.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | #DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 49 | SPIDER_MIDDLEWARES = { 50 | 'amazon.middlewares.AmazonSpiderMiddleware.AmazonSpiderMiddleware': 543, 51 | } 52 | 53 | # Enable or disable downloader middlewares 54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'amazon.middlewares.MyCustomDownloaderMiddleware': 543, 57 | #} 58 | DOWNLOADER_MIDDLEWARES = { 59 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, 60 | 'amazon.middlewares.RotateUserAgentMiddleware.RotateUserAgentMiddleware': 543, 61 | 'amazon.middlewares.ProxyMiddleware.ProxyMiddleware': 542, 62 | } 63 | 64 | # Enable or disable extensions 65 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 
66 | #EXTENSIONS = { 67 | # 'scrapy.extensions.telnet.TelnetConsole': None, 68 | #} 69 | 70 | # Configure item pipelines 71 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 72 | ITEM_PIPELINES = { 73 | #'amazon.pipelines.AmazonPipeline': 300, 74 | 'amazon.mysqlpipelines.pipelines.AmazonPipeline':1, 75 | } 76 | 77 | # Enable and configure the AutoThrottle extension (disabled by default) 78 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 79 | #AUTOTHROTTLE_ENABLED = True 80 | # The initial download delay 81 | #AUTOTHROTTLE_START_DELAY = 5 82 | # The maximum download delay to be set in case of high latencies 83 | #AUTOTHROTTLE_MAX_DELAY = 60 84 | # The average number of requests Scrapy should be sending in parallel to 85 | # each remote server 86 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 87 | # Enable showing throttling stats for every response received: 88 | #AUTOTHROTTLE_DEBUG = False 89 | 90 | # Enable and configure HTTP caching (disabled by default) 91 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 92 | #HTTPCACHE_ENABLED = True 93 | #HTTPCACHE_EXPIRATION_SECS = 0 94 | #HTTPCACHE_DIR = 'httpcache' 95 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 96 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 97 | #LOG_LEVEL = 'ERROR' 98 | #mysql 99 | MYSQL_HOST = '127.0.0.1' 100 | MYSQL_USER = 'dev' 101 | MYSQL_PASSWORD = '123456' 102 | MYSQL_PORT = 3306 103 | MYSQL_DB = 'pricejot_test' 104 | MYSQL_CHARSET = 'utf8mb4' 105 | 106 | MYSQL = { 107 | 'host': MYSQL_HOST, 108 | 'port': MYSQL_PORT, 109 | 'user': MYSQL_USER, 110 | 'password': MYSQL_PASSWORD, 111 | 'charset': MYSQL_CHARSET, 112 | 'database': MYSQL_DB 113 | } 114 | RETRY_TIMES = 30 115 | DOWNLOAD_TIMEOUT = 30 116 | FEED_EXPORT_ENCODING = 'utf-8' 117 | 118 | TIMEZONE = 'America/Los_Angeles' -------------------------------------------------------------------------------- /amazon/amazon/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /amazon/amazon/spiders/asin_spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | import json 3 | from amazon.items import AsinBestItem 4 | import pydispatch 5 | from scrapy import signals 6 | from datetime import datetime 7 | from amazon.mysqlpipelines.pipelines import Sql 8 | class AsinSpider(scrapy.Spider): 9 | name = "asin" 10 | custom_settings = { 11 | 'LOG_LEVEL': 'ERROR', 12 | 'LOG_ENABLED': True, 13 | 'LOG_STDOUT': True 14 | } 15 | 16 | def __init__(self): 17 | scrapy.Spider.__init__(self) 18 | pydispatch.dispatcher.connect(self.handle_spider_closed, signals.spider_closed) 19 | # all asin scrapied will store in the array 20 | self.asin_pool = [] 21 | 22 | def start_requests(self): 23 | cates = Sql.findall_cate_level1() 24 | for row in cates: 25 | row['link'] += '?ajax=1' 26 | yield scrapy.Request(url=row['link']+'&pg=1', callback=self.parse, meta={'cid': row['id'], 'page': 1, 'link': row['link']}) 27 | 28 | def parse(self, response): 29 | list = response.css('.zg_itemImmersion') 30 | 31 | # scrapy next page go go go ! 
32 | response.meta['page'] = response.meta['page'] +1 33 | if response.meta['page'] < 6: 34 | yield scrapy.Request(url=response.meta['link']+'&pg='+str(response.meta['page']), callback=self.parse, meta=response.meta) 35 | 36 | # yield the asin 37 | for row in list: 38 | try: 39 | info = row.css('.zg_itemWrapper')[0].css('div::attr(data-p13n-asin-metadata)')[0].extract() 40 | rank = int(float(row.css('.zg_rankNumber::text')[0].extract())) 41 | 42 | except: 43 | continue 44 | pass 45 | info = json.loads(info) 46 | item = AsinBestItem() 47 | item['asin'] = info['asin'] 48 | item['cid'] = response.meta['cid'] 49 | item['rank'] = rank 50 | yield item 51 | 52 | def handle_spider_closed(self, spider): 53 | Sql.store_best_asin() 54 | work_time = datetime.now() - spider.started_on 55 | print('total spent:', work_time) 56 | print('done') 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /amazon/amazon/spiders/cate_spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from amazon.items import CateItem 3 | from amazon.mysqlpipelines.sql import Sql 4 | 5 | class CateSpider(scrapy.Spider): 6 | name = "cate" 7 | custom_settings = { 8 | 'LOG_LEVEL': 'ERROR', 9 | 'LOG_ENABLED': True, 10 | 'LOG_STDOUT': True 11 | } 12 | level = 1 13 | 14 | def start_requests(self): 15 | 16 | urls = [ 17 | 'https://www.amazon.com/Best-Sellers/zgbs/', 18 | ] 19 | Sql.clear_cate(self.level) 20 | for url in urls: 21 | yield scrapy.Request(url=url, callback=self.parse, meta={'level': self.level}) 22 | 23 | def parse(self, response): 24 | 25 | if response.meta['level'] == 1: 26 | list = response.css('#zg_browseRoot ul')[0].css('li a') 27 | elif response.meta['level'] == 2: 28 | list = response.css('#zg_browseRoot ul')[0].css('ul')[0].css('li a') 29 | else: 30 | return 0 31 | item = CateItem() 32 | leve_cur = response.meta['level'] 33 | response.meta['level'] = response.meta['level'] + 1 34 | 35 | for one in list: 36 | item['title'] = one.css('::text')[0].extract() 37 | link = one.css('::attr(href)')[0].extract() 38 | item['link'] = link.split('ref=')[0] 39 | item['level'] = leve_cur 40 | item['pid'] = 1 41 | yield item 42 | if int(float(self.level)) > 1: 43 | yield scrapy.Request(url=item['link'], callback=self.parse, meta=response.meta) 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /amazon/amazon/spiders/detail_spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from amazon.items import DetailItem 3 | from amazon.mysqlpipelines.pipelines import Sql 4 | import pydispatch 5 | import re 6 | from amazon.helper import Helper 7 | from scrapy import signals 8 | from datetime import datetime 9 | 10 | 11 | 12 | class DetailSpider(scrapy.Spider): 13 | name = "detail" 14 | custom_settings = { 15 | 'LOG_LEVEL': 'ERROR', 16 | 'LOG_ENABLED': True, 17 | 'LOG_STDOUT': True 18 | } 19 | 20 | def __init__(self): 21 | scrapy.Spider.__init__(self) 22 | pydispatch.dispatcher.connect(self.handle_spider_closed, signals.spider_closed) 23 | # all asin scrapied will store in the array 24 | self.product_pool = {} 25 | self.log = [] 26 | self.products = [] 27 | 28 | def start_requests(self): 29 | self.products = Sql.findall_asin_level1() 30 | print(len(self.products)) 31 | for row in self.products: 32 | yield scrapy.Request( 33 | url='https://www.amazon.com/gp/offer-listing/' + row['asin'] + '/?f_new=true', 34 | 
callback=self.listing_parse, 35 | meta={ 36 | 'asin': row['asin'], 37 | 'cid': row['cid'], 38 | } 39 | ) 40 | 41 | def review_parse(self, response): 42 | item = self.fetch_detail_from_review_page(response) 43 | self.product_pool[item['asin']] = item 44 | yield item 45 | 46 | def listing_parse(self, response): 47 | print(response.status) 48 | 49 | if not response.css('#olpProductImage'): 50 | yield scrapy.Request( 51 | url='https://www.amazon.com/product-reviews/' + response.meta['asin'], 52 | callback=self.review_parse, 53 | meta={'asin': response.meta['asin'], 'cid': response.meta['cid']} 54 | ) 55 | return 56 | try: 57 | item = self.fetch_detail_from_listing_page(response) 58 | self.product_pool[item['asin']] = item 59 | except Exception as err: 60 | print(err) 61 | print(response.meta['asin']) 62 | yield item 63 | 64 | def handle_spider_closed(self, spider): 65 | work_time = datetime.now() - spider.started_on 66 | print('total spent:', work_time) 67 | print(len(self.product_pool), 'item fetched') 68 | print(self.product_pool) 69 | print('done') 70 | print(self.log) 71 | 72 | 73 | 74 | 75 | def fetch_detail_from_listing_page(self, response): 76 | item = DetailItem() 77 | item['asin'] = response.meta['asin'] 78 | item['image'] = response.css('#olpProductImage img::attr(src)')[0].extract().strip().replace('_SS160', '_SS320') 79 | item['title'] = response.css('title::text')[0].extract().split(':')[2].strip() 80 | 81 | try: 82 | item['star'] = response.css('.a-icon-star span::text')[0].extract().split(' ')[0].strip() 83 | except: 84 | item['star'] = 0 85 | try: 86 | item['reviews'] = response.css('.a-size-small > .a-link-normal::text')[0].extract().strip().split(' ')[0] 87 | except: 88 | item['reviews'] = 0 89 | 90 | price_info_list = response.css(".olpOffer[role=\"row\"] ") 91 | item['amazon_price'] = 0 92 | item['seller_price'] = 0 93 | for row in price_info_list: 94 | if (item['amazon_price'] == 0) and row.css(".olpSellerName > img"): 95 | try: 96 | item['amazon_price'] = row.css('.olpOfferPrice::text')[0].extract().strip().lstrip('$') 97 | except: 98 | item['amazon_price'] = 0 99 | continue 100 | if (item['seller_price'] == 0) and (not row.css(".olpSellerName > img")): 101 | try: 102 | item['seller_price'] = row.css('.olpOfferPrice::text')[0].extract().strip().lstrip('$') 103 | except: 104 | item['seller_price'] = 0 105 | return item 106 | 107 | def fetch_detail_from_review_page(self, response): 108 | 109 | 110 | info = response.css('#cm_cr-product_info')[0].extract() 111 | item = DetailItem() 112 | item['asin'] = response.meta['asin'] 113 | item['image'] = response.css('.product-image img::attr(src)')[0].extract().strip().replace('S60', 'S320') 114 | item['title'] = response.css('.product-title >h1>a::text')[0].extract().strip() 115 | item['star'] = re.findall("([0-9].[0-9]) out of", info)[0] 116 | 117 | # 获取评价总数 118 | item['reviews'] = response.css('.AverageCustomerReviews .totalReviewCount::text')[0].extract().strip() 119 | item['reviews'] = Helper.get_num_split_comma(item['reviews']) 120 | item['seller_price'] = 0 121 | item['amazon_price'] = 0 122 | price = response.css('.arp-price::text')[0].extract().strip().lstrip('$') 123 | item['amazon_price'] = price 124 | return item 125 | -------------------------------------------------------------------------------- /amazon/amazon/spiders/keyword_ranking_spider.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import scrapy 4 | from pydispatch import dispatcher 5 | from scrapy 
import signals 6 | from amazon.helper import Helper 7 | from amazon.items import KeywordRankingItem 8 | from amazon.sql import RankingSql 9 | 10 | 11 | class KeywordRankingSpider(scrapy.Spider): 12 | name = 'keyword_ranking' 13 | custom_settings = { 14 | 'LOG_LEVEL': 'ERROR', 15 | 'LOG_FILE': 'keyword_ranking.json', 16 | 'LOG_ENABLED': True, 17 | 'LOG_STDOUT': True, 18 | 'CONCURRENT_REQUESTS_PER_DOMAIN': 30 19 | } 20 | 21 | def __init__(self, *args, **kwargs): 22 | super().__init__(*args, **kwargs) 23 | self.items = {} 24 | self.found = {} 25 | self.keyword_pool = {} 26 | self.store_poll = {} 27 | self.store_date = {} 28 | dispatcher.connect(self.init_scrapy, signals.engine_started) 29 | dispatcher.connect(self.close_scrapy, signals.engine_stopped) 30 | 31 | def start_requests(self): 32 | for keyword, poll in self.keyword_pool.items(): 33 | yield scrapy.Request(('https://www.amazon.com/s/?field-keywords=%s&t=' + Helper.random_str(10)) % keyword, 34 | self.load_first_page, meta={'items': poll}) 35 | 36 | def parse(self, response): 37 | result_li = response.xpath('//li[@data-asin]') 38 | for item in response.meta['items']: 39 | if len(result_li) == 0: 40 | self.found[item['id']] = 'none' 41 | logging.warning("[keyword none] url: [%s] skwd_id:[%s] asin:[%s] \r\n body: %s" % (response.url, item['id'],item['asin'], response.body)) 42 | else: 43 | for result in result_li: 44 | data_asin = result.xpath('./@data-asin').extract()[0] 45 | if data_asin == item['asin']: 46 | # print(item) 47 | self.found[item['id']] = True 48 | # keywordItem = KeywordRankingItem() 49 | data_id = result.xpath('./@id').extract()[0] 50 | item_id = data_id.split('_')[1] 51 | rank = int(item_id) +1 52 | if item['id'] in self.store_poll.keys(): 53 | self.store_poll[item['id']].append(rank) 54 | else: 55 | self.store_poll[item['id']] = [rank] 56 | self.store_date[item['id']] = Helper.get_now_date() 57 | break 58 | 59 | def load_first_page(self, response): 60 | page = response.css('#bottomBar span.pagnDisabled::text').extract() 61 | page = 1 if len(page) == 0 else int(page[0]) 62 | page_num = 1 63 | while page_num <= page: 64 | # yield scrapy.Request(response.url + '&page=%s' % page_num, self.parse, meta={'asin': response.meta['item']['asin'], 65 | # 'skwd_id': response.meta['item']['id']}) 66 | yield scrapy.Request(response.url + '&page=%s' % page_num, self.parse, meta={'items': response.meta['items']}) 67 | page_num += 1 68 | 69 | def init_scrapy(self): 70 | self.items = RankingSql.fetch_keywords_ranking() 71 | for item in self.items: 72 | if item['keyword'] in self.keyword_pool.keys(): 73 | self.keyword_pool[item['keyword']].append({'id': item['id'], 'asin': item['asin']}) 74 | else: 75 | self.keyword_pool[item['keyword']] = [{'id': item['id'], 'asin': item['asin']}] 76 | 77 | self.found = {item['id']: False for item in self.items} 78 | 79 | def close_scrapy(self): 80 | for skwd_id, is_found in self.found.items(): 81 | if is_found is not True: 82 | if is_found == 'none': 83 | # RankingSql.update_keywords_none_rank(skwd_id) 84 | logging.info('[keyword none] skwd_id:[%s]' % skwd_id) 85 | else: 86 | RankingSql.update_keywords_expire_rank(skwd_id) 87 | else: 88 | keywordrank = KeywordRankingItem() 89 | keywordrank['skwd_id'] = skwd_id 90 | keywordrank['rank'] = min(self.store_poll[skwd_id]) 91 | keywordrank['date'] = self.store_date[skwd_id] 92 | RankingSql.insert_keyword_ranking(keywordrank) 93 | 94 | 95 | -------------------------------------------------------------------------------- 
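The keyword ranking spider above can see the same keyword/ASIN pair on several search result pages, so it pools every rank it finds and stores only the best one when the crawl closes. A minimal, self-contained sketch of that aggregation step (the dictionaries and values below are hypothetical stand-ins for the spider's store_poll / store_date state):

from math import ceil

# Hypothetical pooled results: skwd_id -> all ranks seen across the search pages.
store_poll = {101: [35, 18, 3], 102: [7]}
store_date = {101: '2017-09-01 10:00:00', 102: '2017-09-01 10:00:00'}

for skwd_id, ranks in store_poll.items():
    best_rank = min(ranks)          # same rule as close_scrapy: keep the lowest rank found
    page = ceil(best_rank / 16)     # Helper.get_keyword_page_num: roughly 16 results per page
    print(skwd_id, best_rank, page, store_date[skwd_id])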
/amazon/amazon/spiders/proxy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dynamohuang/amazon-scrapy/33afe5b482d3d55a065084289e20540ce6d99081/amazon/amazon/spiders/proxy/__init__.py -------------------------------------------------------------------------------- /amazon/amazon/spiders/proxy/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dynamohuang/amazon-scrapy/33afe5b482d3d55a065084289e20540ce6d99081/amazon/amazon/spiders/proxy/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /amazon/amazon/spiders/proxy/__pycache__/fineproxy_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dynamohuang/amazon-scrapy/33afe5b482d3d55a065084289e20540ce6d99081/amazon/amazon/spiders/proxy/__pycache__/fineproxy_spider.cpython-36.pyc -------------------------------------------------------------------------------- /amazon/amazon/spiders/proxy/__pycache__/kuaidaili_spider.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dynamohuang/amazon-scrapy/33afe5b482d3d55a065084289e20540ce6d99081/amazon/amazon/spiders/proxy/__pycache__/kuaidaili_spider.cpython-36.pyc -------------------------------------------------------------------------------- /amazon/amazon/spiders/proxy/fineproxy_spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | import re 3 | import json 4 | 5 | class FineproxySpider(scrapy.Spider): 6 | name = "fineproxy" 7 | custom_settings = { 8 | 'LOG_LEVEL': 'ERROR', 9 | 'LOG_ENABLED': True, 10 | 'LOG_STDOUT': True 11 | } 12 | 13 | 14 | def start_requests(self): 15 | url = "http://fineproxy.org/eng/fresh-proxies/" 16 | yield scrapy.Request(url=url, callback=self.parse, meta={}) 17 | 18 | def parse(self, response): 19 | pattern = "Fast proxies: (.*)Other fresh and working proxies:" 20 | tmp = re.findall(pattern, response.text)[0] 21 | proxy = re.findall("([0-9]{1,4}.[0-9]{1,4}.[0-9]{1,4}.[0-9]{1,4}:[0-9]{1,4})", tmp) 22 | with open('proxy.json', 'w') as f: 23 | json.dump(proxy, f) 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /amazon/amazon/spiders/proxy/kuaidaili_spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | import re 3 | import json 4 | 5 | class KuaidailiSpider(scrapy.Spider): 6 | name = "kuaidaili" 7 | # custom_settings = { 8 | # 'LOG_LEVEL': 'ERROR', 9 | # 'LOG_ENABLED': True, 10 | # 'LOG_STDOUT': True 11 | # } 12 | 13 | 14 | def start_requests(self): 15 | 16 | self.headers = { 17 | 'Host': 'www.kuaidaili.com', 18 | 'Upgrade-Insecure-Requests': '1', 19 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:52.0) Gecko/20100101 Firefox/52.0', 20 | } 21 | 22 | url = "http://www.kuaidaili.com/free/inha/" 23 | yield scrapy.Request(url=url, callback=self.parse, meta={}) 24 | 25 | def parse(self, response): 26 | print(response.status) 27 | print('3333') 28 | print(response.css('.center tr').re('td')) 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /amazon/amazon/spiders/proxy/privateproxy_spider.py: 
-------------------------------------------------------------------------------- 1 | import scrapy 2 | import json 3 | import pymysql 4 | from amazon import settings 5 | 6 | class privateproxySpider(scrapy.Spider): 7 | name = "privateproxy" 8 | custom_settings = { 9 | 'LOG_LEVEL': 'ERROR', 10 | 'LOG_ENABLED': True, 11 | 'LOG_STDOUT': True 12 | } 13 | 14 | def start_requests(self): 15 | url = "http://www.qq.com" 16 | db = pymysql.connect(settings.MYSQL_HOST, settings.MYSQL_USER, settings.MYSQL_PASSWORD, settings.MYSQL_DB, charset=settings.MYSQL_CHARSET, cursorclass=pymysql.cursors.DictCursor) 17 | cursor = db.cursor() 18 | 19 | sql = "SELECT CONCAT_WS(':', ip, port) AS proxy FROM proxy where work = 1" 20 | cursor.execute(sql) 21 | 22 | proxy_array = [] 23 | proxy_list = cursor.fetchall() 24 | for item in proxy_list: 25 | proxy_array.append(item['proxy']) 26 | 27 | with open('proxy.json', 'w') as f: 28 | json.dump(proxy_array, f) 29 | yield scrapy.Request(url=url, callback=self.parse, meta={}) 30 | 31 | def parse(self, response): 32 | print('proxy update done') 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /amazon/amazon/spiders/reivew_profile_spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | from amazon.helper import Helper 4 | from amazon.items import ReviewProfileItem 5 | 6 | 7 | class ProfileSpider(scrapy.Spider): 8 | name = 'review_profile' 9 | custom_settings = { 10 | 'LOG_LEVEL': 'ERROR', 11 | 'LOG_FILE': 'profile.json', 12 | 'LOG_ENABLED': True, 13 | 'LOG_STDOUT': True 14 | } 15 | 16 | def __init__(self, asin, *args, **kwargs): 17 | super().__init__(*args, **kwargs) 18 | self.asin = asin 19 | 20 | def start_requests(self): 21 | yield scrapy.Request('https://www.amazon.com/product-reviews/%s' % self.asin, callback=self.parse) 22 | 23 | def parse(self, response): 24 | item = ReviewProfileItem() 25 | 26 | item['asin'] = response.meta['asin'] if 'asin' in response.meta else self.asin 27 | # 获取平均评价数值 28 | average = response.css('.averageStarRatingNumerical a span::text').extract() # 获取平均评价值 29 | item['review_rate'] = Helper.get_star_split_str(average[0]) # 获取平均值 30 | # 获取评价总数 31 | total = response.css('.AverageCustomerReviews .totalReviewCount::text').extract() # 获取评价总数 32 | item['review_total'] = Helper.get_num_split_comma(total[0]) 33 | # 获取产品名称 34 | product = response.css('.product-title h1 a::text').extract() 35 | item['product'] = product[0] 36 | # 获取产品 brand 37 | item['brand'] = response.css('.product-by-line a::text').extract()[0] 38 | item['image'] = response.css('.product-image img::attr(src)').extract()[0] 39 | 40 | # 获取产品商家 41 | item['seller'] = item['brand'] 42 | # 获取各星评价百分比数 43 | review_summary = response.css('.reviewNumericalSummary .histogram ' 44 | '#histogramTable tr td:last-child').re(r'\d{1,3}\%') 45 | 46 | pct = list(map(lambda x: x[0:-1], review_summary)) 47 | 48 | item['pct_five'] = pct[0] 49 | item['pct_four'] = pct[1] 50 | item['pct_three'] = pct[2] 51 | item['pct_two'] = pct[3] 52 | item['pct_one'] = pct[4] 53 | 54 | yield item 55 | -------------------------------------------------------------------------------- /amazon/amazon/spiders/review_detail_spider.py: -------------------------------------------------------------------------------- 1 | import math 2 | import subprocess 3 | 4 | import scrapy 5 | from pydispatch import dispatcher 6 | from scrapy import signals 7 | 8 | from amazon.helper import Helper 9 | from amazon.items import 
ReviewDetailItem, ReviewProfileItem 10 | from amazon.sql import ReviewSql 11 | 12 | 13 | class ReviewSpider(scrapy.Spider): 14 | name = 'review_detail' 15 | custom_settings = { 16 | 'LOG_LEVEL': 'ERROR', 17 | 'LOG_FILE': 'review_detail.json', 18 | 'LOG_ENABLED': True, 19 | 'LOG_STDOUT': True 20 | } 21 | 22 | def __init__(self, asin, daily=0, *args, **kwargs): 23 | super().__init__(*args, **kwargs) 24 | self.asin = asin 25 | self.last_review = 0 26 | self.profile_update_self = False # profile自动计数更新 27 | self.updated = False # profile是否更新过 28 | self.daily = True if int(daily) == 1 else False # 判断是否是每日更新 29 | self.start_urls = [ 30 | 'https://www.amazon.com/product-reviews/%s?sortBy=recent&filterByStar=three_star' % self.asin, 31 | 'https://www.amazon.com/product-reviews/%s?sortBy=recent&filterByStar=two_star' % self.asin, 32 | 'https://www.amazon.com/product-reviews/%s?sortBy=recent&filterByStar=one_star' % self.asin 33 | ] 34 | dispatcher.connect(self.update_profile_self, signals.engine_stopped) 35 | dispatcher.connect(self.init_profile, signals.engine_started) 36 | 37 | def start_requests(self): 38 | self.load_profile() 39 | for url in self.start_urls: 40 | yield scrapy.Request(url, callback=self.get_detail) 41 | 42 | def parse(self, response): 43 | reviews = response.css('.review-views .review') 44 | for row in reviews: 45 | item = ReviewDetailItem() 46 | item['asin'] = self.asin 47 | item['review_id'] = row.css('div::attr(id)')[0].extract() 48 | item['reviewer'] = row.css('.author::text')[0].extract() 49 | item['title'] = row.css('.review-title::text')[0].extract() 50 | item['review_url'] = row.css('.review-title::attr(href)')[0].extract() 51 | item['date'] = Helper.get_date_split_str(row.css('.review-date::text')[0].extract()) 52 | item['star'] = Helper.get_star_split_str(row.css('.review-rating span::text')[0].extract()) 53 | content = row.css('.review-data .review-text::text').extract() 54 | item['content'] = '
'.join(content) if len(content) > 0 else '' 55 | yield item 56 | 57 | def get_detail(self, response): 58 | # 获取页面数 59 | page = response.css('ul.a-pagination li a::text') 60 | 61 | i = 1 62 | # 获取评价总数 63 | total = response.css('.AverageCustomerReviews .totalReviewCount::text').extract() # 获取评价总数 64 | now_total = Helper.get_num_split_comma(total[0]) 65 | last_review = self.last_review 66 | sub_total = int(now_total) - int(last_review) 67 | if sub_total != 0: 68 | # if sub_total != 0: # 若计算出的页数 不为0 则说明有新的评论,更新profile 69 | self.updated = True 70 | yield scrapy.Request('https://www.amazon.com/product-reviews/%s' % self.asin, 71 | callback=self.profile_parse) 72 | if len(page) < 3: # 若找到的a标签总数小于3 说明没有page组件 只有1页数据 73 | yield scrapy.Request(url=response.url + '&pageNumber=1', callback=self.parse) 74 | else: 75 | if self.daily: 76 | page_num = math.ceil(sub_total / 10) 77 | print('update item page_num is %s' % page_num) 78 | else: 79 | self.profile_update_self = True 80 | page_num = Helper.get_num_split_comma(page[len(page) - 3].extract()) # 获得总页数 81 | while i <= int(page_num): 82 | yield scrapy.Request(url=response.url + '&pageNumber=%s' % i, 83 | callback=self.parse) 84 | i = i + 1 85 | else: 86 | print('there is no item to update') 87 | 88 | def profile_parse(self, response): 89 | item = ReviewProfileItem() 90 | 91 | item['asin'] = self.asin 92 | # 获取平均评价数值 93 | average = response.css('.averageStarRatingNumerical a span::text').extract() # 获取平均评价值 94 | item['review_rate'] = Helper.get_star_split_str(average[0]) # 获取平均值 95 | # 获取评价总数 96 | total = response.css('.AverageCustomerReviews .totalReviewCount::text').extract() # 获取评价总数 97 | item['review_total'] = Helper.get_num_split_comma(total[0]) 98 | # 获取产品名称 99 | product = response.css('.product-title h1 a::text').extract() 100 | item['product'] = product[0] 101 | # 获取产品 brand 102 | item['brand'] = response.css('.product-by-line a::text').extract()[0] 103 | item['image'] = response.css('.product-image img::attr(src)').extract()[0] 104 | 105 | # 获取产品商家 106 | item['seller'] = item['brand'] 107 | # 获取各星评价百分比数 108 | review_summary = response.css('.reviewNumericalSummary .histogram ' 109 | '#histogramTable tr td:last-child').re(r'\d{1,3}\%') 110 | 111 | pct = list(map(lambda x: x[0:-1], review_summary)) 112 | 113 | item['pct_five'] = pct[0] 114 | item['pct_four'] = pct[1] 115 | item['pct_three'] = pct[2] 116 | item['pct_two'] = pct[3] 117 | item['pct_one'] = pct[4] 118 | 119 | yield item 120 | 121 | def load_profile(self): 122 | # 若没有profile记录 则抓取新的profile 录入数据库 123 | if self.last_review is False: 124 | self.profile_update_self = True 125 | print('this asin profile is not exist, now to get the profile of asin:', self.asin) 126 | yield scrapy.Request('https://www.amazon.com/product-reviews/%s' % self.asin, 127 | callback=self.profile_parse) 128 | self.last_review = ReviewSql.get_last_review_total(self.asin) 129 | 130 | # scrapy 完成后 加载,如果是没有记录的profile 初次insert lastest_review为0 将所有多余的数据跑完后 防止第二次重复跑取,将latest_total更新 131 | def update_profile_self(self): 132 | if self.profile_update_self is True and self.updated is False: 133 | # 若需要自主更新 并且 未更新状态 134 | ReviewSql.update_profile_self(self.asin) 135 | 136 | # scrapy 开始前加载,获取目前asin的latest_review 137 | def init_profile(self): 138 | self.last_review = ReviewSql.get_last_review_total(self.asin) 139 | -------------------------------------------------------------------------------- /amazon/amazon/spiders/sales_ranking_spider.py: -------------------------------------------------------------------------------- 1 | from 
datetime import datetime 2 | 3 | import scrapy 4 | 5 | from pydispatch import dispatcher 6 | from scrapy import signals 7 | 8 | from amazon.helper import Helper 9 | from amazon.items import SalesRankingItem 10 | from amazon.sql import RankingSql 11 | 12 | 13 | class SalesRankingSpider(scrapy.Spider): 14 | name = 'sales_ranking' 15 | custom_settings = { 16 | 'LOG_LEVEL': 'ERROR', 17 | 'LOG_FILE': 'sales_ranking.json', 18 | 'LOG_ENABLED': True, 19 | 'LOG_STDOUT': True 20 | } 21 | 22 | def __init__(self, **kwargs): 23 | super().__init__(**kwargs) 24 | self.items = [] 25 | dispatcher.connect(self.load_asin, signals.engine_started) 26 | 27 | def start_requests(self): 28 | for item in self.items: 29 | yield scrapy.Request('https://www.amazon.com/dp/%s' % item['asin'], self.parse, meta={'item': item}) 30 | 31 | def parse(self, response): 32 | product_detail = response.xpath('//div/table').re(r'#[0-9,]+(?:.*)in.*\(.*[Ss]ee [Tt]op.*\)') 33 | if len(product_detail) == 0: 34 | product_detail = response.css('div #SalesRank').re(r'#[0-9,]+(?:.*)in.*\(.*[Ss]ee [Tt]op.*\)') 35 | if len(product_detail) != 0: 36 | item = SalesRankingItem() 37 | key_rank_str = product_detail[0] 38 | key_rank_tuple = Helper.get_rank_classify(key_rank_str) 39 | item['rank'] = Helper.get_num_split_comma(key_rank_tuple[0]) 40 | item['classify'] = key_rank_tuple[1] 41 | item['asin'] = response.meta['item']['asin'] 42 | yield item 43 | else: 44 | raise Exception('catch asin[%s] sales ranking error' % response.meta['item']['asin']) 45 | 46 | def load_asin(self): 47 | self.items = RankingSql.fetch_sales_ranking() 48 | -------------------------------------------------------------------------------- /amazon/amazon/sql.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | import pymysql 4 | import pytz 5 | 6 | from amazon import settings 7 | 8 | 9 | def conn_db(): 10 | db_conf = settings.MYSQL 11 | db_conf['cursorclass'] = pymysql.cursors.DictCursor 12 | conn = pymysql.connect(**db_conf) 13 | conn.autocommit(1) 14 | return conn 15 | 16 | 17 | def cursor_db(conn): 18 | return conn.cursor() 19 | 20 | 21 | class ReviewSql(object): 22 | conn = conn_db() 23 | cursor = cursor_db(conn) 24 | 25 | 26 | @classmethod 27 | def insert_profile_item(cls, item): 28 | sql = "INSERT INTO `py_review_profile`" \ 29 | "(`asin`, `product`, `brand`, `seller`, `image`," \ 30 | "`review_total`, `review_rate`, `pct_five`, `pct_four`, `pct_three`, " \ 31 | "`pct_two`, `pct_one`, `latest_total`) " \ 32 | "VALUES ('%s', %s, %s, %s, '%s', '%s', " \ 33 | "'%s', '%s', '%s', '%s', '%s', '%s', 0)" %\ 34 | (item['asin'], cls.conn.escape(item['product']), cls.conn.escape(item['brand']), cls.conn.escape(item['seller']), item['image'], 35 | item['review_total'], item['review_rate'], item['pct_five'], item['pct_four'], 36 | item['pct_three'], item['pct_two'], item['pct_one']) 37 | try: 38 | if cls.check_exist_profile(item['asin']): 39 | cls.update_profile_item(item) 40 | print('update review profile--[asin]:', item['asin']) 41 | else: 42 | cls.cursor.execute(sql) 43 | cls.conn.commit() 44 | print('save review profile--[asin]:', item['asin']) 45 | except pymysql.MySQLError as e: 46 | with open('sql.log', 'a') as i: 47 | i.write('profile sql error![error]:' + str(e) + '\n') 48 | print(e) 49 | cls.conn.rollback() 50 | pass 51 | 52 | @classmethod 53 | def update_profile_item(cls, item): 54 | sql = "UPDATE `py_review_profile` SET `latest_total`=`review_total`,`product`=%s, `brand`=%s, `seller`=%s, `image`=%s, 
`review_total`='%s', `review_rate`='%s'," \ 55 | "`pct_five`='%s', `pct_four`='%s', `pct_three`='%s', `pct_two`='%s', `pct_one`='%s' " \ 56 | "WHERE `asin`='%s'" % \ 57 | (cls.conn.escape(item['product']), cls.conn.escape(item['brand']), cls.conn.escape(item['seller']), cls.conn.escape(item['image']), 58 | item['review_total'], item['review_rate'], item['pct_five'], item['pct_four'], item['pct_three'], item['pct_two'], 59 | item['pct_one'], item['asin']) 60 | try: 61 | cls.cursor.execute(sql) 62 | cls.conn.commit() 63 | except pymysql.MySQLError as e: 64 | print(e) 65 | cls.conn.rollback() 66 | 67 | @classmethod 68 | def check_exist_profile(cls, asin): 69 | sql = "SELECT * FROM `py_review_profile` WHERE (`asin` = '%s')" % (asin) 70 | result = cls.cursor.execute(sql) 71 | if result: 72 | return True 73 | else: 74 | return False 75 | 76 | @classmethod 77 | def insert_detail_item(cls, item): 78 | sql = "INSERT INTO `py_review_detail`(`asin`, `review_id`, `reviewer`, `review_url`, `star`, `date`, `title`, `content`) " \ 79 | "VALUES ('%s', '%s', %s, '%s', '%s', '%s', %s, %s)" % \ 80 | (item['asin'], item['review_id'], cls.conn.escape(item['reviewer']), item['review_url'], item['star'], 81 | item['date'], cls.conn.escape(item['title']), cls.conn.escape(item['content'])) 82 | try: 83 | if cls.check_exist_detail(item['asin'], item['review_id']) is not True: 84 | print('save review detail--[asin]:', item['asin'], '[reviewID]:', item['review_id']) 85 | cls.cursor.execute(sql) 86 | cls.conn.commit() 87 | except pymysql.MySQLError as e: 88 | print(e) 89 | cls.conn.rollback() 90 | pass 91 | 92 | @classmethod 93 | def check_exist_detail(cls, asin, review_id): 94 | sql = "SELECT * FROM `py_review_detail` WHERE `asin` = '%s' AND `review_id`='%s'" % (asin, review_id) 95 | result = cls.cursor.execute(sql) 96 | if result: 97 | return True 98 | else: 99 | return False 100 | 101 | @classmethod 102 | def get_last_review_total(cls, asin): 103 | sql = "SELECT `review_total`, `latest_total` FROM `py_review_profile` WHERE `asin`='%s'" % asin 104 | cls.cursor.execute(sql) 105 | item = cls.cursor.fetchone() 106 | if item: 107 | return item['latest_total'] 108 | else: 109 | return False 110 | 111 | @classmethod 112 | def update_profile_self(cls, asin): 113 | sql = "UPDATE `py_review_profile` SET `latest_total` = `review_total` WHERE `asin`='%s'" % asin 114 | cls.cursor.execute(sql) 115 | cls.conn.commit() 116 | 117 | 118 | class RankingSql(object): 119 | expire_rank = 500 120 | conn = conn_db() 121 | cursor = cursor_db(conn) 122 | py_keyword_table = 'py_salesranking_keywords' # table written to by the spider 123 | py_sales_table = 'py_salesrankings' 124 | keyword_table = 'salesranking_keywords' 125 | sales_table = 'salesrankings' 126 | tz = pytz.timezone(settings.TIMEZONE) 127 | 128 | @classmethod 129 | def insert_sales_ranking(cls, item): 130 | now = datetime.now(cls.tz).strftime('%Y-%m-%d %H:%M:%S') 131 | sql = "INSERT INTO `%s`(`asin`, `rank`, `classify`, `date`) VALUES ('%s', '%s', %s, '%s')" % \ 132 | (cls.py_sales_table, item['asin'], item['rank'], cls.conn.escape(item['classify']), now) 133 | update_sql = "UPDATE `%s` SET `last_rank`=`rank`, `status`=1, `classify`=%s, `rank`='%s', `updated_at`='%s' WHERE `asin` = '%s'" % \ 134 | (cls.sales_table, cls.conn.escape(item['classify']), item['rank'], now, item['asin']) 135 | try: 136 | cls.cursor.execute(sql) 137 | cls.cursor.execute(update_sql) 138 | cls.conn.commit() 139 | print('save sales_rank:', item) 140 | except pymysql.DatabaseError as error: 141 | print(error) 142 | 
cls.conn.rollback() 143 | 144 | @classmethod 145 | def insert_keyword_ranking(cls, item): 146 | sql = "INSERT INTO `%s`(`skwd_id`, `rank`, `date`) VALUES ('%s', '%s', '%s')" % \ 147 | (cls.py_keyword_table, item['skwd_id'], item['rank'], item['date']) 148 | update_sql = "UPDATE `%s` SET `last_rank`=`rank`, `rank`='%s', `status`=1, `updated_at`='%s' WHERE `id`='%s'" % \ 149 | (cls.keyword_table, item['rank'], item['date'], item['skwd_id']) 150 | try: 151 | cls.cursor.execute(sql) 152 | cls.cursor.execute(update_sql) 153 | cls.conn.commit() 154 | print('save keyword_rank:', item) 155 | except pymysql.DatabaseError as error: 156 | print(error) 157 | cls.conn.rollback() 158 | 159 | @classmethod 160 | def fetch_sales_ranking(cls): 161 | sql = "SELECT `id`, `asin` FROM `%s`WHERE `status` =1 AND `deleted_at` is NULL" % cls.sales_table 162 | cls.cursor.execute(sql) 163 | item = cls.cursor.fetchall() 164 | return item 165 | 166 | @classmethod 167 | def fetch_keywords_ranking(cls): 168 | sql = "SELECT `a`.`id`, `a`.`keyword`, `a`.`rank` as `rank`, `b`.`asin` as `asin` FROM `%s` as `a` " \ 169 | "LEFT JOIN `%s` as `b` ON `b`.`id`=`a`.`sk_id` WHERE `b`.`deleted_at` is NULL AND `a`.`deleted_at` is NULL " % \ 170 | (cls.keyword_table, cls.sales_table) 171 | cls.cursor.execute(sql) 172 | item = cls.cursor.fetchall() 173 | return item 174 | 175 | @classmethod 176 | def update_keywords_expire_rank(cls, skwd_id): 177 | now = datetime.now(cls.tz).strftime('%Y-%m-%d %H:%M:%S') 178 | sql = "UPDATE `%s` SET `last_rank`=`rank`, `rank`='%s', `updated_at`='%s', `status`=1 WHERE `id`='%s'" % (cls.keyword_table, cls.expire_rank, now, skwd_id) 179 | py_sql = "INSERT INTO `%s`(`skwd_id`, `rank`, `date`) VALUES ('%s', '%s', '%s')" % (cls.py_keyword_table, skwd_id, cls.expire_rank, now) 180 | try: 181 | cls.cursor.execute(sql) 182 | cls.cursor.execute(py_sql) 183 | cls.conn.commit() 184 | print('update keyword_rank: [', skwd_id, '] expired') 185 | except pymysql.DataError as error: 186 | print(error) 187 | cls.conn.rollback() 188 | 189 | @classmethod 190 | def update_keywords_none_rank(cls, skwd_id): 191 | now = datetime.now(cls.tz).strftime('%Y-%m-%d %H:%M:%S') 192 | sql = "UPDATE `%s` SET `updated_at`='%s', `status`=2 WHERE `id`='%s'" % (cls.keyword_table, now, skwd_id) 193 | try: 194 | cls.cursor.execute(sql) 195 | print('update keyword_rank: [', skwd_id, '] none') 196 | except pymysql.DataError as error: 197 | print(error) 198 | cls.conn.rollback() 199 | 200 | 201 | 202 | 203 | 204 | -------------------------------------------------------------------------------- /amazon/db/ipricejot.sql: -------------------------------------------------------------------------------- 1 | SET NAMES utf8; 2 | SET FOREIGN_KEY_CHECKS = 0; 3 | 4 | -- ---------------------------- 5 | -- Table structure for `py_cates` 6 | -- ---------------------------- 7 | DROP TABLE IF EXISTS `py_cates`; 8 | CREATE TABLE `py_cates` ( 9 | `id` int(11) NOT NULL AUTO_INCREMENT, 10 | `title` varchar(512) NOT NULL, 11 | `link` varchar(512) NOT NULL, 12 | `level` tinyint(2) NOT NULL DEFAULT '1', 13 | `pid` int(11) NOT NULL DEFAULT '0', 14 | `created_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, 15 | PRIMARY KEY (`id`) 16 | ) ENGINE=InnoDB AUTO_INCREMENT=38 DEFAULT CHARSET=utf8mb4; 17 | 18 | -- ---------------------------- 19 | -- Table structure for `py_asin_best` 20 | -- ---------------------------- 21 | DROP TABLE IF EXISTS `py_asin_best`; 22 | CREATE TABLE `py_asin_best` ( 23 | `id` int(11) NOT NULL AUTO_INCREMENT, 24 | `asin` char(10) NOT NULL, 25 | 
`cid` int(11) NOT NULL, 26 | `rank` tinyint(4) NOT NULL, 27 | `created_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, 28 | PRIMARY KEY (`id`) 29 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4; 30 | 31 | SET FOREIGN_KEY_CHECKS = 1; 32 | -------------------------------------------------------------------------------- /amazon/db/py_salesranking_and_review.sql: -------------------------------------------------------------------------------- 1 | /* 2 | SQLyog v10.2 3 | MySQL - 5.7.18-log : Database - ipricejot 4 | ********************************************************************* 5 | */ 6 | 7 | 8 | /*!40101 SET NAMES utf8 */; 9 | 10 | /*!40101 SET SQL_MODE=''*/; 11 | 12 | /*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */; 13 | /*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; 14 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; 15 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 16 | CREATE DATABASE /*!32312 IF NOT EXISTS*/`ipricejot` /*!40100 DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_bin */; 17 | 18 | USE `ipricejot`; 19 | 20 | /*Table structure for table `py_review_detail` */ 21 | 22 | DROP TABLE IF EXISTS `py_review_detail`; 23 | 24 | CREATE TABLE `py_review_detail` ( 25 | `id` bigint(20) NOT NULL AUTO_INCREMENT, 26 | `asin` varchar(11) CHARACTER SET utf8 NOT NULL COMMENT 'asin号', 27 | `review_id` varchar(50) CHARACTER SET utf8 NOT NULL COMMENT '评论id号', 28 | `reviewer` varchar(255) CHARACTER SET utf8 NOT NULL COMMENT '评论者', 29 | `review_url` varchar(255) COLLATE utf8mb4_bin NOT NULL COMMENT '评价链接', 30 | `star` varchar(4) CHARACTER SET utf8 NOT NULL DEFAULT '0' COMMENT '评论星级', 31 | `date` varchar(255) CHARACTER SET utf8 NOT NULL COMMENT '评论日期', 32 | `title` varchar(255) CHARACTER SET utf8 NOT NULL COMMENT '评论标题', 33 | `content` text CHARACTER SET utf8 COMMENT '评论内容', 34 | PRIMARY KEY (`id`), 35 | UNIQUE KEY `asin_review_id_unique` (`asin`,`review_id`) 36 | ) ENGINE=InnoDB AUTO_INCREMENT=2706 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin; 37 | 38 | /*Table structure for table `py_review_profile` */ 39 | 40 | DROP TABLE IF EXISTS `py_review_profile`; 41 | 42 | CREATE TABLE `py_review_profile` ( 43 | `id` bigint(20) NOT NULL AUTO_INCREMENT, 44 | `asin` varchar(11) NOT NULL COMMENT 'asin号', 45 | `product` varchar(500) NOT NULL COMMENT '产品名', 46 | `brand` varchar(255) NOT NULL COMMENT '商品标签', 47 | `seller` varchar(255) DEFAULT NULL COMMENT '销售商家', 48 | `image` varchar(255) NOT NULL DEFAULT '' COMMENT '图片地址', 49 | `review_total` int(11) NOT NULL DEFAULT '0' COMMENT '评论总数', 50 | `review_rate` varchar(4) NOT NULL DEFAULT '0' COMMENT '评论平均分值', 51 | `pct_five` tinyint(2) NOT NULL DEFAULT '0' COMMENT '5星所占比分比', 52 | `pct_four` tinyint(2) NOT NULL DEFAULT '0' COMMENT '4星所占百分比', 53 | `pct_three` tinyint(2) NOT NULL DEFAULT '0' COMMENT '3星所占百分比', 54 | `pct_two` tinyint(2) NOT NULL DEFAULT '0' COMMENT '2星所占百分比', 55 | `pct_one` tinyint(2) NOT NULL DEFAULT '0' COMMENT '1星所占百分比', 56 | `latest_total` int(11) DEFAULT NULL COMMENT '上一次的评论总数', 57 | PRIMARY KEY (`id`), 58 | UNIQUE KEY `unique_asin` (`asin`) 59 | ) ENGINE=InnoDB AUTO_INCREMENT=22 DEFAULT CHARSET=utf8; 60 | 61 | /*Table structure for table `py_salesranking_keywords` */ 62 | 63 | DROP TABLE IF EXISTS `py_salesranking_keywords`; 64 | 65 | CREATE TABLE `py_salesranking_keywords` ( 66 | `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT, 67 | `skwd_id` int(11) NOT NULL COMMENT 'salesranking_keyword_id', 68 | `rank` int(11) NOT NULL COMMENT '排名', 69 | `date` 
timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '爬取时间', 70 | PRIMARY KEY (`id`) 71 | ) ENGINE=InnoDB AUTO_INCREMENT=11 DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 72 | 73 | /*Table structure for table `py_salesrankings` */ 74 | 75 | DROP TABLE IF EXISTS `py_salesrankings`; 76 | 77 | CREATE TABLE `py_salesrankings` ( 78 | `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT, 79 | `sk_id` int(11) NOT NULL COMMENT 'salesranking_id', 80 | `rank` int(11) NOT NULL COMMENT '排名', 81 | `classify` varchar(150) COLLATE utf8_unicode_ci NOT NULL DEFAULT '' COMMENT '分类', 82 | `date` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '爬取时间', 83 | PRIMARY KEY (`id`) 84 | ) ENGINE=InnoDB AUTO_INCREMENT=13 DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 85 | 86 | /*Table structure for table `salesranking_keywords` */ 87 | 88 | DROP TABLE IF EXISTS `salesranking_keywords`; 89 | 90 | CREATE TABLE `salesranking_keywords` ( 91 | `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT, 92 | `sk_id` int(11) NOT NULL COMMENT 'saleranking_id', 93 | `keyword` varchar(255) COLLATE utf8_unicode_ci NOT NULL COMMENT '关键字', 94 | `status` tinyint(4) NOT NULL DEFAULT '0' COMMENT '抓取状态 0抓取中 1成功 2抓取失败', 95 | `rank` int(11) NOT NULL DEFAULT '0' COMMENT '当前排名', 96 | `last_rank` int(11) NOT NULL DEFAULT '0' COMMENT '上次排名', 97 | `deleted_at` timestamp NULL DEFAULT NULL, 98 | `created_at` timestamp NULL DEFAULT NULL, 99 | `updated_at` timestamp NULL DEFAULT NULL, 100 | PRIMARY KEY (`id`), 101 | UNIQUE KEY `salesranking_keywords_sk_id_keyword_unique` (`sk_id`,`keyword`) 102 | ) ENGINE=InnoDB AUTO_INCREMENT=6 DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 103 | 104 | /*Table structure for table `salesrankings` */ 105 | 106 | DROP TABLE IF EXISTS `salesrankings`; 107 | 108 | CREATE TABLE `salesrankings` ( 109 | `id` bigint(20) unsigned NOT NULL AUTO_INCREMENT, 110 | `sid` int(11) NOT NULL COMMENT 'seller_uid', 111 | `asin` varchar(11) COLLATE utf8_unicode_ci NOT NULL COMMENT '商品asin号', 112 | `title` varchar(500) COLLATE utf8_unicode_ci NOT NULL COMMENT '商品名称', 113 | `image` varchar(255) COLLATE utf8_unicode_ci NOT NULL COMMENT '商品图片', 114 | `link` varchar(255) COLLATE utf8_unicode_ci NOT NULL COMMENT '亚马逊商品链接', 115 | `classify` varchar(150) COLLATE utf8_unicode_ci DEFAULT NULL COMMENT '商品分类', 116 | `status` tinyint(4) NOT NULL DEFAULT '0' COMMENT '抓取状态 0抓取中 1成功 2抓取失败', 117 | `rank` int(11) NOT NULL DEFAULT '0' COMMENT '目前排名', 118 | `last_rank` int(11) NOT NULL DEFAULT '0' COMMENT '上次排名', 119 | `deleted_at` timestamp NULL DEFAULT NULL, 120 | `created_at` timestamp NULL DEFAULT NULL, 121 | `updated_at` timestamp NULL DEFAULT NULL, 122 | PRIMARY KEY (`id`), 123 | UNIQUE KEY `salesrankings_sid_asin_unique` (`sid`,`asin`) 124 | ) ENGINE=InnoDB AUTO_INCREMENT=5 DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 125 | 126 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 127 | /*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */; 128 | /*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */; 129 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 130 | -------------------------------------------------------------------------------- /amazon/requirements.txt: -------------------------------------------------------------------------------- 1 | Scrapy==1.4.0 2 | PyMySQL==0.7.11 3 | PyDispatcher==2.0.5 4 | pytz==2017.2 5 | -------------------------------------------------------------------------------- /amazon/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # 
Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = amazon.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = amazon 12 | -------------------------------------------------------------------------------- /amazon2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dynamohuang/amazon-scrapy/33afe5b482d3d55a065084289e20540ce6d99081/amazon2/__init__.py -------------------------------------------------------------------------------- /amazon2/amazon2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dynamohuang/amazon-scrapy/33afe5b482d3d55a065084289e20540ce6d99081/amazon2/amazon2/__init__.py -------------------------------------------------------------------------------- /amazon2/amazon2/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class Amazon2Item(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /amazon2/amazon2/middlewares/AmazonSpiderMiddleware.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | from datetime import datetime 10 | 11 | 12 | class AmazonSpiderMiddleware(object): 13 | # Not all methods need to be defined. If a method is not defined, 14 | # scrapy acts as if the spider middleware does not modify the 15 | # passed objects. 16 | 17 | @classmethod 18 | def from_crawler(cls, crawler): 19 | # This method is used by Scrapy to create your spiders. 20 | s = cls() 21 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 22 | return s 23 | 24 | def process_spider_input(self, response, spider): 25 | # Called for each response that goes through the spider 26 | # middleware and into the spider. 27 | 28 | # Should return None or raise an exception. 29 | return None 30 | 31 | def process_spider_output(self, response, result, spider): 32 | # Called with the results returned from the Spider, after 33 | # it has processed the response. 34 | 35 | # Must return an iterable of Request, dict or Item objects. 36 | for i in result: 37 | yield i 38 | 39 | def process_spider_exception(self, response, exception, spider): 40 | # Called when a spider or process_spider_input() method 41 | # (from other spider middleware) raises an exception. 42 | 43 | # Should return either None or an iterable of Response, dict 44 | # or Item objects. 45 | pass 46 | 47 | def process_start_requests(self, start_requests, spider): 48 | # Called with the start requests of the spider, and works 49 | # similarly to the process_spider_output() method, except 50 | # that it doesn’t have a response associated. 51 | 52 | # Must return only requests (not items). 
53 | for r in start_requests: 54 | yield r 55 | 56 | def spider_opened(self, spider): 57 | spider.started_on = datetime.now() 58 | 59 | 60 | -------------------------------------------------------------------------------- /amazon2/amazon2/middlewares/RotateUserAgentMiddleware.py: -------------------------------------------------------------------------------- 1 | import random 2 | from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware 3 | 4 | 5 | class RotateUserAgentMiddleware(UserAgentMiddleware): 6 | def __init__(self, user_agent=''): 7 | UserAgentMiddleware.__init__(self) 8 | self.user_agent = user_agent 9 | 10 | def process_request(self, request, spider): 11 | ua = random.choice(self.user_agent_list) 12 | if ua: 13 | # print(ua) 14 | request.headers.setdefault('User-Agent', ua) 15 | 16 | # the default user_agent_list is composed of Chrome, IE, Firefox, Mozilla, Opera and Netscape user agents 17 | # for more user agent strings, see http://www.useragentstring.com/pages/useragentstring.php 18 | user_agent_list = [ 19 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1", \ 20 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", \ 21 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", \ 22 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", \ 23 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", \ 24 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", \ 25 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", \ 26 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \ 27 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \ 28 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \ 29 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", \ 30 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", \ 31 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \ 32 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \ 33 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \ 34 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", \ 35 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", \ 36 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24" 37 | ] 38 | -------------------------------------------------------------------------------- /amazon2/amazon2/middlewares/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dynamohuang/amazon-scrapy/33afe5b482d3d55a065084289e20540ce6d99081/amazon2/amazon2/middlewares/__init__.py -------------------------------------------------------------------------------- /amazon2/amazon2/pipelines.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class Amazon2Pipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /amazon2/amazon2/settings.py: -------------------------------------------------------------------------------- 1 | 2 | BOT_NAME = 'amazon2' 3 | 4 | SPIDER_MODULES = ['amazon2.spiders'] 5 | NEWSPIDER_MODULE = 'amazon2.spiders' 6 | 7 | ROBOTSTXT_OBEY = False 8 | 9 | CONCURRENT_REQUESTS = 32 10 | 11 | COOKIES_ENABLED = False 12 | 13 | SPIDER_MIDDLEWARES = { 14 | 'amazon2.middlewares.AmazonSpiderMiddleware.AmazonSpiderMiddleware': 543, 15 | } 16 | 17 | DOWNLOADER_MIDDLEWARES = { 18 | 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None, 19 | 'amazon2.middlewares.RotateUserAgentMiddleware.RotateUserAgentMiddleware': 543, 20 | } 21 | -------------------------------------------------------------------------------- /amazon2/amazon2/spiders/AmazonBaseSpider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | import pydispatch.dispatcher 3 | from scrapy import signals 4 | from datetime import datetime 5 | 6 | 7 | class AmazonBaseSpider(scrapy.Spider): 8 | name = "AmazonBase" 9 | custom_settings = { 10 | 'LOG_LEVEL': 'ERROR', 11 | 'LOG_ENABLED': True, 12 | 'LOG_STDOUT': True, 13 | } 14 | 15 | def __init__(self): 16 | pydispatch.dispatcher.connect(self.handle_spider_closed, signals.spider_closed) 17 | self.result_pool = {} 18 | self.log = [] 19 | 20 | def start_requests(self): 21 | return 22 | 23 | def parse(self, response): 24 | return 25 | 26 | def print_progress(self, spider): 27 | work_time = datetime.now() - spider.started_on 28 | print('Spent:', work_time, ':', len(self.result_pool), 'item fetched') 29 | 30 | def handle_spider_closed(self): 31 | return 32 | -------------------------------------------------------------------------------- /amazon2/amazon2/spiders/DemoSpider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from amazon2.spiders.AmazonBaseSpider import AmazonBaseSpider 3 | 4 | 5 | # scrapy crawl demo -a asin=B07K97BQDF 6 | class DemoSpider(AmazonBaseSpider): 7 | name = "demo" 8 | 9 | def __init__(self, asin='B07K97BQDF'): 10 | AmazonBaseSpider.__init__(self) 11 | self.asin = asin 12 | 13 | def start_requests(self): 14 | yield scrapy.Request( 15 | url='https://www.amazon.com/dp/' + self.asin, 16 | callback=self.parse, 17 | meta={ 18 | 'asin': self.asin, 19 | 'cid': -10 20 | } 21 | ) 22 | 23 | def parse(self, response): 24 | print(response.meta['asin']) 25 | self.result_pool[response.meta['asin']] = {} 26 | self.result_pool[response.meta['asin']]['title'] = 'title for ' + response.meta['asin'] 27 | 28 | # Bingo! 
Here we get the result, and you can store it or output it 29 | def handle_spider_closed(self, spider): 30 | print(self.result_pool.get(self.asin)) 31 | AmazonBaseSpider.print_progress(self, spider) 32 | -------------------------------------------------------------------------------- /amazon2/amazon2/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /amazon2/requirements.txt: -------------------------------------------------------------------------------- 1 | Scrapy==1.5.* 2 | -------------------------------------------------------------------------------- /amazon2/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = amazon2.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = amazon2 12 | --------------------------------------------------------------------------------
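
A note on the SQL layer: ReviewSql and RankingSql in amazon/amazon/sql.py build every statement with Python "%" string formatting plus manual conn.escape() calls, which is easy to get wrong as columns are added. If you extend that module, pymysql can quote and escape the values itself when they are passed to cursor.execute() as a parameter tuple. The sketch below is illustrative only and is not a file in this repository; insert_detail_item_safe is a hypothetical name, while the py_review_detail columns and the conn_db() helper are the ones defined above.

import pymysql


def insert_detail_item_safe(conn, item):
    # Same columns as ReviewSql.insert_detail_item, but the values travel as
    # execute() parameters, so pymysql handles the quoting and escaping.
    sql = ("INSERT INTO `py_review_detail`"
           "(`asin`, `review_id`, `reviewer`, `review_url`, `star`, `date`, `title`, `content`) "
           "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")
    params = (item['asin'], item['review_id'], item['reviewer'], item['review_url'],
              item['star'], item['date'], item['title'], item['content'])
    try:
        with conn.cursor() as cursor:
            cursor.execute(sql, params)
        conn.commit()
    except pymysql.MySQLError as error:
        print(error)
        conn.rollback()

From a pipeline this would be called as insert_detail_item_safe(conn_db(), item), mirroring how the existing helpers obtain their connection.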