├── README.md ├── config.py ├── daemonize.py ├── extractor.py ├── feeds.py ├── insert_feed.py ├── log.py ├── models.py ├── scheduler.py ├── spider.py ├── utils.py └── worker.py /README.md: --------------------------------------------------------------------------------
1 | # Project: `jd_spider`
2 | 
3 | 
4 | ### Crawler table design:
5 | 
6 | *Design principle: keep everything as simple as possible.*
7 | 
8 | ### Initial setup:
9 | 
10 | 1. Edit the configuration
11 | 
12 | `vim config.py`
13 | 
14 | 2. Create the database tables automatically
15 | 
16 | `python models.py`
17 | 
18 | 3. Start the spider
19 | 
20 | `python spider.py`
21 | 
22 | 4. Start the data-extraction stage
23 | 
24 | `python extractor.py`
25 | 
26 | #### Architecture
27 | *A proper diagram is still missing and will be added later; a minimal end-to-end sketch of this pipeline is appended at the very end of this dump.*
28 | ```
29 | push url ---> redis (urls task) ---> spider ---> redis (urls result) ---> extractor ---> mysql
30 |                  |                                                            |
31 |                  ------------------------------------------------------------|
32 | ```
33 | 
34 | `Every JD category tree is exactly three levels deep.`
35 | 
36 | **Product detail table**
37 | 
38 | ```
39 | class Product(Model):
40 |     pid = BigIntegerField(primary_key=True)
41 |     purl = CharField(unique=True, index=True)
42 |     pname = CharField(max_length=200)
43 |     brand = CharField(max_length=30)
44 |     brand_img = CharField()
45 |     product_img = CharField()
46 |     price = FloatField()
47 |     extra = BlobField()
48 |     created_on = DateTimeField(default=datetime.now)
49 | ```
50 | 
51 | **Product/category mapping table**
52 | 
53 | ```
54 | class ProductAndCategory(Model):
55 |     pid = BigIntegerField(primary_key=True)
56 |     top_id = IntegerField()     # e.g. kitchenware (level 1)
57 |     second_id = IntegerField()  # e.g. cooking pots (level 2)
58 |     third_id = IntegerField()   # e.g. woks (level 3)
59 |     top_name = CharField()
60 |     second_name = CharField()
61 |     third_name = CharField()
62 |     created_on = DateTimeField(default=datetime.now)
63 | ```
64 | 
65 | 
66 | **Category table**
67 | 
68 | ```
69 | class Category(Model):
70 |     cat_id = IntegerField(primary_key=True)
71 |     cat_name = CharField()
72 |     cat_url = CharField()
73 |     level = IntegerField()
74 |     created_on = DateTimeField(default=datetime.now)
75 | 
76 | ```
77 | 
78 | ### Notes:
79 | 
80 | The project does not do deep recursive crawling, so a batch of index pages has to be built up front. What counts as an index page? See below.
81 | 
82 | The index page of a single product category:
83 | http://list.jd.com/list.html?cat=6196,6197,6201
84 | 
85 | To paginate, append the page number and the sort parameters:
86 | 
87 | &page=2&sort=sort_totalsales15_desc&trans=1&JL=6_0_0#J_main
88 | 
89 | The index pages are then generated like this:
90 | ```
91 | for page in range(1, 10):
92 |     url = "http://list.jd.com/list.html?cat=6196,6197,6201&page={0}&sort=sort_totalsales15_desc&trans=1&JL=6_0_0#J_main".format(page)
93 |     push_mq(url)
94 | ```
95 | 
96 | ### Crawl flow:
97 | ### step: 1
98 | Start from the kitchenware channel page and collect its sub-categories; their URLs become the seed index pages.
99 | 
100 | http://channel.jd.com/kitchenware.html
101 | 
102 | ### step: 2
103 | Each seed yields the product list pages of that category.
104 | From one seed list page, e.g. `http://list.jd.com/list.html?cat=6196,6197,6199` , all product-page URLs can be collected.
105 | 
106 | 
107 | ### step: 3
108 | 
109 | A concrete product page such as `http://item.jd.com/137179.html` provides the product name, brand, ID, material and so on, plus the breadcrumb (which carries the nested category, brand and product name).
110 | 
111 | Note: JD renders the price separately, so it has to be fetched from the API below.
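The extractor reads the price as the `p` field of the first element in the JSON array this endpoint returns. Below is a minimal sketch of that call; it assumes only the `requests` library and uses the example pid 137179 from step 3 (the helper name is hypothetical and for illustration only, since the project itself does this inline in `extract_product_detail()`). The raw endpoint is quoted right after it.

```
import json
import requests

def fetch_price(pid):
    # hypothetical helper, not part of the project code
    url = "http://p.3.cn/prices/get?skuid=J_{0}".format(pid)
    res = requests.get(url, timeout=10)
    res.raise_for_status()
    # the endpoint returns a JSON array; the project reads the 'p' field of element 0
    return json.loads(res.text)[0]['p']

print fetch_price(137179)
```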
`http://p.3.cn/prices/get?skuid=J_{pid}` 112 | 113 | 114 | ------ 115 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | import redis 5 | 6 | DEBUG = False 7 | 8 | pid_file = 'pid.sock' 9 | 10 | max_requests = 100000 11 | 12 | daemon_flag = False 13 | 14 | process_num = 2 15 | 16 | max_page_limiter = 30 17 | 18 | spider_limiter = { 19 | 'timeout': 10, 20 | 'success_t': 0.3, 21 | 'faild_t': 10 22 | } 23 | 24 | log_file = "std.log" 25 | 26 | mysql_config = { 27 | "host": "127.0.0.1", 28 | "port": 3306, 29 | "user": 'root', 30 | "passwd": '123123', 31 | "db": 'jd_spider', 32 | } 33 | 34 | redis_config = { 35 | "host": "127.0.0.1", 36 | "host": 6379 37 | } 38 | 39 | rd = redis.StrictRedis() 40 | 41 | JD_URLS_TASK = "jd_urls_task" 42 | 43 | JD_URLS_RESULT = "jd_urls_result" 44 | 45 | PLIST_FEEDS_URLS = [ 46 | 'http://list.jd.com/list.html?cat=6196,6197,6199&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 47 | 'http://list.jd.com/list.html?cat=6196,6197,6200&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 48 | 'http://list.jd.com/list.html?cat=6196,6197,6202&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 49 | 'http://list.jd.com/list.html?cat=6196,6197,6201&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 50 | 'http://list.jd.com/list.html?cat=6196,6197,6203&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 51 | 'http://list.jd.com/list.html?cat=6196,6197,6204&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 52 | 'http://list.jd.com/list.html?cat=6196,6197,6205&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 53 | 'http://list.jd.com/list.html?cat=6196,6197,6206&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 54 | 'http://list.jd.com/list.html?cat=6196,6197,6207&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 55 | 'http://list.jd.com/list.html?cat=6196,6197,11976&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 56 | 'http://list.jd.com/list.html?cat=6196,6219,6223&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 57 | 'http://list.jd.com/list.html?cat=6196,6219,6224&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 58 | 'http://list.jd.com/list.html?cat=6196,6219,6220&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 59 | 'http://list.jd.com/list.html?cat=6196,6219,6221&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 60 | 'http://list.jd.com/list.html?cat=6196,6219,11979&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 61 | 'http://list.jd.com/list.html?cat=6196,6219,6850&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 62 | 'http://list.jd.com/list.html?cat=6196,6219,6225&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 63 | 'http://list.jd.com/list.html?cat=6196,6214,6215&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 64 | 'http://list.jd.com/list.html?cat=6196,6214,6218&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 65 | 'http://list.jd.com/list.html?cat=6196,6214,11977&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 66 | 'http://list.jd.com/list.html?cat=6196,6214,6216&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 67 | 'http://list.jd.com/list.html?cat=6196,6214,11978&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 68 | 'http://list.jd.com/list.html?cat=6196,6227,6228&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 69 | 'http://list.jd.com/list.html?cat=6196,6227,6230&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 70 | 'http://list.jd.com/list.html?cat=6196,6227,6231&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 71 | 
'http://list.jd.com/list.html?cat=6196,6227,6232&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 72 | 'http://list.jd.com/list.html?cat=6196,6227,11975&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 73 | 'http://list.jd.com/list.html?cat=6196,11143,11148&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 74 | 'http://list.jd.com/list.html?cat=6196,11143,11149,11148&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 75 | 'http://list.jd.com/list.html?cat=6196,11143,11150,11148&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 76 | 'http://list.jd.com/list.html?cat=6196,11143,11155,11148&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 77 | 'http://list.jd.com/list.html?cat=6196,11143,11151,11148&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 78 | 'http://list.jd.com/list.html?cat=6196,11143,11152,11148&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 79 | 'http://list.jd.com/list.html?cat=6196,11143,11153,11148&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 80 | 'http://list.jd.com/list.html?cat=6196,11143,11154,11148&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 81 | 'http://list.jd.com/list.html?cat=6196,11143,11156,11148&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 82 | 'http://list.jd.com/list.html?cat=6196,6219,6222&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 83 | 'http://list.jd.com/list.html?cat=6196,6198,6211&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 84 | 'http://list.jd.com/list.html?cat=6196,6198,6212&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 85 | 'http://list.jd.com/list.html?cat=6196,6198,6209&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 86 | 'http://list.jd.com/list.html?cat=6196,6198,6210&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 87 | 'http://list.jd.com/list.html?cat=6196,6198,11980&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 88 | 'http://list.jd.com/list.html?cat=6196,6198,11981&page=1&delivery=1&trans=1&JL=4_10_0#J_main' 89 | ] 90 | 91 | 92 | PLIST_MORE_URLS = [ 93 | 'http://list.jd.com/list.html?cat=6196,6197,6199&page=1', 94 | 'http://list.jd.com/list.html?cat=6196,6197,6200&page=1', 95 | 'http://list.jd.com/list.html?cat=6196,6197,6202&page=1', 96 | 'http://list.jd.com/list.html?cat=6196,6197,6201&page=1', 97 | 'http://list.jd.com/list.html?cat=6196,6197,6203&page=1', 98 | 'http://list.jd.com/list.html?cat=6196,6197,6204&page=1', 99 | 'http://list.jd.com/list.html?cat=6196,6197,6205&page=1', 100 | 'http://list.jd.com/list.html?cat=6196,6197,6206&page=1', 101 | 'http://list.jd.com/list.html?cat=6196,6197,6207&page=1', 102 | 'http://list.jd.com/list.html?cat=6196,6197,11976&page=1', 103 | 'http://list.jd.com/list.html?cat=6196,6219,6223&page=1', 104 | 'http://list.jd.com/list.html?cat=6196,6219,6224&page=1', 105 | 'http://list.jd.com/list.html?cat=6196,6219,6220&page=1', 106 | 'http://list.jd.com/list.html?cat=6196,6219,6221&page=1', 107 | 'http://list.jd.com/list.html?cat=6196,6219,11979&page=1', 108 | 'http://list.jd.com/list.html?cat=6196,6219,6850&page=1', 109 | 'http://list.jd.com/list.html?cat=6196,6219,6225&page=1', 110 | 'http://list.jd.com/list.html?cat=6196,6214,6215&page=1', 111 | 'http://list.jd.com/list.html?cat=6196,6214,6218&page=1', 112 | 'http://list.jd.com/list.html?cat=6196,6214,11977&page=1', 113 | 'http://list.jd.com/list.html?cat=6196,6214,6216&page=1', 114 | 'http://list.jd.com/list.html?cat=6196,6214,11978&page=1', 115 | 'http://list.jd.com/list.html?cat=6196,6227,6228&page=1', 116 | 'http://list.jd.com/list.html?cat=6196,6227,6230&page=1', 117 | 'http://list.jd.com/list.html?cat=6196,6227,6231&page=1', 118 | 'http://list.jd.com/list.html?cat=6196,6227,6232&page=1', 119 | 
'http://list.jd.com/list.html?cat=6196,6227,11975&page=1', 120 | 'http://list.jd.com/list.html?cat=6196,11143,11148&page=1', 121 | 'http://list.jd.com/list.html?cat=6196,11143,11149,11148&page=1', 122 | 'http://list.jd.com/list.html?cat=6196,11143,11150,11148&page=1', 123 | 'http://list.jd.com/list.html?cat=6196,11143,11155,11148&page=1', 124 | 'http://list.jd.com/list.html?cat=6196,11143,11151,11148&page=1', 125 | 'http://list.jd.com/list.html?cat=6196,11143,11152,11148&page=1', 126 | 'http://list.jd.com/list.html?cat=6196,11143,11153,11148&page=1', 127 | 'http://list.jd.com/list.html?cat=6196,11143,11154,11148&page=1', 128 | 'http://list.jd.com/list.html?cat=6196,11143,11156,11148&page=1', 129 | 'http://list.jd.com/list.html?cat=6196,6219,6222&page=1', 130 | 'http://list.jd.com/list.html?cat=6196,6198,6211&page=1', 131 | 'http://list.jd.com/list.html?cat=6196,6198,6212&page=1', 132 | 'http://list.jd.com/list.html?cat=6196,6198,6209&page=1', 133 | 'http://list.jd.com/list.html?cat=6196,6198,6210&page=1', 134 | 'http://list.jd.com/list.html?cat=6196,6198,11980&page=1', 135 | 'http://list.jd.com/list.html?cat=6196,6198,11981&page=1' 136 | ] 137 | -------------------------------------------------------------------------------- /daemonize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | import os 5 | import time 6 | 7 | def daemonize(): 8 | pid=os.fork() # fork1 9 | if pid<0: # error 10 | print "fork1 error" 11 | return -1 12 | elif pid>0: # parent. 13 | exit(0) 14 | os.chdir(os.getcwd()) 15 | os.setsid() 16 | pid=os.fork() # fork 2 17 | if pid<0: 18 | print "fork2 error" 19 | return -1 20 | elif pid>0: 21 | exit(0) 22 | os.umask(0) 23 | os.close(0) 24 | os.close(1) 25 | os.close(2) 26 | fd=os.open('/dev/null', 2) 27 | os.dup(fd) 28 | os.dup(fd) 29 | 30 | if __name__ == "__main__": 31 | daemonize() 32 | time.sleep(30) 33 | 34 | -------------------------------------------------------------------------------- /extractor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | import sys 5 | reload(sys) 6 | sys.setdefaultencoding("utf-8") 7 | import re 8 | import time 9 | import traceback 10 | from urlparse import parse_qs, urlparse 11 | 12 | import requests 13 | from peewee import IntegrityError 14 | from lxml.html import tostring, fromstring 15 | 16 | from config import * 17 | from utils import * 18 | from log import logger 19 | from spider import fetch 20 | from models import Product, Category, ProductAndCategory 21 | 22 | 23 | rd = redis.StrictRedis() 24 | 25 | 26 | def parse_category(url, doc, res=''): 27 | ''' 28 | desc: 29 | 用来解析面包屑, 分析出具体商品所属的cat层 30 | ''' 31 | save_list = [] 32 | _sum_url = doc.xpath('//*[@id="root-nav"]/div/div/span[1]/a[2]/@href')[0] 33 | _cat_list = parse_qs(urlparse(_sum_url).query, keep_blank_values=False).get('cat')[0] 34 | cat_list = [int(x) for x in _cat_list.split(',')] 35 | assert len(cat_list) == 3 36 | l1_name = doc.xpath('//*[@id="root-nav"]/div/div/strong/a')[0].text_content() 37 | l1_url = doc.xpath('//*[@id="root-nav"]/div/div/strong/a/@href')[0] 38 | l2_url = doc.xpath('//*[@id="root-nav"]/div/div/span[1]/a[1]/@href')[0] 39 | l2_name = doc.xpath('//*[@id="root-nav"]/div/div/span[1]/a[1]')[0].text_content() 40 | l3_url = doc.xpath('//*[@id="root-nav"]/div/div/span[1]/a[2]/@href')[0] 41 | l3_name = doc.xpath('//*[@id="root-nav"]/div/div/span[1]/a[2]/text()')[0] 42 | 43 | 
save_list.append({"cat_id": cat_list[0], "cat_name": l1_name, "cat_url": l1_url, "level":'1'}) 44 | save_list.append({"cat_id": cat_list[1], "cat_name": l2_name, "cat_url": l2_url, "level":'2'}) 45 | save_list.append({"cat_id": cat_list[2], "cat_name": l3_name, "cat_url": l3_url, "level":'3'}) 46 | for _d in save_list: 47 | try: 48 | _d['cat_name'] = _d['cat_name'].encode('utf-8') 49 | _d['cat_url'] = perfect_href(_d['cat_url']) 50 | Category.create(**_d) 51 | except IntegrityError: 52 | logger.info('category faild repeat --- cat_id: %s , url:%s'%(_d['cat_id'], url)) 53 | res = { 54 | 'top_id': cat_list[0], 55 | 'second_id': cat_list[1], 56 | 'third_id': cat_list[2], 57 | 'top_name': l1_name.encode('utf-8'), 58 | 'second_name': l2_name.encode('utf-8'), 59 | 'third_name': l3_name.encode('utf-8'), 60 | } 61 | return res 62 | 63 | 64 | def extract_plist_url(url, doc, res=''): 65 | ''' 66 | desc: 67 | 从list.jd.com里抽出分页及产品url,扔到队列里 68 | example url: 69 | http://list.jd.com/list.html?cat=6196,6197,6199&page=1&delivery=1&trans=1&JL=4_10_0#J_main 70 | ''' 71 | _page = parse_qs(urlparse(url).query, keep_blank_values=False).get('page') 72 | if _page and int(_page[0]) == 1: 73 | max_page = int(doc.xpath('//span[@class="p-skip"]/em/b')[0].text_content()) # 获取该类别最大的页码 74 | if isinstance(max_page,int) and max_page > 1: 75 | for _p in range(2,max_page): 76 | if _p > max_page_limiter: 77 | continue 78 | print re.sub('page=1',"page=%s"%_p, url) 79 | queue_push_url(re.sub('page=1',"page=%s"%_p, url)) 80 | 81 | plist_doc = doc.xpath('//ul[@class="gl-warp clearfix "]')[0] # 拿到商品列表的标签 82 | plist = re.findall('//item.jd.com/\d*.html', tostring(plist_doc)) 83 | for _one in plist: 84 | purl = "http:" + _one 85 | if Product.select().where(Product.purl == purl).first(): 86 | logger.info('extract_plist_url --- product %s exist'%(purl)) 87 | continue 88 | else: 89 | queue_push_url(purl) 90 | 91 | 92 | def extract_product_detail(url, doc, res=''): 93 | ''' 94 | desc: 95 | 通过详情页获取一系列信息,入库 96 | ''' 97 | if not doc.xpath('//div[@class="breadcrumb"]'): 98 | logger.info('extract_product_detail --- url %s %s'%(url, u'全球购不处理!!!')) 99 | return 100 | 101 | if doc.xpath('//div[@class="breadcrumb"]//a/text()')[0] == u"首页": 102 | logger.info('extract_product_detail --- url %s %s'%(url, u'闪购页面暂时不处理!!!')) 103 | return 104 | 105 | _this_dao = Product.select().where(Product.purl == url).first() 106 | if _this_dao: 107 | logger.info('extract_product_detail --- product %s exist'%(url)) 108 | return 109 | 110 | # pid 111 | pid = re.search('http://item.jd.com/(?P\d*).html', url).groupdict()['id'] 112 | 113 | # product brand 114 | brand = doc.xpath('//*[@id="parameter-brand"]/li/a[1]')[0].text_content() 115 | # same detail page not contains brand img ,so set null 116 | _brand_img = doc.xpath('//*[@id="extInfo"]/div[1]/a/img/@src') 117 | if _brand_img: 118 | brand_img = _brand_img[0] 119 | brand_img = perfect_href(brand_img) 120 | else: 121 | brand_img = '' 122 | 123 | # product img 124 | imgs = doc.xpath('//div[@class="spec-items"]/ul/li/img/@src') 125 | fix_img = lambda x: re.sub('/n5/','/imgzone/', "http:" + x) 126 | imgs = map(fix_img, imgs) 127 | img_first = imgs.pop(0) 128 | 129 | # pname 130 | pname = doc.xpath('//div[@id="product-intro"]//div[@id="itemInfo"]//h1')[0].text_content() 131 | 132 | # 价格 133 | _price_url = "http://p.3.cn/prices/get?skuid=J_{pid}" 134 | price = None 135 | _price_res = fetch(_price_url.format(pid=pid)) 136 | 137 | 138 | if _price_res.status_code == 200: 139 | price = json.loads(_price_res.text)[0]['p'] 140 
|     else:
141 |         raise ValueError("failed to fetch the price for pid %s" % pid)
142 | 
143 |     # breadcrumb == category
144 |     _cat_body = parse_category(url, doc, res)
145 |     if not ProductAndCategory.select().where(ProductAndCategory.pid == pid).first():
146 |         _cat_body.update({'pid': int(pid)})
147 |         ProductAndCategory.create(**_cat_body)
148 | 
149 |     data = {
150 |         'pid': pid,
151 |         'purl': url,
152 |         'pname': pname.encode('utf-8'),
153 |         'brand': brand.encode('utf-8'),
154 |         'brand_img': brand_img,
155 |         'product_img': img_first,
156 |         'price': price,
157 |         'extra': json.dumps({'img': imgs})
158 |     }
159 |     try:
160 |         Product.create(**data)
161 |         logger.info('product saved successfully --- url: %s' % (url))
162 |     except IntegrityError:
163 |         logger.info('product failed: duplicate --- url: %s' % (url))
164 |     except Exception, e:
165 |         ex = traceback.format_exc()
166 |         logger.error('product failed: exception --- url: %s\n %s' % (url, ex))
167 | 
168 | 
169 | def extractor_worker():
170 |     func_map = {
171 |         #re.compile('http:\/\/list.jd.com\/list.html\?cat=.*&page=\d*&delivery'): extract_plist_url,
172 |         re.compile('http:\/\/list.jd.com\/list.html\?cat=.*&page=\d*'): extract_plist_url,
173 |         re.compile('^http:\/\/item.jd.com/\d*.html$'): extract_product_detail
174 |     }
175 | 
176 |     while 1:
177 |         url, view = queue_pop_result()
178 |         if not url:
179 |             time.sleep(1)
180 |             continue
181 |         mark = False
182 |         for reg, func in func_map.items():
183 |             if reg.search(url):
184 |                 mark = True
185 |                 doc = fromstring(view)
186 |                 try:
187 |                     func(url, doc, view)
188 |                 except Exception, e:
189 |                     logger.error('''Raise Error:\n
190 |                     url: %s\n
191 |                     error: %s\n''' % (url, traceback.format_exc()))
192 |         if not mark:
193 |             logger.error('url matched no handler regex -- url: %s' % url)
194 |         time.sleep(0.01)
195 | 
196 | 
197 | def test_one(url):
198 |     # manual check: fetch one product page and print its parsed breadcrumb categories
199 |     res = fetch(url)
200 |     doc = fromstring(res.text)
201 |     print parse_category(url, doc)
202 | 
203 | 
204 | if __name__ == "__main__":
205 |     extractor_worker()
206 | 
207 | 
-------------------------------------------------------------------------------- /feeds.py: --------------------------------------------------------------------------------
1 | plist_feeds_urls = [
2 |     'http://list.jd.com/list.html?cat=6196,6197,6199&page=1&delivery=1&trans=1&JL=4_10_0#J_main',
3 |     'http://list.jd.com/list.html?cat=6196,6197,6200&page=1&delivery=1&trans=1&JL=4_10_0#J_main',
4 |     'http://list.jd.com/list.html?cat=6196,6197,6202&page=1&delivery=1&trans=1&JL=4_10_0#J_main',
5 |     'http://list.jd.com/list.html?cat=6196,6197,6201&page=1&delivery=1&trans=1&JL=4_10_0#J_main',
6 |     'http://list.jd.com/list.html?cat=6196,6197,6203&page=1&delivery=1&trans=1&JL=4_10_0#J_main',
7 |     'http://list.jd.com/list.html?cat=6196,6197,6204&page=1&delivery=1&trans=1&JL=4_10_0#J_main',
8 |     'http://list.jd.com/list.html?cat=6196,6197,6205&page=1&delivery=1&trans=1&JL=4_10_0#J_main',
9 |     'http://list.jd.com/list.html?cat=6196,6197,6206&page=1&delivery=1&trans=1&JL=4_10_0#J_main',
10 |     'http://list.jd.com/list.html?cat=6196,6197,6207&page=1&delivery=1&trans=1&JL=4_10_0#J_main',
11 |     'http://list.jd.com/list.html?cat=6196,6197,11976&page=1&delivery=1&trans=1&JL=4_10_0#J_main',
12 |     'http://list.jd.com/list.html?cat=6196,6219,6223&page=1&delivery=1&trans=1&JL=4_10_0#J_main',
13 |     'http://list.jd.com/list.html?cat=6196,6219,6224&page=1&delivery=1&trans=1&JL=4_10_0#J_main',
14 |     'http://list.jd.com/list.html?cat=6196,6219,6220&page=1&delivery=1&trans=1&JL=4_10_0#J_main',
15 | 
'http://list.jd.com/list.html?cat=6196,6219,6221&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 16 | 'http://list.jd.com/list.html?cat=6196,6219,11979&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 17 | 'http://list.jd.com/list.html?cat=6196,6219,6850&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 18 | 'http://list.jd.com/list.html?cat=6196,6219,6225&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 19 | 'http://list.jd.com/list.html?cat=6196,6214,6215&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 20 | 'http://list.jd.com/list.html?cat=6196,6214,6218&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 21 | 'http://list.jd.com/list.html?cat=6196,6214,11977&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 22 | 'http://list.jd.com/list.html?cat=6196,6214,6216&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 23 | 'http://list.jd.com/list.html?cat=6196,6214,11978&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 24 | 'http://list.jd.com/list.html?cat=6196,6227,6228&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 25 | 'http://list.jd.com/list.html?cat=6196,6227,6230&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 26 | 'http://list.jd.com/list.html?cat=6196,6227,6231&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 27 | 'http://list.jd.com/list.html?cat=6196,6227,6232&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 28 | 'http://list.jd.com/list.html?cat=6196,6227,11975&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 29 | 'http://list.jd.com/list.html?cat=6196,11143,11148&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 30 | 'http://list.jd.com/list.html?cat=6196,11143,11149,11148&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 31 | 'http://list.jd.com/list.html?cat=6196,11143,11150,11148&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 32 | 'http://list.jd.com/list.html?cat=6196,11143,11155,11148&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 33 | 'http://list.jd.com/list.html?cat=6196,11143,11151,11148&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 34 | 'http://list.jd.com/list.html?cat=6196,11143,11152,11148&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 35 | 'http://list.jd.com/list.html?cat=6196,11143,11153,11148&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 36 | 'http://list.jd.com/list.html?cat=6196,11143,11154,11148&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 37 | 'http://list.jd.com/list.html?cat=6196,11143,11156,11148&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 38 | 'http://list.jd.com/list.html?cat=6196,6219,6222&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 39 | 'http://list.jd.com/list.html?cat=6196,6198,6211&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 40 | 'http://list.jd.com/list.html?cat=6196,6198,6212&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 41 | 'http://list.jd.com/list.html?cat=6196,6198,6209&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 42 | 'http://list.jd.com/list.html?cat=6196,6198,6210&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 43 | 'http://list.jd.com/list.html?cat=6196,6198,11980&page=1&delivery=1&trans=1&JL=4_10_0#J_main', 44 | 'http://list.jd.com/list.html?cat=6196,6198,11981&page=1&delivery=1&trans=1&JL=4_10_0#J_main' 45 | ] 46 | -------------------------------------------------------------------------------- /insert_feed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | from config import rd, PLIST_FEEDS_URLS, JD_URLS_TASK, PLIST_MORE_URLS 5 | 6 | 7 | def insert_feeds(feed_list): 8 | for _url in feed_list: 9 | print _url 10 | rd.sadd(JD_URLS_TASK, _url) 11 | 12 | 13 | if __name__ == "__main__": 14 | select = raw_input("选择抓取的类型\n全部 
all | 京东自营 ziying\n")
15 |     if select == "all":
16 |         insert_feeds(PLIST_MORE_URLS)
17 |     elif select == "ziying":
18 |         insert_feeds(PLIST_FEEDS_URLS)
19 |     else:
20 |         raise ValueError('expected "all" or "ziying"')
21 |     print "insert feeds success !!!"
22 | 
-------------------------------------------------------------------------------- /log.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding:utf-8 -*-
3 | 
4 | import os.path
5 | import logging
6 | from logging.handlers import TimedRotatingFileHandler
7 | 
8 | from config import log_file
9 | 
10 | 
11 | def get_logger(logfile, mark=None):
12 |     if mark:
13 |         logger = logging.getLogger(mark)
14 |     else:
15 |         logger = logging.getLogger()
16 |     logger.setLevel(logging.INFO)
17 | 
18 |     fmt = '%(asctime)s - %(process)s - %(name)s - %(levelname)s: - %(lineno)d - %(message)s'
19 |     formatter = logging.Formatter(fmt)
20 |     handler = logging.FileHandler(logfile)
21 |     handler.setFormatter(formatter)
22 |     logger.addHandler(handler)
23 |     return logger
24 | 
25 | 
26 | logger = get_logger(log_file)
27 | 
28 | 
29 | if __name__ == "__main__":
30 |     logger.info('test')
31 | 
32 | 
-------------------------------------------------------------------------------- /models.py: --------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | from datetime import datetime
4 | 
5 | from peewee import *
6 | 
7 | from config import mysql_config
8 | 
9 | 
10 | db = MySQLDatabase(mysql_config['db'], host=mysql_config['host'], user=mysql_config['user'], passwd=mysql_config['passwd'], threadlocals=True, charset='utf8mb4')
11 | 
12 | 
13 | class Product(Model):
14 |     pid = BigIntegerField(primary_key=True)
15 |     purl = CharField(unique=True, index=True)
16 |     pname = CharField(max_length=200)
17 |     brand = CharField(max_length=30)
18 |     brand_img = CharField()
19 |     product_img = CharField()
20 |     price = FloatField()
21 |     extra = BlobField()
22 |     created_on = DateTimeField(default=datetime.now)
23 | 
24 |     class Meta:
25 |         database = db
26 |         db_table = 'product'
27 | 
28 | 
29 | class Category(Model):
30 |     cat_id = IntegerField(primary_key=True)
31 |     cat_name = CharField()
32 |     cat_url = CharField()
33 |     level = IntegerField()
34 |     created_on = DateTimeField(default=datetime.now)
35 | 
36 |     class Meta:
37 |         database = db
38 |         db_table = 'category'
39 | 
40 | 
41 | class ProductAndCategory(Model):
42 |     pid = BigIntegerField(primary_key=True)
43 |     top_id = IntegerField()     # e.g. kitchenware (level 1)
44 |     second_id = IntegerField()  # e.g. cooking pots (level 2)
45 |     third_id = IntegerField()   # e.g. woks (level 3)
46 |     top_name = CharField()
47 |     second_name = CharField()
48 |     third_name = CharField()
49 |     created_on = DateTimeField(default=datetime.now)
50 | 
51 |     class Meta:
52 |         database = db
53 |         db_table = 'product_and_cat'
54 | 
55 | 
56 | if __name__ == '__main__':
57 |     db.connect()
58 |     select = raw_input("Drop all tables and recreate them? Y/n \n")
59 |     for one in [Product, Category, ProductAndCategory]:
60 |         if select == "Y":
61 |             one.drop_table(fail_silently=True)
62 |         try:
63 |             one.create_table()
64 |         except OperationalError:
65 |             print one, "already exists"
66 | 
-------------------------------------------------------------------------------- /scheduler.py: --------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding:utf-8 -*-
3 | 
4 | import os
5 | import time
6 | import signal
7 | import logging
8 | from multiprocessing import Process, Value
9 | 
10 | from setproctitle import setproctitle
11 | 
12 | from worker 
import extractor_handler, spider_handler 13 | from config import * 14 | from log import logger 15 | from daemonize import daemonize 16 | 17 | 18 | jobs = {} 19 | is_running = True 20 | running_status = Value('d', True) 21 | 22 | 23 | #判断进程及lock是否存在 24 | def set_exists_pid(): 25 | continue_status = False 26 | if os.path.exists(pid_file): 27 | with open(pid_file,'r') as f: 28 | pid = f.read() 29 | if len(pid) == 0: 30 | continue_status = True 31 | else: 32 | pid = int(pid) 33 | if check_status(pid): 34 | return False 35 | else: 36 | continue_status = True 37 | else: 38 | continue_status = True 39 | 40 | if continue_status: 41 | with open(pid_file,'w') as f: 42 | logger.info('write pid %s'%os.getpid()) 43 | f.write(str(os.getpid())) 44 | return continue_status 45 | 46 | 47 | #接收信号,比如 kill,或者是键盘 ctrl c 48 | def sig_handler(num, stack): 49 | logger.info('receiving signal, exiting...') 50 | global is_running 51 | global running_status 52 | running_status.value = False 53 | is_running = False 54 | 55 | 56 | #添加进程 57 | def sig_add(num, stack): 58 | logger.info('receiving add signal, Add Process...') 59 | #res = fork_process(process_num) 60 | res = fork_process(1) 61 | jobs.update(res) 62 | 63 | 64 | #亲切的干掉一个进程 65 | def sig_reduce(num, stack): 66 | logger.info('receiving signal, Reduce Process...') 67 | for pid,pid_obj in jobs.iteritems(): 68 | jobs[pid]['is_running'] = False 69 | time.sleep(5) 70 | if pid_obj['obj'].is_alive(): 71 | pid_obj['obj'].terminate() 72 | # os.kill(pid, signal.SIGKILL) 73 | logger.info('receiving reduce signal,%s be killed'%pid) 74 | return 75 | 76 | 77 | #调用工作函数的入口 78 | def request_worker(func,process_name): 79 | setproctitle(process_name) #设置进程的名字 80 | # global running_status 81 | logger.info("child pid %s"%os.getpid()) 82 | counter = 0 83 | while running_status.value: 84 | s = func() 85 | if s: #如果有返回值,那么判断该任务只想运行一次 86 | break 87 | 88 | 89 | #fork进程 90 | def fork_process(x): 91 | jobs = {} 92 | for i in xrange(x): 93 | detail = {} 94 | p = Process(target = request_worker, args = (spider_handler, "spider :fetch")) 95 | p.start() 96 | detail['obj'] = p 97 | detail['is_running'] = True 98 | jobs[p.pid] = detail 99 | return jobs 100 | 101 | 102 | #探测一个进程的状态 103 | def check_status(pid): 104 | try: 105 | os.kill(pid,0) 106 | return True 107 | except: 108 | return False 109 | 110 | 111 | #管理进程总控 112 | def spawn_worker(): 113 | parent_id = os.getpid() 114 | p = Process(target = request_worker, args = (extractor_handler, "spider :extractor")) 115 | p.start() 116 | detail = {} 117 | detail['obj'] = p 118 | detail['is_running'] = True 119 | jobs[p.pid] = detail 120 | res = fork_process(process_num) 121 | jobs.update(res) 122 | while is_running: 123 | time.sleep(0.01) 124 | #第一种方法,调用非阻塞waitpid方法收尸 125 | if len(jobs) < process_num: 126 | res = fork_process(process_num - len(jobs)) 127 | jobs.update(res) 128 | for pid in jobs.keys(): 129 | try: 130 | if not check_status(pid): 131 | # if not jobs[pid]['obj'].is_alive(): 132 | del jobs[pid] 133 | os.waitpid(pid, os.WNOHANG) 134 | except: 135 | pass 136 | else: 137 | _c = 0 138 | interval = 0.1 139 | while 1: 140 | logger.info(str(_c)) 141 | logger.info(str(jobs)) 142 | if _c >= 30 or len(jobs) == 0: 143 | break 144 | for pid in jobs.keys(): 145 | if not check_status(pid): 146 | jobs.pop(pid) 147 | _c += 1 148 | time.sleep(0.1) 149 | for pid in jobs: 150 | try: 151 | os.kill(pid,signal.SIGKILL) 152 | except: 153 | pass 154 | os.remove(pid_file) 155 | 156 | 157 | if __name__ == '__main__': 158 | if not set_exists_pid(): 159 | 
logger.error("service is alive") 160 | raise("service is alive") 161 | if daemon_flag: 162 | daemonize() 163 | setproctitle("spider :Master") 164 | signal.signal(signal.SIGINT, sig_handler) 165 | signal.signal(signal.SIGTERM, sig_handler) 166 | signal.signal(signal.SIGTTIN, sig_add) 167 | signal.signal(signal.SIGTTOU, sig_reduce) 168 | #第二种方法,直接忽视子进程退出前发出的sigchld信号,交给内核,让内核来收拾,其实也是让内核用waitpid来解决。 169 | signal.signal(signal.SIGCHLD, signal.SIG_IGN) 170 | logger.info('main process: %d start', os.getpid()) 171 | spawn_worker() 172 | logger.info('main: %d kill all jobs done', os.getpid()) 173 | 174 | -------------------------------------------------------------------------------- /spider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | import time 5 | import json 6 | import socket 7 | import logging 8 | import traceback 9 | 10 | import redis 11 | import requests 12 | from user_agent import generate_user_agent 13 | 14 | from utils import * 15 | from config import * 16 | from log import logger 17 | 18 | 19 | logging.getLogger("requests").setLevel(logging.WARNING) 20 | 21 | 22 | def fetch(url): 23 | user_agent = {'User-Agent': generate_user_agent()} 24 | res = None 25 | try: 26 | res = requests.get(url, headers=user_agent, timeout=spider_limiter['timeout']) 27 | except requests.exceptions.Timeout as e: 28 | logger.error("fetch faild !!! url:%s connect timeout", url) 29 | except requests.exceptions.TooManyRedirects as e: 30 | logger.error("fetch faild !!! url:%s redirect more than 3 times", url) 31 | except requests.exceptions.ConnectionError as e: 32 | logger.error("fetch faild !!! url:%s connect error", url) 33 | except socket.timeout as e: 34 | logger.error("fetch faild !!! url:%s recv timetout", url) 35 | except: 36 | logger.error("fetch faild !!! url:%s %s"%(url, traceback.format_exc())) 37 | 38 | if res and res.status_code == 200: 39 | logger.info("fetch success code: %s , url: %s"%(res.status_code, url)) 40 | else: 41 | queue_push_url(url) 42 | logger.error("fetch faild !!! 
url: %s"%(url)) 43 | return res 44 | 45 | 46 | def spider_worker(): 47 | while 1: 48 | url = queue_pop_url() 49 | if not url: 50 | time.sleep(1) 51 | continue 52 | res = fetch(url) 53 | if res and res.status_code == 200: 54 | data = {'url': url, 'view': res.text} 55 | queue_push_result(data) 56 | time.sleep(spider_limiter['success_t']) 57 | else: 58 | time.sleep(spider_limiter['faild_t']) 59 | logger.warning('spdier error will sleep ...') 60 | 61 | 62 | if __name__ == "__main__": 63 | spider_worker() 64 | 65 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | import json 5 | from config import * 6 | 7 | 8 | def queue_push_url(url): 9 | return rd.sadd(JD_URLS_TASK, url) 10 | 11 | 12 | def queue_pop_url(): 13 | url = rd.spop(JD_URLS_TASK) 14 | return url 15 | 16 | 17 | def queue_push_result(data): 18 | return rd.rpush(JD_URLS_RESULT, json.dumps(data)) 19 | 20 | 21 | def queue_pop_result(): 22 | url, view = None, None 23 | if DEBUG: 24 | res = rd.lrange(JD_URLS_RESULT,0,1)[0] # for test 25 | else: 26 | res = rd.lpop(JD_URLS_RESULT) 27 | if res: 28 | data = json.loads(res) 29 | url, view = data['url'], data['view'] 30 | return url, view 31 | 32 | 33 | def perfect_href(url): 34 | if url.startswith('http:'): 35 | return url 36 | else: 37 | return "http:" + url 38 | 39 | -------------------------------------------------------------------------------- /worker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | from log import logger 5 | 6 | from spider import spider_worker 7 | from extractor import extractor_worker 8 | 9 | 10 | #你的业务逻辑 11 | def spider_handler(): 12 | spider_worker() 13 | logger.info('this is spider_worker') 14 | 15 | 16 | def extractor_handler(): 17 | extractor_worker() 18 | logger.info('this is extractor_worker') 19 | 20 | ALLOW_METHOD = [{"func":spider_handler, "counte":2}, {"func":extractor_handler, "count":1}] 21 | 22 | --------------------------------------------------------------------------------