├── .idea
│   ├── inspectionProfiles
│   │   ├── Project_Default.xml
│   │   └── profiles_settings.xml
│   ├── jingdong.iml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   └── workspace.xml
├── README.md
├── good.png
├── goods.xls
├── jd_comment.sql
├── jd_goods.sql
├── jd_spider
│   ├── __init__.py
│   ├── __init__.pyc
│   ├── items.py
│   ├── items.pyc
│   ├── middlewares.py
│   ├── middlewares.pyc
│   ├── pipelines.py
│   ├── pipelines.pyc
│   ├── settings.py
│   ├── settings.pyc
│   └── spiders
│       ├── __init__.py
│       ├── __init__.pyc
│       ├── jd_comment.py
│       ├── jd_comment.pyc
│       ├── jd_home.py
│       └── jd_home.pyc
├── scrapy.cfg
└── test.xlsx
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# jd_spider

A JD.com spider built on the Scrapy framework; it can scrape JD product information and product reviews.

# 1. Goals

1. Scrape JD product information (using the e-cigarette category as the example)

2. Scrape the reviews of those products

# 2. The scraped data fields

## Product data

![image](http://img.blog.csdn.net/20160706175045922)

## Review data

![image](http://img.blog.csdn.net/20160706174754513)

# 3. Usage

## (1) Product scraping and review scraping live in two separate spiders

The product-scraping code is in jd_home.py; for the database side, set ITEM_PIPELINES in settings.py to MySQLPipeline.

The review-scraping code is in jd_comment.py; set ITEM_PIPELINES in settings.py to CommentPipeline. The switch looks like the sketch below.
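
For reference, the switch is simply which entry is commented out in ITEM_PIPELINES; this mirrors the block shipped in this project's settings.py:

```python
# settings.py: enable exactly one pipeline, matching the spider you are about to run
ITEM_PIPELINES = {
    'jd_spider.pipelines.MySQLPipeline': 300,      # product spider (jd_home.py)
    # 'jd_spider.pipelines.CommentPipeline': 300,  # review spider (jd_comment.py)
}
```

The spiders are registered as `jd` (jd_home.py) and `comment` (jd_comment.py), so after switching the pipeline run `scrapy crawl jd` or `scrapy crawl comment` from the project root.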

## (2) The settings.py file

Proxy IPs are enabled by default. Because free proxies only stay alive for a limited time, update the entries in PROXIES regularly; free proxies can be found at http://www.xicidaili.com/.

If you do not want to use proxy IPs, comment out the DOWNLOADER_MIDDLEWARES block, as shown in the sketch below.
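
Both settings live in settings.py. A trimmed view of the shipped defaults follows; the proxy entries are the ones checked into the repo and are most likely dead by now, so treat them as placeholders for your own fresh proxies:

```python
# settings.py: proxies rotated by ProxyMiddleware; refresh these regularly
PROXIES = [
    {'ip_port': '202.108.2.42:80', 'user_pass': ''},
    {'ip_port': '122.96.59.104:80', 'user_pass': ''},
    # add more live proxies here
]

# Comment out this whole dict to crawl without proxies
# (note: this also disables the random User-Agent rotation)
DOWNLOADER_MIDDLEWARES = {
    'jd_spider.middlewares.RandomUserAgent': 1,
    'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 110,
    'jd_spider.middlewares.ProxyMiddleware': 100,
}
```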

Database configuration:

- settings.py holds the database host, port, user name, password, and database name (see the sketch after this list)
- pipelines.py is where the target table name is set, inside the SQL statements
- Table schemas:
  - jd_comment.sql: the review table
  - jd_goods.sql: the product table
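
The connection block in settings.py looks like this (the values shown are the shipped defaults; point them at your own MySQL instance):

```python
# settings.py: MySQL connection used by MySQLPipeline / CommentPipeline
DB_HOST = 'localhost'      # database host
DB_PORT = 3306             # database port
DB_USER = 'root'           # user name
DB_PASSWD = 'xiaoquantou'  # password (shipped default, change it)
DB_DB = 'test'             # database name
```

Import jd_goods.sql and jd_comment.sql into that database first, so the jd_goods and jd_comment tables the pipelines insert into already exist.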

The review spider depends on a goods.xls file, so scrape the product information first and then export the relevant product fields to goods.xls (a sample goods.xls is included as a reference for the format).

goods.xls format: column 1 is the product ID, column 2 is the product's review count, and column 3 is the product's commentVersion. The sketch below shows how the spider reads these columns.

Within a single project, product scraping and review scraping cannot run at the same time.
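
For reference, this is roughly how jd_comment.py consumes goods.xls (condensed from the spider shipped in this repo); it builds one request URL per page of 10 reviews for every product row:

```python
import xlrd

# goods.xls: column 0 = product ID, column 1 = review count, column 2 = commentVersion
data = xlrd.open_workbook("goods.xls")
table = data.sheets()[0]
good_id = table.col_values(0)
comment_n = table.col_values(1)
comment_V = table.col_values(2)

start_urls = []
for i in range(len(good_id)):
    pages = (int(comment_n[i]) + 9) // 10  # 10 reviews per page, rounded up
    for k in range(pages):
        start_urls.append(
            "http://sclub.jd.com/productpage/p-" + str(int(good_id[i]))
            + "-s-0-t-3-p-" + str(k)
            + ".html?callback=fetchJSON_comment98vv" + str(comment_V[i])
        )
```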

More details about the spiders are covered in my blog posts:

- http://blog.csdn.net/xiaoquantouer/article/details/51840332

- http://blog.csdn.net/xiaoquantouer/article/details/51841016
## Feel free to leave a comment if you have any questions

--------------------------------------------------------------------------------
/good.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoquantou/jd_spider/e2863c5d5144e501b931450939b89272498b3a4b/good.png
--------------------------------------------------------------------------------
/goods.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoquantou/jd_spider/e2863c5d5144e501b931450939b89272498b3a4b/goods.xls
--------------------------------------------------------------------------------
/jd_comment.sql:
--------------------------------------------------------------------------------
1 | /*
2 | Navicat MySQL Data Transfer
3 | 
4 | Source Server : test
5 | Source Server Version : 50621
6 | Source Host : 127.0.0.1:3306
7 | Source Database : test
8 | 
9 | Target Server Type : MYSQL
10 | Target Server Version : 50621
11 | File Encoding : 65001
12 | 
13 | Date: 2016-09-20 19:05:04
14 | */
15 | 
16 | SET FOREIGN_KEY_CHECKS=0;
17 | 
18 | -- ----------------------------
19 | -- Table structure for jd_comment
20 | -- ----------------------------
21 | DROP TABLE IF EXISTS `jd_comment`;
22 | CREATE TABLE `jd_comment` (
23 |   `user_name` varchar(255) DEFAULT NULL,
24 |   `user_ID` varchar(255) DEFAULT NULL,
25 |   `userProvince` varchar(255) DEFAULT NULL,
26 |   `content` varchar(255) DEFAULT NULL,
27 |   `good_ID` varchar(255) DEFAULT NULL,
28 |   `good_name` varchar(255) DEFAULT NULL,
29 |   `date` varchar(255) DEFAULT NULL,
30 |   `replyCount` varchar(255) DEFAULT NULL,
31 |   `score` varchar(255) DEFAULT NULL,
32 |   `status` varchar(255) DEFAULT NULL,
33 |   `title` varchar(255) DEFAULT NULL,
34 |   `userRegisterTime` varchar(255) DEFAULT NULL,
35 |   `productColor` varchar(255) DEFAULT NULL,
36 |   `productSize` varchar(255) DEFAULT NULL,
37 |   `userLevelName` varchar(255) DEFAULT NULL,
38 |   `isMobile` varchar(255) DEFAULT NULL,
39 |   `days` varchar(255) DEFAULT NULL,
40 |   `tags` varchar(255) DEFAULT NULL
41 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
42 | 
43 | -- ----------------------------
44 | -- Records of jd_comment
45 | -- ----------------------------
46 | 
--------------------------------------------------------------------------------
/jd_goods.sql:
--------------------------------------------------------------------------------
1 | /*
2 | Navicat MySQL Data Transfer
3 | 
4 | Source Server : test
5 | Source Server Version : 50621
6 | Source Host : 127.0.0.1:3306
7 | Source Database : test
8 | 
9 | Target Server Type : MYSQL
10 | Target Server Version : 50621
11 | File Encoding : 65001
12 | 
13 | Date: 2016-09-20 19:05:13
14 | */
15 | 
16 | SET FOREIGN_KEY_CHECKS=0;
17 | 
18 | -- ----------------------------
19 | -- Table structure for jd_goods
20 | -- ----------------------------
21 | DROP TABLE IF EXISTS `jd_goods`;
22 | CREATE TABLE `jd_goods` (
23 |   `ID` varchar(255) DEFAULT NULL,
24 |   `name` varchar(255) DEFAULT NULL,
25 |   `comment_num` varchar(255) DEFAULT NULL,
26 |   `shop_name` varchar(255) DEFAULT NULL,
27 |   `link` varchar(255) DEFAULT NULL,
28 |   `commentVersion` varchar(255) DEFAULT NULL,
29 |   `score1count` varchar(255) DEFAULT NULL,
30 |   `score2count` varchar(255) DEFAULT NULL,
31 |   `score3count` varchar(255) DEFAULT NULL,
32 |   `score4count` varchar(255) DEFAULT NULL,
33 |   `score5count` varchar(255) DEFAULT NULL,
34 |   `price` varchar(255) DEFAULT NULL
35 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
36 | 
37 | -- ----------------------------
38 | -- 
Records of jd_goods 39 | -- ---------------------------- 40 | -------------------------------------------------------------------------------- /jd_spider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoquantou/jd_spider/e2863c5d5144e501b931450939b89272498b3a4b/jd_spider/__init__.py -------------------------------------------------------------------------------- /jd_spider/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoquantou/jd_spider/e2863c5d5144e501b931450939b89272498b3a4b/jd_spider/__init__.pyc -------------------------------------------------------------------------------- /jd_spider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | from scrapy import Item, Field 10 | 11 | 12 | class JdSpiderItem(scrapy.Item): 13 | # define the fields for your item here like: 14 | # name = scrapy.Field() 15 | pass 16 | 17 | 18 | class goodsItem(Item): 19 | link = Field() # 商品链接 20 | ID = Field() # 商品ID 21 | name = Field() # 商品名字 22 | comment_num = Field() # 评论人数 23 | shop_name = Field() # 店家名字 24 | price = Field() # 价钱 25 | commentVersion = Field() # 为了得到评论的地址需要该字段 26 | score1count = Field() # 评分为1星的人数 27 | score2count = Field() # 评分为2星的人数 28 | score3count = Field() # 评分为3星的人数 29 | score4count = Field() # 评分为4星的人数 30 | score5count = Field() # 评分为5星的人数 31 | 32 | 33 | class commentItem(Item): 34 | user_name = Field() # 评论用户的名字 35 | user_ID = Field() # 评论用户的ID 36 | userProvince = Field() # 评论用户来自的地区 37 | content = Field() # 评论内容 38 | good_ID = Field() # 评论的商品ID 39 | good_name = Field() # 评论的商品名字 40 | date = Field() # 评论时间 41 | replyCount = Field() # 回复数 42 | score = Field() # 评分 43 | status = Field() # 状态 44 | title = Field() 45 | userLevelId = Field() 46 | userRegisterTime = Field() # 用户注册时间 47 | productColor = Field() # 商品颜色 48 | productSize = Field() # 商品大小 49 | userLevelName = Field() # 银牌会员,钻石会员等 50 | userClientShow = Field() # 来自什么 比如来自京东客户端 51 | isMobile = Field() # 是否来自手机 52 | days = Field() # 天数 53 | commentTags = Field() # 标签 54 | -------------------------------------------------------------------------------- /jd_spider/items.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoquantou/jd_spider/e2863c5d5144e501b931450939b89272498b3a4b/jd_spider/items.pyc -------------------------------------------------------------------------------- /jd_spider/middlewares.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | __author__ = 'jiangqiaowei' 3 | import random 4 | import base64 5 | from settings import PROXIES 6 | 7 | 8 | # 主要用来动态获取user agent, user agent列表USER_AGENTS在setting.py中进行配置 9 | class RandomUserAgent(object): 10 | """Randomly rotate user agents based on a list of predefined ones""" 11 | 12 | def __init__(self, agents): 13 | self.agents = agents 14 | 15 | @classmethod 16 | def from_crawler(cls, crawler): 17 | return cls(crawler.settings.getlist('USER_AGENTS')) 18 | 19 | def process_request(self, request, spider): 20 | #print "**************************" + random.choice(self.agents) 21 | request.headers.setdefault('User-Agent', random.choice(self.agents)) 22 | 23 | 
24 | # 用来切换代理,proxy列表PROXIES也是在settings.py中进行配置 25 | class ProxyMiddleware(object): 26 | def process_request(self, request, spider): 27 | proxy = random.choice(PROXIES) 28 | if proxy['user_pass'] is not None: 29 | request.meta['proxy'] = "http://%s" % proxy['ip_port'] 30 | encoded_user_pass = base64.encodestring(proxy['user_pass']) 31 | request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass 32 | print "**************ProxyMiddleware have pass************" + proxy['ip_port'] 33 | else: 34 | print "**************ProxyMiddleware no pass************" + proxy['ip_port'] 35 | request.meta['proxy'] = "http://%s" % proxy['ip_port'] -------------------------------------------------------------------------------- /jd_spider/middlewares.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoquantou/jd_spider/e2863c5d5144e501b931450939b89272498b3a4b/jd_spider/middlewares.pyc -------------------------------------------------------------------------------- /jd_spider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import MySQLdb.cursors 8 | from twisted.enterprise import adbapi 9 | 10 | from scrapy.xlib.pydispatch import dispatcher 11 | from scrapy import signals 12 | from scrapy.utils.project import get_project_settings 13 | from scrapy import log 14 | 15 | SETTINGS = get_project_settings() 16 | 17 | 18 | class MySQLPipeline(object): 19 | @classmethod 20 | def from_crawler(cls, crawler): 21 | return cls(crawler.stats) 22 | 23 | def __init__(self, stats): 24 | # Instantiate DB 25 | self.dbpool = adbapi.ConnectionPool('MySQLdb', 26 | host=SETTINGS['DB_HOST'], 27 | user=SETTINGS['DB_USER'], 28 | passwd=SETTINGS['DB_PASSWD'], 29 | port=SETTINGS['DB_PORT'], 30 | db=SETTINGS['DB_DB'], 31 | charset='utf8', 32 | use_unicode=True, 33 | cursorclass=MySQLdb.cursors.DictCursor 34 | ) 35 | self.stats = stats 36 | dispatcher.connect(self.spider_closed, signals.spider_closed) 37 | 38 | def spider_closed(self, spider): 39 | """ Cleanup function, called after crawing has finished to close open 40 | objects. 41 | Close ConnectionPool. 
""" 42 | self.dbpool.close() 43 | 44 | def process_item(self, item, spider): 45 | query = self.dbpool.runInteraction(self._insert_record, item) 46 | query.addErrback(self._handle_error) 47 | return item 48 | 49 | def _insert_record(self, tx, item): 50 | ID = item['ID'][0] 51 | name = item['name'][0] 52 | comment_num = str(item['comment_num']) 53 | shop_name = item['shop_name'][0] 54 | link = item['link'][0] 55 | commentVersion = str(item['commentVersion']) 56 | commentVersion = commentVersion[1:-1] 57 | 58 | score1count = str(item['score1count']) 59 | score2count = str(item['score2count']) 60 | score3count = str(item['score3count']) 61 | score4count = str(item['score4count']) 62 | score5count = str(item['score5count']) 63 | 64 | price = str(item['price']) 65 | 66 | ID = ID.encode('utf-8') 67 | name = name.encode('utf-8') 68 | comment_num = comment_num.encode('utf-8') 69 | shop_name = shop_name.encode('utf-8') 70 | link = link.encode('utf-8') 71 | commentVersion = commentVersion.encode('utf-8') 72 | score1count = score1count.encode('utf-8') 73 | score2count = score2count.encode('utf-8') 74 | score3count = score3count.encode('utf-8') 75 | score4count = score4count.encode('utf-8') 76 | score5count = score5count.encode('utf-8') 77 | price = price.encode('utf-8') 78 | 79 | sql = "INSERT INTO jd_goods VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % \ 80 | (ID, name, comment_num, shop_name, link, commentVersion, score1count, score2count, score3count, 81 | score4count, score5count, price) 82 | tx.execute(sql) 83 | print "yes" 84 | 85 | def _handle_error(self, e): 86 | log.err(e) 87 | 88 | 89 | class CommentPipeline(object): 90 | @classmethod 91 | def from_crawler(cls, crawler): 92 | return cls(crawler.stats) 93 | 94 | def __init__(self, stats): 95 | # Instantiate DB 96 | self.dbpool = adbapi.ConnectionPool('MySQLdb', 97 | host=SETTINGS['DB_HOST'], 98 | user=SETTINGS['DB_USER'], 99 | passwd=SETTINGS['DB_PASSWD'], 100 | port=SETTINGS['DB_PORT'], 101 | db=SETTINGS['DB_DB'], 102 | charset='utf8', 103 | use_unicode=True, 104 | cursorclass=MySQLdb.cursors.DictCursor 105 | ) 106 | self.stats = stats 107 | dispatcher.connect(self.spider_closed, signals.spider_closed) 108 | 109 | def spider_closed(self, spider): 110 | """ Cleanup function, called after crawing has finished to close open 111 | objects. 112 | Close ConnectionPool. 
""" 113 | self.dbpool.close() 114 | 115 | def process_item(self, item, spider): 116 | query = self.dbpool.runInteraction(self._insert_record, item) 117 | query.addErrback(self._handle_error) 118 | return item 119 | 120 | def _insert_record(self, tx, item): 121 | user_name = item['user_name'] 122 | user_ID = item['user_ID'] 123 | userProvince = item['userProvince'] 124 | content = item['content'] 125 | good_ID = item['good_ID'] 126 | good_name = item['good_name'] 127 | date = item['date'] 128 | replyCount = item['replyCount'] 129 | score = item['score'] 130 | status = item['status'] 131 | title = item['title'] 132 | userRegisterTime = item['userRegisterTime'] 133 | productColor = item['productColor'] 134 | productSize = item['productSize'] 135 | userLevelName = item['userLevelName'] 136 | isMobile = item['isMobile'] 137 | days = item['days'] 138 | tags = item['commentTags'] 139 | 140 | sql = "INSERT INTO jd_comment VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s'," \ 141 | "'%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" % \ 142 | (user_name, user_ID, userProvince, content, good_ID, good_name, date, replyCount, score, 143 | status, title, userRegisterTime, productColor, productSize, userLevelName, 144 | isMobile, days, tags) 145 | 146 | tx.execute(sql) 147 | print "yes" 148 | 149 | def _handle_error(self, e): 150 | log.err(e) 151 | -------------------------------------------------------------------------------- /jd_spider/pipelines.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoquantou/jd_spider/e2863c5d5144e501b931450939b89272498b3a4b/jd_spider/pipelines.pyc -------------------------------------------------------------------------------- /jd_spider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for jd_spider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'jd_spider' 13 | 14 | SPIDER_MODULES = ['jd_spider.spiders'] 15 | NEWSPIDER_MODULE = 'jd_spider.spiders' 16 | 17 | USER_AGENTS = [ 18 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 19 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", 20 | "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", 21 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", 22 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", 23 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", 24 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", 25 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", 26 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", 27 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", 28 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", 29 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", 30 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", 31 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", 32 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", 33 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52", 34 | ] 35 | 36 | # 这里使用的代理IP,因为IP的存活期的限制,请定期更新下面的IP,可从http://www.xicidaili.com/ 中找免费的代理IP 37 | PROXIES = [ 38 | {'ip_port': '202.108.2.42:80', 'user_pass': ''}, 39 | {'ip_port': '122.96.59.104:80', 'user_pass': ''}, 40 | {'ip_port': '120.76.243.40:80', 'user_pass': ''}, 41 | {'ip_port': '139.196.108.68:80', 'user_pass': ''}, 42 | {'ip_port': '60.194.100.51:80', 'user_pass': ''}, 43 | {'ip_port': '202.171.253.72:80', 'user_pass': ''}, 44 | {'ip_port': '123.56.74.13:8080', 'user_pass': ''}, 45 | ] 46 | 47 | COOKIES_ENABLED = False 48 | 49 | 50 | # 如果不想使用代理IP,可以将下面这段DOWNLOADER_MIDDLEWARES代码注释掉 51 | DOWNLOADER_MIDDLEWARES = { 52 | 'jd_spider.middlewares.RandomUserAgent': 1, 53 | 'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 110, 54 | 'jd_spider.middlewares.ProxyMiddleware': 100, 55 | } 56 | 57 | DOWNLOAD_DELAY = 7 # 下载延迟 58 | 59 | LOG_LEVEL = 'INFO' 60 | 61 | # 数据库的配置,请将下面的换成你自己的数据库配置 62 | DB_HOST = 'localhost' # 主机名 63 | DB_PORT = 3306 # 端口号 64 | DB_USER = 'root' # 用户名 65 | DB_PASSWD = 'xiaoquantou' # 密码 66 | DB_DB = 'test' # 数据库名 67 | 68 | ITEM_PIPELINES = { 69 | 'jd_spider.pipelines.MySQLPipeline': 300, # 抓取商品信息时,使用该通道 70 | # 'jd_spider.pipelines.CommentPipeline': 300, # 
抓取评论信息时,使用该通道 71 | } 72 | -------------------------------------------------------------------------------- /jd_spider/settings.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoquantou/jd_spider/e2863c5d5144e501b931450939b89272498b3a4b/jd_spider/settings.pyc -------------------------------------------------------------------------------- /jd_spider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /jd_spider/spiders/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoquantou/jd_spider/e2863c5d5144e501b931450939b89272498b3a4b/jd_spider/spiders/__init__.pyc -------------------------------------------------------------------------------- /jd_spider/spiders/jd_comment.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy.spiders import Spider 3 | from jd_spider.items import commentItem 4 | import json 5 | import xlrd 6 | 7 | 8 | class comment_spider(Spider): 9 | name = "comment" 10 | xlrd.Book.encoding = "utf-8" 11 | data = xlrd.open_workbook("goods.xls") 12 | # goods为要抓取评论的商品信息,现提供一个goods.xls文件供参考,第1列:商品ID;第2列:商品评论数;第3列:商品的commentVersion 13 | # test.xlsx也可以使用 14 | table = data.sheets()[0] 15 | nrows = table.nrows # 行数 16 | ncols = table.ncols # 列数 17 | good_id = table.col_values(0) # 商品ID 18 | comment_n = table.col_values(1) # 商品评论数 19 | comment_V = table.col_values(2) # 商品评论的commentVersion 20 | 21 | start_urls = [] 22 | for i in range(len(good_id)): # 一件商品一件商品的抽取 23 | good_num = int(good_id[i]) 24 | comment_total = int(comment_n[i]) 25 | if comment_total % 10 == 0: # 算出评论的页数,一页10条评论 26 | page = comment_total/10 27 | else: 28 | page = comment_total/10 + 1 29 | for k in range(0, page): 30 | url = "http://sclub.jd.com/productpage/p-" + str(good_num) + "-s-0-t-3-p-" + str(k) \ 31 | + ".html?callback=fetchJSON_comment98vv" + str(comment_V[i]) 32 | start_urls.append(url) 33 | 34 | def parse(self, response): 35 | temp1 = response.body.split('productAttr') 36 | str = '{"productAttr' + temp1[1][:-2] 37 | str = str.decode("gbk").encode("utf-8") 38 | js = json.loads(unicode(str, "utf-8")) 39 | comments = js['comments'] # 该页所有评论 40 | 41 | items = [] 42 | for comment in comments: 43 | item1 = commentItem() 44 | item1['user_name'] = comment['nickname'] 45 | item1['user_ID'] = comment['id'] 46 | item1['userProvince'] = comment['userProvince'] 47 | item1['content'] = comment['content'] 48 | item1['good_ID'] = comment['referenceId'] 49 | item1['good_name'] = comment['referenceName'] 50 | item1['date'] = comment['referenceTime'] 51 | item1['replyCount'] = comment['replyCount'] 52 | item1['score'] = comment['score'] 53 | item1['status'] = comment['status'] 54 | title = "" 55 | if comment.has_key('title'): 56 | item1['title'] = comment['title'] 57 | item1['title'] = title 58 | item1['userRegisterTime'] = comment['userRegisterTime'] 59 | item1['productColor'] = comment['productColor'] 60 | item1['productSize'] = comment['productSize'] 61 | item1['userLevelName'] = comment['userLevelName'] 62 | item1['isMobile'] = comment['isMobile'] 63 | item1['days'] = comment['days'] 64 | tags 
= "" 65 | if comment.has_key('commentTags'): 66 | for i in comment['commentTags']: 67 | tags = tags + i['name'] + " " 68 | item1['commentTags'] = tags 69 | items.append(item1) 70 | return items -------------------------------------------------------------------------------- /jd_spider/spiders/jd_comment.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoquantou/jd_spider/e2863c5d5144e501b931450939b89272498b3a4b/jd_spider/spiders/jd_comment.pyc -------------------------------------------------------------------------------- /jd_spider/spiders/jd_home.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy.spiders import Spider 3 | from jd_spider.items import goodsItem 4 | from scrapy.selector import Selector 5 | import scrapy 6 | import re 7 | import json 8 | 9 | 10 | class jd_spider(Spider): 11 | name = "jd" 12 | start_urls = [] 13 | for i in range(1, 11): # 这里需要自己设置页数,目前只能抓取电子烟分类下前10页的商品 14 | url = 'http://list.jd.com/list.html?cat=1672,2599,1440&ev=111217_635585&page=' + str(i) 15 | start_urls.append(url) 16 | 17 | def parse_price(self, response): 18 | item1 = response.meta['item'] 19 | temp1 = response.body.split('jQuery([') 20 | s = temp1[1][:-4] # 获取到需要的json内容 21 | js = json.loads(str(s)) # js是一个list 22 | if js.has_key('pcp'): 23 | item1['price'] = js['pcp'] 24 | else: 25 | item1['price'] = js['p'] 26 | return item1 27 | 28 | def parse_getCommentnum(self, response): 29 | item1 = response.meta['item'] 30 | # response.body是一个json格式的 31 | js = json.loads(str(response.body)) 32 | item1['score1count'] = js['CommentsCount'][0]['Score1Count'] 33 | item1['score2count'] = js['CommentsCount'][0]['Score2Count'] 34 | item1['score3count'] = js['CommentsCount'][0]['Score3Count'] 35 | item1['score4count'] = js['CommentsCount'][0]['Score4Count'] 36 | item1['score5count'] = js['CommentsCount'][0]['Score5Count'] 37 | item1['comment_num'] = js['CommentsCount'][0]['CommentCount'] 38 | num = item1['ID'] # 获得商品ID 39 | s1 = str(num) 40 | url = "http://pm.3.cn/prices/pcpmgets?callback=jQuery&skuids=" + s1[3:-2] + "&origin=2" 41 | yield scrapy.Request(url, meta={'item': item1}, callback=self.parse_price) 42 | 43 | def parse_detail(self, response): 44 | item1 = response.meta['item'] 45 | sel = Selector(response) 46 | 47 | temp = response.body.split('commentVersion:') 48 | pattern = re.compile("[\'](\d+)[\']") 49 | if len(temp) < 2: 50 | item1['commentVersion'] = -1 51 | else: 52 | match = pattern.match(temp[1][:10]) 53 | item1['commentVersion'] = match.group() 54 | 55 | url = "http://club.jd.com/clubservice.aspx?method=GetCommentsCount&referenceIds=" + str(item1['ID'][0]) 56 | yield scrapy.Request(url, meta={'item': item1}, callback=self.parse_getCommentnum) 57 | 58 | def parse(self, response): # 解析搜索页 59 | sel = Selector(response) # Xpath选择器 60 | goods = sel.xpath('//li[@class="gl-item"]') 61 | for good in goods: 62 | item1 = goodsItem() 63 | item1['ID'] = good.xpath('./div/@data-sku').extract() 64 | item1['name'] = good.xpath('./div/div[@class="p-name"]/a/em/text()').extract() 65 | item1['shop_name'] = good.xpath('./div/div[@class="p-shop"]/@data-shop_name').extract() 66 | item1['link'] = good.xpath('./div/div[@class="p-img"]/a/@href').extract() 67 | url = "http:" + item1['link'][0] + "#comments-list" 68 | yield scrapy.Request(url, meta={'item': item1}, callback=self.parse_detail) 69 | -------------------------------------------------------------------------------- 
/jd_spider/spiders/jd_home.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoquantou/jd_spider/e2863c5d5144e501b931450939b89272498b3a4b/jd_spider/spiders/jd_home.pyc -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = jd_spider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = jd_spider 12 | -------------------------------------------------------------------------------- /test.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xiaoquantou/jd_spider/e2863c5d5144e501b931450939b89272498b3a4b/test.xlsx --------------------------------------------------------------------------------