├── .idea
│   ├── inspectionProfiles
│   │   ├── Project_Default.xml
│   │   └── profiles_settings.xml
│   ├── jingdong.iml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   └── workspace.xml
├── README.md
├── good.png
├── goods.xls
├── jd_comment.sql
├── jd_goods.sql
├── jd_spider
│   ├── __init__.py
│   ├── __init__.pyc
│   ├── items.py
│   ├── items.pyc
│   ├── middlewares.py
│   ├── middlewares.pyc
│   ├── pipelines.py
│   ├── pipelines.pyc
│   ├── settings.py
│   ├── settings.pyc
│   └── spiders
│       ├── __init__.py
│       ├── __init__.pyc
│       ├── jd_comment.py
│       ├── jd_comment.pyc
│       ├── jd_home.py
│       └── jd_home.pyc
├── scrapy.cfg
└── test.xlsx
/.idea/ (PyCharm IDE project configuration: inspectionProfiles, jingdong.iml, misc.xml, modules.xml, vcs.xml, workspace.xml; file contents were not captured in this dump)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # jd_spider
2 |
3 | A JD.com crawler written with the Scrapy framework; it can scrape JD product information and product comments.
4 |
5 | # 1. Goals
6 |
7 | 1. Scrape JD product information (electronic cigarettes are used as the example category).
8 | 2. Scrape the comments on each product.
9 |
10 | # 2. The scraped data attributes
11 |
12 | ## Product data
13 |
14 | 
15 |
16 | ## Comment data
17 |
18 | 
19 |
20 | # 3. Usage
21 | ## (1) Product scraping and comment scraping are implemented as two separate spiders
22 |
23 | The product-scraping code is in jd_home.py; in settings.py, set ITEM_PIPELINES to use MySQLPipeline.
24 | The comment-scraping code is in jd_comment.py; in settings.py, set ITEM_PIPELINES to use CommentPipeline.
25 |
26 | ## (2) The settings.py file
27 | Proxy IPs are enabled by default. Because free proxy IPs have short lifetimes, update the entries in PROXIES regularly; free proxies can be found at http://www.xicidaili.com/.
28 |
29 | If you do not want to use proxy IPs, comment out the DOWNLOADER_MIDDLEWARES block.
30 |
31 | Database configuration:
32 |
33 | settings.py configures the database host, port, username, password, and database name.
34 | pipelines.py sets the target table name inside its SQL statements.
35 | Database table schemas:
36 | jd_comment.sql: comment data
37 | jd_goods.sql: product data
38 |
39 | The comment spider reads a goods.xls file, so scrape the product information first and export the relevant fields into goods.xls (a sample goods.xls is provided as a format reference).
40 | goods.xls format: column 1: product ID; column 2: product comment count; column 3: the product's commentVersion.
41 | Within one project, product scraping and comment scraping cannot run at the same time.
42 |
43 | More crawler details are covered in my blog posts:
44 | http://blog.csdn.net/xiaoquantouer/article/details/51840332
45 |
46 | http://blog.csdn.net/xiaoquantouer/article/details/51841016
47 |
48 | ## Questions are welcome; leave a comment
49 |
--------------------------------------------------------------------------------
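A minimal run sketch, assuming Scrapy and MySQLdb are installed and the database settings in settings.py point at a live MySQL server (the spider names "jd" and "comment" come from jd_home.py and jd_comment.py):

    # 1. scrape product info first (MySQLPipeline enabled in ITEM_PIPELINES)
    scrapy crawl jd
    # 2. export product ID, comment count and commentVersion from jd_goods into goods.xls
    # 3. switch ITEM_PIPELINES to CommentPipeline, then scrape comments
    scrapy crawl comment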
/good.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoquantou/jd_spider/e2863c5d5144e501b931450939b89272498b3a4b/good.png
--------------------------------------------------------------------------------
/goods.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoquantou/jd_spider/e2863c5d5144e501b931450939b89272498b3a4b/goods.xls
--------------------------------------------------------------------------------
/jd_comment.sql:
--------------------------------------------------------------------------------
1 | /*
2 | Navicat MySQL Data Transfer
3 |
4 | Source Server : test
5 | Source Server Version : 50621
6 | Source Host : 127.0.0.1:3306
7 | Source Database : test
8 |
9 | Target Server Type : MYSQL
10 | Target Server Version : 50621
11 | File Encoding : 65001
12 |
13 | Date: 2016-09-20 19:05:04
14 | */
15 |
16 | SET FOREIGN_KEY_CHECKS=0;
17 |
18 | -- ----------------------------
19 | -- Table structure for jd_comment
20 | -- ----------------------------
21 | DROP TABLE IF EXISTS `jd_comment`;
22 | CREATE TABLE `jd_comment` (
23 | `user_name` varchar(255) DEFAULT NULL,
24 | `user_ID` varchar(255) DEFAULT NULL,
25 | `userProvince` varchar(255) DEFAULT NULL,
26 | `content` varchar(255) DEFAULT NULL,
27 | `good_ID` varchar(255) DEFAULT NULL,
28 | `good_name` varchar(255) DEFAULT NULL,
29 | `date` varchar(255) DEFAULT NULL,
30 | `replyCount` varchar(255) DEFAULT NULL,
31 | `score` varchar(255) DEFAULT NULL,
32 | `status` varchar(255) DEFAULT NULL,
33 | `title` varchar(255) DEFAULT NULL,
34 | `userRegisterTime` varchar(255) DEFAULT NULL,
35 | `productColor` varchar(255) DEFAULT NULL,
36 | `productSize` varchar(255) DEFAULT NULL,
37 | `userLevelName` varchar(255) DEFAULT NULL,
38 | `isMobile` varchar(255) DEFAULT NULL,
39 | `days` varchar(255) DEFAULT NULL,
40 | `tags` varchar(255) DEFAULT NULL
41 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
42 |
43 | -- ----------------------------
44 | -- Records of jd_comment
45 | -- ----------------------------
46 |
--------------------------------------------------------------------------------
/jd_goods.sql:
--------------------------------------------------------------------------------
1 | /*
2 | Navicat MySQL Data Transfer
3 |
4 | Source Server : test
5 | Source Server Version : 50621
6 | Source Host : 127.0.0.1:3306
7 | Source Database : test
8 |
9 | Target Server Type : MYSQL
10 | Target Server Version : 50621
11 | File Encoding : 65001
12 |
13 | Date: 2016-09-20 19:05:13
14 | */
15 |
16 | SET FOREIGN_KEY_CHECKS=0;
17 |
18 | -- ----------------------------
19 | -- Table structure for jd_goods
20 | -- ----------------------------
21 | DROP TABLE IF EXISTS `jd_goods`;
22 | CREATE TABLE `jd_goods` (
23 | `ID` varchar(255) DEFAULT NULL,
24 | `name` varchar(255) DEFAULT NULL,
25 | `comment_num` varchar(255) DEFAULT NULL,
26 | `shop_name` varchar(255) DEFAULT NULL,
27 | `link` varchar(255) DEFAULT NULL,
28 | `commentVersion` varchar(255) DEFAULT NULL,
29 | `score1count` varchar(255) DEFAULT NULL,
30 | `score2count` varchar(255) DEFAULT NULL,
31 | `score3count` varchar(255) DEFAULT NULL,
32 | `score4count` varchar(255) DEFAULT NULL,
33 | `score5count` varchar(255) DEFAULT NULL,
34 | `price` varchar(255) DEFAULT NULL
35 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
36 |
37 | -- ----------------------------
38 | -- Records of jd_goods
39 | -- ----------------------------
40 |
--------------------------------------------------------------------------------
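The two dump files above can be loaded with the mysql command-line client; a sketch assuming the `test` database named in the dump headers:

    mysql -u root -p test < jd_goods.sql
    mysql -u root -p test < jd_comment.sql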
/jd_spider/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoquantou/jd_spider/e2863c5d5144e501b931450939b89272498b3a4b/jd_spider/__init__.py
--------------------------------------------------------------------------------
/jd_spider/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoquantou/jd_spider/e2863c5d5144e501b931450939b89272498b3a4b/jd_spider/__init__.pyc
--------------------------------------------------------------------------------
/jd_spider/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 | from scrapy import Item, Field
10 |
11 |
12 | class JdSpiderItem(scrapy.Item):
13 |     # define the fields for your item here like:
14 |     # name = scrapy.Field()
15 |     pass
16 |
17 |
18 | class goodsItem(Item):
19 |     link = Field()            # product link
20 |     ID = Field()              # product ID
21 |     name = Field()            # product name
22 |     comment_num = Field()     # number of comments
23 |     shop_name = Field()       # shop name
24 |     price = Field()           # price
25 |     commentVersion = Field()  # needed to build the comment-feed URL
26 |     score1count = Field()     # number of 1-star ratings
27 |     score2count = Field()     # number of 2-star ratings
28 |     score3count = Field()     # number of 3-star ratings
29 |     score4count = Field()     # number of 4-star ratings
30 |     score5count = Field()     # number of 5-star ratings
31 |
32 |
33 | class commentItem(Item):
34 |     user_name = Field()        # commenter's nickname
35 |     user_ID = Field()          # commenter's ID
36 |     userProvince = Field()     # commenter's province
37 |     content = Field()          # comment text
38 |     good_ID = Field()          # ID of the reviewed product
39 |     good_name = Field()        # name of the reviewed product
40 |     date = Field()             # comment date
41 |     replyCount = Field()       # number of replies
42 |     score = Field()            # rating score
43 |     status = Field()           # status
44 |     title = Field()            # comment title
45 |     userLevelId = Field()      # user level ID
46 |     userRegisterTime = Field() # user registration time
47 |     productColor = Field()     # product color
48 |     productSize = Field()      # product size
49 |     userLevelName = Field()    # e.g. silver member, diamond member
50 |     userClientShow = Field()   # client source, e.g. the JD mobile app
51 |     isMobile = Field()         # whether posted from a mobile device
52 |     days = Field()             # days since purchase
53 |     commentTags = Field()      # comment tags
54 |
--------------------------------------------------------------------------------
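For orientation, Scrapy Items behave like dicts restricted to their declared Fields; a small sketch with hypothetical values showing how the spiders fill goodsItem:

    from jd_spider.items import goodsItem

    item = goodsItem()
    item['ID'] = ['2688893']   # XPath .extract() returns a list, hence item['ID'][0] in the pipeline
    item['price'] = '599.00'
    # item['foo'] = 'x'        # would raise KeyError: 'foo' is not a declared Field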
/jd_spider/items.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoquantou/jd_spider/e2863c5d5144e501b931450939b89272498b3a4b/jd_spider/items.pyc
--------------------------------------------------------------------------------
/jd_spider/middlewares.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | __author__ = 'jiangqiaowei'
3 | import random
4 | import base64
5 | from settings import PROXIES
6 |
7 |
8 | # Rotates the User-Agent header; the USER_AGENTS list is configured in settings.py
9 | class RandomUserAgent(object):
10 |     """Randomly rotate user agents based on a list of predefined ones"""
11 |
12 |     def __init__(self, agents):
13 |         self.agents = agents
14 |
15 |     @classmethod
16 |     def from_crawler(cls, crawler):
17 |         return cls(crawler.settings.getlist('USER_AGENTS'))
18 |
19 |     def process_request(self, request, spider):
20 |         # print "**************************" + random.choice(self.agents)
21 |         request.headers.setdefault('User-Agent', random.choice(self.agents))
22 |
23 |
24 | # Rotates proxies; the PROXIES list is also configured in settings.py
25 | class ProxyMiddleware(object):
26 |     def process_request(self, request, spider):
27 |         proxy = random.choice(PROXIES)
28 |         request.meta['proxy'] = "http://%s" % proxy['ip_port']
29 |         if proxy['user_pass']:  # '' marks an open proxy, so only send credentials when present
30 |             # b64encode avoids the trailing newline that base64.encodestring appends
31 |             encoded_user_pass = base64.b64encode(proxy['user_pass'])
32 |             request.headers['Proxy-Authorization'] = 'Basic ' + encoded_user_pass
33 |             print "**************ProxyMiddleware have pass************" + proxy['ip_port']
34 |         else:
35 |             print "**************ProxyMiddleware no pass************" + proxy['ip_port']
--------------------------------------------------------------------------------
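Note on the PROXIES format: ProxyMiddleware treats 'user_pass' as Basic-auth credentials in "user:password" form and base64-encodes them into the Proxy-Authorization header. A sketch with a hypothetical authenticated proxy next to an open one from settings.py:

    PROXIES = [
        {'ip_port': '1.2.3.4:8080', 'user_pass': 'user:password'},  # hypothetical authenticated proxy
        {'ip_port': '202.108.2.42:80', 'user_pass': ''},            # open proxy, no credentials sent
    ]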
/jd_spider/middlewares.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoquantou/jd_spider/e2863c5d5144e501b931450939b89272498b3a4b/jd_spider/middlewares.pyc
--------------------------------------------------------------------------------
/jd_spider/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import MySQLdb.cursors
8 | from twisted.enterprise import adbapi
9 |
10 | from scrapy.xlib.pydispatch import dispatcher
11 | from scrapy import signals
12 | from scrapy.utils.project import get_project_settings
13 | from scrapy import log
14 |
15 | SETTINGS = get_project_settings()
16 |
17 |
18 | class MySQLPipeline(object):
19 |     @classmethod
20 |     def from_crawler(cls, crawler):
21 |         return cls(crawler.stats)
22 |
23 |     def __init__(self, stats):
24 |         # Instantiate the DB connection pool
25 |         self.dbpool = adbapi.ConnectionPool('MySQLdb',
26 |                                             host=SETTINGS['DB_HOST'],
27 |                                             user=SETTINGS['DB_USER'],
28 |                                             passwd=SETTINGS['DB_PASSWD'],
29 |                                             port=SETTINGS['DB_PORT'],
30 |                                             db=SETTINGS['DB_DB'],
31 |                                             charset='utf8',
32 |                                             use_unicode=True,
33 |                                             cursorclass=MySQLdb.cursors.DictCursor
34 |                                             )
35 |         self.stats = stats
36 |         dispatcher.connect(self.spider_closed, signals.spider_closed)
37 |
38 |     def spider_closed(self, spider):
39 |         """Cleanup function, called after crawling has finished, to close
40 |         open objects.
41 |         Closes the ConnectionPool."""
42 |         self.dbpool.close()
43 |
44 |     def process_item(self, item, spider):
45 |         query = self.dbpool.runInteraction(self._insert_record, item)
46 |         query.addErrback(self._handle_error)
47 |         return item
48 |
49 |     def _insert_record(self, tx, item):
50 |         ID = item['ID'][0]
51 |         name = item['name'][0]
52 |         comment_num = str(item['comment_num'])
53 |         shop_name = item['shop_name'][0]
54 |         link = item['link'][0]
55 |         commentVersion = str(item['commentVersion'])
56 |         commentVersion = commentVersion[1:-1]  # strip the quote characters captured by the regex
57 |
58 |         score1count = str(item['score1count'])
59 |         score2count = str(item['score2count'])
60 |         score3count = str(item['score3count'])
61 |         score4count = str(item['score4count'])
62 |         score5count = str(item['score5count'])
63 |
64 |         price = str(item['price'])
65 |
66 |         ID = ID.encode('utf-8')
67 |         name = name.encode('utf-8')
68 |         comment_num = comment_num.encode('utf-8')
69 |         shop_name = shop_name.encode('utf-8')
70 |         link = link.encode('utf-8')
71 |         commentVersion = commentVersion.encode('utf-8')
72 |         score1count = score1count.encode('utf-8')
73 |         score2count = score2count.encode('utf-8')
74 |         score3count = score3count.encode('utf-8')
75 |         score4count = score4count.encode('utf-8')
76 |         score5count = score5count.encode('utf-8')
77 |         price = price.encode('utf-8')
78 |
79 |         # Parameterized query: quotes inside scraped values cannot break the SQL this way
80 |         sql = "INSERT INTO jd_goods VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
81 |         tx.execute(sql, (ID, name, comment_num, shop_name, link, commentVersion,
82 |                          score1count, score2count, score3count, score4count,
83 |                          score5count, price))
84 |         print "jd_goods row inserted"
85 |
86 |     def _handle_error(self, e):
87 |         log.err(e)
88 |
89 |
90 | class CommentPipeline(object):
91 |     @classmethod
92 |     def from_crawler(cls, crawler):
93 |         return cls(crawler.stats)
94 |
95 |     def __init__(self, stats):
96 |         # Instantiate the DB connection pool
97 |         self.dbpool = adbapi.ConnectionPool('MySQLdb',
98 |                                             host=SETTINGS['DB_HOST'],
99 |                                             user=SETTINGS['DB_USER'],
100 |                                             passwd=SETTINGS['DB_PASSWD'],
101 |                                             port=SETTINGS['DB_PORT'],
102 |                                             db=SETTINGS['DB_DB'],
103 |                                             charset='utf8',
104 |                                             use_unicode=True,
105 |                                             cursorclass=MySQLdb.cursors.DictCursor
106 |                                             )
107 |         self.stats = stats
108 |         dispatcher.connect(self.spider_closed, signals.spider_closed)
109 |
110 |     def spider_closed(self, spider):
111 |         """Cleanup function, called after crawling has finished, to close
112 |         open objects.
113 |         Closes the ConnectionPool."""
114 |         self.dbpool.close()
115 |
116 |     def process_item(self, item, spider):
117 |         query = self.dbpool.runInteraction(self._insert_record, item)
118 |         query.addErrback(self._handle_error)
119 |         return item
120 |
121 |     def _insert_record(self, tx, item):
122 |         user_name = item['user_name']
123 |         user_ID = item['user_ID']
124 |         userProvince = item['userProvince']
125 |         content = item['content']
126 |         good_ID = item['good_ID']
127 |         good_name = item['good_name']
128 |         date = item['date']
129 |         replyCount = item['replyCount']
130 |         score = item['score']
131 |         status = item['status']
132 |         title = item['title']
133 |         userRegisterTime = item['userRegisterTime']
134 |         productColor = item['productColor']
135 |         productSize = item['productSize']
136 |         userLevelName = item['userLevelName']
137 |         isMobile = item['isMobile']
138 |         days = item['days']
139 |         tags = item['commentTags']
140 |
141 |         # Parameterized query: comment text often contains quotes, which would
142 |         # break a string-interpolated INSERT
143 |         sql = "INSERT INTO jd_comment VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
144 |         tx.execute(sql, (user_name, user_ID, userProvince, content, good_ID,
145 |                          good_name, date, replyCount, score, status, title,
146 |                          userRegisterTime, productColor, productSize,
147 |                          userLevelName, isMobile, days, tags))
148 |         print "jd_comment row inserted"
149 |
150 |     def _handle_error(self, e):
151 |         log.err(e)
--------------------------------------------------------------------------------
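Before a long crawl it is worth confirming that the DB_* settings actually reach MySQL; a quick check sketch using the same MySQLdb driver (substitute your own password):

    import MySQLdb

    conn = MySQLdb.connect(host='localhost', port=3306, user='root',
                           passwd='your_password', db='test', charset='utf8')
    cur = conn.cursor()
    cur.execute("SELECT 1")   # raises on connection or auth problems
    conn.close()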
/jd_spider/pipelines.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoquantou/jd_spider/e2863c5d5144e501b931450939b89272498b3a4b/jd_spider/pipelines.pyc
--------------------------------------------------------------------------------
/jd_spider/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for jd_spider project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'jd_spider'
13 |
14 | SPIDER_MODULES = ['jd_spider.spiders']
15 | NEWSPIDER_MODULE = 'jd_spider.spiders'
16 |
17 | USER_AGENTS = [
18 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
19 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
20 | "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
21 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
22 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
23 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
24 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
25 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
26 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
27 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
28 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
29 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
30 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
31 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
32 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
33 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
34 | ]
35 |
36 | # Proxy IPs used for crawling. Because free proxies have limited lifetimes, update the IPs below regularly; free proxies are listed at http://www.xicidaili.com/
37 | PROXIES = [
38 | {'ip_port': '202.108.2.42:80', 'user_pass': ''},
39 | {'ip_port': '122.96.59.104:80', 'user_pass': ''},
40 | {'ip_port': '120.76.243.40:80', 'user_pass': ''},
41 | {'ip_port': '139.196.108.68:80', 'user_pass': ''},
42 | {'ip_port': '60.194.100.51:80', 'user_pass': ''},
43 | {'ip_port': '202.171.253.72:80', 'user_pass': ''},
44 | {'ip_port': '123.56.74.13:8080', 'user_pass': ''},
45 | ]
46 |
47 | COOKIES_ENABLED = False
48 |
49 |
50 | # If you do not want to use proxy IPs, comment out the DOWNLOADER_MIDDLEWARES block below
51 | DOWNLOADER_MIDDLEWARES = {
52 |     'jd_spider.middlewares.RandomUserAgent': 1,
53 |     'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 110,
54 |     'jd_spider.middlewares.ProxyMiddleware': 100,
55 | }
56 |
57 | DOWNLOAD_DELAY = 7  # download delay in seconds
58 |
59 | LOG_LEVEL = 'INFO'
60 |
61 | # Database configuration: replace the values below with your own
62 | DB_HOST = 'localhost'      # host
63 | DB_PORT = 3306             # port
64 | DB_USER = 'root'           # user name
65 | DB_PASSWD = 'xiaoquantou'  # password
66 | DB_DB = 'test'             # database name
67 |
68 | ITEM_PIPELINES = {
69 |     'jd_spider.pipelines.MySQLPipeline': 300,     # enable when scraping product info
70 |     # 'jd_spider.pipelines.CommentPipeline': 300, # enable when scraping comments
71 | }
72 |
--------------------------------------------------------------------------------
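Per the README, switching from the product crawl to the comment crawl only requires flipping the ITEM_PIPELINES block, for example:

    ITEM_PIPELINES = {
        # 'jd_spider.pipelines.MySQLPipeline': 300,  # product crawl
        'jd_spider.pipelines.CommentPipeline': 300,  # comment crawl
    }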
/jd_spider/settings.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoquantou/jd_spider/e2863c5d5144e501b931450939b89272498b3a4b/jd_spider/settings.pyc
--------------------------------------------------------------------------------
/jd_spider/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/jd_spider/spiders/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoquantou/jd_spider/e2863c5d5144e501b931450939b89272498b3a4b/jd_spider/spiders/__init__.pyc
--------------------------------------------------------------------------------
/jd_spider/spiders/jd_comment.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from scrapy.spiders import Spider
3 | from jd_spider.items import commentItem
4 | import json
5 | import xlrd
6 |
7 |
8 | class comment_spider(Spider):
9 |     name = "comment"
10 |     xlrd.Book.encoding = "utf-8"
11 |     data = xlrd.open_workbook("goods.xls")
12 |     # goods.xls lists the products whose comments will be scraped: column 1: product ID;
13 |     # column 2: comment count; column 3: commentVersion (test.xlsx can also be used)
14 |     table = data.sheets()[0]
15 |     nrows = table.nrows  # number of rows
16 |     ncols = table.ncols  # number of columns
17 |     good_id = table.col_values(0)    # product IDs
18 |     comment_n = table.col_values(1)  # comment counts
19 |     comment_V = table.col_values(2)  # commentVersion values
20 |
21 |     start_urls = []
22 |     for i in range(len(good_id)):  # build the URL list product by product
23 |         good_num = int(good_id[i])
24 |         comment_total = int(comment_n[i])
25 |         if comment_total % 10 == 0:  # 10 comments per page, so compute the page count
26 |             page = comment_total / 10
27 |         else:
28 |             page = comment_total / 10 + 1
29 |         for k in range(0, page):
30 |             url = "http://sclub.jd.com/productpage/p-" + str(good_num) + "-s-0-t-3-p-" + str(k) \
31 |                   + ".html?callback=fetchJSON_comment98vv" + str(comment_V[i])
32 |             start_urls.append(url)
33 |
34 |     def parse(self, response):
35 |         temp1 = response.body.split('productAttr')
36 |         raw = '{"productAttr' + temp1[1][:-2]  # strip the JSONP wrapper
37 |         raw = raw.decode("gbk").encode("utf-8")
38 |         js = json.loads(unicode(raw, "utf-8"))
39 |         comments = js['comments']  # all comments on this page
40 |
41 |         items = []
42 |         for comment in comments:
43 |             item1 = commentItem()
44 |             item1['user_name'] = comment['nickname']
45 |             item1['user_ID'] = comment['id']
46 |             item1['userProvince'] = comment['userProvince']
47 |             item1['content'] = comment['content']
48 |             item1['good_ID'] = comment['referenceId']
49 |             item1['good_name'] = comment['referenceName']
50 |             item1['date'] = comment['referenceTime']
51 |             item1['replyCount'] = comment['replyCount']
52 |             item1['score'] = comment['score']
53 |             item1['status'] = comment['status']
54 |             title = ""
55 |             if comment.has_key('title'):
56 |                 title = comment['title']  # fall back to "" when the field is missing
57 |             item1['title'] = title
58 |             item1['userRegisterTime'] = comment['userRegisterTime']
59 |             item1['productColor'] = comment['productColor']
60 |             item1['productSize'] = comment['productSize']
61 |             item1['userLevelName'] = comment['userLevelName']
62 |             item1['isMobile'] = comment['isMobile']
63 |             item1['days'] = comment['days']
64 |             tags = ""
65 |             if comment.has_key('commentTags'):
66 |                 for i in comment['commentTags']:
67 |                     tags = tags + i['name'] + " "
68 |             item1['commentTags'] = tags
69 |             items.append(item1)
70 |         return items
--------------------------------------------------------------------------------
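The README asks for goods.xls to be produced from the scraped jd_goods table before running this spider. One possible export sketch using xlwt (xlwt is not part of this repo; the connection values mirror settings.py, substitute your own password):

    # coding=utf-8
    import MySQLdb
    import xlwt

    conn = MySQLdb.connect(host='localhost', port=3306, user='root',
                           passwd='your_password', db='test', charset='utf8')
    cur = conn.cursor()
    cur.execute("SELECT ID, comment_num, commentVersion FROM jd_goods")
    book = xlwt.Workbook()
    sheet = book.add_sheet('goods')
    for row, (good_id, num, version) in enumerate(cur.fetchall()):
        sheet.write(row, 0, good_id)   # column 1: product ID
        sheet.write(row, 1, num)       # column 2: comment count
        sheet.write(row, 2, version)   # column 3: commentVersion
    book.save('goods.xls')
    conn.close()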
/jd_spider/spiders/jd_comment.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoquantou/jd_spider/e2863c5d5144e501b931450939b89272498b3a4b/jd_spider/spiders/jd_comment.pyc
--------------------------------------------------------------------------------
/jd_spider/spiders/jd_home.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from scrapy.spiders import Spider
3 | from jd_spider.items import goodsItem
4 | from scrapy.selector import Selector
5 | import scrapy
6 | import re
7 | import json
8 |
9 |
10 | class jd_spider(Spider):
11 |     name = "jd"
12 |     start_urls = []
13 |     for i in range(1, 11):  # set the page range yourself; currently only the first 10 pages of the e-cigarette category are crawled
14 |         url = 'http://list.jd.com/list.html?cat=1672,2599,1440&ev=111217_635585&page=' + str(i)
15 |         start_urls.append(url)
16 |
17 |     def parse_price(self, response):
18 |         item1 = response.meta['item']
19 |         temp1 = response.body.split('jQuery([')
20 |         s = temp1[1][:-4]  # extract the JSON payload from the JSONP wrapper
21 |         js = json.loads(str(s))  # js is a dict holding the price fields
22 |         if js.has_key('pcp'):
23 |             item1['price'] = js['pcp']
24 |         else:
25 |             item1['price'] = js['p']
26 |         return item1
27 |
28 |     def parse_getCommentnum(self, response):
29 |         item1 = response.meta['item']
30 |         # response.body is a JSON document
31 |         js = json.loads(str(response.body))
32 |         item1['score1count'] = js['CommentsCount'][0]['Score1Count']
33 |         item1['score2count'] = js['CommentsCount'][0]['Score2Count']
34 |         item1['score3count'] = js['CommentsCount'][0]['Score3Count']
35 |         item1['score4count'] = js['CommentsCount'][0]['Score4Count']
36 |         item1['score5count'] = js['CommentsCount'][0]['Score5Count']
37 |         item1['comment_num'] = js['CommentsCount'][0]['CommentCount']
38 |         num = item1['ID']  # the product ID
39 |         s1 = str(num)
40 |         url = "http://pm.3.cn/prices/pcpmgets?callback=jQuery&skuids=" + s1[3:-2] + "&origin=2"
41 |         yield scrapy.Request(url, meta={'item': item1}, callback=self.parse_price)
42 |
43 |     def parse_detail(self, response):
44 |         item1 = response.meta['item']
45 |         sel = Selector(response)
46 |
47 |         temp = response.body.split('commentVersion:')
48 |         pattern = re.compile("[\'](\d+)[\']")
49 |         if len(temp) < 2:  # the page carries no commentVersion
50 |             item1['commentVersion'] = -1
51 |         else:
52 |             match = pattern.match(temp[1][:10])
53 |             item1['commentVersion'] = match.group()
54 |
55 |         url = "http://club.jd.com/clubservice.aspx?method=GetCommentsCount&referenceIds=" + str(item1['ID'][0])
56 |         yield scrapy.Request(url, meta={'item': item1}, callback=self.parse_getCommentnum)
57 |
58 |     def parse(self, response):  # parse the category listing page
59 |         sel = Selector(response)  # XPath selector
60 |         goods = sel.xpath('//li[@class="gl-item"]')
61 |         for good in goods:
62 |             item1 = goodsItem()
63 |             item1['ID'] = good.xpath('./div/@data-sku').extract()
64 |             item1['name'] = good.xpath('./div/div[@class="p-name"]/a/em/text()').extract()
65 |             item1['shop_name'] = good.xpath('./div/div[@class="p-shop"]/@data-shop_name').extract()
66 |             item1['link'] = good.xpath('./div/div[@class="p-img"]/a/@href').extract()
67 |             url = "http:" + item1['link'][0] + "#comments-list"
68 |             yield scrapy.Request(url, meta={'item': item1}, callback=self.parse_detail)
69 |
--------------------------------------------------------------------------------
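The string slicing in parse_price is easiest to follow on a concrete payload; a sketch with a hypothetical response body of the shape the price endpoint returns:

    import json

    body = 'jQuery([{"id":"J_2688893","p":"599.00"}]);\n'
    payload = body.split('jQuery([')[1][:-4]  # drops the trailing ']);' and newline
    js = json.loads(payload)                  # {'id': 'J_2688893', 'p': '599.00'}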
/jd_spider/spiders/jd_home.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoquantou/jd_spider/e2863c5d5144e501b931450939b89272498b3a4b/jd_spider/spiders/jd_home.pyc
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = jd_spider.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = jd_spider
12 |
--------------------------------------------------------------------------------
/test.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xiaoquantou/jd_spider/e2863c5d5144e501b931450939b89272498b3a4b/test.xlsx
--------------------------------------------------------------------------------