├── .idea
│   ├── RealSpider.iml
│   ├── misc.xml
│   ├── modules.xml
│   └── vcs.xml
├── README.md
├── spider
│   └── realSpider
│       ├── Books
│       ├── Resources
│       ├── realSpider
│       │   ├── __init__.py
│       │   ├── items.py
│       │   ├── pipelines.py
│       │   ├── rotate_useragent.py
│       │   ├── settings.py
│       │   └── spiders
│       │       ├── CSDNBlogCrawlSpider.py
│       │       ├── CSDNBlogSpider.py
│       │       ├── DomzSpider.py
│       │       ├── W3schoolSpider.py
│       │       ├── __init__.py
│       │       └── doubanSpider.py
│       ├── scrapy.cfg
│       ├── w3school_data_utf8.json
│       └── xml
└── start.txt
/.idea/RealSpider.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # RealSpider
2 | My graduation project crawler, written in Python with Scrapy.
3 |
4 |
5 | #2016-5-10 13:56:51
6 | #Decided to use this repo for demos
7 | #The real project will be written in Rebusole
8 |
9 |
10 | #2016-5-11 14:34:22
11 | #Still like this name; once I've figured things out, I'll switch back to it.
12 |
--------------------------------------------------------------------------------
/spider/realSpider/Books:
--------------------------------------------------------------------------------
(HTML page saved from DMOZ; the markup and most of the text were not captured in this dump. Recoverable page title: "DMOZ - Computers: Programming: Languages: Python: Books"; footer: Copyright © 1998-2016 AOL Inc., Terms of Use.)
--------------------------------------------------------------------------------
/spider/realSpider/Resources:
--------------------------------------------------------------------------------
(HTML page saved from DMOZ; the markup and most of the text were not captured in this dump. Recoverable page title: "DMOZ - Computers: Programming: Languages: Python: Resources"; footer: Copyright © 1998-2016 AOL Inc., Terms of Use.)
--------------------------------------------------------------------------------
/spider/realSpider/realSpider/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RealSanqian/RealSpider/1471f2227021380ae707bbd9aa908a2de41183d6/spider/realSpider/realSpider/__init__.py
--------------------------------------------------------------------------------
/spider/realSpider/realSpider/items.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 | import scrapy
8 |
9 |
10 | class DmozItem(scrapy.Item):
11 | name = scrapy.Field() #site name (fields below match what DomzSpider fills in)
12 | url = scrapy.Field() #site URL
13 | description = scrapy.Field() #site description
14 |
15 | class DoubanItem(scrapy.Item):
16 | # define the fields for your item here like:
17 | # name = scrapy.Field()
18 | name = scrapy.Field() #movie title
19 | description = scrapy.Field() #movie description
20 | url = scrapy.Field() #crawled URL
21 |
22 | class NewsItem(scrapy.Item):
23 | name = scrapy.Field() #site name
24 | title = scrapy.Field() #news headline
25 | detail = scrapy.Field() #article body
26 | url = scrapy.Field() #site URL
27 | link = scrapy.Field() #URL fetched by the crawler
28 | time = scrapy.Field() #time of the crawl
29 | vn = scrapy.Field() #number of times crawled
30 |
31 | class W3schoolItem(scrapy.Item):
32 | title = scrapy.Field()
33 | link = scrapy.Field()
34 | desc = scrapy.Field()
35 |
36 |
--------------------------------------------------------------------------------
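
Both CSDN spiders further down import item classes from this module (CSDNBlogItem and CsdnblogcrawlspiderItem) that items.py does not define, so those imports fail as the file stands. A minimal sketch of definitions that would satisfy them, with field names taken only from what the spiders actually assign:

class CSDNBlogItem(scrapy.Item):
    article_name = scrapy.Field()  # article title, filled by CSDNBlogSpider
    article_url = scrapy.Field()   # article URL

class CsdnblogcrawlspiderItem(scrapy.Item):
    blog_name = scrapy.Field()     # article title, filled by CSDNBlogCrawlSpider
    blog_url = scrapy.Field()      # article URL
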
/spider/realSpider/realSpider/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import json
8 | import codecs
9 |
10 | class RealspiderPipeline(object):
11 | def process_item(self, item, spider):
12 | return item
13 |
14 | class W3SchoolPipeline(object):
15 | def __init__(self):
16 | self.file = codecs.open('w3school_data_utf8.json', 'wb', encoding='utf-8')
17 |
18 | def process_item(self, item, spider):
19 | line = json.dumps(dict(item)) + '\n'
20 | # print line
21 | self.file.write(line.decode("unicode_escape"))
22 | return item
--------------------------------------------------------------------------------
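
W3SchoolPipeline relies on a Python 2 trick: json.dumps escapes non-ASCII characters, and decoding the result with unicode_escape turns them back into readable text before writing. A simpler route is to ask json.dumps for unescaped output directly; a minimal sketch for this project, using a hypothetical class name that is not part of the original file:

# -*- coding: utf-8 -*-
import json
import codecs

class W3SchoolJsonPipeline(object):
    """Writes one JSON object per line, keeping non-ASCII text readable."""

    def open_spider(self, spider):
        self.file = codecs.open('w3school_data_utf8.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # ensure_ascii=False leaves non-ASCII characters unescaped,
        # so no unicode_escape round trip is needed
        self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.file.close()

To enable it, its dotted path would replace (or sit alongside) W3SchoolPipeline in the ITEM_PIPELINES setting shown in settings.py.
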
/spider/realSpider/realSpider/rotate_useragent.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 |
3 | from scrapy import log
4 |
5 | """One anti-ban strategy: rotate through a pool of user agents.
6 |
7 | Note: the corresponding entries must be configured in settings.py.
8 | """
9 |
10 | import random
11 | from scrapy.contrib.downloadermiddleware.useragent import UserAgentMiddleware
12 |
13 | class RotateUserAgentMiddleware(UserAgentMiddleware):
14 |
15 | def __init__(self, user_agent=''):
16 | self.user_agent = user_agent
17 |
18 | def process_request(self, request, spider):
19 | ua = random.choice(self.user_agent_list)
20 | if ua:
21 | #show the user agent currently in use
22 | print "********Current UserAgent:%s************" %ua
23 |
24 | #log it
25 | log.msg('Current UserAgent: '+ua, level=1)
26 | request.headers.setdefault('User-Agent', ua)
27 |
28 | #the default user_agent_list is a set of desktop Chrome user-agent strings
29 | #more user-agent strings can be found at http://www.useragentstring.com/pages/useragentstring.php
30 | user_agent_list = [
31 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
32 | "(KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36",
33 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
34 | "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
35 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
36 | "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
37 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
38 | "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
39 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
40 | "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
41 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
42 | "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
43 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
44 | "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
45 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
46 | "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
47 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
48 | "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
49 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
50 | "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
51 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
52 | "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
53 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
54 | "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
55 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
56 | "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
57 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
58 | "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
59 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
60 | "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
61 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
62 | "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
63 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
64 | "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
65 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
66 | "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
67 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
68 | "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
69 | ]
--------------------------------------------------------------------------------
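
The module docstring above says the middleware needs matching configuration in settings.py. That configuration already exists in this project's settings.py and is reproduced here so the two files can be read together:

# settings.py: disable the stock UserAgentMiddleware and register the rotating one
DOWNLOADER_MIDDLEWARES = {
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
    'realSpider.rotate_useragent.RotateUserAgentMiddleware': 400
}
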
/spider/realSpider/realSpider/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for realSpider project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'realSpider'
13 |
14 | SPIDER_MODULES = ['realSpider.spiders']
15 | NEWSPIDER_MODULE = 'realSpider.spiders'
16 |
17 | #disable cookies to reduce the chance of being banned
18 | COOKIES_ENABLED = False
19 |
20 | ITEM_PIPELINES = {
21 | 'realSpider.pipelines.W3SchoolPipeline': 300,
22 | }
23 |
24 | #drop the default user agent middleware and use the rotating one instead
25 | DOWNLOADER_MIDDLEWARES = {
26 | 'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
27 | 'realSpider.rotate_useragent.RotateUserAgentMiddleware': 400
28 | }
29 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
30 | #USER_AGENT = 'realSpider (+http://www.yourdomain.com)'
31 |
32 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
33 | #CONCURRENT_REQUESTS=32
34 |
35 | # Configure a delay for requests for the same website (default: 0)
36 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
37 | # See also autothrottle settings and docs
38 | #DOWNLOAD_DELAY=3
39 | # The download delay setting will honor only one of:
40 | #CONCURRENT_REQUESTS_PER_DOMAIN=16
41 | #CONCURRENT_REQUESTS_PER_IP=16
42 |
43 | # Disable cookies (enabled by default)
44 | #COOKIES_ENABLED=False
45 |
46 | # Disable Telnet Console (enabled by default)
47 | #TELNETCONSOLE_ENABLED=False
48 |
49 | # Override the default request headers:
50 | #DEFAULT_REQUEST_HEADERS = {
51 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
52 | # 'Accept-Language': 'en',
53 | #}
54 |
55 | # Enable or disable spider middlewares
56 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
57 | #SPIDER_MIDDLEWARES = {
58 | # 'realSpider.middlewares.MyCustomSpiderMiddleware': 543,
59 | #}
60 |
61 | # Enable or disable downloader middlewares
62 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
63 | #DOWNLOADER_MIDDLEWARES = {
64 | # 'realSpider.middlewares.MyCustomDownloaderMiddleware': 543,
65 | #}
66 |
67 | # Enable or disable extensions
68 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
69 | #EXTENSIONS = {
70 | # 'scrapy.telnet.TelnetConsole': None,
71 | #}
72 |
73 | # Configure item pipelines
74 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
75 | #ITEM_PIPELINES = {
76 | # 'realSpider.pipelines.SomePipeline': 300,
77 | #}
78 |
79 | # Enable and configure the AutoThrottle extension (disabled by default)
80 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
81 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay
82 | #AUTOTHROTTLE_ENABLED=True
83 | # The initial download delay
84 | #AUTOTHROTTLE_START_DELAY=5
85 | # The maximum download delay to be set in case of high latencies
86 | #AUTOTHROTTLE_MAX_DELAY=60
87 | # Enable showing throttling stats for every response received:
88 | #AUTOTHROTTLE_DEBUG=False
89 |
90 | # Enable and configure HTTP caching (disabled by default)
91 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
92 | #HTTPCACHE_ENABLED=True
93 | #HTTPCACHE_EXPIRATION_SECS=0
94 | #HTTPCACHE_DIR='httpcache'
95 | #HTTPCACHE_IGNORE_HTTP_CODES=[]
96 | #HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'
97 |
--------------------------------------------------------------------------------
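
The spiders configured above are normally launched with "scrapy crawl <name>" from the directory that holds scrapy.cfg. They can also be launched from a plain Python script; a minimal sketch using Scrapy's CrawlerProcess (the script name is illustrative, not part of the repository):

# run_w3school.py, assumed to sit next to scrapy.cfg
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# picks up the settings above: pipelines, middlewares, COOKIES_ENABLED, ...
process = CrawlerProcess(get_project_settings())
process.crawl('w3school')   # spider name defined in W3schoolSpider
process.start()             # blocks until the crawl finishes
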
/spider/realSpider/realSpider/spiders/CSDNBlogCrawlSpider.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from scrapy.contrib.spiders import CrawlSpider, Rule
4 | from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
5 | from scrapy.selector import Selector
6 | from realSpider.items import CsdnblogcrawlspiderItem
7 |
8 |
9 | class CSDNBlogCrawlSpider(CrawlSpider):
10 |
11 | """A spider derived from CrawlSpider that crawls pages automatically."""
12 |
13 | name = "CSDNBlogCrawlSpider"
14 | #set the download delay
15 | download_delay = 2
16 | allowed_domains = ['blog.csdn.net']
17 | #URL of the first article
18 | start_urls = ['http://blog.csdn.net/u012150179/article/details/11749017']
19 |
20 | #rules, version one: the style from the official docs
21 | #rules = [
22 | # #extract the "next article" link and **follow** it; without the restrict_xpaths
23 | # #restriction, every link on the page that matches allow would be extracted
24 | # Rule(SgmlLinkExtractor(allow=('/u012150179/article/details'),
25 | # restrict_xpaths=('//li[@class="next_article"]')),
26 | # follow=True)
27 | #
28 | # #extract the "next article" link and run the **callback** on it
29 | # #Rule(SgmlLinkExtractor(allow=('/u012150179/article/details')),
30 | # # callback='parse_item',
31 | # # follow=False),
32 | #]
33 |
34 | #rules, version two: the approach I recommend (in my own tests, version one often finished halfway through the crawl with no error reported)
35 | rules = [
36 | Rule(SgmlLinkExtractor(allow=('/u012150179/article/details'),
37 | restrict_xpaths=('//li[@class="next_article"]')),
38 | callback='parse_item',
39 | follow=True)
40 | ]
41 |
42 | def parse_item(self, response):
43 |
44 | #print "parse_item>>>>>>"
45 | item = CsdnblogcrawlspiderItem()
46 | sel = Selector(response)
47 | blog_url = str(response.url)
48 | blog_name = sel.xpath('//div[@id="article_details"]/div/h1/span/a/text()').extract()
49 |
50 | item['blog_name'] = [n.encode('utf-8') for n in blog_name]
51 | item['blog_url'] = blog_url.encode('utf-8')
52 |
53 | yield item
--------------------------------------------------------------------------------
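
SgmlLinkExtractor and the scrapy.contrib import paths used above were deprecated in later Scrapy releases. If the project were moved to a newer Scrapy, the same rule could be written with LinkExtractor; a sketch under that assumption:

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

rules = [
    # follow only the "next article" link and run parse_item on every page reached
    Rule(LinkExtractor(allow=(r'/u012150179/article/details',),
                       restrict_xpaths=('//li[@class="next_article"]',)),
         callback='parse_item',
         follow=True)
]
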
/spider/realSpider/realSpider/spiders/CSDNBlogSpider.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | from scrapy.spider import Spider
3 | from scrapy.http import Request
4 | from scrapy.selector import Selector
5 | from realSpider.items import CSDNBlogItem
6 |
7 | class CSDNBlogSpider(Spider):
8 | """Spider that crawls CSDN blog articles one by one."""
9 |
10 | name = "CSDNBlog"
11 |
12 | #slow the crawl down: 1 s download delay
13 | download_delay = 1
14 | allowed_domains = ["blog.csdn.net"]
15 | start_urls = [
16 |
17 | #URL of the first article
18 | "http://blog.csdn.net/u012150179/article/details/11749017"
19 | ]
20 |
21 | def parse(self, response):
22 | sel = Selector(response)
23 |
24 | #items = []
25 | #get the article URL and title
26 | item = CSDNBlogItem()
27 |
28 | article_url = str(response.url)
29 | article_name = sel.xpath('//div[@id="article_details"]/div/h1/span/a/text()').extract()
30 |
31 | item['article_name'] = [n.encode('utf-8') for n in article_name]
32 | item['article_url'] = article_url.encode('utf-8')
33 |
34 | yield item
35 |
36 | #get the URL of the next article
37 | urls = sel.xpath('//li[@class="next_article"]/a/@href').extract()
38 | for url in urls:
39 | print url
40 | url = "http://blog.csdn.net" + url
41 | print url
42 | yield Request(url, callback=self.parse)
--------------------------------------------------------------------------------
/spider/realSpider/realSpider/spiders/DomzSpider.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 | from scrapy.spider import Spider
3 | from scrapy.selector import Selector
4 | from realSpider.items import DmozItem
5 |
6 |
7 | class DmozSpider(Spider):
8 | name = "dmozaa"
9 | allowed_domains = ["dmoz.org"]
10 | start_urls = [
11 | "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
12 | "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/",
13 | ]
14 |
15 | def parse(self, response):
16 | sel = Selector(response)
17 | sites = sel.xpath('//ul[@class="directory-url"]/li')
18 | items = []
19 |
20 | for site in sites:
21 | item = DmozItem()
22 | item['name'] = site.xpath('a/text()').extract()
23 | item['url'] = site.xpath('a/@href').extract()
24 | item['description'] = site.xpath('text()').re('-\s[^\n]*\\r')
25 | items.append(item)
26 | return items
--------------------------------------------------------------------------------
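
DmozSpider gathers every item into a list before returning it. Scrapy also accepts a generator from parse, which hands items to the pipelines as they are produced; a sketch of the same method written that way (behaviour otherwise unchanged):

def parse(self, response):
    sel = Selector(response)
    # walk the directory entries and yield one item at a time
    for site in sel.xpath('//ul[@class="directory-url"]/li'):
        item = DmozItem()
        item['name'] = site.xpath('a/text()').extract()
        item['url'] = site.xpath('a/@href').extract()
        item['description'] = site.xpath('text()').re('-\s[^\n]*\\r')
        yield item
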
/spider/realSpider/realSpider/spiders/W3schoolSpider.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | from scrapy.spider import Spider
4 | from scrapy.selector import Selector
5 |
6 |
7 | from realSpider.items import W3schoolItem
8 |
9 |
10 | class W3schoolSpider(Spider):
11 | """Crawls w3school tags."""
12 | # log.start("log",loglevel='INFO')
13 | name = "w3school"
14 | allowed_domains = ["w3school.com.cn"]
15 | start_urls = [
16 | "http://www.w3school.com.cn/xml/xml_syntax.asp"
17 | ]
18 |
19 | def parse(self, response):
20 | sel = Selector(response)
21 | sites = sel.xpath('//div[@id="navsecond"]/div[@id="course"]/ul[1]/li')
22 | items = []
23 |
24 | for site in sites:
25 | item = W3schoolItem()
26 | title = site.xpath('a/text()').extract()
27 | link = site.xpath('a/@href').extract()
28 | desc = site.xpath('a/@title').extract()
29 | item['title'] = [t.encode('utf-8') for t in title]
30 | item['link'] = [l.encode('utf-8') for l in link]
31 | item['desc'] = [d.encode('utf-8') for d in desc]
32 | items.append(item)
33 | return items
--------------------------------------------------------------------------------
/spider/realSpider/realSpider/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | import scrapy
6 |
7 | class DmozSpider(scrapy.Spider):
8 | name = "dmoz"
9 | allowed_domains = ["dmoz.org"]
10 | start_urls = [
11 | "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
12 | "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
13 | ]
14 |
15 | def parse(self, response):
16 | filename = response.url.split("/")[-2] + '.html'
17 | with open(filename, 'wb') as f:
18 | f.write(response.body)
19 |
--------------------------------------------------------------------------------
/spider/realSpider/realSpider/spiders/doubanSpider.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 | from scrapy.contrib.spiders import CrawlSpider, Rule
3 | from scrapy.selector import Selector
4 | from realSpider.items import DoubanItem
5 | from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
6 |
7 | class DoubanSpider(CrawlSpider):
8 |
9 | name = "douban"
10 | allowed_domains = ["movie.douban.com"]
11 | start_urls = ["http://movie.douban.com/top250"]
12 | rules = (
13 | #add every URL that matches the regular expression to the crawl queue
14 | Rule(SgmlLinkExtractor(allow = (r'http://movie\.douban\.com/top250\?start=\d+&filter=&type=',))),
15 | #download every URL that matches the regular expression and call the custom callback on the response
16 | Rule(SgmlLinkExtractor(allow = (r'http://movie\.douban\.com/subject/\d+', )), callback = 'parse_page', follow = True),
17 | )
18 | headers = {
19 | "Accept": "*/*",
20 | "Accept-Encoding": "gzip,deflate,sdch",
21 | "Accept-Language": "zh,zh-TW;q=0.8,en;q=0.6,en-US;q=0.4,zh-CN;q=0.2",
22 | "Connection": "keep-alive",
23 | "Content-Type": "text/html; charset=utf-8",
24 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36",
25 | "Referer": "http://www.zhihu.com/"
26 | }
27 |
28 | def parse_page(self, response):
29 | sel = Selector(response)
30 | item = DoubanItem()
31 | item['name'] = sel.xpath('//h1/span[@property="v:itemreviewed"]/text()').extract()
32 | item['description'] = sel.xpath('//div/span[@property="v:summary"]/text()').extract()
33 | item['url'] = response.url
34 | print item['name']
35 | return item
--------------------------------------------------------------------------------
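
The headers dict defined on DoubanSpider is never attached to a request, so Scrapy ignores it; a class attribute named headers has no special meaning. One way to actually send those headers on the initial request is to override start_requests; a minimal sketch of a method that could be added to the class (not in the original file):

from scrapy.http import Request

def start_requests(self):
    # attach the custom headers to the seed request; requests generated by
    # the rules afterwards still use Scrapy's defaults
    for url in self.start_urls:
        yield Request(url, headers=self.headers)
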
/spider/realSpider/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = realSpider.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = realSpider
12 |
--------------------------------------------------------------------------------
/spider/realSpider/w3school_data_utf8.json:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RealSanqian/RealSpider/1471f2227021380ae707bbd9aa908a2de41183d6/spider/realSpider/w3school_data_utf8.json
--------------------------------------------------------------------------------
/spider/realSpider/xml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RealSanqian/RealSpider/1471f2227021380ae707bbd9aa908a2de41183d6/spider/realSpider/xml
--------------------------------------------------------------------------------
/start.txt:
--------------------------------------------------------------------------------
1 | 2016-4-26 00:05:04 First time using GitHub
2 |
--------------------------------------------------------------------------------