├── README.md
└── jianshu
    ├── jianshu
    │   ├── __init__.py
    │   ├── items.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │       ├── __init__.py
    │       └── jianshuSpider.py
    ├── main.py
    └── scrapy.cfg

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
### Scrapy crawler framework practice


Create a Scrapy project:

`scrapy startproject jianshu`

Project layout:

    jianshu/
        scrapy.cfg
        jianshu/
            __init__.py
            items.py
            pipelines.py
            settings.py
            spiders/
                __init__.py
                ...

Run the Scrapy spider:

`scrapy crawl yourspidername`

Alternatively, run the project's main.py directly, which executes the same command:

`cmdline.execute("scrapy crawl jianshu".split())`

--------------------------------------------------------------------------------
/jianshu/jianshu/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field


class JianshuItem(Item):
    # Fields collected for each article on the monthly trending page
    title = Field()
    author = Field()
    url = Field()
    readNum = Field()
    commentNum = Field()
    likeNum = Field()

--------------------------------------------------------------------------------
/jianshu/jianshu/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class JianshuPipeline(object):
    def process_item(self, item, spider):
        return item
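The generated pipeline above is a pass-through. For illustration, a minimal sketch of a pipeline that does real work — discarding items whose title XPath matched nothing — might look like the following. The class name ValidateItemPipeline is hypothetical and not part of this project; it would also need an entry in ITEM_PIPELINES in settings.py to take effect:

    # Hypothetical example, not part of the original project.
    from scrapy.exceptions import DropItem

    class ValidateItemPipeline(object):
        def process_item(self, item, spider):
            # Drop items whose title failed to extract
            if not item.get('title'):
                raise DropItem('missing title: %s' % item.get('url'))
            return item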
--------------------------------------------------------------------------------
/jianshu/jianshu/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for jianshu project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'jianshu'

SPIDER_MODULES = ['jianshu.spiders']
NEWSPIDER_MODULE = 'jianshu.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'jianshu (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'jianshu.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'jianshu.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'jianshu.pipelines.SomePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Export every scraped item to CSV; the feed format name must be lowercase 'csv'.
FEED_URI = u'/Users/apple/Documents/jianshu-monthly.csv'
FEED_FORMAT = 'csv'
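The two FEED_* settings at the bottom make Scrapy write every yielded item to a CSV file (the /Users/apple/... path is specific to the original author's machine). The same export can be requested per run from the command line without editing settings.py:

    scrapy crawl jianshu -o jianshu-monthly.csv -t csv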
--------------------------------------------------------------------------------
/jianshu/jianshu/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/jianshu/jianshu/spiders/jianshuSpider.py:
--------------------------------------------------------------------------------
# coding=utf-8
import urllib

from scrapy.spiders import CrawlSpider
from scrapy.selector import Selector
from scrapy.http import Request

from jianshu.items import JianshuItem


class Jianshu(CrawlSpider):
    name = 'jianshu'
    start_urls = ['http://www.jianshu.com/top/monthly']
    url = 'http://www.jianshu.com'

    def parse(self, response):
        selector = Selector(response)
        articles = selector.xpath('//ul[@class="article-list thumbnails"]/li')

        for article in articles:
            # Build a fresh item per article instead of mutating one shared
            # instance across iterations.
            item = JianshuItem()
            title = article.xpath('div/h4/a/text()').extract()
            url = article.xpath('div/h4/a/@href').extract()
            author = article.xpath('div/p/a/text()').extract()

            # Download each trending article's thumbnail; note that some
            # articles have no image.
            try:
                image = article.xpath('a/img/@src').extract()
                urllib.urlretrieve(image[0], '/Users/apple/Documents/images/%s-%s.jpg' % (author[0], title[0]))
            except Exception:
                print '--no---image--'

            listtop = article.xpath('div/div/a/text()').extract()
            likeNum = article.xpath('div/div/span/text()').extract()

            item['title'] = title
            # The href values are site-relative (they start with "/"), so join
            # them onto the bare domain to avoid producing a double slash.
            item['url'] = self.url + url[0]
            item['author'] = author

            item['readNum'] = listtop[0]
            # Some articles have comments disabled.
            try:
                item['commentNum'] = listtop[1]
            except IndexError:
                item['commentNum'] = ''
            item['likeNum'] = likeNum
            yield item

        # Follow the "load more" button to the next page, if present.
        next_link = selector.xpath('//*[@id="list-container"]/div/button/@data-url').extract()
        if len(next_link) == 1:
            next_link = self.url + str(next_link[0])
            print '----' + next_link
            yield Request(next_link, callback=self.parse)

--------------------------------------------------------------------------------
/jianshu/main.py:
--------------------------------------------------------------------------------
# Convenience entry point: equivalent to running "scrapy crawl jianshu"
# from the project root.
from scrapy import cmdline

cmdline.execute("scrapy crawl jianshu".split())
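main.py shells out to the scrapy CLI. As an aside, a crawl can also be started purely programmatically; a minimal sketch using Scrapy's CrawlerProcess follows (not part of this project, and assumed to be run from the project root so that get_project_settings() picks up scrapy.cfg):

    # Hypothetical alternative to main.py above.
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl('jianshu')  # the spider is looked up by its name attribute
    process.start()           # blocks until the crawl finishes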
--------------------------------------------------------------------------------
/jianshu/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = jianshu.settings

[deploy]
#url = http://localhost:6800/
project = jianshu
--------------------------------------------------------------------------------
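The [deploy] section only matters when pushing the project to a Scrapyd server. Assuming a local Scrapyd instance is running, the scrapyd-client package is installed, and the url line above is uncommented, deployment would look roughly like:

    scrapyd-deploy -p jianshu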