├── README.md
└── jianshu
    ├── jianshu
    │   ├── __init__.py
    │   ├── items.py
    │   ├── pipelines.py
    │   ├── settings.py
    │   └── spiders
    │       ├── __init__.py
    │       └── jianshuSpider.py
    ├── main.py
    └── scrapy.cfg

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
### Scrapy crawler framework practice


Create a Scrapy project:

`scrapy startproject jianshu`

Project layout:

    jianshu/
        scrapy.cfg
        jianshu/
            __init__.py
            items.py
            pipelines.py
            settings.py
            spiders/
                __init__.py
                ...

Run the Scrapy spider:

`scrapy crawl yourspidername`

Alternatively, run the project's main.py directly, which executes the same command:

`cmdline.execute("scrapy crawl jianshu".split())`

--------------------------------------------------------------------------------
/jianshu/jianshu/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field


class JianshuItem(Item):
    # Fields collected for each article on the monthly trending page
    title = Field()
    author = Field()
    url = Field()
    readNum = Field()
    commentNum = Field()
    likeNum = Field()

--------------------------------------------------------------------------------
/jianshu/jianshu/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class JianshuPipeline(object):
    def process_item(self, item, spider):
        return item
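The generated pipeline above is a pass-through. For illustration, a minimal sketch of a pipeline that does real work — discarding items whose title XPath matched nothing — might look like the following. The class name ValidateItemPipeline is hypothetical and not part of this project; it would also need an entry in ITEM_PIPELINES in settings.py to take effect:

    # Hypothetical example, not part of the original project.
    from scrapy.exceptions import DropItem

    class ValidateItemPipeline(object):
        def process_item(self, item, spider):
            # Drop items whose title failed to extract
            if not item.get('title'):
                raise DropItem('missing title: %s' % item.get('url'))
            return item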
--------------------------------------------------------------------------------
/jianshu/jianshu/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for jianshu project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'jianshu'

SPIDER_MODULES = ['jianshu.spiders']
NEWSPIDER_MODULE = 'jianshu.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'jianshu (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'jianshu.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'jianshu.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'jianshu.pipelines.SomePipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Export every scraped item to CSV; the feed format name must be lowercase 'csv'.
FEED_URI = u'/Users/apple/Documents/jianshu-monthly.csv'
FEED_FORMAT = 'csv'
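The two FEED_* settings at the bottom make Scrapy write every yielded item to a CSV file (the /Users/apple/... path is specific to the original author's machine). The same export can be requested per run from the command line without editing settings.py:

    scrapy crawl jianshu -o jianshu-monthly.csv -t csv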
--------------------------------------------------------------------------------
/jianshu/jianshu/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/jianshu/jianshu/spiders/jianshuSpider.py:
--------------------------------------------------------------------------------
# coding=utf-8
import urllib

from scrapy.spiders import CrawlSpider
from scrapy.selector import Selector
from scrapy.http import Request

from jianshu.items import JianshuItem


class Jianshu(CrawlSpider):
    name = 'jianshu'
    start_urls = ['http://www.jianshu.com/top/monthly']
    url = 'http://www.jianshu.com'

    def parse(self, response):
        selector = Selector(response)
        articles = selector.xpath('//ul[@class="article-list thumbnails"]/li')

        for article in articles:
            # Build a fresh item per article instead of mutating one shared
            # instance across iterations.
            item = JianshuItem()
            title = article.xpath('div/h4/a/text()').extract()
            url = article.xpath('div/h4/a/@href').extract()
            author = article.xpath('div/p/a/text()').extract()

            # Download each trending article's thumbnail; note that some
            # articles have no image.
            try:
                image = article.xpath('a/img/@src').extract()
                urllib.urlretrieve(image[0], '/Users/apple/Documents/images/%s-%s.jpg' % (author[0], title[0]))
            except Exception:
                print '--no---image--'

            listtop = article.xpath('div/div/a/text()').extract()
            likeNum = article.xpath('div/div/span/text()').extract()

            item['title'] = title
            # The href values are site-relative (they start with "/"), so join
            # them onto the bare domain to avoid producing a double slash.
            item['url'] = self.url + url[0]
            item['author'] = author

            item['readNum'] = listtop[0]
            # Some articles have comments disabled.
            try:
                item['commentNum'] = listtop[1]
            except IndexError:
                item['commentNum'] = ''
            item['likeNum'] = likeNum
            yield item

        # Follow the "load more" button to the next page, if present.
        next_link = selector.xpath('//*[@id="list-container"]/div/button/@data-url').extract()
        if len(next_link) == 1:
            next_link = self.url + str(next_link[0])
            print '----' + next_link
            yield Request(next_link, callback=self.parse)

--------------------------------------------------------------------------------
/jianshu/main.py:
--------------------------------------------------------------------------------
# Convenience entry point: equivalent to running "scrapy crawl jianshu"
# from the project root.
from scrapy import cmdline

cmdline.execute("scrapy crawl jianshu".split())
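main.py shells out to the scrapy CLI. As an aside, a crawl can also be started purely programmatically; a minimal sketch using Scrapy's CrawlerProcess follows (not part of this project, and assumed to be run from the project root so that get_project_settings() picks up scrapy.cfg):

    # Hypothetical alternative to main.py above.
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl('jianshu')  # the spider is looked up by its name attribute
    process.start()           # blocks until the crawl finishes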
--------------------------------------------------------------------------------
/jianshu/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = jianshu.settings

[deploy]
#url = http://localhost:6800/
project = jianshu
--------------------------------------------------------------------------------
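The [deploy] section only matters when pushing the project to a Scrapyd server. Assuming a local Scrapyd instance is running, the scrapyd-client package is installed, and the url line above is uncommented, deployment would look roughly like:

    scrapyd-deploy -p jianshu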