├── .gitignore
├── README.md
├── run.py
├── scrapy.cfg
└── scrapyuniversal
    ├── __init__.py
    ├── configs
    │   └── china.json
    ├── items.py
    ├── loaders.py
    ├── middlewares.py
    ├── pipelines.py
    ├── rules.py
    ├── settings.py
    ├── spiders
    │   ├── __init__.py
    │   ├── china.py
    │   └── universal.py
    ├── urls.py
    └── utils.py

/.gitignore:
--------------------------------------------------------------------------------
/.idea
*.pyc
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ScrapyUniversal

A configuration-driven Scrapy universal spider: crawl rules, start URLs and item
extraction are described in JSON files under scrapyuniversal/configs/ and loaded
at runtime. A config is launched with `python3 run.py <config name>`, e.g.
`python3 run.py china` for the bundled china.json.
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
import sys
from scrapy.utils.project import get_project_settings
from scrapyuniversal.spiders.universal import UniversalSpider
from scrapyuniversal.utils import get_config
from scrapy.crawler import CrawlerProcess


def run():
    # the config name (JSON filename without extension) is the first CLI argument
    name = sys.argv[1]
    custom_settings = get_config(name)
    # which spider to run; defaults to the universal spider
    spider = custom_settings.get('spider', 'universal')
    project_settings = get_project_settings()
    settings = dict(project_settings.copy())
    # per-site settings from the JSON config override the project settings
    settings.update(custom_settings.get('settings'))
    process = CrawlerProcess(settings)
    process.crawl(spider, **{'name': name})
    process.start()


if __name__ == '__main__':
    run()
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = scrapyuniversal.settings

[deploy]
#url = http://localhost:6800/
project = scrapyuniversal
--------------------------------------------------------------------------------
/scrapyuniversal/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Python3WebSpider/ScrapyUniversal/0e44064177e3460efdce5a82c8ab968883cbbab0/scrapyuniversal/__init__.py
--------------------------------------------------------------------------------
/scrapyuniversal/configs/china.json:
--------------------------------------------------------------------------------
{
  "spider": "universal",
  "website": "中华网科技",
  "type": "新闻",
  "index": "http://tech.china.com/",
  "settings": {
    "USER_AGENT": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36"
  },
  "start_urls": {
    "type": "dynamic",
    "method": "china",
    "args": [
      5,
      10
    ]
  },
  "allowed_domains": [
    "tech.china.com"
  ],
  "rules": "china",
  "item": {
    "class": "NewsItem",
    "loader": "ChinaLoader",
    "attrs": {
      "title": [
        {
          "method": "xpath",
          "args": [
            "//h1[@id='chan_newsTitle']/text()"
          ]
        }
      ],
      "url": [
        {
          "method": "attr",
          "args": [
            "url"
          ]
        }
      ],
      "text": [
        {
          "method": "xpath",
          "args": [
            "//div[@id='chan_newsDetail']//text()"
          ]
        }
      ],
      "datetime": [
        {
          "method": "xpath",
          "args": [
            "//div[@id='chan_newsInfo']/text()"
          ],
          "re": "(\\d+-\\d+-\\d+\\s\\d+:\\d+:\\d+)"
        }
      ],
      "source": [
        {
          "method": "xpath",
          "args": [
            "//div[@id='chan_newsInfo']/text()"
          ],
          "re": "来源:(.*)"
        }
      ],
      "website": [
        {
          "method": "value",
          "args": [
            "中华网"
          ]
        }
      ]
    }
  }
}
--------------------------------------------------------------------------------
/scrapyuniversal/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy import Field, Item


class NewsItem(Item):
    title = Field()
    text = Field()
    datetime = Field()
    source = Field()
    url = Field()
    website = Field()
--------------------------------------------------------------------------------
/scrapyuniversal/loaders.py:
--------------------------------------------------------------------------------
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, Join, Compose


class NewsLoader(ItemLoader):
    default_output_processor = TakeFirst()


class ChinaLoader(NewsLoader):
    text_out = Compose(Join(), lambda s: s.strip())
    source_out = Compose(Join(), lambda s: s.strip())
--------------------------------------------------------------------------------
/scrapyuniversal/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class ScrapyuniversalSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
/scrapyuniversal/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class ScrapyuniversalPipeline(object):
    def process_item(self, item, spider):
        return item
--------------------------------------------------------------------------------
/scrapyuniversal/rules.py:
--------------------------------------------------------------------------------
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule

# Crawl rule sets keyed by the "rules" value used in the JSON configs.
rules = {
    'china': (
        Rule(LinkExtractor(allow=r'article/.*\.html',
                           restrict_xpaths='//div[@id="left_side"]//div[@class="con_item"]'),
             callback='parse_item'),
        Rule(LinkExtractor(restrict_xpaths='//div[@id="pageStyle"]//a[contains(., "下一页")]'))
    )
}
--------------------------------------------------------------------------------
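For orientation, a minimal sketch of how one of the rule sets above is picked at runtime; it mirrors the rules.get(...) lookup that spiders/universal.py performs with the "rules" key from the JSON config:

from scrapyuniversal.rules import rules

# "rules": "china" in configs/china.json resolves to the tuple defined above
china_rules = rules.get('china')
print(len(china_rules))  # 2: the article-detail rule (parse_item callback) and the pagination rule
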
/scrapyuniversal/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for scrapyuniversal project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'scrapyuniversal'

SPIDER_MODULES = ['scrapyuniversal.spiders']
NEWSPIDER_MODULE = 'scrapyuniversal.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scrapyuniversal (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'scrapyuniversal.middlewares.ScrapyuniversalSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'scrapyuniversal.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'scrapyuniversal.pipelines.ScrapyuniversalPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------
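Should the bundled pipeline or spider middleware ever be needed, the commented templates above would be uncommented roughly as follows; this is only a sketch, and the priority numbers are just the template defaults, not values the project ships enabled:

ITEM_PIPELINES = {
    'scrapyuniversal.pipelines.ScrapyuniversalPipeline': 300,
}

SPIDER_MIDDLEWARES = {
    'scrapyuniversal.middlewares.ScrapyuniversalSpiderMiddleware': 543,
}
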
/scrapyuniversal/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/scrapyuniversal/spiders/china.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapyuniversal.items import *
from scrapyuniversal.loaders import *


class ChinaSpider(CrawlSpider):
    name = 'china'
    allowed_domains = ['tech.china.com']
    start_urls = ['http://tech.china.com/articles/']

    rules = (
        # follow article detail pages and parse them
        Rule(LinkExtractor(allow=r'article/.*\.html',
                           restrict_xpaths='//div[@id="left_side"]//div[@class="con_item"]'),
             callback='parse_item'),
        # follow the "下一页" (next page) pagination links
        Rule(LinkExtractor(restrict_xpaths='//div[@id="pageStyle"]//a[contains(., "下一页")]'))
    )

    def parse_item(self, response):
        loader = ChinaLoader(item=NewsItem(), response=response)
        loader.add_xpath('title', '//h1[@id="chan_newsTitle"]/text()')
        loader.add_value('url', response.url)
        loader.add_xpath('text', '//div[@id="chan_newsDetail"]//text()')
        loader.add_xpath('datetime', '//div[@id="chan_newsInfo"]/text()', re=r'(\d+-\d+-\d+\s\d+:\d+:\d+)')
        loader.add_xpath('source', '//div[@id="chan_newsInfo"]/text()', re='来源:(.*)')
        loader.add_value('website', '中华网')
        yield loader.load_item()
--------------------------------------------------------------------------------
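This hand-written spider is the template that the config-driven universal spider below generalizes. It can be run with `scrapy crawl china` from the project root, or programmatically; a minimal sketch that mirrors the CrawlerProcess usage in run.py:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from scrapyuniversal.spiders.china import ChinaSpider

process = CrawlerProcess(get_project_settings())
process.crawl(ChinaSpider)
process.start()
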
/scrapyuniversal/spiders/universal.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapyuniversal.items import *
from scrapyuniversal.loaders import *
from scrapyuniversal.utils import get_config
from scrapyuniversal import urls
from scrapyuniversal.rules import rules


class UniversalSpider(CrawlSpider):
    name = 'universal'

    def __init__(self, name, *args, **kwargs):
        config = get_config(name)
        self.config = config
        # pick the crawl rule set referenced by the "rules" key of the config
        self.rules = rules.get(config.get('rules'))
        start_urls = config.get('start_urls')
        if start_urls:
            if start_urls.get('type') == 'static':
                self.start_urls = start_urls.get('value')
            elif start_urls.get('type') == 'dynamic':
                # call the generator in urls.py named by "method", with the configured args
                self.start_urls = list(eval('urls.' + start_urls.get('method'))(*start_urls.get('args', [])))
        self.allowed_domains = config.get('allowed_domains')
        super(UniversalSpider, self).__init__(*args, **kwargs)

    def parse_item(self, response):
        item = self.config.get('item')
        if item:
            cls = eval(item.get('class'))()
            loader = eval(item.get('loader'))(cls, response=response)
            # apply the attribute extraction rules from the config dynamically
            for key, value in item.get('attrs').items():
                for extractor in value:
                    if extractor.get('method') == 'xpath':
                        loader.add_xpath(key, *extractor.get('args'), **{'re': extractor.get('re')})
                    if extractor.get('method') == 'css':
                        loader.add_css(key, *extractor.get('args'), **{'re': extractor.get('re')})
                    if extractor.get('method') == 'value':
                        loader.add_value(key, *extractor.get('args'), **{'re': extractor.get('re')})
                    if extractor.get('method') == 'attr':
                        loader.add_value(key, getattr(response, *extractor.get('args')))
            yield loader.load_item()
--------------------------------------------------------------------------------
/scrapyuniversal/urls.py:
--------------------------------------------------------------------------------
def china(start, end):
    # generate the paginated article-list URLs, e.g. index_5.html .. index_10.html
    for page in range(start, end + 1):
        yield 'http://tech.china.com/articles/index_' + str(page) + '.html'
--------------------------------------------------------------------------------
/scrapyuniversal/utils.py:
--------------------------------------------------------------------------------
from os.path import realpath, dirname
import json


def get_config(name):
    # configs live next to this module, e.g. scrapyuniversal/configs/china.json
    path = dirname(realpath(__file__)) + '/configs/' + name + '.json'
    with open(path, 'r', encoding='utf-8') as f:
        return json.loads(f.read())
--------------------------------------------------------------------------------
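Finally, a short sketch of the two helpers above in action, run from the project root so the scrapyuniversal package is importable; the printed values follow from urls.py and configs/china.json:

from scrapyuniversal.urls import china
from scrapyuniversal.utils import get_config

print(list(china(5, 6)))
# ['http://tech.china.com/articles/index_5.html', 'http://tech.china.com/articles/index_6.html']

config = get_config('china')    # loads scrapyuniversal/configs/china.json
print(config['item']['class'])  # NewsItem
print(config['start_urls'])     # {'type': 'dynamic', 'method': 'china', 'args': [5, 10]}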