├── .gitignore
├── README.md
├── run.py
├── scrapy.cfg
└── scrapyuniversal
    ├── __init__.py
    ├── configs
    │   └── china.json
    ├── items.py
    ├── loaders.py
    ├── middlewares.py
    ├── pipelines.py
    ├── rules.py
    ├── settings.py
    ├── spiders
    │   ├── __init__.py
    │   ├── china.py
    │   └── universal.py
    ├── urls.py
    └── utils.py

/.gitignore:
--------------------------------------------------------------------------------
/.idea
*.pyc
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# ScrapyUniversal

A configuration-driven Scrapy universal spider: crawl rules, start URLs and item
extraction are described in JSON files under scrapyuniversal/configs/ and loaded
at runtime. A config is launched with `python3 run.py <config name>`, e.g.
`python3 run.py china` for the bundled china.json.
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
import sys
from scrapy.utils.project import get_project_settings
from scrapyuniversal.spiders.universal import UniversalSpider
from scrapyuniversal.utils import get_config
from scrapy.crawler import CrawlerProcess


def run():
    # the config name (JSON filename without extension) is the first CLI argument
    name = sys.argv[1]
    custom_settings = get_config(name)
    # which spider to run; defaults to the universal spider
    spider = custom_settings.get('spider', 'universal')
    project_settings = get_project_settings()
    settings = dict(project_settings.copy())
    # per-site settings from the JSON config override the project settings
    settings.update(custom_settings.get('settings'))
    process = CrawlerProcess(settings)
    process.crawl(spider, **{'name': name})
    process.start()


if __name__ == '__main__':
    run()
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = scrapyuniversal.settings

[deploy]
#url = http://localhost:6800/
project = scrapyuniversal
--------------------------------------------------------------------------------
/scrapyuniversal/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Python3WebSpider/ScrapyUniversal/0e44064177e3460efdce5a82c8ab968883cbbab0/scrapyuniversal/__init__.py
--------------------------------------------------------------------------------
/scrapyuniversal/configs/china.json:
--------------------------------------------------------------------------------
{
  "spider": "universal",
  "website": "中华网科技",
  "type": "新闻",
  "index": "http://tech.china.com/",
  "settings": {
    "USER_AGENT": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36"
  },
  "start_urls": {
    "type": "dynamic",
    "method": "china",
    "args": [
      5,
      10
    ]
  },
  "allowed_domains": [
    "tech.china.com"
  ],
  "rules": "china",
  "item": {
    "class": "NewsItem",
    "loader": "ChinaLoader",
    "attrs": {
      "title": [
        {
          "method": "xpath",
          "args": [
            "//h1[@id='chan_newsTitle']/text()"
          ]
        }
      ],
      "url": [
        {
          "method": "attr",
          "args": [
            "url"
          ]
        }
      ],
      "text": [
        {
          "method": "xpath",
          "args": [
            "//div[@id='chan_newsDetail']//text()"
          ]
        }
      ],
      "datetime": [
        {
          "method": "xpath",
          "args": [
            "//div[@id='chan_newsInfo']/text()"
          ],
          "re": "(\\d+-\\d+-\\d+\\s\\d+:\\d+:\\d+)"
        }
      ],
      "source": [
        {
          "method": "xpath",
          "args": [
            "//div[@id='chan_newsInfo']/text()"
          ],
          "re": "来源:(.*)"
        }
      ],
      "website": [
        {
          "method": "value",
          "args": [
            "中华网"
          ]
        }
      ]
    }
  }
}
--------------------------------------------------------------------------------
/scrapyuniversal/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy import Field, Item


class NewsItem(Item):
    title = Field()
    text = Field()
    datetime = Field()
    source = Field()
    url = Field()
    website = Field()
--------------------------------------------------------------------------------
/scrapyuniversal/loaders.py:
--------------------------------------------------------------------------------
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, Join, Compose


class NewsLoader(ItemLoader):
    default_output_processor = TakeFirst()


class ChinaLoader(NewsLoader):
    text_out = Compose(Join(), lambda s: s.strip())
    source_out = Compose(Join(), lambda s: s.strip())
--------------------------------------------------------------------------------
/scrapyuniversal/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class ScrapyuniversalSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
/scrapyuniversal/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class ScrapyuniversalPipeline(object):
    def process_item(self, item, spider):
        return item
--------------------------------------------------------------------------------
/scrapyuniversal/rules.py:
--------------------------------------------------------------------------------
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import Rule

# Crawl rule sets keyed by the "rules" value used in the JSON configs.
rules = {
    'china': (
        Rule(LinkExtractor(allow=r'article/.*\.html',
                           restrict_xpaths='//div[@id="left_side"]//div[@class="con_item"]'),
             callback='parse_item'),
        Rule(LinkExtractor(restrict_xpaths='//div[@id="pageStyle"]//a[contains(., "下一页")]'))
    )
}
--------------------------------------------------------------------------------
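For orientation, a minimal sketch of how one of the rule sets above is picked at runtime; it mirrors the rules.get(...) lookup that spiders/universal.py performs with the "rules" key from the JSON config:

from scrapyuniversal.rules import rules

# "rules": "china" in configs/china.json resolves to the tuple defined above
china_rules = rules.get('china')
print(len(china_rules))  # 2: the article-detail rule (parse_item callback) and the pagination rule
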
/scrapyuniversal/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for scrapyuniversal project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'scrapyuniversal'

SPIDER_MODULES = ['scrapyuniversal.spiders']
NEWSPIDER_MODULE = 'scrapyuniversal.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'scrapyuniversal (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'scrapyuniversal.middlewares.ScrapyuniversalSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'scrapyuniversal.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'scrapyuniversal.pipelines.ScrapyuniversalPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
--------------------------------------------------------------------------------
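Should the bundled pipeline or spider middleware ever be needed, the commented templates above would be uncommented roughly as follows; this is only a sketch, and the priority numbers are just the template defaults, not values the project ships enabled:

ITEM_PIPELINES = {
    'scrapyuniversal.pipelines.ScrapyuniversalPipeline': 300,
}

SPIDER_MIDDLEWARES = {
    'scrapyuniversal.middlewares.ScrapyuniversalSpiderMiddleware': 543,
}
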
/scrapyuniversal/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/scrapyuniversal/spiders/china.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapyuniversal.items import *
from scrapyuniversal.loaders import *


class ChinaSpider(CrawlSpider):
    name = 'china'
    allowed_domains = ['tech.china.com']
    start_urls = ['http://tech.china.com/articles/']

    rules = (
        # follow article detail pages and parse them
        Rule(LinkExtractor(allow=r'article/.*\.html',
                           restrict_xpaths='//div[@id="left_side"]//div[@class="con_item"]'),
             callback='parse_item'),
        # follow the "下一页" (next page) pagination links
        Rule(LinkExtractor(restrict_xpaths='//div[@id="pageStyle"]//a[contains(., "下一页")]'))
    )

    def parse_item(self, response):
        loader = ChinaLoader(item=NewsItem(), response=response)
        loader.add_xpath('title', '//h1[@id="chan_newsTitle"]/text()')
        loader.add_value('url', response.url)
        loader.add_xpath('text', '//div[@id="chan_newsDetail"]//text()')
        loader.add_xpath('datetime', '//div[@id="chan_newsInfo"]/text()', re=r'(\d+-\d+-\d+\s\d+:\d+:\d+)')
        loader.add_xpath('source', '//div[@id="chan_newsInfo"]/text()', re='来源:(.*)')
        loader.add_value('website', '中华网')
        yield loader.load_item()
--------------------------------------------------------------------------------
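This hand-written spider is the template that the config-driven universal spider below generalizes. It can be run with `scrapy crawl china` from the project root, or programmatically; a minimal sketch that mirrors the CrawlerProcess usage in run.py:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from scrapyuniversal.spiders.china import ChinaSpider

process = CrawlerProcess(get_project_settings())
process.crawl(ChinaSpider)
process.start()
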
/scrapyuniversal/spiders/universal.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapyuniversal.items import *
from scrapyuniversal.loaders import *
from scrapyuniversal.utils import get_config
from scrapyuniversal import urls
from scrapyuniversal.rules import rules


class UniversalSpider(CrawlSpider):
    name = 'universal'

    def __init__(self, name, *args, **kwargs):
        config = get_config(name)
        self.config = config
        # pick the crawl rule set referenced by the "rules" key of the config
        self.rules = rules.get(config.get('rules'))
        start_urls = config.get('start_urls')
        if start_urls:
            if start_urls.get('type') == 'static':
                self.start_urls = start_urls.get('value')
            elif start_urls.get('type') == 'dynamic':
                # call the generator in urls.py named by "method", with the configured args
                self.start_urls = list(eval('urls.' + start_urls.get('method'))(*start_urls.get('args', [])))
        self.allowed_domains = config.get('allowed_domains')
        super(UniversalSpider, self).__init__(*args, **kwargs)

    def parse_item(self, response):
        item = self.config.get('item')
        if item:
            cls = eval(item.get('class'))()
            loader = eval(item.get('loader'))(cls, response=response)
            # apply the attribute extraction rules from the config dynamically
            for key, value in item.get('attrs').items():
                for extractor in value:
                    if extractor.get('method') == 'xpath':
                        loader.add_xpath(key, *extractor.get('args'), **{'re': extractor.get('re')})
                    if extractor.get('method') == 'css':
                        loader.add_css(key, *extractor.get('args'), **{'re': extractor.get('re')})
                    if extractor.get('method') == 'value':
                        loader.add_value(key, *extractor.get('args'), **{'re': extractor.get('re')})
                    if extractor.get('method') == 'attr':
                        loader.add_value(key, getattr(response, *extractor.get('args')))
            yield loader.load_item()
--------------------------------------------------------------------------------
/scrapyuniversal/urls.py:
--------------------------------------------------------------------------------
def china(start, end):
    # generate the paginated article-list URLs, e.g. index_5.html .. index_10.html
    for page in range(start, end + 1):
        yield 'http://tech.china.com/articles/index_' + str(page) + '.html'
--------------------------------------------------------------------------------
/scrapyuniversal/utils.py:
--------------------------------------------------------------------------------
from os.path import realpath, dirname
import json


def get_config(name):
    # configs live next to this module, e.g. scrapyuniversal/configs/china.json
    path = dirname(realpath(__file__)) + '/configs/' + name + '.json'
    with open(path, 'r', encoding='utf-8') as f:
        return json.loads(f.read())
--------------------------------------------------------------------------------
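Finally, a short sketch of the two helpers above in action, run from the project root so the scrapyuniversal package is importable; the printed values follow from urls.py and configs/china.json:

from scrapyuniversal.urls import china
from scrapyuniversal.utils import get_config

print(list(china(5, 6)))
# ['http://tech.china.com/articles/index_5.html', 'http://tech.china.com/articles/index_6.html']

config = get_config('china')    # loads scrapyuniversal/configs/china.json
print(config['item']['class'])  # NewsItem
print(config['start_urls'])     # {'type': 'dynamic', 'method': 'china', 'args': [5, 10]}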