├── crawler
│   ├── __init__.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── common_spider.py
│   ├── items.py
│   ├── incremental.py
│   ├── pipelines.py
│   ├── downloader.py
│   ├── utils.py
│   └── settings.py
├── requirements.txt
├── scrapy.cfg
├── check_json_file.py
├── configs
│   └── dianping_beijingyumao.cfg
├── run_crawler.sh
├── LICENSE
└── README.md

/crawler/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
Scrapy>=0.12.0
configobj

--------------------------------------------------------------------------------
/crawler/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/topics/scrapyd.html

[settings]
default = crawler.settings

[deploy]
#url = http://localhost:6800/
project = crawler

--------------------------------------------------------------------------------
/crawler/items.py:
--------------------------------------------------------------------------------
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html

from scrapy.item import Item, Field

class CommonItem(Item):
    # define the fields for your item here like:
    # name = Field()
    url = Field()

--------------------------------------------------------------------------------
/check_json_file.py:
--------------------------------------------------------------------------------
#encoding=utf8
# Promote a temporary JSON feed: copy it to the final path only if it parses
# as valid JSON, then delete the temporary file.
import sys
import json
import os

print sys.argv
try:
    tmp_json_file = open(sys.argv[1])
    json_data = json.load(tmp_json_file)
    tmp_json_file.close()
    json_file = open(sys.argv[2], 'wb')
    json_file.write(open(sys.argv[1], 'rb').read())
    json_file.close()
    print 'well done!'
except Exception, e:
    print e
    print 'error in load tmp json'

os.remove(sys.argv[1])

--------------------------------------------------------------------------------
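For context, run_crawler.sh (further down in this repo) calls the script above after every crawl, so the temporary feed only replaces the final JSON file when it parses cleanly. The sketch below shows the equivalent call from Python; it is illustrative only and uses the /tmp paths the shell script picks when no city argument is given:

```python
# Illustrative only: mirrors what run_crawler.sh does after `scrapy crawl dianping_beijingyumao`.
import subprocess

subprocess.call([
    '/usr/bin/python', 'check_json_file.py',
    '/tmp/.dianping_beijingyumao.json.tmp',  # temporary feed written by scrapy crawl
    '/tmp/dianping_beijingyumao.json',       # final feed, overwritten only if the tmp file is valid JSON
])
```
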
/configs/dianping_beijingyumao.cfg:
--------------------------------------------------------------------------------
[dianping_beijingyumao]
allowed_domains = dianping.com
start_urls = http://www.dianping.com/search/category/2/45/g152
list_url_pattern = .*category/2/45/g152[p\d]*
list_restrict_xpaths = '<>'
list_content = list,item
item_url_pattern = .*shop/\d+
item_restrict_xpaths = <>
item_content = name,address,region,intro,phone_num,cover_image,hours,sport
#item_incremental = yes
item_name_xpaths = <>
item_address_xpaths = <>
item_region_xpaths = <>
item_phone_num_xpaths = <>
item_cover_image_xpaths = <>
item_hours_xpaths = <>
item_sport_xpaths = "羽毛球"
download_delay = 5

--------------------------------------------------------------------------------
/run_crawler.sh:
--------------------------------------------------------------------------------
CRAWLER_DIR=$(cd "$(dirname "$0")"; pwd)
XVFB=/usr/bin/Xvfb
# Start a virtual display for the WebKit renderer if none is running
# (settings.py points DISPLAY at :2).
if [ -x "$XVFB" ]
then
    count=`ps -ef | grep Xvfb | grep -v "grep" | wc -l`
    echo $count
    if [ $count -eq 0 ]
    then
        Xvfb :2 -screen 0 640x480x16 -nolisten tcp &
    fi
fi

cd $CRAWLER_DIR
# Kill any crawl that is still writing the same feed.
if [ $2 ]
then
    ps -ef|grep $1_$2.json|awk '{print$2}'|xargs -i kill -9 {}
else
    ps -ef|grep $1.json|awk '{print$2}'|xargs -i kill -9 {}
fi
if [ $2 ]
then
    JSON_DIR=/tmp/$2
    if [ ! -d "$JSON_DIR" ]
    then
        mkdir "$JSON_DIR"
    fi
    scrapy crawl $1 --set FEED_URI=$JSON_DIR/.$1_$2.json.tmp --set FEED_FORMAT=json
else
    JSON_DIR=/tmp
    scrapy crawl $1 --set FEED_URI=$JSON_DIR/.$1.json.tmp --set FEED_FORMAT=json
fi
if [ $2 ]
then
    /usr/bin/python $CRAWLER_DIR/check_json_file.py $JSON_DIR/.$1_$2.json.tmp $JSON_DIR/$1_$2.json
else
    /usr/bin/python $CRAWLER_DIR/check_json_file.py $JSON_DIR/.$1.json.tmp $JSON_DIR/$1.json
fi

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2014 Andy Qiu

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/crawler/incremental.py:
--------------------------------------------------------------------------------
#coding=utf8
from scrapy.contrib.downloadermiddleware.httpcache import FilesystemCacheStorage
from scrapy.utils.httpobj import urlparse_cached
from scrapy.exceptions import IgnoreRequest
import settings

class IncrementalDownloader(object):
    """Downloader middleware that stores responses in the HTTP cache and flags
    responses whose body is unchanged since the previous crawl."""

    storage = None
    storage_class = FilesystemCacheStorage
    ignore_schemes = []
    ignore_http_codes = []

    def _get_storage(self, spider):
        return self.storage_class(spider.settings)

    def process_response(self, request, response, spider):
        if not self.storage:
            self.storage = self._get_storage(spider)
        if not self.ignore_schemes:
            self.ignore_schemes = spider.settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
        if not self.ignore_http_codes:
            self.ignore_http_codes = map(int, spider.settings.getlist('HTTPCACHE_IGNORE_HTTP_CODES'))
        cached_response = self.storage.retrieve_response(spider, request)
        if cached_response and cached_response.body == response.body:
            # The body has not changed since the last crawl: flag the response
            # so the spider can skip re-parsing it (see CommonSpider.parse).
            response.flags.append('exist_in_cache')
            # raise IgnoreRequest
        if self.is_cacheable(request) and self.is_cacheable_response(response):
            self.storage.store_response(spider, request, response)
        return response

    def is_cacheable_response(self, response):
        return response.status not in self.ignore_http_codes

    def is_cacheable(self, request):
        return urlparse_cached(request).scheme not in self.ignore_schemes

--------------------------------------------------------------------------------
/crawler/pipelines.py:
--------------------------------------------------------------------------------
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/topics/item-pipeline.html

from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals
from scrapy.exceptions import DropItem
from scrapy.http import Request

class CommonPipeline(object):
    def __init__(self):
        self.duplicates = {}
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_opened(self, spider):
        self.duplicates[spider] = {}

    def spider_closed(self, spider):
        for k in self.duplicates[spider].keys():
            del self.duplicates[spider][k]
        del self.duplicates[spider]
        del self.duplicates

    def ensure_not_empty(self, item, field):
        if field in item:
            if item[field] == []:
                raise DropItem("Empty item found: %s" % item)

    def ensure_not_duplicate(self, spider, item, field):
        if field in item:
            if field not in self.duplicates[spider]:
                self.duplicates[spider][field] = set()
            if item[field] and type(item[field]) is list:
                if item[field][0] in self.duplicates[spider][field]:
                    raise DropItem("Duplicate item found: %s" % item)
                else:
                    self.duplicates[spider][field].add(item[field][0])

    def process_item(self, item, spider):
        self.ensure_not_empty(item, 'url')
        self.ensure_not_duplicate(spider, item, 'url')
        return item

--------------------------------------------------------------------------------
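CommonPipeline drops items whose `url` field is empty or already seen during the crawl. The following sketch exercises that behaviour outside a real crawl; it is illustrative only, assumes the old Scrapy release pinned in requirements.txt is importable, and the fake spider exists purely as the dictionary key the pipeline expects:

```python
from scrapy.exceptions import DropItem
from crawler.pipelines import CommonPipeline
from crawler.items import CommonItem

class FakeSpider(object):
    """Stand-in for a real spider; the pipeline only uses it as a dict key."""

spider = FakeSpider()
pipeline = CommonPipeline()
pipeline.spider_opened(spider)          # normally fired by the spider_opened signal

item = CommonItem(url=[u'http://www.dianping.com/shop/1234'])
pipeline.process_item(item, spider)     # first occurrence of this url passes through
try:
    pipeline.process_item(item, spider) # the same url again raises DropItem
except DropItem as e:
    print e
```
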
/crawler/downloader.py:
--------------------------------------------------------------------------------
#coding=utf8
from scrapy.http import Request, FormRequest, HtmlResponse
from threading import Thread, Event
import gtk
import webkit
import jswebkit
import urlparse

class Render(object):
    """Load the response body in an off-screen GTK WebKit view and capture the
    JavaScript-rendered HTML once the 'load-finished' signal fires."""
    pending = Event()

    def __init__(self, request, response):
        try:
            print 'Render.__init__'
            self.webview = webkit.WebView()
            self.webview.connect('load-finished', self.load_finished)
            parsed_url = urlparse.urlparse(request.url)
            self.webview.load_html_string(response.body, parsed_url.scheme + '://' + parsed_url.netloc)
            #self.webview.load_uri(request.url)
        except Exception, e:
            print e
        finally:
            gtk.main()

    def load_finished(self, *args, **kw):
        try:
            print 'Render.load_finished'
            js = jswebkit.JSContext(self.webview.get_main_frame().get_global_context())
            self.rendered_html = str(js.EvaluateScript('document.body.innerHTML'))
            self.pending.set()
        except Exception, e:
            print e
        finally:
            gtk.main_quit()

class WebkitDownloader(object):
    def process_response(self, request, response, spider):
        try:
            render = Render(request, response)
            render.pending.wait()
            response = response.replace(body=render.rendered_html)
        except Exception, e:
            print e
        return response

    #def process_request(self, request, spider):
        #webview = webkit.WebView()
        #webview.connect('load-finished', lambda v, f: gtk.main_quit())
        #webview.load_uri(request.url)
        #gtk.main()
        #js = jswebkit.JSContext(webview.get_main_frame().get_global_context())
        #renderedBody = str(js.EvaluateScript('document.body.innerHTML'))
        #return HtmlResponse(request.url, body=renderedBody)

--------------------------------------------------------------------------------
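The middleware above is only activated when a spider's config sets `js_parser = on`; settings.py (later in this dump) then registers it, which amounts to the following setting:

```python
# Effect of `js_parser = on` in the spider config (see crawler/settings.py below).
DOWNLOADER_MIDDLEWARES = {
    'crawler.downloader.WebkitDownloader': 543,
}
```

run_crawler.sh starts Xvfb on display :2 and settings.py exports DISPLAY=":2", so the WebKit view can render pages without a real X server.
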
/README.md:
--------------------------------------------------------------------------------
ajax_crawler
============

A flexible web crawler based on Scrapy for fetching Ajax-driven pages as well as most other kinds of web pages.

Easy to use: to set up a new crawler, you just write a config file and run it.

# Usage
* Edit A Config File In The `configs` Directory
```shell
cd configs
touch xxx.cfg
vim xxx.cfg
```
like this
```INI
[xxx] # crawler name, must match the config file name.
allowed_domains = dianping.com # domain name, can be a list.
start_urls = http://www.dianping.com/search/category/2/45/g152 # start url, must be a concrete url.
list_url_pattern = .*category/2/45/g152[p\d]* # list url pattern, regular expressions are allowed here.
list_restrict_xpaths = '<>' # list restrict xpaths, used to locate item urls.
list_content = list,item # what kinds of content can be found under the list restrict xpaths.
item_url_pattern = .*shop/\d+ # item url pattern, regular expressions are allowed here.
item_restrict_xpaths = <> # item restrict xpaths, used to locate item contents.
item_content = name,address,region,intro,phone_num,cover_image,hours,sport # field names that can be found under the item restrict xpaths.
#item_incremental = yes # whether this crawler should be incremental (i.e. use the HTTP cache).
item_name_xpaths = <> # xpaths used to extract each item field.
item_address_xpaths = <>
item_region_xpaths = <>
item_phone_num_xpaths = <>
item_cover_image_xpaths = <>
item_hours_xpaths = <>
item_sport_xpaths = "羽毛球" # can also be a literal string.
download_delay = 5 # download delay to reduce crawling frequency.
#js_parser = on # whether to use WebKit to execute JavaScript and re-render pages.
```
* Then Just Run The Crawler
```bash
scrapy crawl xxx
```

--------------------------------------------------------------------------------
/crawler/utils.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from configobj import ConfigObj

import os
import sys
import urlparse
import re

from os.path import dirname

path = os.path.abspath(os.path.dirname(__file__))
sys.path.append(path)

CRAWLER_DIR = dirname(path)
print CRAWLER_DIR

def load_configs():
    filenames = []
    try:
        cfg_dir = CRAWLER_DIR + '/configs/'
        if CRAWLER_DIR.startswith("~"):
            filenames += [ os.path.expanduser(cfg_dir) + filename for filename in os.listdir(os.path.expanduser(cfg_dir)) ]
        else:
            filenames += [ cfg_dir + filename for filename in os.listdir(cfg_dir) ]
    except Exception, e:
        print e

    cfg_list = []
    for filename in filenames:
        if re.search('\.cfg$', filename) != None:
            cfg_list.append(filename)

    configs = []
    for cfg in cfg_list:
        try:
            configs.append(ConfigObj(cfg, encoding='utf8'))
        except Exception, e:
            print e
            print "error occurred when reading config file @" + cfg

    return configs

configs = load_configs()
# configuration applied to this spider
site_config = None
spider_name = None
spider_config = None
exist_site = False

print sys.argv
for argv in sys.argv:
    for config in configs:
        if argv in config:
            exist_site = True
            spider_name = argv
            site_config = config
            spider_config = site_config[spider_name]

if exist_site == False:
    if 'crawl' in sys.argv:
        print "Unable to find spider: " + sys.argv[sys.argv.index('crawl') + 1]
        sys.exit()
    else:
        try:
            net_loc = urlparse.urlsplit(sys.argv[sys.argv.index('shell') + 1]).netloc
            site_name = re.search('([^\.]+)\.[^\.]+$', net_loc).group(1)
            for config in configs:
                if site_name in config:
                    spider_name = site_name
                    site_config = config
                    spider_config = site_config[spider_name]
                    exist_site = True
            if exist_site == False:
                spider_name = configs[0].keys()[0]
                site_config = configs[0]
                spider_config = site_config[spider_name]
        except:
            print "Unable to resolve the commands"
            sys.exit()

--------------------------------------------------------------------------------
/crawler/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# Scrapy settings for crawler project
#
# For simplicity, this file contains only the most important settings by
# default.
All the other settings are documented here: 6 | # 7 | # http://doc.scrapy.org/topics/settings.html 8 | # 9 | from configobj import ConfigObj 10 | 11 | import os 12 | import sys 13 | import datetime 14 | import re 15 | from os.path import dirname 16 | 17 | path = os.path.abspath(os.path.dirname(__file__)) 18 | sys.path.append(path) 19 | from utils import * 20 | 21 | configs = load_configs() 22 | 23 | #应用于此spider的配置 24 | site_config = None 25 | spider_name = None 26 | spider_config = None 27 | exist_site = False 28 | 29 | for argv in sys.argv: 30 | for config in configs: 31 | if argv in config: 32 | exist_site = True 33 | spider_name = argv 34 | site_config = config 35 | spider_config = site_config[spider_name] 36 | 37 | BOT_NAME = 'AjaxSpider' 38 | #BOT_VERSION = '2.0' 39 | 40 | SPIDER_MODULES = ['crawler.spiders'] 41 | NEWSPIDER_MODULE = 'crawler.spiders' 42 | DEFAULT_ITEM_CLASS = 'crawler.items.CommonItem' 43 | #USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION) 44 | #USER_AGENT = 'Mozilla/5.0 (X11; U; Linux i686; zh-CN; rv:1.9.2.3) Gecko/%s Fedora/3.6.3-4.fc13 Firefox/3.6.3' % (datetime.date.today().strftime("%Y%m%d")) 45 | USER_AGENT = 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)' 46 | #USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5' 47 | 48 | LOG_FILE=(CRAWLER_DIR + '/log/'+ spider_name + '_log_'+datetime.date.today().strftime("%Y%m%d")+'.log') 49 | 50 | ITEM_PIPELINES = { 51 | 'crawler.pipelines.CommonPipeline':300, 52 | } 53 | 54 | if spider_config.has_key('download_delay'): 55 | DOWNLOAD_DELAY = float(spider_config['download_delay']) 56 | else: 57 | DOWNLOAD_DELAY = 0.25 58 | RANDOMIZE_DOWNLOAD_DELAY = True 59 | 60 | print spider_config.keys() 61 | if spider_config.has_key('js_parser') and spider_config['js_parser']=='on': 62 | DOWNLOADER_MIDDLEWARES = { 63 | 'crawler.downloader.WebkitDownloader': 543, 64 | } 65 | 66 | incremental = False 67 | for cfg_key in spider_config.iterkeys(): 68 | if cfg_key.endswith('_incremental') and spider_config[cfg_key]=='yes': 69 | incremental = True 70 | break 71 | if incremental: 72 | print 'incremental is : ON' 73 | HTTPCACHE_EXPIRATION_SECS = 172800 74 | FEED_STORE_EMPTY = True 75 | DOWNLOADER_MIDDLEWARES = { 76 | 'crawler.incremental.IncrementalDownloader': 544, 77 | } 78 | 79 | RETRY_HTTP_CODES = [500, 503, 504, 400, 408] 80 | if spider_config.has_key('ignore_http_code'): 81 | RETRY_HTTP_CODES.remove(int(spider_config['ignore_http_code'])) 82 | 83 | os.environ["DISPLAY"] = ":2" 84 | 85 | COOKIES_ENABLED = True 86 | COOKIES_DEBUG = True 87 | -------------------------------------------------------------------------------- /crawler/spiders/common_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy.spider import * 3 | from scrapy.selector import * 4 | from scrapy.http import Request, FormRequest, HtmlResponse 5 | from scrapy.conf import settings 6 | from scrapy.item import Item, Field 7 | from scrapy.utils.response import get_base_url 8 | from scrapy.utils.url import urljoin_rfc 9 | 10 | from configobj import ConfigObj 11 | from crawler.items import CommonItem 12 | from crawler.utils import * 13 | 14 | import httplib 15 | import urllib 16 | import cookielib 17 | import urlparse 18 | import os 19 | import re 20 | import sys 21 | import datetime 22 | import itertools 23 | import random 24 | import hashlib 25 | import copy 26 | 27 | class DataNode: 28 
| def __init__(self,name): 29 | self.name=name 30 | self.parent = None 31 | self.children=set() 32 | self.url_patterns = set() 33 | self.restrict_xpaths = set() 34 | self.xpaths = set() 35 | self.incremental = False 36 | def addChild(self, child): 37 | self.children.add(child) 38 | def getChildren(self): 39 | return self.children 40 | def setParent(self, parent): 41 | if parent.name != self.name: 42 | self.parent = parent 43 | def setUrlPatterns(self, url_patterns): 44 | self.url_patterns = set(url_patterns) 45 | def getUrlPatterns(self): 46 | return self.url_patterns 47 | def setRestrictXPaths(self, restrict_xpaths): 48 | self.restrict_xpaths = set(restrict_xpaths) 49 | def getRestrictXPaths(self, restrict_xpaths): 50 | return self.restrict_xpaths 51 | def setXPaths(self, xpaths): 52 | self.xpaths = set(xpaths) 53 | def getXPaths(self): 54 | return self.xpaths 55 | def setIncremental(self, incremental): 56 | self.incremental = incremental 57 | def getIncremental(self, incremental): 58 | return self.incremental 59 | def __repr__(self): 60 | return""%self.name 61 | 62 | class XPath: 63 | def __init__(self): 64 | pass 65 | 66 | def eval(self): 67 | pass 68 | 69 | try: 70 | BaseSpider = Spider 71 | except: 72 | pass 73 | 74 | class CommonSpider(BaseSpider): 75 | 76 | configs = load_configs() 77 | 78 | #settings applied to the spider 79 | site_config = None 80 | spider_name = None 81 | spider_config = None 82 | exist_site = False 83 | 84 | print sys.argv 85 | for argv in sys.argv: 86 | for config in configs: 87 | if argv in config: 88 | exist_site = True 89 | spider_name = argv 90 | site_config = config 91 | spider_config = site_config[spider_name] 92 | 93 | if exist_site == False: 94 | if 'crawl' in sys.argv: 95 | print "Unable to find spider: " + sys.argv[sys.argv.index('crawl') + 1] 96 | sys.exit() 97 | else: 98 | try: 99 | net_loc = urlparse.urlsplit(sys.argv[sys.argv.index('shell') + 1]).netloc 100 | site_name = re.search('([^\.]+)\.[^\.]+$',net_loc).group(1) 101 | for config in configs: 102 | if site_name in config: 103 | spider_name = site_name 104 | site_config = config 105 | spider_config = site_config[spider_name] 106 | exist_site = True 107 | if exist_site == False: 108 | spider_name = configs[0].keys()[0] 109 | site_config = configs[0] 110 | spider_config = site_config[spider_name] 111 | except: 112 | print "Unable to resolve the commands" 113 | sys.exit() 114 | 115 | item_fields = set() 116 | setting_vars = {} 117 | params = {} 118 | root_node = DataNode('root') 119 | site_tree = {'root':root_node} 120 | for cfg_key in spider_config.iterkeys(): 121 | #初始化自定义变量 122 | if re.search('^\$',cfg_key): 123 | var_name = re.sub('^\$','',cfg_key) 124 | setting_vars[var_name] = spider_config[cfg_key] 125 | #/*------------初始化数据节点树 begin-------------------*/ 126 | #初始化url种子列表 127 | if cfg_key.endswith('_url_pattern'): 128 | node_name = cfg_key.replace('_url_pattern', '') 129 | node_url_patterns = spider_config[cfg_key] 130 | if type(node_url_patterns) is not list: 131 | node_url_patterns = [node_url_patterns] 132 | if not site_tree.has_key(node_name): 133 | site_tree[node_name] = DataNode(node_name) 134 | node = site_tree[node_name] 135 | node.setUrlPatterns(node_url_patterns) 136 | if cfg_key.endswith('_restrict_xpaths'): 137 | node_name = cfg_key.replace('_restrict_xpaths', '') 138 | node_restrict_xpaths = spider_config[cfg_key] 139 | if type(node_restrict_xpaths) is not list: 140 | node_restrict_xpaths = [node_restrict_xpaths] 141 | if not site_tree.has_key(node_name): 142 | 
site_tree[node_name] = DataNode(node_name) 143 | node = site_tree[node_name] 144 | node.setRestrictXPaths(node_restrict_xpaths) 145 | if cfg_key.endswith('_incremental'): 146 | node_name = cfg_key.replace('_incremental', '') 147 | node_incremental = spider_config[cfg_key] 148 | if not site_tree.has_key(node_name): 149 | site_tree[node_name] = DataNode(node_name) 150 | node = site_tree[node_name] 151 | if node_incremental == 'yes': 152 | node.setIncremental(True) 153 | #/*------------初始化数据节点树 end-------------------*/ 154 | params = setting_vars.copy() 155 | 156 | for k,v in site_tree.items(): 157 | if k+'_content' in spider_config: 158 | node_contents = spider_config[k+'_content'] 159 | if type(node_contents) is not list: 160 | node_contents = [node_contents] 161 | for node_content in node_contents: 162 | if not site_tree.has_key(node_content): 163 | site_tree[node_content] = DataNode(node_content) 164 | #数据项定义 165 | if spider_config.has_key(node_content) or spider_config.has_key('item_'+node_content+'_xpaths'): 166 | field_xpaths = spider_config[node_content] if spider_config.has_key(node_content) else spider_config['item_'+node_content+'_xpaths'] 167 | if type(field_xpaths) is not list: 168 | field_xpaths = [field_xpaths] 169 | site_tree[node_content].setXPaths(field_xpaths) 170 | item_fields.add(node_content) 171 | if not CommonItem().fields.has_key(node_content): 172 | CommonItem().fields[node_content] = Field() 173 | v.addChild(site_tree[node_content]) 174 | site_tree[node_content].setParent(v) 175 | 176 | item_fields.add('url') 177 | CommonItem().fields['url'] = Field() 178 | 179 | for k,v in site_tree.iteritems(): 180 | if not v.parent and v.name != 'root': 181 | v.parent = site_tree['root'] 182 | site_tree['root'].addChild(v) 183 | 184 | for item_field in item_fields: 185 | if not CommonItem().fields.has_key(item_field): 186 | CommonItem().fields[item_field] = {} 187 | 188 | name = spider_name 189 | allowed_domains = spider_config['allowed_domains'] 190 | if type(allowed_domains) is not list: 191 | allowed_domains = [allowed_domains] 192 | 193 | start_urls = spider_config['start_urls'] 194 | if type(start_urls) is not list: 195 | start_urls = [start_urls] 196 | 197 | start_urls = tuple(start_urls) 198 | 199 | items = {} 200 | queue = {} 201 | 202 | def __init__(self, *a, **kw): 203 | super(CommonSpider, self).__init__(*a, **kw) 204 | 205 | def _eval_xpath(self,hxs,xpath): 206 | """ 获得xpath表达式语句 207 | """ 208 | ret_val = '' 209 | if xpath.find('&') > -1: 210 | m = re.search(r'<<(.+)&(.*)>>',xpath) 211 | xpath_ex = m.group(1) 212 | reg_ex = m.group(2) 213 | ret_val += hxs + """.select('""" + xpath_ex + """').re('""" + reg_ex + """'.decode('utf8'))""" 214 | else: 215 | m = re.search(r'<<(.+)>>',xpath) 216 | xpath_ex = m.group(1) 217 | ret_val += hxs + """.select('""" + xpath_ex + """').extract()""" 218 | 219 | return ret_val 220 | 221 | def _join_lists(self, ls, linearity=False): 222 | """ 交叉连接,参数ls为需要做连接的list,linearity表示交叉连接是否线性的(一一对应) 223 | 224 | """ 225 | ret_val = [] 226 | for x in ls: 227 | if type(x) is not list: 228 | x = [x] 229 | #if len(ls) <= 1: 230 | #return ls 231 | if linearity == True: 232 | ret_val = itertools.imap(None,*ls) 233 | return tuple(ret_val) 234 | ret_val = itertools.product(*ls) 235 | return tuple(ret_val) 236 | 237 | def _process_ex(self, hxs, ex_str, url=None): 238 | """ 对表达式的处理 239 | """ 240 | ret_val = '' 241 | exs = ex_str.split('~') 242 | ret_val += '[' 243 | for i,ex in enumerate(exs): 244 | if ex.startswith('<<') and ex.endswith('>>'): 245 | ret_val 
+= self._eval_xpath(hxs,ex) 246 | else: 247 | try: 248 | #if ex.find('(')>-1 and ex.find(')')>-1: 249 | try: 250 | ret_val += '[u\'' + re.search(ex, url).group(1) + '\']' 251 | #else: 252 | except: 253 | ret_val += '[u\'' + re.search(ex, url).group(0) + '\']' 254 | except: 255 | ret_val += '[u\'' + ex + '\']' 256 | if i == len(exs)-1: 257 | ret_val += ']' 258 | else: 259 | ret_val += ',' 260 | return ret_val 261 | 262 | def eval_custom_var(self, response, hxs, var): 263 | """ 计算自定义变量表达式的结果 264 | """ 265 | ret_val = [] 266 | tmp_v = var[:] 267 | if type(tmp_v) is not list: 268 | tmp_v = [tmp_v] 269 | for i in xrange(len(tmp_v)): 270 | tmp_var = tmp_v[i].split('~') 271 | for j in xrange(len(tmp_var)): 272 | val = tmp_var[j] 273 | if val.find('{')>-1 and val.find('}')>-1: 274 | val = eval(""" " """+val.replace('{','').replace('}','')+""" " """) 275 | if val.startswith('<<') and val.endswith('>>'): 276 | rs_var = eval(self._eval_xpath('hxs',val)) 277 | if rs_var: 278 | tmp_var[j] = rs_var 279 | else: 280 | try: 281 | rs_var = eval(val) 282 | if rs_var: 283 | tmp_var[j] = rs_var 284 | except: 285 | try: 286 | rs_var = re.search(val,response.url) 287 | if rs_var: 288 | try: 289 | tmp_var[j] = rs_var.group(1) 290 | except: 291 | tmp_var[j] = rs_var.group(0) 292 | except: 293 | pass 294 | if type(tmp_var[j]) is not list: 295 | tmp_var[j] = [tmp_var[j]] 296 | tmp_v[i] = [''.join([unicode(_x) for _x in tmp_x]) for tmp_x in self._join_lists(tmp_var)] 297 | for tmp_x in tmp_v: 298 | if type(tmp_x) is list: 299 | ret_val += tmp_x 300 | else: 301 | ret_val.append(tmp_x) 302 | 303 | return ret_val 304 | 305 | 306 | def extract_url(self, response, hxs, url_pattern, restrict_xpaths, params): 307 | """ 提取Url 308 | """ 309 | ret_urls = [] 310 | length = 0 311 | tmp_urls = [] 312 | exists = set() 313 | base_url = get_base_url(response) 314 | #TODO:exist_list_param handler 315 | if not restrict_xpaths: 316 | restrict_xpaths = ["<>"] 317 | for restrict_xpath in restrict_xpaths: 318 | url_xpath = restrict_xpath 319 | closed_xpath = False 320 | if re.search('/([^/]+\(\).*>>)', restrict_xpath) or re.search('/@([^/]+>>)', restrict_xpath): 321 | closed_xpath = True 322 | if not closed_xpath: 323 | joint_xpath = '' 324 | level = 0 325 | while not self.eval_custom_var(response, hxs, restrict_xpath.replace('/>>','>>').replace('>>', joint_xpath+'/@href>>')) or self.eval_custom_var(response, hxs, restrict_xpath.replace('/>>','>>').replace('>>', joint_xpath+'/@href>>'))==[restrict_xpath.replace('/>>','>>').replace('>>', joint_xpath+'/@href>>')]: 326 | joint_xpath += '/*' 327 | level += 1 328 | if level>=5: 329 | break 330 | url_xpath = restrict_xpath.replace('/>>','>>').replace('>>', joint_xpath+'/@href>>') 331 | tmp_urls += self.eval_custom_var(response, hxs, url_xpath) 332 | sorted_tmp_urls = list(set(tmp_urls)) 333 | sorted_tmp_urls.sort(key=tmp_urls.index) 334 | length = len(sorted_tmp_urls) 335 | _url_pattern = self.process_url_pattern(url_pattern, response, hxs) 336 | for i,tmp_url in enumerate(sorted_tmp_urls): 337 | #handle incomplete urls 338 | tmp_url = urljoin_rfc(base_url, tmp_url) 339 | try: 340 | if re.search(_url_pattern, tmp_url): 341 | #if url_pattern.find("(")>-1 and url_pattern.find(")")>-1: 342 | try: 343 | tmp_url = re.search(_url_pattern, tmp_url).group(1) 344 | #else: 345 | except: 346 | tmp_url = re.search(_url_pattern, tmp_url).group(0) 347 | else: 348 | continue 349 | except: 350 | pass 351 | tmp_params = re.findall('\{([^\{\}]*)\}', tmp_url) 352 | processed_url = tmp_url 353 | if tmp_params: 354 
| exist_invalid_param = False 355 | for tmp_param in tmp_params: 356 | if not tmp_param or tmp_param not in self.params: 357 | exist_invalid_param = True 358 | print 'invalid param:%s'%tmp_param 359 | if exist_invalid_param: 360 | break 361 | processed_url = [urlparse.urljoin(response.url, ''.join(x_url_parts)) for x_url_parts in self._join_lists(eval('[[\'' + tmp_url.replace('{','\'],').replace('}',',[\'') + '\']]'))] 362 | if type(processed_url) is list: 363 | for _url in processed_url: 364 | if _url in exists: 365 | processed_url.remove(_url) 366 | ret_urls += [[(i+1)*(j+1)-1,_url] for j,_url in enumerate(processed_url)] 367 | else: 368 | if not processed_url in exists: 369 | ret_urls.append([i, processed_url]) 370 | exists.add(processed_url) 371 | #TODO:exist_list_param handler 372 | tmp_params = re.findall('\{([^\{\}]*)\}', url_pattern) 373 | if not ret_urls and (params or tmp_params): 374 | tmp_urls = [url_pattern] 375 | exist_invalid_param = False 376 | for tmp_param in tmp_params: 377 | if not tmp_param or tmp_param not in self.params: 378 | exist_invalid_param = True 379 | print 'invalid param:%s'%tmp_param 380 | if not tmp_params or exist_invalid_param: 381 | pass 382 | else: 383 | tmp_urls = [urljoin_rfc(base_url, ''.join(x_url_parts)) for x_url_parts in self._join_lists(eval('[[\'' + url_pattern.replace('{','\'],').replace('}',',[\'') + '\']]'))] 384 | lengh = len(tmp_urls) 385 | for i,tmp_url in enumerate(tmp_urls): 386 | ret_urls.append((i,tmp_url)) 387 | ret_urls = [ret_url for ret_url in ret_urls if ret_url[1].find('http://')>-1] 388 | return length, ret_urls 389 | 390 | def generate_sign(self): 391 | sign = hashlib.sha1(str(random.random())).hexdigest() 392 | while self.items.has_key(sign): 393 | sign = hashlib.sha1(str(random.random())).hexdigest() 394 | return sign 395 | 396 | def parse_item_xpaths(self, hxs, xpaths, item, url, name, replace=False, allow_empty=True): 397 | _res = [] 398 | for item_field_xpath in xpaths: 399 | item_field_xpath = item_field_xpath.replace('`',',') 400 | _res += self._join_lists(eval(self._process_ex('hxs',item_field_xpath, url=url))) 401 | 402 | joined_res = [''.join(_one) for _one in _res] 403 | if name.find("url")>-1: 404 | for i,joined_one in enumerate(joined_res): 405 | if not joined_one.startswith("http://"): 406 | if not joined_one.startswith("/"): 407 | joined_one = "/" + joined_one 408 | joined_res[i] = urlparse.urljoin(url, joined_one) 409 | 410 | if item.has_key(name): 411 | if replace: 412 | if joined_res or allow_empty: 413 | item[name] = joined_res 414 | else: 415 | if joined_res != item[name]: 416 | item[name] += joined_res 417 | else: 418 | if joined_res or allow_empty: 419 | item[name] = joined_res 420 | 421 | 422 | def parse_multi_items(self, hxs, node, item, response, index, count): 423 | if node.restrict_xpaths: 424 | for child in node.children: 425 | if child.xpaths: 426 | restrict_xpath = '|'.join([restrict_xpath.replace("<<", "").replace(">>", "") for restrict_xpath in node.restrict_xpaths]) 427 | try: 428 | m = re.search(r'<<(.+)&(.*)>>',xpath) 429 | restrict_xpath = m.group(1) 430 | except: 431 | pass 432 | restrict_selectors = hxs.select(restrict_xpath) 433 | #fetch multi items from one page 434 | if index != None and len(restrict_selectors) > index and len(restrict_selectors)==count: 435 | try: 436 | XmlXPathSelector = Selector 437 | except: 438 | pass 439 | restrict_hxs = XmlXPathSelector(HtmlResponse(response.url, body=re.sub('[\n\r\t]+', '', restrict_selectors[index].extract()), encoding='utf8')) 440 | 
#restrict_hxs = restrict_selectors[index] 441 | self.parse_item_xpaths(restrict_hxs, child.xpaths, item, response.url, child.name, True, False) 442 | 443 | def process_url_pattern(self, url_pattern, response, hxs): 444 | return ''.join(self._join_lists(eval('[[\'' + url_pattern.replace('{','\'],').replace('}',',[\'') + '\']]'))[0]) 445 | 446 | def match_url_pattern(self, url_pattern, url): 447 | match = False 448 | _url_pattern = url_pattern[:] 449 | if re.search('\{\w+\}', url_pattern): 450 | _url_pattern = re.sub('\{\w+\}', '\w+', url_pattern).replace('?', '\?') 451 | if re.search(_url_pattern, url): 452 | match = True 453 | return match 454 | 455 | def parse(self, response): 456 | if self.spider_config.has_key('encoding'): 457 | _encoding = self.spider_config['encoding'] 458 | response = response.replace(encoding=_encoding) 459 | try: 460 | HtmlXPathSelector = Selector 461 | except: 462 | pass 463 | 464 | hxs = HtmlXPathSelector(response) 465 | 466 | curr_nodes = set() 467 | 468 | #tell the curr_node 469 | if 'node' in response.meta: 470 | curr_nodes = set(response.meta['node']) 471 | else: 472 | for key, node in self.site_tree.items(): 473 | for url_pattern in node.url_patterns: 474 | if self.match_url_pattern(url_pattern, response.url): 475 | curr_nodes.add(node) 476 | 477 | if not curr_nodes: 478 | curr_nodes.add(self.site_tree['root']) 479 | 480 | #make a sign of the item (when??) 481 | #curr_sign = self.generate_sign() 482 | #if response.request.headers.has_key('Sign'): 483 | #curr_sign = response.request.headers['Sign'] 484 | 485 | is_root = False 486 | for curr_node in curr_nodes: 487 | #if not curr_node.parent or curr_node.parent.name == 'root': 488 | if not curr_node.parent: 489 | is_root = True 490 | 491 | #fetch an item from self.items 492 | if 'item' in response.meta: 493 | if is_root: 494 | item = CommonItem() 495 | else: 496 | item = copy.deepcopy(response.meta['item']) 497 | #item = copy.deepcopy(self.items[curr_sign]['item']) 498 | #self.items[curr_sign]['req_count'] -= 1 499 | #if not self.items[curr_sign]['req_count']: 500 | #self.items.pop(curr_sign) 501 | else: 502 | item = CommonItem() 503 | 504 | #解析自定义变量或参数 505 | for k,v in self.setting_vars.iteritems(): 506 | tmp_v = self.eval_custom_var(response, hxs, v) 507 | self.params[k] = tmp_v 508 | if len(tmp_v) == 1: 509 | self.params[k] = tmp_v[0] 510 | globals()[k] = self.params[k] 511 | 512 | 513 | #解析url和数据项 514 | for curr_node in curr_nodes: 515 | if curr_node.incremental and 'exist_in_cache' in response.flags: 516 | yield None 517 | else: 518 | tail_branch = True 519 | no_yield = True 520 | if curr_node.name == 'item' and not item.has_key('url'): 521 | item['url'] = [response.url] 522 | for child in curr_node.children: 523 | #if child.children and child is not curr_node: 524 | if child.children: 525 | tail_branch = False 526 | #解析数据项 527 | if child.xpaths: 528 | self.parse_item_xpaths(hxs, child.xpaths, item, response.url, child.name) 529 | for child in curr_node.children: 530 | #parse multi items from one single page 531 | curr_items = [] 532 | #if child.restrict_xpaths and child.parent and child.parent.name != 'root': 533 | if child.restrict_xpaths: 534 | tail_child = True 535 | belongtos = [] 536 | for restrict_xpath in child.restrict_xpaths: 537 | belongtos += self._join_lists(eval(self._process_ex('hxs',restrict_xpath))) 538 | #for chd in child.children: 539 | #if chd.children and chd is not child: 540 | #tail_child = False 541 | if child.url_patterns: 542 | tail_child = False 543 | for i in 
xrange(len(belongtos)): 544 | new_item = copy.deepcopy(item) 545 | #item_sign = self.generate_sign() 546 | self.parse_multi_items(hxs, child, new_item, response, i, len(belongtos)) 547 | if tail_child and len(new_item) == len(self.item_fields): 548 | yield new_item 549 | no_yield = False 550 | else: 551 | curr_items.append(new_item) 552 | 553 | #parse follow urls 554 | if child.url_patterns: 555 | restrict_xpaths = child.restrict_xpaths 556 | prefix = child.name + "_" 557 | pending_params = {} 558 | for key,val in self.params.iteritems(): 559 | if key.startswith(prefix): 560 | pending_params[key.replace(prefix,'')] = val 561 | pending_urls = [] 562 | urls_len = 0 563 | for url_pattern in child.url_patterns: 564 | urls_length, url_list = self.extract_url(response, hxs, url_pattern, restrict_xpaths, pending_params) 565 | urls_len += urls_length 566 | pending_urls += url_list 567 | if pending_params: 568 | tmp_param_list = [] 569 | pending_params_keys = [] 570 | for k,v in pending_params.items(): 571 | pending_params_keys.append(k) 572 | tmp_param_list.append(v) 573 | tmp_seq_params = self._join_lists(tmp_param_list) 574 | for i,pending_url in pending_urls: 575 | #TODO:exist_list_param handler 576 | for j,tmp_seq_param in enumerate(tmp_seq_params): 577 | #req_sign = self.generate_sign() 578 | req_item = copy.deepcopy(item) 579 | _tmp_param = {} 580 | for i, k in enumerate(pending_params_keys): 581 | _tmp_param[k] = tmp_seq_param[i] 582 | #Attach item sign with the request 583 | req = FormRequest(pending_url, method="POST", formdata=_tmp_param) 584 | #req.headers['Sign'] = req_sign 585 | #attach item with the request 586 | if 'item' not in req.meta: 587 | req.meta['item'] = req_item 588 | #if req_sign not in self.items: 589 | #self.items[req_sign] = {'item':req_item, 'req_count':0} 590 | #self.items[req_sign]['req_count'] += 1 591 | if 'node' not in req.meta: 592 | req.meta['node'] = [] 593 | req.meta['node'].append(child) 594 | yield req 595 | no_yield = False 596 | else: 597 | for i,pending_url in pending_urls: 598 | #req_sign = self.generate_sign() 599 | req_item = copy.deepcopy(item) 600 | if len(curr_items)==urls_len and i