├── crawler
│   ├── __init__.py
│   ├── spiders
│   │   ├── __init__.py
│   │   └── common_spider.py
│   ├── items.py
│   ├── incremental.py
│   ├── pipelines.py
│   ├── downloader.py
│   ├── utils.py
│   └── settings.py
├── requirements.txt
├── scrapy.cfg
├── check_json_file.py
├── configs
│   └── dianping_beijingyumao.cfg
├── run_crawler.sh
├── LICENSE
└── README.md

/crawler/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
Scrapy>=0.12.0
configobj

--------------------------------------------------------------------------------
/crawler/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/topics/scrapyd.html

[settings]
default = crawler.settings

[deploy]
#url = http://localhost:6800/
project = crawler

--------------------------------------------------------------------------------
/crawler/items.py:
--------------------------------------------------------------------------------
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/topics/items.html

from scrapy.item import Item, Field

class CommonItem(Item):
    # define the fields for your item here like:
    # name = Field()
    url = Field()

--------------------------------------------------------------------------------
/check_json_file.py:
--------------------------------------------------------------------------------
#encoding=utf8
# Promote a temporary JSON feed: copy it to the final path only if it parses
# as valid JSON, then delete the temporary file.
import sys
import json
import os

print sys.argv
try:
    tmp_json_file = open(sys.argv[1])
    json_data = json.load(tmp_json_file)
    tmp_json_file.close()
    json_file = open(sys.argv[2], 'wb')
    json_file.write(open(sys.argv[1], 'rb').read())
    json_file.close()
    print 'well done!'
except Exception, e:
    print e
    print 'error in load tmp json'

os.remove(sys.argv[1])

--------------------------------------------------------------------------------
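For context, run_crawler.sh (further down in this repo) calls the script above after every crawl, so the temporary feed only replaces the final JSON file when it parses cleanly. The sketch below shows the equivalent call from Python; it is illustrative only and uses the /tmp paths the shell script picks when no city argument is given:

```python
# Illustrative only: mirrors what run_crawler.sh does after `scrapy crawl dianping_beijingyumao`.
import subprocess

subprocess.call([
    '/usr/bin/python', 'check_json_file.py',
    '/tmp/.dianping_beijingyumao.json.tmp',  # temporary feed written by scrapy crawl
    '/tmp/dianping_beijingyumao.json',       # final feed, overwritten only if the tmp file is valid JSON
])
```
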
/configs/dianping_beijingyumao.cfg:
--------------------------------------------------------------------------------
[dianping_beijingyumao]
allowed_domains = dianping.com
start_urls = http://www.dianping.com/search/category/2/45/g152
list_url_pattern = .*category/2/45/g152[p\d]*
list_restrict_xpaths = '<>'
list_content = list,item
item_url_pattern = .*shop/\d+
item_restrict_xpaths = <>
item_content = name,address,region,intro,phone_num,cover_image,hours,sport
#item_incremental = yes
item_name_xpaths = <>
item_address_xpaths = <>
item_region_xpaths = <>
item_phone_num_xpaths = <>
item_cover_image_xpaths = <>
item_hours_xpaths = <>
item_sport_xpaths = "羽毛球"
download_delay = 5

--------------------------------------------------------------------------------
/run_crawler.sh:
--------------------------------------------------------------------------------
CRAWLER_DIR=$(cd "$(dirname "$0")"; pwd)
XVFB=/usr/bin/Xvfb
# Start a virtual display for the WebKit renderer if none is running
# (settings.py points DISPLAY at :2).
if [ -x "$XVFB" ]
then
    count=`ps -ef | grep Xvfb | grep -v "grep" | wc -l`
    echo $count
    if [ $count -eq 0 ]
    then
        Xvfb :2 -screen 0 640x480x16 -nolisten tcp &
    fi
fi

cd $CRAWLER_DIR
# Kill any crawl that is still writing the same feed.
if [ $2 ]
then
    ps -ef|grep $1_$2.json|awk '{print$2}'|xargs -i kill -9 {}
else
    ps -ef|grep $1.json|awk '{print$2}'|xargs -i kill -9 {}
fi
if [ $2 ]
then
    JSON_DIR=/tmp/$2
    if [ ! -d "$JSON_DIR" ]
    then
        mkdir "$JSON_DIR"
    fi
    scrapy crawl $1 --set FEED_URI=$JSON_DIR/.$1_$2.json.tmp --set FEED_FORMAT=json
else
    JSON_DIR=/tmp
    scrapy crawl $1 --set FEED_URI=$JSON_DIR/.$1.json.tmp --set FEED_FORMAT=json
fi
if [ $2 ]
then
    /usr/bin/python $CRAWLER_DIR/check_json_file.py $JSON_DIR/.$1_$2.json.tmp $JSON_DIR/$1_$2.json
else
    /usr/bin/python $CRAWLER_DIR/check_json_file.py $JSON_DIR/.$1.json.tmp $JSON_DIR/$1.json
fi

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2014 Andy Qiu

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/crawler/incremental.py:
--------------------------------------------------------------------------------
#coding=utf8
from scrapy.contrib.downloadermiddleware.httpcache import FilesystemCacheStorage
from scrapy.utils.httpobj import urlparse_cached
from scrapy.exceptions import IgnoreRequest
import settings

class IncrementalDownloader(object):
    """Downloader middleware that stores responses in the HTTP cache and flags
    responses whose body is unchanged since the previous crawl."""

    storage = None
    storage_class = FilesystemCacheStorage
    ignore_schemes = []
    ignore_http_codes = []

    def _get_storage(self, spider):
        return self.storage_class(spider.settings)

    def process_response(self, request, response, spider):
        if not self.storage:
            self.storage = self._get_storage(spider)
        if not self.ignore_schemes:
            self.ignore_schemes = spider.settings.getlist('HTTPCACHE_IGNORE_SCHEMES')
        if not self.ignore_http_codes:
            self.ignore_http_codes = map(int, spider.settings.getlist('HTTPCACHE_IGNORE_HTTP_CODES'))
        cached_response = self.storage.retrieve_response(spider, request)
        if cached_response and cached_response.body == response.body:
            # The body has not changed since the last crawl: flag the response
            # so the spider can skip re-parsing it (see CommonSpider.parse).
            response.flags.append('exist_in_cache')
            # raise IgnoreRequest
        if self.is_cacheable(request) and self.is_cacheable_response(response):
            self.storage.store_response(spider, request, response)
        return response

    def is_cacheable_response(self, response):
        return response.status not in self.ignore_http_codes

    def is_cacheable(self, request):
        return urlparse_cached(request).scheme not in self.ignore_schemes

--------------------------------------------------------------------------------
/crawler/pipelines.py:
--------------------------------------------------------------------------------
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/topics/item-pipeline.html

from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals
from scrapy.exceptions import DropItem
from scrapy.http import Request

class CommonPipeline(object):
    def __init__(self):
        self.duplicates = {}
        dispatcher.connect(self.spider_opened, signals.spider_opened)
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_opened(self, spider):
        self.duplicates[spider] = {}

    def spider_closed(self, spider):
        for k in self.duplicates[spider].keys():
            del self.duplicates[spider][k]
        del self.duplicates[spider]
        del self.duplicates

    def ensure_not_empty(self, item, field):
        if field in item:
            if item[field] == []:
                raise DropItem("Empty item found: %s" % item)

    def ensure_not_duplicate(self, spider, item, field):
        if field in item:
            if field not in self.duplicates[spider]:
                self.duplicates[spider][field] = set()
            if item[field] and type(item[field]) is list:
                if item[field][0] in self.duplicates[spider][field]:
                    raise DropItem("Duplicate item found: %s" % item)
                else:
                    self.duplicates[spider][field].add(item[field][0])

    def process_item(self, item, spider):
        self.ensure_not_empty(item, 'url')
        self.ensure_not_duplicate(spider, item, 'url')
        return item

--------------------------------------------------------------------------------
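CommonPipeline drops items whose `url` field is empty or already seen during the crawl. The following sketch exercises that behaviour outside a real crawl; it is illustrative only, assumes the old Scrapy release pinned in requirements.txt is importable, and the fake spider exists purely as the dictionary key the pipeline expects:

```python
from scrapy.exceptions import DropItem
from crawler.pipelines import CommonPipeline
from crawler.items import CommonItem

class FakeSpider(object):
    """Stand-in for a real spider; the pipeline only uses it as a dict key."""

spider = FakeSpider()
pipeline = CommonPipeline()
pipeline.spider_opened(spider)          # normally fired by the spider_opened signal

item = CommonItem(url=[u'http://www.dianping.com/shop/1234'])
pipeline.process_item(item, spider)     # first occurrence of this url passes through
try:
    pipeline.process_item(item, spider) # the same url again raises DropItem
except DropItem as e:
    print e
```
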
/crawler/downloader.py:
--------------------------------------------------------------------------------
#coding=utf8
from scrapy.http import Request, FormRequest, HtmlResponse
from threading import Thread, Event
import gtk
import webkit
import jswebkit
import urlparse

class Render(object):
    """Load the response body in an off-screen GTK WebKit view and capture the
    JavaScript-rendered HTML once the 'load-finished' signal fires."""
    pending = Event()

    def __init__(self, request, response):
        try:
            print 'Render.__init__'
            self.webview = webkit.WebView()
            self.webview.connect('load-finished', self.load_finished)
            parsed_url = urlparse.urlparse(request.url)
            self.webview.load_html_string(response.body, parsed_url.scheme + '://' + parsed_url.netloc)
            #self.webview.load_uri(request.url)
        except Exception, e:
            print e
        finally:
            gtk.main()

    def load_finished(self, *args, **kw):
        try:
            print 'Render.load_finished'
            js = jswebkit.JSContext(self.webview.get_main_frame().get_global_context())
            self.rendered_html = str(js.EvaluateScript('document.body.innerHTML'))
            self.pending.set()
        except Exception, e:
            print e
        finally:
            gtk.main_quit()

class WebkitDownloader(object):
    def process_response(self, request, response, spider):
        try:
            render = Render(request, response)
            render.pending.wait()
            response = response.replace(body=render.rendered_html)
        except Exception, e:
            print e
        return response

    #def process_request(self, request, spider):
        #webview = webkit.WebView()
        #webview.connect('load-finished', lambda v, f: gtk.main_quit())
        #webview.load_uri(request.url)
        #gtk.main()
        #js = jswebkit.JSContext(webview.get_main_frame().get_global_context())
        #renderedBody = str(js.EvaluateScript('document.body.innerHTML'))
        #return HtmlResponse(request.url, body=renderedBody)

--------------------------------------------------------------------------------
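The middleware above is only activated when a spider's config sets `js_parser = on`; settings.py (later in this dump) then registers it, which amounts to the following setting:

```python
# Effect of `js_parser = on` in the spider config (see crawler/settings.py below).
DOWNLOADER_MIDDLEWARES = {
    'crawler.downloader.WebkitDownloader': 543,
}
```

run_crawler.sh starts Xvfb on display :2 and settings.py exports DISPLAY=":2", so the WebKit view can render pages without a real X server.
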
/README.md:
--------------------------------------------------------------------------------
ajax_crawler
============

A flexible web crawler based on Scrapy for fetching Ajax-driven pages as well as most other kinds of web pages.

Easy to use: to set up a new crawler, you just write a config file and run it.

# Usage
* Edit A Config File In The `configs` Directory
```shell
cd configs
touch xxx.cfg
vim xxx.cfg
```
like this
```INI
[xxx] # crawler name, must match the config file name.
allowed_domains = dianping.com # domain name, can be a list.
start_urls = http://www.dianping.com/search/category/2/45/g152 # start url, must be a concrete url.
list_url_pattern = .*category/2/45/g152[p\d]* # list url pattern, regular expressions are allowed here.
list_restrict_xpaths = '<>' # list restrict xpaths, used to locate item urls.
list_content = list,item # what kinds of content can be found under the list restrict xpaths.
item_url_pattern = .*shop/\d+ # item url pattern, regular expressions are allowed here.
item_restrict_xpaths = <> # item restrict xpaths, used to locate item contents.
item_content = name,address,region,intro,phone_num,cover_image,hours,sport # field names that can be found under the item restrict xpaths.
#item_incremental = yes # whether this crawler should be incremental (i.e. use the HTTP cache).
item_name_xpaths = <> # xpaths used to extract each item field.
item_address_xpaths = <>
item_region_xpaths = <>
item_phone_num_xpaths = <>
item_cover_image_xpaths = <>
item_hours_xpaths = <>
item_sport_xpaths = "羽毛球" # can also be a literal string.
download_delay = 5 # download delay to reduce crawling frequency.
#js_parser = on # whether to use WebKit to execute JavaScript and re-render pages.
```
* Then Just Run The Crawler
```bash
scrapy crawl xxx
```

--------------------------------------------------------------------------------
/crawler/utils.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from configobj import ConfigObj

import os
import sys
import urlparse
import re

from os.path import dirname

path = os.path.abspath(os.path.dirname(__file__))
sys.path.append(path)

CRAWLER_DIR = dirname(path)
print CRAWLER_DIR

def load_configs():
    filenames = []
    try:
        cfg_dir = CRAWLER_DIR + '/configs/'
        if CRAWLER_DIR.startswith("~"):
            filenames += [ os.path.expanduser(cfg_dir) + filename for filename in os.listdir(os.path.expanduser(cfg_dir)) ]
        else:
            filenames += [ cfg_dir + filename for filename in os.listdir(cfg_dir) ]
    except Exception, e:
        print e

    cfg_list = []
    for filename in filenames:
        if re.search('\.cfg$', filename) != None:
            cfg_list.append(filename)

    configs = []
    for cfg in cfg_list:
        try:
            configs.append(ConfigObj(cfg, encoding='utf8'))
        except Exception, e:
            print e
            print "error occurred when reading config file @" + cfg

    return configs

configs = load_configs()
# configuration applied to this spider
site_config = None
spider_name = None
spider_config = None
exist_site = False

print sys.argv
for argv in sys.argv:
    for config in configs:
        if argv in config:
            exist_site = True
            spider_name = argv
            site_config = config
            spider_config = site_config[spider_name]

if exist_site == False:
    if 'crawl' in sys.argv:
        print "Unable to find spider: " + sys.argv[sys.argv.index('crawl') + 1]
        sys.exit()
    else:
        try:
            net_loc = urlparse.urlsplit(sys.argv[sys.argv.index('shell') + 1]).netloc
            site_name = re.search('([^\.]+)\.[^\.]+$', net_loc).group(1)
            for config in configs:
                if site_name in config:
                    spider_name = site_name
                    site_config = config
                    spider_config = site_config[spider_name]
                    exist_site = True
            if exist_site == False:
                spider_name = configs[0].keys()[0]
                site_config = configs[0]
                spider_config = site_config[spider_name]
        except:
            print "Unable to resolve the commands"
            sys.exit()

--------------------------------------------------------------------------------
/crawler/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# Scrapy settings for crawler project
#
# For simplicity, this file contains only the most important settings by
# default.
All the other settings are documented here: 6 | # 7 | # http://doc.scrapy.org/topics/settings.html 8 | # 9 | from configobj import ConfigObj 10 | 11 | import os 12 | import sys 13 | import datetime 14 | import re 15 | from os.path import dirname 16 | 17 | path = os.path.abspath(os.path.dirname(__file__)) 18 | sys.path.append(path) 19 | from utils import * 20 | 21 | configs = load_configs() 22 | 23 | #应用于此spider的配置 24 | site_config = None 25 | spider_name = None 26 | spider_config = None 27 | exist_site = False 28 | 29 | for argv in sys.argv: 30 | for config in configs: 31 | if argv in config: 32 | exist_site = True 33 | spider_name = argv 34 | site_config = config 35 | spider_config = site_config[spider_name] 36 | 37 | BOT_NAME = 'AjaxSpider' 38 | #BOT_VERSION = '2.0' 39 | 40 | SPIDER_MODULES = ['crawler.spiders'] 41 | NEWSPIDER_MODULE = 'crawler.spiders' 42 | DEFAULT_ITEM_CLASS = 'crawler.items.CommonItem' 43 | #USER_AGENT = '%s/%s' % (BOT_NAME, BOT_VERSION) 44 | #USER_AGENT = 'Mozilla/5.0 (X11; U; Linux i686; zh-CN; rv:1.9.2.3) Gecko/%s Fedora/3.6.3-4.fc13 Firefox/3.6.3' % (datetime.date.today().strftime("%Y%m%d")) 45 | USER_AGENT = 'Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)' 46 | #USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5' 47 | 48 | LOG_FILE=(CRAWLER_DIR + '/log/'+ spider_name + '_log_'+datetime.date.today().strftime("%Y%m%d")+'.log') 49 | 50 | ITEM_PIPELINES = { 51 | 'crawler.pipelines.CommonPipeline':300, 52 | } 53 | 54 | if spider_config.has_key('download_delay'): 55 | DOWNLOAD_DELAY = float(spider_config['download_delay']) 56 | else: 57 | DOWNLOAD_DELAY = 0.25 58 | RANDOMIZE_DOWNLOAD_DELAY = True 59 | 60 | print spider_config.keys() 61 | if spider_config.has_key('js_parser') and spider_config['js_parser']=='on': 62 | DOWNLOADER_MIDDLEWARES = { 63 | 'crawler.downloader.WebkitDownloader': 543, 64 | } 65 | 66 | incremental = False 67 | for cfg_key in spider_config.iterkeys(): 68 | if cfg_key.endswith('_incremental') and spider_config[cfg_key]=='yes': 69 | incremental = True 70 | break 71 | if incremental: 72 | print 'incremental is : ON' 73 | HTTPCACHE_EXPIRATION_SECS = 172800 74 | FEED_STORE_EMPTY = True 75 | DOWNLOADER_MIDDLEWARES = { 76 | 'crawler.incremental.IncrementalDownloader': 544, 77 | } 78 | 79 | RETRY_HTTP_CODES = [500, 503, 504, 400, 408] 80 | if spider_config.has_key('ignore_http_code'): 81 | RETRY_HTTP_CODES.remove(int(spider_config['ignore_http_code'])) 82 | 83 | os.environ["DISPLAY"] = ":2" 84 | 85 | COOKIES_ENABLED = True 86 | COOKIES_DEBUG = True 87 | -------------------------------------------------------------------------------- /crawler/spiders/common_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy.spider import * 3 | from scrapy.selector import * 4 | from scrapy.http import Request, FormRequest, HtmlResponse 5 | from scrapy.conf import settings 6 | from scrapy.item import Item, Field 7 | from scrapy.utils.response import get_base_url 8 | from scrapy.utils.url import urljoin_rfc 9 | 10 | from configobj import ConfigObj 11 | from crawler.items import CommonItem 12 | from crawler.utils import * 13 | 14 | import httplib 15 | import urllib 16 | import cookielib 17 | import urlparse 18 | import os 19 | import re 20 | import sys 21 | import datetime 22 | import itertools 23 | import random 24 | import hashlib 25 | import copy 26 | 27 | class DataNode: 28 
| def __init__(self,name): 29 | self.name=name 30 | self.parent = None 31 | self.children=set() 32 | self.url_patterns = set() 33 | self.restrict_xpaths = set() 34 | self.xpaths = set() 35 | self.incremental = False 36 | def addChild(self, child): 37 | self.children.add(child) 38 | def getChildren(self): 39 | return self.children 40 | def setParent(self, parent): 41 | if parent.name != self.name: 42 | self.parent = parent 43 | def setUrlPatterns(self, url_patterns): 44 | self.url_patterns = set(url_patterns) 45 | def getUrlPatterns(self): 46 | return self.url_patterns 47 | def setRestrictXPaths(self, restrict_xpaths): 48 | self.restrict_xpaths = set(restrict_xpaths) 49 | def getRestrictXPaths(self, restrict_xpaths): 50 | return self.restrict_xpaths 51 | def setXPaths(self, xpaths): 52 | self.xpaths = set(xpaths) 53 | def getXPaths(self): 54 | return self.xpaths 55 | def setIncremental(self, incremental): 56 | self.incremental = incremental 57 | def getIncremental(self, incremental): 58 | return self.incremental 59 | def __repr__(self): 60 | return""%self.name 61 | 62 | class XPath: 63 | def __init__(self): 64 | pass 65 | 66 | def eval(self): 67 | pass 68 | 69 | try: 70 | BaseSpider = Spider 71 | except: 72 | pass 73 | 74 | class CommonSpider(BaseSpider): 75 | 76 | configs = load_configs() 77 | 78 | #settings applied to the spider 79 | site_config = None 80 | spider_name = None 81 | spider_config = None 82 | exist_site = False 83 | 84 | print sys.argv 85 | for argv in sys.argv: 86 | for config in configs: 87 | if argv in config: 88 | exist_site = True 89 | spider_name = argv 90 | site_config = config 91 | spider_config = site_config[spider_name] 92 | 93 | if exist_site == False: 94 | if 'crawl' in sys.argv: 95 | print "Unable to find spider: " + sys.argv[sys.argv.index('crawl') + 1] 96 | sys.exit() 97 | else: 98 | try: 99 | net_loc = urlparse.urlsplit(sys.argv[sys.argv.index('shell') + 1]).netloc 100 | site_name = re.search('([^\.]+)\.[^\.]+$',net_loc).group(1) 101 | for config in configs: 102 | if site_name in config: 103 | spider_name = site_name 104 | site_config = config 105 | spider_config = site_config[spider_name] 106 | exist_site = True 107 | if exist_site == False: 108 | spider_name = configs[0].keys()[0] 109 | site_config = configs[0] 110 | spider_config = site_config[spider_name] 111 | except: 112 | print "Unable to resolve the commands" 113 | sys.exit() 114 | 115 | item_fields = set() 116 | setting_vars = {} 117 | params = {} 118 | root_node = DataNode('root') 119 | site_tree = {'root':root_node} 120 | for cfg_key in spider_config.iterkeys(): 121 | #初始化自定义变量 122 | if re.search('^\$',cfg_key): 123 | var_name = re.sub('^\$','',cfg_key) 124 | setting_vars[var_name] = spider_config[cfg_key] 125 | #/*------------初始化数据节点树 begin-------------------*/ 126 | #初始化url种子列表 127 | if cfg_key.endswith('_url_pattern'): 128 | node_name = cfg_key.replace('_url_pattern', '') 129 | node_url_patterns = spider_config[cfg_key] 130 | if type(node_url_patterns) is not list: 131 | node_url_patterns = [node_url_patterns] 132 | if not site_tree.has_key(node_name): 133 | site_tree[node_name] = DataNode(node_name) 134 | node = site_tree[node_name] 135 | node.setUrlPatterns(node_url_patterns) 136 | if cfg_key.endswith('_restrict_xpaths'): 137 | node_name = cfg_key.replace('_restrict_xpaths', '') 138 | node_restrict_xpaths = spider_config[cfg_key] 139 | if type(node_restrict_xpaths) is not list: 140 | node_restrict_xpaths = [node_restrict_xpaths] 141 | if not site_tree.has_key(node_name): 142 | 
site_tree[node_name] = DataNode(node_name) 143 | node = site_tree[node_name] 144 | node.setRestrictXPaths(node_restrict_xpaths) 145 | if cfg_key.endswith('_incremental'): 146 | node_name = cfg_key.replace('_incremental', '') 147 | node_incremental = spider_config[cfg_key] 148 | if not site_tree.has_key(node_name): 149 | site_tree[node_name] = DataNode(node_name) 150 | node = site_tree[node_name] 151 | if node_incremental == 'yes': 152 | node.setIncremental(True) 153 | #/*------------初始化数据节点树 end-------------------*/ 154 | params = setting_vars.copy() 155 | 156 | for k,v in site_tree.items(): 157 | if k+'_content' in spider_config: 158 | node_contents = spider_config[k+'_content'] 159 | if type(node_contents) is not list: 160 | node_contents = [node_contents] 161 | for node_content in node_contents: 162 | if not site_tree.has_key(node_content): 163 | site_tree[node_content] = DataNode(node_content) 164 | #数据项定义 165 | if spider_config.has_key(node_content) or spider_config.has_key('item_'+node_content+'_xpaths'): 166 | field_xpaths = spider_config[node_content] if spider_config.has_key(node_content) else spider_config['item_'+node_content+'_xpaths'] 167 | if type(field_xpaths) is not list: 168 | field_xpaths = [field_xpaths] 169 | site_tree[node_content].setXPaths(field_xpaths) 170 | item_fields.add(node_content) 171 | if not CommonItem().fields.has_key(node_content): 172 | CommonItem().fields[node_content] = Field() 173 | v.addChild(site_tree[node_content]) 174 | site_tree[node_content].setParent(v) 175 | 176 | item_fields.add('url') 177 | CommonItem().fields['url'] = Field() 178 | 179 | for k,v in site_tree.iteritems(): 180 | if not v.parent and v.name != 'root': 181 | v.parent = site_tree['root'] 182 | site_tree['root'].addChild(v) 183 | 184 | for item_field in item_fields: 185 | if not CommonItem().fields.has_key(item_field): 186 | CommonItem().fields[item_field] = {} 187 | 188 | name = spider_name 189 | allowed_domains = spider_config['allowed_domains'] 190 | if type(allowed_domains) is not list: 191 | allowed_domains = [allowed_domains] 192 | 193 | start_urls = spider_config['start_urls'] 194 | if type(start_urls) is not list: 195 | start_urls = [start_urls] 196 | 197 | start_urls = tuple(start_urls) 198 | 199 | items = {} 200 | queue = {} 201 | 202 | def __init__(self, *a, **kw): 203 | super(CommonSpider, self).__init__(*a, **kw) 204 | 205 | def _eval_xpath(self,hxs,xpath): 206 | """ 获得xpath表达式语句 207 | """ 208 | ret_val = '' 209 | if xpath.find('&') > -1: 210 | m = re.search(r'<<(.+)&(.*)>>',xpath) 211 | xpath_ex = m.group(1) 212 | reg_ex = m.group(2) 213 | ret_val += hxs + """.select('""" + xpath_ex + """').re('""" + reg_ex + """'.decode('utf8'))""" 214 | else: 215 | m = re.search(r'<<(.+)>>',xpath) 216 | xpath_ex = m.group(1) 217 | ret_val += hxs + """.select('""" + xpath_ex + """').extract()""" 218 | 219 | return ret_val 220 | 221 | def _join_lists(self, ls, linearity=False): 222 | """ 交叉连接,参数ls为需要做连接的list,linearity表示交叉连接是否线性的(一一对应) 223 | 224 | """ 225 | ret_val = [] 226 | for x in ls: 227 | if type(x) is not list: 228 | x = [x] 229 | #if len(ls) <= 1: 230 | #return ls 231 | if linearity == True: 232 | ret_val = itertools.imap(None,*ls) 233 | return tuple(ret_val) 234 | ret_val = itertools.product(*ls) 235 | return tuple(ret_val) 236 | 237 | def _process_ex(self, hxs, ex_str, url=None): 238 | """ 对表达式的处理 239 | """ 240 | ret_val = '' 241 | exs = ex_str.split('~') 242 | ret_val += '[' 243 | for i,ex in enumerate(exs): 244 | if ex.startswith('<<') and ex.endswith('>>'): 245 | ret_val 
+= self._eval_xpath(hxs,ex) 246 | else: 247 | try: 248 | #if ex.find('(')>-1 and ex.find(')')>-1: 249 | try: 250 | ret_val += '[u\'' + re.search(ex, url).group(1) + '\']' 251 | #else: 252 | except: 253 | ret_val += '[u\'' + re.search(ex, url).group(0) + '\']' 254 | except: 255 | ret_val += '[u\'' + ex + '\']' 256 | if i == len(exs)-1: 257 | ret_val += ']' 258 | else: 259 | ret_val += ',' 260 | return ret_val 261 | 262 | def eval_custom_var(self, response, hxs, var): 263 | """ 计算自定义变量表达式的结果 264 | """ 265 | ret_val = [] 266 | tmp_v = var[:] 267 | if type(tmp_v) is not list: 268 | tmp_v = [tmp_v] 269 | for i in xrange(len(tmp_v)): 270 | tmp_var = tmp_v[i].split('~') 271 | for j in xrange(len(tmp_var)): 272 | val = tmp_var[j] 273 | if val.find('{')>-1 and val.find('}')>-1: 274 | val = eval(""" " """+val.replace('{','').replace('}','')+""" " """) 275 | if val.startswith('<<') and val.endswith('>>'): 276 | rs_var = eval(self._eval_xpath('hxs',val)) 277 | if rs_var: 278 | tmp_var[j] = rs_var 279 | else: 280 | try: 281 | rs_var = eval(val) 282 | if rs_var: 283 | tmp_var[j] = rs_var 284 | except: 285 | try: 286 | rs_var = re.search(val,response.url) 287 | if rs_var: 288 | try: 289 | tmp_var[j] = rs_var.group(1) 290 | except: 291 | tmp_var[j] = rs_var.group(0) 292 | except: 293 | pass 294 | if type(tmp_var[j]) is not list: 295 | tmp_var[j] = [tmp_var[j]] 296 | tmp_v[i] = [''.join([unicode(_x) for _x in tmp_x]) for tmp_x in self._join_lists(tmp_var)] 297 | for tmp_x in tmp_v: 298 | if type(tmp_x) is list: 299 | ret_val += tmp_x 300 | else: 301 | ret_val.append(tmp_x) 302 | 303 | return ret_val 304 | 305 | 306 | def extract_url(self, response, hxs, url_pattern, restrict_xpaths, params): 307 | """ 提取Url 308 | """ 309 | ret_urls = [] 310 | length = 0 311 | tmp_urls = [] 312 | exists = set() 313 | base_url = get_base_url(response) 314 | #TODO:exist_list_param handler 315 | if not restrict_xpaths: 316 | restrict_xpaths = ["<>"] 317 | for restrict_xpath in restrict_xpaths: 318 | url_xpath = restrict_xpath 319 | closed_xpath = False 320 | if re.search('/([^/]+\(\).*>>)', restrict_xpath) or re.search('/@([^/]+>>)', restrict_xpath): 321 | closed_xpath = True 322 | if not closed_xpath: 323 | joint_xpath = '' 324 | level = 0 325 | while not self.eval_custom_var(response, hxs, restrict_xpath.replace('/>>','>>').replace('>>', joint_xpath+'/@href>>')) or self.eval_custom_var(response, hxs, restrict_xpath.replace('/>>','>>').replace('>>', joint_xpath+'/@href>>'))==[restrict_xpath.replace('/>>','>>').replace('>>', joint_xpath+'/@href>>')]: 326 | joint_xpath += '/*' 327 | level += 1 328 | if level>=5: 329 | break 330 | url_xpath = restrict_xpath.replace('/>>','>>').replace('>>', joint_xpath+'/@href>>') 331 | tmp_urls += self.eval_custom_var(response, hxs, url_xpath) 332 | sorted_tmp_urls = list(set(tmp_urls)) 333 | sorted_tmp_urls.sort(key=tmp_urls.index) 334 | length = len(sorted_tmp_urls) 335 | _url_pattern = self.process_url_pattern(url_pattern, response, hxs) 336 | for i,tmp_url in enumerate(sorted_tmp_urls): 337 | #handle incomplete urls 338 | tmp_url = urljoin_rfc(base_url, tmp_url) 339 | try: 340 | if re.search(_url_pattern, tmp_url): 341 | #if url_pattern.find("(")>-1 and url_pattern.find(")")>-1: 342 | try: 343 | tmp_url = re.search(_url_pattern, tmp_url).group(1) 344 | #else: 345 | except: 346 | tmp_url = re.search(_url_pattern, tmp_url).group(0) 347 | else: 348 | continue 349 | except: 350 | pass 351 | tmp_params = re.findall('\{([^\{\}]*)\}', tmp_url) 352 | processed_url = tmp_url 353 | if tmp_params: 354 
| exist_invalid_param = False 355 | for tmp_param in tmp_params: 356 | if not tmp_param or tmp_param not in self.params: 357 | exist_invalid_param = True 358 | print 'invalid param:%s'%tmp_param 359 | if exist_invalid_param: 360 | break 361 | processed_url = [urlparse.urljoin(response.url, ''.join(x_url_parts)) for x_url_parts in self._join_lists(eval('[[\'' + tmp_url.replace('{','\'],').replace('}',',[\'') + '\']]'))] 362 | if type(processed_url) is list: 363 | for _url in processed_url: 364 | if _url in exists: 365 | processed_url.remove(_url) 366 | ret_urls += [[(i+1)*(j+1)-1,_url] for j,_url in enumerate(processed_url)] 367 | else: 368 | if not processed_url in exists: 369 | ret_urls.append([i, processed_url]) 370 | exists.add(processed_url) 371 | #TODO:exist_list_param handler 372 | tmp_params = re.findall('\{([^\{\}]*)\}', url_pattern) 373 | if not ret_urls and (params or tmp_params): 374 | tmp_urls = [url_pattern] 375 | exist_invalid_param = False 376 | for tmp_param in tmp_params: 377 | if not tmp_param or tmp_param not in self.params: 378 | exist_invalid_param = True 379 | print 'invalid param:%s'%tmp_param 380 | if not tmp_params or exist_invalid_param: 381 | pass 382 | else: 383 | tmp_urls = [urljoin_rfc(base_url, ''.join(x_url_parts)) for x_url_parts in self._join_lists(eval('[[\'' + url_pattern.replace('{','\'],').replace('}',',[\'') + '\']]'))] 384 | lengh = len(tmp_urls) 385 | for i,tmp_url in enumerate(tmp_urls): 386 | ret_urls.append((i,tmp_url)) 387 | ret_urls = [ret_url for ret_url in ret_urls if ret_url[1].find('http://')>-1] 388 | return length, ret_urls 389 | 390 | def generate_sign(self): 391 | sign = hashlib.sha1(str(random.random())).hexdigest() 392 | while self.items.has_key(sign): 393 | sign = hashlib.sha1(str(random.random())).hexdigest() 394 | return sign 395 | 396 | def parse_item_xpaths(self, hxs, xpaths, item, url, name, replace=False, allow_empty=True): 397 | _res = [] 398 | for item_field_xpath in xpaths: 399 | item_field_xpath = item_field_xpath.replace('`',',') 400 | _res += self._join_lists(eval(self._process_ex('hxs',item_field_xpath, url=url))) 401 | 402 | joined_res = [''.join(_one) for _one in _res] 403 | if name.find("url")>-1: 404 | for i,joined_one in enumerate(joined_res): 405 | if not joined_one.startswith("http://"): 406 | if not joined_one.startswith("/"): 407 | joined_one = "/" + joined_one 408 | joined_res[i] = urlparse.urljoin(url, joined_one) 409 | 410 | if item.has_key(name): 411 | if replace: 412 | if joined_res or allow_empty: 413 | item[name] = joined_res 414 | else: 415 | if joined_res != item[name]: 416 | item[name] += joined_res 417 | else: 418 | if joined_res or allow_empty: 419 | item[name] = joined_res 420 | 421 | 422 | def parse_multi_items(self, hxs, node, item, response, index, count): 423 | if node.restrict_xpaths: 424 | for child in node.children: 425 | if child.xpaths: 426 | restrict_xpath = '|'.join([restrict_xpath.replace("<<", "").replace(">>", "") for restrict_xpath in node.restrict_xpaths]) 427 | try: 428 | m = re.search(r'<<(.+)&(.*)>>',xpath) 429 | restrict_xpath = m.group(1) 430 | except: 431 | pass 432 | restrict_selectors = hxs.select(restrict_xpath) 433 | #fetch multi items from one page 434 | if index != None and len(restrict_selectors) > index and len(restrict_selectors)==count: 435 | try: 436 | XmlXPathSelector = Selector 437 | except: 438 | pass 439 | restrict_hxs = XmlXPathSelector(HtmlResponse(response.url, body=re.sub('[\n\r\t]+', '', restrict_selectors[index].extract()), encoding='utf8')) 440 | 
#restrict_hxs = restrict_selectors[index] 441 | self.parse_item_xpaths(restrict_hxs, child.xpaths, item, response.url, child.name, True, False) 442 | 443 | def process_url_pattern(self, url_pattern, response, hxs): 444 | return ''.join(self._join_lists(eval('[[\'' + url_pattern.replace('{','\'],').replace('}',',[\'') + '\']]'))[0]) 445 | 446 | def match_url_pattern(self, url_pattern, url): 447 | match = False 448 | _url_pattern = url_pattern[:] 449 | if re.search('\{\w+\}', url_pattern): 450 | _url_pattern = re.sub('\{\w+\}', '\w+', url_pattern).replace('?', '\?') 451 | if re.search(_url_pattern, url): 452 | match = True 453 | return match 454 | 455 | def parse(self, response): 456 | if self.spider_config.has_key('encoding'): 457 | _encoding = self.spider_config['encoding'] 458 | response = response.replace(encoding=_encoding) 459 | try: 460 | HtmlXPathSelector = Selector 461 | except: 462 | pass 463 | 464 | hxs = HtmlXPathSelector(response) 465 | 466 | curr_nodes = set() 467 | 468 | #tell the curr_node 469 | if 'node' in response.meta: 470 | curr_nodes = set(response.meta['node']) 471 | else: 472 | for key, node in self.site_tree.items(): 473 | for url_pattern in node.url_patterns: 474 | if self.match_url_pattern(url_pattern, response.url): 475 | curr_nodes.add(node) 476 | 477 | if not curr_nodes: 478 | curr_nodes.add(self.site_tree['root']) 479 | 480 | #make a sign of the item (when??) 481 | #curr_sign = self.generate_sign() 482 | #if response.request.headers.has_key('Sign'): 483 | #curr_sign = response.request.headers['Sign'] 484 | 485 | is_root = False 486 | for curr_node in curr_nodes: 487 | #if not curr_node.parent or curr_node.parent.name == 'root': 488 | if not curr_node.parent: 489 | is_root = True 490 | 491 | #fetch an item from self.items 492 | if 'item' in response.meta: 493 | if is_root: 494 | item = CommonItem() 495 | else: 496 | item = copy.deepcopy(response.meta['item']) 497 | #item = copy.deepcopy(self.items[curr_sign]['item']) 498 | #self.items[curr_sign]['req_count'] -= 1 499 | #if not self.items[curr_sign]['req_count']: 500 | #self.items.pop(curr_sign) 501 | else: 502 | item = CommonItem() 503 | 504 | #解析自定义变量或参数 505 | for k,v in self.setting_vars.iteritems(): 506 | tmp_v = self.eval_custom_var(response, hxs, v) 507 | self.params[k] = tmp_v 508 | if len(tmp_v) == 1: 509 | self.params[k] = tmp_v[0] 510 | globals()[k] = self.params[k] 511 | 512 | 513 | #解析url和数据项 514 | for curr_node in curr_nodes: 515 | if curr_node.incremental and 'exist_in_cache' in response.flags: 516 | yield None 517 | else: 518 | tail_branch = True 519 | no_yield = True 520 | if curr_node.name == 'item' and not item.has_key('url'): 521 | item['url'] = [response.url] 522 | for child in curr_node.children: 523 | #if child.children and child is not curr_node: 524 | if child.children: 525 | tail_branch = False 526 | #解析数据项 527 | if child.xpaths: 528 | self.parse_item_xpaths(hxs, child.xpaths, item, response.url, child.name) 529 | for child in curr_node.children: 530 | #parse multi items from one single page 531 | curr_items = [] 532 | #if child.restrict_xpaths and child.parent and child.parent.name != 'root': 533 | if child.restrict_xpaths: 534 | tail_child = True 535 | belongtos = [] 536 | for restrict_xpath in child.restrict_xpaths: 537 | belongtos += self._join_lists(eval(self._process_ex('hxs',restrict_xpath))) 538 | #for chd in child.children: 539 | #if chd.children and chd is not child: 540 | #tail_child = False 541 | if child.url_patterns: 542 | tail_child = False 543 | for i in 
xrange(len(belongtos)): 544 | new_item = copy.deepcopy(item) 545 | #item_sign = self.generate_sign() 546 | self.parse_multi_items(hxs, child, new_item, response, i, len(belongtos)) 547 | if tail_child and len(new_item) == len(self.item_fields): 548 | yield new_item 549 | no_yield = False 550 | else: 551 | curr_items.append(new_item) 552 | 553 | #parse follow urls 554 | if child.url_patterns: 555 | restrict_xpaths = child.restrict_xpaths 556 | prefix = child.name + "_" 557 | pending_params = {} 558 | for key,val in self.params.iteritems(): 559 | if key.startswith(prefix): 560 | pending_params[key.replace(prefix,'')] = val 561 | pending_urls = [] 562 | urls_len = 0 563 | for url_pattern in child.url_patterns: 564 | urls_length, url_list = self.extract_url(response, hxs, url_pattern, restrict_xpaths, pending_params) 565 | urls_len += urls_length 566 | pending_urls += url_list 567 | if pending_params: 568 | tmp_param_list = [] 569 | pending_params_keys = [] 570 | for k,v in pending_params.items(): 571 | pending_params_keys.append(k) 572 | tmp_param_list.append(v) 573 | tmp_seq_params = self._join_lists(tmp_param_list) 574 | for i,pending_url in pending_urls: 575 | #TODO:exist_list_param handler 576 | for j,tmp_seq_param in enumerate(tmp_seq_params): 577 | #req_sign = self.generate_sign() 578 | req_item = copy.deepcopy(item) 579 | _tmp_param = {} 580 | for i, k in enumerate(pending_params_keys): 581 | _tmp_param[k] = tmp_seq_param[i] 582 | #Attach item sign with the request 583 | req = FormRequest(pending_url, method="POST", formdata=_tmp_param) 584 | #req.headers['Sign'] = req_sign 585 | #attach item with the request 586 | if 'item' not in req.meta: 587 | req.meta['item'] = req_item 588 | #if req_sign not in self.items: 589 | #self.items[req_sign] = {'item':req_item, 'req_count':0} 590 | #self.items[req_sign]['req_count'] += 1 591 | if 'node' not in req.meta: 592 | req.meta['node'] = [] 593 | req.meta['node'].append(child) 594 | yield req 595 | no_yield = False 596 | else: 597 | for i,pending_url in pending_urls: 598 | #req_sign = self.generate_sign() 599 | req_item = copy.deepcopy(item) 600 | if len(curr_items)==urls_len and i