├── README.md
├── __init__.py
├── domain.py
├── spider.py
└── utils.py

/README.md:
--------------------------------------------------------------------------------
1 | second-spider
2 | =============
3 | 
4 | A simple concurrent Python spider built on gevent
5 | 
6 | ### Features
7 | 
8 | 1. Concurrency built on [gevent](http://www.gevent.org/)
9 | 2. A highly configurable crawl strategy:
10 | 
11 | >
12 | * Max depth
13 | * Total number of URLs
14 | * Max concurrency of HTTP requests (so the crawl does not DoS the target)
15 | * Request headers and cookies
16 | * Same-host restriction
17 | * Same-domain restriction
18 | * Max running time
19 | 
20 | 
21 | ### Dependencies
22 | 
23 | * python 2.7
24 | * ~~gevent 1.0dev~~
25 | * [gevent 1.0 final](https://github.com/surfly/gevent/releases/tag/1.0)
26 | * requests 1.0.3
27 | * pyquery 1.2.4
28 | 
29 | 
30 | ### Test
31 | 
32 | ```
33 | python spider.py -v
34 | ```
35 | 
36 | ### Example
37 | 
38 | ```
39 | import logging
40 | from spider import Spider
41 | 
42 | logging.basicConfig(
43 |     level=logging.DEBUG,
44 |     format='%(asctime)s %(levelname)s %(message)s')
45 | 
46 | spider = Spider()
47 | spider.setRootUrl("http://www.sina.com.cn")
48 | spider.run()
49 | 
50 | ```
51 | (A sketch of a customized run with `Strategy` is appended at the end of this listing.)
52 | 
53 | ### TODO
54 | 
55 | * Distributed crawling: swap `gevent.Queue` for `redis.Queue`
56 | * Configurable storage backends
57 | * Support Ajax URLs (WebKit, etc.)
58 | 
59 | 
60 | ### LICENSE
61 | 
62 | Copyright © 2013 by kenshin
63 | 
64 | Released under the MIT license: [rem.mit-license.org](http://rem.mit-license.org/)
65 | 
66 | 
67 | 
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | __version__ = (0, 1, 2)
4 | 
5 | __title__ = "Second Spider"
6 | 
7 | __author__ = ("kenshin", "jetz")
8 | 
--------------------------------------------------------------------------------
/domain.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import unittest
3 | 
4 | # From http://www.iana.org/domains/root/db
5 | GENERAL_TLD = ['com', 'edu', 'gov', 'net', 'org', 'mil', 'travel', 'aero',
6 |                'asia', 'cat', 'coop', 'int', 'jobs', 'mobi', 'museum', 'post',
7 |                'tel', 'xxx', 'pro', 'arpa']
8 | 
9 | REGION_TLD = {"cn": ['xj', 'sh', 'ac', 'gs', 'zj', 'yn', 'ah', 'gz',
10 |                      'bj', 'gx', 'jl', 'hk', 'gd',
11 |                      'hn', 'hl', 'edu', 'hb', 'cq', 'ha', 'fj', 'he',
12 |                      'xz', 'sx', 'jx', 'ln', 'tw',
13 |                      'mo', 'js', 'nx', 'hi', 'tj', 'sn', 'nm', 'sc', 'qh',
14 |                      'sd'],
15 |               "tw": ['idv', 'game', 'club', 'ebiz'],
16 |               "hk": ['idv'],
17 |               }
18 | 
19 | def GetFirstLevelDomain(raw_host=""):
20 |     # return the registrable ("first-level") domain, e.g. 'news.sina.com.cn' -> 'sina.com.cn'
21 |     raw_host = raw_host.lower()
22 |     port = 80
23 |     if ":" in raw_host:
24 |         try:
25 |             (host, port) = raw_host.split(':')
26 |         except ValueError:
27 |             raise ValueError('Too many ":" in %s' % raw_host)
28 |     else:
29 |         host = raw_host
30 | 
31 |     rev = host.split(".")[::-1]
32 | 
33 |     if rev[0] in GENERAL_TLD:
34 |         rev = rev[:2]
35 |     elif len(rev[0].decode('utf-8')) == 2:
36 |         if rev[1] in GENERAL_TLD + REGION_TLD.get(rev[0], []):
37 |             rev = rev[:3]
38 |         else:
39 |             rev = rev[:2]
40 |     else:
41 |         return None
42 | 
43 |     return ".".join(rev[::-1])
44 | 
45 | 
46 | class DomainTest(unittest.TestCase):
47 | 
48 |     def test_base_function(self):
49 |         self.assertEqual(GetFirstLevelDomain('www.google.com'), 'google.com')
50 | 
51 |     def test_g_tld(self):
52 |         tlds = ['subdomain.china.asia',
53 |                 '4th.www.float.int',
54 |                 'e.pypi.python.org']
55 | 
56 |         match = ['china.asia', 'float.int', 'python.org']
57 | 
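        # expected first-level (registrable) domain for each host in 'tlds'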
58 |         self.assertEqual([GetFirstLevelDomain(t) for t in tlds], match)
59 | 
60 |     def test_special_cctld(self):
61 | 
62 |         self.assertEqual(GetFirstLevelDomain('www.gx.cn'), 'www.gx.cn')
63 | 
64 |     def test_cjk_domain(self):
65 | 
66 |         self.assertEqual(GetFirstLevelDomain('www.g.中国'), 'g.中国')
67 | 
68 |     def test_domain_with_port(self):
69 |         self.assertEqual(GetFirstLevelDomain('del.icio.us:8080'), 'icio.us')
70 | 
71 |     def test_bad_domain(self):
72 |         self.assertIsNone(GetFirstLevelDomain('i-am.b_ad.domain'))
73 | 
74 | if __name__ == '__main__':
75 |     unittest.main()
76 | 
--------------------------------------------------------------------------------
/spider.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | import gevent
3 | from gevent import (monkey,
4 |                     queue,
5 |                     event,
6 |                     pool)
7 | 
8 | import re
9 | import sys
10 | import logging
11 | import unittest
12 | import urllib
13 | import urlparse
14 | import requests
15 | from threading import Timer
16 | from pyquery import PyQuery
17 | from utils import HtmlAnalyzer, UrlFilter
18 | 
19 | 
20 | __all__ = ['Strategy', 'UrlObj', 'Spider', 'HtmlAnalyzer', 'UrlFilter']
21 | 
22 | 
23 | 
24 | class Strategy(object):
25 | 
26 |     default_cookies = {}
27 | 
28 |     default_headers = {
29 |         'User-Agent': 'SinaSec Webscan Spider',
30 |         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
31 |         'Cache-Control': 'max-age=0',
32 |         'Accept-Charset': 'GBK,utf-8;q=0.7,*;q=0.3',
33 |     }
34 | 
35 | 
36 |     def __init__(self, max_depth=5, max_count=5000, concurrency=5, timeout=10, time=6*3600, headers=None,
37 |                  cookies=None, ssl_verify=False, same_host=False, same_domain=True):
38 |         self.max_depth = max_depth
39 |         self.max_count = max_count
40 |         self.concurrency = concurrency
41 |         self.timeout = timeout
42 |         self.time = time
43 |         self.headers = dict(self.default_headers)   # copy, so the class-level defaults are not mutated
44 |         self.headers.update(headers or {})
45 |         self.cookies = dict(self.default_cookies)
46 |         self.cookies.update(cookies or {})
47 |         self.ssl_verify = ssl_verify
48 |         self.same_host = same_host
49 |         self.same_domain = same_domain
50 | 
51 | 
52 | class UrlObj(object):
53 | 
54 |     def __init__(self, url, depth=0, linkin=None):
55 |         if not url.startswith("http"):
56 |             url = "http://" + url
57 |         self.url = url.strip('/')
58 |         self.depth = depth
59 |         self.linkin = linkin
60 | 
61 |     def __str__(self):
62 |         return self.url
63 | 
64 |     def __repr__(self):
65 |         return "<UrlObj('%s')>" % self.url
66 | 
67 |     def __hash__(self):
68 |         return hash(self.url)
69 | 
70 |     def setLinkin(self, urlobj):
71 |         self.linkin = urlobj
72 | 
73 |     def incrDepth(self):
74 |         self.depth += 1
75 | 
76 | 
77 | class UrlTable(object):
78 | 
79 |     infinite = float("inf")
80 | 
81 |     def __init__(self, size=0):
82 |         self.__urls = {}
83 | 
84 |         if size == 0:
85 |             size = self.infinite
86 |         self.size = size
87 | 
88 |     def __len__(self):
89 |         return len(self.__urls)
90 | 
91 |     def __contains__(self, url):
92 |         return hash(url) in self.__urls
93 | 
94 |     def __iter__(self):
95 |         for url in self.urls:
96 |             yield url
97 | 
98 |     def insert(self, url):
99 |         if isinstance(url, basestring):
100 |             url = UrlObj(url)
101 |         if url not in self:
102 |             self.__urls.setdefault(hash(url), url)
103 | 
104 |     @property
105 |     def urls(self):
106 |         return self.__urls.values()
107 | 
108 |     def full(self):
109 |         return len(self) >= self.size
110 | 
111 | 
112 | class Spider(object):
113 | 
114 |     logger = logging.getLogger("spider.mainthread")
115 | 
116 |     def __init__(self, strategy=Strategy()):
117 |         monkey.patch_all()
118 |         self.strategy = strategy
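        # Crawl bookkeeping set up on the following lines:
        #   queue             - work queue of UrlObj items waiting to be fetched
        #   urltable          - de-duplicated table of discovered URLs, capped at strategy.max_count
        #   pool              - bounds the number of concurrent Handler greenlets
        #   greenlet_finished - signalled by a Handler when it exits, so run() re-checks the queue
        #   _stop             - set by stop() to end the main crawl loop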
119 | self.queue = queue.Queue() 120 | self.urltable = UrlTable(strategy.max_count) 121 | self.pool = pool.Pool(strategy.concurrency) 122 | self.greenlet_finished = event.Event() 123 | self._stop = event.Event() 124 | 125 | 126 | def setRootUrl(self,url): 127 | if isinstance(url,basestring): 128 | url = UrlObj(url) 129 | self.root = url 130 | self.put(self.root) 131 | 132 | def put(self, url): 133 | if url not in self.urltable: 134 | self.queue.put(url) 135 | 136 | def run(self): 137 | self.timer = Timer(self.strategy.time, self.stop) 138 | self.timer.start() 139 | self.logger.info("spider '%s' begin running",self.root) 140 | 141 | while not self.stopped() and self.timer.isAlive(): 142 | for greenlet in list(self.pool): 143 | if greenlet.dead: 144 | self.pool.discard(greenlet) 145 | try: 146 | url = self.queue.get_nowait() 147 | except queue.Empty: 148 | if self.pool.free_count() != self.pool.size: 149 | self.greenlet_finished.wait() 150 | self.greenlet_finished.clear() 151 | continue 152 | else: 153 | self.stop() 154 | greenlet = Handler(url, self) 155 | self.pool.start(greenlet) 156 | 157 | def stopped(self): 158 | return self._stop.is_set() 159 | 160 | def stop(self): 161 | self.logger.info("spider '%s' finished. fetch total (%d) urls",self.root,len(self.urltable)) 162 | self.timer.cancel() 163 | self._stop.set() 164 | self.pool.join() 165 | self.queue.put(StopIteration) 166 | return 167 | 168 | def dump(self): 169 | import StringIO 170 | out = StringIO.StringIO() 171 | for url in self.urltable: 172 | try: 173 | print >> out ,url 174 | except: 175 | continue 176 | return out.getvalue() 177 | 178 | 179 | class Handler(gevent.Greenlet): 180 | 181 | logger = logging.getLogger("spider.handler") 182 | 183 | def __init__(self, urlobj, spider): 184 | gevent.Greenlet.__init__(self) 185 | self.urlobj = urlobj 186 | self.spider = spider 187 | self.charset = "utf-8" 188 | 189 | def _run(self): 190 | strategy = self.spider.strategy 191 | urltable = self.spider.urltable 192 | queue = self.spider.queue 193 | 194 | try: 195 | html = self.open(self.urlobj.url) 196 | except Exception, why: 197 | self.logger.debug("open '%s' failed,since : %s", self.urlobj, why) 198 | return self.stop() 199 | 200 | linkin = self.urlobj 201 | depth = linkin.depth + 1 202 | 203 | if strategy.max_depth and (depth > strategy.max_depth): 204 | return self.stop() 205 | 206 | for link in self.feed(html): 207 | 208 | if urltable.full(): 209 | self.stop() 210 | self.spider.stop() 211 | return 212 | 213 | if link in urltable: 214 | continue 215 | 216 | 217 | if strategy.same_host and (not UrlFilter.isSameHost(link,linkin.url)): 218 | continue 219 | 220 | if strategy.same_domain and (not UrlFilter.isSameDomain(link, linkin.url)): 221 | continue 222 | 223 | url = UrlObj(link, depth, linkin) 224 | urltable.insert(url) 225 | queue.put(url) 226 | 227 | self.logger.debug( 228 | "sucess crawled '%s' the <%d> urls", url, len(urltable)) 229 | 230 | self.stop() 231 | 232 | def open(self, url): 233 | strategy = self.spider.strategy 234 | try: 235 | resp = requests.get(url, headers=strategy.headers, 236 | cookies=strategy.cookies, timeout=strategy.timeout, 237 | verify=strategy.ssl_verify) 238 | except requests.exceptions.RequestException, e: 239 | raise e 240 | if resp.status_code != requests.codes.ok: 241 | resp.raise_for_status() 242 | charset = HtmlAnalyzer.detectCharSet(resp.text) 243 | if charset is not None: 244 | self.charset = charset 245 | resp.encoding = charset 246 | return resp.text 247 | 248 | def feed(self,html): 249 | return 
HtmlAnalyzer.extractLinks(html,self.urlobj.url,self.charset) 250 | 251 | 252 | def stop(self): 253 | self.spider.greenlet_finished.set() 254 | self.kill(block=False) 255 | 256 | 257 | class TestSpider(unittest.TestCase): 258 | 259 | def setUp(self): 260 | self.root = "http://www.sina.com.cn" 261 | strategy = Strategy(max_depth=3, max_count=5000, 262 | same_host=False, same_domain=True) 263 | self.spider = Spider(strategy) 264 | self.spider.setRootUrl(self.root) 265 | self.spider.run() 266 | 267 | def testSpiderStrategy(self): 268 | self.assertEqual(len(self.spider.urltable), 5000) 269 | self.assertLessEqual(self.spider.urltable.urls[-1].depth, 3) 270 | for url in self.spider.urltable.urls[100:200]: 271 | self.assert_(UrlFilter.isSameDomain(self.root, str(url))) 272 | 273 | 274 | 275 | if __name__ == '__main__': 276 | logging.basicConfig( 277 | level=logging.DEBUG if "-v" in sys.argv else logging.WARN, 278 | format='%(asctime)s %(levelname)s %(message)s') 279 | unittest.main() 280 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf-8 -*- 3 | 4 | import re 5 | import urllib 6 | import urlparse 7 | from pyquery import PyQuery 8 | import domain 9 | import os.path 10 | import unittest 11 | 12 | 13 | class HtmlAnalyzer(object): 14 | 15 | @staticmethod 16 | def detectCharSet(html): 17 | 18 | pq = PyQuery(html) 19 | 20 | metas = pq('head')('meta') 21 | 22 | for meta in metas: 23 | for key in meta.keys(): 24 | if key == "charset": 25 | charset = meta.get('charset') 26 | return charset 27 | if key == "content": 28 | try: 29 | p = re.match(r".+charset=(.*)\W*", meta.get('content')) 30 | return p.group(1) 31 | except: 32 | continue 33 | 34 | @staticmethod 35 | def extractLinks(html, baseurl, charset): 36 | 37 | def _extract(url, attr): 38 | link = url.attrib[attr] 39 | # strip('\\"') for href like Sina 40 | link = link.strip("/ ").strip('\\"') 41 | if link is None: 42 | raise 43 | 44 | link = urlparse.urljoin(baseurl, link) 45 | link = urlparse.urldefrag(link)[0] 46 | 47 | try: 48 | link = urllib.quote(link, ':?=+&#/@') 49 | except (UnicodeDecodeError, KeyError): 50 | try: 51 | link = urllib.quote(link.encode(charset), ':?=+&#/@') 52 | except: 53 | pass 54 | 55 | return link 56 | 57 | def _isValidLink(url): 58 | try: 59 | return all([UrlFilter.checkScheme(url), 60 | UrlFilter.checkInvalidChar(url), 61 | UrlFilter.checkInvalidExtention(url) 62 | ]) 63 | except: 64 | return False 65 | 66 | pq = PyQuery(html) 67 | 68 | allLinks = [] 69 | 70 | for url in pq('a'): 71 | try: 72 | link = _extract(url, 'href') 73 | except: 74 | continue 75 | if _isValidLink(link): 76 | allLinks.append(link) 77 | 78 | for url in pq('form'): 79 | try: 80 | link = _extract(url, 'action') 81 | except: 82 | continue 83 | if _isValidLink(link): 84 | allLinks.append(link) 85 | return allLinks 86 | 87 | 88 | class UniqRule(object): 89 | 90 | # 用于形如abc123格式 91 | alnum = re.compile(r'^(\D+)(\d+)$') 92 | 93 | date = re.compile(r'^([12]\d)?\d\d-\d{1,2}(-\d{1,2})?$') 94 | 95 | connector = '|' 96 | 97 | # 相同后缀名 98 | ext = { 99 | '.asp': '.asp', 100 | '.aspx': '.asp', 101 | '.jsp': '.jsp', 102 | '.jspx': '.jsp', 103 | } 104 | 105 | scheme = { 106 | 'http': 'http', 107 | 'https': 'http' 108 | } 109 | 110 | normalize_dict = { 111 | 'digit': '1', 112 | 'letter': 'a', 113 | 'date': '2013-01-01', 114 | } 115 | 116 | def __init__(self, depth=None): 117 | self.depth = depth 118 | 
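    # The predicates and normalize() below canonicalize one path segment or
    # parameter value: pure digits become '1', a single letter becomes 'a',
    # dates become '2013-01-01', and the digit part of 'abc123'-style or
    # hyphen/underscore-separated segments collapses to '1'.  URLs whose
    # segments normalize to the same values (e.g. '/news/2013-05-01/item123'
    # and '/news/2013-06-02/item456') are meant to be folded together by
    # UrlFilter.uniq() further down (via a UrlObject helper that is not part
    # of this listing).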
119 | def is_digit(self, param): 120 | return param.isdigit() 121 | 122 | def is_letter(self, param): 123 | return len(param) == 1 and param.isalpha() 124 | 125 | def is_alnum(self, param): 126 | if UniqRule.alnum.match(param): 127 | return True 128 | return False 129 | 130 | # 形如abc-123-453 131 | def is_hyphen_split(self, param): 132 | return not param.find('-') == -1 133 | 134 | def is_underscore_split(self, param): 135 | return not param.find('_') == -1 136 | 137 | def is_date(self, param): 138 | if UniqRule.date.match(param): 139 | return True 140 | return False 141 | 142 | def split_params(self, pathnode): 143 | name_params = pathnode.split(';') 144 | if len(name_params) > 1: 145 | return name_params[0], name_params[1:] 146 | else: 147 | return name_params[0], [] 148 | 149 | def normalize(self, param): 150 | if self.is_digit(param): 151 | return UniqRule.normalize_dict['digit'] 152 | elif self.is_letter(param): 153 | return UniqRule.normalize_dict['letter'] 154 | elif self.is_date(param): 155 | return UniqRule.normalize_dict['date'] 156 | elif self.is_alnum(param): 157 | match = UniqRule.alnum.match(param) 158 | return match.group(1) + UniqRule.normalize_dict['digit'] 159 | elif self.is_hyphen_split(param): 160 | params = param.split('-') 161 | for k, v in enumerate(params): 162 | if v.isdigit(): 163 | params[k] = UniqRule.normalize_dict['digit'] 164 | return '-'.join(params) 165 | elif self.is_underscore_split(param): 166 | params = param.split('_') 167 | for k, v in enumerate(params): 168 | if v.isdigit(): 169 | params[k] = UniqRule.normalize_dict['digit'] 170 | return '_'.join(params) 171 | else: 172 | return param 173 | 174 | ############################################################ 175 | 176 | def is_depth_set(self): 177 | return self.depth is not None 178 | 179 | def normalize_scheme(self, scheme): 180 | return UniqRule.scheme.get(scheme, scheme) 181 | 182 | def normalize_hostname(self, hostname): 183 | return hostname 184 | 185 | def normalize_dirs(self, dir_list): 186 | dir_depth = len(dir_list) 187 | if self.is_depth_set() and self.depth <= dir_depth: 188 | return UniqRule.connector.join([self.normalize(dir_list[i]) 189 | for i in xrange(self.depth)]) 190 | return UniqRule.connector.join([self.normalize(dir_list[i]) 191 | for i in xrange(dir_depth)]) 192 | 193 | def normalize_tailpage(self, tailpage): 194 | try: 195 | tpname, params = self.split_params(tailpage) 196 | except IndexError: 197 | return tailpage 198 | fname, ext = os.path.splitext(tpname) 199 | norm_name = self.normalize(fname) 200 | norm_ext = UniqRule.ext.get(ext, ext) 201 | norm_params = sorted(params) 202 | result = [norm_name, norm_ext] 203 | result.extend(norm_params) 204 | return UniqRule.connector.join(result) 205 | 206 | def normalize_querykeys(self, querykeys): 207 | return UniqRule.connector.join(sorted(querykeys)) 208 | 209 | 210 | class UrlFilter(object): 211 | 212 | invalid_chars = {'\'': None, 213 | '\"': None, 214 | '\\': None, 215 | ' ': None, 216 | '\n': None, 217 | '\r': None, 218 | '+': None 219 | } 220 | 221 | invalid_extention = { 222 | 'jpg': None, 223 | 'gif': None, 224 | 'bmp': None, 225 | 'jpeg': None, 226 | 'png': None, 227 | 228 | 'swf': None, 229 | 'mp3': None, 230 | 'wma': None, 231 | 'wmv': None, 232 | 'wav': None, 233 | 'mid': None, 234 | 'ape': None, 235 | 'mpg': None, 236 | 'mpeg': None, 237 | 'rm': None, 238 | 'rmvb': None, 239 | 'avi': None, 240 | 'mkv': None, 241 | 242 | 'zip': None, 243 | 'rar': None, 244 | 'gz': None, 245 | 'iso': None, 246 | 'jar': None, 247 | 248 | 
'doc': None, 249 | 'docx': None, 250 | 'ppt': None, 251 | 'pptx': None, 252 | 'chm': None, 253 | 'pdf': None, 254 | 255 | 'exe': None, 256 | 'msi': None, 257 | } 258 | 259 | @staticmethod 260 | def checkScheme(url): 261 | scheme, netloc, path, pm, q, f = urlparse.urlparse(url) 262 | return scheme in ('http', 'https') 263 | 264 | @classmethod 265 | def checkInvalidChar(cls, url): 266 | exist_invalid_char = False 267 | for c in url: 268 | if c in cls.invalid_chars: 269 | exist_invalid_char = True 270 | break 271 | return (not exist_invalid_char) 272 | 273 | @classmethod 274 | def checkInvalidExtention(cls, url): 275 | dotpos = url.rfind('.') + 1 276 | typestr = url[dotpos:].lower() 277 | return (typestr not in cls.invalid_extention) 278 | 279 | @staticmethod 280 | def isSameDomain(first_url, second_url): 281 | fhost = urlparse.urlparse(first_url).netloc 282 | shost = urlparse.urlparse(second_url).netloc 283 | return (domain.GetFirstLevelDomain(fhost) == 284 | domain.GetFirstLevelDomain(shost)) 285 | 286 | @staticmethod 287 | def isSameHost(first_url, second_url): 288 | return urlparse.urlparse(first_url).netloc == urlparse.urlparse(second_url).netloc 289 | 290 | @staticmethod 291 | def isSameSuffixWithoutWWW(first_url, second_url): 292 | fhost = '.' + urlparse.urlparse(first_url).netloc 293 | shost = '.' + urlparse.urlparse(second_url).netloc 294 | 295 | if shost[:5] == '.www.': 296 | shost = shost[5:] 297 | 298 | if fhost.find(shost) != -1: 299 | return True 300 | else: 301 | return False 302 | 303 | # check whether first_url has the suffix second_url 304 | @staticmethod 305 | def isSameSuffix(first_url, second_url): 306 | fhost = '.' + urlparse.urlparse(first_url).netloc 307 | shost = '.' + urlparse.urlparse(second_url).netloc 308 | 309 | if fhost.find(shost) != -1: 310 | return True 311 | else: 312 | return False 313 | 314 | 315 | # remove similary urls 316 | @staticmethod 317 | def uniq(urls, rule=UniqRule()): 318 | result = {} 319 | for u in urls: 320 | try: 321 | urlobj = UrlObject(u, rule) 322 | except Exception: 323 | result[hash(u)] = u 324 | continue 325 | result.setdefault(urlobj.hashcode, u) 326 | return result.values() 327 | 328 | 329 | class TestHtmlAnalyzer(unittest.TestCase): 330 | 331 | url = "http://www.sina.com.cn" 332 | charset = 'gb2312' 333 | 334 | def setUp(self): 335 | import requests 336 | r = requests.get(self.url) 337 | r.encoding = self.charset 338 | self.html = r.text 339 | 340 | def testDetectCharSet(self): 341 | charset = HtmlAnalyzer.detectCharSet(self.html) 342 | self.assertEqual(charset, self.charset) 343 | 344 | def testExtractLinks(self): 345 | links = [] 346 | for link in HtmlAnalyzer.extractLinks(self.html, self.url, self.charset): 347 | links.append(link) 348 | self.assertGreater(len(links), 1000) 349 | 350 | 351 | class TestUrlFilter(unittest.TestCase): 352 | 353 | def testCheckScheme(self): 354 | url1 = "http://www.sina.com.cn" 355 | url2 = "javascript:void(0)" 356 | url3 = "mailto:kenshin.acs@gmail.com" 357 | self.assert_(UrlFilter.checkScheme(url1)) 358 | self.assertFalse(UrlFilter.checkScheme(url2)) 359 | self.assertFalse(UrlFilter.checkScheme(url3)) 360 | 361 | def testCheckInvalidChar(self): 362 | url1 = "http://www.sina.com.cn" 363 | url2 = "http://www.sina.com.cn+" 364 | self.assert_(UrlFilter.checkInvalidChar(url1)) 365 | self.assertFalse(UrlFilter.checkInvalidChar(url2)) 366 | 367 | def testCheckInvalidExtention(self): 368 | url1 = "http://www.sina.com.cn" 369 | url2 = "http://www.sina.com.cn/hack.pdf" 370 | 
self.assert_(UrlFilter.checkInvalidExtention(url1)) 371 | self.assertFalse(UrlFilter.checkInvalidExtention(url2)) 372 | 373 | def testIsSameDomain(self): 374 | url1 = "http://www.sina.com.cn" 375 | url2 = "http://www.sina.com" 376 | url3 = "http://news.sina.com.cn" 377 | self.assertFalse(UrlFilter.isSameDomain(url1, url2)) 378 | self.assert_(UrlFilter.isSameDomain(url1, url3)) 379 | 380 | def testIsSameHost(self): 381 | url1 = "http://www.sina.com.cn" 382 | url2 = "http://news.sina.com.cn" 383 | url3 = "http://www.sina.com.cn/news/" 384 | self.assertFalse(UrlFilter.isSameHost(url1, url2)) 385 | self.assert_(UrlFilter.isSameHost(url1, url3)) 386 | 387 | def testIsSameSuffixWithoutWWW(self): 388 | url1 = "http://news.sina.com.cn" 389 | url2 = "http://www.news.sina.com.cn" 390 | url3 = "http://www.sina.com.cn" 391 | self.assert_(UrlFilter.isSameSuffixWithoutWWW(url1, url2)) 392 | self.assert_(UrlFilter.isSameSuffixWithoutWWW(url1, url3)) 393 | 394 | def testIsSameSuffix(self): 395 | url1 = "http://news.sina.com.cn" 396 | url2 = "http://www.news.sina.com.cn" 397 | url3 = "http://sina.com.cn" 398 | self.assertFalse(UrlFilter.isSameSuffix(url1, url2)) 399 | self.assert_(UrlFilter.isSameSuffix(url1, url3)) 400 | 401 | 402 | if __name__ == '__main__': 403 | unittest.main() 404 | --------------------------------------------------------------------------------
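
The README's Example section runs the spider with the default `Strategy`. As referenced there, here is a minimal sketch of a customized run, using only the `Strategy` and `Spider` APIs defined in `spider.py` above; the root URL and the extra User-Agent header are placeholders.

```
import logging
from spider import Spider, Strategy

logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(levelname)s %(message)s')

# limit depth and volume, stay on the root's registrable domain,
# and stop after ten minutes even if max_count is never reached
strategy = Strategy(max_depth=3, max_count=500, concurrency=10,
                    timeout=15, time=600,
                    same_host=False, same_domain=True,
                    headers={'User-Agent': 'my-crawler/0.1'})

spider = Spider(strategy)
spider.setRootUrl("http://www.sina.com.cn")
spider.run()                # blocks until the crawl stops

print spider.dump()         # the collected URLs, one per line
```

`run()` returns once the URL budget, the time limit, or an exhausted queue stops the crawl; `dump()` then renders everything accumulated in `spider.urltable`.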