├── README.md
├── __init__.py
├── domain.py
├── spider.py
└── utils.py

/README.md:
--------------------------------------------------------------------------------
1 | second-spider
2 | =============
3 | 
4 | A simple concurrent Python spider built on gevent
5 | 
6 | ### Features
7 | 
8 | 1. Concurrency built on [gevent](http://www.gevent.org/)
9 | 2. A highly configurable crawl strategy:
10 | 
11 | >
12 | * Max depth
13 | * Total number of URLs
14 | * Max concurrency of HTTP requests (so the crawl does not DoS the target)
15 | * Request headers and cookies
16 | * Same-host restriction
17 | * Same-domain restriction
18 | * Max running time
19 | 
20 | 
21 | ### Dependencies
22 | 
23 | * python 2.7
24 | * ~~gevent 1.0dev~~
25 | * [gevent 1.0 final](https://github.com/surfly/gevent/releases/tag/1.0)
26 | * requests 1.0.3
27 | * pyquery 1.2.4
28 | 
29 | 
30 | ### Test
31 | 
32 | ```
33 | python spider.py -v
34 | ```
35 | 
36 | ### Example
37 | 
38 | ```
39 | import logging
40 | from spider import Spider
41 | 
42 | logging.basicConfig(
43 |     level=logging.DEBUG,
44 |     format='%(asctime)s %(levelname)s %(message)s')
45 | 
46 | spider = Spider()
47 | spider.setRootUrl("http://www.sina.com.cn")
48 | spider.run()
49 | 
50 | ```
51 | (A sketch of a customized run with `Strategy` is appended at the end of this listing.)
52 | 
53 | ### TODO
54 | 
55 | * Distributed crawling: swap `gevent.Queue` for `redis.Queue`
56 | * Configurable storage backends
57 | * Support Ajax URLs (WebKit, etc.)
58 | 
59 | 
60 | ### LICENSE
61 | 
62 | Copyright © 2013 by kenshin
63 | 
64 | Released under the MIT license: [rem.mit-license.org](http://rem.mit-license.org/)
65 | 
66 | 
67 | 
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | __version__ = (0, 1, 2)
4 | 
5 | __title__ = "Second Spider"
6 | 
7 | __author__ = ("kenshin", "jetz")
8 | 
--------------------------------------------------------------------------------
/domain.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import unittest
3 | 
4 | # From http://www.iana.org/domains/root/db
5 | GENERAL_TLD = ['com', 'edu', 'gov', 'net', 'org', 'mil', 'travel', 'aero',
6 |                'asia', 'cat', 'coop', 'int', 'jobs', 'mobi', 'museum', 'post',
7 |                'tel', 'xxx', 'pro', 'arpa']
8 | 
9 | REGION_TLD = {"cn": ['xj', 'sh', 'ac', 'gs', 'zj', 'yn', 'ah', 'gz',
10 |                      'bj', 'gx', 'jl', 'hk', 'gd',
11 |                      'hn', 'hl', 'edu', 'hb', 'cq', 'ha', 'fj', 'he',
12 |                      'xz', 'sx', 'jx', 'ln', 'tw',
13 |                      'mo', 'js', 'nx', 'hi', 'tj', 'sn', 'nm', 'sc', 'qh',
14 |                      'sd'],
15 |               "tw": ['idv', 'game', 'club', 'ebiz'],
16 |               "hk": ['idv'],
17 |               }
18 | 
19 | def GetFirstLevelDomain(raw_host=""):
20 |     # return the registrable ("first-level") domain, e.g. 'news.sina.com.cn' -> 'sina.com.cn'
21 |     raw_host = raw_host.lower()
22 |     port = 80
23 |     if ":" in raw_host:
24 |         try:
25 |             (host, port) = raw_host.split(':')
26 |         except ValueError:
27 |             raise ValueError('Too many ":" in %s' % raw_host)
28 |     else:
29 |         host = raw_host
30 | 
31 |     rev = host.split(".")[::-1]
32 | 
33 |     if rev[0] in GENERAL_TLD:
34 |         rev = rev[:2]
35 |     elif len(rev[0].decode('utf-8')) == 2:
36 |         if rev[1] in GENERAL_TLD + REGION_TLD.get(rev[0], []):
37 |             rev = rev[:3]
38 |         else:
39 |             rev = rev[:2]
40 |     else:
41 |         return None
42 | 
43 |     return ".".join(rev[::-1])
44 | 
45 | 
46 | class DomainTest(unittest.TestCase):
47 | 
48 |     def test_base_function(self):
49 |         self.assertEqual(GetFirstLevelDomain('www.google.com'), 'google.com')
50 | 
51 |     def test_g_tld(self):
52 |         tlds = ['subdomain.china.asia',
53 |                 '4th.www.float.int',
54 |                 'e.pypi.python.org']
55 | 
56 |         match = ['china.asia', 'float.int', 'python.org']
57 | 
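        # expected first-level (registrable) domain for each host in 'tlds'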
58 |         self.assertEqual([GetFirstLevelDomain(t) for t in tlds], match)
59 | 
60 |     def test_special_cctld(self):
61 | 
62 |         self.assertEqual(GetFirstLevelDomain('www.gx.cn'), 'www.gx.cn')
63 | 
64 |     def test_cjk_domain(self):
65 | 
66 |         self.assertEqual(GetFirstLevelDomain('www.g.中国'), 'g.中国')
67 | 
68 |     def test_domain_with_port(self):
69 |         self.assertEqual(GetFirstLevelDomain('del.icio.us:8080'), 'icio.us')
70 | 
71 |     def test_bad_domain(self):
72 |         self.assertIsNone(GetFirstLevelDomain('i-am.b_ad.domain'))
73 | 
74 | if __name__ == '__main__':
75 |     unittest.main()
76 | 
--------------------------------------------------------------------------------
/spider.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | import gevent
3 | from gevent import (monkey,
4 |                     queue,
5 |                     event,
6 |                     pool)
7 | 
8 | import re
9 | import sys
10 | import logging
11 | import unittest
12 | import urllib
13 | import urlparse
14 | import requests
15 | from threading import Timer
16 | from pyquery import PyQuery
17 | from utils import HtmlAnalyzer, UrlFilter
18 | 
19 | 
20 | __all__ = ['Strategy', 'UrlObj', 'Spider', 'HtmlAnalyzer', 'UrlFilter']
21 | 
22 | 
23 | 
24 | class Strategy(object):
25 | 
26 |     default_cookies = {}
27 | 
28 |     default_headers = {
29 |         'User-Agent': 'SinaSec Webscan Spider',
30 |         'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
31 |         'Cache-Control': 'max-age=0',
32 |         'Accept-Charset': 'GBK,utf-8;q=0.7,*;q=0.3',
33 |     }
34 | 
35 | 
36 |     def __init__(self, max_depth=5, max_count=5000, concurrency=5, timeout=10, time=6*3600, headers=None,
37 |                  cookies=None, ssl_verify=False, same_host=False, same_domain=True):
38 |         self.max_depth = max_depth
39 |         self.max_count = max_count
40 |         self.concurrency = concurrency
41 |         self.timeout = timeout
42 |         self.time = time
43 |         self.headers = dict(self.default_headers)   # copy, so the class-level defaults are not mutated
44 |         self.headers.update(headers or {})
45 |         self.cookies = dict(self.default_cookies)
46 |         self.cookies.update(cookies or {})
47 |         self.ssl_verify = ssl_verify
48 |         self.same_host = same_host
49 |         self.same_domain = same_domain
50 | 
51 | 
52 | class UrlObj(object):
53 | 
54 |     def __init__(self, url, depth=0, linkin=None):
55 |         if not url.startswith("http"):
56 |             url = "http://" + url
57 |         self.url = url.strip('/')
58 |         self.depth = depth
59 |         self.linkin = linkin
60 | 
61 |     def __str__(self):
62 |         return self.url
63 | 
64 |     def __repr__(self):
65 |         return "<UrlObj('%s')>" % self.url
66 | 
67 |     def __hash__(self):
68 |         return hash(self.url)
69 | 
70 |     def setLinkin(self, urlobj):
71 |         self.linkin = urlobj
72 | 
73 |     def incrDepth(self):
74 |         self.depth += 1
75 | 
76 | 
77 | class UrlTable(object):
78 | 
79 |     infinite = float("inf")
80 | 
81 |     def __init__(self, size=0):
82 |         self.__urls = {}
83 | 
84 |         if size == 0:
85 |             size = self.infinite
86 |         self.size = size
87 | 
88 |     def __len__(self):
89 |         return len(self.__urls)
90 | 
91 |     def __contains__(self, url):
92 |         return hash(url) in self.__urls
93 | 
94 |     def __iter__(self):
95 |         for url in self.urls:
96 |             yield url
97 | 
98 |     def insert(self, url):
99 |         if isinstance(url, basestring):
100 |             url = UrlObj(url)
101 |         if url not in self:
102 |             self.__urls.setdefault(hash(url), url)
103 | 
104 |     @property
105 |     def urls(self):
106 |         return self.__urls.values()
107 | 
108 |     def full(self):
109 |         return len(self) >= self.size
110 | 
111 | 
112 | class Spider(object):
113 | 
114 |     logger = logging.getLogger("spider.mainthread")
115 | 
116 |     def __init__(self, strategy=Strategy()):
117 |         monkey.patch_all()
118 |         self.strategy = strategy
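        # Crawl bookkeeping set up on the following lines:
        #   queue             - work queue of UrlObj items waiting to be fetched
        #   urltable          - de-duplicated table of discovered URLs, capped at strategy.max_count
        #   pool              - bounds the number of concurrent Handler greenlets
        #   greenlet_finished - signalled by a Handler when it exits, so run() re-checks the queue
        #   _stop             - set by stop() to end the main crawl loop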
119 | self.queue = queue.Queue() 120 | self.urltable = UrlTable(strategy.max_count) 121 | self.pool = pool.Pool(strategy.concurrency) 122 | self.greenlet_finished = event.Event() 123 | self._stop = event.Event() 124 | 125 | 126 | def setRootUrl(self,url): 127 | if isinstance(url,basestring): 128 | url = UrlObj(url) 129 | self.root = url 130 | self.put(self.root) 131 | 132 | def put(self, url): 133 | if url not in self.urltable: 134 | self.queue.put(url) 135 | 136 | def run(self): 137 | self.timer = Timer(self.strategy.time, self.stop) 138 | self.timer.start() 139 | self.logger.info("spider '%s' begin running",self.root) 140 | 141 | while not self.stopped() and self.timer.isAlive(): 142 | for greenlet in list(self.pool): 143 | if greenlet.dead: 144 | self.pool.discard(greenlet) 145 | try: 146 | url = self.queue.get_nowait() 147 | except queue.Empty: 148 | if self.pool.free_count() != self.pool.size: 149 | self.greenlet_finished.wait() 150 | self.greenlet_finished.clear() 151 | continue 152 | else: 153 | self.stop() 154 | greenlet = Handler(url, self) 155 | self.pool.start(greenlet) 156 | 157 | def stopped(self): 158 | return self._stop.is_set() 159 | 160 | def stop(self): 161 | self.logger.info("spider '%s' finished. fetch total (%d) urls",self.root,len(self.urltable)) 162 | self.timer.cancel() 163 | self._stop.set() 164 | self.pool.join() 165 | self.queue.put(StopIteration) 166 | return 167 | 168 | def dump(self): 169 | import StringIO 170 | out = StringIO.StringIO() 171 | for url in self.urltable: 172 | try: 173 | print >> out ,url 174 | except: 175 | continue 176 | return out.getvalue() 177 | 178 | 179 | class Handler(gevent.Greenlet): 180 | 181 | logger = logging.getLogger("spider.handler") 182 | 183 | def __init__(self, urlobj, spider): 184 | gevent.Greenlet.__init__(self) 185 | self.urlobj = urlobj 186 | self.spider = spider 187 | self.charset = "utf-8" 188 | 189 | def _run(self): 190 | strategy = self.spider.strategy 191 | urltable = self.spider.urltable 192 | queue = self.spider.queue 193 | 194 | try: 195 | html = self.open(self.urlobj.url) 196 | except Exception, why: 197 | self.logger.debug("open '%s' failed,since : %s", self.urlobj, why) 198 | return self.stop() 199 | 200 | linkin = self.urlobj 201 | depth = linkin.depth + 1 202 | 203 | if strategy.max_depth and (depth > strategy.max_depth): 204 | return self.stop() 205 | 206 | for link in self.feed(html): 207 | 208 | if urltable.full(): 209 | self.stop() 210 | self.spider.stop() 211 | return 212 | 213 | if link in urltable: 214 | continue 215 | 216 | 217 | if strategy.same_host and (not UrlFilter.isSameHost(link,linkin.url)): 218 | continue 219 | 220 | if strategy.same_domain and (not UrlFilter.isSameDomain(link, linkin.url)): 221 | continue 222 | 223 | url = UrlObj(link, depth, linkin) 224 | urltable.insert(url) 225 | queue.put(url) 226 | 227 | self.logger.debug( 228 | "sucess crawled '%s' the <%d> urls", url, len(urltable)) 229 | 230 | self.stop() 231 | 232 | def open(self, url): 233 | strategy = self.spider.strategy 234 | try: 235 | resp = requests.get(url, headers=strategy.headers, 236 | cookies=strategy.cookies, timeout=strategy.timeout, 237 | verify=strategy.ssl_verify) 238 | except requests.exceptions.RequestException, e: 239 | raise e 240 | if resp.status_code != requests.codes.ok: 241 | resp.raise_for_status() 242 | charset = HtmlAnalyzer.detectCharSet(resp.text) 243 | if charset is not None: 244 | self.charset = charset 245 | resp.encoding = charset 246 | return resp.text 247 | 248 | def feed(self,html): 249 | return 
HtmlAnalyzer.extractLinks(html,self.urlobj.url,self.charset) 250 | 251 | 252 | def stop(self): 253 | self.spider.greenlet_finished.set() 254 | self.kill(block=False) 255 | 256 | 257 | class TestSpider(unittest.TestCase): 258 | 259 | def setUp(self): 260 | self.root = "http://www.sina.com.cn" 261 | strategy = Strategy(max_depth=3, max_count=5000, 262 | same_host=False, same_domain=True) 263 | self.spider = Spider(strategy) 264 | self.spider.setRootUrl(self.root) 265 | self.spider.run() 266 | 267 | def testSpiderStrategy(self): 268 | self.assertEqual(len(self.spider.urltable), 5000) 269 | self.assertLessEqual(self.spider.urltable.urls[-1].depth, 3) 270 | for url in self.spider.urltable.urls[100:200]: 271 | self.assert_(UrlFilter.isSameDomain(self.root, str(url))) 272 | 273 | 274 | 275 | if __name__ == '__main__': 276 | logging.basicConfig( 277 | level=logging.DEBUG if "-v" in sys.argv else logging.WARN, 278 | format='%(asctime)s %(levelname)s %(message)s') 279 | unittest.main() 280 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf-8 -*- 3 | 4 | import re 5 | import urllib 6 | import urlparse 7 | from pyquery import PyQuery 8 | import domain 9 | import os.path 10 | import unittest 11 | 12 | 13 | class HtmlAnalyzer(object): 14 | 15 | @staticmethod 16 | def detectCharSet(html): 17 | 18 | pq = PyQuery(html) 19 | 20 | metas = pq('head')('meta') 21 | 22 | for meta in metas: 23 | for key in meta.keys(): 24 | if key == "charset": 25 | charset = meta.get('charset') 26 | return charset 27 | if key == "content": 28 | try: 29 | p = re.match(r".+charset=(.*)\W*", meta.get('content')) 30 | return p.group(1) 31 | except: 32 | continue 33 | 34 | @staticmethod 35 | def extractLinks(html, baseurl, charset): 36 | 37 | def _extract(url, attr): 38 | link = url.attrib[attr] 39 | # strip('\\"') for href like Sina 40 | link = link.strip("/ ").strip('\\"') 41 | if link is None: 42 | raise 43 | 44 | link = urlparse.urljoin(baseurl, link) 45 | link = urlparse.urldefrag(link)[0] 46 | 47 | try: 48 | link = urllib.quote(link, ':?=+&#/@') 49 | except (UnicodeDecodeError, KeyError): 50 | try: 51 | link = urllib.quote(link.encode(charset), ':?=+&#/@') 52 | except: 53 | pass 54 | 55 | return link 56 | 57 | def _isValidLink(url): 58 | try: 59 | return all([UrlFilter.checkScheme(url), 60 | UrlFilter.checkInvalidChar(url), 61 | UrlFilter.checkInvalidExtention(url) 62 | ]) 63 | except: 64 | return False 65 | 66 | pq = PyQuery(html) 67 | 68 | allLinks = [] 69 | 70 | for url in pq('a'): 71 | try: 72 | link = _extract(url, 'href') 73 | except: 74 | continue 75 | if _isValidLink(link): 76 | allLinks.append(link) 77 | 78 | for url in pq('form'): 79 | try: 80 | link = _extract(url, 'action') 81 | except: 82 | continue 83 | if _isValidLink(link): 84 | allLinks.append(link) 85 | return allLinks 86 | 87 | 88 | class UniqRule(object): 89 | 90 | # 用于形如abc123格式 91 | alnum = re.compile(r'^(\D+)(\d+)$') 92 | 93 | date = re.compile(r'^([12]\d)?\d\d-\d{1,2}(-\d{1,2})?$') 94 | 95 | connector = '|' 96 | 97 | # 相同后缀名 98 | ext = { 99 | '.asp': '.asp', 100 | '.aspx': '.asp', 101 | '.jsp': '.jsp', 102 | '.jspx': '.jsp', 103 | } 104 | 105 | scheme = { 106 | 'http': 'http', 107 | 'https': 'http' 108 | } 109 | 110 | normalize_dict = { 111 | 'digit': '1', 112 | 'letter': 'a', 113 | 'date': '2013-01-01', 114 | } 115 | 116 | def __init__(self, depth=None): 117 | self.depth = depth 118 | 
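    # The predicates and normalize() below canonicalize one path segment or
    # parameter value: pure digits become '1', a single letter becomes 'a',
    # dates become '2013-01-01', and the digit part of 'abc123'-style or
    # hyphen/underscore-separated segments collapses to '1'.  URLs whose
    # segments normalize to the same values (e.g. '/news/2013-05-01/item123'
    # and '/news/2013-06-02/item456') are meant to be folded together by
    # UrlFilter.uniq() further down (via a UrlObject helper that is not part
    # of this listing).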
119 | def is_digit(self, param): 120 | return param.isdigit() 121 | 122 | def is_letter(self, param): 123 | return len(param) == 1 and param.isalpha() 124 | 125 | def is_alnum(self, param): 126 | if UniqRule.alnum.match(param): 127 | return True 128 | return False 129 | 130 | # 形如abc-123-453 131 | def is_hyphen_split(self, param): 132 | return not param.find('-') == -1 133 | 134 | def is_underscore_split(self, param): 135 | return not param.find('_') == -1 136 | 137 | def is_date(self, param): 138 | if UniqRule.date.match(param): 139 | return True 140 | return False 141 | 142 | def split_params(self, pathnode): 143 | name_params = pathnode.split(';') 144 | if len(name_params) > 1: 145 | return name_params[0], name_params[1:] 146 | else: 147 | return name_params[0], [] 148 | 149 | def normalize(self, param): 150 | if self.is_digit(param): 151 | return UniqRule.normalize_dict['digit'] 152 | elif self.is_letter(param): 153 | return UniqRule.normalize_dict['letter'] 154 | elif self.is_date(param): 155 | return UniqRule.normalize_dict['date'] 156 | elif self.is_alnum(param): 157 | match = UniqRule.alnum.match(param) 158 | return match.group(1) + UniqRule.normalize_dict['digit'] 159 | elif self.is_hyphen_split(param): 160 | params = param.split('-') 161 | for k, v in enumerate(params): 162 | if v.isdigit(): 163 | params[k] = UniqRule.normalize_dict['digit'] 164 | return '-'.join(params) 165 | elif self.is_underscore_split(param): 166 | params = param.split('_') 167 | for k, v in enumerate(params): 168 | if v.isdigit(): 169 | params[k] = UniqRule.normalize_dict['digit'] 170 | return '_'.join(params) 171 | else: 172 | return param 173 | 174 | ############################################################ 175 | 176 | def is_depth_set(self): 177 | return self.depth is not None 178 | 179 | def normalize_scheme(self, scheme): 180 | return UniqRule.scheme.get(scheme, scheme) 181 | 182 | def normalize_hostname(self, hostname): 183 | return hostname 184 | 185 | def normalize_dirs(self, dir_list): 186 | dir_depth = len(dir_list) 187 | if self.is_depth_set() and self.depth <= dir_depth: 188 | return UniqRule.connector.join([self.normalize(dir_list[i]) 189 | for i in xrange(self.depth)]) 190 | return UniqRule.connector.join([self.normalize(dir_list[i]) 191 | for i in xrange(dir_depth)]) 192 | 193 | def normalize_tailpage(self, tailpage): 194 | try: 195 | tpname, params = self.split_params(tailpage) 196 | except IndexError: 197 | return tailpage 198 | fname, ext = os.path.splitext(tpname) 199 | norm_name = self.normalize(fname) 200 | norm_ext = UniqRule.ext.get(ext, ext) 201 | norm_params = sorted(params) 202 | result = [norm_name, norm_ext] 203 | result.extend(norm_params) 204 | return UniqRule.connector.join(result) 205 | 206 | def normalize_querykeys(self, querykeys): 207 | return UniqRule.connector.join(sorted(querykeys)) 208 | 209 | 210 | class UrlFilter(object): 211 | 212 | invalid_chars = {'\'': None, 213 | '\"': None, 214 | '\\': None, 215 | ' ': None, 216 | '\n': None, 217 | '\r': None, 218 | '+': None 219 | } 220 | 221 | invalid_extention = { 222 | 'jpg': None, 223 | 'gif': None, 224 | 'bmp': None, 225 | 'jpeg': None, 226 | 'png': None, 227 | 228 | 'swf': None, 229 | 'mp3': None, 230 | 'wma': None, 231 | 'wmv': None, 232 | 'wav': None, 233 | 'mid': None, 234 | 'ape': None, 235 | 'mpg': None, 236 | 'mpeg': None, 237 | 'rm': None, 238 | 'rmvb': None, 239 | 'avi': None, 240 | 'mkv': None, 241 | 242 | 'zip': None, 243 | 'rar': None, 244 | 'gz': None, 245 | 'iso': None, 246 | 'jar': None, 247 | 248 | 
'doc': None, 249 | 'docx': None, 250 | 'ppt': None, 251 | 'pptx': None, 252 | 'chm': None, 253 | 'pdf': None, 254 | 255 | 'exe': None, 256 | 'msi': None, 257 | } 258 | 259 | @staticmethod 260 | def checkScheme(url): 261 | scheme, netloc, path, pm, q, f = urlparse.urlparse(url) 262 | return scheme in ('http', 'https') 263 | 264 | @classmethod 265 | def checkInvalidChar(cls, url): 266 | exist_invalid_char = False 267 | for c in url: 268 | if c in cls.invalid_chars: 269 | exist_invalid_char = True 270 | break 271 | return (not exist_invalid_char) 272 | 273 | @classmethod 274 | def checkInvalidExtention(cls, url): 275 | dotpos = url.rfind('.') + 1 276 | typestr = url[dotpos:].lower() 277 | return (typestr not in cls.invalid_extention) 278 | 279 | @staticmethod 280 | def isSameDomain(first_url, second_url): 281 | fhost = urlparse.urlparse(first_url).netloc 282 | shost = urlparse.urlparse(second_url).netloc 283 | return (domain.GetFirstLevelDomain(fhost) == 284 | domain.GetFirstLevelDomain(shost)) 285 | 286 | @staticmethod 287 | def isSameHost(first_url, second_url): 288 | return urlparse.urlparse(first_url).netloc == urlparse.urlparse(second_url).netloc 289 | 290 | @staticmethod 291 | def isSameSuffixWithoutWWW(first_url, second_url): 292 | fhost = '.' + urlparse.urlparse(first_url).netloc 293 | shost = '.' + urlparse.urlparse(second_url).netloc 294 | 295 | if shost[:5] == '.www.': 296 | shost = shost[5:] 297 | 298 | if fhost.find(shost) != -1: 299 | return True 300 | else: 301 | return False 302 | 303 | # check whether first_url has the suffix second_url 304 | @staticmethod 305 | def isSameSuffix(first_url, second_url): 306 | fhost = '.' + urlparse.urlparse(first_url).netloc 307 | shost = '.' + urlparse.urlparse(second_url).netloc 308 | 309 | if fhost.find(shost) != -1: 310 | return True 311 | else: 312 | return False 313 | 314 | 315 | # remove similary urls 316 | @staticmethod 317 | def uniq(urls, rule=UniqRule()): 318 | result = {} 319 | for u in urls: 320 | try: 321 | urlobj = UrlObject(u, rule) 322 | except Exception: 323 | result[hash(u)] = u 324 | continue 325 | result.setdefault(urlobj.hashcode, u) 326 | return result.values() 327 | 328 | 329 | class TestHtmlAnalyzer(unittest.TestCase): 330 | 331 | url = "http://www.sina.com.cn" 332 | charset = 'gb2312' 333 | 334 | def setUp(self): 335 | import requests 336 | r = requests.get(self.url) 337 | r.encoding = self.charset 338 | self.html = r.text 339 | 340 | def testDetectCharSet(self): 341 | charset = HtmlAnalyzer.detectCharSet(self.html) 342 | self.assertEqual(charset, self.charset) 343 | 344 | def testExtractLinks(self): 345 | links = [] 346 | for link in HtmlAnalyzer.extractLinks(self.html, self.url, self.charset): 347 | links.append(link) 348 | self.assertGreater(len(links), 1000) 349 | 350 | 351 | class TestUrlFilter(unittest.TestCase): 352 | 353 | def testCheckScheme(self): 354 | url1 = "http://www.sina.com.cn" 355 | url2 = "javascript:void(0)" 356 | url3 = "mailto:kenshin.acs@gmail.com" 357 | self.assert_(UrlFilter.checkScheme(url1)) 358 | self.assertFalse(UrlFilter.checkScheme(url2)) 359 | self.assertFalse(UrlFilter.checkScheme(url3)) 360 | 361 | def testCheckInvalidChar(self): 362 | url1 = "http://www.sina.com.cn" 363 | url2 = "http://www.sina.com.cn+" 364 | self.assert_(UrlFilter.checkInvalidChar(url1)) 365 | self.assertFalse(UrlFilter.checkInvalidChar(url2)) 366 | 367 | def testCheckInvalidExtention(self): 368 | url1 = "http://www.sina.com.cn" 369 | url2 = "http://www.sina.com.cn/hack.pdf" 370 | 
self.assert_(UrlFilter.checkInvalidExtention(url1)) 371 | self.assertFalse(UrlFilter.checkInvalidExtention(url2)) 372 | 373 | def testIsSameDomain(self): 374 | url1 = "http://www.sina.com.cn" 375 | url2 = "http://www.sina.com" 376 | url3 = "http://news.sina.com.cn" 377 | self.assertFalse(UrlFilter.isSameDomain(url1, url2)) 378 | self.assert_(UrlFilter.isSameDomain(url1, url3)) 379 | 380 | def testIsSameHost(self): 381 | url1 = "http://www.sina.com.cn" 382 | url2 = "http://news.sina.com.cn" 383 | url3 = "http://www.sina.com.cn/news/" 384 | self.assertFalse(UrlFilter.isSameHost(url1, url2)) 385 | self.assert_(UrlFilter.isSameHost(url1, url3)) 386 | 387 | def testIsSameSuffixWithoutWWW(self): 388 | url1 = "http://news.sina.com.cn" 389 | url2 = "http://www.news.sina.com.cn" 390 | url3 = "http://www.sina.com.cn" 391 | self.assert_(UrlFilter.isSameSuffixWithoutWWW(url1, url2)) 392 | self.assert_(UrlFilter.isSameSuffixWithoutWWW(url1, url3)) 393 | 394 | def testIsSameSuffix(self): 395 | url1 = "http://news.sina.com.cn" 396 | url2 = "http://www.news.sina.com.cn" 397 | url3 = "http://sina.com.cn" 398 | self.assertFalse(UrlFilter.isSameSuffix(url1, url2)) 399 | self.assert_(UrlFilter.isSameSuffix(url1, url3)) 400 | 401 | 402 | if __name__ == '__main__': 403 | unittest.main() 404 | --------------------------------------------------------------------------------
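
The README's Example section runs the spider with the default `Strategy`. As referenced there, here is a minimal sketch of a customized run, using only the `Strategy` and `Spider` APIs defined in `spider.py` above; the root URL and the extra User-Agent header are placeholders.

```
import logging
from spider import Spider, Strategy

logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s %(levelname)s %(message)s')

# limit depth and volume, stay on the root's registrable domain,
# and stop after ten minutes even if max_count is never reached
strategy = Strategy(max_depth=3, max_count=500, concurrency=10,
                    timeout=15, time=600,
                    same_host=False, same_domain=True,
                    headers={'User-Agent': 'my-crawler/0.1'})

spider = Spider(strategy)
spider.setRootUrl("http://www.sina.com.cn")
spider.run()                # blocks until the crawl stops

print spider.dump()         # the collected URLs, one per line
```

`run()` returns once the URL budget, the time limit, or an exhausted queue stops the crawl; `dump()` then renders everything accumulated in `spider.urltable`.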