├── .gitignore
├── README.txt
├── setup.py
├── test_urlnorm.py
└── urlnorm.py

/.gitignore:
--------------------------------------------------------------------------------
1 | pip-log.txt
2 | build
3 | dist
4 | MANIFEST
5 | 
--------------------------------------------------------------------------------
/README.txt:
--------------------------------------------------------------------------------
1 | urlnorm.py
2 | ==========
3 | 
4 | Normalize a URL to a standard unicode representation
5 | 
6 | urlnorm normalizes a URL by:
7 | 
8 | * lowercasing the scheme and hostname
9 | * converting the hostname to IDN format
10 | * taking out default port if present (e.g., http://www.foo.com:80/)
11 | * collapsing the path (./, ../, etc)
12 | * removing the last character in the hostname if it is '.'
13 | * unquoting any % escaped characters (where possible)
14 | 
15 | Installation
16 | ============
17 | 
18 | pip install urlnorm
19 | 
20 | 
21 | Example
22 | =======
23 | 
24 | >>> import urlnorm
25 | >>> urlnorm.norm("http://xn--q-bga.com./u/u/../%72/l/")
26 | u'http://q\xe9.com/u/r/l/'
27 | 
28 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from distutils.core import setup
2 | 
3 | # also update in urlnorm.py
4 | version = '1.1.4'
5 | 
6 | setup(name='urlnorm',
7 |       version=version,
8 |       long_description=open("./README.txt", "r").read(),
9 |       description="Normalize a URL to a standard unicode encoding",
10 |       py_modules=['urlnorm'],
11 |       license='MIT License',
12 |       author='Jehiah Czebotar',
13 |       author_email='jehiah@gmail.com',
14 |       url='http://github.com/jehiah/urlnorm',
15 |       download_url="http://github.com/downloads/jehiah/urlnorm/urlnorm-%s.tar.gz" % version,
16 |       )
17 | 
--------------------------------------------------------------------------------
/test_urlnorm.py:
--------------------------------------------------------------------------------
1 | """
2 | this is a py.test test file
3 | """
4 | import urlnorm
5 | from urlnorm import _unicode
6 | 
7 | def pytest_generate_tests(metafunc):
8 |     if metafunc.function in [test_norms]:
9 |         """ test suite; some taken from RFC1808. Run with py.test"""
10 |         tests = {
11 |             'http://1113982867/': 'http://66.102.7.147/', # ip dword encoding
12 |             'http://www.thedraymin.co.uk:/main/?p=308': 'http://www.thedraymin.co.uk/main/?p=308', # empty port
13 |             'http://www.foo.com:80/foo': 'http://www.foo.com/foo',
14 |             'http://www.foo.com:8000/foo': 'http://www.foo.com:8000/foo',
15 |             'http://www.foo.com./foo/bar.html': 'http://www.foo.com/foo/bar.html',
16 |             'http://www.foo.com.:81/foo': 'http://www.foo.com:81/foo',
17 |             'http://www.foo.com/%7ebar': 'http://www.foo.com/~bar',
18 |             'http://www.foo.com/%7Ebar': 'http://www.foo.com/~bar',
19 |             'ftp://user:pass@ftp.foo.net/foo/bar': 'ftp://user:pass@ftp.foo.net/foo/bar',
20 |             'http://USER:pass@www.Example.COM/foo/bar': 'http://USER:pass@www.example.com/foo/bar',
21 |             'http://www.example.com./': 'http://www.example.com/',
22 |             'http://test.example/?a=%26&b=1': 'http://test.example/?a=%26&b=1', # should not un-encode the & that is part of a parameter value
23 |             'http://test.example/?a=%e3%82%82%26': 'http://test.example/?a=\xe3\x82\x82%26'.decode('utf8'), # should return a unicode character
24 |             # note: this breaks the internet for parameters that are positional (stupid nextel) and/or don't have an = sign
25 |             # 'http://test.example/?a=1&b=2&a=3': 'http://test.example/?a=1&a=3&b=2', # should be in sorted/grouped order
26 | 
27 |             # 'http://s.xn--q-bga.de/': 'http://s.q\xc3\xa9.de/'.decode('utf8'), # should be in idna format
28 |             'http://test.example/?': 'http://test.example/', # no trailing ?
29 |             'http://test.example?': 'http://test.example/', # with trailing /
30 |             'http://a.COM/path/?b&a' : 'http://a.com/path/?b&a',
31 |             # test utf8 and unicode
32 |             u'http://XBLA\u306eXbox.com': 'http://xbla\xe3\x81\xaexbox.com/'.decode('utf8'),
33 |             u'http://XBLA\u306eXbox.com'.encode('utf8'): 'http://xbla\xe3\x81\xaexbox.com/'.decode('utf8'),
34 |             u'http://XBLA\u306eXbox.com': 'http://xbla\xe3\x81\xaexbox.com/'.decode('utf8'),
35 |             # test idna + utf8 domain
36 |             # u'http://xn--q-bga.XBLA\u306eXbox.com'.encode('utf8'): 'http://q\xc3\xa9.xbla\xe3\x81\xaexbox.com'.decode('utf8'),
37 |             'http://ja.wikipedia.org/wiki/%E3%82%AD%E3%83%A3%E3%82%BF%E3%83%94%E3%83%A9%E3%83%BC%E3%82%B8%E3%83%A3%E3%83%91%E3%83%B3': 'http://ja.wikipedia.org/wiki/\xe3\x82\xad\xe3\x83\xa3\xe3\x82\xbf\xe3\x83\x94\xe3\x83\xa9\xe3\x83\xbc\xe3\x82\xb8\xe3\x83\xa3\xe3\x83\x91\xe3\x83\xb3'.decode('utf8'),
38 |             'http://test.example/\xe3\x82\xad': 'http://test.example/\xe3\x82\xad',
39 | 
40 |             # check that %23 (#) is not escaped where it shouldn't be
41 |             'http://test.example/?p=%23val#test-%23-val%25': 'http://test.example/?p=%23val#test-%23-val%25',
42 |             # check that %25 is not unescaped to %
43 |             'http://test.example/%25/?p=val%25ue' : 'http://test.example/%25/?p=val%25ue',
44 |             "http://test.domain/I%C3%B1t%C3%ABrn%C3%A2ti%C3%B4n%EF%BF%BDliz%C3%A6ti%C3%B8n" : "http://test.domain/I\xc3\xb1t\xc3\xabrn\xc3\xa2ti\xc3\xb4n\xef\xbf\xbdliz\xc3\xa6ti\xc3\xb8n",
45 |             # check that %20 in paths, params, query strings, and fragments are unescaped
46 |             'http://test.example/abcde%20def?que%20ry=str%20ing#frag%20ment' : 'http://test.example/abcde def?que ry=str ing#frag ment',
47 |             # check that literal spaces and '+' in paths are left untouched
48 |             "http://test.example/path;par%20ams/with a%20space+/" : "http://test.example/path;par ams/with a space+/", # spaces in paths are ok
49 |             "http://[2001:db8:1f70::999:de8:7648:6e8]/test" : "http://[2001:db8:1f70::999:de8:7648:6e8]/test", #ipv6 address
50 |             "http://[::ffff:192.168.1.1]/test" : "http://[::ffff:192.168.1.1]/test", # ipv4 address in ipv6 notation
51 |             "http://[::ffff:192.168.1.1]:80/test" : "http://[::ffff:192.168.1.1]/test", # ipv4 address in ipv6 notation
52 |             "htTps://[::fFff:192.168.1.1]:443/test" : "https://[::ffff:192.168.1.1]/test", # ipv4 address in ipv6 notation
53 | 
54 |             # python 2.5 urlparse doesn't handle unknown protocols, so skipping this for now
55 |             #"itms://itunes.apple.com/us/app/touch-pets-cats/id379475816?mt=8#23161525,,1293732683083,260430,tw" : "itms://itunes.apple.com/us/app/touch-pets-cats/id379475816?mt=8#23161525,,1293732683083,260430,tw", #can handle itms://
56 | 
57 |         }
58 |         for bad, good in tests.items():
59 |             metafunc.addcall(funcargs=dict(bad=bad, good=good))
60 | 
61 |     elif metafunc.function == test_unquote:
62 |         for bad, good, unsafe in (
63 |             ('%20', ' ', ''),
64 |             ('%3f', '%3F', '?'), # don't unquote it, but uppercase it
65 |             ('%E3%82%AD', u'\u30ad', ''),
66 |         ):
67 |             metafunc.addcall(funcargs=dict(bad=bad, good=good, unsafe=unsafe))
68 | 
69 |     elif metafunc.function in [test_invalid_urls]:
70 |         for url in [
71 |             'http://http://www.exemple.com/', # invalid domain
72 |             '-',
73 |             'asdf',
74 |             'HTTP://4294967297/test', # exceeds the maximum 32-bit IP address
75 |             'http://[img]http://i790.photobucket.com/albums/yy185/zack-32009/jordan.jpg[/IMG]',
76 |         ]:
77 |             metafunc.addcall(funcargs=dict(url=url))
78 |     elif metafunc.function == test_norm_path:
79 |         tests = {
80 |             '/foo/bar/.': '/foo/bar/',
81 |             '/foo/bar/./': '/foo/bar/',
82 |             '/foo/bar/..': '/foo/',
83 |             '/foo/bar/../': '/foo/',
84 |             '/foo/bar/../baz': '/foo/baz',
85 |             '/foo/bar/../..': '/',
86 |             '/foo/bar/../../': '/',
87 |             '/foo/bar/../../baz': '/baz',
88 |             '/foo/bar/../../../baz': '/../baz',
89 |             '/foo/bar/../../../../baz': '/baz',
90 |             '/./foo': '/foo',
91 |             '/../foo': '/../foo',
92 |             '/foo.': '/foo.',
93 |             '/.foo': '/.foo',
94 |             '/foo..': '/foo..',
95 |             '/..foo': '/..foo',
96 |             '/./../foo': '/../foo',
97 |             '/./foo/.': '/foo/',
98 |             '/foo/./bar': '/foo/bar',
99 |             '/foo/../bar': '/bar',
100 |             '/foo//': '/foo/',
101 |             '/foo///bar//': '/foo/bar/',
102 |         }
103 |         for bad, good in tests.items():
104 |             metafunc.addcall(funcargs=dict(bad=bad, good=good))
105 | 
106 | def test_invalid_urls(url):
107 |     try:
108 |         output = urlnorm.norm(url)
109 |         print '%r' % output
110 |     except urlnorm.InvalidUrl:
111 |         return
112 |     assert 1 == 0, "this should have raised an InvalidUrl exception"
113 | 
114 | def test_unquote(bad, good, unsafe):
115 |     output = urlnorm.unquote_safe(bad, unsafe)
116 |     assert output == good
117 | 
118 | def test_norms(bad, good):
119 |     new_url = urlnorm.norm(bad)
120 |     assert new_url == _unicode(good)
121 | 
122 | def test_norm_path(bad, good):
123 |     output = urlnorm.norm_path("http", bad)
124 |     assert output == _unicode(good)
125 | 
--------------------------------------------------------------------------------
/urlnorm.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf8 -*-
3 | 
4 | """
5 | urlnorm.py - URL normalization routines
6 | 
7 | urlnorm normalizes a URL by:
8 | * lowercasing the scheme and hostname
9 | * converting the hostname to IDN format
10 | * taking out default port if present (e.g., http://www.foo.com:80/)
11 | * collapsing the path (./, ../, //, etc)
12 | * removing the last character in the hostname if it is '.'
13 | * unescaping any percent escape sequences (where possible)
14 | * uppercasing percent escapes that are not unquoted (e.g. %3f => %3F)
15 | * leaving literal spaces and '+' characters untouched
16 | * converting an IP given as an integer (dword) to dotted-quad notation
17 | 
18 | Available functions:
19 | norm - given a URL (string), returns a normalized URL
20 | norm_netloc
21 | norm_path
22 | unquote_path
23 | unquote_params
24 | unquote_qs
25 | unquote_fragment
26 | 
27 | 
28 | CHANGES:
29 | 1.1.4 - unescape " " in params, query string, and fragments
30 | 1.1.3 - don't escape " " in path
31 | 1.1.2 - leave %20 as %20, collate ' ' to %20, leave '+' as '+'
32 | 1.1 - collate %20 and ' ' to '+'
33 | 1.1 - fix unescaping of parameters
34 | 1.1 - added int2ip
35 | 1.0.1 - fix problem unescaping %23 and %20 in query string
36 | 1.0 - new release
37 | 0.94 - idna handling, unescaping querystring, fragment, add ws + wss ports
38 | 0.92 - unknown schemes now pass the port through silently
39 | 0.91 - general cleanup
40 |     - changed dictionaries to lists where appropriate
41 |     - more fine-grained authority parsing and normalisation
42 | """
43 | 
44 | __license__ = """
45 | Copyright (c) 1999-2002 Mark Nottingham
46 | Copyright (c) 2010 Jehiah Czebotar
47 | 
48 | Permission is hereby granted, free of charge, to any person obtaining a copy
49 | of this software and associated documentation files (the "Software"), to deal
50 | in the Software without restriction, including without limitation the rights
51 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
52 | copies of the Software, and to permit persons to whom the Software is
53 | furnished to do so, subject to the following conditions:
54 | 
55 | The above copyright notice and this permission notice shall be included in all
56 | copies or substantial portions of the Software.
57 | 
58 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
59 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
60 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
61 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
62 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
63 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
64 | SOFTWARE.
65 | """ 66 | 67 | # also update in setup.py 68 | __version__ = "1.1.4" 69 | 70 | from urlparse import urlparse, urlunparse 71 | from string import lower 72 | import re 73 | 74 | 75 | class InvalidUrl(Exception): 76 | pass 77 | 78 | _server_authority = re.compile('^(?:([^\@]+)\@)?([^\:\[\]]+|\[[a-fA-F0-9\:\.]+\])(?:\:(.*?))?$') 79 | _default_port = {'http': '80', 80 | 'itms': '80', 81 | 'ws': '80', 82 | 'https': '443', 83 | 'wss': '443', 84 | 'gopher': '70', 85 | 'news': '119', 86 | 'snews': '563', 87 | 'nntp': '119', 88 | 'snntp': '563', 89 | 'ftp': '21', 90 | 'telnet': '23', 91 | 'prospero': '191', 92 | } 93 | _relative_schemes = set(['http', 94 | 'https', 95 | 'ws', 96 | 'wss', 97 | 'itms', 98 | 'news', 99 | 'snews', 100 | 'nntp', 101 | 'snntp', 102 | 'ftp', 103 | 'file', 104 | '' 105 | ]) 106 | 107 | params_unsafe_list = set('?=+%#;') 108 | qs_unsafe_list = set('?&=+%#') 109 | fragment_unsafe_list = set('+%#') 110 | path_unsafe_list = set('/?;%+#') 111 | _hextochr = dict(('%02x' % i, chr(i)) for i in range(256)) 112 | _hextochr.update(('%02X' % i, chr(i)) for i in range(256)) 113 | 114 | 115 | def unquote_path(s): 116 | return unquote_safe(s, path_unsafe_list) 117 | 118 | 119 | def unquote_params(s): 120 | return unquote_safe(s, params_unsafe_list) 121 | 122 | 123 | def unquote_qs(s): 124 | return unquote_safe(s, qs_unsafe_list) 125 | 126 | 127 | def unquote_fragment(s): 128 | return unquote_safe(s, fragment_unsafe_list) 129 | 130 | 131 | def unquote_safe(s, unsafe_list): 132 | """unquote percent escaped string except for percent escape sequences that are in unsafe_list""" 133 | # note: this build utf8 raw strings ,then does a .decode('utf8') at the end. 134 | # as a result it's doing .encode('utf8') on each block of the string as it's processed. 
135 |     res = _utf8(s).split('%')
136 |     for i in xrange(1, len(res)):
137 |         item = res[i]
138 |         try:
139 |             raw_chr = _hextochr[item[:2]]
140 |             if raw_chr in unsafe_list or ord(raw_chr) < 20:
141 |                 # leave it unescaped (but uppercase the percent escape)
142 |                 res[i] = '%' + item[:2].upper() + item[2:]
143 |             else:
144 |                 res[i] = raw_chr + item[2:]
145 |         except KeyError:
146 |             res[i] = '%' + item
147 |         except UnicodeDecodeError:
148 |             # build the character straight from its hex value (mirrors urllib.unquote)
149 |             res[i] = unichr(int(item[:2], 16)) + item[2:]
150 |     o = "".join(res)
151 |     return _unicode(o)
152 | 
153 | 
154 | def norm(url):
155 |     """given a string URL, return its normalized/unicode form"""
156 |     url = _unicode(url) # operate on unicode strings
157 |     url_tuple = urlparse(url)
158 |     normalized_tuple = norm_tuple(*url_tuple)
159 |     return urlunparse(normalized_tuple)
160 | 
161 | def norm_tuple(scheme, authority, path, parameters, query, fragment):
162 |     """given individual url components, return its normalized form"""
163 |     scheme = lower(scheme)
164 |     if not scheme:
165 |         raise InvalidUrl('missing URL scheme')
166 |     authority = norm_netloc(scheme, authority)
167 |     if not authority:
168 |         raise InvalidUrl('missing netloc')
169 |     path = norm_path(scheme, path)
170 |     # TODO: put query in sorted order; or at least group parameters together
171 |     # Note that some websites use positional parameters or the name part of a query so this would break the internet
172 |     # query = urlencode(parse_qs(query, keep_blank_values=1), doseq=1)
173 |     parameters = unquote_params(parameters)
174 |     query = unquote_qs(query)
175 |     fragment = unquote_fragment(fragment)
176 |     return (scheme, authority, path, parameters, query, fragment)
177 | 
178 | 
179 | def norm_path(scheme, path):
180 |     if scheme in _relative_schemes:
181 |         # resolve `/../` and `/./` and `//` components in path as appropriate
182 |         i = 0
183 |         parts = []
184 |         start = 0
185 |         while i < len(path):
186 |             if path[i] == "/" or i == len(path) - 1:
187 |                 chunk = path[start:i+1]
188 |                 start = i + 1
189 |                 if chunk in ["", "/", ".", "./"]:
190 |                     # do nothing
191 |                     pass
192 |                 elif chunk in ["..", "../"]:
193 |                     if len(parts):
194 |                         parts = parts[:len(parts)-1]
195 |                     else:
196 |                         parts.append(chunk)
197 |                 else:
198 |                     parts.append(chunk)
199 |             i += 1
200 |         path = "/" + ("".join(parts))
201 |     path = unquote_path(path)
202 |     if not path:
203 |         return '/'
204 |     return path
205 | 
206 | MAX_IP = 0xffffffffL
207 | 
208 | 
209 | def int2ip(ipnum):
210 |     assert isinstance(ipnum, int)
211 |     if MAX_IP < ipnum or ipnum < 0:
212 |         raise TypeError("expected int between 0 and %d inclusive" % MAX_IP)
213 |     ip1 = ipnum >> 24
214 |     ip2 = ipnum >> 16 & 0xFF
215 |     ip3 = ipnum >> 8 & 0xFF
216 |     ip4 = ipnum & 0xFF
217 |     return "%d.%d.%d.%d" % (ip1, ip2, ip3, ip4)
218 | 
219 | 
220 | def norm_netloc(scheme, netloc):
221 |     if not netloc:
222 |         return netloc
223 |     match = _server_authority.match(netloc)
224 |     if not match:
225 |         raise InvalidUrl('no host in netloc %r' % netloc)
226 | 
227 |     userinfo, host, port = match.groups()
228 |     # catch a few common errors:
229 |     if host.isdigit():
230 |         try:
231 |             host = int2ip(int(host))
232 |         except TypeError:
233 |             raise InvalidUrl('host %r does not escape to a valid ip' % host)
234 |     if host[-1] == '.':
235 |         host = host[:-1]
236 | 
237 |     # bracket check is for ipv6 hosts
238 |     if '.' not in host and not (host[0] == '[' and host[-1] == ']'):
239 |         raise InvalidUrl('host %r is not valid' % host)
240 | 
241 |     authority = lower(host)
242 |     if 'xn--' in authority:
243 |         subdomains = [_idn(subdomain) for subdomain in authority.split('.')]
244 |         authority = '.'.join(subdomains)
245 | 
246 |     if userinfo:
247 |         authority = "%s@%s" % (userinfo, authority)
248 |     if port and port != _default_port.get(scheme, None):
249 |         authority = "%s:%s" % (authority, port)
250 |     return authority
251 | 
252 | 
253 | def _idn(subdomain):
254 |     if subdomain.startswith('xn--'):
255 |         try:
256 |             subdomain = subdomain.decode('idna')
257 |         except UnicodeError:
258 |             raise InvalidUrl('Error converting subdomain %r to IDN' % subdomain)
259 |     return subdomain
260 | 
261 | 
262 | def _utf8(value):
263 |     if isinstance(value, unicode):
264 |         return value.encode("utf-8")
265 |     assert isinstance(value, str)
266 |     return value
267 | 
268 | 
269 | def _unicode(value):
270 |     if isinstance(value, str):
271 |         return value.decode("utf-8")
272 |     assert isinstance(value, unicode)
273 |     return value
274 | 
--------------------------------------------------------------------------------
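
A minimal usage sketch follows; it is not a file from the repository above. The functions, inputs, and expected values are taken from README.txt and test_urlnorm.py, and the snippet assumes Python 2, which urlnorm 1.1.4 targets.

import urlnorm

# norm() returns a unicode URL with the scheme and host lowercased, the default
# port removed, the path collapsed, and safe percent escapes unquoted.
assert urlnorm.norm("http://www.foo.com:80/foo") == u"http://www.foo.com/foo"
assert urlnorm.norm("http://xn--q-bga.com./u/u/../%72/l/") == u"http://q\xe9.com/u/r/l/"

# norm_path() collapses ./ and ../ segments and duplicate slashes for a given scheme.
assert urlnorm.norm_path("http", "/foo/bar/../baz") == u"/foo/baz"

# unquote_safe() leaves escapes for "unsafe" characters quoted, but uppercases them.
assert urlnorm.unquote_safe("%3f", "?") == u"%3F"

# Anything that cannot be normalized raises urlnorm.InvalidUrl.
try:
    urlnorm.norm("asdf")
    raise AssertionError("expected InvalidUrl")
except urlnorm.InvalidUrl:
    pass

Note that norm() always returns a unicode string and signals failure by raising InvalidUrl rather than by returning a sentinel value.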