├── .gitignore
├── README.txt
├── setup.py
├── test_urlnorm.py
└── urlnorm.py

/.gitignore:
--------------------------------------------------------------------------------
1 | pip-log.txt
2 | build
3 | dist
4 | MANIFEST
5 | 
--------------------------------------------------------------------------------
/README.txt:
--------------------------------------------------------------------------------
1 | urlnorm.py
2 | ==========
3 | 
4 | Normalize a URL to a standard unicode representation
5 | 
6 | urlnorm normalizes a URL by:
7 | 
8 | * lowercasing the scheme and hostname
9 | * converting the hostname to IDN format
10 | * taking out default port if present (e.g., http://www.foo.com:80/)
11 | * collapsing the path (./, ../, etc)
12 | * removing the last character in the hostname if it is '.'
13 | * unquoting any % escaped characters (where possible)
14 | 
15 | Installation
16 | ============
17 | 
18 | pip install urlnorm
19 | 
20 | 
21 | Example
22 | =======
23 | 
24 | >>> import urlnorm
25 | >>> urlnorm.norm("http://xn--q-bga.com./u/u/../%72/l/")
26 | u'http://q\xe9.com/u/r/l/'
27 | 
28 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from distutils.core import setup
2 | 
3 | # also update in urlnorm.py
4 | version = '1.1.4'
5 | 
6 | setup(name='urlnorm',
7 |       version=version,
8 |       long_description=open("./README.txt", "r").read(),
9 |       description="Normalize a URL to a standard unicode encoding",
10 |       py_modules=['urlnorm'],
11 |       license='MIT License',
12 |       author='Jehiah Czebotar',
13 |       author_email='jehiah@gmail.com',
14 |       url='http://github.com/jehiah/urlnorm',
15 |       download_url="http://github.com/downloads/jehiah/urlnorm/urlnorm-%s.tar.gz" % version,
16 |       )
17 | 
--------------------------------------------------------------------------------
/test_urlnorm.py:
--------------------------------------------------------------------------------
1 | """
2 | this is a py.test test file
3 | """
4 | import urlnorm
5 | from urlnorm import _unicode
6 | 
7 | def pytest_generate_tests(metafunc):
8 |     if metafunc.function in [test_norms]:
9 |         """ test suite; some taken from RFC1808. Run with py.test"""
10 |         tests = {
11 |             'http://1113982867/': 'http://66.102.7.147/', # ip dword encoding
12 |             'http://www.thedraymin.co.uk:/main/?p=308': 'http://www.thedraymin.co.uk/main/?p=308', # empty port
13 |             'http://www.foo.com:80/foo': 'http://www.foo.com/foo',
14 |             'http://www.foo.com:8000/foo': 'http://www.foo.com:8000/foo',
15 |             'http://www.foo.com./foo/bar.html': 'http://www.foo.com/foo/bar.html',
16 |             'http://www.foo.com.:81/foo': 'http://www.foo.com:81/foo',
17 |             'http://www.foo.com/%7ebar': 'http://www.foo.com/~bar',
18 |             'http://www.foo.com/%7Ebar': 'http://www.foo.com/~bar',
19 |             'ftp://user:pass@ftp.foo.net/foo/bar': 'ftp://user:pass@ftp.foo.net/foo/bar',
20 |             'http://USER:pass@www.Example.COM/foo/bar': 'http://USER:pass@www.example.com/foo/bar',
21 |             'http://www.example.com./': 'http://www.example.com/',
22 |             'http://test.example/?a=%26&b=1': 'http://test.example/?a=%26&b=1', # should not un-encode the & that is part of a parameter value
23 |             'http://test.example/?a=%e3%82%82%26': 'http://test.example/?a=\xe3\x82\x82%26'.decode('utf8'), # should return a unicode character
24 |             # note: this breaks the internet for parameters that are positional (stupid nextel) and/or don't have an = sign
25 |             # 'http://test.example/?a=1&b=2&a=3': 'http://test.example/?a=1&a=3&b=2', # should be in sorted/grouped order
26 | 
27 |             # 'http://s.xn--q-bga.de/': 'http://s.q\xc3\xa9.de/'.decode('utf8'), # should be in idna format
28 |             'http://test.example/?': 'http://test.example/', # no trailing ?
29 |             'http://test.example?': 'http://test.example/', # with trailing /
30 |             'http://a.COM/path/?b&a' : 'http://a.com/path/?b&a',
31 |             # test utf8 and unicode
32 |             u'http://XBLA\u306eXbox.com': 'http://xbla\xe3\x81\xaexbox.com/'.decode('utf8'),
33 |             u'http://XBLA\u306eXbox.com'.encode('utf8'): 'http://xbla\xe3\x81\xaexbox.com/'.decode('utf8'),
34 |             u'http://XBLA\u306eXbox.com': 'http://xbla\xe3\x81\xaexbox.com/'.decode('utf8'),
35 |             # test idna + utf8 domain
36 |             # u'http://xn--q-bga.XBLA\u306eXbox.com'.encode('utf8'): 'http://q\xc3\xa9.xbla\xe3\x81\xaexbox.com'.decode('utf8'),
37 |             'http://ja.wikipedia.org/wiki/%E3%82%AD%E3%83%A3%E3%82%BF%E3%83%94%E3%83%A9%E3%83%BC%E3%82%B8%E3%83%A3%E3%83%91%E3%83%B3': 'http://ja.wikipedia.org/wiki/\xe3\x82\xad\xe3\x83\xa3\xe3\x82\xbf\xe3\x83\x94\xe3\x83\xa9\xe3\x83\xbc\xe3\x82\xb8\xe3\x83\xa3\xe3\x83\x91\xe3\x83\xb3'.decode('utf8'),
38 |             'http://test.example/\xe3\x82\xad': 'http://test.example/\xe3\x82\xad',
39 | 
40 |             # check that %23 (#) is not escaped where it shouldn't be
41 |             'http://test.example/?p=%23val#test-%23-val%25': 'http://test.example/?p=%23val#test-%23-val%25',
42 |             # check that %25 is not unescaped to %
43 |             'http://test.example/%25/?p=val%25ue' : 'http://test.example/%25/?p=val%25ue',
44 |             "http://test.domain/I%C3%B1t%C3%ABrn%C3%A2ti%C3%B4n%EF%BF%BDliz%C3%A6ti%C3%B8n" : "http://test.domain/I\xc3\xb1t\xc3\xabrn\xc3\xa2ti\xc3\xb4n\xef\xbf\xbdliz\xc3\xa6ti\xc3\xb8n",
45 |             # check that %20 in paths, params, query strings, and fragments are unescaped
46 |             'http://test.example/abcde%20def?que%20ry=str%20ing#frag%20ment' : 'http://test.example/abcde def?que ry=str ing#frag ment',
47 |             # check that literal spaces and '+' in paths are left untouched
48 |             "http://test.example/path;par%20ams/with a%20space+/" : "http://test.example/path;par ams/with a space+/", # spaces in paths are ok
49 |             "http://[2001:db8:1f70::999:de8:7648:6e8]/test" : "http://[2001:db8:1f70::999:de8:7648:6e8]/test", #ipv6 address
50 |             "http://[::ffff:192.168.1.1]/test" : "http://[::ffff:192.168.1.1]/test", # ipv4 address in ipv6 notation
51 |             "http://[::ffff:192.168.1.1]:80/test" : "http://[::ffff:192.168.1.1]/test", # ipv4 address in ipv6 notation
52 |             "htTps://[::fFff:192.168.1.1]:443/test" : "https://[::ffff:192.168.1.1]/test", # ipv4 address in ipv6 notation
53 | 
54 |             # python 2.5 urlparse doesn't handle unknown protocols, so skipping this for now
55 |             #"itms://itunes.apple.com/us/app/touch-pets-cats/id379475816?mt=8#23161525,,1293732683083,260430,tw" : "itms://itunes.apple.com/us/app/touch-pets-cats/id379475816?mt=8#23161525,,1293732683083,260430,tw", #can handle itms://
56 | 
57 |         }
58 |         for bad, good in tests.items():
59 |             metafunc.addcall(funcargs=dict(bad=bad, good=good))
60 | 
61 |     elif metafunc.function == test_unquote:
62 |         for bad, good, unsafe in (
63 |             ('%20', ' ', ''),
64 |             ('%3f', '%3F', '?'), # don't unquote it, but uppercase it
65 |             ('%E3%82%AD', u'\u30ad', ''),
66 |         ):
67 |             metafunc.addcall(funcargs=dict(bad=bad, good=good, unsafe=unsafe))
68 | 
69 |     elif metafunc.function in [test_invalid_urls]:
70 |         for url in [
71 |             'http://http://www.exemple.com/', # invalid domain
72 |             '-',
73 |             'asdf',
74 |             'HTTP://4294967297/test', # exceeds the maximum 32-bit IP address
75 |             'http://[img]http://i790.photobucket.com/albums/yy185/zack-32009/jordan.jpg[/IMG]',
76 |         ]:
77 |             metafunc.addcall(funcargs=dict(url=url))
78 |     elif metafunc.function == test_norm_path:
79 |         tests = {
80 |             '/foo/bar/.': '/foo/bar/',
81 |             '/foo/bar/./': '/foo/bar/',
82 |             '/foo/bar/..': '/foo/',
83 |             '/foo/bar/../': '/foo/',
84 |             '/foo/bar/../baz': '/foo/baz',
85 |             '/foo/bar/../..': '/',
86 |             '/foo/bar/../../': '/',
87 |             '/foo/bar/../../baz': '/baz',
88 |             '/foo/bar/../../../baz': '/../baz',
89 |             '/foo/bar/../../../../baz': '/baz',
90 |             '/./foo': '/foo',
91 |             '/../foo': '/../foo',
92 |             '/foo.': '/foo.',
93 |             '/.foo': '/.foo',
94 |             '/foo..': '/foo..',
95 |             '/..foo': '/..foo',
96 |             '/./../foo': '/../foo',
97 |             '/./foo/.': '/foo/',
98 |             '/foo/./bar': '/foo/bar',
99 |             '/foo/../bar': '/bar',
100 |             '/foo//': '/foo/',
101 |             '/foo///bar//': '/foo/bar/',
102 |         }
103 |         for bad, good in tests.items():
104 |             metafunc.addcall(funcargs=dict(bad=bad, good=good))
105 | 
106 | def test_invalid_urls(url):
107 |     try:
108 |         output = urlnorm.norm(url)
109 |         print '%r' % output
110 |     except urlnorm.InvalidUrl:
111 |         return
112 |     assert 1 == 0, "this should have raised an InvalidUrl exception"
113 | 
114 | def test_unquote(bad, good, unsafe):
115 |     output = urlnorm.unquote_safe(bad, unsafe)
116 |     assert output == good
117 | 
118 | def test_norms(bad, good):
119 |     new_url = urlnorm.norm(bad)
120 |     assert new_url == _unicode(good)
121 | 
122 | def test_norm_path(bad, good):
123 |     output = urlnorm.norm_path("http", bad)
124 |     assert output == _unicode(good)
125 | 
--------------------------------------------------------------------------------
/urlnorm.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf8 -*-
3 | 
4 | """
5 | urlnorm.py - URL normalization routines
6 | 
7 | urlnorm normalizes a URL by:
8 | * lowercasing the scheme and hostname
9 | * converting the hostname to IDN format
10 | * taking out default port if present (e.g., http://www.foo.com:80/)
11 | * collapsing the path (./, ../, //, etc)
12 | * removing the last character in the hostname if it is '.'
13 | * unescaping any percent escape sequences (where possible)
14 | * uppercasing percent escapes that are not unquoted (e.g. %3f => %3F)
15 | * leaving literal spaces and '+' characters untouched
16 | * converting an IP given as an integer (dword) to dotted-quad notation
17 | 
18 | Available functions:
19 | norm - given a URL (string), returns a normalized URL
20 | norm_netloc
21 | norm_path
22 | unquote_path
23 | unquote_params
24 | unquote_qs
25 | unquote_fragment
26 | 
27 | 
28 | CHANGES:
29 | 1.1.4 - unescape " " in params, query string, and fragments
30 | 1.1.3 - don't escape " " in path
31 | 1.1.2 - leave %20 as %20, collate ' ' to %20, leave '+' as '+'
32 | 1.1 - collate %20 and ' ' to '+'
33 | 1.1 - fix unescaping of parameters
34 | 1.1 - added int2ip
35 | 1.0.1 - fix problem unescaping %23 and %20 in query string
36 | 1.0 - new release
37 | 0.94 - idna handling, unescaping querystring, fragment, add ws + wss ports
38 | 0.92 - unknown schemes now pass the port through silently
39 | 0.91 - general cleanup
40 |     - changed dictionaries to lists where appropriate
41 |     - more fine-grained authority parsing and normalisation
42 | """
43 | 
44 | __license__ = """
45 | Copyright (c) 1999-2002 Mark Nottingham
46 | Copyright (c) 2010 Jehiah Czebotar
47 | 
48 | Permission is hereby granted, free of charge, to any person obtaining a copy
49 | of this software and associated documentation files (the "Software"), to deal
50 | in the Software without restriction, including without limitation the rights
51 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
52 | copies of the Software, and to permit persons to whom the Software is
53 | furnished to do so, subject to the following conditions:
54 | 
55 | The above copyright notice and this permission notice shall be included in all
56 | copies or substantial portions of the Software.
57 | 
58 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
59 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
60 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
61 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
62 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
63 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
64 | SOFTWARE.
65 | """ 66 | 67 | # also update in setup.py 68 | __version__ = "1.1.4" 69 | 70 | from urlparse import urlparse, urlunparse 71 | from string import lower 72 | import re 73 | 74 | 75 | class InvalidUrl(Exception): 76 | pass 77 | 78 | _server_authority = re.compile('^(?:([^\@]+)\@)?([^\:\[\]]+|\[[a-fA-F0-9\:\.]+\])(?:\:(.*?))?$') 79 | _default_port = {'http': '80', 80 | 'itms': '80', 81 | 'ws': '80', 82 | 'https': '443', 83 | 'wss': '443', 84 | 'gopher': '70', 85 | 'news': '119', 86 | 'snews': '563', 87 | 'nntp': '119', 88 | 'snntp': '563', 89 | 'ftp': '21', 90 | 'telnet': '23', 91 | 'prospero': '191', 92 | } 93 | _relative_schemes = set(['http', 94 | 'https', 95 | 'ws', 96 | 'wss', 97 | 'itms', 98 | 'news', 99 | 'snews', 100 | 'nntp', 101 | 'snntp', 102 | 'ftp', 103 | 'file', 104 | '' 105 | ]) 106 | 107 | params_unsafe_list = set('?=+%#;') 108 | qs_unsafe_list = set('?&=+%#') 109 | fragment_unsafe_list = set('+%#') 110 | path_unsafe_list = set('/?;%+#') 111 | _hextochr = dict(('%02x' % i, chr(i)) for i in range(256)) 112 | _hextochr.update(('%02X' % i, chr(i)) for i in range(256)) 113 | 114 | 115 | def unquote_path(s): 116 | return unquote_safe(s, path_unsafe_list) 117 | 118 | 119 | def unquote_params(s): 120 | return unquote_safe(s, params_unsafe_list) 121 | 122 | 123 | def unquote_qs(s): 124 | return unquote_safe(s, qs_unsafe_list) 125 | 126 | 127 | def unquote_fragment(s): 128 | return unquote_safe(s, fragment_unsafe_list) 129 | 130 | 131 | def unquote_safe(s, unsafe_list): 132 | """unquote percent escaped string except for percent escape sequences that are in unsafe_list""" 133 | # note: this build utf8 raw strings ,then does a .decode('utf8') at the end. 134 | # as a result it's doing .encode('utf8') on each block of the string as it's processed. 
135 |     res = _utf8(s).split('%')
136 |     for i in xrange(1, len(res)):
137 |         item = res[i]
138 |         try:
139 |             raw_chr = _hextochr[item[:2]]
140 |             if raw_chr in unsafe_list or ord(raw_chr) < 20:
141 |                 # leave it unescaped (but uppercase the percent escape)
142 |                 res[i] = '%' + item[:2].upper() + item[2:]
143 |             else:
144 |                 res[i] = raw_chr + item[2:]
145 |         except KeyError:
146 |             res[i] = '%' + item
147 |         except UnicodeDecodeError:
148 |             # build the character straight from its hex value (mirrors urllib.unquote)
149 |             res[i] = unichr(int(item[:2], 16)) + item[2:]
150 |     o = "".join(res)
151 |     return _unicode(o)
152 | 
153 | 
154 | def norm(url):
155 |     """given a string URL, return its normalized/unicode form"""
156 |     url = _unicode(url) # operate on unicode strings
157 |     url_tuple = urlparse(url)
158 |     normalized_tuple = norm_tuple(*url_tuple)
159 |     return urlunparse(normalized_tuple)
160 | 
161 | def norm_tuple(scheme, authority, path, parameters, query, fragment):
162 |     """given individual url components, return its normalized form"""
163 |     scheme = lower(scheme)
164 |     if not scheme:
165 |         raise InvalidUrl('missing URL scheme')
166 |     authority = norm_netloc(scheme, authority)
167 |     if not authority:
168 |         raise InvalidUrl('missing netloc')
169 |     path = norm_path(scheme, path)
170 |     # TODO: put query in sorted order; or at least group parameters together
171 |     # Note that some websites use positional parameters or the name part of a query so this would break the internet
172 |     # query = urlencode(parse_qs(query, keep_blank_values=1), doseq=1)
173 |     parameters = unquote_params(parameters)
174 |     query = unquote_qs(query)
175 |     fragment = unquote_fragment(fragment)
176 |     return (scheme, authority, path, parameters, query, fragment)
177 | 
178 | 
179 | def norm_path(scheme, path):
180 |     if scheme in _relative_schemes:
181 |         # resolve `/../` and `/./` and `//` components in path as appropriate
182 |         i = 0
183 |         parts = []
184 |         start = 0
185 |         while i < len(path):
186 |             if path[i] == "/" or i == len(path) - 1:
187 |                 chunk = path[start:i+1]
188 |                 start = i + 1
189 |                 if chunk in ["", "/", ".", "./"]:
190 |                     # do nothing
191 |                     pass
192 |                 elif chunk in ["..", "../"]:
193 |                     if len(parts):
194 |                         parts = parts[:len(parts)-1]
195 |                     else:
196 |                         parts.append(chunk)
197 |                 else:
198 |                     parts.append(chunk)
199 |             i += 1
200 |         path = "/" + ("".join(parts))
201 |     path = unquote_path(path)
202 |     if not path:
203 |         return '/'
204 |     return path
205 | 
206 | MAX_IP = 0xffffffffL
207 | 
208 | 
209 | def int2ip(ipnum):
210 |     assert isinstance(ipnum, int)
211 |     if MAX_IP < ipnum or ipnum < 0:
212 |         raise TypeError("expected int between 0 and %d inclusive" % MAX_IP)
213 |     ip1 = ipnum >> 24
214 |     ip2 = ipnum >> 16 & 0xFF
215 |     ip3 = ipnum >> 8 & 0xFF
216 |     ip4 = ipnum & 0xFF
217 |     return "%d.%d.%d.%d" % (ip1, ip2, ip3, ip4)
218 | 
219 | 
220 | def norm_netloc(scheme, netloc):
221 |     if not netloc:
222 |         return netloc
223 |     match = _server_authority.match(netloc)
224 |     if not match:
225 |         raise InvalidUrl('no host in netloc %r' % netloc)
226 | 
227 |     userinfo, host, port = match.groups()
228 |     # catch a few common errors:
229 |     if host.isdigit():
230 |         try:
231 |             host = int2ip(int(host))
232 |         except TypeError:
233 |             raise InvalidUrl('host %r does not escape to a valid ip' % host)
234 |     if host[-1] == '.':
235 |         host = host[:-1]
236 | 
237 |     # bracket check is for ipv6 hosts
238 |     if '.' not in host and not (host[0] == '[' and host[-1] == ']'):
239 |         raise InvalidUrl('host %r is not valid' % host)
240 | 
241 |     authority = lower(host)
242 |     if 'xn--' in authority:
243 |         subdomains = [_idn(subdomain) for subdomain in authority.split('.')]
244 |         authority = '.'.join(subdomains)
245 | 
246 |     if userinfo:
247 |         authority = "%s@%s" % (userinfo, authority)
248 |     if port and port != _default_port.get(scheme, None):
249 |         authority = "%s:%s" % (authority, port)
250 |     return authority
251 | 
252 | 
253 | def _idn(subdomain):
254 |     if subdomain.startswith('xn--'):
255 |         try:
256 |             subdomain = subdomain.decode('idna')
257 |         except UnicodeError:
258 |             raise InvalidUrl('Error converting subdomain %r to IDN' % subdomain)
259 |     return subdomain
260 | 
261 | 
262 | def _utf8(value):
263 |     if isinstance(value, unicode):
264 |         return value.encode("utf-8")
265 |     assert isinstance(value, str)
266 |     return value
267 | 
268 | 
269 | def _unicode(value):
270 |     if isinstance(value, str):
271 |         return value.decode("utf-8")
272 |     assert isinstance(value, unicode)
273 |     return value
274 | 
--------------------------------------------------------------------------------
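
A minimal usage sketch follows; it is not a file from the repository above. The functions, inputs, and expected values are taken from README.txt and test_urlnorm.py, and the snippet assumes Python 2, which urlnorm 1.1.4 targets.

import urlnorm

# norm() returns a unicode URL with the scheme and host lowercased, the default
# port removed, the path collapsed, and safe percent escapes unquoted.
assert urlnorm.norm("http://www.foo.com:80/foo") == u"http://www.foo.com/foo"
assert urlnorm.norm("http://xn--q-bga.com./u/u/../%72/l/") == u"http://q\xe9.com/u/r/l/"

# norm_path() collapses ./ and ../ segments and duplicate slashes for a given scheme.
assert urlnorm.norm_path("http", "/foo/bar/../baz") == u"/foo/baz"

# unquote_safe() leaves escapes for "unsafe" characters quoted, but uppercases them.
assert urlnorm.unquote_safe("%3f", "?") == u"%3F"

# Anything that cannot be normalized raises urlnorm.InvalidUrl.
try:
    urlnorm.norm("asdf")
    raise AssertionError("expected InvalidUrl")
except urlnorm.InvalidUrl:
    pass

Note that norm() always returns a unicode string and signals failure by raising InvalidUrl rather than by returning a sentinel value.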