634 |
635 | This program is free software: you can redistribute it and/or modify
636 | it under the terms of the GNU Affero General Public License as published by
637 | the Free Software Foundation, either version 3 of the License, or
638 | (at your option) any later version.
639 |
640 | This program is distributed in the hope that it will be useful,
641 | but WITHOUT ANY WARRANTY; without even the implied warranty of
642 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
643 | GNU Affero General Public License for more details.
644 |
645 | You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
647 |
648 | Also add information on how to contact you by electronic and paper mail.
649 |
650 | If your software can interact with users remotely through a computer
651 | network, you should also make sure that it provides a way for users to
652 | get its source. For example, if your program is a web application, its
653 | interface could display a "Source" link that leads users to an archive
654 | of the code. There are many ways you could offer source, and different
655 | solutions will be better for different programs; see section 13 for the
656 | specific requirements.
657 |
658 | You should also get your employer (if you work as a programmer) or school,
659 | if any, to sign a "copyright disclaimer" for the program, if necessary.
660 | For more information on this, and how to apply and follow the GNU AGPL, see
<https://www.gnu.org/licenses/>.
662 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include *.txt *.md
2 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | Sort-friendly URI Reordering Transform (SURT) python package.
2 |
3 | Usage:
4 |
5 | ::
6 |
7 | >>> from surt import surt
8 | >>> surt("http://archive.org/goo/?a=2&b&a=1")
9 | 'org,archive)/goo?a=1&a=2&b'
10 | >>> surt("http://archive.org/goo/?a=2&b&a=1", trailing_comma=True)
11 | 'org,archive,)/goo?a=1&a=2&b'
12 | >>> surt("http://123.456.78.910/goo/?a=2&b&a=1", reverse_ipaddr=False)
13 | '123.456.78.910)/goo?a=1&a=2&b'
14 |
15 | Installation:
16 |
17 | ::
18 |
19 | pip install surt
20 |
21 | Or install the dev version from git:
22 |
23 | ::
24 |
25 | pip install git+https://github.com/internetarchive/surt.git#egg=surt
26 |
27 | More information about SURTs:
28 | http://crawler.archive.org/articles/user\_manual/glossary.html#surt
29 |
30 | This is mostly a python port of the webarchive-commons org.archive.url
31 | package. The original java version of the org.archive.url package is
32 | here:
33 | https://github.com/iipc/webarchive-commons/tree/master/src/main/java/org/archive/url
34 |
35 | This module depends on the ``tldextract`` module to query the Public
Suffix List. ``tldextract`` can be installed via ``pip``.
37 |
38 | |Build Status|
39 |
40 | .. |Build Status| image:: https://travis-ci.org/internetarchive/surt.svg
41 | :target: https://travis-ci.org/internetarchive/surt
42 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | from setuptools.command.test import test as TestCommand
3 |
class PyTest(TestCommand):
    """`python setup.py test` shim that delegates to pytest.

    Runs the suite under tests/ with coverage reporting for the surt
    package.
    """
    def finalize_options(self):
        TestCommand.finalize_options(self)
        self.test_suite = True

    def run_tests(self):
        # Imported here, not at module level, so that merely building the
        # package does not require pytest to be installed.
        import pytest
        import sys
        # pytest.main() expects a list of CLI arguments; passing a single
        # whitespace-joined string is not supported by modern pytest.
        errcode = pytest.main(['-v', '--cov', 'surt', 'tests/'])
        sys.exit(errcode)
15 |
16 |
# Package metadata and build configuration for the `surt` distribution.
setup(name='surt',
      version='0.3.1',
      author='rajbot',
      author_email='raj@archive.org',
      classifiers=[
          'License :: OSI Approved :: GNU Affero General Public License v3',
      ],
      description='Sort-friendly URI Reordering Transform (SURT) python package.',
      long_description=open('README.rst').read(),
      url='https://github.com/internetarchive/surt',
      zip_safe=True,
      # Runtime dependencies: six for py2/py3 compatibility, tldextract
      # for Public Suffix List lookups (handyurl.getPublicSuffix).
      install_requires=[
          'six',
          'tldextract>=2.0',
      ],
      provides=[ 'surt' ],
      packages=[ 'surt' ],
      scripts=[],
      # Tests
      tests_require=[ 'pytest', 'pytest-cov' ],
      test_suite='',
      cmdclass={'test': PyTest},
      )
40 |
--------------------------------------------------------------------------------
/surt/DefaultIAURLCanonicalizer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Copyright(c)2012-2013 Internet Archive. Software license AGPL version 3.
4 | #
5 | # This file is part of the `surt` python package.
6 | #
7 | # surt is free software: you can redistribute it and/or modify
8 | # it under the terms of the GNU Affero General Public License as published by
9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # surt is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | # GNU Affero General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Affero General Public License
# along with surt. If not, see <http://www.gnu.org/licenses/>.
19 | #
20 | # The surt source is hosted at https://github.com/internetarchive/surt
21 |
22 | """This is a python port of DefaultIAURLCanonicalizer.java:
23 | http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/DefaultIAURLCanonicalizer.java?view=markup
24 | """
25 | from __future__ import absolute_import
26 |
27 | import surt.GoogleURLCanonicalizer
28 | import surt.IAURLCanonicalizer
29 |
30 |
31 | # canonicalize()
32 | #_______________________________________________________________________________
def canonicalize(url, **options):
    """Run the default IA canonicalization chain on a handyurl instance:
    the Google rules first, then the Internet Archive rules.

    Returns the (mutated) handyurl instance.
    """
    canonicalizers = (surt.GoogleURLCanonicalizer.canonicalize,
                      surt.IAURLCanonicalizer.canonicalize)
    for step in canonicalizers:
        url = step(url, **options)
    return url
41 |
--------------------------------------------------------------------------------
/surt/GoogleURLCanonicalizer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Copyright(c)2012-2013 Internet Archive. Software license AGPL version 3.
5 | #
6 | # This file is part of the `surt` python package.
7 | #
8 | # surt is free software: you can redistribute it and/or modify
9 | # it under the terms of the GNU Affero General Public License as published by
10 | # the Free Software Foundation, either version 3 of the License, or
11 | # (at your option) any later version.
12 | #
13 | # surt is distributed in the hope that it will be useful,
14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 | # GNU Affero General Public License for more details.
17 | #
18 | # You should have received a copy of the GNU Affero General Public License
# along with surt. If not, see <http://www.gnu.org/licenses/>.
20 | #
21 | # The surt source is hosted at https://github.com/internetarchive/surt
22 |
23 | """This is a python port of GoogleURLCanonicalizer.java:
24 | http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/GoogleURLCanonicalizer.java?view=markup
25 | """
26 |
27 | from __future__ import absolute_import
28 |
29 | import re
30 | import struct
31 | import socket
32 | import encodings.idna
33 |
34 | from surt.handyurl import handyurl
35 |
36 | try:
37 | from urllib.parse import quote_from_bytes, unquote_to_bytes
38 | except:
39 | from urllib import quote as quote_from_bytes, unquote as unquote_to_bytes
40 | from six import text_type, binary_type
41 |
42 | # canonicalize()
43 | #_______________________________________________________________________________
def canonicalize(url, **_ignored):
    """Apply Google-style canonicalization rules to *url*.

    *url* is a handyurl instance whose components are bytes; it is
    mutated in place and also returned. Extra keyword arguments are
    accepted and ignored so this shares a signature with the other
    canonicalizers.
    """
    # The fragment never participates in canonical comparison.
    url.hash = None
    # Credentials and query get "minimal" escaping: unescape fully,
    # then escape exactly once.
    if url.authUser:
        url.authUser = minimalEscape(url.authUser)
    if url.authPass:
        url.authPass = minimalEscape(url.authPass)
    if url.query:
        url.query = minimalEscape(url.query)

    if url.host:
        host = unescapeRepeatedly(url.host)
        try:
            # Pure-ASCII hosts pass through unchanged.
            host.decode('ascii')
        except UnicodeDecodeError:
            try:
                # Non-ASCII host: best-effort IDNA (punycode) encoding.
                host = host.decode('utf-8', 'ignore').encode('idna')
            except ValueError:
                pass

        # Collapse doubled dots and strip leading/trailing dots.
        host = host.replace(b'..', b'.').strip(b'.')

        # Prefer a normalized IP form when the host looks like one.
        ip = attemptIPFormats(host)
        if ip:
            host = ip;
        else:
            host = escapeOnce(host.lower())

        url.host = host

    # Path: unescape fully, normalize '.'/'..' only when there is a host
    # (otherwise the path is free-form), then escape once.
    path = unescapeRepeatedly(url.path)
    if url.host:
        path = normalizePath(path)
    # else path is free-form sort of thing, not /directory/thing
    url.path = escapeOnce(path)

    return url
80 |
81 | # normalizePath()
82 | #_______________________________________________________________________________
83 |
def normalizePath(path):
    """Normalize a rooted path (bytes): resolve '.' and '..' segments and
    collapse duplicate slashes, preserving any trailing slash.

    An empty/None path normalizes to b'/'.
    """
    if not path:
        return b'/'

    # Split on '/'. The element before the first slash is always dropped
    # (for a rooted path it is the empty string). A trailing '/' yields a
    # trailing empty segment, which preserves the trailing slash below.
    segments = []
    for segment in path.split(b'/')[1:]:
        if segment == b'.':
            continue  # current-directory marker: drop it
        if segment == b'..':
            if segments:
                segments.pop()  # parent marker: discard previous segment
            else:
                # Nothing left to pop; keep the '..' (matches java port).
                segments.append(segment)
            continue
        segments.append(segment)

    # Reassemble: every segment but the last gets a trailing '/', with
    # empty segments (from '//') skipped there; the final segment is
    # appended verbatim (so a trailing '' keeps the trailing slash).
    rebuilt = b'/'
    if segments:
        rebuilt += b''.join(s + b'/' for s in segments[:-1] if s)
        rebuilt += segments[-1]
    return rebuilt
125 |
OCTAL_IP = re.compile(br"^(0[0-7]*)(\.[0-7]+)?(\.[0-7]+)?(\.[0-7]+)?$")
DECIMAL_IP = re.compile(br"^([1-9][0-9]*)(\.[0-9]+)?(\.[0-9]+)?(\.[0-9]+)?$")

# attemptIPFormats()
#_______________________________________________________________________________
def attemptIPFormats(host):
    """Try to interpret *host* (bytes) as an IPv4 address.

    Returns the normalized dotted-quad address as bytes, or None when
    *host* does not look like an IP address or cannot be resolved.
    NOTE: the decimal/octal branches consult the system resolver.
    """
    if host is None:
        return None

    if host.isdigit():
        # Purely numeric host: treat as a 32-bit integer address.
        # Mask to the low four bytes to work around an issue with
        # liveweb arc files.
        return socket.inet_ntoa(
            struct.pack('>L', int(host) & 0xffffffff)).encode('ascii')

    if DECIMAL_IP.match(host) or OCTAL_IP.match(host):
        try:
            # gethostbyname_ex() requires str on python 3; the previous
            # code passed bytes, which raises TypeError there. The match
            # above guarantees the host is ASCII digits and dots.
            return socket.gethostbyname_ex(
                host.decode('ascii'))[2][0].encode('ascii')
        except (socket.gaierror, socket.herror):
            # Unresolvable: not a usable IP form. (herror is now caught
            # for the octal case too, for consistency with decimal.)
            return None

    return None
155 |
156 |
157 | # minimalEscape()
158 | #_______________________________________________________________________________
def minimalEscape(input):
    """Collapse any repeated %-escaping in *input*, then escape exactly once."""
    fully_unescaped = unescapeRepeatedly(input)
    return escapeOnce(fully_unescaped)
161 |
162 | # escapeOnce()
163 | #_______________________________________________________________________________
def escapeOnce(input):
    """Percent-escape *input* (bytes) a single time.

    Bytes outside the safe set below are escaped; quote_from_bytes also
    always leaves ASCII letters, digits and b'_.-~' alone. Note '#' and
    '%' are not in the safe set. Falsy input (None, b'') is returned
    unchanged.
    """
    if not input:
        return input
    safe_bytes = b'''!"$&'()*+,-./:;<=>?@[\]^_`{|}~'''
    return quote_from_bytes(input, safe=safe_bytes).encode('ascii')
172 |
173 |
174 | # unescapeRepeatedly()
175 | #_______________________________________________________________________________
def unescapeRepeatedly(input):
    '''Percent-unescape until a fixed point is reached.

    Argument may be str or bytes. Returns bytes (None stays None).
    '''
    if input is None:
        return None

    current = input
    while True:
        unquoted = unquote_to_bytes(current)
        if unquoted == current:
            # No further change: fully unescaped.
            return current
        current = unquoted
186 |
187 |
--------------------------------------------------------------------------------
/surt/IAURLCanonicalizer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Copyright(c)2012-2013 Internet Archive. Software license AGPL version 3.
4 | #
5 | # This file is part of the `surt` python package.
6 | #
7 | # surt is free software: you can redistribute it and/or modify
8 | # it under the terms of the GNU Affero General Public License as published by
9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # surt is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | # GNU Affero General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Affero General Public License
# along with surt. If not, see <http://www.gnu.org/licenses/>.
19 | #
20 | # The surt source is hosted at https://github.com/internetarchive/surt
21 |
22 | """This is a python port of IAURLCanonicalizer.java:
23 | http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/IAURLCanonicalizer.java?view=markup
24 | """
25 |
26 | from __future__ import absolute_import
27 |
28 | import re
29 |
30 | from surt.handyurl import handyurl
31 | from surt.URLRegexTransformer import stripPathSessionID, stripQuerySessionID
32 |
33 | # canonicalize()
34 | #_______________________________________________________________________________
def canonicalize(url, host_lowercase=True, host_massage=True,
                 auth_strip_user=True, auth_strip_pass=True,
                 port_strip_default=True, path_strip_empty=False,
                 path_lowercase=True, path_strip_session_id=True,
                 path_strip_trailing_slash_unless_empty=True,
                 query_lowercase=True, query_strip_session_id=True,
                 query_strip_empty=True, query_alpha_reorder=True,
                 hash_strip=True, **_ignored):
    """The input url is a handyurl instance; it is mutated in place and
    returned.

    Applies the Internet Archive canonicalization rules (host case and
    'www' massaging, credential stripping, default-port removal, path
    and query cleanup); each rule can be disabled via its keyword flag.
    Note: hash_strip is currently not consulted in this function.
    """
    if host_lowercase and url.host:
        url.host = url.host.lower()

    if host_massage and url.host and (url.scheme != b'dns'): ###java version calls massageHost regardless of scheme
        url.host = massageHost(url.host)

    if auth_strip_user:
        # Stripping the user implies stripping the password too.
        url.authUser = None
        url.authPass = None
    elif auth_strip_pass:
        # Bug fix: this previously assigned to a misspelled 'arthPass'
        # attribute, silently leaving the real password in place.
        url.authPass = None

    if port_strip_default and url.scheme:
        defaultPort = getDefaultPort(url.scheme)
        if url.port == defaultPort:
            url.port = handyurl.DEFAULT_PORT

    path = url.path
    if path_strip_empty and b'/' == path:
        url.path = None
    else:
        if path_lowercase and path:
            path = path.lower()
        if path_strip_session_id and path:
            path = stripPathSessionID(path)
        # Session-id stripping may have reduced the path to bare '/'.
        if path_strip_empty and b'/' == path:
            path = None
        if path_strip_trailing_slash_unless_empty and path:
            if path.endswith(b'/') and len(path)>1:
                path = path[:-1]

        url.path = path

    query = url.query
    if query:
        if len(query) > 0:
            if query_strip_session_id:
                query = stripQuerySessionID(query)
            if query_lowercase:
                query = query.lower()
            if query_alpha_reorder:
                query = alphaReorderQuery(query)
        # Session-id stripping may have emptied the query entirely.
        if b'' == query and query_strip_empty:
            query = None
        url.query = query
    else:
        if query_strip_empty:
            # Drop a bare trailing '?' (see handyurl.last_delimiter).
            url.last_delimiter = None

    return url
94 |
95 |
96 | # alphaReorderQuery()
97 | #_______________________________________________________________________________
def alphaReorderQuery(orig):
    """Sort the '&'-separated query arguments alphabetically and rejoin.

    Mirrors the java implementation: an argument with no '=' keeps its
    bare form ("?foo" stays distinct from "?foo="), which is why
    urlparse.parse_qsl() cannot be used here. None returns None; queries
    of length <= 1 are returned unchanged.
    """
    if orig is None:
        return None

    if len(orig) <= 1:
        return orig

    fields = orig.split(b'&')
    # Each field becomes a 1-tuple (no '=') or a (key, value) 2-tuple;
    # tuple ordering gives the same sort as the original list sort.
    pairs = sorted(tuple(field.split(b'=', 1)) for field in fields)
    return b'&'.join(b'='.join(pair) for pair in pairs)
123 |
124 |
125 | # massageHost()
126 | #_______________________________________________________________________________
# Leading 'www' (optionally followed by digits) plus a dot, e.g. 'www2.'.
_RE_WWWDIGITS = re.compile(rb'www\d*\.')

def massageHost(host):
    """Strip a leading 'www'/'wwwN.' prefix from *host* (bytes), if any."""
    prefix = _RE_WWWDIGITS.match(host)
    if prefix is None:
        return host
    return host[prefix.end():]
135 |
136 | # getDefaultPort()
137 | #_______________________________________________________________________________
def getDefaultPort(scheme):
    """Return the well-known port for *scheme* (bytes): 80 for http,
    443 for https, and 0 for anything else."""
    return {b'http': 80, b'https': 443}.get(scheme.lower(), 0)
146 |
147 |
--------------------------------------------------------------------------------
/surt/URLRegexTransformer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Copyright(c)2012-2013 Internet Archive. Software license AGPL version 3.
4 | #
5 | # This file is part of the `surt` python package.
6 | #
7 | # surt is free software: you can redistribute it and/or modify
8 | # it under the terms of the GNU Affero General Public License as published by
9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # surt is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | # GNU Affero General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Affero General Public License
# along with surt. If not, see <http://www.gnu.org/licenses/>.
19 | #
20 | # The surt source is hosted at https://github.com/internetarchive/surt
21 |
22 | """This is a python port of URLRegexTransformer.java:
23 | http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/URLRegexTransformer.java?view=markup
24 | """
25 |
26 | import re
27 |
28 | # stripPathSessionID
29 | #_______________________________________________________________________________
# ASP.NET-style session-id path segments: '(s(<24 chars>))/' or
# '(<24 chars>)/' immediately before an .aspx page.
_RES_PATH_SESSIONID = [
    re.compile(rb"^(.*/)(\((?:[a-z]\([0-9a-z]{24}\))+\)/)([^\?]+\.aspx.*)$", re.I),
    re.compile(rb"^(.*/)(\([0-9a-z]{24}\)/)([^\?]+\.aspx.*)$", re.I),
]

def stripPathSessionID(path):
    """Remove ASP.NET session-id segments from an .aspx path (bytes).

    The java version returns a lowercased path yet uses a
    case-insensitive regex; we deliberately do not lowercase here.
    """
    for pattern in _RES_PATH_SESSIONID:
        match = pattern.match(path)
        if match is not None:
            # Keep the prefix and suffix; drop the session-id segment.
            path = match.group(1) + match.group(3)
    return path
45 |
46 |
47 | # stripQuerySessionID
48 | #_______________________________________________________________________________
# Well-known session-id query parameters to strip.
_RES_QUERY_SESSIONID = [
    re.compile(b"^(.*)(?:jsessionid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I),
    re.compile(b"^(.*)(?:phpsessid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I),
    re.compile(b"^(.*)(?:sid=[0-9a-zA-Z]{32})(?:&(.*))?$", re.I),
    re.compile(b"^(.*)(?:ASPSESSIONID[a-zA-Z]{8}=[a-zA-Z]{24})(?:&(.*))?$", re.I),
    re.compile(b"^(.*)(?:cfid=[^&]+&cftoken=[^&]+)(?:&(.*))?$", re.I),
]

def stripQuerySessionID(query):
    """Remove session-id parameters (jsessionid, phpsessid, sid,
    ASPSESSIONID*, cfid/cftoken) from a query string (bytes)."""
    for pattern in _RES_QUERY_SESSIONID:
        match = pattern.match(query)
        if match is None:
            continue
        trailing = match.group(2)
        if trailing:
            query = match.group(1) + trailing
        else:
            query = match.group(1)
    return query
67 |
68 |
69 | # hostToSURT
70 | #_______________________________________________________________________________
# Matches hosts that look like a dotted-quad IPv4 address.
_RE_IP_ADDRESS = re.compile(br"(?:(?:\d{1,3}\.){3}\d{1,3})$")

def hostToSURT(host, reverse_ipaddr=True):
    """Convert *host* (bytes) to SURT label order: b'a.b.c' -> b'c,b,a'.

    IPv4-looking hosts are returned unmodified when reverse_ipaddr is
    False.
    """
    if _RE_IP_ADDRESS.match(host) and not reverse_ipaddr:
        return host

    labels = host.split(b'.')
    return b','.join(reversed(labels))
80 |
--------------------------------------------------------------------------------
/surt/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Copyright(c)2012-2013 Internet Archive. Software license AGPL version 3.
4 | #
5 | # This file is part of the `surt` python package.
6 | #
7 | # surt is free software: you can redistribute it and/or modify
8 | # it under the terms of the GNU Affero General Public License as published by
9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # surt is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | # GNU Affero General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Affero General Public License
# along with surt. If not, see <http://www.gnu.org/licenses/>.
19 | #
20 | # The surt source is hosted at https://github.com/internetarchive/surt
21 |
22 | """A python port of the archive-commons org.archive.url HandyURL class
23 |
24 | The original java version is here:
25 | http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/
26 | """
27 |
28 | from __future__ import absolute_import
29 |
30 | from surt.handyurl import handyurl
31 | from surt.surt import surt
32 |
33 |
# Public API of the surt package: the handyurl parser/serializer class
# and the surt() conversion function.
__all__= [
    'handyurl',
    'surt'
]
38 |
--------------------------------------------------------------------------------
/surt/handyurl.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 |
4 | # Copyright(c)2012-2013 Internet Archive. Software license AGPL version 3.
5 | #
6 | # This file is part of the `surt` python package.
7 | #
8 | # surt is free software: you can redistribute it and/or modify
9 | # it under the terms of the GNU Affero General Public License as published by
10 | # the Free Software Foundation, either version 3 of the License, or
11 | # (at your option) any later version.
12 | #
13 | # surt is distributed in the hope that it will be useful,
14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 | # GNU Affero General Public License for more details.
17 | #
18 | # You should have received a copy of the GNU Affero General Public License
# along with surt. If not, see <http://www.gnu.org/licenses/>.
20 | #
21 | # The surt source is hosted at https://github.com/internetarchive/surt
22 |
23 | from __future__ import absolute_import
24 |
25 | import re
26 | import tldextract
27 | import collections
28 |
29 | try:
30 | from urllib.parse import SplitResultBytes
31 | except:
32 | from urlparse import SplitResult as SplitResultBytes
33 |
34 | from surt.URLRegexTransformer import hostToSURT
35 |
36 | _RE_MULTIPLE_PROTOCOLS = re.compile(br'^(https?://)+')
37 | _RE_HAS_PROTOCOL = re.compile(b"^([a-zA-Z][a-zA-Z0-9\+\-\.]*):")
38 | _RE_SPACES = re.compile(b'[\n\r\t]')
39 |
40 | class handyurl(object):
41 | """A python port of the archive-commons org.archive.url HandyURL class
42 |
43 | To simplify the surt module, we add the URLParser.parse method here,
44 | which makes the URLParser class unnecessary. handyurl becomes a thin
45 | wrapper around python's urlparse module.
46 |
47 | Init an empty class:
48 | >>> h = handyurl()
49 |
50 | Init with just a host:
51 | >>> h = handyurl(host='www.amazon.co.uk')
52 |
53 | This version of handyurl contains a field for last_delimiter, to allow
54 | a url to roundtrip through this class without modification. From the
55 | urlparse docs:
56 | "This [roundtripping] may result in a slightly different, but equivalent URL,
57 | if the URL that was parsed originally had unnecessary delimiters
58 | (for example, a ? with an empty query; the RFC states that these are equivalent)."
59 | We want the url http://www.google.com/? to work, since there is a test for
60 | it in the GoogleURLCanonicalizer class. Note, however, the IAURLCanonicalizer
61 | class strips empty queries.
62 | """
63 | DEFAULT_PORT = None
64 |
65 | # init
66 | #___________________________________________________________________________
67 | def __init__(self, scheme=None, authUser=None, authPass=None,
68 | host=None, port=DEFAULT_PORT, path=None,
69 | query=None, hash=None, last_delimiter=None):
70 | self.scheme = scheme
71 | self.authUser = authUser
72 | self.authPass = authPass
73 | self.host = host
74 | self.port = port
75 | self.path = path
76 | self.query = query
77 | self.hash = hash
78 | self.last_delimiter = last_delimiter #added in python version
79 |
80 |
81 | '''
82 | RFC 2396-inspired regex.
83 |
84 | From the RFC Appendix B:
85 |
86 | URI Generic Syntax August 1998
87 |
88 | B. Parsing a URI Reference with a Regular Expression
89 |
90 | As described in Section 4.3, the generic URI syntax is not sufficient
91 | to disambiguate the components of some forms of URI. Since the
92 | "greedy algorithm" described in that section is identical to the
93 | disambiguation method used by POSIX regular expressions, it is
94 | natural and commonplace to use a regular expression for parsing the
95 | potential four components and fragment identifier of a URI reference.
96 |
97 | The following line is the regular expression for breaking-down a URI
98 | reference into its components.
99 |
100 | ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
101 | 12 3 4 5 6 7 8 9
102 |
103 | The numbers in the second line above are only to assist readability;
104 | they indicate the reference points for each subexpression (i.e., each
105 | paired parenthesis). We refer to the value matched for subexpression
106 | as $. For example, matching the above expression to
107 |
108 | http://www.ics.uci.edu/pub/ietf/uri/#Related
109 |
110 | results in the following subexpression matches:
111 |
112 | $1 = http:
113 | $2 = http
114 | $3 = //www.ics.uci.edu
115 | $4 = www.ics.uci.edu
116 | $5 = /pub/ietf/uri/
117 | $6 =
118 | $7 =
119 | $8 = #Related
120 | $9 = Related
121 |
122 | where indicates that the component is not present, as is
123 | the case for the query component in the above example. Therefore, we
124 | can determine the value of the four components and fragment as
125 |
126 | scheme = $2
127 | authority = $4
128 | path = $5
129 | query = $7
130 | fragment = $9
131 |
132 |
133 | --
134 | Below differs from the rfc regex in that...
135 | (1) we allow a URI made of a fragment only (Added extra
136 | group so indexing is off by one after scheme).
137 | (2) scheme is limited to legal scheme characters
138 |
139 | 1: scheme:
140 | 2: scheme
141 | 3: //authority/path
142 | 4: //authority
143 | 5: authority (aka netloc)
144 | 6: path
145 | 7: ?query
146 | 8: query
147 | 9: #fragment
148 | A: fragment
149 | '''
150 | RFC2396REGEX = re.compile(br'^(([a-zA-Z][a-zA-Z0-9+.-]*):)?((//([^/?#]*))?([^?#]*)(\?([^#]*))?)?(#(.*))?$')
151 | # group open: 12 34 5 6 7 8 9 A
152 | # group close: 2 1 54 6 87 3 A9
153 |
154 | @classmethod
155 | def urlsplit(cls, url):
156 | """Similar to urllib.parse.urlsplit, but does not try to decode raw
157 | bytes. (Library method fails on non-ascii)"""
158 | assert isinstance(url, bytes)
159 |
160 | m = cls.RFC2396REGEX.match(url)
161 | assert m
162 |
163 | return SplitResultBytes(m.group(2) or b'', m.group(5) or b'',
164 | m.group(6) or b'', m.group(8) or b'',
165 | m.group(10) or b'')
166 |
167 | # parse() classmethod
168 | #___________________________________________________________________________
169 | @classmethod
170 | def parse(cls, url):
171 | u"""This method was in the java URLParser class, but we don't need
172 | a whole class to parse a url, when we can just use python's urlparse.
173 |
174 | """
175 |
176 | if not isinstance(url, bytes):
177 | url = url.encode('utf-8')
178 |
179 | # Note RE_SPACES does not match regular space (0x20). That is,
180 | # regular spaces are removed at head and tail, but not in the middle.
181 | # There's a test case for GoogleURLCanonicalizer.canonicalize that
182 | # asserts this behavior.
183 | url = url.strip()
184 | url = _RE_SPACES.sub(b'', url)
185 |
186 | url = cls.addDefaultSchemeIfNeeded(url)
187 |
188 | #From Tymm: deal with http://https/order.1and1.com
189 | url = _RE_MULTIPLE_PROTOCOLS.sub(lambda m: m.group(1), url)
190 |
191 | o = cls.urlsplit(url)
192 |
193 | scheme = o.scheme or None
194 | query = o.query or None
195 | fragment = o.fragment or None
196 |
197 | """Deal with hostnames that end with ':' without being followed by a port number"""
198 | if o.netloc.endswith(b':'):
199 | o = o._replace(netloc=o.netloc.rstrip(b':'))
200 | port = o.port or None
201 |
202 | hostname = o.hostname or None
203 | path = o.path or None
204 |
205 | if scheme.startswith(b'http'):
206 | #deal with "http:////////////////www.vikings.com"
207 | if hostname is None and path is not None:
208 | parts = path.lstrip(b'/').partition(b'/')
209 | hostname = parts[0]
210 | path = b'/'+parts[2]
211 |
212 | h = cls(scheme = scheme,
213 | host = hostname,
214 | path = path,
215 | query = query,
216 | hash = fragment,
217 | port = port,
218 | )
219 |
220 | #See note at top about last_delimiter
221 | if url.endswith(b'?') and None == h.query:
222 | h.last_delimiter = b'?'
223 |
224 | return h
225 |
226 | # addDefaultSchemeIfNeeded()
227 | #___________________________________________________________________________
228 | """copied from URLParser.java"""
229 | @classmethod
230 | def addDefaultSchemeIfNeeded(cls, url):
231 | if not url:
232 | return url
233 |
234 | ###noah: accept anything that looks like it starts with a scheme:
235 | if _RE_HAS_PROTOCOL.match(url):
236 | return url
237 | else:
238 | return b"http://"+url
239 |
240 | # geturl()
241 | #___________________________________________________________________________
242 | def geturl(self):
243 | """urlparse.ParseResult has a geturl() method, so we have one too.
244 | Nicer than typing the java method name!
245 | """
246 | return self.getURLString()
247 |
248 | # getURLString()
249 | #___________________________________________________________________________
250 | def getURLString(self, **options):
251 | return self.geturl_bytes(**options).decode('utf-8')
252 |
def geturl_bytes(self,
                 surt=False,
                 public_suffix=False,
                 trailing_comma=False,
                 reverse_ipaddr=True,
                 with_scheme=True,
                 **options):
    """Serialize this URL to bytes.

    :param surt: render the host in SURT form (labels reversed, wrapped
        in ``(`` … ``)``)
    :param public_suffix: replace the host with its registered domain
        (via getPublicSuffix())
    :param trailing_comma: in SURT form, append ``,`` after the host
    :param reverse_ipaddr: in SURT form, also reverse IPv4 octets
    :param with_scheme: include the ``scheme:`` (and ``//``) prefix
    :returns: the serialized URL as ``bytes``
    """
    hostSrc = self.host
    if hostSrc:
        if public_suffix:
            hostSrc = self.getPublicSuffix()
        if surt:
            hostSrc = hostToSURT(hostSrc, reverse_ipaddr)

    if with_scheme:
        s = self.scheme + b':'
        if hostSrc:
            # dns: URLs carry the host right after the colon, no "//"
            if self.scheme != b'dns':
                s += b'//'
            if surt:
                s += b"("
    elif not hostSrc:
        # no host (e.g. mailto:, filedesc:): keep the scheme so the
        # output is still a valid URL
        s = self.scheme + b':'
    else:
        s = b''

    if hostSrc:
        if self.authUser:
            s += self.authUser
            if self.authPass:
                s += self.authPass
            s += b'@'

        s += hostSrc

        if self.port != self.DEFAULT_PORT:
            s += (":%d" % self.port).encode('utf-8')

        if surt:
            if trailing_comma:
                s += b','
            s += b')'

    if self.path:
        s += self.path
    elif self.query is not None or self.hash is not None:
        # must have '/' with query or hash:
        s += b'/'

    # idiom fix: prefer "is not None" over the old "None != x" checks;
    # behavior is unchanged (query/hash/last_delimiter are bytes or None)
    if self.query is not None:
        s += b'?' + self.query
    if self.hash is not None:
        s += b'#' + self.hash

    if self.last_delimiter is not None:
        s += self.last_delimiter

    return s
311 |
# getPublicSuffix
#___________________________________________________________________________
def getPublicSuffix(self):
    """Return the registered domain of self.host as ASCII bytes.

    Uses the tldextract module to consult the Public Suffix List.
    """
    registered = tldextract.extract(self.host).registered_domain
    return registered.encode('ascii')
319 |
# getPublicPrefix
#___________________________________________________________________________
def getPublicPrefix(self):
    """Return the subdomain part of self.host (a str, possibly empty).

    Uses the tldextract module / Public Suffix List. Note this returns
    str while getPublicSuffix() returns bytes; callers rely on that.
    """
    parts = tldextract.extract(self.host)
    return parts.subdomain
327 |
328 | # repr
329 | #___________________________________________________________________________
330 | # commented out because of http://bugs.python.org/issue5876
331 | # "__repr__ returning unicode doesn't work when called implicitly"
332 | #def __repr__(self):
333 | # return u"""handyurl(scheme=%s, authUser=%s, authPass=%s, host=%s, port=%s, path=%s, query=%s, hash=%s)""".encode('utf-8') % (self.scheme, self.authUser, self.authPass, self.host, self.port, self.path, self.query, self.hash)
334 |
--------------------------------------------------------------------------------
/surt/surt.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Copyright(c)2012-2013 Internet Archive. Software license AGPL version 3.
4 | #
5 | # This file is part of the `surt` python package.
6 | #
7 | # surt is free software: you can redistribute it and/or modify
8 | # it under the terms of the GNU Affero General Public License as published by
9 | # the Free Software Foundation, either version 3 of the License, or
10 | # (at your option) any later version.
11 | #
12 | # surt is distributed in the hope that it will be useful,
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 | # GNU Affero General Public License for more details.
16 | #
17 | # You should have received a copy of the GNU Affero General Public License
18 | # along with surt. If not, see .
19 | #
20 | # The surt source is hosted at https://github.com/internetarchive/surt
21 |
22 | """This is a python port of the WaybackURLKeyMaker.java class:
23 |
24 | http://archive-access.svn.sourceforge.net/viewvc/archive-access/trunk/archive-access/projects/archive-commons/src/main/java/org/archive/url/WaybackURLKeyMaker.java?view=markup
25 | """
26 |
27 | from __future__ import absolute_import
28 |
29 | from surt.handyurl import handyurl
30 | from surt.URLRegexTransformer import hostToSURT
31 |
32 | import surt.DefaultIAURLCanonicalizer as DefaultIAURLCanonicalizer
33 |
class CompositeCanonicalizer(object):
    """Chains several canonicalizers, applying them left to right.

    Each element may be either a callable taking ``(hurl, **options)``
    or an object exposing a ``canonicalize`` method.
    """
    def __init__(self, canonicalizers):
        self.canonicalizers = [
            self._normalize(canon) for canon in canonicalizers
        ]
    def __call__(self, hurl, **options):
        """Run *hurl* through every canonicalizer, in order."""
        for canon in self.canonicalizers:
            hurl = canon(hurl, **options)
        return hurl
    @staticmethod
    def _normalize(canonicalizer):
        """Coerce *canonicalizer* to a plain callable.

        :raises AttributeError: if it is neither callable nor has a
            ``canonicalize`` method.
        """
        if hasattr(canonicalizer, '__call__'):
            return canonicalizer
        if hasattr(canonicalizer, 'canonicalize'):
            return canonicalizer.canonicalize
        # bug fix: the attribute looked up above is "canonicalize";
        # the old message named a nonexistent "canonicalizer" method
        raise AttributeError('canonicalizer must either be callable or have'
                             ' "canonicalize" method')
51 |
# surt()
#_______________________________________________________________________________
def surt(url, canonicalizer=None, **options):
    """Return the SURT form of *url*.

    Accepts bytes or text; the return type matches the input type
    (text input is encoded/decoded as UTF-8 around _surt_bytes).
    """
    if isinstance(url, bytes):
        return _surt_bytes(url, canonicalizer, **options)
    encoded = None if url is None else url.encode('utf-8')
    return _surt_bytes(encoded, canonicalizer, **options).decode('utf-8')
61 |
def _surt_bytes(url, canonicalizer, **options):
    """Compute the SURT of a bytes *url*.

    Falsy input yields b"-"; "filedesc" URLs pass through untouched.
    *canonicalizer* may be None (use the default IA canonicalizer), a
    callable, an object with a .canonicalize method, or a list/tuple of
    those (applied in order).
    """
    if not url:
        return b"-"
    if url.startswith(b"filedesc"):
        return url

    if canonicalizer is None:
        canonicalizer = DefaultIAURLCanonicalizer.canonicalize
    elif isinstance(canonicalizer, (list, tuple)):
        canonicalizer = CompositeCanonicalizer(canonicalizer)
    elif (hasattr(canonicalizer, 'canonicalize') and
          not hasattr(canonicalizer, '__call__')):
        canonicalizer = canonicalizer.canonicalize

    # SURT form without scheme is the historical default output
    options.setdefault('surt', True)
    options.setdefault('with_scheme', False)

    hurl = canonicalizer(handyurl.parse(url), **options)
    return hurl.geturl_bytes(**options)
83 |
--------------------------------------------------------------------------------
/tests/test_surt.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from __future__ import absolute_import, unicode_literals
4 |
5 | import surt
6 | from surt import handyurl
7 |
8 | import pytest
9 |
def test_handyurl_parse():
    """handyurl.parse() round-trips assorted URLs through geturl()."""
    # These tests come from URLParserTest.java
    assert handyurl.parse("http://www.archive.org/index.html#foo").geturl() == 'http://www.archive.org/index.html#foo'
    assert handyurl.parse("http://www.archive.org/").geturl() == 'http://www.archive.org/'
    assert handyurl.parse("http://www.archive.org").geturl() == 'http://www.archive.org'
    assert handyurl.parse("http://www.archive.org?").geturl() == 'http://www.archive.org?'
    assert handyurl.parse("http://www.archive.org:8080/index.html?query#foo").geturl() == 'http://www.archive.org:8080/index.html?query#foo'
    assert handyurl.parse("http://www.archive.org:8080/index.html?#foo").geturl() == 'http://www.archive.org:8080/index.html#foo'
    # a bare query/hash forces a '/' path into the output
    assert handyurl.parse("http://www.archive.org:8080?#foo").geturl() == 'http://www.archive.org:8080/#foo'
    assert handyurl.parse(u"http://bücher.ch:8080?#foo").geturl() == u'http://bücher.ch:8080/#foo'
    assert handyurl.parse(u"dns:bücher.ch").geturl() == u'dns:bücher.ch'
    # XXX assert print(handyurl.parse(u"http://bücher.ch:8080?#foo").geturl()) == http://b\xfccher.ch:8080/#foo
    # XXX assert print(handyurl.parse(u"dns:bücher.ch").geturl()) == dns:b\xfccher.ch
    assert handyurl.parse(u"http://bücher.ch:8080?#foo").geturl() == u"http://b\xfccher.ch:8080/#foo"
    assert handyurl.parse(u"dns:bücher.ch").geturl() == u"dns:b\xfccher.ch"

    ###From Tymm:
    assert handyurl.parse("http:////////////////www.vikings.com").geturl() == 'http://www.vikings.com/'
    assert handyurl.parse("http://https://order.1and1.com").geturl() == 'https://order.1and1.com'

    ###From Common Crawl, host ends with ':' without a port number
    assert handyurl.parse("http://mineral.galleries.com:/minerals/silicate/chabazit/chabazit.htm").geturl() == 'http://mineral.galleries.com/minerals/silicate/chabazit/chabazit.htm'

    assert handyurl.parse("mailto:bot@archive.org").scheme == b'mailto'
    assert handyurl.parse("mailto:bot@archive.org").geturl() == 'mailto:bot@archive.org'
35 |
def test_getPublicSuffix():
    """getPublicSuffix() returns the registered domain as bytes.

    Based on the tests found in HandyURLTest.java.
    """
    expectations = [
        ('www.fool.com', b'fool.com'),
        ('www.amazon.co.uk', b'amazon.co.uk'),
        ('www.images.amazon.co.uk', b'amazon.co.uk'),
        ('funky-images.fancy.co.jp', b'fancy.co.jp'),
    ]
    for host, suffix in expectations:
        assert handyurl(host=host).getPublicSuffix() == suffix
42 |
def test_getPublicPrefix():
    """getPublicPrefix() returns the subdomain labels as a str.

    Based on the tests found in HandyURLTest.java.
    """
    expectations = [
        ('www.fool.com', 'www'),
        ('www.amazon.co.uk', 'www'),
        ('www.images.amazon.co.uk', 'www.images'),
        ('funky-images.fancy.co.jp', 'funky-images'),
    ]
    for host, prefix in expectations:
        assert handyurl(host=host).getPublicPrefix() == prefix
49 |
def test_DefaultIAURLCanonicalizer():
    """End-to-end checks of the default IA canonicalization pipeline:
    www/wwwNN-stripping, empty-query removal, and query-arg reordering.
    """
    # These tests are from DefaultIAURLCanonicalizerTest.java
    assert surt.DefaultIAURLCanonicalizer.canonicalize(handyurl.parse("http://www.alexa.com/")).getURLString() == 'http://alexa.com/'
    assert surt.DefaultIAURLCanonicalizer.canonicalize(handyurl.parse("http://archive.org/index.html")).getURLString() == 'http://archive.org/index.html'
    assert surt.DefaultIAURLCanonicalizer.canonicalize(handyurl.parse("http://archive.org/index.html?")).getURLString() == 'http://archive.org/index.html'
    assert surt.DefaultIAURLCanonicalizer.canonicalize(handyurl.parse("http://archive.org/index.html?a=b")).getURLString() == 'http://archive.org/index.html?a=b'
    assert surt.DefaultIAURLCanonicalizer.canonicalize(handyurl.parse("http://archive.org/index.html?b=b&a=b")).getURLString() == 'http://archive.org/index.html?a=b&b=b'
    assert surt.DefaultIAURLCanonicalizer.canonicalize(handyurl.parse("http://archive.org/index.html?b=a&b=b&a=b")).getURLString() == 'http://archive.org/index.html?a=b&b=a&b=b'
    assert surt.DefaultIAURLCanonicalizer.canonicalize(handyurl.parse("http://www34.archive.org/index.html?b=a&b=b&a=b")).getURLString() == 'http://archive.org/index.html?a=b&b=a&b=b'
59 |
def test_GoogleURLCanonicalizer():
    """GoogleURLCanonicalizer.canonicalize: percent-decoding to a fixed
    point, hostname/IP normalization, path cleanup, plus unicode/IDN
    additions beyond the Java test suite.
    """
    # The tests are copied from GoogleURLCanonicalizerTest.java
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://host/%25%32%35")).getURLString() == 'http://host/%25'
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://host/%25%32%35%25%32%35")).getURLString() == 'http://host/%25%25'
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://host/%2525252525252525")).getURLString() == 'http://host/%25'
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://host/asdf%25%32%35asd")).getURLString() == 'http://host/asdf%25asd'
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://host/%%%25%32%35asd%%")).getURLString() == 'http://host/%25%25%25asd%25%25'
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.google.com/")).getURLString() == 'http://www.google.com/'
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://%31%36%38%2e%31%38%38%2e%39%39%2e%32%36/%2E%73%65%63%75%72%65/%77%77%77%2E%65%62%61%79%2E%63%6F%6D/")).getURLString() == 'http://168.188.99.26/.secure/www.ebay.com/'
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/")).getURLString() == 'http://195.127.0.11/uploads/%20%20%20%20/.verify/.eBaysecure=updateuserdataxplimnbqmn-xplmvalidateinfoswqpcmlx=hgplmcx/'
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://host%23.com/%257Ea%2521b%2540c%2523d%2524e%25f%255E00%252611%252A22%252833%252944_55%252B")).getURLString() == 'http://host%23.com/~a!b@c%23d$e%25f^00&11*22(33)44_55+'
    # numeric hostnames are normalized to dotted-quad IPv4
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://3279880203/blah")).getURLString() == 'http://195.127.0.11/blah'
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.google.com/blah/..")).getURLString() == 'http://www.google.com/'
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("www.google.com/")).getURLString() == 'http://www.google.com/'
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("www.google.com")).getURLString() == 'http://www.google.com/'
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.evil.com/blah#frag")).getURLString() == 'http://www.evil.com/blah'
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.GOOgle.com/")).getURLString() == 'http://www.google.com/'
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.google.com.../")).getURLString() == 'http://www.google.com/'

    # embedded tab/CR/LF are stripped
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.google.com/foo\tbar\rbaz\n2")).getURLString() == 'http://www.google.com/foobarbaz2'

    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.google.com/q?")).getURLString() == 'http://www.google.com/q?'
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.google.com/q?r?")).getURLString() == 'http://www.google.com/q?r?'
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.google.com/q?r?s")).getURLString() == 'http://www.google.com/q?r?s'
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://evil.com/foo#bar#baz")).getURLString() == 'http://evil.com/foo'
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://evil.com/foo;")).getURLString() == 'http://evil.com/foo;'
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://evil.com/foo?bar;")).getURLString() == 'http://evil.com/foo?bar;'

    #This test case differs from the Java version. The Java version returns
    #'http://%01%80.com/' for this case. If idna/punycode encoding of a hostname
    #is not possible, the python version encodes unicode domains as utf-8 before
    #percent encoding, so we get 'http://%01%C2%80.com/'
    # assert print(canonicalize(handyurl.parse(u"http://\u0001\u0080.com/")).getURLString()) http://%01%C2%80.com/
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse(u"http://\u0001\u0080.com/")).getURLString() == 'http://%01%C2%80.com/'

    #Add these unicode tests:
    # assert print(canonicalize(handyurl.parse(u'B\xfccher.ch:8080')).getURLString()) http://xn--bcher-kva.ch:8080/
    # assert print(canonicalize(handyurl.parse('☃.com')).getURLString()) == http://xn--n3h.com/
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse(u'B\xfccher.ch:8080')).getURLString() == 'http://xn--bcher-kva.ch:8080/'
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse('☃.com')).getURLString() == 'http://xn--n3h.com/'

    #Add these percent-encoded unicode tests
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.t%EF%BF%BD%04.82.net/")).getURLString() == 'http://www.t%EF%BF%BD%04.82.net/'
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://notrailingslash.com")).getURLString() == 'http://notrailingslash.com/'
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://www.gotaport.com:1234/")).getURLString() == 'http://www.gotaport.com:1234/'
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("  http://www.google.com/  ")).getURLString() == 'http://www.google.com/'
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http:// leadingspace.com/")).getURLString() == 'http://%20leadingspace.com/'
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://%20leadingspace.com/")).getURLString() == 'http://%20leadingspace.com/'
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("%20leadingspace.com/")).getURLString() == 'http://%20leadingspace.com/'
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("https://www.securesite.com/")).getURLString() == 'https://www.securesite.com/'
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://host.com/ab%23cd")).getURLString() == 'http://host.com/ab%23cd'
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("http://host.com//twoslashes?more//slashes")).getURLString() == 'http://host.com/twoslashes?more//slashes'
    assert surt.GoogleURLCanonicalizer.canonicalize(handyurl.parse("mailto:foo@example.com")).getURLString() == 'mailto:foo@example.com'
113 |
def test_attemptIPFormats():
    """attemptIPFormats() normalizes numeric hosts to dotted-quad IPv4,
    returning None for non-IP hosts and out-of-range quads.

    The cases are copied from GoogleURLCanonicalizerTest.java.
    """
    cases = [
        (None, None),
        (b"www.foo.com", None),
        (b"127.0.0.1", b'127.0.0.1'),
        (b"017.0.0.1", b'15.0.0.1'),           # octal octet
        (b"168.188.99.26", b'168.188.99.26'),
        (b"10.0.258", b'10.0.1.2'),            # java version returns null; ours resolves it
        (b"1.2.3.256", None),                  # octet out of range
        # ARC files from the wayback machine's liveweb proxy contain numeric
        # hostnames > 2^32 for some reason; we copy the java behavior.
        (b"39024579298", b'22.11.210.226'),
    ]
    for host, expected in cases:
        assert surt.GoogleURLCanonicalizer.attemptIPFormats(host) == expected
128 |
def test_unescapeRepeatedly():
    """unescapeRepeatedly() percent-decodes until a fixed point.

    The cases are copied from GoogleURLCanonicalizerTest.java.
    """
    cases = [
        (b"%!A%21%21%25", b'%!A!!%'),
        (b"%", b'%'),
        (b"%2", b'%2'),
        (b"%25", b'%'),
        (b"%25%", b'%%'),
        (b"%2525", b'%'),
        (b"%252525", b'%'),
        (b"%25%32%35", b'%'),
    ]
    for escaped, expected in cases:
        assert surt.GoogleURLCanonicalizer.unescapeRepeatedly(escaped) == expected
139 |
def test_IAURLCanonicalizer():
    """IAURLCanonicalizer: host lowercasing, www-stripping, default-port
    removal, trailing-slash removal; dns: URLs pass through unchanged.
    """
    # These tests are from IAURLCanonicalizerTest.java
    assert surt.IAURLCanonicalizer.canonicalize(handyurl.parse("http://ARCHIVE.ORG/")).getURLString() == 'http://archive.org/'
    assert surt.IAURLCanonicalizer.canonicalize(handyurl.parse("http://www.archive.org:80/")).getURLString() == 'http://archive.org/'
    # a non-default port (80 for https, 443 for http) is preserved
    assert surt.IAURLCanonicalizer.canonicalize(handyurl.parse("https://www.archive.org:80/")).getURLString() == 'https://archive.org:80/'
    assert surt.IAURLCanonicalizer.canonicalize(handyurl.parse("http://www.archive.org:443/")).getURLString() == 'http://archive.org:443/'
    assert surt.IAURLCanonicalizer.canonicalize(handyurl.parse("https://www.archive.org:443/")).getURLString() == 'https://archive.org/'
    assert surt.IAURLCanonicalizer.canonicalize(handyurl.parse("http://www.archive.org/big/")).getURLString() == 'http://archive.org/big'
    assert surt.IAURLCanonicalizer.canonicalize(handyurl.parse("dns:www.archive.org")).getURLString() == 'dns:www.archive.org'
149 |
def test_alphaReorderQuery():
    """alphaReorderQuery() sorts '&'-separated query args lexicographically.

    These tests are from IAURLCanonicalizerTest.java.
    """
    reorder = surt.IAURLCanonicalizer.alphaReorderQuery
    assert reorder(None) is None
    cases = [
        (b"", b''),
        (b"", b''),
        (b"a", b'a'),
        (b"ab", b'ab'),
        (b"a=1", b'a=1'),
        (b"ab=1", b'ab=1'),
        (b"a=1&", b'&a=1'),       # the empty arg sorts first
        (b"a=1&b=1", b'a=1&b=1'),
        (b"b=1&a=1", b'a=1&b=1'),
        (b"a=a&a=a", b'a=a&a=a'),
        (b"a=b&a=a", b'a=a&a=b'),
        (b"b=b&a=b&b=a&a=a", b'a=a&a=b&b=a&b=b'),
    ]
    for query, expected in cases:
        assert reorder(query) == expected
165 |
def test_massageHost():
    """massageHost() strips a leading www/wwwNN label from the host.

    These tests are from IAURLCanonicalizerTest.java.
    """
    massage = surt.IAURLCanonicalizer.massageHost
    for host, expected in [
        (b"foo.com", b'foo.com'),
        (b"www.foo.com", b'foo.com'),
        (b"www12.foo.com", b'foo.com'),
        (b"www2foo.com", b'www2foo.com'),       # not a label on its own
        (b"www2.www2foo.com", b'www2foo.com'),
    ]:
        assert massage(host) == expected
174 |
def test_getDefaultPort():
    """getDefaultPort() maps http->80, https->443, anything else->0.

    These tests are from IAURLCanonicalizerTest.java.
    """
    default_port = surt.IAURLCanonicalizer.getDefaultPort
    assert default_port(b"foo") == 0
    assert default_port(b"http") == 80
    assert default_port(b"https") == 443
180 |
def test_stripPathSessionID():
    """stripPathSessionID() removes ASP-style session-id path segments
    while leaving ordinary path components (e.g. '@' names) alone.
    """
    # These tests are from IAURLCanonicalizerTest.java
    # Check ASP_SESSIONID2:
    assert surt.URLRegexTransformer.stripPathSessionID(b"/(S(4hqa0555fwsecu455xqckv45))/mileg.aspx") == b'/mileg.aspx'

    # Check ASP_SESSIONID2 (again):
    assert surt.URLRegexTransformer.stripPathSessionID(b"/(4hqa0555fwsecu455xqckv45)/mileg.aspx") == b'/mileg.aspx'

    # Check ASP_SESSIONID3:
    assert surt.URLRegexTransformer.stripPathSessionID(b"/(a(4hqa0555fwsecu455xqckv45)S(4hqa0555fwsecu455xqckv45)f(4hqa0555fwsecu455xqckv45))/mileg.aspx?page=sessionschedules") == b'/mileg.aspx?page=sessionschedules'

    # '@' in path:
    assert surt.URLRegexTransformer.stripPathSessionID(b"/photos/36050182@N05/") == b'/photos/36050182@N05/'
194 |
195 |
def test_stripQuerySessionID():
    """stripQuerySessionID() drops session tokens from query strings:
    jsessionid / aspsessionidXXXX / phpsessid / sid / CFID+CFTOKEN,
    leaving the remaining key/value pairs intact.
    """
    #base = "http://www.archive.org/index.html"
    base = b""
    # jsessionid values are exactly 32 chars; longer ones must survive
    str32id = b"0123456789abcdefghijklemopqrstuv"
    url = base + b"?jsessionid=" + str32id
    assert surt.URLRegexTransformer.stripQuerySessionID(url) == b'?'

    # Test that we don't strip if not 32 chars only.
    url = base + b"?jsessionid=" + str32id + b'0'
    assert surt.URLRegexTransformer.stripQuerySessionID(url) == b'?jsessionid=0123456789abcdefghijklemopqrstuv0'

    # Test what happens when followed by another key/value pair.
    url = base + b"?jsessionid=" + str32id + b"&x=y"
    assert surt.URLRegexTransformer.stripQuerySessionID(url) == b'?x=y'

    # Test what happens when followed by another key/value pair and
    # prefixed by a key/value pair.
    url = base + b"?one=two&jsessionid=" + str32id + b"&x=y"
    assert surt.URLRegexTransformer.stripQuerySessionID(url) == b'?one=two&x=y'

    # Test what happens when prefixed by a key/value pair.
    url = base + b"?one=two&jsessionid=" + str32id
    assert surt.URLRegexTransformer.stripQuerySessionID(url) == b'?one=two&'

    # Test aspsession.
    url = base + b"?aspsessionidABCDEFGH=" + b"ABCDEFGHIJKLMNOPQRSTUVWX" + b"&x=y"
    assert surt.URLRegexTransformer.stripQuerySessionID(url) == b'?x=y'

    # Test archive phpsession.
    url = base + b"?phpsessid=" + str32id + b"&x=y"
    assert surt.URLRegexTransformer.stripQuerySessionID(url) == b'?x=y'

    # With prefix too.
    url = base + b"?one=two&phpsessid=" + str32id + b"&x=y"
    assert surt.URLRegexTransformer.stripQuerySessionID(url) == b'?one=two&x=y'

    # With only prefix
    url = base + b"?one=two&phpsessid=" + str32id
    assert surt.URLRegexTransformer.stripQuerySessionID(url) == b'?one=two&'

    # Test sid.
    url = base + b"?" + b"sid=9682993c8daa2c5497996114facdc805" + b"&x=y";
    assert surt.URLRegexTransformer.stripQuerySessionID(url) == b'?x=y'

    # Igor test.
    url = base + b"?" + b"sid=9682993c8daa2c5497996114facdc805" + b"&" + b"jsessionid=" + str32id
    assert surt.URLRegexTransformer.stripQuerySessionID(url) == b'?'

    url = b"?CFID=1169580&CFTOKEN=48630702&dtstamp=22%2F08%2F2006%7C06%3A58%3A11"
    assert surt.URLRegexTransformer.stripQuerySessionID(url) == b'?dtstamp=22%2F08%2F2006%7C06%3A58%3A11'

    url = b"?CFID=12412453&CFTOKEN=15501799&dt=19_08_2006_22_39_28"
    assert surt.URLRegexTransformer.stripQuerySessionID(url) == b'?dt=19_08_2006_22_39_28'

    url = b"?CFID=14475712&CFTOKEN=2D89F5AF-3048-2957-DA4EE4B6B13661AB&r=468710288378&m=forgotten"
    assert surt.URLRegexTransformer.stripQuerySessionID(url) == b'?r=468710288378&m=forgotten'

    url = b"?CFID=16603925&CFTOKEN=2AE13EEE-3048-85B0-56CEDAAB0ACA44B8"
    assert surt.URLRegexTransformer.stripQuerySessionID(url) == b'?'

    url = b"?CFID=4308017&CFTOKEN=63914124&requestID=200608200458360%2E39414378"
    assert surt.URLRegexTransformer.stripQuerySessionID(url) == b'?requestID=200608200458360%2E39414378'
258 |
@pytest.mark.parametrize("host_in,host_out", [
    (b"www.archive.org", [b"org,archive,www", b"org,archive,www"]),
    (b"123.123.net", [b"net,123,123", b"net,123,123"]),
    (b"100.100.100.100.org", [b"org,100,100,100,100", b"org,100,100,100,100"]),
    (b"123.45.167.89", [b"89,167,45,123", b"123.45.167.89"]),
    (b"10.162.1024.3", [b"3,1024,162,10", b"3,1024,162,10"]),
    # any four period-delimited 1-3 digit integers are interpreted as IP address, currently
    (b"990.991.992.993", [b"993,992,991,990", b"990.991.992.993"])
])
def test_hostToSURT(host_in, host_out):
    """hostToSURT reverses host labels; reverse_ipaddr (default True)
    controls whether IPv4-looking hosts are reversed too.
    """
    reversed_form, verbatim_form = host_out
    transform = surt.URLRegexTransformer.hostToSURT

    assert transform(host_in) == reversed_form
    assert transform(host_in, reverse_ipaddr=True) == reversed_form
    assert transform(host_in, reverse_ipaddr=False) == verbatim_form
274 |
def test_surt():
    """End-to-end tests of surt.surt(): special schemes, SURT host
    reversal, query reordering, and the with_scheme / trailing_comma /
    host_massage option combinations.
    """
    # These tests are from WaybackURLKeyMakerTest.java

    assert surt.surt(None) == '-'
    assert surt.surt('') == '-'
    # filedesc/warcinfo/dns URLs are passed through or kept scheme-first
    assert surt.surt("filedesc:foo.arc.gz") == 'filedesc:foo.arc.gz'
    assert surt.surt("filedesc:/foo.arc.gz") == 'filedesc:/foo.arc.gz'
    assert surt.surt("filedesc://foo.arc.gz") == 'filedesc://foo.arc.gz'
    assert surt.surt("warcinfo:foo.warc.gz") == 'warcinfo:foo.warc.gz'
    assert surt.surt("dns:alexa.com") == 'dns:alexa.com'
    assert surt.surt("dns:archive.org") == 'dns:archive.org'

    assert surt.surt("http://www.archive.org/") == 'org,archive)/'
    assert surt.surt("http://archive.org/") == 'org,archive)/'
    assert surt.surt("http://archive.org/goo/") == 'org,archive)/goo'
    assert surt.surt("http://archive.org/goo/?") == 'org,archive)/goo'
    assert surt.surt("http://archive.org/goo/?b&a") == 'org,archive)/goo?a&b'
    assert surt.surt("http://archive.org/goo/?a=2&b&a=1") == 'org,archive)/goo?a=1&a=2&b'

    # trailing comma mode
    assert surt.surt("http://archive.org/goo/?a=2&b&a=1", trailing_comma=True) == 'org,archive,)/goo?a=1&a=2&b'
    assert surt.surt("dns:archive.org", trailing_comma=True) == 'dns:archive.org'
    assert surt.surt("warcinfo:foo.warc.gz", trailing_comma=True) == 'warcinfo:foo.warc.gz'

    # PHP session id:
    assert surt.surt("http://archive.org/index.php?PHPSESSID=0123456789abcdefghijklemopqrstuv&action=profile;u=4221") == 'org,archive)/index.php?action=profile;u=4221'

    # WHOIS url:
    assert surt.surt("whois://whois.isoc.org.il/shaveh.co.il") == 'il,org,isoc,whois)/shaveh.co.il'

    # Yahoo web bug. See https://github.com/internetarchive/surt/issues/1
    assert surt.surt('http://visit.webhosting.yahoo.com/visit.gif?&r=http%3A//web.archive.org/web/20090517140029/http%3A//anthonystewarthead.electric-chi.com/&b=Netscape%205.0%20%28Windows%3B%20en-US%29&s=1366x768&o=Win32&c=24&j=true&v=1.2') == 'com,yahoo,webhosting,visit)/visit.gif?&b=netscape%205.0%20(windows;%20en-us)&c=24&j=true&o=win32&r=http://web.archive.org/web/20090517140029/http://anthonystewarthead.electric-chi.com/&s=1366x768&v=1.2'

    # Simple customization:
    assert surt.surt("http://www.example.com/", canonicalizer=lambda x, **opts: x) == 'com,example,www)/'
    assert surt.surt("mailto:foo@example.com") == 'mailto:foo@example.com'
    assert surt.surt("http://www.example.com/", with_scheme=True) == 'http://(com,example)/'
    assert surt.surt("http://www.example.com/", with_scheme=True, host_massage=True) == 'http://(com,example)/'
    assert surt.surt("http://www.example.com/", with_scheme=False) == 'com,example)/'
    assert surt.surt("http://www.example.com/", with_scheme=True, trailing_comma=True) == 'http://(com,example,)/'
    assert surt.surt("https://www.example.com/", with_scheme=True, trailing_comma=True) == 'https://(com,example,)/'
    assert surt.surt("ftp://www.example.com/", with_scheme=False, trailing_comma=True) == 'com,example,)/'
    assert surt.surt("ftp://www.example.com/", with_scheme=False, trailing_comma=False) == 'com,example)/'
    assert surt.surt("ftp://www.example.com/", with_scheme=True, trailing_comma=True) == 'ftp://(com,example,)/'
    # host_massage=False keeps the leading "www"
    assert surt.surt("http://www.example.com/", with_scheme=True, host_massage=False) == 'http://(com,example,www)/'
    assert surt.surt("http://www.example.com/", with_scheme=False, host_massage=False) == 'com,example,www)/'
    assert surt.surt("http://www.example.com/", with_scheme=True, trailing_comma=True, host_massage=False) == 'http://(com,example,www,)/'
    assert surt.surt("https://www.example.com/", with_scheme=True, trailing_comma=True, host_massage=False) == 'https://(com,example,www,)/'
    assert surt.surt("ftp://www.example.com/", with_scheme=True, trailing_comma=True, host_massage=False) == 'ftp://(com,example,www,)/'

    # hostless schemes ignore the SURT formatting options
    assert surt.surt("mailto:foo@example.com", with_scheme=True) == 'mailto:foo@example.com'
    assert surt.surt("mailto:foo@example.com", trailing_comma=True) == 'mailto:foo@example.com'
    assert surt.surt("mailto:foo@example.com", with_scheme=True, trailing_comma=True) == 'mailto:foo@example.com'
    assert surt.surt("dns:archive.org", with_scheme=True) == 'dns:archive.org'
    assert surt.surt("dns:archive.org", trailing_comma=True) == 'dns:archive.org'
    assert surt.surt("dns:archive.org", with_scheme=True, trailing_comma=True) == 'dns:archive.org'
    assert surt.surt("whois://whois.isoc.org.il/shaveh.co.il", with_scheme=True) == 'whois://(il,org,isoc,whois)/shaveh.co.il'
    assert surt.surt("whois://whois.isoc.org.il/shaveh.co.il", trailing_comma=True) == 'il,org,isoc,whois,)/shaveh.co.il'
    assert surt.surt("whois://whois.isoc.org.il/shaveh.co.il", trailing_comma=True, with_scheme=True) == 'whois://(il,org,isoc,whois,)/shaveh.co.il'
    assert surt.surt("warcinfo:foo.warc.gz", trailing_comma=True) == 'warcinfo:foo.warc.gz'
    assert surt.surt("warcinfo:foo.warc.gz", with_scheme=True) == 'warcinfo:foo.warc.gz'
    assert surt.surt("warcinfo:foo.warc.gz", with_scheme=True, trailing_comma=True) == 'warcinfo:foo.warc.gz'
337 |
@pytest.mark.xfail(reason="a bug not yet fixed for compatibility concern")
def test_surt_query():
    # NOTE(review): the expected value keeps '+' and '%26' in the query
    # value intact; the current implementation produces something else,
    # hence the xfail — confirm intended behavior before fixing.
    assert surt.surt("http://example.com/script?type=a+b+%26+c&grape=wine") \
        == "com,example)/script?grape=wine&type=a+b+%26+c"
342 |
@pytest.mark.parametrize("url,out", [
    ("http://example.com/app?item=Wroc%C5%82aw",
     "com,example)/app?item=wroc%c5%82aw")
])
def test_surt_nonascii(url, out):
    """non-ASCII %-encoded in unicode string input"""
    # the %-encoded bytes are lowercased but not decoded in the output
    assert surt.surt(url) == out
350 |
@pytest.mark.parametrize("url,opts,out", [
    ("http://www.example.com/", dict(reverse_ipaddr=False), "com,example)/"),
    ("http://192.168.1.254/info/", {}, "254,1,168,192)/info"),
    ("http://192.168.1.254/info/", dict(reverse_ipaddr=True), "254,1,168,192)/info"),
    ("http://192.168.1.254/info/", dict(reverse_ipaddr=False), "192.168.1.254)/info")
])
def test_surt_ipaddress(url, opts, out):
    """reverse_ipaddr (default True) controls octet reversal for IPv4
    hosts only; ordinary hostnames are unaffected by the option.
    """
    assert surt.surt(url, **opts) == out
359 |
@pytest.mark.parametrize("burl", [
    b'http://example.com/'
])
def test_surt_return_type(burl):
    """surt.surt() returns the same type of string object as its input
    (unicode string for unicode input, bytes for bytes input).

    Note this behavior may change in future versions. This test is for
    testing compatibility until that happens.
    """
    assert isinstance(burl, bytes)

    from_bytes = surt.surt(burl)
    assert type(from_bytes) is type(burl)

    uurl = burl.decode('ascii')
    from_text = surt.surt(uurl)
    assert type(from_text) is type(uurl)
378 |
def test_options():
    """query_lowercase=False disables query lowercasing in both the IA
    and default canonicalizers.
    """
    assert surt.IAURLCanonicalizer.canonicalize(handyurl.parse('http://example.com/foo?X=Y')).getURLString() == 'http://example.com/foo?x=y'
    assert surt.IAURLCanonicalizer.canonicalize(handyurl.parse('http://example.com/foo?X=Y'), query_lowercase=False).getURLString() == 'http://example.com/foo?X=Y'
    assert surt.DefaultIAURLCanonicalizer.canonicalize(handyurl.parse('http://example.com/foo?X=Y')).getURLString() == 'http://example.com/foo?x=y'
    assert surt.DefaultIAURLCanonicalizer.canonicalize(handyurl.parse('http://example.com/foo?X=Y'), query_lowercase=False).getURLString() == 'http://example.com/foo?X=Y'
384 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
1 | # Tox (http://tox.testrun.org/) is a tool for running tests
2 | # in multiple virtualenvs. This configuration file will run the
3 | # test suite on all supported python versions. To use it, "pip install tox"
4 | # and then run "tox" from this directory.
5 |
6 | [tox]
7 | envlist =
8 | py26, py27,
    py33, py34, py35, py36,
10 | pypy, pypy3,
11 |
12 | [testenv]
13 | deps =
14 | pytest
15 | commands = py.test -v {posargs}
16 |
17 | [testenv:cov]
18 | basepython = python2.7
19 | skip_install = true
20 | usedevelop = true
21 | deps =
22 | pytest
23 | pytest-cov
24 | commands = py.test --cov surt --cov-report term-missing {posargs}
25 |
--------------------------------------------------------------------------------