├── .coveragerc ├── .gitignore ├── .pre-commit-config.yaml ├── .travis.yml ├── Makefile ├── README.md ├── UNLICENSE ├── bin └── generate-tlds ├── pylintrc ├── requirements.txt ├── requirements_dev.txt ├── setup.py ├── tests ├── __init__.py ├── _urlparse_less_special_test.py ├── doc_test.py ├── encoding_test.py ├── search_test.py └── urllib_utf8_test.py ├── tox.ini └── yelp_uri ├── __init__.py ├── _urlparse_less_special.py ├── encoding.py ├── search.py ├── tlds ├── __init__.py ├── all.py └── common.py └── urllib_utf8.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | timid = True 4 | source = . 5 | omit = 6 | .tox/* 7 | /usr/* 8 | setup.py 9 | venv/* 10 | 11 | [report] 12 | exclude_lines = 13 | # Have to re-enable the standard pragma 14 | \#\s*pragma: no cover 15 | 16 | # Don't complain if tests don't hit defensive assertion code: 17 | ^\s*raise AssertionError\b 18 | ^\s*raise NotImplementedError\b 19 | ^\s*return NotImplemented\b 20 | ^\s*raise$ 21 | 22 | # Don't complain if non-runnable code isn't run: 23 | ^if __name__ == ['"]__main__['"]:$ 24 | 25 | [html] 26 | directory = coverage-html 27 | 28 | # vim:ft=dosini 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | *.iml 3 | *.py[co] 4 | .*.sw[a-z] 5 | .coverage 6 | .idea 7 | .pre-commit-files 8 | .project 9 | .pydevproject 10 | .tox 11 | .venv.touch 12 | /venv* 13 | coverage-html 14 | dist 15 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: git@github.com:pre-commit/pre-commit-hooks 3 | rev: v4.4.0 4 | hooks: 5 | - id: trailing-whitespace 6 | files: \.(py|sh|yaml)$ 7 | - id: end-of-file-fixer 8 | files: \.(py|sh|yaml)$ 9 | - id: check-yaml 10 | files: \.(yaml|yml)$ 11 | - id: debug-statements 12 | files: \.py$ 13 | - id: name-tests-test 14 | files: tests/.+\.py$ 15 | - id: fix-encoding-pragma 16 | args: 17 | - --remove 18 | language_version: python3.8 19 | - repo: http://github.com/asottile/reorder_python_imports 20 | rev: v3.10.0 21 | hooks: 22 | - id: reorder-python-imports 23 | - repo: http://github.com/asottile/pyupgrade 24 | rev: v3.10.1 25 | hooks: 26 | - id: pyupgrade 27 | args: ['--py38-plus'] 28 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | matrix: 3 | include: 4 | - env: TOXENV=py27 5 | - env: TOXENV=py36 6 | python: 3.6 7 | install: pip install tox 8 | script: tox 9 | cache: 10 | directories: 11 | - $HOME/.cache/pip 12 | - $HOME/.cache/pre-commit 13 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | export PATH := $(PWD)/bin:$(PWD)/venv/bin:$(PATH) 2 | 3 | REBUILD_FLAG = 4 | 5 | .PHONY: all 6 | all: venv test 7 | 8 | venv: .venv.touch 9 | rm -rf venv 10 | virtualenv venv --python python3.8 11 | pip install -r requirements_dev.txt 12 | 13 | .PHONY: tests test 14 | tests: test 15 | test: venv 16 | tox $(REBUILD_FLAG) 17 | 18 | 19 | .venv.touch: setup.py requirements.txt requirements_dev.txt 20 | $(eval REBUILD_FLAG := --recreate) 21 | touch .venv.touch 22 | 23 | 24 | 
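# .venv.touch is a timestamp sentinel: the rule above re-touches it whenever setup.py or the
# requirements files change, which also sets REBUILD_FLAG to --recreate so the next tox run
# rebuilds its virtualenvs against the updated dependency lists.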
.PHONY: clean 25 | clean: 26 | find . -iname '*.pyc' | xargs rm -f 27 | rm -rf .tox 28 | rm -rf ./venv 29 | rm -f .venv.touch 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # yelp\_uri 2 | 3 | [![Build Status](https://travis-ci.org/Yelp/yelp_uri.svg)](https://travis-ci.org/Yelp/yelp\_uri) 4 | 5 | 6 | ## Installation 7 | 8 | For a primer on pip and virtualenv, see the [Python Packaging User Guide](https://python-packaging-user-guide.readthedocs.org/en/latest/tutorial.html). 9 | 10 | TL;DR: `pip install yelp_uri` 11 | 12 | 13 | ## Usage 14 | 15 | Make a well-encoded URI from user input. 16 | 17 | ```python 18 | >>> weird_uri = 'http://münch.com/münch?one=m%C3%BCnch#m%FCnch' 19 | 20 | >>> import yelp_uri.encoding as E 21 | >>> well_encoded = E.recode_uri(weird_uri) 22 | >>> print(well_encoded) 23 | http://xn--mnch-0ra.com/m%C3%BCnch?one=m%C3%BCnch#m%C3%BCnch 24 | 25 | ``` 26 | 27 | Make a user-readable url, from either a well-encoded url or user input: 28 | 29 | ```python 30 | >>> print(E.decode_uri(well_encoded)) 31 | http://münch.com/münch?one=münch#münch 32 | >>> print(E.decode_uri(weird_uri)) 33 | http://münch.com/münch?one=münch#münch 34 | 35 | ``` 36 | 37 | 38 | 39 | `yelp_uri.search` has regexes for finding URLs in user-generated plaintext. 40 | 41 | ```python 42 | >>> plaintext = ''' 43 | ... Reference: http://en.wikipedia.org/wiki/Eon_(geology) 44 | ... Follow @YelpCincy on Twitter (http://twitter.com/YelpCincy) 45 | ... ''' 46 | >>> from yelp_uri.search import url_regex 47 | >>> for url in url_regex.finditer(plaintext): print(url.group()) 48 | http://en.wikipedia.org/wiki/Eon_(geology) 49 | http://twitter.com/YelpCincy 50 | 51 | ``` 52 | -------------------------------------------------------------------------------- /UNLICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 
23 | 24 | For more information, please refer to 25 | -------------------------------------------------------------------------------- /bin/generate-tlds: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | This reads the IANA-maintained list of tlds and formats/outputs them for use 4 | in the domains regular expression. To regenerate: 5 | ./bin/generate-tlds > yelp_uri/tlds/all.py 6 | """ 7 | import sys 8 | 9 | import urllib2 10 | 11 | 12 | def main(url='http://data.iana.org/TLD/tlds-alpha-by-domain.txt'): 13 | try: 14 | domain_data = urllib2.urlopen(url) 15 | except urllib2.URLError as e: 16 | print( 17 | "Could not get the domains from the given URL. Perhaps the IANA" 18 | "has changed the location of the file or it no longer exists." 19 | ) 20 | return e.reason 21 | 22 | # Convert all newlines except the last one to '|', so 'foo\nbar\n' -> 'foo|bar'. 23 | # Ignores all lines starting with '#', which is a comment in the text file. 24 | data = ( 25 | line.lower() 26 | for line in domain_data.read().splitlines() 27 | if not line.startswith("#") and line.strip() 28 | ) 29 | 30 | tlds = set() 31 | for datum in data: 32 | # get both the punycoded and unicoded versions: 33 | tlds.add(datum.decode('utf-8')) 34 | tlds.add(datum.decode('idna')) 35 | 36 | domains_string = "',\n '".join(sorted(tlds)) 37 | 38 | print('''\ 39 | # -*- coding: utf-8 -*- 40 | from __future__ import unicode_literals 41 | # Generated automatically. To regenerate: 42 | # ./bin/generate-tlds > yelp_uri/tlds/all.py 43 | all_tlds = '|'.join(( 44 | '{}', 45 | ))'''.format(domains_string).encode('UTF-8')) 46 | 47 | 48 | if __name__ == "__main__": 49 | sys.exit(main()) 50 | -------------------------------------------------------------------------------- /pylintrc: -------------------------------------------------------------------------------- 1 | [MESSAGES CONTROL] 2 | disable= 3 | locally-disabled, 4 | missing-docstring, 5 | maybe-no-member, 6 | redefined-variable-type, 7 | redundant-keyword-arg, 8 | too-many-function-args, 9 | 10 | 11 | [REPORTS] 12 | output-format=colorized 13 | reports=no 14 | 15 | [BASIC] 16 | #const-rgx=(([A-Za-z_][A-Za-z0-9_]*)|(__.*__))$ 17 | const-rgx=(([A-Za-z_][A-Za-z0-9_]*)|(__.*__))$ 18 | 19 | #function-rgx=[a-z_][a-z0-9_]{2,30}$ 20 | function-rgx=[a-z_][a-z0-9_]{2,60}$ 21 | 22 | #method-rgx=[a-z_][a-z0-9_]{2,30}$ 23 | method-rgx=(%(function-rgx)s|%(const-rgx)s) 24 | 25 | #variable-rgx=[a-z_][a-z0-9_]{2,30}$ 26 | variable-rgx=[a-z_][a-z0-9_]{0,30}$ 27 | 28 | [FORMAT] 29 | max-line-length=131 30 | 31 | [TYPECHECK] 32 | ignored-classes= 33 | pytest, 34 | RFC3986, 35 | _MovedItems, 36 | 37 | [DESIGN] 38 | min-public-methods=0 39 | 40 | # vim:ft=dosini: 41 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | . 
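# '.' installs yelp_uri itself from this checkout, so environments built from this file exercise the local source.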
2 | -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- 1 | coverage 2 | flake8 3 | mock 4 | pytest 5 | pre-commit 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages 2 | from setuptools import setup 3 | 4 | 5 | def main(): 6 | setup( 7 | name='yelp_uri', 8 | version='2.0.1', 9 | description="Uri utilities maintained by Yelp", 10 | url='https://github.com/Yelp/yelp_uri', 11 | author='Buck Golemon', 12 | author_email='buck@yelp.com', 13 | platforms='all', 14 | classifiers=[ 15 | 'License :: Public Domain', 16 | 'Programming Language :: Python :: 3.8', 17 | ], 18 | packages=find_packages(exclude=('tests*',)), 19 | install_requires=[ 20 | 'yelp_encodings', 21 | 'yelp_bytes' 22 | ], 23 | options={ 24 | 'bdist_wheel': { 25 | 'universal': 1, 26 | } 27 | }, 28 | ) 29 | 30 | 31 | if __name__ == '__main__': 32 | exit(main()) 33 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yelp/yelp_uri/8688042e6579bc235e8b2ddd2b552c5be84ba674/tests/__init__.py -------------------------------------------------------------------------------- /tests/_urlparse_less_special_test.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """Tests are also pulled from stdlib 2.6 3 | This file is space-indented to ease merging from upstream. 4 | 5 | http://hg.python.org/cpython/raw-file/4a17784f2fee/Lib/test/test_urlparse.py 6 | """ 7 | import unittest 8 | 9 | import yelp_uri._urlparse_less_special as urlparse 10 | 11 | 12 | RFC1808_BASE = "http://a/b/c/d;p?q#f" 13 | RFC2396_BASE = "http://a/b/c/d;p?q" 14 | RFC3986_BASE = 'http://a/b/c/d;p?q' 15 | SIMPLE_BASE = 'http://a/b/c/d' 16 | 17 | # A list of test cases. Each test case is a two-tuple that contains 18 | # a string with the query and the expected result (a list for parse_qsl, a dict for parse_qs).
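# Note: the expected values below assume keep_blank_values=True (which is how test_qsl and
# test_qs invoke the parsers); that is why inputs like '=' and 'a=' map to empty-string
# values instead of being dropped.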
19 | 20 | parse_qsl_test_cases = [ 21 | ("", []), 22 | ("&", []), 23 | ("&&", []), 24 | ("=", [('', '')]), 25 | ("=a", [('', 'a')]), 26 | ("a", [('a', '')]), 27 | ("a=", [('a', '')]), 28 | ("a=", [('a', '')]), 29 | ("&a=b", [('a', 'b')]), 30 | ("a=a+b&b=b+c", [('a', 'a b'), ('b', 'b c')]), 31 | ("a=1&a=2", [('a', '1'), ('a', '2')]), 32 | ] 33 | 34 | parse_qs_test_cases = [ 35 | ("", {}), 36 | ("&", {}), 37 | ("&&", {}), 38 | ("=", {'': ['']}), 39 | ("=a", {'': ['a']}), 40 | ("a", {'a': ['']}), 41 | ("a=", {'a': ['']}), 42 | ("&a=b", {'a': ['b']}), 43 | ("a=a+b&b=b+c", {'a': ['a b'], 'b': ['b c']}), 44 | ("a=1&a=2", {'a': ['1', '2']}), 45 | ] 46 | 47 | 48 | class UrlParseTestCase(unittest.TestCase): 49 | def checkRoundtrips(self, url, parsed, split): 50 | result = urlparse.urlparse(url) 51 | self.assertEqual(result, parsed) 52 | t = (result.scheme, result.netloc, result.path, 53 | result.params, result.query, result.fragment) 54 | self.assertEqual(t, parsed) 55 | # put it back together and it should be the same 56 | result2 = urlparse.urlunparse(result) 57 | self.assertEqual(result2, url) 58 | self.assertEqual(result2, result.geturl()) 59 | 60 | # the result of geturl() is a fixpoint; we can always parse it 61 | # again to get the same result: 62 | result3 = urlparse.urlparse(result.geturl()) 63 | self.assertEqual(result3.geturl(), result.geturl()) 64 | self.assertEqual(result3, result) 65 | self.assertEqual(result3.scheme, result.scheme) 66 | self.assertEqual(result3.netloc, result.netloc) 67 | self.assertEqual(result3.path, result.path) 68 | self.assertEqual(result3.params, result.params) 69 | self.assertEqual(result3.query, result.query) 70 | self.assertEqual(result3.fragment, result.fragment) 71 | self.assertEqual(result3.username, result.username) 72 | self.assertEqual(result3.password, result.password) 73 | self.assertEqual(result3.hostname, result.hostname) 74 | self.assertEqual(result3.port, result.port) 75 | 76 | # check the roundtrip using urlsplit() as well 77 | result = urlparse.urlsplit(url) 78 | self.assertEqual(result, split) 79 | t = (result.scheme, result.netloc, result.path, 80 | result.query, result.fragment) 81 | self.assertEqual(t, split) 82 | result2 = urlparse.urlunsplit(result) 83 | self.assertEqual(result2, url) 84 | self.assertEqual(result2, result.geturl()) 85 | 86 | # check the fixpoint property of re-parsing the result of geturl() 87 | result3 = urlparse.urlsplit(result.geturl()) 88 | self.assertEqual(result3.geturl(), result.geturl()) 89 | self.assertEqual(result3, result) 90 | self.assertEqual(result3.scheme, result.scheme) 91 | self.assertEqual(result3.netloc, result.netloc) 92 | self.assertEqual(result3.path, result.path) 93 | self.assertEqual(result3.query, result.query) 94 | self.assertEqual(result3.fragment, result.fragment) 95 | self.assertEqual(result3.username, result.username) 96 | self.assertEqual(result3.password, result.password) 97 | self.assertEqual(result3.hostname, result.hostname) 98 | self.assertEqual(result3.port, result.port) 99 | 100 | def test_qsl(self): 101 | for orig, expect in parse_qsl_test_cases: 102 | result = urlparse.parse_qsl(orig, keep_blank_values=True) 103 | self.assertEqual(result, expect, "Error parsing %s" % repr(orig)) 104 | 105 | def test_qs(self): 106 | for orig, expect in parse_qs_test_cases: 107 | result = urlparse.parse_qs(orig, keep_blank_values=True) 108 | self.assertEqual(result, expect, "Error parsing %s" % repr(orig)) 109 | 110 | def test_roundtrips(self): 111 | testcases = [ 112 | ('file:///tmp/junk.txt', 113 | 
('file', '', '/tmp/junk.txt', '', '', ''), 114 | ('file', '', '/tmp/junk.txt', '', '')), 115 | ('imap://mail.python.org/mbox1', 116 | ('imap', 'mail.python.org', '/mbox1', '', '', ''), 117 | ('imap', 'mail.python.org', '/mbox1', '', '')), 118 | ('mms://wms.sys.hinet.net/cts/Drama/09006251100.asf', 119 | ('mms', 'wms.sys.hinet.net', '/cts/Drama/09006251100.asf', 120 | '', '', ''), 121 | ('mms', 'wms.sys.hinet.net', '/cts/Drama/09006251100.asf', 122 | '', '')), 123 | ('svn+ssh://svn.zope.org/repos/main/ZConfig/trunk/', 124 | ('svn+ssh', 'svn.zope.org', '/repos/main/ZConfig/trunk/', 125 | '', '', ''), 126 | ('svn+ssh', 'svn.zope.org', '/repos/main/ZConfig/trunk/', 127 | '', '')), 128 | ('git+ssh://git@github.com/user/project.git', 129 | ('git+ssh', 'git@github.com', '/user/project.git', 130 | '', '', ''), 131 | ('git+ssh', 'git@github.com', '/user/project.git', 132 | '', '')) 133 | ] 134 | for url, parsed, split in testcases: 135 | self.checkRoundtrips(url, parsed, split) 136 | 137 | def test_http_roundtrips(self): 138 | # urlparse.urlsplit treats 'http:' as an optimized special case, 139 | # so we test both 'http:' and 'https:' in all the following. 140 | # Three cheers for white box knowledge! 141 | testcases = [ 142 | ('://www.python.org', 143 | ('www.python.org', '', '', '', ''), 144 | ('www.python.org', '', '', '')), 145 | ('://www.python.org#abc', 146 | ('www.python.org', '', '', '', 'abc'), 147 | ('www.python.org', '', '', 'abc')), 148 | ('://www.python.org?q=abc', 149 | ('www.python.org', '', '', 'q=abc', ''), 150 | ('www.python.org', '', 'q=abc', '')), 151 | ('://www.python.org/#abc', 152 | ('www.python.org', '/', '', '', 'abc'), 153 | ('www.python.org', '/', '', 'abc')), 154 | ('://a/b/c/d;p?q#f', 155 | ('a', '/b/c/d', 'p', 'q', 'f'), 156 | ('a', '/b/c/d;p', 'q', 'f')), 157 | ] 158 | for scheme in ('http', 'https'): 159 | for url, parsed, split in testcases: 160 | url = scheme + url 161 | parsed = (scheme,) + parsed 162 | split = (scheme,) + split 163 | self.checkRoundtrips(url, parsed, split) 164 | 165 | def checkJoin(self, base, relurl, expected): 166 | self.assertEqual(urlparse.urljoin(base, relurl), expected, 167 | (base, relurl, expected)) 168 | 169 | def test_unparse_parse(self): 170 | for u in ['Python', './Python', 'x-newscheme://foo.com/stuff', 'x://y', 'x:/y', 'x:/', '/', ]: 171 | self.assertEqual(urlparse.urlunsplit(urlparse.urlsplit(u)), u) 172 | self.assertEqual(urlparse.urlunparse(urlparse.urlparse(u)), u) 173 | 174 | def test_RFC1808(self): 175 | # "normal" cases from RFC 1808: 176 | self.checkJoin(RFC1808_BASE, 'g:h', 'g:h') 177 | self.checkJoin(RFC1808_BASE, 'g', 'http://a/b/c/g') 178 | self.checkJoin(RFC1808_BASE, './g', 'http://a/b/c/g') 179 | self.checkJoin(RFC1808_BASE, 'g/', 'http://a/b/c/g/') 180 | self.checkJoin(RFC1808_BASE, '/g', 'http://a/g') 181 | self.checkJoin(RFC1808_BASE, '//g', 'http://g') 182 | self.checkJoin(RFC1808_BASE, 'g?y', 'http://a/b/c/g?y') 183 | self.checkJoin(RFC1808_BASE, 'g?y/./x', 'http://a/b/c/g?y/./x') 184 | self.checkJoin(RFC1808_BASE, '#s', 'http://a/b/c/d;p?q#s') 185 | self.checkJoin(RFC1808_BASE, 'g#s', 'http://a/b/c/g#s') 186 | self.checkJoin(RFC1808_BASE, 'g#s/./x', 'http://a/b/c/g#s/./x') 187 | self.checkJoin(RFC1808_BASE, 'g?y#s', 'http://a/b/c/g?y#s') 188 | self.checkJoin(RFC1808_BASE, 'g;x', 'http://a/b/c/g;x') 189 | self.checkJoin(RFC1808_BASE, 'g;x?y#s', 'http://a/b/c/g;x?y#s') 190 | self.checkJoin(RFC1808_BASE, '.', 'http://a/b/c/') 191 | self.checkJoin(RFC1808_BASE, './', 'http://a/b/c/') 192 | 
self.checkJoin(RFC1808_BASE, '..', 'http://a/b/') 193 | self.checkJoin(RFC1808_BASE, '../', 'http://a/b/') 194 | self.checkJoin(RFC1808_BASE, '../g', 'http://a/b/g') 195 | self.checkJoin(RFC1808_BASE, '../..', 'http://a/') 196 | self.checkJoin(RFC1808_BASE, '../../', 'http://a/') 197 | self.checkJoin(RFC1808_BASE, '../../g', 'http://a/g') 198 | 199 | # "abnormal" cases from RFC 1808: 200 | self.checkJoin(RFC1808_BASE, '', 'http://a/b/c/d;p?q#f') 201 | self.checkJoin(RFC1808_BASE, '../../../g', 'http://a/../g') 202 | self.checkJoin(RFC1808_BASE, '../../../../g', 'http://a/../../g') 203 | self.checkJoin(RFC1808_BASE, '/./g', 'http://a/./g') 204 | self.checkJoin(RFC1808_BASE, '/../g', 'http://a/../g') 205 | self.checkJoin(RFC1808_BASE, 'g.', 'http://a/b/c/g.') 206 | self.checkJoin(RFC1808_BASE, '.g', 'http://a/b/c/.g') 207 | self.checkJoin(RFC1808_BASE, 'g..', 'http://a/b/c/g..') 208 | self.checkJoin(RFC1808_BASE, '..g', 'http://a/b/c/..g') 209 | self.checkJoin(RFC1808_BASE, './../g', 'http://a/b/g') 210 | self.checkJoin(RFC1808_BASE, './g/.', 'http://a/b/c/g/') 211 | self.checkJoin(RFC1808_BASE, 'g/./h', 'http://a/b/c/g/h') 212 | self.checkJoin(RFC1808_BASE, 'g/../h', 'http://a/b/c/h') 213 | 214 | # RFC 1808 and RFC 1630 disagree on these (according to RFC 1808), 215 | # so we'll not actually run these tests (which expect 1808 behavior). 216 | # self.checkJoin(RFC1808_BASE, 'http:g', 'http:g') 217 | # self.checkJoin(RFC1808_BASE, 'http:', 'http:') 218 | 219 | def test_RFC2396(self): 220 | # cases from RFC 2396 221 | 222 | self.checkJoin(RFC2396_BASE, 'g:h', 'g:h') 223 | self.checkJoin(RFC2396_BASE, 'g', 'http://a/b/c/g') 224 | self.checkJoin(RFC2396_BASE, './g', 'http://a/b/c/g') 225 | self.checkJoin(RFC2396_BASE, 'g/', 'http://a/b/c/g/') 226 | self.checkJoin(RFC2396_BASE, '/g', 'http://a/g') 227 | self.checkJoin(RFC2396_BASE, '//g', 'http://g') 228 | self.checkJoin(RFC2396_BASE, 'g?y', 'http://a/b/c/g?y') 229 | self.checkJoin(RFC2396_BASE, '#s', 'http://a/b/c/d;p?q#s') 230 | self.checkJoin(RFC2396_BASE, 'g#s', 'http://a/b/c/g#s') 231 | self.checkJoin(RFC2396_BASE, 'g?y#s', 'http://a/b/c/g?y#s') 232 | self.checkJoin(RFC2396_BASE, 'g;x', 'http://a/b/c/g;x') 233 | self.checkJoin(RFC2396_BASE, 'g;x?y#s', 'http://a/b/c/g;x?y#s') 234 | self.checkJoin(RFC2396_BASE, '.', 'http://a/b/c/') 235 | self.checkJoin(RFC2396_BASE, './', 'http://a/b/c/') 236 | self.checkJoin(RFC2396_BASE, '..', 'http://a/b/') 237 | self.checkJoin(RFC2396_BASE, '../', 'http://a/b/') 238 | self.checkJoin(RFC2396_BASE, '../g', 'http://a/b/g') 239 | self.checkJoin(RFC2396_BASE, '../..', 'http://a/') 240 | self.checkJoin(RFC2396_BASE, '../../', 'http://a/') 241 | self.checkJoin(RFC2396_BASE, '../../g', 'http://a/g') 242 | self.checkJoin(RFC2396_BASE, '', RFC2396_BASE) 243 | self.checkJoin(RFC2396_BASE, '../../../g', 'http://a/../g') 244 | self.checkJoin(RFC2396_BASE, '../../../../g', 'http://a/../../g') 245 | self.checkJoin(RFC2396_BASE, '/./g', 'http://a/./g') 246 | self.checkJoin(RFC2396_BASE, '/../g', 'http://a/../g') 247 | self.checkJoin(RFC2396_BASE, 'g.', 'http://a/b/c/g.') 248 | self.checkJoin(RFC2396_BASE, '.g', 'http://a/b/c/.g') 249 | self.checkJoin(RFC2396_BASE, 'g..', 'http://a/b/c/g..') 250 | self.checkJoin(RFC2396_BASE, '..g', 'http://a/b/c/..g') 251 | self.checkJoin(RFC2396_BASE, './../g', 'http://a/b/g') 252 | self.checkJoin(RFC2396_BASE, './g/.', 'http://a/b/c/g/') 253 | self.checkJoin(RFC2396_BASE, 'g/./h', 'http://a/b/c/g/h') 254 | self.checkJoin(RFC2396_BASE, 'g/../h', 'http://a/b/c/h') 255 | 
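# Path parameters (';x=1') belong to their path segment, so dot-segment resolution keeps or
# removes them together with that segment: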
self.checkJoin(RFC2396_BASE, 'g;x=1/./y', 'http://a/b/c/g;x=1/y') 256 | self.checkJoin(RFC2396_BASE, 'g;x=1/../y', 'http://a/b/c/y') 257 | self.checkJoin(RFC2396_BASE, 'g?y/./x', 'http://a/b/c/g?y/./x') 258 | self.checkJoin(RFC2396_BASE, 'g?y/../x', 'http://a/b/c/g?y/../x') 259 | self.checkJoin(RFC2396_BASE, 'g#s/./x', 'http://a/b/c/g#s/./x') 260 | self.checkJoin(RFC2396_BASE, 'g#s/../x', 'http://a/b/c/g#s/../x') 261 | 262 | def test_RFC3986(self): 263 | # Test cases from RFC3986 264 | self.checkJoin(RFC3986_BASE, '?y', 'http://a/b/c/d;p?y') 265 | self.checkJoin(RFC2396_BASE, ';x', 'http://a/b/c/;x') 266 | self.checkJoin(RFC3986_BASE, 'g:h', 'g:h') 267 | self.checkJoin(RFC3986_BASE, 'g', 'http://a/b/c/g') 268 | self.checkJoin(RFC3986_BASE, './g', 'http://a/b/c/g') 269 | self.checkJoin(RFC3986_BASE, 'g/', 'http://a/b/c/g/') 270 | self.checkJoin(RFC3986_BASE, '/g', 'http://a/g') 271 | self.checkJoin(RFC3986_BASE, '//g', 'http://g') 272 | self.checkJoin(RFC3986_BASE, '?y', 'http://a/b/c/d;p?y') 273 | self.checkJoin(RFC3986_BASE, 'g?y', 'http://a/b/c/g?y') 274 | self.checkJoin(RFC3986_BASE, '#s', 'http://a/b/c/d;p?q#s') 275 | self.checkJoin(RFC3986_BASE, 'g#s', 'http://a/b/c/g#s') 276 | self.checkJoin(RFC3986_BASE, 'g?y#s', 'http://a/b/c/g?y#s') 277 | self.checkJoin(RFC3986_BASE, ';x', 'http://a/b/c/;x') 278 | self.checkJoin(RFC3986_BASE, 'g;x', 'http://a/b/c/g;x') 279 | self.checkJoin(RFC3986_BASE, 'g;x?y#s', 'http://a/b/c/g;x?y#s') 280 | self.checkJoin(RFC3986_BASE, '', 'http://a/b/c/d;p?q') 281 | self.checkJoin(RFC3986_BASE, '.', 'http://a/b/c/') 282 | self.checkJoin(RFC3986_BASE, './', 'http://a/b/c/') 283 | self.checkJoin(RFC3986_BASE, '..', 'http://a/b/') 284 | self.checkJoin(RFC3986_BASE, '../', 'http://a/b/') 285 | self.checkJoin(RFC3986_BASE, '../g', 'http://a/b/g') 286 | self.checkJoin(RFC3986_BASE, '../..', 'http://a/') 287 | self.checkJoin(RFC3986_BASE, '../../', 'http://a/') 288 | self.checkJoin(RFC3986_BASE, '../../g', 'http://a/g') 289 | 290 | # Abnormal Examples 291 | 292 | # The 'abnormal scenarios' are incompatible with RFC2986 parsing 293 | # Tests are here for reference. 
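# Under RFC 3986's 'abnormal examples', surplus '..' segments and leading '/./' or '/../'
# are collapsed away, so all four joins below would resolve to 'http://a/g'; this relaxed
# parser instead preserves them, matching the RFC 1808/2396 expectations tested above.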
294 | 295 | # self.checkJoin(RFC3986_BASE, '../../../g','http://a/g') 296 | # self.checkJoin(RFC3986_BASE, '../../../../g','http://a/g') 297 | # self.checkJoin(RFC3986_BASE, '/./g','http://a/g') 298 | # self.checkJoin(RFC3986_BASE, '/../g','http://a/g') 299 | 300 | self.checkJoin(RFC3986_BASE, 'g.', 'http://a/b/c/g.') 301 | self.checkJoin(RFC3986_BASE, '.g', 'http://a/b/c/.g') 302 | self.checkJoin(RFC3986_BASE, 'g..', 'http://a/b/c/g..') 303 | self.checkJoin(RFC3986_BASE, '..g', 'http://a/b/c/..g') 304 | self.checkJoin(RFC3986_BASE, './../g', 'http://a/b/g') 305 | self.checkJoin(RFC3986_BASE, './g/.', 'http://a/b/c/g/') 306 | self.checkJoin(RFC3986_BASE, 'g/./h', 'http://a/b/c/g/h') 307 | self.checkJoin(RFC3986_BASE, 'g/../h', 'http://a/b/c/h') 308 | self.checkJoin(RFC3986_BASE, 'g;x=1/./y', 'http://a/b/c/g;x=1/y') 309 | self.checkJoin(RFC3986_BASE, 'g;x=1/../y', 'http://a/b/c/y') 310 | self.checkJoin(RFC3986_BASE, 'g?y/./x', 'http://a/b/c/g?y/./x') 311 | self.checkJoin(RFC3986_BASE, 'g?y/../x', 'http://a/b/c/g?y/../x') 312 | self.checkJoin(RFC3986_BASE, 'g#s/./x', 'http://a/b/c/g#s/./x') 313 | self.checkJoin(RFC3986_BASE, 'g#s/../x', 'http://a/b/c/g#s/../x') 314 | # self.checkJoin(RFC3986_BASE, 'http:g','http:g') # strict parser 315 | self.checkJoin(RFC3986_BASE, 'http:g', 'http://a/b/c/g') # relaxed parser 316 | 317 | def test_urljoins(self): 318 | self.checkJoin(SIMPLE_BASE, 'g:h', 'g:h') 319 | self.checkJoin(SIMPLE_BASE, 'http:g', 'http://a/b/c/g') 320 | self.checkJoin(SIMPLE_BASE, 'http:', 'http://a/b/c/d') 321 | self.checkJoin(SIMPLE_BASE, 'g', 'http://a/b/c/g') 322 | self.checkJoin(SIMPLE_BASE, './g', 'http://a/b/c/g') 323 | self.checkJoin(SIMPLE_BASE, 'g/', 'http://a/b/c/g/') 324 | self.checkJoin(SIMPLE_BASE, '/g', 'http://a/g') 325 | self.checkJoin(SIMPLE_BASE, '//g', 'http://g') 326 | self.checkJoin(SIMPLE_BASE, '?y', 'http://a/b/c/d?y') 327 | self.checkJoin(SIMPLE_BASE, 'g?y', 'http://a/b/c/g?y') 328 | self.checkJoin(SIMPLE_BASE, 'g?y/./x', 'http://a/b/c/g?y/./x') 329 | self.checkJoin(SIMPLE_BASE, '.', 'http://a/b/c/') 330 | self.checkJoin(SIMPLE_BASE, './', 'http://a/b/c/') 331 | self.checkJoin(SIMPLE_BASE, '..', 'http://a/b/') 332 | self.checkJoin(SIMPLE_BASE, '../', 'http://a/b/') 333 | self.checkJoin(SIMPLE_BASE, '../g', 'http://a/b/g') 334 | self.checkJoin(SIMPLE_BASE, '../..', 'http://a/') 335 | self.checkJoin(SIMPLE_BASE, '../../g', 'http://a/g') 336 | self.checkJoin(SIMPLE_BASE, '../../../g', 'http://a/../g') 337 | self.checkJoin(SIMPLE_BASE, './../g', 'http://a/b/g') 338 | self.checkJoin(SIMPLE_BASE, './g/.', 'http://a/b/c/g/') 339 | self.checkJoin(SIMPLE_BASE, '/./g', 'http://a/./g') 340 | self.checkJoin(SIMPLE_BASE, 'g/./h', 'http://a/b/c/g/h') 341 | self.checkJoin(SIMPLE_BASE, 'g/../h', 'http://a/b/c/h') 342 | self.checkJoin(SIMPLE_BASE, 'http:g', 'http://a/b/c/g') 343 | self.checkJoin(SIMPLE_BASE, 'http:', 'http://a/b/c/d') 344 | self.checkJoin(SIMPLE_BASE, 'http:?y', 'http://a/b/c/d?y') 345 | self.checkJoin(SIMPLE_BASE, 'http:g?y', 'http://a/b/c/g?y') 346 | self.checkJoin(SIMPLE_BASE, 'http:g?y/./x', 'http://a/b/c/g?y/./x') 347 | 348 | def test_urldefrag(self): 349 | for url, defrag, frag in [ 350 | ('http://python.org#frag', 'http://python.org', 'frag'), 351 | ('http://python.org', 'http://python.org', ''), 352 | ('http://python.org/#frag', 'http://python.org/', 'frag'), 353 | ('http://python.org/', 'http://python.org/', ''), 354 | ('http://python.org/?q#frag', 'http://python.org/?q', 'frag'), 355 | ('http://python.org/?q', 'http://python.org/?q', ''), 356 | 
('http://python.org/p#frag', 'http://python.org/p', 'frag'), 357 | ('http://python.org/p?q', 'http://python.org/p?q', ''), 358 | (RFC1808_BASE, 'http://a/b/c/d;p?q', 'f'), 359 | (RFC2396_BASE, 'http://a/b/c/d;p?q', ''), 360 | ]: 361 | self.assertEqual(urlparse.urldefrag(url), (defrag, frag)) 362 | 363 | def test_urlsplit_attributes(self): 364 | url = "HTTP://WWW.PYTHON.ORG/doc/#frag" 365 | p = urlparse.urlsplit(url) 366 | self.assertEqual(p.scheme, "http") 367 | self.assertEqual(p.netloc, "WWW.PYTHON.ORG") 368 | self.assertEqual(p.path, "/doc/") 369 | self.assertEqual(p.query, "") 370 | self.assertEqual(p.fragment, "frag") 371 | self.assertEqual(p.username, None) 372 | self.assertEqual(p.password, None) 373 | self.assertEqual(p.hostname, "WWW.PYTHON.ORG") 374 | self.assertEqual(p.port, None) 375 | # geturl() won't return exactly the original URL in this case 376 | # since the scheme is always case-normalized 377 | # self.assertEqual(p.geturl(), url) 378 | 379 | url = "http://User:Pass@www.python.org:080/doc/?query=yes#frag" 380 | p = urlparse.urlsplit(url) 381 | self.assertEqual(p.scheme, "http") 382 | self.assertEqual(p.netloc, "User:Pass@www.python.org:080") 383 | self.assertEqual(p.path, "/doc/") 384 | self.assertEqual(p.query, "query=yes") 385 | self.assertEqual(p.fragment, "frag") 386 | self.assertEqual(p.username, "User") 387 | self.assertEqual(p.password, "Pass") 388 | self.assertEqual(p.hostname, "www.python.org") 389 | self.assertEqual(p.port, 80) 390 | self.assertEqual(p.geturl(), url) 391 | 392 | # Addressing issue1698, which suggests Username can contain 393 | # "@" characters. Though not RFC compliant, many ftp sites allow 394 | # and request email addresses as usernames. 395 | 396 | url = "http://User@example.com:Pass@www.python.org:080/doc/?query=yes#frag" 397 | p = urlparse.urlsplit(url) 398 | self.assertEqual(p.scheme, "http") 399 | self.assertEqual(p.netloc, "User@example.com:Pass@www.python.org:080") 400 | self.assertEqual(p.path, "/doc/") 401 | self.assertEqual(p.query, "query=yes") 402 | self.assertEqual(p.fragment, "frag") 403 | self.assertEqual(p.username, "User@example.com") 404 | self.assertEqual(p.password, "Pass") 405 | self.assertEqual(p.hostname, "www.python.org") 406 | self.assertEqual(p.port, 80) 407 | self.assertEqual(p.geturl(), url) 408 | 409 | def test_attributes_bad_port(self): 410 | """Check handling of non-integer ports.""" 411 | p = urlparse.urlsplit("http://www.example.net:foo") 412 | self.assertEqual(p.netloc, "www.example.net:foo") 413 | self.assertRaises(ValueError, lambda: p.port) 414 | 415 | p = urlparse.urlparse("http://www.example.net:foo") 416 | self.assertEqual(p.netloc, "www.example.net:foo") 417 | self.assertRaises(ValueError, lambda: p.port) 418 | 419 | def test_attributes_without_netloc(self): 420 | # This example is straight from RFC 3261. It looks like it 421 | # should allow the username, hostname, and port to be filled 422 | # in, but doesn't. Since it's a URI and doesn't use the 423 | # scheme://netloc syntax, the netloc and related attributes 424 | # should be left empty. 
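# (Without '//' after the scheme, everything up to '?' or '#' is parsed as the path, so
# there is no netloc to split into username/password/hostname/port.)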
425 | uri = "sip:alice@atlanta.com;maddr=239.255.255.1;ttl=15" 426 | p = urlparse.urlsplit(uri) 427 | self.assertEqual(p.netloc, None) 428 | self.assertEqual(p.username, None) 429 | self.assertEqual(p.password, None) 430 | self.assertEqual(p.hostname, None) 431 | self.assertEqual(p.port, None) 432 | self.assertEqual(p.geturl(), uri) 433 | 434 | p = urlparse.urlparse(uri) 435 | self.assertEqual(p.netloc, None) 436 | self.assertEqual(p.username, None) 437 | self.assertEqual(p.password, None) 438 | self.assertEqual(p.hostname, None) 439 | self.assertEqual(p.port, None) 440 | self.assertEqual(p.geturl(), uri) 441 | 442 | def test_caching(self): 443 | # Test case for bug #1313119 444 | uri = "http://example.com/doc/" 445 | unicode_uri = str(uri) 446 | 447 | urlparse.urlparse(unicode_uri) 448 | p = urlparse.urlparse(uri) 449 | self.assertEqual(type(p.scheme), type(uri)) 450 | self.assertEqual(type(p.hostname), type(uri)) 451 | self.assertEqual(type(p.path), type(uri)) 452 | 453 | def test_noslash(self): 454 | # Issue 1637: http://foo.com?query is legal 455 | self.assertEqual(urlparse.urlparse("http://example.com?blahblah=/foo"), 456 | ('http', 'example.com', '', '', 'blahblah=/foo', '')) 457 | 458 | def test_anyscheme(self): 459 | # Issue 7904: s3://foo.com/stuff has netloc "foo.com". 460 | self.assertEqual(urlparse.urlparse("s3://foo.com/stuff"), 461 | ('s3', 'foo.com', '/stuff', '', '', '')) 462 | self.assertEqual(urlparse.urlparse("x-newscheme://foo.com/stuff"), 463 | ('x-newscheme', 'foo.com', '/stuff', '', '', '')) 464 | 465 | def test_split_relative_urls(self): 466 | self.assertEqual(urlparse.urlparse("x-newscheme:stuff"), 467 | ('x-newscheme', None, 'stuff', '', '', '')) 468 | self.assertEqual(urlparse.urlparse("x-newscheme:/stuff"), 469 | ('x-newscheme', None, '/stuff', '', '', '')) 470 | self.assertEqual(urlparse.urlparse("x-newscheme://stuff"), 471 | ('x-newscheme', 'stuff', '', '', '', '')) 472 | self.assertEqual(urlparse.urlparse("x-newscheme:///stuff"), 473 | ('x-newscheme', '', '/stuff', '', '', '')) 474 | 475 | def test_unsplit_relative_urls(self): 476 | self.assertEqual(urlparse.urlunparse(('x-newscheme', None, 'stuff', '', '', '')), 477 | "x-newscheme:stuff") 478 | self.assertEqual(urlparse.urlunparse(('x-newscheme', None, '/stuff', '', '', '')), 479 | "x-newscheme:/stuff") 480 | self.assertEqual(urlparse.urlunparse(('x-newscheme', 'stuff', '', '', '', '')), 481 | "x-newscheme://stuff") 482 | self.assertEqual(urlparse.urlunparse(('x-newscheme', '', '/stuff', '', '', '')), 483 | "x-newscheme:///stuff") 484 | 485 | # vim:et:sts=4:ts=4 486 | -------------------------------------------------------------------------------- /tests/doc_test.py: -------------------------------------------------------------------------------- 1 | def test_docs(): 2 | from doctest import testfile 3 | failures, _ = testfile('README.md', module_relative=False, encoding='UTF-8') 4 | assert not failures 5 | -------------------------------------------------------------------------------- /tests/encoding_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import yelp_uri.encoding as E 4 | from yelp_uri.urllib_utf8 import quote 5 | 6 | 7 | def test_uri_error(): 8 | # Exception handlers around recode catch UnicodeError 9 | assert issubclass(E.MalformedUrlError, UnicodeError), type.mro(E.MalformedUrlError) 10 | 11 | 12 | def test_bad_port(): 13 | try: 14 | E.encode_uri('http://foo.bar:buz') 15 | except E.MalformedUrlError as error: 16 | assert 
error.args == ("Invalid port number: invalid literal for int() with base 10: 'buz'",) 17 | 18 | 19 | def test_bad_domain_segment_too_long(): 20 | try: 21 | E.encode_uri('http://foo.%s.bar' % ('x' * 64)) 22 | except E.MalformedUrlError as error: 23 | error_msg = ( 24 | "Invalid hostname: encoding with 'IDNA' codec failed " 25 | "(UnicodeError: label empty or too long): " 26 | ) 27 | 28 | assert error.args == ( 29 | error_msg + 30 | repr("foo.xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx.bar"), 31 | ) 32 | 33 | 34 | def test_bad_domain_extra_dots(): 35 | # We normalize this one ala Chrome browser 36 | assert E.encode_uri('http://..foo..com../.bar.') == 'http://foo.com/.bar.' 37 | 38 | 39 | def test_recode_none_raises_attribute_error(): 40 | with pytest.raises(AttributeError): 41 | E.recode_uri(None) 42 | 43 | 44 | def test_unicode_url_gets_quoted(): 45 | url = 'http://www.yelp.com/münchen' 46 | assert E.recode_uri(url) == 'http://www.yelp.com/m%C3%BCnchen' 47 | 48 | 49 | def test_mixed_quoting_url(): 50 | """Test that a url with mixed quoting has uniform quoting after requoting""" 51 | url = 'http://www.yelp.com/m%C3%BCnchen/münchen' 52 | assert E.recode_uri(url) == 'http://www.yelp.com/m%C3%BCnchen/m%C3%BCnchen' 53 | 54 | 55 | def test_mixed_quoting_param(): 56 | """Tests that a url with mixed quoting in the parameters has uniform quoting after requoting""" 57 | url = 'http://www.yelp.com?m%C3%BCnchen=münchen' 58 | assert E.recode_uri(url) == 'http://www.yelp.com?m%C3%BCnchen=m%C3%BCnchen' 59 | 60 | 61 | def test_mixed_encoding(): 62 | """Tests that a url with mixed encoding has uniform encoding after recoding""" 63 | url = 'http://www.yelp.com/m%C3%BCnchen?m%FCnchen' 64 | assert E.recode_uri(url) == 'http://www.yelp.com/m%C3%BCnchen?m%C3%BCnchen' 65 | 66 | 67 | def test_mixed_quoting_multiple_queries(): 68 | """Tests that a url with mixed quoting in multiple parameters has uniform quoting after requoting""" 69 | url = 'http://yelp.com/münchen/m%C3%BCnchen?münchen=m%C3%BCnchen&htmlchars=<">' 70 | assert E.recode_uri(url) == \ 71 | 'http://yelp.com/m%C3%BCnchen/m%C3%BCnchen?m%C3%BCnchen=m%C3%BCnchen&htmlchars=%3C%22%3E' 72 | 73 | 74 | def test_utf8_url(): 75 | """Tests that a url with mixed quoting in multiple parameters has uniform quoting after requoting""" 76 | url = 'http://yelp.com/münchen/m%C3%BCnchen?münchen=m%C3%BCnchen&htmlchars=<">'.encode() 77 | assert E.recode_uri(url) == \ 78 | 'http://yelp.com/m%C3%BCnchen/m%C3%BCnchen?m%C3%BCnchen=m%C3%BCnchen&htmlchars=%3C%22%3E' 79 | 80 | 81 | def test_multiple_escapes(): 82 | url = 'http://münch.com?zero=münch&one=m%C3%BCnch&two=m%25C3%25BCnch&three=m%2525C3%2525BCnch' 83 | assert E.recode_uri(url) == \ 84 | 'http://xn--mnch-0ra.com?zero=m%C3%BCnch&one=m%C3%BCnch&two=m%25C3%25BCnch&three=m%2525C3%2525BCnch' 85 | 86 | 87 | def test_url_reserved_chars(): 88 | url = 'http://www.yelp.com?chars=%s' % quote(':/?&=') 89 | assert E.recode_uri(url) == url 90 | 91 | 92 | def test_multi_params_for_individual_path_segment(): 93 | # Nothing (overly) strange in this url: nothing should be escaped 94 | url = '/foo;bar;baz/barney;fred;wilma' 95 | assert E.recode_uri(url) == url 96 | 97 | 98 | def test_url_with_params(): 99 | url = ( 100 | 'http://ad.doubleclick.net/clk;217976351;41128009;f?' 101 | 'http%3A//www.24hourfitness.com/FindClubDetail.do?' 
102 | 'clubid=189&edit=null&semiPromoCode=null&cm_mmc=' 103 | 'Yelp-_-ClubPage-_-BusinessListing-_-Link' 104 | ) 105 | assert E.recode_uri(url) == url 106 | 107 | 108 | def test_url_with_hashbang(): 109 | # For a discussion of url hashbangs, see: http://www.jenitennison.com/blog/node/154 110 | url = 'https://twitter.com/#!/YelpCincy/statuses/179565284020060161' 111 | assert E.recode_uri(url) == url 112 | 113 | 114 | def test_url_with_colon(): 115 | # Ticket: 31242 116 | url = 'http://www.yelp.fr/biz/smalls-marseille#hrid:u_UQvMf97E8pD4HEb59uIw' 117 | assert E.recode_uri(url) == url 118 | 119 | 120 | def test_param_xss(): 121 | assert E.recode_uri('/foo;