├── .gitignore
├── AUTHORS.rst
├── LICENSE
├── MANIFEST.in
├── README.rst
├── pyap
    ├── __init__.py
    ├── address.py
    ├── api.py
    ├── exceptions.py
    ├── packages
    │   ├── __init__.py
    │   └── six.py
    ├── parser.py
    ├── source_CA
    │   ├── __init__.py
    │   └── data.py
    ├── source_GB
    │   ├── __init__.py
    │   └── data.py
    ├── source_US
    │   ├── __init__.py
    │   └── data.py
    └── utils.py
├── pyproject.toml
├── setup.py
├── test_parser.py
├── test_parser_ca.py
├── test_parser_gb.py
├── test_parser_us.py
└── tox.ini


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Compiled python modules.
 2 | *.pyc
 3 | 
 4 | # Setuptools distribution folder.
 5 | /dist/
 6 | /build/
 7 | 
 8 | # Python egg metadata, regenerated from source files by setuptools.
 9 | /*.egg-info
10 | 
11 | # Coverage reports
12 | /htmlcov
13 | /__pycache__
14 | .coverage
15 | 
16 | # Venvs
17 | /venv*
18 | 
19 | # Misc
20 | run.py
21 | todo.txt
22 | .DS_Store
23 | .tox
24 | files.txt
25 | .vscode


--------------------------------------------------------------------------------
/AUTHORS.rst:
--------------------------------------------------------------------------------
1 | Imapy is written by Vladimir Goncharov.
2 | 
3 | 
4 | Special thanks to
5 | ```````````````````````
6 | - `Thomas Bird <https://github.com/thomasbird>`_ (British parsing rules)
7 | - `Thomas Funk <https://github.com/tomfunk>`_ (thomasfunk10@gmail.com)
8 | - `KCzar <https://github.com/KCzar>`_
9 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Vladimir Goncharov
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.rst


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | Pyap: Python address parser
 2 | ===========================
 3 | 
 4 | 
 5 | Pyap is an MIT Licensed text processing library, written in Python, for
 6 | detecting and parsing addresses. Currently it supports US 🇺🇸, Canadian 🇨🇦 and British 🇬🇧 addresses. 
 7 | 
 8 | 
 9 | .. code-block:: python
10 | 
11 |     >>> import pyap
12 |     >>> test_address = """
13 |         Lorem ipsum
14 |         225 E. John Carpenter Freeway, 
15 |         Suite 1500 Irving, Texas 75062
16 |         Dorem sit amet
17 |         """
18 |     >>> addresses = pyap.parse(test_address, country='US')
19 |     >>> for address in addresses:
20 |             # shows found address
21 |             print(address)
22 |             # shows address parts
23 |             print(address.as_dict())
24 |     ...
25 | 
26 | 
27 | 
28 | 
29 | Installation
30 | ------------
31 | 
32 | To install Pyap, simply:
33 | 
34 | .. code-block:: bash
35 | 
36 |     $ pip install pyap
37 | 
38 | 
39 | 
40 | About
41 | -----
42 | This library has been created because i couldn't find any reliable and
43 | opensource solution for detecting addresses on web pages when writing my
44 | web crawler. Currently available solutions have drawbacks when it comes
45 | to using them to process really large amounts of data fast. You'll
46 | either have to buy some proprietary software; use third-party
47 | pay-per-use services or use address detecting which is slow and
48 | unsuitable for real-time processing.
49 | 
50 | Pyap is an alternative to all these methods. It is really fast because
51 | it is based on using regular expressions and it allows to find addresses
52 | in text in real time with low error rates.
53 | 
54 | 
55 | Future work
56 | -----------
57 | - Add rules for parsing FR addresses
58 | - ...
59 | 
60 | 
61 | Typical workflow
62 | ----------------
63 | Pyap should be used as a first thing when you need to detect an address
64 | inside a text when you don't know for sure whether the text contains
65 | addresses or not.
66 | 
67 | To achieve the most accuracy Pyap results could be reverified using
68 | geocoding process.
69 | 
70 | 
71 | Limitations
72 | -----------
73 | Because Pyap is based on regular expressions it provides fast results.
74 | This is also a limitation because regexps intentionally do not use too
75 | much context to detect an address.
76 | 
77 | In other words in order to detect US address, the library doesn't
78 | use any list of US cities or a list of typical street names. It
79 | looks for a pattern which is most likely to be an address.
80 | 
81 | For example the string below would be detected as a valid address: 
82 | "1 SPIRITUAL HEALER DR SHARIF NSAMBU SPECIALISING IN"
83 | 
84 | It happens because this string has all the components of a valid
85 | address: street number "1", street name "SPIRITUAL HEALER" followed
86 | by a street identifier "DR" (Drive), city "SHARIF NSAMBU SPECIALISING"
87 | and a state name abbreviation "IN" (Indiana).
88 | 
89 | The good news is that the above mentioned errors are **quite rare**.
90 | 
91 | 
92 | 


--------------------------------------------------------------------------------
/pyap/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | """
4 | API hooks
5 | """
6 | from .api import parse
7 | from .utils import (match, findall)
8 | 


--------------------------------------------------------------------------------
/pyap/address.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | """
 4 |     pyap.address
 5 |     ~~~~~~~~~~~~~~~~
 6 | 
 7 |     Contains class for constructing Address object which holds information
 8 |     about address and its components.
 9 | 
10 |     :copyright: (c) 2015 by Vladimir Goncharov.
11 |     :license: MIT, see LICENSE for more details.
12 | """
13 | 
14 | from .packages import six
15 | 
16 | 
17 | class Address(object):
18 | 
19 |     def __init__(self, **args):
20 |         keys = []
21 |         vals = []
22 |         for k, v in six.iteritems(args):
23 |             if v and isinstance(v, str):
24 |                 v = v.strip(' ,;:')
25 |             # create object variables
26 |             setattr(self, k, v)
27 |             # prepare for dict
28 |             keys.append(k)
29 |             vals.append(v)
30 |         self.data_as_dict = dict(zip(keys, vals))
31 | 
32 |     def as_dict(self):
33 |         # Return parsed address parts as a dictionary
34 |         return self.data_as_dict
35 | 
36 |     def __repr__(self):
37 |         # Address object is represented as textual address
38 |         address = ''
39 |         try:
40 |             address = self.full_address
41 |         except AttributeError:
42 |             pass
43 |         if six.PY2:
44 |             address = address.encode('utf-8')
45 |         return address
46 | 


--------------------------------------------------------------------------------
/pyap/api.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | """
 4 |     pyap.api
 5 |     ~~~~~~~~~~~~~~~~
 6 | 
 7 |     This module contains address parser API functions.
 8 | 
 9 |     :copyright: (c) 2015 by Vladimir Goncharov.
10 |     :license: MIT, see LICENSE for more details.
11 | """
12 | 
13 | from . import parser
14 | 
15 | 
16 | def parse(some_text, **kwargs):
17 |     """Creates request to AddressParser
18 |     and returns list of Address objects
19 |     """
20 |     ap = parser.AddressParser(**kwargs)
21 |     return ap.parse(some_text)
22 | 


--------------------------------------------------------------------------------
/pyap/exceptions.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | """
 4 |     pyap.exceptions
 5 |     ~~~~~~~~~~~~~~~~
 6 | 
 7 |     This module contains address parser exceptions.
 8 | 
 9 |     :copyright: (c) 2015 by Vladimir Goncharov.
10 |     :license: MIT, see LICENSE for more details.
11 | """
12 | 
13 | 
14 | class AddressParserException(Exception):
15 |     pass
16 | 
17 | 
18 | class NoCountrySelected(AddressParserException):
19 |     ''' No country selected during module initialization '''
20 |     def __init__(self, message, errors):
21 |         super(NoCountrySelected, self).__init__(message)
22 |         self.errors = errors
23 | 
24 | 
25 | class CountryDetectionMissing(AddressParserException):
26 |     ''' Country-specific address detection rules were not found '''
27 |     def __init__(self, message, errors):
28 |         super(CountryDetectionMissing, self).__init__(message)
29 |         self.errors = errors
30 | 


--------------------------------------------------------------------------------
/pyap/packages/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vladimarius/pyap/34e33739be46e084becae73980d945af068ae699/pyap/packages/__init__.py


--------------------------------------------------------------------------------
/pyap/packages/six.py:
--------------------------------------------------------------------------------
  1 | """Utilities for writing code that runs on Python 2 and 3"""
  2 | 
  3 | # Copyright (c) 2010-2015 Benjamin Peterson
  4 | #
  5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  6 | # of this software and associated documentation files (the "Software"), to deal
  7 | # in the Software without restriction, including without limitation the rights
  8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9 | # copies of the Software, and to permit persons to whom the Software is
 10 | # furnished to do so, subject to the following conditions:
 11 | #
 12 | # The above copyright notice and this permission notice shall be included in all
 13 | # copies or substantial portions of the Software.
 14 | #
 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 21 | # SOFTWARE.
 22 | 
 23 | from __future__ import absolute_import
 24 | 
 25 | import functools
 26 | import itertools
 27 | import operator
 28 | import sys
 29 | import types
 30 | 
 31 | __author__ = "Benjamin Peterson <benjamin@python.org>"
 32 | __version__ = "1.9.0"
 33 | 
 34 | 
 35 | # Useful for very coarse version differentiation.
 36 | PY2 = sys.version_info[0] == 2
 37 | PY3 = sys.version_info[0] == 3
 38 | PY34 = sys.version_info[0:2] >= (3, 4)
 39 | 
 40 | if PY3:
 41 |     string_types = str,
 42 |     integer_types = int,
 43 |     class_types = type,
 44 |     text_type = str
 45 |     binary_type = bytes
 46 | 
 47 |     MAXSIZE = sys.maxsize
 48 | else:
 49 |     string_types = basestring,
 50 |     integer_types = (int, long)
 51 |     class_types = (type, types.ClassType)
 52 |     text_type = unicode
 53 |     binary_type = str
 54 | 
 55 |     if sys.platform.startswith("java"):
 56 |         # Jython always uses 32 bits.
 57 |         MAXSIZE = int((1 << 31) - 1)
 58 |     else:
 59 |         # It's possible to have sizeof(long) != sizeof(Py_ssize_t).
 60 |         class X(object):
 61 | 
 62 |             def __len__(self):
 63 |                 return 1 << 31
 64 |         try:
 65 |             len(X())
 66 |         except OverflowError:
 67 |             # 32-bit
 68 |             MAXSIZE = int((1 << 31) - 1)
 69 |         else:
 70 |             # 64-bit
 71 |             MAXSIZE = int((1 << 63) - 1)
 72 |         del X
 73 | 
 74 | 
 75 | def _add_doc(func, doc):
 76 |     """Add documentation to a function."""
 77 |     func.__doc__ = doc
 78 | 
 79 | 
 80 | def _import_module(name):
 81 |     """Import module, returning the module after the last dot."""
 82 |     __import__(name)
 83 |     return sys.modules[name]
 84 | 
 85 | 
 86 | class _LazyDescr(object):
 87 | 
 88 |     def __init__(self, name):
 89 |         self.name = name
 90 | 
 91 |     def __get__(self, obj, tp):
 92 |         result = self._resolve()
 93 |         setattr(obj, self.name, result)  # Invokes __set__.
 94 |         try:
 95 |             # This is a bit ugly, but it avoids running this again by
 96 |             # removing this descriptor.
 97 |             delattr(obj.__class__, self.name)
 98 |         except AttributeError:
 99 |             pass
100 |         return result
101 | 
102 | 
103 | class MovedModule(_LazyDescr):
104 | 
105 |     def __init__(self, name, old, new=None):
106 |         super(MovedModule, self).__init__(name)
107 |         if PY3:
108 |             if new is None:
109 |                 new = name
110 |             self.mod = new
111 |         else:
112 |             self.mod = old
113 | 
114 |     def _resolve(self):
115 |         return _import_module(self.mod)
116 | 
117 |     def __getattr__(self, attr):
118 |         _module = self._resolve()
119 |         value = getattr(_module, attr)
120 |         setattr(self, attr, value)
121 |         return value
122 | 
123 | 
124 | class _LazyModule(types.ModuleType):
125 | 
126 |     def __init__(self, name):
127 |         super(_LazyModule, self).__init__(name)
128 |         self.__doc__ = self.__class__.__doc__
129 | 
130 |     def __dir__(self):
131 |         attrs = ["__doc__", "__name__"]
132 |         attrs += [attr.name for attr in self._moved_attributes]
133 |         return attrs
134 | 
135 |     # Subclasses should override this
136 |     _moved_attributes = []
137 | 
138 | 
139 | class MovedAttribute(_LazyDescr):
140 | 
141 |     def __init__(self, name, old_mod, new_mod, old_attr=None, new_attr=None):
142 |         super(MovedAttribute, self).__init__(name)
143 |         if PY3:
144 |             if new_mod is None:
145 |                 new_mod = name
146 |             self.mod = new_mod
147 |             if new_attr is None:
148 |                 if old_attr is None:
149 |                     new_attr = name
150 |                 else:
151 |                     new_attr = old_attr
152 |             self.attr = new_attr
153 |         else:
154 |             self.mod = old_mod
155 |             if old_attr is None:
156 |                 old_attr = name
157 |             self.attr = old_attr
158 | 
159 |     def _resolve(self):
160 |         module = _import_module(self.mod)
161 |         return getattr(module, self.attr)
162 | 
163 | 
164 | class _SixMetaPathImporter(object):
165 | 
166 |     """
167 |     A meta path importer to import six.moves and its submodules.
168 | 
169 |     This class implements a PEP302 finder and loader. It should be compatible
170 |     with Python 2.5 and all existing versions of Python3
171 |     """
172 | 
173 |     def __init__(self, six_module_name):
174 |         self.name = six_module_name
175 |         self.known_modules = {}
176 | 
177 |     def _add_module(self, mod, *fullnames):
178 |         for fullname in fullnames:
179 |             self.known_modules[self.name + "." + fullname] = mod
180 | 
181 |     def _get_module(self, fullname):
182 |         return self.known_modules[self.name + "." + fullname]
183 | 
184 |     def find_module(self, fullname, path=None):
185 |         if fullname in self.known_modules:
186 |             return self
187 |         return None
188 | 
189 |     def __get_module(self, fullname):
190 |         try:
191 |             return self.known_modules[fullname]
192 |         except KeyError:
193 |             raise ImportError("This loader does not know module " + fullname)
194 | 
195 |     def load_module(self, fullname):
196 |         try:
197 |             # in case of a reload
198 |             return sys.modules[fullname]
199 |         except KeyError:
200 |             pass
201 |         mod = self.__get_module(fullname)
202 |         if isinstance(mod, MovedModule):
203 |             mod = mod._resolve()
204 |         else:
205 |             mod.__loader__ = self
206 |         sys.modules[fullname] = mod
207 |         return mod
208 | 
209 |     def is_package(self, fullname):
210 |         """
211 |         Return true, if the named module is a package.
212 | 
213 |         We need this method to get correct spec objects with
214 |         Python 3.4 (see PEP451)
215 |         """
216 |         return hasattr(self.__get_module(fullname), "__path__")
217 | 
218 |     def get_code(self, fullname):
219 |         """Return None
220 | 
221 |         Required, if is_package is implemented"""
222 |         self.__get_module(fullname)  # eventually raises ImportError
223 |         return None
224 |     get_source = get_code  # same as get_code
225 | 
226 | _importer = _SixMetaPathImporter(__name__)
227 | 
228 | 
229 | class _MovedItems(_LazyModule):
230 | 
231 |     """Lazy loading of moved objects"""
232 |     __path__ = []  # mark as package
233 | 
234 | 
235 | _moved_attributes = [
236 |     MovedAttribute("cStringIO", "cStringIO", "io", "StringIO"),
237 |     MovedAttribute("filter", "itertools", "builtins", "ifilter", "filter"),
238 |     MovedAttribute("filterfalse", "itertools", "itertools", "ifilterfalse", "filterfalse"),
239 |     MovedAttribute("input", "__builtin__", "builtins", "raw_input", "input"),
240 |     MovedAttribute("intern", "__builtin__", "sys"),
241 |     MovedAttribute("map", "itertools", "builtins", "imap", "map"),
242 |     MovedAttribute("getcwd", "os", "os", "getcwdu", "getcwd"),
243 |     MovedAttribute("getcwdb", "os", "os", "getcwd", "getcwdb"),
244 |     MovedAttribute("range", "__builtin__", "builtins", "xrange", "range"),
245 |     MovedAttribute("reload_module", "__builtin__", "importlib" if PY34 else "imp", "reload"),
246 |     MovedAttribute("reduce", "__builtin__", "functools"),
247 |     MovedAttribute("shlex_quote", "pipes", "shlex", "quote"),
248 |     MovedAttribute("StringIO", "StringIO", "io"),
249 |     MovedAttribute("UserDict", "UserDict", "collections"),
250 |     MovedAttribute("UserList", "UserList", "collections"),
251 |     MovedAttribute("UserString", "UserString", "collections"),
252 |     MovedAttribute("xrange", "__builtin__", "builtins", "xrange", "range"),
253 |     MovedAttribute("zip", "itertools", "builtins", "izip", "zip"),
254 |     MovedAttribute("zip_longest", "itertools", "itertools", "izip_longest", "zip_longest"),
255 |     MovedModule("builtins", "__builtin__"),
256 |     MovedModule("configparser", "ConfigParser"),
257 |     MovedModule("copyreg", "copy_reg"),
258 |     MovedModule("dbm_gnu", "gdbm", "dbm.gnu"),
259 |     MovedModule("_dummy_thread", "dummy_thread", "_dummy_thread"),
260 |     MovedModule("http_cookiejar", "cookielib", "http.cookiejar"),
261 |     MovedModule("http_cookies", "Cookie", "http.cookies"),
262 |     MovedModule("html_entities", "htmlentitydefs", "html.entities"),
263 |     MovedModule("html_parser", "HTMLParser", "html.parser"),
264 |     MovedModule("http_client", "httplib", "http.client"),
265 |     MovedModule("email_mime_multipart", "email.MIMEMultipart", "email.mime.multipart"),
266 |     MovedModule("email_mime_nonmultipart", "email.MIMENonMultipart", "email.mime.nonmultipart"),
267 |     MovedModule("email_mime_text", "email.MIMEText", "email.mime.text"),
268 |     MovedModule("email_mime_base", "email.MIMEBase", "email.mime.base"),
269 |     MovedModule("BaseHTTPServer", "BaseHTTPServer", "http.server"),
270 |     MovedModule("CGIHTTPServer", "CGIHTTPServer", "http.server"),
271 |     MovedModule("SimpleHTTPServer", "SimpleHTTPServer", "http.server"),
272 |     MovedModule("cPickle", "cPickle", "pickle"),
273 |     MovedModule("queue", "Queue"),
274 |     MovedModule("reprlib", "repr"),
275 |     MovedModule("socketserver", "SocketServer"),
276 |     MovedModule("_thread", "thread", "_thread"),
277 |     MovedModule("tkinter", "Tkinter"),
278 |     MovedModule("tkinter_dialog", "Dialog", "tkinter.dialog"),
279 |     MovedModule("tkinter_filedialog", "FileDialog", "tkinter.filedialog"),
280 |     MovedModule("tkinter_scrolledtext", "ScrolledText", "tkinter.scrolledtext"),
281 |     MovedModule("tkinter_simpledialog", "SimpleDialog", "tkinter.simpledialog"),
282 |     MovedModule("tkinter_tix", "Tix", "tkinter.tix"),
283 |     MovedModule("tkinter_ttk", "ttk", "tkinter.ttk"),
284 |     MovedModule("tkinter_constants", "Tkconstants", "tkinter.constants"),
285 |     MovedModule("tkinter_dnd", "Tkdnd", "tkinter.dnd"),
286 |     MovedModule("tkinter_colorchooser", "tkColorChooser",
287 |                 "tkinter.colorchooser"),
288 |     MovedModule("tkinter_commondialog", "tkCommonDialog",
289 |                 "tkinter.commondialog"),
290 |     MovedModule("tkinter_tkfiledialog", "tkFileDialog", "tkinter.filedialog"),
291 |     MovedModule("tkinter_font", "tkFont", "tkinter.font"),
292 |     MovedModule("tkinter_messagebox", "tkMessageBox", "tkinter.messagebox"),
293 |     MovedModule("tkinter_tksimpledialog", "tkSimpleDialog",
294 |                 "tkinter.simpledialog"),
295 |     MovedModule("urllib_parse", __name__ + ".moves.urllib_parse", "urllib.parse"),
296 |     MovedModule("urllib_error", __name__ + ".moves.urllib_error", "urllib.error"),
297 |     MovedModule("urllib", __name__ + ".moves.urllib", __name__ + ".moves.urllib"),
298 |     MovedModule("urllib_robotparser", "robotparser", "urllib.robotparser"),
299 |     MovedModule("xmlrpc_client", "xmlrpclib", "xmlrpc.client"),
300 |     MovedModule("xmlrpc_server", "SimpleXMLRPCServer", "xmlrpc.server"),
301 | ]
302 | # Add windows specific modules.
303 | if sys.platform == "win32":
304 |     _moved_attributes += [
305 |         MovedModule("winreg", "_winreg"),
306 |     ]
307 | 
308 | for attr in _moved_attributes:
309 |     setattr(_MovedItems, attr.name, attr)
310 |     if isinstance(attr, MovedModule):
311 |         _importer._add_module(attr, "moves." + attr.name)
312 | del attr
313 | 
314 | _MovedItems._moved_attributes = _moved_attributes
315 | 
316 | moves = _MovedItems(__name__ + ".moves")
317 | _importer._add_module(moves, "moves")
318 | 
319 | 
320 | class Module_six_moves_urllib_parse(_LazyModule):
321 | 
322 |     """Lazy loading of moved objects in six.moves.urllib_parse"""
323 | 
324 | 
325 | _urllib_parse_moved_attributes = [
326 |     MovedAttribute("ParseResult", "urlparse", "urllib.parse"),
327 |     MovedAttribute("SplitResult", "urlparse", "urllib.parse"),
328 |     MovedAttribute("parse_qs", "urlparse", "urllib.parse"),
329 |     MovedAttribute("parse_qsl", "urlparse", "urllib.parse"),
330 |     MovedAttribute("urldefrag", "urlparse", "urllib.parse"),
331 |     MovedAttribute("urljoin", "urlparse", "urllib.parse"),
332 |     MovedAttribute("urlparse", "urlparse", "urllib.parse"),
333 |     MovedAttribute("urlsplit", "urlparse", "urllib.parse"),
334 |     MovedAttribute("urlunparse", "urlparse", "urllib.parse"),
335 |     MovedAttribute("urlunsplit", "urlparse", "urllib.parse"),
336 |     MovedAttribute("quote", "urllib", "urllib.parse"),
337 |     MovedAttribute("quote_plus", "urllib", "urllib.parse"),
338 |     MovedAttribute("unquote", "urllib", "urllib.parse"),
339 |     MovedAttribute("unquote_plus", "urllib", "urllib.parse"),
340 |     MovedAttribute("urlencode", "urllib", "urllib.parse"),
341 |     MovedAttribute("splitquery", "urllib", "urllib.parse"),
342 |     MovedAttribute("splittag", "urllib", "urllib.parse"),
343 |     MovedAttribute("splituser", "urllib", "urllib.parse"),
344 |     MovedAttribute("uses_fragment", "urlparse", "urllib.parse"),
345 |     MovedAttribute("uses_netloc", "urlparse", "urllib.parse"),
346 |     MovedAttribute("uses_params", "urlparse", "urllib.parse"),
347 |     MovedAttribute("uses_query", "urlparse", "urllib.parse"),
348 |     MovedAttribute("uses_relative", "urlparse", "urllib.parse"),
349 | ]
350 | for attr in _urllib_parse_moved_attributes:
351 |     setattr(Module_six_moves_urllib_parse, attr.name, attr)
352 | del attr
353 | 
354 | Module_six_moves_urllib_parse._moved_attributes = _urllib_parse_moved_attributes
355 | 
356 | _importer._add_module(Module_six_moves_urllib_parse(__name__ + ".moves.urllib_parse"),
357 |                       "moves.urllib_parse", "moves.urllib.parse")
358 | 
359 | 
360 | class Module_six_moves_urllib_error(_LazyModule):
361 | 
362 |     """Lazy loading of moved objects in six.moves.urllib_error"""
363 | 
364 | 
365 | _urllib_error_moved_attributes = [
366 |     MovedAttribute("URLError", "urllib2", "urllib.error"),
367 |     MovedAttribute("HTTPError", "urllib2", "urllib.error"),
368 |     MovedAttribute("ContentTooShortError", "urllib", "urllib.error"),
369 | ]
370 | for attr in _urllib_error_moved_attributes:
371 |     setattr(Module_six_moves_urllib_error, attr.name, attr)
372 | del attr
373 | 
374 | Module_six_moves_urllib_error._moved_attributes = _urllib_error_moved_attributes
375 | 
376 | _importer._add_module(Module_six_moves_urllib_error(__name__ + ".moves.urllib.error"),
377 |                       "moves.urllib_error", "moves.urllib.error")
378 | 
379 | 
380 | class Module_six_moves_urllib_request(_LazyModule):
381 | 
382 |     """Lazy loading of moved objects in six.moves.urllib_request"""
383 | 
384 | 
385 | _urllib_request_moved_attributes = [
386 |     MovedAttribute("urlopen", "urllib2", "urllib.request"),
387 |     MovedAttribute("install_opener", "urllib2", "urllib.request"),
388 |     MovedAttribute("build_opener", "urllib2", "urllib.request"),
389 |     MovedAttribute("pathname2url", "urllib", "urllib.request"),
390 |     MovedAttribute("url2pathname", "urllib", "urllib.request"),
391 |     MovedAttribute("getproxies", "urllib", "urllib.request"),
392 |     MovedAttribute("Request", "urllib2", "urllib.request"),
393 |     MovedAttribute("OpenerDirector", "urllib2", "urllib.request"),
394 |     MovedAttribute("HTTPDefaultErrorHandler", "urllib2", "urllib.request"),
395 |     MovedAttribute("HTTPRedirectHandler", "urllib2", "urllib.request"),
396 |     MovedAttribute("HTTPCookieProcessor", "urllib2", "urllib.request"),
397 |     MovedAttribute("ProxyHandler", "urllib2", "urllib.request"),
398 |     MovedAttribute("BaseHandler", "urllib2", "urllib.request"),
399 |     MovedAttribute("HTTPPasswordMgr", "urllib2", "urllib.request"),
400 |     MovedAttribute("HTTPPasswordMgrWithDefaultRealm", "urllib2", "urllib.request"),
401 |     MovedAttribute("AbstractBasicAuthHandler", "urllib2", "urllib.request"),
402 |     MovedAttribute("HTTPBasicAuthHandler", "urllib2", "urllib.request"),
403 |     MovedAttribute("ProxyBasicAuthHandler", "urllib2", "urllib.request"),
404 |     MovedAttribute("AbstractDigestAuthHandler", "urllib2", "urllib.request"),
405 |     MovedAttribute("HTTPDigestAuthHandler", "urllib2", "urllib.request"),
406 |     MovedAttribute("ProxyDigestAuthHandler", "urllib2", "urllib.request"),
407 |     MovedAttribute("HTTPHandler", "urllib2", "urllib.request"),
408 |     MovedAttribute("HTTPSHandler", "urllib2", "urllib.request"),
409 |     MovedAttribute("FileHandler", "urllib2", "urllib.request"),
410 |     MovedAttribute("FTPHandler", "urllib2", "urllib.request"),
411 |     MovedAttribute("CacheFTPHandler", "urllib2", "urllib.request"),
412 |     MovedAttribute("UnknownHandler", "urllib2", "urllib.request"),
413 |     MovedAttribute("HTTPErrorProcessor", "urllib2", "urllib.request"),
414 |     MovedAttribute("urlretrieve", "urllib", "urllib.request"),
415 |     MovedAttribute("urlcleanup", "urllib", "urllib.request"),
416 |     MovedAttribute("URLopener", "urllib", "urllib.request"),
417 |     MovedAttribute("FancyURLopener", "urllib", "urllib.request"),
418 |     MovedAttribute("proxy_bypass", "urllib", "urllib.request"),
419 | ]
420 | for attr in _urllib_request_moved_attributes:
421 |     setattr(Module_six_moves_urllib_request, attr.name, attr)
422 | del attr
423 | 
424 | Module_six_moves_urllib_request._moved_attributes = _urllib_request_moved_attributes
425 | 
426 | _importer._add_module(Module_six_moves_urllib_request(__name__ + ".moves.urllib.request"),
427 |                       "moves.urllib_request", "moves.urllib.request")
428 | 
429 | 
430 | class Module_six_moves_urllib_response(_LazyModule):
431 | 
432 |     """Lazy loading of moved objects in six.moves.urllib_response"""
433 | 
434 | 
435 | _urllib_response_moved_attributes = [
436 |     MovedAttribute("addbase", "urllib", "urllib.response"),
437 |     MovedAttribute("addclosehook", "urllib", "urllib.response"),
438 |     MovedAttribute("addinfo", "urllib", "urllib.response"),
439 |     MovedAttribute("addinfourl", "urllib", "urllib.response"),
440 | ]
441 | for attr in _urllib_response_moved_attributes:
442 |     setattr(Module_six_moves_urllib_response, attr.name, attr)
443 | del attr
444 | 
445 | Module_six_moves_urllib_response._moved_attributes = _urllib_response_moved_attributes
446 | 
447 | _importer._add_module(Module_six_moves_urllib_response(__name__ + ".moves.urllib.response"),
448 |                       "moves.urllib_response", "moves.urllib.response")
449 | 
450 | 
451 | class Module_six_moves_urllib_robotparser(_LazyModule):
452 | 
453 |     """Lazy loading of moved objects in six.moves.urllib_robotparser"""
454 | 
455 | 
456 | _urllib_robotparser_moved_attributes = [
457 |     MovedAttribute("RobotFileParser", "robotparser", "urllib.robotparser"),
458 | ]
459 | for attr in _urllib_robotparser_moved_attributes:
460 |     setattr(Module_six_moves_urllib_robotparser, attr.name, attr)
461 | del attr
462 | 
463 | Module_six_moves_urllib_robotparser._moved_attributes = _urllib_robotparser_moved_attributes
464 | 
465 | _importer._add_module(Module_six_moves_urllib_robotparser(__name__ + ".moves.urllib.robotparser"),
466 |                       "moves.urllib_robotparser", "moves.urllib.robotparser")
467 | 
468 | 
469 | class Module_six_moves_urllib(types.ModuleType):
470 | 
471 |     """Create a six.moves.urllib namespace that resembles the Python 3 namespace"""
472 |     __path__ = []  # mark as package
473 |     parse = _importer._get_module("moves.urllib_parse")
474 |     error = _importer._get_module("moves.urllib_error")
475 |     request = _importer._get_module("moves.urllib_request")
476 |     response = _importer._get_module("moves.urllib_response")
477 |     robotparser = _importer._get_module("moves.urllib_robotparser")
478 | 
479 |     def __dir__(self):
480 |         return ['parse', 'error', 'request', 'response', 'robotparser']
481 | 
482 | _importer._add_module(Module_six_moves_urllib(__name__ + ".moves.urllib"),
483 |                       "moves.urllib")
484 | 
485 | 
486 | def add_move(move):
487 |     """Add an item to six.moves."""
488 |     setattr(_MovedItems, move.name, move)
489 | 
490 | 
491 | def remove_move(name):
492 |     """Remove item from six.moves."""
493 |     try:
494 |         delattr(_MovedItems, name)
495 |     except AttributeError:
496 |         try:
497 |             del moves.__dict__[name]
498 |         except KeyError:
499 |             raise AttributeError("no such move, %r" % (name,))
500 | 
501 | 
502 | if PY3:
503 |     _meth_func = "__func__"
504 |     _meth_self = "__self__"
505 | 
506 |     _func_closure = "__closure__"
507 |     _func_code = "__code__"
508 |     _func_defaults = "__defaults__"
509 |     _func_globals = "__globals__"
510 | else:
511 |     _meth_func = "im_func"
512 |     _meth_self = "im_self"
513 | 
514 |     _func_closure = "func_closure"
515 |     _func_code = "func_code"
516 |     _func_defaults = "func_defaults"
517 |     _func_globals = "func_globals"
518 | 
519 | 
520 | try:
521 |     advance_iterator = next
522 | except NameError:
523 |     def advance_iterator(it):
524 |         return it.next()
525 | next = advance_iterator
526 | 
527 | 
528 | try:
529 |     callable = callable
530 | except NameError:
531 |     def callable(obj):
532 |         return any("__call__" in klass.__dict__ for klass in type(obj).__mro__)
533 | 
534 | 
535 | if PY3:
536 |     def get_unbound_function(unbound):
537 |         return unbound
538 | 
539 |     create_bound_method = types.MethodType
540 | 
541 |     def create_unbound_method(func, cls):
542 |         return func
543 | 
544 |     Iterator = object
545 | else:
546 |     def get_unbound_function(unbound):
547 |         return unbound.im_func
548 | 
549 |     def create_bound_method(func, obj):
550 |         return types.MethodType(func, obj, obj.__class__)
551 | 
552 |     def create_unbound_method(func, cls):
553 |         return types.MethodType(func, None, cls)
554 | 
555 |     class Iterator(object):
556 | 
557 |         def next(self):
558 |             return type(self).__next__(self)
559 | 
560 |     callable = callable
561 | _add_doc(get_unbound_function,
562 |          """Get the function out of a possibly unbound function""")
563 | 
564 | 
565 | get_method_function = operator.attrgetter(_meth_func)
566 | get_method_self = operator.attrgetter(_meth_self)
567 | get_function_closure = operator.attrgetter(_func_closure)
568 | get_function_code = operator.attrgetter(_func_code)
569 | get_function_defaults = operator.attrgetter(_func_defaults)
570 | get_function_globals = operator.attrgetter(_func_globals)
571 | 
572 | 
573 | if PY3:
574 |     def iterkeys(d, **kw):
575 |         return iter(d.keys(**kw))
576 | 
577 |     def itervalues(d, **kw):
578 |         return iter(d.values(**kw))
579 | 
580 |     def iteritems(d, **kw):
581 |         return iter(d.items(**kw))
582 | 
583 |     def iterlists(d, **kw):
584 |         return iter(d.lists(**kw))
585 | 
586 |     viewkeys = operator.methodcaller("keys")
587 | 
588 |     viewvalues = operator.methodcaller("values")
589 | 
590 |     viewitems = operator.methodcaller("items")
591 | else:
592 |     def iterkeys(d, **kw):
593 |         return d.iterkeys(**kw)
594 | 
595 |     def itervalues(d, **kw):
596 |         return d.itervalues(**kw)
597 | 
598 |     def iteritems(d, **kw):
599 |         return d.iteritems(**kw)
600 | 
601 |     def iterlists(d, **kw):
602 |         return d.iterlists(**kw)
603 | 
604 |     viewkeys = operator.methodcaller("viewkeys")
605 | 
606 |     viewvalues = operator.methodcaller("viewvalues")
607 | 
608 |     viewitems = operator.methodcaller("viewitems")
609 | 
610 | _add_doc(iterkeys, "Return an iterator over the keys of a dictionary.")
611 | _add_doc(itervalues, "Return an iterator over the values of a dictionary.")
612 | _add_doc(iteritems,
613 |          "Return an iterator over the (key, value) pairs of a dictionary.")
614 | _add_doc(iterlists,
615 |          "Return an iterator over the (key, [values]) pairs of a dictionary.")
616 | 
617 | 
618 | if PY3:
619 |     def b(s):
620 |         return s.encode("latin-1")
621 | 
622 |     def u(s):
623 |         return s
624 |     unichr = chr
625 |     import struct
626 |     int2byte = struct.Struct(">B").pack
627 |     del struct
628 |     byte2int = operator.itemgetter(0)
629 |     indexbytes = operator.getitem
630 |     iterbytes = iter
631 |     import io
632 |     StringIO = io.StringIO
633 |     BytesIO = io.BytesIO
634 |     _assertCountEqual = "assertCountEqual"
635 |     if sys.version_info[1] <= 1:
636 |         _assertRaisesRegex = "assertRaisesRegexp"
637 |         _assertRegex = "assertRegexpMatches"
638 |     else:
639 |         _assertRaisesRegex = "assertRaisesRegex"
640 |         _assertRegex = "assertRegex"
641 | else:
642 |     def b(s):
643 |         return s
644 |     # Workaround for standalone backslash
645 | 
646 |     def u(s):
647 |         return unicode(s.replace(r'\\', r'\\\\'), "unicode_escape")
648 |     unichr = unichr
649 |     int2byte = chr
650 | 
651 |     def byte2int(bs):
652 |         return ord(bs[0])
653 | 
654 |     def indexbytes(buf, i):
655 |         return ord(buf[i])
656 |     iterbytes = functools.partial(itertools.imap, ord)
657 |     import StringIO
658 |     StringIO = BytesIO = StringIO.StringIO
659 |     _assertCountEqual = "assertItemsEqual"
660 |     _assertRaisesRegex = "assertRaisesRegexp"
661 |     _assertRegex = "assertRegexpMatches"
662 | _add_doc(b, """Byte literal""")
663 | _add_doc(u, """Text literal""")
664 | 
665 | 
666 | def assertCountEqual(self, *args, **kwargs):
667 |     return getattr(self, _assertCountEqual)(*args, **kwargs)
668 | 
669 | 
670 | def assertRaisesRegex(self, *args, **kwargs):
671 |     return getattr(self, _assertRaisesRegex)(*args, **kwargs)
672 | 
673 | 
674 | def assertRegex(self, *args, **kwargs):
675 |     return getattr(self, _assertRegex)(*args, **kwargs)
676 | 
677 | 
678 | if PY3:
679 |     exec_ = getattr(moves.builtins, "exec")
680 | 
681 |     def reraise(tp, value, tb=None):
682 |         if value is None:
683 |             value = tp()
684 |         if value.__traceback__ is not tb:
685 |             raise value.with_traceback(tb)
686 |         raise value
687 | 
688 | else:
689 |     def exec_(_code_, _globs_=None, _locs_=None):
690 |         """Execute code in a namespace."""
691 |         if _globs_ is None:
692 |             frame = sys._getframe(1)
693 |             _globs_ = frame.f_globals
694 |             if _locs_ is None:
695 |                 _locs_ = frame.f_locals
696 |             del frame
697 |         elif _locs_ is None:
698 |             _locs_ = _globs_
699 |         exec("""exec _code_ in _globs_, _locs_""")
700 | 
701 |     exec_("""def reraise(tp, value, tb=None):
702 |     raise tp, value, tb
703 | """)
704 | 
705 | 
706 | if sys.version_info[:2] == (3, 2):
707 |     exec_("""def raise_from(value, from_value):
708 |     if from_value is None:
709 |         raise value
710 |     raise value from from_value
711 | """)
712 | elif sys.version_info[:2] > (3, 2):
713 |     exec_("""def raise_from(value, from_value):
714 |     raise value from from_value
715 | """)
716 | else:
717 |     def raise_from(value, from_value):
718 |         raise value
719 | 
720 | 
721 | print_ = getattr(moves.builtins, "print", None)
722 | if print_ is None:
723 |     def print_(*args, **kwargs):
724 |         """The new-style print function for Python 2.4 and 2.5."""
725 |         fp = kwargs.pop("file", sys.stdout)
726 |         if fp is None:
727 |             return
728 | 
729 |         def write(data):
730 |             if not isinstance(data, basestring):
731 |                 data = str(data)
732 |             # If the file has an encoding, encode unicode with it.
733 |             if (isinstance(fp, file) and
734 |                     isinstance(data, unicode) and
735 |                     fp.encoding is not None):
736 |                 errors = getattr(fp, "errors", None)
737 |                 if errors is None:
738 |                     errors = "strict"
739 |                 data = data.encode(fp.encoding, errors)
740 |             fp.write(data)
741 |         want_unicode = False
742 |         sep = kwargs.pop("sep", None)
743 |         if sep is not None:
744 |             if isinstance(sep, unicode):
745 |                 want_unicode = True
746 |             elif not isinstance(sep, str):
747 |                 raise TypeError("sep must be None or a string")
748 |         end = kwargs.pop("end", None)
749 |         if end is not None:
750 |             if isinstance(end, unicode):
751 |                 want_unicode = True
752 |             elif not isinstance(end, str):
753 |                 raise TypeError("end must be None or a string")
754 |         if kwargs:
755 |             raise TypeError("invalid keyword arguments to print()")
756 |         if not want_unicode:
757 |             for arg in args:
758 |                 if isinstance(arg, unicode):
759 |                     want_unicode = True
760 |                     break
761 |         if want_unicode:
762 |             newline = unicode("\n")
763 |             space = unicode(" ")
764 |         else:
765 |             newline = "\n"
766 |             space = " "
767 |         if sep is None:
768 |             sep = space
769 |         if end is None:
770 |             end = newline
771 |         for i, arg in enumerate(args):
772 |             if i:
773 |                 write(sep)
774 |             write(arg)
775 |         write(end)
776 | if sys.version_info[:2] < (3, 3):
777 |     _print = print_
778 | 
779 |     def print_(*args, **kwargs):
780 |         fp = kwargs.get("file", sys.stdout)
781 |         flush = kwargs.pop("flush", False)
782 |         _print(*args, **kwargs)
783 |         if flush and fp is not None:
784 |             fp.flush()
785 | 
786 | _add_doc(reraise, """Reraise an exception.""")
787 | 
788 | if sys.version_info[0:2] < (3, 4):
789 |     def wraps(wrapped, assigned=functools.WRAPPER_ASSIGNMENTS,
790 |               updated=functools.WRAPPER_UPDATES):
791 |         def wrapper(f):
792 |             f = functools.wraps(wrapped, assigned, updated)(f)
793 |             f.__wrapped__ = wrapped
794 |             return f
795 |         return wrapper
796 | else:
797 |     wraps = functools.wraps
798 | 
799 | 
800 | def with_metaclass(meta, *bases):
801 |     """Create a base class with a metaclass."""
802 |     # This requires a bit of explanation: the basic idea is to make a dummy
803 |     # metaclass for one level of class instantiation that replaces itself with
804 |     # the actual metaclass.
805 |     class metaclass(meta):
806 | 
807 |         def __new__(cls, name, this_bases, d):
808 |             return meta(name, bases, d)
809 |     return type.__new__(metaclass, 'temporary_class', (), {})
810 | 
811 | 
812 | def add_metaclass(metaclass):
813 |     """Class decorator for creating a class with a metaclass."""
814 |     def wrapper(cls):
815 |         orig_vars = cls.__dict__.copy()
816 |         slots = orig_vars.get('__slots__')
817 |         if slots is not None:
818 |             if isinstance(slots, str):
819 |                 slots = [slots]
820 |             for slots_var in slots:
821 |                 orig_vars.pop(slots_var)
822 |         orig_vars.pop('__dict__', None)
823 |         orig_vars.pop('__weakref__', None)
824 |         return metaclass(cls.__name__, cls.__bases__, orig_vars)
825 |     return wrapper
826 | 
827 | 
828 | def python_2_unicode_compatible(klass):
829 |     """
830 |     A decorator that defines __unicode__ and __str__ methods under Python 2.
831 |     Under Python 3 it does nothing.
832 | 
833 |     To support Python 2 and 3 with a single code base, define a __str__ method
834 |     returning text and apply this decorator to the class.
835 |     """
836 |     if PY2:
837 |         if '__str__' not in klass.__dict__:
838 |             raise ValueError("@python_2_unicode_compatible cannot be applied "
839 |                              "to %s because it doesn't define __str__()." %
840 |                              klass.__name__)
841 |         klass.__unicode__ = klass.__str__
842 |         klass.__str__ = lambda self: self.__unicode__().encode('utf-8')
843 |     return klass
844 | 
845 | 
846 | # Complete the moves implementation.
847 | # This code is at the end of this module to speed up module loading.
848 | # Turn this module into a package.
849 | __path__ = []  # required for PEP 302 and PEP 451
850 | __package__ = __name__  # see PEP 366 @ReservedAssignment
851 | if globals().get("__spec__") is not None:
852 |     __spec__.submodule_search_locations = []  # PEP 451 @UndefinedVariable
853 | # Remove other six meta path importers, since they cause problems. This can
854 | # happen if six is removed from sys.modules and then reloaded. (Setuptools does
855 | # this for some reason.)
856 | if sys.meta_path:
857 |     for i, importer in enumerate(sys.meta_path):
858 |         # Here's some real nastiness: Another "instance" of the six module might
859 |         # be floating around. Therefore, we can't use isinstance() to check for
860 |         # the six meta path importer, since the other six instance will have
861 |         # inserted an importer with different class.
862 |         if (type(importer).__name__ == "_SixMetaPathImporter" and
863 |                 importer.name == __name__):
864 |             del sys.meta_path[i]
865 |             break
866 |     del i, importer
867 | # Finally, add the importer to the meta path import hook.
868 | sys.meta_path.append(_importer)
869 | 


--------------------------------------------------------------------------------
/pyap/parser.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | """
  4 |     pyap.parser
  5 |     ~~~~~~~~~~~~~~~~
  6 | 
  7 |     This module contains AddressParser class which connects all the
  8 |     functionality of the package in one place.
  9 | 
 10 |     :copyright: (c) 2015 by Vladimir Goncharov.
 11 |     :license: MIT, see LICENSE for more details.
 12 | """
 13 | 
 14 | import re
 15 | import importlib
 16 | 
 17 | from . import exceptions as e
 18 | from . import address
 19 | from . import utils
 20 | from .packages import six
 21 | 
 22 | 
 23 | class AddressParser:
 24 | 
 25 |     def __init__(self, **args):
 26 |         '''Initialize with custom arguments'''
 27 |         for k, v in six.iteritems(args):
 28 |             # store country id in uppercase
 29 |             if k == 'country':
 30 |                 v = v.upper()
 31 |             setattr(self, k, v)
 32 |         try:
 33 |             # import detection rules
 34 |             package = 'pyap' + '.source_' + self.country + \
 35 |                 '.data'
 36 |             data = importlib.import_module(package)
 37 |             self.rules = data.full_address
 38 | 
 39 |         except AttributeError:
 40 |             raise e.NoCountrySelected(
 41 |                 'No country specified during library initialization.',
 42 |                 'Error 1')
 43 | 
 44 |         except ImportError:
 45 |             raise e.CountryDetectionMissing(
 46 |                 'Detection rules for country "{country}" not found.'.
 47 |                 format(country=self.country), 'Error 2'
 48 |             )
 49 | 
 50 |     def parse(self, text):
 51 |         '''Returns a list of addresses found in text
 52 |         together with parsed address parts
 53 |         '''
 54 |         results = []
 55 |         if isinstance(text, str):
 56 |             if six.PY2:
 57 |                 text = unicode(text, 'utf-8')
 58 |         self.clean_text = self._normalize_string(text)
 59 | 
 60 |         # get addresses
 61 |         address_matches = utils.finditer(self.rules, self.clean_text)
 62 |         if address_matches:
 63 |             # append parsed address info
 64 |             results = list(map(self._parse_address, address_matches))
 65 | 
 66 |         return results
 67 | 
 68 |     def _parse_address(self, match):
 69 |         '''Parses address into parts'''
 70 |         if isinstance(match, str):
 71 |             # If the address is passed as a match it saves foing the match twice
 72 |             match = utils.match(self.rules, match, flags=re.VERBOSE | re.U)
 73 |         if match:
 74 |             match_as_dict = match.groupdict()
 75 |             match_as_dict.update({'country_id': self.country})
 76 |             # combine results
 77 |             cleaned_dict = self._combine_results(match_as_dict)
 78 |             cleaned_dict['match_start'] = match.start()
 79 |             cleaned_dict['match_end'] = match.end()
 80 |             # create object containing results
 81 |             return address.Address(**cleaned_dict)
 82 | 
 83 |         return False
 84 | 
 85 |     @staticmethod
 86 |     def _combine_results(match_as_dict):
 87 |             '''Combine results from different parsed parts:
 88 |             we look for non-empty results in values like
 89 |             'postal_code_b' or 'postal_code_c' and store
 90 |             them as main value.
 91 | 
 92 |             So 'postal_code_b':'123456'
 93 |                 becomes:
 94 |                'postal_code'  :'123456'
 95 |             '''
 96 |             keys = []
 97 |             vals = []
 98 |             for k, v in six.iteritems(match_as_dict):
 99 |                 if k[-2:] in '_a_b_c_d_e_f_g_h_i_j_k_l_m':
100 |                     if v:
101 |                         # strip last 2 chars: '..._b' -> '...'
102 |                         keys.append(k[:-2])
103 |                         vals.append(v)
104 |                 else:
105 |                     if k not in keys:
106 |                         keys.append(k)
107 |                         vals.append(v)
108 |             return dict(zip(keys, vals))
109 | 
110 |     @staticmethod
111 |     def _normalize_string(text):
112 |         '''Prepares incoming text for parsing:
113 |         removes excessive spaces, tabs, newlines, etc.
114 |         '''
115 |         conversion = {
116 |             # newlines
117 |             r'\r*(\n\r*)+': ', ',
118 |             r'\s*(\,\s*)+': ', ',
119 |             # replace excessive empty spaces
120 |             r'\s+': ' ',
121 |             # convert all types of hyphens/dashes to a
122 |             # simple old-school dash
123 |             # from http://utf8-chartable.de/unicode-utf8-table.pl?
124 |             # start=8192&number=128&utf8=string-literal
125 |             '‐': '-',
126 |             '‑': '-',
127 |             '‒': '-',
128 |             '–': '-',
129 |             '—': '-',
130 |             '―': '-',
131 |         }
132 |         for find, replace in six.iteritems(conversion):
133 |             text = re.sub(find, replace, text, flags=re.UNICODE)
134 |         return text
135 | 


--------------------------------------------------------------------------------
/pyap/source_CA/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vladimarius/pyap/34e33739be46e084becae73980d945af068ae699/pyap/source_CA/__init__.py


--------------------------------------------------------------------------------
/pyap/source_CA/data.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | """
  4 |     pyap.source_US.data
  5 |     ~~~~~~~~~~~~~~~~~~~~
  6 | 
  7 |     This module provides regular expression definitions required for
  8 |     detecting Canada addresses.
  9 | 
 10 |     The module is expected to always contain 'full_address' variable containing
 11 |     all address parsing definitions.
 12 | 
 13 |     :copyright: (c) 2015 by Vladimir Goncharov.
 14 |     :license: MIT, see LICENSE for more details.
 15 | """
 16 | 
 17 | import re
 18 | 
 19 | ''' Numerals from one to nine
 20 | Note: here and below we use syntax like '[Oo][Nn][Ee]'
 21 | instead of '(one)(?i)' to match 'One' or 'oNe' because
 22 | Python Regexps don't seem to support turning On/Off
 23 | case modes for subcapturing groups.
 24 | '''
 25 | zero_to_nine = r"""(?:
 26 |     [Zz][Ee][Rr][Oo]\ |[Oo][Nn][Ee]\ |[Tt][Ww][Oo]\ |
 27 |     [Tt][Hh][Rr][Ee][Ee]\ |[Ff][Oo][Uu][Rr]\ |
 28 |     [Ff][Ii][Vv][Ee]\ |[Ss][Ii][Xx]\ |
 29 |     [Ss][Ee][Vv][Ee][Nn]\ |[Ee][Ii][Gg][Hh][Tt]\ |
 30 |     [Nn][Ii][Nn][Ee]\ |[Tt][Ee][Nn]\ |
 31 |     [Ee][Ll][Ee][Vv][Ee][Nn]\ |
 32 |     [Tt][Ww][Ee][Ll][Vv][Ee]\ |
 33 |     [Tt][Hh][Ii][Rr][Tt][Ee][Ee][Nn]\ |
 34 |     [Ff][Oo][Uu][Rr][Tt][Ee][Ee][Nn]\ |
 35 |     [Ff][Ii][Ff][Tt][Ee][Ee][Nn]\ |
 36 |     [Ss][Ii][Xx][Tt][Ee][Ee][Nn]\ |
 37 |     [Ss][Ee][Vv][Ee][Nn][Tt][Ee][Ee][Nn]\ |
 38 |     [Ee][Ii][Gg][Hh][Tt][Ee][Ee][Nn]\ |
 39 |     [Nn][Ii][Nn][Ee][Tt][Ee][Ee][Nn]\ 
 40 |     )
 41 | """
 42 | 
 43 | # Numerals - 10, 20, 30 ... 90
 44 | ten_to_ninety = r"""(?:
 45 |     [Tt][Ee][Nn]\ |[Tt][Ww][Ee][Nn][Tt][Yy]\ |
 46 |     [Tt][Hh][Ii][Rr][Tt][Yy]\ |
 47 |     [Ff][Oo][Rr][Tt][Yy]\ |
 48 |     [Ff][Oo][Uu][Rr][Tt][Yy]\ |
 49 |     [Ff][Ii][Ff][Tt][Yy]\ |[Ss][Ii][Xx][Tt][Yy]\ |
 50 |     [Ss][Ee][Vv][Ee][Nn][Tt][Yy]\ |
 51 |     [Ee][Ii][Gg][Hh][Tt][Yy]\ |
 52 |     [Nn][Ii][Nn][Ee][Tt][Yy]\ 
 53 |     )"""
 54 | 
 55 | # One hundred
 56 | hundred = r"""(?:
 57 |     [Hh][Uu][Nn][Dd][Rr][Ee][Dd]\ 
 58 |     )"""
 59 | 
 60 | # One thousand
 61 | thousand = r"""(?:
 62 |     [Tt][Hh][Oo][Uu][Ss][Aa][Nn][Dd]\ 
 63 |     )"""
 64 | 
 65 | '''
 66 | Regexp for matching street number.
 67 | Street number can be written 2 ways:
 68 | 1) Using letters - "One thousand twenty two"
 69 | 2) Using numbers
 70 |    a) - "1022"
 71 |    b) - "85-1190"
 72 |         "85 - 1190"
 73 |         "85th - 1190"
 74 |    c) - "85 1190"
 75 | '''
 76 | street_number = r"""(?<![\.0-9])(?P<street_number>
 77 |                         (?:
 78 |                             [Aa][Nn][Dd]\ 
 79 |                             |
 80 |                             {thousand}
 81 |                             |
 82 |                             {hundred}
 83 |                             |
 84 |                             {zero_to_nine}
 85 |                             |
 86 |                             {ten_to_ninety}
 87 |                         ){from_to}
 88 |                         |
 89 |                         # 85th - 1190
 90 |                         (?:\d{from_to}(?:th)?
 91 |                             (?:\ ?\-?\ ?\d{from_to}(?:th)?)?\ 
 92 |                         )
 93 |                         |
 94 |                         # 45
 95 |                         (?:\d{from_to}(?=[\ ,]))
 96 |                     )
 97 |                 """.format(thousand=thousand,
 98 |                            hundred=hundred,
 99 |                            zero_to_nine=zero_to_nine,
100 |                            ten_to_ninety=ten_to_ninety,
101 |                            from_to='{1,5}')
102 | 
103 | '''
104 | Regexp for matching street name.
105 | In example below:
106 | "Hoover Boulevard": "Hoover" is a street name
107 | '''
108 | street_name = r"""(?P<street_name>
109 |                   \w[\w0-9\'\-\ \.]{0,30}?
110 |                  )
111 |               """
112 | 
113 | post_direction = r"""
114 |                     (?P<post_direction>
115 |                         (?:
116 |                             # English
117 |                             [Nn][Oo][Rr][Tt][Hh]{d}|
118 |                             [Ss][Oo][Uu][Tt][Hh]{d}|
119 |                             [Ee][Aa][Ss][Tt]{d}|
120 |                             [Ww][Ee][Ss][Tt]{d}|
121 |                             [Nn][Oo][Rr][Tt][Hh][Ee][Aa][Ss][Tt]{d}|
122 |                             [Nn][Oo][Rr][Tt][Hh][Ww][Ee][Ss][Tt]{d}|
123 |                             [Ss][Oo][Uu][Tt][Hh][Ee][Aa][Ss][Tt]{d}|
124 |                             [Ss][Oo][Uu][Tt][Hh][Ww][Ee][Ss][Tt]{d}|
125 |                             # French
126 |                             [Ee][Ss][Tt]{d}|
127 |                             [Nn][Oo][Rr][Dd]{d}|
128 |                             [Nn][Oo][Rr][Dd]\-[Ee][Ss][Tt]{d}|
129 |                             [Nn][Oo][Rr][Dd]\-[Oo][Uu][Ee][Ss][Tt]{d}|
130 |                             [Ss][Uu][Dd]{d}|
131 |                             [Ss][Uu][Dd]\-[Ee][Ss][Tt]{d}|
132 |                             [Ss][Uu][Dd]\-[Oo][Uu][Ee][Ss][Tt]{d}|
133 |                             [Oo][Uu][Ee][Ss][Tt]{d}
134 |                         )
135 |                         |
136 |                         (?:
137 |                             # English
138 |                             NW{d}|NE{d}|SW{d}|SE{d}|
139 |                             # French (missing above)
140 |                             NO{d}|SO{d}
141 |                         )
142 |                         |
143 |                         (?:
144 |                             # English
145 |                             N[\.\ ]|S[\.\ ]|E[\.\ ]|W[\.\ ]|
146 |                             # French (missing above)
147 |                             O[\.\ ]
148 |                         )
149 |                     )
150 |                 """.format(d='[\ ,]')
151 | 
152 | # Regexp for matching street type
153 | # According to
154 | # https://www.canadapost.ca/tools/pg/manual/PGaddress-e.asp#1385939
155 | street_type = r"""
156 |             (?P<street_type>
157 |                 [Aa][Bb][Bb][Ee][Yy]{div}|
158 |                 [Aa][Cc][Rr][Ee][Ss]{div}|
159 |                 [Aa][Ll][Ll][Éé][Ee]{div}|
160 |                 [Aa][Ll][Ll][Ee][Yy]{div}|
161 |                 [Aa][Uu][Tt][Oo][Rr][Oo][Uu][Tt][Ee]{div}|[Aa][Uu][Tt]{div}|
162 |                 [Aa][Vv][Ee][Nn][Uu][Ee]{div}|[Aa][Vv][Ee]?{div}|
163 |                 [Bb][Aa][Yy]{div}|
164 |                 [Bb][Ee][Aa][Cc][Hh]{div}|
165 |                 [Bb][Ee][Nn][Dd]{div}|
166 |                 [Bb][Oo][Uu][Ll][Ee][Vv][Aa][Er][Dd]{div}|[Bb][Ll][Vv][Dd]{div}|[Bb][Oo][Uu][Ll]{div}|
167 |                 # Broadway
168 |                 [Bb][Rr][Oo][Aa][Dd][Ww][Aa][Yy]{div}|
169 |                 [Bb][Yy]\-?[Pp][Aa][Ss][Ss]{div}|
170 |                 [Bb][Yy][Ww][Aa][Yy]{div}|
171 |                 [Cc][Aa][Mm][Pp][Uu][Ss]{div}|
172 |                 [Cc][Aa][Pp][Ee]{div}|
173 |                 [Cc][Aa][Rr][Rr][EéÉ]{div}|[Cc][Aa][Rr]{div}|
174 |                 [Cc][Aa][Rr][Rr][Ee][Ff][Oo][Uu][Rr]{div}|[Cc][Aa][Rr][Re][Ee][Ff]{div}|
175 |                 [Cc][Ee][Nn][Tt][Rr][Ee]{div}|[Cc][Tt][Rr]{div}|
176 |                 [Cc][Ee][Rr][Cc][Ll][Ee]{div}|
177 |                 [Cc][Hh][Aa][Ss][Ee]{div}|
178 |                 [Cc][Hh][Ee][Mm][Ii][Nn]{div}|[Cc][Hh]{div}|
179 |                 [Cc][Ii][Rr][Cc][Ll][Ee]{div}|[Cc][Ii][Rr]{div}|
180 |                 [Cc][Ii][Rr][Cc][Uu][Ii][Tt]{div}|[Cc][Ii][Rr][Cc][Tt]{div}|
181 |                 [Cc][Ll][Oo][Ss][Ee]{div}|
182 |                 [Cc][Oo][Mm][Mm][Oo][Nn]{div}|
183 |                 [Cc][Oo][Nn][Cc][Ee][Ss][Ss][Ii][Oo][Nn]{div}|[Cc][Oo][Nn][Cc]{div}|
184 |                 [Cc][Oo][Rr][Nn][Ee][Rr][Ss]{div}|
185 |                 [Cc][Ôô][Tt][Ee]{div}|
186 |                 [Cc][Oo][Uu][Rr][Ss]{div}|
187 |                 [Cc][Oo][Uu][Rr]{div}|
188 |                 [Cc][Oo][Uu][Rr][Tt]{div}|[Cc][Rr][Tt]{div}|
189 |                 [Cc][Oo][Vv][Ee]{div}|
190 |                 [Cc][Rr][Ee][Ss][Cc][Ee][Nn][Tt]{div}|[Cc][Rr][Ee][Ss]{div}|
191 |                 [Cc][Rr][Oo][Ii][Ss][Ss][Aa][Nn][Tt]{div}|[Cc][Rr][Oo][Ii][Ss]{div}|
192 |                 [Cc][Rr][Oo][Ss][Ss][Ii][Nn][Gg]{div}|[Cc][Rr][Oo][Ss][Ss]{div}|
193 |                 [Cc][Uu][Ll]\-[Dd][Ee]\-[Ss][Aa][Cc]{div}|[Cc][Dd][Ss]{div}|
194 |                 [Dd][Aa][Ll][Ee]{div}|
195 |                 [Dd][Ee][Ll][Ll]{div}|
196 |                 [Dd][Ii][Vv][Ee][Rr][Ss][Ii][Oo][Nn]{div}|[Dd][Ii][Vv][Ee][Rr][Ss]{div}|
197 |                 [Dd][Oo][Ww][Nn][Ss]{div}|
198 |                 [Dd][Rr][Ii][Vv][Ee]{div}|[Dd][Rr]{div}|
199 |                 [Ée][Cc][Hh][Aa][Nn][Gg][Ee][Uu][Rr]{div}|[Ée][Cc][Hh]{div}|
200 |                 [Ee][Nn][Dd]{div}|
201 |                 [Ee][Ss][Pp][Ll][Aa][Nn][Aa][Dd][Ee]{div}|[Ee][Ss][Pp][Ll]{div}|
202 |                 [Ee][Ss][Tt][Aa][Tt][Ee][Ss]?{div}|
203 |                 [Ee][Xx][Pp][Rr][Ee][Ss][Ss][Ww][Aa][Yy]{div}|[Ee][Xx][Pp][Yy]{div}|
204 |                 [Ee][Xx][Tt][Ee][Nn][Ss][Ii][Oo][Nn]{div}|[Ee][Xx][Tt][Ee][Nn]{div}|
205 |                 [Ff][Aa][Rr][Mm]{div}|
206 |                 [Ff][Ii][Ee][Ll][Dd]{div}|
207 |                 [Ff][Oo][Rr][Ee][Ss][Tt]{div}|
208 |                 [Ff][Rr][Ee][Ee][Ww][Aa][Yy]{div}|[Ff][Ww][Yy]{div}|
209 |                 [Ff][Rr][Oo][Nn][Tt]{div}|
210 |                 [Gg][Aa][Rr][Dd][Ee][Nn][Ss]{div}|[Gg][Dd][Nn][Ss]{div}|
211 |                 [Gg][Aa][Tt][Ee]{div}|
212 |                 [Gg][Ll][Aa][Dd][Ee]{div}|
213 |                 [Gg][Ll][Ee][Nn]{div}|
214 |                 [Gg][Rr][Ee][Ee][Nn]{div}|
215 |                 [Gg][Rr][Uo][Uu][Nn][Dd][Ss]{div}|[Gg][Rr][Nn][Dd][Ss]{div}|
216 |                 [Gg][Rr][Oo][Vv][Ee]{div}|
217 |                 [Hh][Aa][Rr][Bb][Oo][Uu][Rr]{div}|[Hh][Aa][Rr][Bb][Rr]{div}|
218 |                 [Hh][Ee][Aa][Tt][Hh]{div}|
219 |                 [Hh][Ee][Ii][Gg][Hh][Tt][Ss]{div}|[Hh][Tt][Ss]{div}|
220 |                 [Hh][Ii][Gg][Hh][Ll][Aa][Nn][Dd][Ss]{div}|[Hh][Gg][Hh][Ll][Dd][Sd]{div}|
221 |                 [Hh][Ii][Gg][Gh][Ww][Aa][Yy]{div}|[Hh][Ww][Yy]{div}|
222 |                 [Hh][Ii][Ll][Ll]{div}|
223 |                 [Hh][Oo][Ll][Ll][Oo][Ww]{div}|
224 |                 [Îi][Ll][Ee]{div}|
225 |                 [Ii][Mm][Pp][Aa][Ss][Ss][Ee]{div}|I[Mm][Pp]{div}|
226 |                 [Ii][Nn][Ll][Ee][Tt]{div}|
227 |                 [Ii][Ss][Ll][Aa][Nn][Dd]{div}|
228 |                 [Kk][Ee][Yy]{div}|
229 |                 [Kk][Nn][Oo][Ll][Ll]{div}|
230 |                 [Ll][Aa][Nn][Dd][Ii][Nn][Gg]{div}|[Ll][Aa][Nn][Dd][Nn][Gg]{div}|
231 |                 [Ll][Aa][Nn][Ee]{div}|
232 |                 [Ll][Ii][Mm][Ii][Tt][Ss]{div}|[Ll][Mm][Tt][Ss]{div}|
233 |                 [Ll][Ii][Nn][Ee]{div}|
234 |                 [Ll][Ii][Nn][Kk]{div}|
235 |                 [Ll][Oo][Oo][Kk][Oo][Uu][Tt]{div}|[Ll][Kk][Oo][Uu][Tt]{div}|
236 |                 [Mm][Aa][Ii][Nn][Ww][Aa][Yy]{div}|
237 |                 [Mm][Aa][Ll][Ll]{div}|
238 |                 [Mm][Aa][Nn][Oo][Rr]{div}|
239 |                 [Mm][Aa][Zz][Ee]{div}|
240 |                 [Mm][Ee][Aa][Dd][Oo][Ww]{div}|
241 |                 [Mm][Ee][Ww][Ss]{div}|
242 |                 [Mm][Oo][Nn][Tt][Éé][Ee]{div}|
243 |                 [Mm][Oo][Oo][Rr]{div}|
244 |                 [Mm][Oo][Uu][Nn][Tt][Aa][Ii][Nn]{div}|[Mm][Tt][Nn]{div}|
245 |                 [Mm][Oo][Uu][Nn][Tt]{div}|
246 |                 [Oo][Rr][Cc][Hh][Aa][Rr][Dd]{div}|[Oo][Rr][Cc][Hh]{div}|
247 |                 [Pp][Aa][Rr][Aa][Dd][Ee]{div}|
248 |                 [Pp][Aa][Rr][Cc]{div}|
249 |                 [Pp][Aa][Rr][Kk][Ww][Aa][Yy]{div}|[Pp][Kk][Yy]{div}|
250 |                 [Pp][Aa][Rr][Kk]{div}|[Pp][Kk]{div}|
251 |                 [Pp][Aa][Ss][Ss][Aa][Gg][Ee]{div}|[Pp][As][Ss][Ss]{div}|
252 |                 [Pp][Aa][Tt][Hh]{div}|
253 |                 [Pp][Aa][Tt][Hh][Ww][Aa][Yy]{div}|[Pp][Tt][Ww][Aa][Yy]{div}|
254 |                 [Pp][Ii][Nn][Ee][Ss]{div}|
255 |                 [Pp][Ll][Aa][Cc][Ee]{div}|[Pp][Ll]{div}|
256 |                 [Pp][Ll][Aa][Tt][Ee][Aa][Uu]{div}|[Pp][Ll][Aa][Tt]{div}|
257 |                 [Pp][Ll][Aa][Zz][Aa]{div}|
258 |                 [Pp][Oo][Ii][Nn][Tt][Ee]{div}|
259 |                 [Pp][Oo][Ii][Nn][Tt]{div}|[Pp][Tt]{div}|
260 |                 [Pp][Oo][Rr][Tt]{div}|
261 |                 [Pp][Rr][Ii][Vv][Aa][Tt][Ee]{div}|[Pp][Vv][Tt]{div}|
262 |                 [Pp][Rr][Oo][Mm][Ee][Nn][Aa][Dd][Ee]{div}|[Pp][Rr][Oo][Mm]{div}|
263 |                 [Qq][Uu][Aa][Ii]{div}|
264 |                 [Qq][Uu][Aa][Yy]{div}|
265 |                 [Rr][Aa][Mm][Pp]{div}|
266 |                 [Rr][Aa][Nn][Gg][Ee]{div}|[Rr][Gg]{div}|
267 |                 [Rr][Aa][Nn][Gg]{div}|
268 |                 [Rr][Ii][Dd][Gg][Ee]{div}|
269 |                 [Rr][Ii][Ss][Ee]{div}|
270 |                 [Rr][Oo][Aa][Dd]{div}|[Rr][Dd]{div}|
271 |                 [Rr][Oo][Nn][Dd]\-[Pp][Oo][Ii][Nn][Tt]{div}|[Rr][Dd][Pp][Tt]{div}|
272 |                 [Rr][Oo][Uu][Tt][Ee]{div}|[Rr][Tt][Ee]{div}|
273 |                 [Rr][Oo][Ww]{div}|
274 |                 [Rr][Uu][Ee][Ll][Ll][Ee]{div}|[Rr][Ll][Ee]{div}|
275 |                 [Rr][Uu][Ee]{div}|
276 |                 [Rr][Uu][Nn]{div}|
277 |                 [Ss][Ee][Nn][Tt][Ii][Ee][Rr]{div}|[Ss][Ee][Nn][Tt]{div}|
278 |                 # Street
279 |                 [Ss][Tt][Rr][Ee][Ee][Tt]{div}|[Ss][Tt](?![A-Za-z]){div}|
280 |                 # Square
281 |                 [Ss][Qq][Uu][Aa][Rr][Ee]{div}|[Ss][Qq]{div}|
282 |                 [Ss][Uu][Bb][Dd][Ii][Vv][Ii][Ss][Ii][Oo][Nn]{div}|[Ss][Uu][Bb][Dd][Ii][Vv]{div}|
283 |                 [Tt][Ee][Rr][Rr][Aa][Cc][Ee]{div}|[Tt][Ee][Re][Re]{div}|
284 |                 [Tt][Ee][Rr][Rr][Aa][Ss][Ss][Ee]{div}|[Tt][Ss][Ss][Es]{div}|
285 |                 [Tt][Hh][Ii][Cc][Kk][Ee][Tt]{div}|[Tt][Hh][Ii][Cc][Kk]{div}|
286 |                 [Tt][Oo][Ww][Ee][Rr][Ss]{div}|
287 |                 [Tt][Oo][Ww][Nn][Ll][Ii][Nn][Ee]{div}|[Tt][Ll][Ii][Nn][Ee]{div}|
288 |                 [Tt][Rr][Aa][Ii][Ll]{div}|
289 |                 [Tt][Uu][Rr][Nn][Aa][Bb][Oo][Uu][Tt]{div}|[Tt][Rr][Nn][Aa][Bb][Tt]{div}|
290 |                 [Vv][Aa][Ll][Ee]{div}|
291 |                 [Vv][Ii][Aa]{div}|
292 |                 [Vv][Ii][Ee][Ww]{div}|
293 |                 [Vv][Ii][Ll][Ll][Aa][Gg][Ee]{div}|[Vv][Ii][Ll][Ll][Gg][Ee]{div}|
294 |                 [Vv][Ii][Ll][Ll][Aa][Ss]{div}|
295 |                 [Vv][Ii][Ss][Tt][Aa]{div}|
296 |                 [Vv][Oo][Ii][Ee]{div}|
297 |                 [Ww][Aa][Ll][Lk]{div}|
298 |                 [Ww][Aa][Yy]{div}|
299 |                 [Ww][Hh][Aa][Rr][Ff]{div}|
300 |                 [Ww][Oo][Oo][Dd]{div}|
301 |                 [Ww][Yy][Nn][Dd]{div}
302 |             )
303 |             (?P<route_id>
304 |                 [\(\ \,]{route_symbols}
305 |                 [Rr][Oo][Uu][Tt][Ee]\ [A-Za-z0-9]+[\)\ \,]{route_symbols}
306 |             )?
307 |             """.format(div="[\.\ ,]{0,2}", route_symbols='{0,3}')
308 | 
309 | floor = r"""
310 |             (?P<floor>
311 |                 (?:
312 |                 \d+[A-Za-z]{0,2}\.?\ [Ff][Ll][Oo][Oo][Rr]\ 
313 |                 )
314 |                 |
315 |                 (?:
316 |                     [Ff][Ll][Oo][Oo][Rr]\ \d+[A-Za-z]{0,2}\ 
317 |                 )
318 |             )
319 |         """
320 | 
321 | building = r"""
322 |             (?:
323 |                 (?:
324 |                     (?:[Bb][Uu][Ii][Ll][Dd][Ii][Nn][Gg])
325 |                     |
326 |                     (?:[Bb][Ll][Dd][Gg])
327 |                 )
328 |                 \ \d{0,2}[A-Za-z]?
329 |             )
330 |             """
331 | 
332 | occupancy = r"""
333 |             (?:
334 |                 (?:
335 |                     (?:
336 |                         #
337 |                         # English
338 |                         #
339 |                         # Suite
340 |                         [Ss][Uu][Ii][Tt][Ee]\ |[Ss][Tt][Ee]\.?\ 
341 |                         |
342 |                         # Apartment
343 |                         [Aa][Pp][Tt]\.?\ |[Aa][Pp][Aa][Rr][Tt][Mm][Ee][Nn][Tt]\ 
344 |                         |
345 |                         # Room
346 |                         [Rr][Oo][Oo][Mm]\ |[Rr][Mm]\.?\ 
347 |                         |
348 |                         # Unit
349 |                         [Uu][Nn][Ii][Tt]\ 
350 |                         |
351 |                         #
352 |                         # French
353 |                         #
354 |                         # Apartement
355 |                         [Aa][Pp][Aa][Rr][Tt][Ee][Mm][Ee][Nn][Tt]\ |A[Pp][Pp]\ 
356 |                         |
357 |                         # Bureau
358 |                         [Bb][Uu][Rr][Ee][Aa][Uu]\ 
359 |                         |
360 |                         # Unité
361 |                         [Uu][Nn][Ii][Tt][Éé]\ 
362 |                     )
363 |                     (?:
364 |                         [A-Za-z\#\&\-\d]{1,7}
365 |                     )?
366 |                 )
367 |                 |
368 |                 (?:
369 |                     \#[0-9]{,3}[A-Za-z]{1}
370 |                 )
371 |             )\ ?
372 |             """
373 | 
374 | po_box = r"""
375 |             (?P<postal_box>
376 |                 # English - PO Box 123
377 |                 (?:[Pp]\.?\ ?[Oo]\.?\ [Bb][Oo][Xx]\ \d+)
378 |                 |
379 |                 # French - B.P. 123
380 |                 (?:[Bb]\.?\ [Pp]\.?\ \d+)
381 |                 |
382 |                 # C.P. 123
383 |                 (?:[Cc]\.?\ [Pp]\.?\ \d+)
384 |                 |
385 |                 # Case postale 123
386 |                 (?:[Cc]ase\ [Pp][Oo][Ss][Tt][Aa][Ll][Ee]\ \d+)
387 |                 |
388 |                 # C.P. 123
389 |                 (?:[Cc]\.[Pp]\.\ \d+)
390 |             )
391 |         """
392 | 
393 | '''Define detection rules for a second type of address format
394 |    (the French one)
395 | '''
396 | street_number_b = re.sub('<([a-z\_]+)>', r'<\1_b>', street_number)
397 | street_name_b = re.sub('<([a-z\_]+)>', r'<\1_b>', street_name)
398 | street_type_b = re.sub('<([a-z\_]+)>', r'<\1_b>', street_type)
399 | po_box_b = re.sub('<([a-z\_]+)>', r'<\1_b>', po_box)
400 | post_direction_b = re.sub('<([a-z\_]+)>', r'<\1_b>', post_direction)
401 | 
402 | po_box_positive_lookahead = r"""
403 |             (?=
404 |                 # English - PO Box 123
405 |                 (?:[Pp]\.?\ ?[Oo]\.?\ [Bb][Oo][Xx]\ \d+)
406 |                 |
407 |                 # French - B.P. 123
408 |                 (?:[Bb]\.?\ [Pp]\.?\ \d+)
409 |                 |
410 |                 # C.P. 123
411 |                 (?:[Cc]\.?\ [Pp]\.?\ \d+)
412 |                 |
413 |                 # Case postale 123
414 |                 (?:[Cc]ase\ [Pp][Oo][Ss][Tt][Aa][Ll][Ee]\ \d+)
415 |                 |
416 |                 # C.P. 123
417 |                 (?:[Cc]\.[Pp]\.\ \d+)
418 |                 |
419 |                 (?:[\ \,])
420 |             )
421 |         """
422 | 
423 | full_street = r"""
424 |     (?:
425 |         # Format commonly used in French
426 |         (?P<full_street_b>
427 | 
428 |             {street_number_b}{div}
429 |             {street_type_b}{div}
430 |             ({street_name_b} {po_box_positive_lookahead})?\,?\ ?
431 |             {post_direction_b}?\,?\ ?
432 |             {po_box_b}?\,?\ ?
433 |         )
434 |         |
435 |         # Format commonly used in English
436 |         (?P<full_street>
437 | 
438 |             {street_number}\,?\ ?
439 |             {street_name}?\,?\ ?
440 |             (?:(?<=[\ \,]){street_type})\,?\ ?
441 |             {post_direction}?\,?\ ?
442 |             {floor}?\,?\ ?
443 | 
444 |             (?P<building_id>
445 |                 {building}
446 |             )?\,?\ ?
447 | 
448 |             (?P<occupancy>
449 |                 {occupancy}
450 |             )?\,?\ ?
451 | 
452 |             {po_box}?
453 |         )
454 |     )""".format(street_number=street_number,
455 |                 street_number_b=street_number_b,
456 | 
457 |                 street_name=street_name,
458 |                 street_name_b=street_name_b,
459 | 
460 |                 street_type=street_type,
461 |                 street_type_b=street_type_b,
462 | 
463 |                 post_direction=post_direction,
464 |                 post_direction_b=post_direction_b,
465 | 
466 |                 floor=floor,
467 |                 building=building,
468 |                 occupancy=occupancy,
469 | 
470 |                 po_box=po_box,
471 |                 po_box_b=po_box_b,
472 |                 po_box_positive_lookahead=po_box_positive_lookahead,
473 | 
474 |                 div='[\ ,]{1,2}',
475 |                 )
476 | 
477 | # region1 here is actually a "province"
478 | region1 = r"""
479 |         (?P<region1>
480 |             (?:
481 |                 # province abbreviations (English)
482 |                 A\.?B\.?|B\.?C\.?|M\.?B\.?|N\.?B\.?|N\.?L\.?|
483 |                 N\.?T\.?|N\.?S\.?|N\.?U\.?|O\.?N\.?|P\.?E\.?|
484 |                 Q\.?C\.?|S\.?K\.?|Y\.?T\.?
485 |             )
486 |             |
487 |             (?:
488 |                 # provinces full (English)
489 |                 [Aa][Ll][Bb][Ee][Rr][Tt][Aa]|
490 |                 [Bb][Rr][Ii][Tt][Ii][Ss][Hh]\ [Cc][Oo][Ll][Uu][Mm][Bb][Ii][Aa]|
491 |                 [Mm][Aa][Nn][Ii][Tt][Oo][Bb][Aa]|
492 |                 [Nn][Ee][Ww]\ [Bb][Rr][Uu][Nn][Ss][Ww][Ii][Cc][Kk]|
493 |                 [Nn][Ee][Ww][Ff][Oo][Uu][Nn][Dd][Ll][Aa][Nn][Dd]\ 
494 |                 [Aa][Nn][Dd]\ [Ll][Aa][Bb][Rr][Aa][Dd][Oo][Rr]|
495 |                 [Nn][Ee][Ww][Ff][Oo][Uu][Nn][Dd][Ll][Aa][Nn][Dd]\ 
496 |                 \&\ [Ll][Aa][Bb][Rr][Aa][Dd][Oo][Rr]|
497 |                 [Nn][Oo][Rr][Tt][Hh][Ww][Ee][Ss][Tt]\ 
498 |                 [Tt][Ee][Rr][Rr][Ii][Tt][Oo][Rr][Ii][Ee][Ss]|
499 |                 [Nn][Oo][Vv][Aa]\ [Ss][Cc][Oo][Tt][Ii][Aa]|
500 |                 [Nn][Uu][Nn][Aa][Vv][Uu][Tt]|
501 |                 [Oo][Nn][Tt][Aa][Rr][Ii][Oo]|
502 |                 [Pp][Rr][Ii][Nn][Cc][Ee]\ [Ee][Dd][Ww][Aa][Rr][Dd]\ 
503 |                 [Ii][Ss][Ll][Aa][Nn][Dd]|
504 |                 [Qq][Uu][Ee][Bb][Ee][Cc]|
505 |                 [Ss][Aa][Ss][Kk][Aa][Tt][Cc][Hh][Ee][Ww][Aa][Nn]|
506 |                 [Yy][Uu][Kk][Oo][Nn]|
507 |                 # provinces full (French)
508 |                 [Cc][Oo][Ll][Oo][Mm][Bb][Ii][Ee]\-
509 |                 [Bb][Rr][Ii][Tt][Aa][Nn]{1,2}[Ii][Qq][Eu][Ee]|
510 |                 [Nn][Oo][Uu][Vv][Ee][Aa][Uu]\-[Bb][Rr][Uu][Nn][Ss][Ww][Ii][Cc][Kk]|
511 |                 [Tt][Ee][Rr][Rr][Ee]\-[Nn][Ee][Uu][Vv][Ee]\-
512 |                 [Ee][Tt]\-[Ll][Aa][Bb][Rr][Aa][Dd][Oo][Rr]|
513 |                 [Tt][Ee][Rr][Rr][Ii][Tt][Oo][Ii][Rr][Ee][Ss]\ [Dd][Uu]\ 
514 |                 [Nn][Oo][Rr][Dd]\-[Oo][Uu][Ee][Ss][Tt]|
515 |                 [Nn][Oo][Uu][Vv][Ee][Ll][Ll][Ee]\-[ÉéEe][Cc][Oo][Ss][Ss][Ee]|
516 |                 [ÎîIi][Ll][Ee]\-[Dd][Uu]\-[Pp][Rr][Ii][Nn][Cc][Ee]\-
517 |                 [ÉéEe][Dd][Oo][Uu][Aa][Rr][Dd]|
518 |                 [Qq][Uu][Éé][Bb][Ee][Cc]
519 |             )
520 |         )
521 |         """
522 | 
523 | city = r"""
524 |         (?P<city>
525 |             (?<=[\, ])[A-z]{1}(?![0-9]) # city second char should not be number
526 |             [\w\ \-\'\.]{2,20}?(?=[\, ])
527 |         )
528 |         """
529 | 
530 | postal_code = r"""
531 |             (?P<postal_code>
532 |                 (?:
533 |                     [ABCEGHJKLMNPRSTVXY]\d[ABCEGHJKLMNPRSTVWXYZ]\ ?
534 |                     \d[ABCEGHJKLMNPRSTVWXYZ]\d
535 |                 )
536 |             )
537 |             """
538 | 
539 | country = r"""
540 |             (?:
541 |                 [Cc][Aa][Nn][Aa][Dd][Aa]
542 |             )
543 |             """
544 | 
545 | # define detection rules for postal code placed in different parts of address
546 | postal_code_b = re.sub('<([a-z\_]+)>', r'<\1_b>', postal_code)
547 | postal_code_c = re.sub('<([a-z\_]+)>', r'<\1_c>', postal_code)
548 | 
549 | full_address = r"""
550 |                 (?P<full_address>
551 |                     {full_street} {div}
552 |                     {city} {div}
553 |                     (?:{postal_code_c} {div})?
554 |                     \(?{region1}[\)\.]? {div}
555 |                     (?:
556 |                         (?:
557 |                             {postal_code}? {div} {country}? 
558 |                             (?:{div} {postal_code_b})?
559 |                         )
560 |                     )
561 |                 )
562 |                 """.format(
563 |     full_street=full_street,
564 |     div='[\, ]{,2}',
565 |     city=city,
566 |     region1=region1,
567 | 
568 |     country=country,
569 |     country_b=country,
570 | 
571 |     postal_code=postal_code,
572 |     postal_code_b=postal_code_b,
573 |     postal_code_c=postal_code_c,
574 | )
575 | 


--------------------------------------------------------------------------------
/pyap/source_GB/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vladimarius/pyap/34e33739be46e084becae73980d945af068ae699/pyap/source_GB/__init__.py


--------------------------------------------------------------------------------
/pyap/source_GB/data.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | """
  4 |     pyap.source_GB.data
  5 |     ~~~~~~~~~~~~~~~~~~~~
  6 | 
  7 |     This module provides regular expression definitions required for
  8 |     detecting British/GB/UK addresses.
  9 | 
 10 |     The module is expected to always contain 'full_address' variable containing
 11 |     all address parsing definitions.
 12 | 
 13 |     :copyright: (c) 2015 by Vladimir Goncharov.
 14 |     :license: MIT, see LICENSE for more details.
 15 | """
 16 | 
 17 | 
 18 | '''Numerals from one to nine
 19 | Note: here and below we use syntax like '[Oo][Nn][Ee]'
 20 | instead of '(one)(?i)' to match 'One' or 'oNe' because
 21 | Python Regexps don't seem to support turning On/Off
 22 | case modes for subcapturing groups.
 23 | '''
 24 | zero_to_nine = r"""
 25 |                                 (?:
 26 |                                     [Zz][Ee][Rr][Oo]\ |[Oo][Nn][Ee]\ |[Tt][Ww][Oo]\ |
 27 |                                     [Tt][Hh][Rr][Ee][Ee]\ |[Ff][Oo][Uu][Rr]\ |
 28 |                                     [Ff][Ii][Vv][Ee]\ |[Ss][Ii][Xx]\ |
 29 |                                     [Ss][Ee][Vv][Ee][Nn]\ |[Ee][Ii][Gg][Hh][Tt]\ |
 30 |                                     [Nn][Ii][Nn][Ee]\ |[Tt][Ee][Nn]\ |
 31 |                                     [Ee][Ll][Ee][Vv][Ee][Nn]\ |
 32 |                                     [Tt][Ww][Ee][Ll][Vv][Ee]\ |
 33 |                                     [Tt][Hh][Ii][Rr][Tt][Ee][Ee][Nn]\ |
 34 |                                     [Ff][Oo][Uu][Rr][Tt][Ee][Ee][Nn]\ |
 35 |                                     [Ff][Ii][Ff][Tt][Ee][Ee][Nn]\ |
 36 |                                     [Ss][Ii][Xx][Tt][Ee][Ee][Nn]\ |
 37 |                                     [Ss][Ee][Vv][Ee][Nn][Tt][Ee][Ee][Nn]\ |
 38 |                                     [Ee][Ii][Gg][Hh][Tt][Ee][Ee][Nn]\ |
 39 |                                     [Nn][Ii][Nn][Ee][Tt][Ee][Ee][Nn]\ 
 40 |                                 )
 41 | """
 42 | 
 43 | # Numerals - 10, 20, 30 ... 90
 44 | ten_to_ninety = r"""
 45 |                                 (?:
 46 |                                     [Tt][Ee][Nn]\ |[Tt][Ww][Ee][Nn][Tt][Yy]\ |
 47 |                                     [Tt][Hh][Ii][Rr][Tt][Yy]\ |
 48 |                                     [Ff][Oo][Rr][Tt][Yy]\ |
 49 |                                     [Ff][Oo][Uu][Rr][Tt][Yy]\ |
 50 |                                     [Ff][Ii][Ff][Tt][Yy]\ |[Ss][Ii][Xx][Tt][Yy]\ |
 51 |                                     [Ss][Ee][Vv][Ee][Nn][Tt][Yy]\ |
 52 |                                     [Ee][Ii][Gg][Hh][Tt][Yy]\ |
 53 |                                     [Nn][Ii][Nn][Ee][Tt][Yy]\ 
 54 |                                 )
 55 | """
 56 | 
 57 | # One hundred
 58 | hundred = r"""
 59 |                                 (?:
 60 |                                     [Hh][Uu][Nn][Dd][Rr][Ee][Dd]\ 
 61 |                                 )
 62 | """
 63 | 
 64 | # One thousand
 65 | thousand = r"""
 66 |                                 (?:
 67 |                                     [Tt][Hh][Oo][Uu][Ss][Aa][Nn][Dd]\ 
 68 |                                 )
 69 | """
 70 | 
 71 | part_divider = r'(?: [\,\ \.\-]{0,3}\,[\,\ \.\-]{0,3} )'
 72 | space_pattern = r'(?: [\ \t]{1,3} )'  # TODO: use \b for word boundary and \s for whitespace
 73 | 
 74 | '''
 75 | Regexp for matching street number.
 76 | Street number can be written 2 ways:
 77 | 1) Using letters - "One thousand twenty two"
 78 | 2) Using numbers
 79 |    a) - "1022"
 80 |    b) - "85-1190"
 81 |    c) - "85 1190"
 82 | '''
 83 | street_number = r"""
 84 |                     (?P<street_number>
 85 |                         (?:
 86 |                             (?:
 87 |                                 [Nn][Uu][Mm][Bb][Ee][Rr]|
 88 |                                 [Nn][RrOo]\.?|
 89 |                                 [Nn][Uu][Mm]\.?|
 90 |                                 #
 91 |                             )
 92 |                             {space}?
 93 |                         )?
 94 |                         (?:
 95 |                             (?:
 96 |                                 [Aa][Nn][Dd]\ 
 97 |                                 |
 98 |                                 {thousand}
 99 |                                 |
100 |                                 {hundred}
101 |                                 |
102 |                                 {zero_to_nine}
103 |                                 |
104 |                                 {ten_to_ninety}
105 |                             ){from_to}
106 |                             |
107 |                             (?:
108 |                                 \d{from_to} 
109 |                                 (?: {space}? [A-Za-z] (?![A-Za-z\d]]) )? 
110 |                                 (?!\d)
111 |                                 (?:{space}?\-{space}?\d{from_to} (?: {space}? [A-Za-z] (?![A-Za-z\d]) )? )?
112 |                             )
113 |                         )
114 |                         {space}?
115 |                     )  # end street_number
116 | """.format(
117 |     thousand=thousand,
118 |     hundred=hundred,
119 |     zero_to_nine=zero_to_nine,
120 |     ten_to_ninety=ten_to_ninety,
121 |     space=space_pattern,
122 |     from_to='{1,5}',
123 | )
124 | 
125 | '''
126 | Regexp for matching street name.
127 | In "Hoover Boulevard", "Hoover" is a street name
128 | Seems like the longest US street is 'Northeast Kentucky Industrial Parkway' - 31 charactors
129 | https://atkinsbookshelf.wordpress.com/tag/longest-street-name-in-us/
130 | '''
131 | street_name = r"""
132 |                     (?P<street_name>
133 |                         (?(street_number)           # If street_number has been found, then digits can
134 |                             [a-zA-Z0-9\ \.]{3,31}   # be in the street otherwise no digits are allowed.
135 |                             |                       # This aims to prevent street_name matching everything before the
136 |                             [a-zA-Z\ \.]{3,31}      # address as well as the number.
137 |                         )
138 |                     )
139 | """
140 | 
141 | post_direction = r"""
142 |                     (?P<post_direction>
143 |                         (?:
144 |                             [Nn][Oo][Rr][Tt][Hh]\ |
145 |                             [Ss][Oo][Uu][Tt][Hh]\ |
146 |                             [Ee][Aa][Ss][Tt]\ |
147 |                             [Ww][Ee][Ss][Tt]\ 
148 |                         )
149 |                         |
150 |                         (?:
151 |                             NW\ |NE\ |SW\ |SE\ 
152 |                         )
153 |                         |
154 |                         (?:
155 |                             N\.?\ |S\.?\ |E\.?\ |W\.?\ 
156 |                         )
157 |                     )  # end post_direction
158 | """
159 | 
160 | # Regexp for matching street type
161 | street_type = r"""
162 |                     (?:
163 |                         (?P<street_type>
164 |                             # Street
165 |                             [Ss][Tt][Rr][Ee][Ee][Tt]|S[Tt]\.?(?![A-Za-z])|
166 |                             # Boulevard
167 |                             [Bb][Oo][Uu][Ll][Ee][Vv][Aa][Rr][Dd]|[Bb][Ll][Vv][Dd]\.?|
168 |                             # Highway
169 |                             [Hh][Ii][Gg][Hh][Ww][Aa][Yy]|H[Ww][Yy]\.?|
170 |                             # Broadway
171 |                             [Bb][Rr][Oo][Aa][Dd][Ww][Aa][Yy]|
172 |                             # Freeway
173 |                             [Ff][Rr][Ee][Ee][Ww][Aa][Yy]|
174 |                             # Causeway
175 |                             [Cc][Aa][Uu][Ss][Ee][Ww][Aa][Yy]|C[Ss][Ww][Yy]\.?|
176 |                             # Expressway
177 |                             [Ee][Xx][Pp][Rr][Ee][Ss][Ss][Ww][Aa][Yy]|
178 |                             # Way
179 |                             [Ww][Aa][Yy]|
180 |                             # Walk
181 |                             [Ww][Aa][Ll][Kk]|
182 |                             # Lane
183 |                             [Ll][Aa][Nn][Ee]|L[Nn]\.?|
184 |                             # Road
185 |                             [Rr][Oo][Aa][Dd]|R[Dd]\.?|
186 |                             # Avenue
187 |                             [Aa][Vv][Ee][Nn][Uu][Ee]|A[Vv][Ee]\.?|
188 |                             # Circle
189 |                             [Cc][Ii][Rr][Cc][Ll][Ee]|C[Ii][Rr]\.?|
190 |                             # Cove
191 |                             [Cc][Oo][Vv][Ee]|C[Vv]\.?|
192 |                             # Drive
193 |                             [Dd][Rr][Ii][Vv][Ee]|D[Rr]\.?|
194 |                             # Parkway
195 |                             [Pp][Aa][Rr][Kk][Ww][Aa][Yy]|P[Kk][Ww][Yy]\.?|
196 |                             # Park
197 |                             [Pp][Aa][Rr][Kk]|
198 |                             # Court
199 |                             [Cc][Oo][Uu][Rr][Tt]|C[Tt]\.?|
200 |                             # Square
201 |                             [Ss][Qq][Uu][Aa][Rr][Ee]|S[Qq]\.?|
202 |                             # Loop
203 |                             [Ll][Oo][Oo][Pp]|L[Pp]\.?|
204 |                             # Place
205 |                             [Pp][Ll][Aa][Cc][Ee]|P[Ll]\.?|
206 |                             # Parade
207 |                             [Pp][Aa][Rr][Aa][Dd][Ee]|P[Ll]\.?|
208 |                             # Estate
209 |                             [Ee][Ss][Tt][Aa][Tt][Ee]
210 |                         )
211 |                         (?P<route_id>)
212 |                     )  # end street_type
213 | """.format(
214 |     route_symbols=r'{0,3}',
215 | )
216 | 
217 | floor = r"""
218 |                     (?P<floor>
219 |                         (?:
220 |                         \d+[A-Za-z]{0,2}\.?\ [Ff][Ll][Oo][Oo][Rr]\ 
221 |                         )
222 |                         |
223 |                         (?:
224 |                             [Ff][Ll][Oo][Oo][Rr]\ \d+[A-Za-z]{0,2}\ 
225 |                         )
226 |                     )  # end floor
227 | """
228 | 
229 | building = r"""
230 |                     (?P<building_id>
231 |                         (?:
232 |                             (?:[Bb][Uu][Ii][Ll][Dd][Ii][Nn][Gg])
233 |                             |
234 |                             (?:[Bb][Ll][Dd][Gg])
235 |                         )
236 |                         \ 
237 |                         (?:
238 |                             (?:
239 |                                 [Aa][Nn][Dd]\ 
240 |                                 |
241 |                                 {thousand}
242 |                                 |
243 |                                 {hundred}
244 |                                 |
245 |                                 {zero_to_nine}
246 |                                 |
247 |                                 {ten_to_ninety}
248 |                             ){{1,5}}
249 |                             |
250 |                             \d{{0,4}}[A-Za-z]?
251 |                         )
252 |                         \ ?
253 |                     )  # end building_id
254 | """.format(
255 |     thousand=thousand,
256 |     hundred=hundred,
257 |     zero_to_nine=zero_to_nine,
258 |     ten_to_ninety=ten_to_ninety,
259 | )
260 | 
261 | occupancy = r"""
262 |                     (?P<occupancy>
263 |                         (?:
264 |                             (?:
265 |                                 # Suite
266 |                                 [Ss][Uu][Ii][Tt][Ee]|[Ss][Tt][Ee]\.?
267 |                                 |
268 |                                 # Studio
269 |                                 [Ss][Tt][Uu][Dd][Ii][Oo]|[Ss][Tt][UuDd]\.?
270 |                                 |
271 |                                 # Apartment
272 |                                 [Aa][Pp][Tt]\.?|[Aa][Pp][Aa][Rr][Tt][Mm][Ee][Nn][Tt]
273 |                                 |
274 |                                 # Room
275 |                                 [Rr][Oo][Oo][Mm]|[Rr][Mm]\.?
276 |                                 |
277 |                                 # Flat
278 |                                 [Ff][Ll][Aa][Tt]
279 |                                 |
280 |                                 \#
281 |                             )
282 |                             {space}?
283 |                             (?:
284 |                                 [A-Za-z\#\&\-\d]{{1,7}}
285 |                             )?
286 |                         )
287 |                         {space}?
288 |                     )  # end occupancy
289 | """.format(
290 |     space=space_pattern,
291 | )
292 | 
293 | po_box = r"""
294 |                     (?:
295 |                         [Pp]\.? {space}? [Oo]\.? {space}? ([Bb][Oo][Xx]{space}?)?\d+
296 |                     )
297 | """.format(
298 |     space=space_pattern,
299 | )
300 | 
301 | full_street = r"""
302 |         (?:
303 |             (?P<full_street>
304 |     
305 |                 (?:
306 |                     {po_box} {part_divider}?  # TODO: maybe remove the '?' on the part_dividers is mismatch address parts 
307 |                 )?
308 |                 (?:
309 |                     {floor} {part_divider}?
310 |                 )?
311 |                 (?:
312 |                     {occupancy} {part_divider}?
313 |                 )?
314 |                 (?:
315 |                     {building} {part_divider}?
316 |                 )?
317 |                 
318 |                 (?:
319 |                     (?: {street_number} {space} )
320 |                     |
321 |                     (?! \d{{}} ) 
322 |                     
323 |                 )?
324 |                 (?:{street_name} )
325 |                 (?:{space} {street_type} {space}?)? 
326 |             )
327 |         )  # end full_street
328 | """.format(
329 |     street_number=street_number,
330 |     street_name=street_name,
331 |     street_type=street_type,
332 |     post_direction=post_direction,
333 |     floor=floor,
334 |     building=building,
335 |     occupancy=occupancy,
336 |     po_box=po_box,
337 |     part_divider=part_divider,
338 |     space=space_pattern,
339 | )
340 | 
341 | # region1 is actually a "state"
342 | region1 = r"""
343 |         (?P<region1>
344 |             [A-Za-z]{1}[a-zA-Z0-9\ \.\-']{1,35}
345 |         )  # end region1 
346 | """
347 | 
348 | city = r"""
349 |         (?P<city>
350 |             [A-Za-z]{1}[a-zA-Z0-9\ \.\-']{1,35}
351 |         )  # end city
352 | """
353 | 
354 | postal_code = r"""
355 |         (?P<postal_code>
356 |             (?:
357 |                 (?:[gG][iI][rR] {0,}0[aA]{2})|
358 |                 (?:
359 |                     (?:
360 |                         [aA][sS][cC][nN]|
361 |                         [sS][tT][hH][lL]|
362 |                         [tT][dD][cC][uU]|
363 |                         [bB][bB][nN][dD]|
364 |                         [bB][iI][qQ][qQ]|
365 |                         [fF][iI][qQ][qQ]|
366 |                         [pP][cC][rR][nN]|
367 |                         [sS][iI][qQ][qQ]|
368 |                         [iT][kK][cC][aA]
369 |                     )
370 |                     \ {0,}1[zZ]{2}
371 |                 )|
372 |                 (?:
373 |                     (?:
374 |                         (?:[a-pr-uwyzA-PR-UWYZ][a-hk-yxA-HK-XY]?[0-9][0-9]?)|
375 |                         (?:
376 |                             (?:[a-pr-uwyzA-PR-UWYZ][0-9][a-hjkstuwA-HJKSTUW])|
377 |                             (?:[a-pr-uwyzA-PR-UWYZ][a-hk-yA-HK-Y][0-9][abehmnprv-yABEHMNPRV-Y])
378 |                         )
379 |                     )
380 |                     \ {0,}[0-9][abd-hjlnp-uw-zABD-HJLNP-UW-Z]{2}
381 |                 )
382 |             )
383 |         )  # end postal_code
384 | """
385 | 
386 | country = r"""
387 |         (?P<country>
388 |             (?:[Tt][Hh][Ee]\ *)?[Uu][Nn][Ii][Tt][Ee][Dd]\ *[Kk][Ii][Nn][Gg][Dd][Oo][Mm]\ *[Oo][Ff]\ *(?:[Gg][Rr][Ee][Aa][Tt]\ *)?[Bb][Rr][Ii][Tt][Aa][Ii][Nn](?:\ *[Aa][Nn][Dd]\ *[Nn][Oo][Rr][Tt][Hh][Ee][Rr][Nn]\ *[Ii][Rr][Ee][Ll][Aa][Nn][Dd])?|
389 |             (?:[Gg][Rr][Ee][Aa][Tt]\ *)?[Bb][Rr][Ii][Tt][Aa][Ii][Nn](?:\ *[Aa][Nn][Dd]\ *[Nn][Oo][Rr][Tt][Hh][Ee][Rr][Nn]\ *[Ii][Rr][Ee][Ll][Aa][Nn][Dd])?|
390 |             (?:[Tt][Hh][Ee]\ *)?[Uu][Nn][Ii][Tt][Ee][Dd]\ *[Kk][Ii][Nn][Gg][Dd][Oo][Mm]|
391 |             (?:[Nn][Oo][Rr][Tt][Hh][Ee][Rr][Nn]\ *)?[Ii][Rr][Ee][Ll][Aa][Nn][Dd]|
392 |             [Ee][Nn][Gg][Ll][Aa][Nn][Dd]|
393 |             [Ss][Cc][Oo][Tt][Ll][Aa][Nn][Dd]|
394 |             [Ww][Aa][Ll][Ee][Ss]|
395 |             [Cc][Yy][Mm][Rr][Uu]|
396 |             [Gg][Bb]|
397 |             [Uu][Kk]|  
398 |             [Nn]\.?\ *[Ii]\.?
399 |         )  # end country
400 | """
401 | 
402 | full_address = r"""
403 |     (?P<full_address>
404 |         {full_street} 
405 |         (?: {part_divider} {city} )?
406 |         (?: {part_divider} {region1} )?
407 |         {part_divider}? {postal_code} 
408 |         (?: {part_divider} {country} )?
409 |     )  # end full_address
410 | """.format(
411 |     full_street=full_street,
412 |     part_divider=part_divider,
413 |     city=city,
414 |     region1=region1,
415 |     country=country,
416 |     postal_code=postal_code,
417 | )
418 | 


--------------------------------------------------------------------------------
/pyap/source_US/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vladimarius/pyap/34e33739be46e084becae73980d945af068ae699/pyap/source_US/__init__.py


--------------------------------------------------------------------------------
/pyap/source_US/data.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | """
  4 |     pyap.source_US.data
  5 |     ~~~~~~~~~~~~~~~~~~~~
  6 | 
  7 |     This module provides regular expression definitions required for
  8 |     detecting US addresses.
  9 | 
 10 |     The module is expected to always contain 'full_address' variable containing
 11 |     all address parsing definitions.
 12 | 
 13 |     :copyright: (c) 2015 by Vladimir Goncharov.
 14 |     :license: MIT, see LICENSE for more details.
 15 | """
 16 | 
 17 | import string
 18 | 
 19 | 
 20 | '''Numerals from one to nine
 21 | Note: here and below we use syntax like '[Oo][Nn][Ee]'
 22 | instead of '(one)(?i)' to match 'One' or 'oNe' because
 23 | Python Regexps don't seem to support turning On/Off
 24 | case modes for subcapturing groups.
 25 | '''
 26 | zero_to_nine = r"""(?:
 27 |     [Zz][Ee][Rr][Oo]\ |[Oo][Nn][Ee]\ |[Tt][Ww][Oo]\ |
 28 |     [Tt][Hh][Rr][Ee][Ee]\ |[Ff][Oo][Uu][Rr]\ |
 29 |     [Ff][Ii][Vv][Ee]\ |[Ss][Ii][Xx]\ |
 30 |     [Ss][Ee][Vv][Ee][Nn]\ |[Ee][Ii][Gg][Hh][Tt]\ |
 31 |     [Nn][Ii][Nn][Ee]\ |[Tt][Ee][Nn]\ |
 32 |     [Ee][Ll][Ee][Vv][Ee][Nn]\ |
 33 |     [Tt][Ww][Ee][Ll][Vv][Ee]\ |
 34 |     [Tt][Hh][Ii][Rr][Tt][Ee][Ee][Nn]\ |
 35 |     [Ff][Oo][Uu][Rr][Tt][Ee][Ee][Nn]\ |
 36 |     [Ff][Ii][Ff][Tt][Ee][Ee][Nn]\ |
 37 |     [Ss][Ii][Xx][Tt][Ee][Ee][Nn]\ |
 38 |     [Ss][Ee][Vv][Ee][Nn][Tt][Ee][Ee][Nn]\ |
 39 |     [Ee][Ii][Gg][Hh][Tt][Ee][Ee][Nn]\ |
 40 |     [Nn][Ii][Nn][Ee][Tt][Ee][Ee][Nn]\ 
 41 |     )"""
 42 | 
 43 | # Numerals - 10, 20, 30 ... 90
 44 | ten_to_ninety = r"""(?:
 45 |     [Tt][Ee][Nn]\ |[Tt][Ww][Ee][Nn][Tt][Yy]\ |
 46 |     [Tt][Hh][Ii][Rr][Tt][Yy]\ |
 47 |     [Ff][Oo][Rr][Tt][Yy]\ |
 48 |     [Ff][Oo][Uu][Rr][Tt][Yy]\ |
 49 |     [Ff][Ii][Ff][Tt][Yy]\ |[Ss][Ii][Xx][Tt][Yy]\ |
 50 |     [Ss][Ee][Vv][Ee][Nn][Tt][Yy]\ |
 51 |     [Ee][Ii][Gg][Hh][Tt][Yy]\ |
 52 |     [Nn][Ii][Nn][Ee][Tt][Yy]\ 
 53 |     )"""
 54 | 
 55 | # One hundred
 56 | hundred = r"""(?:
 57 |     [Hh][Uu][Nn][Dd][Rr][Ee][Dd]\ 
 58 |     )"""
 59 | 
 60 | # One thousand
 61 | thousand = r"""(?:
 62 |     [Tt][Hh][Oo][Uu][Ss][Aa][Nn][Dd]\ 
 63 |     )"""
 64 | 
 65 | '''
 66 | Regexp for matching street number.
 67 | Street number can be written 2 ways:
 68 | 1) Using letters - "One thousand twenty two"
 69 | 2) Using numbers
 70 |    a) - "1022"
 71 |    b) - "85-1190"
 72 |    c) - "85 1190"
 73 | '''
 74 | street_number = r"""(?P<street_number>
 75 |                         (?:
 76 |                             [Aa][Nn][Dd]\ 
 77 |                             |
 78 |                             {thousand}
 79 |                             |
 80 |                             {hundred}
 81 |                             |
 82 |                             {zero_to_nine}
 83 |                             |
 84 |                             {ten_to_ninety}
 85 |                         ){from_to}
 86 |                         |
 87 |                         (?:\d{from_to}
 88 |                             (?:\ ?\-?\ ?\d{from_to})?\ 
 89 |                         )
 90 |                     )
 91 |                 """.format(thousand=thousand,
 92 |                            hundred=hundred,
 93 |                            zero_to_nine=zero_to_nine,
 94 |                            ten_to_ninety=ten_to_ninety,
 95 |                            from_to='{1,5}')
 96 | 
 97 | '''
 98 | Regexp for matching street name.
 99 | In example below:
100 | "Hoover Boulevard": "Hoover" is a street name
101 | '''
102 | street_name = r"""(?P<street_name>
103 |                   [a-zA-Z0-9\ \.]{3,31}  # Seems like the longest US street is
104 |                                          # 'Northeast Kentucky Industrial Parkway'
105 |                                          # https://atkinsbookshelf.wordpress.com/tag/longest-street-name-in-us/
106 |                  )
107 |               """
108 | 
109 | post_direction = r"""
110 |                     (?P<post_direction>
111 |                         (?:
112 |                             [Nn][Oo][Rr][Tt][Hh]\ |
113 |                             [Ss][Oo][Uu][Tt][Hh]\ |
114 |                             [Ee][Aa][Ss][Tt]\ |
115 |                             [Ww][Ee][Ss][Tt]\ 
116 |                         )
117 |                         |
118 |                         (?:
119 |                             NW\ |NE\ |SW\ |SE\ 
120 |                         )
121 |                         |
122 |                         (?:
123 |                             N\.?\ |S\.?\ |E\.?\ |W\.?\ 
124 |                         )
125 |                     )
126 |                 """
127 | 
128 | # This list was taken from: https://pe.usps.com/text/pub28/28apc_002.htm
129 | # Broadway and Lp (abbreviation for Loop) were added to the list
130 | street_type_list = [
131 |     'Allee', 'Alley', 'Ally', 'Aly', 'Anex', 'Annex',
132 |     'Annx', 'Anx', 'Arc', 'Arcade', 'Av', 'Ave',
133 |     'Aven', 'Avenu', 'Avenue', 'Avn', 'Avnue', 'Bayoo',
134 |     'Bayou', 'Bch', 'Beach', 'Bend', 'Bg', 'Bgs',
135 |     'Blf', 'Blfs', 'Bluf', 'Bluff', 'Bluffs', 'Blvd',
136 |     'Bnd', 'Bot', 'Bottm', 'Bottom', 'Boul', 'Boulevard',
137 |     'Boulv', 'Br', 'Branch', 'Brdge', 'Brg', 'Bridge',
138 |     'Brk', 'Brks', 'Brnch', 'Broadway', 'Brook', 'Brooks',
139 |     'Btm', 'Burg', 'Burgs', 'Byp', 'Bypa', 'Bypas',
140 |     'Bypass', 'Byps', 'Byu', 'Camp', 'Canyn', 'Canyon',
141 |     'Cape', 'Causeway', 'Causwa', 'Cen', 'Cent', 'Center',
142 |     'Centers', 'Centr', 'Centre', 'Cir', 'Circ', 'Circl',
143 |     'Circle', 'Circles', 'Cirs', 'Clb', 'Clf', 'Clfs',
144 |     'Cliff', 'Cliffs', 'Club', 'Cmn', 'Cmns', 'Cmp',
145 |     'Cnter', 'Cntr', 'Cnyn', 'Common', 'Commons', 'Cor',
146 |     'Corner', 'Corners', 'Cors', 'Course', 'Court', 'Courts',
147 |     'Cove', 'Coves', 'Cp', 'Cpe', 'Crcl', 'Crcle',
148 |     'Creek', 'Cres', 'Crescent', 'Crest', 'Crk', 'Crossing',
149 |     'Crossroad', 'Crossroads', 'Crse', 'Crsent', 'Crsnt', 'Crssng',
150 |     'Crst', 'Cswy', 'Ct', 'Ctr', 'Ctrs', 'Cts',
151 |     'Curv', 'Curve', 'Cv', 'Cvs', 'Cyn', 'Dale',
152 |     'Dam', 'Div', 'Divide', 'Dl', 'Dm', 'Dr',
153 |     'Driv', 'Drive', 'Drives', 'Drs', 'Drv', 'Dv',
154 |     'Dvd', 'Est', 'Estate', 'Estates', 'Ests', 'Exp',
155 |     'Expr', 'Express', 'Expressway', 'Expw', 'Expy', 'Ext',
156 |     'Extension', 'Extensions', 'Extn', 'Extnsn', 'Exts', 'Fall',
157 |     'Falls', 'Ferry', 'Field', 'Fields', 'Flat', 'Flats',
158 |     'Fld', 'Flds', 'Fls', 'Flt', 'Flts', 'Ford',
159 |     'Fords', 'Forest', 'Forests', 'Forg', 'Forge', 'Forges',
160 |     'Fork', 'Forks', 'Fort', 'Frd', 'Frds', 'Freeway',
161 |     'Freewy', 'Frg', 'Frgs', 'Frk', 'Frks', 'Frry',
162 |     'Frst', 'Frt', 'Frway', 'Frwy', 'Fry', 'Ft',
163 |     'Fwy', 'Garden', 'Gardens', 'Gardn', 'Gateway', 'Gatewy',
164 |     'Gatway', 'Gdn', 'Gdns', 'Glen', 'Glens', 'Gln',
165 |     'Glns', 'Grden', 'Grdn', 'Grdns', 'Green', 'Greens',
166 |     'Grn', 'Grns', 'Grov', 'Grove', 'Groves', 'Grv',
167 |     'Grvs', 'Gtway', 'Gtwy', 'Harb', 'Harbor', 'Harbors',
168 |     'Harbr', 'Haven', 'Hbr', 'Hbrs', 'Heights', 'Highway',
169 |     'Highwy', 'Hill', 'Hills', 'Hiway', 'Hiwy', 'Hl',
170 |     'Hllw', 'Hls', 'Hollow', 'Hollows', 'Holw', 'Holws',
171 |     'Hrbor', 'Ht', 'Hts', 'Hvn', 'Hway', 'Hwy',
172 |     'Inlet', 'Inlt', 'Is', 'Island', 'Islands', 'Isle',
173 |     'Isles', 'Islnd', 'Islnds', 'Iss', 'Jct', 'Jction',
174 |     'Jctn', 'Jctns', 'Jcts', 'Junction', 'Junctions', 'Junctn',
175 |     'Juncton', 'Key', 'Keys', 'Knl', 'Knls', 'Knol',
176 |     'Knoll', 'Knolls', 'Ky', 'Kys', 'Lake', 'Lakes',
177 |     'Land', 'Landing', 'Lane', 'Lck', 'Lcks', 'Ldg',
178 |     'Ldge', 'Lf', 'Lgt', 'Lgts', 'Light', 'Lights',
179 |     'Lk', 'Lks', 'Ln', 'Lndg', 'Lndng', 'Loaf',
180 |     'Lock', 'Locks', 'Lodg', 'Lodge', 'Loop', 'Loops',
181 |     'Lp', 'Mall', 'Manor', 'Manors', 'Mdw', 'Mdws',
182 |     'Meadow', 'Meadows', 'Medows', 'Mews', 'Mill', 'Mills',
183 |     'Mission', 'Missn', 'Ml', 'Mls', 'Mnr', 'Mnrs',
184 |     'Mnt', 'Mntain', 'Mntn', 'Mntns', 'Motorway', 'Mount',
185 |     'Mountain', 'Mountains', 'Mountin', 'Msn', 'Mssn', 'Mt',
186 |     'Mtin', 'Mtn', 'Mtns', 'Mtwy', 'Nck', 'Neck',
187 |     'Opas', 'Orch', 'Orchard', 'Orchrd', 'Oval', 'Overpass',
188 |     'Ovl', 'Park', 'Parks', 'Parkway', 'Parkways', 'Parkwy',
189 |     'Pass', 'Passage', 'Path', 'Paths', 'Pike', 'Pikes',
190 |     'Pine', 'Pines', 'Pkway', 'Pkwy', 'Pkwys', 'Pky',
191 |     'Pl', 'Place', 'Plain', 'Plains', 'Plaza', 'Pln',
192 |     'Plns', 'Plz', 'Plza', 'Pne', 'Pnes', 'Point',
193 |     'Points', 'Port', 'Ports', 'Pr', 'Prairie', 'Prk',
194 |     'Prr', 'Prt', 'Prts', 'Psge', 'Pt', 'Pts',
195 |     'Rad', 'Radial', 'Radiel', 'Radl', 'Ramp', 'Ranch',
196 |     'Ranches', 'Rapid', 'Rapids', 'Rd', 'Rdg', 'Rdge',
197 |     'Rdgs', 'Rds', 'Rest', 'Ridge', 'Ridges', 'Riv',
198 |     'River', 'Rivr', 'Rnch', 'Rnchs', 'Road', 'Roads',
199 |     'Route', 'Row', 'Rpd', 'Rpds', 'Rst', 'Rte',
200 |     'Rue', 'Run', 'Rvr', 'Shl', 'Shls', 'Shoal',
201 |     'Shoals', 'Shoar', 'Shoars', 'Shore', 'Shores', 'Shr',
202 |     'Shrs', 'Skwy', 'Skyway', 'Smt', 'Spg', 'Spgs',
203 |     'Spng', 'Spngs', 'Spring', 'Springs', 'Sprng', 'Sprngs',
204 |     'Spur', 'Spurs', 'Sq', 'Sqr', 'Sqre', 'Sqrs',
205 |     'Sqs', 'Squ', 'Square', 'Squares', 'St', 'Sta',
206 |     'Station', 'Statn', 'Stn', 'Str', 'Stra', 'Strav',
207 |     'Straven', 'Stravenue', 'Stravn', 'Stream', 'Street', 'Streets',
208 |     'Streme', 'Strm', 'Strt', 'Strvn', 'Strvnue', 'Sts',
209 |     'Sumit', 'Sumitt', 'Summit', 'Ter', 'Terr', 'Terrace',
210 |     'Throughway', 'Tpke', 'Trace', 'Traces', 'Track', 'Tracks',
211 |     'Trafficway', 'Trail', 'Trailer', 'Trails', 'Trak', 'Trce',
212 |     'Trfy', 'Trk', 'Trks', 'Trl', 'Trlr', 'Trlrs',
213 |     'Trls', 'Trnpk', 'Trwy', 'Tunel', 'Tunl', 'Tunls',
214 |     'Tunnel', 'Tunnels', 'Tunnl', 'Turnpike', 'Turnpk', 'Un',
215 |     'Underpass', 'Union', 'Unions', 'Uns', 'Upas', 'Valley',
216 |     'Valleys', 'Vally', 'Vdct', 'Via', 'Viadct', 'Viaduct',
217 |     'View', 'Views', 'Vill', 'Villag', 'Village', 'Villages',
218 |     'Ville', 'Villg', 'Villiage', 'Vis', 'Vist', 'Vista',
219 |     'Vl', 'Vlg', 'Vlgs', 'Vlly', 'Vly', 'Vlys',
220 |     'Vst', 'Vsta', 'Vw', 'Vws', 'Walk', 'Walks',
221 |     'Wall', 'Way', 'Ways', 'Well', 'Wells', 'Wl',
222 |     'Wls', 'Wy', 'Xing', 'Xrd', 'Xrds',
223 | ]
224 | 
225 | 
226 | def street_type_list_to_regex(street_type_list):
227 |     """Converts a list of street types into a regex"""
228 |     street_types = '|'.join(set(street_type_list)).lower()
229 |     for letter in string.ascii_lowercase:
230 |         street_types = street_types.replace(letter, '[{upper}{lower}]'.format(upper=letter.upper(), lower=letter))
231 | 
232 |     # Use \b to check that there are word boundaries before and after the street type
233 |     # Optionally match zero to two of " ", ",", or "." after the street name
234 |     street_types = street_types.replace('|', r'\b{div}|\b')
235 |     street_types = r'\b' + street_types + r'\b{div}'
236 |     return street_types.format(
237 |         div=r'[\.\ ,]{0,2}',
238 |     )
239 | 
240 | 
241 | # Regexp for matching street type
242 | street_type = r"""
243 |             (?:
244 |                 (?P<street_type>
245 |                     {street_types}
246 |                 )
247 |                 (?P<route_id>
248 |                     [\(\ \,]{route_symbols}
249 |                     [Rr][Oo][Uu][Tt][Ee]\ [A-Za-z0-9]+[\)\ \,]{route_symbols}
250 |                 )?
251 |             )
252 | """.format(
253 |     route_symbols='{0,3}',
254 |     street_types=street_type_list_to_regex(street_type_list),
255 | )
256 | 
257 | floor = r"""
258 |             (?P<floor>
259 |                 (?:
260 |                 \d+[A-Za-z]{0,2}\.?\ [Ff][Ll][Oo][Oo][Rr]\ 
261 |                 )
262 |                 |
263 |                 (?:
264 |                     [Ff][Ll][Oo][Oo][Rr]\ \d+[A-Za-z]{0,2}\ 
265 |                 )
266 |             )
267 |         """
268 | 
269 | building = r"""
270 |             (?P<building_id>
271 |                 (?:
272 |                     (?:[Bb][Uu][Ii][Ll][Dd][Ii][Nn][Gg])
273 |                     |
274 |                     (?:[Bb][Ll][Dd][Gg])
275 |                 )
276 |                 \ 
277 |                 (?:
278 |                     (?:
279 |                         [Aa][Nn][Dd]\ 
280 |                         |
281 |                         {thousand}
282 |                         |
283 |                         {hundred}
284 |                         |
285 |                         {zero_to_nine}
286 |                         |
287 |                         {ten_to_ninety}
288 |                     ){{1,5}}
289 |                     |
290 |                     \d{{0,4}}[A-Za-z]?
291 |                 )
292 |                 \ ?
293 |             )
294 |             """.format(thousand=thousand,
295 |                        hundred=hundred,
296 |                        zero_to_nine=zero_to_nine,
297 |                        ten_to_ninety=ten_to_ninety,
298 |                        )
299 | 
300 | occupancy = r"""
301 |             (?P<occupancy>
302 |                 (?:
303 |                     (?:
304 |                         (?:
305 |                             # Suite
306 |                             [Ss][Uu][Ii][Tt][Ee]\ |[Ss][Tt][Ee]\.?\ 
307 |                             |
308 |                             # Apartment
309 |                             [Aa][Pp][Tt]\.?\ |[Aa][Pp][Aa][Rr][Tt][Mm][Ee][Nn][Tt]\ 
310 |                             |
311 |                             # Room
312 |                             [Rr][Oo][Oo][Mm]\ |[Rr][Mm]\.?\ 
313 |                         )
314 |                         (?:
315 |                             [A-Za-z\#\&\-\d]{1,7}
316 |                         )?
317 |                     )
318 |                     |
319 |                     (?:
320 |                         \#[0-9]{,3}[A-Za-z]{1}
321 |                     )
322 |                 )\ ?
323 |             )
324 |             """
325 | 
326 | po_box = r"""
327 |             (?:
328 |                 [Pp]\.?\ ?[Oo]\.?\ [Bb][Oo][Xx]\ \d+
329 |             )
330 |         """
331 | 
332 | full_street = r"""
333 |     (?:
334 |         (?P<full_street>
335 |             {street_number}
336 |             {street_name}?\,?\ ?
337 |             (?:[\ \,]{street_type})\,?\ ?
338 |             {post_direction}?\,?\ ?
339 |             {floor}?\,?\ ?
340 |             {building}?\,?\ ?
341 |             {occupancy}?\,?\ ?
342 |             {po_box}?
343 |         )
344 |     )""".format(street_number=street_number,
345 |                 street_name=street_name,
346 |                 street_type=street_type,
347 |                 post_direction=post_direction,
348 |                 floor=floor,
349 |                 building=building,
350 |                 occupancy=occupancy,
351 |                 po_box=po_box,
352 |                 )
353 | 
354 | # region1 is actually a "state"
355 | region1 = r"""
356 |         (?P<region1>
357 |             (?:
358 |                 # states abbreviations
359 |                 AL|AK|AZ|AR|CA|CO|CT|DE|DC|FL|GA|HI|ID|IL|IN|IA|KS|KY|LA|ME|MD|MA|MI|
360 |                 MN|MS|MO|MT|NE|NV|NH|NJ|NM|NY|NC|ND|OH|OK|OR|PA|RI|SC|SD|TN|TX|UT|VT|
361 |                 VA|WA|WV|WI|WY|
362 |                 # unincorporated & commonwealth territories
363 |                 AS|GU|MP|PR|VI
364 |             )
365 |             |
366 |             (?:
367 |                 # states full
368 |                 [Aa][Ll][Aa][Bb][Aa][Mm][Aa]|
369 |                 [Aa][Ll][Aa][Ss][Kk][Aa]|
370 |                 [Aa][Rr][Ii][Zz][Oo][Nn][Aa]|
371 |                 [Aa][Rr][Kk][Aa][Nn][Ss][Aa][Ss]|
372 |                 [Cc][Aa][Ll][Ii][Ff][Oo][Rr][Nn][Ii][Aa]|
373 |                 [Cc][Oo][Ll][Oo][Rr][Aa][Dd][Oo]|
374 |                 [Cc][Oo][Nn][Nn][Ee][Cc][Tt][Ii][Cc][Uu][Tt]|
375 |                 [Dd][Ee][Ll][Aa][Ww][Aa][Rr][Ee]|
376 |                 [Dd][Ii][Ss][Tt][Rr][Ii][Cc][Tt]\ [Oo][Ff]\ 
377 |                 [Cc][Oo][Ll][Uu][Mm][Bb][Ii][Aa]|
378 |                 [Ff][Ll][Oo][Rr][Ii][Dd][Aa]|
379 |                 [Gg][Ee][Oo][Rr][Gg][Ii][Aa]|
380 |                 [Hh][Aa][Ww][Aa][Ii][Ii]|
381 |                 [Ii][Dd][Aa][Hh][Oo]|
382 |                 [Ii][Ll][Ll][Ii][Nn][Oo][Ii][Ss]|
383 |                 [Ii][Nn][Dd][Ii][Aa][Nn][Aa]|
384 |                 [Ii][Oo][Ww][Aa]|
385 |                 [Kk][Aa][Nn][Ss][Aa][Ss]|
386 |                 [Kk][Ee][Nn][Tt][Uu][Cc][Kk][Yy]|
387 |                 [Ll][Oo][Uu][Ii][Ss][Ii][Aa][Nn][Aa]|
388 |                 [Mm][Aa][Ii][Nn][Ee]|
389 |                 [Mm][Aa][Rr][Yy][Ll][Aa][Nn][Dd]|
390 |                 [Mm][Aa][Ss][Ss][Aa][Cc][Hh][Uu][Ss][Ee][Tt][Tt][Ss]|
391 |                 [Mm][Ii][Cc][Hh][Ii][Gg][Aa][Nn]|
392 |                 [Mm][Ii][Nn][Nn][Ee][Ss][Oo][Tt][Aa]|
393 |                 [Mm][Ii][Ss][Ss][Ii][Ss][Ss][Ii][Pp][Pp][Ii]|
394 |                 [Mm][Ii][Ss][Ss][Oo][Uu][Rr][Ii]|
395 |                 [Mm][Oo][Nn][Tt][Aa][Nn][Aa]|
396 |                 [Nn][Ee][Bb][Rr][Aa][Ss][Kk][Aa]|
397 |                 [Nn][Ee][Vv][Aa][Dd][Aa]|
398 |                 [Nn][Ee][Ww]\ [Hh][Aa][Mm][Pp][Ss][Hh][Ii][Rr][Ee]|
399 |                 [Nn][Ee][Ww]\ [Jj][Ee][Rr][Ss][Ee][Yy]|
400 |                 [Nn][Ee][Ww]\ [Mm][Ee][Xx][Ii][Cc][Oo]|
401 |                 [Nn][Ee][Ww]\ [Yy][Oo][Rr][Kk]|
402 |                 [Nn][Oo][Rr][Tt][Hh]\ [Cc][Aa][Rr][Oo][Ll][Ii][Nn][Aa]|
403 |                 [Nn][Oo][Rr][Tt][Hh]\ [Dd][Aa][Kk][Oo][Tt][Aa]|
404 |                 [Oo][Hh][Ii][Oo]|
405 |                 [Oo][Kk][Ll][Aa][Hh][Oo][Mm][Aa]|
406 |                 [Oo][Rr][Ee][Gg][Oo][Nn]|
407 |                 [Pp][Ee][Nn][Nn][Ss][Yy][Ll][Vv][Aa][Nn][Ii][Aa]|
408 |                 [Rr][Hh][Oo][Dd][Ee]\ [Ii][Ss][Ll][Aa][Nn][Dd]|
409 |                 [Ss][Oo][Uu][Tt][Hh]\ [Cc][Aa][Rr][Oo][Ll][Ii][Nn][Aa]|
410 |                 [Ss][Oo][Uu][Tt][Hh]\ [Dd][Aa][Kk][Oo][Tt][Aa]|
411 |                 [Tt][Ee][Nn][Nn][Ee][Ss][Ss][Ee][Ee]|
412 |                 [Tt][Ee][Xx][Aa][Ss]|
413 |                 [Uu][Tt][Aa][Hh]|
414 |                 [Vv][Ee][Rr][Mm][Oo][Nn][Tt]|
415 |                 [Vv][Ii][Rr][Gg][Ii][Nn][Ii][Aa]|
416 |                 [Ww][Aa][Ss][Hh][Ii][Nn][Gg][Tt][Oo][Nn]|
417 |                 [Ww][Ee][Ss][Tt]\ [Vv][Ii][Rr][Gg][Ii][Nn][Ii][Aa]|
418 |                 [Ww][Ii][Ss][Cc][Oo][Nn][Ss][Ii][Nn]|
419 |                 [Ww][Yy][Oo][Mm][Ii][Nn][Gg]|
420 |                 # unincorporated & commonwealth territories
421 |                 [Aa][Mm][Ee][Rr][Ii][Cc][Aa][Nn]\ [Ss][Aa][Mm][Oo][Aa]
422 |                 |[Gg][Uu][Aa][Mm]|
423 |                 [Nn][Oo][Rr][Tt][Hh][Ee][Rr][Nn]\ [Mm][Aa][Rr][Ii][Aa][Nn][Aa]\ 
424 |                 [Ii][Ss][Ll][Aa][Nn][Dd][Ss]|
425 |                 [Pp][Uu][Ee][Rr][Tt][Oo]\ [Rr][Ii][Cc][Oo]|
426 |                 [Vv][Ii][Rr][Gg][Ii][Nn]\ [Ii][Ss][Ll][Aa][Nn][Dd][Ss]
427 |             )
428 |         )
429 |         """
430 | 
431 | # TODO: doesn't catch cities containing French characters
432 | city = r"""
433 |         (?P<city>
434 |             [A-Za-z]{1}[a-zA-Z\ \-\'\.]{2,20}
435 |         )
436 |         """
437 | 
438 | postal_code = r"""
439 |             (?P<postal_code>
440 |                 (?:\d{5}(?:\-\d{4})?)
441 |             )
442 |             """
443 | 
444 | country = r"""
445 |             (?:
446 |                 [Uu]\.?[Ss]\.?[Aa]\.?|
447 |                 [Uu][Nn][Ii][Tt][Ee][Dd]\ [Ss][Tt][Aa][Tt][Ee][Ss](?:\ [Oo][Ff]\ [Aa][Mm][Ee][Rr][Ii][Cc][Aa])?
448 |             )
449 |             """
450 | 
451 | full_address = r"""
452 |                 (?P<full_address>
453 |                     {full_street} {div}
454 |                     {city} {div}
455 |                     {region1} {div}
456 |                     (?:
457 |                         (?:{postal_code}?(\ ?,?{country})?)
458 |                     )
459 |                 )
460 |                 """.format(
461 |     full_street=full_street,
462 |     div=r'[\, ]{,2}',
463 |     city=city,
464 |     region1=region1,
465 |     country=country,
466 |     postal_code=postal_code,
467 | )
468 | 


--------------------------------------------------------------------------------
/pyap/utils.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | """
 4 |     pyap.utils
 5 |     ~~~~~~~~~~~~~~~~
 6 | 
 7 |     This module provides some utility functions.
 8 | 
 9 |     :copyright: (c) 2015 by Vladimir Goncharov.
10 |     :license: MIT, see LICENSE for more details.
11 | """
12 | 
13 | import re
14 | from .packages import six
15 | 
16 | DEFAULT_FLAGS = re.VERBOSE | re.UNICODE
17 | 
18 | if six.PY2:
19 | 
20 |     def match(regex, string, flags=DEFAULT_FLAGS):
21 |         '''Utility function for re.match '''
22 |         if isinstance(string, str):
23 |             string = unicode(string, 'utf-8')
24 |         return re.match(
25 |             unicode(regex, 'utf-8'),
26 |             string,
27 |             flags=flags
28 |         )
29 | 
30 |     def findall(regex, string, flags=DEFAULT_FLAGS):
31 |         '''Utility function for re.findall '''
32 |         if isinstance(string, str):
33 |             string = unicode(string, 'utf-8')
34 |         return re.findall(
35 |             unicode(regex, 'utf-8'),
36 |             string,
37 |             flags=flags
38 |         )
39 | 
40 |     def finditer(regex, string, flags=DEFAULT_FLAGS):
41 |         '''Utility function for re.finditer '''
42 |         if isinstance(string, str):
43 |             string = unicode(string, 'utf-8')
44 |         return list(re.finditer(
45 |             unicode(regex, 'utf-8'),
46 |             string,
47 |             flags=flags
48 |         ))
49 | 
50 |     def unicode_str(string):
51 |         '''Return Unicode string'''
52 |         return unicode(string, 'utf-8')
53 | 
54 | elif six.PY3:
55 | 
56 |     def match(regex, string, flags=DEFAULT_FLAGS):
57 |         '''Utility function for re.match '''
58 |         return re.match(regex, string, flags=flags)
59 | 
60 |     def findall(regex, string, flags=DEFAULT_FLAGS):
61 |         '''Utility function for re.findall '''
62 |         return re.findall(regex, string, flags=flags)
63 | 
64 |     def finditer(regex, string, flags=DEFAULT_FLAGS):
65 |         '''Utility function for re.finditer '''
66 |         return list(re.finditer(regex, string, flags=flags))
67 | 
68 |     def unicode_str(string):
69 |         '''Return Unicode string'''
70 |         return string
71 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "pyap"
 3 | version = "0.3.1"
 4 | description = "Pyap is an MIT Licensed text processing library, written in Python, for detecting and parsing addresses. Currently it supports USA, Canadian and British addresses."
 5 | authors = ["Vladimir Goncharov <vladimarius@gmail.com>"]
 6 | documentation = "https://github.com/vladimarius/pyap"
 7 | license = "MIT"
 8 | readme="README.rst"
 9 | 
10 | [tool.poetry.dependencies]
11 | python = ">=2.7"
12 | 
13 | [tool.poetry.dev-dependencies]
14 | 
15 | [build-system]
16 | requires = ["poetry>=0.12"]
17 | build-backend = "poetry.masonry.api"
18 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | 
 4 | def readme():
 5 |     with open('README.rst') as f:
 6 |         return f.read()
 7 | 
 8 | 
 9 | setup(name='pyap',
10 |       version='0.2.0',
11 |       description='Library for detecting and parsing addresses.'
12 |       ' Currently supports US, Canadian and British addresses.',
13 |       long_description=readme(),
14 |       long_description_content_type="text/x-rst",
15 |       keywords='address detection, address parsing',
16 |       url='http://github.com/vladimarius/pyap',
17 |       author='Vladimir Goncharov',
18 |       author_email='vladimarius@gmail.com',
19 |       license='MIT',
20 |       packages=['pyap', 'pyap.packages', 'pyap.source_CA', 'pyap.source_US', 'pyap.source_GB'],
21 |       download_url='https://github.com/vladimarius/pyap',
22 |       zip_safe=False,
23 |       classifiers=[
24 |           'Intended Audience :: Developers',
25 |           'Development Status :: 4 - Beta',
26 |           'License :: OSI Approved :: MIT License',
27 |           'Natural Language :: English',
28 |           'Programming Language :: Python',
29 |           'Programming Language :: Python :: 2.7',
30 |           'Programming Language :: Python :: 3',
31 |           'Programming Language :: Python :: 3.1',
32 |           'Programming Language :: Python :: 3.2',
33 |           'Programming Language :: Python :: 3.3',
34 |           'Programming Language :: Python :: 3.4',
35 |           'Topic :: Software Development :: Libraries',
36 |           'Topic :: Scientific/Engineering :: Information Analysis',
37 |           'Topic :: Utilities'
38 |       ],
39 |       )
40 | 


--------------------------------------------------------------------------------
/test_parser.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | """ Test for parser classes """
 4 | 
 5 | import re
 6 | import pytest
 7 | import pyap as ap
 8 | from pyap import parser
 9 | from pyap import address
10 | from pyap import exceptions as e
11 | 
12 | 
13 | def test_api_parse():
14 |     test_address = "xxx 225 E. John Carpenter Freeway, " +\
15 |         "Suite 1500 Irving, Texas 75062 xxx"
16 |     addresses = ap.parse(test_address, country='US')
17 |     assert str(addresses[0].full_address) == \
18 |         "225 E. John Carpenter Freeway, Suite 1500 Irving, Texas 75062"
19 | 
20 | 
21 | def test_address_class_init():
22 |     addr = address.Address(
23 |         state='USA ',
24 |         city='CityVille, ',
25 |         street=' Street 1b ',
26 |         full_address='Street 1b CityVille USA')
27 |     assert addr.state == 'USA'
28 | 
29 |     assert addr.city == 'CityVille'
30 | 
31 |     assert addr.street == 'Street 1b'
32 | 
33 |     assert addr.as_dict() == \
34 |         {'state': 'USA',
35 |          'city': 'CityVille',
36 |          'street': 'Street 1b',
37 |          'full_address': 'Street 1b CityVille USA'}
38 | 
39 |     assert str(addr) == 'Street 1b CityVille USA'
40 | 
41 | 
42 | def test_no_country_selected_exception():
43 |     with pytest.raises(e.NoCountrySelected):
44 |         ap = parser.AddressParser()
45 | 
46 | 
47 | def test_country_detection_missing():
48 |     with pytest.raises(e.CountryDetectionMissing):
49 |         ap = parser.AddressParser(country='TheMoon')
50 | 
51 | 
52 | def test_normalize_string():
53 |     ap = parser.AddressParser(country='US')
54 |     raw_string = """\n The  quick      \t, brown fox      jumps over the lazy dog,
55 |     ‐ ‑ ‒ – — ―
56 |     """
57 |     clean_string = u', The quick, brown fox jumps over the lazy dog, - - - - - -, '
58 |     assert ap._normalize_string(raw_string) == clean_string
59 | 
60 | 
61 | def test_combine_results():
62 |     ap = parser.AddressParser(country='US')
63 |     raw_dict = {
64 |         'test_one': None,
65 |         'test_one_a': 1,
66 |         'test_two': None,
67 |         'test_two_b': 2}
68 |     assert ap._combine_results(raw_dict) == {'test_one': 1, 'test_two': 2}
69 | 
70 | 
71 | def test_parse_address():
72 |     ap = parser.AddressParser(country='US')
73 |     result = ap.parse('No address here')
74 |     assert not result
75 | 
76 |     ap = parser.AddressParser(country='US')
77 |     result = ap._parse_address('No address here')
78 |     assert not result
79 | 
80 |     ap = parser.AddressParser(country='US')
81 |     test_address = "xxx 225 E. John Carpenter Freeway, " +\
82 |         "Suite 1500 Irving, Texas 75062 xxx"
83 | 
84 |     addresses = ap.parse(test_address)
85 |     assert addresses[0].full_address == \
86 |         "225 E. John Carpenter Freeway, Suite 1500 Irving, Texas 75062"
87 | 


--------------------------------------------------------------------------------
/test_parser_ca.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | """ Tests for CANADA address parser """
  4 | 
  5 | import re
  6 | import pytest
  7 | import pyap.source_CA.data as data_ca
  8 | from pyap import utils
  9 | 
 10 | 
 11 | @pytest.mark.parametrize("input,expected", [
 12 |     # positive assertions
 13 |     ("ZERO ", True),
 14 |     ("one ", True),
 15 |     ("two ", True),
 16 |     ("Three ", True),
 17 |     ("FoUr ", True),
 18 |     ("FivE ", True),
 19 |     ("six ", True),
 20 |     ("SEvEn ", True),
 21 |     ("Eight ", True),
 22 |     ("Nine ", True),
 23 |     # negative assertions
 24 |     ("Nidnes", False),
 25 |     ("One", False),
 26 |     ("two", False),
 27 |     ("onetwothree ", False),
 28 | ])
 29 | def test_zero_to_nine(input, expected):
 30 |     ''' test string match for zero_to_nine '''
 31 |     is_found = utils.match(
 32 |         data_ca.zero_to_nine,
 33 |         input,
 34 |         re.VERBOSE) is not None
 35 |     assert is_found == expected
 36 | 
 37 | 
 38 | @pytest.mark.parametrize("input,expected", [
 39 |     # positive assertions
 40 |     ("tEN ", True),
 41 |     ("TWENTY ", True),
 42 |     ("tHirtY ", True),
 43 |     ("FOUrty ", True),
 44 |     ("fifty ", True),
 45 |     ("sixty ", True),
 46 |     ("seventy ", True),
 47 |     ("eighty ", True),
 48 |     ("NINety ", True),
 49 |     # negative assertions
 50 |     ("ten", False),
 51 |     ("twenTY", False),
 52 |     ("sixtysixsty ", False),
 53 |     ("one twenty ", False),
 54 | ])
 55 | def test_ten_to_ninety(input, expected):
 56 |     ''' test string match for ten_to_ninety '''
 57 |     is_found = utils.match(data_ca.ten_to_ninety, input, re.VERBOSE)\
 58 |         is not None
 59 |     assert is_found == expected
 60 | 
 61 | 
 62 | @pytest.mark.parametrize("input,expected", [
 63 |     # positive assertions
 64 |     ("Hundred ", True),
 65 |     ("HuNdred ", True),
 66 |     # negative assertions
 67 |     ("HuNDdred", False),
 68 |     ("HuNDdred hundred ", False),
 69 | ])
 70 | def test_hundred(input, expected):
 71 |     ''' tests string match for a hundred '''
 72 |     is_found = utils.match(data_ca.hundred, input, re.VERBOSE) is not None
 73 |     assert is_found == expected
 74 | 
 75 | 
 76 | @pytest.mark.parametrize("input,expected", [
 77 |     # positive assertions
 78 |     ("Thousand ", True),
 79 |     ("thOUSAnd ", True),
 80 |     # negative assertions
 81 |     ("thousand", False),
 82 |     ("THoussand ", False),
 83 |     ("THoussand", False),
 84 |     ("THOUssand THoussand ", False),
 85 | ])
 86 | def test_thousand(input, expected):
 87 |     ''' tests string match for a thousand '''
 88 |     is_found = utils.match(data_ca.thousand, input, re.VERBOSE) is not None
 89 |     assert is_found == expected
 90 | 
 91 | 
 92 | @pytest.mark.parametrize("input,expected", [
 93 |     # positive assertions (words)
 94 |     ("One Thousand And Fifty Nine ", True),
 95 |     ("Two hundred and fifty ", True),
 96 |     ("Three hundred four ", True),
 97 |     ("Thirty seven ", True),
 98 |     ("FIFTY One ", True),
 99 |     ("Three hundred Ten ", True),
100 |     # positive assertions (numbers)
101 |     ("1 ", True),
102 |     ("15 ", True),
103 |     ("44 ", True),
104 |     ("256 ", True),
105 |     ("256 ", True),
106 |     ("1256 ", True),
107 |     ("32457 ", True),
108 |     # positive assertions (street intersections)
109 |     ("718 - 8th ", True),
110 | ])
111 | def test_street_number_positive(input, expected):
112 |     ''' tests positive exact string match for a street number '''
113 |     match = utils.match(data_ca.street_number, input, re.VERBOSE)
114 |     is_found = match is not None
115 |     # check for exact match
116 |     assert (is_found == expected) and\
117 |            (match.group(0) == utils.unicode_str(input))
118 | 
119 | 
120 | @pytest.mark.parametrize("input,expected", [
121 |     # negative assertions (words)
122 |     ("ONE THousszz22and FIFTY and four onde", False),
123 |     ("ONE one oNe and onE Three", False),
124 |     # negative assertions (numbers)
125 |     ("536233", False),
126 |     ("111111", False),
127 |     ("1111ss11", False),
128 |     ("123 456", False),
129 | ])
130 | def test_street_number_negative(input, expected):
131 |     ''' tests negative string match for a street number '''
132 |     match = utils.match(
133 |         data_ca.street_number,
134 |         utils.unicode_str(input), re.VERBOSE)
135 |     is_found = match is not None
136 |     """we check that:
137 |        - input should not to match our regex
138 |        - our match should be partial if regex matches some part of string
139 |     """
140 |     assert (is_found == expected) or \
141 |            (match.group(0) != utils.unicode_str(input))
142 | 
143 | 
144 | @pytest.mark.parametrize("input,expected", [
145 |     # positive assertions
146 |     ("N. ", True),
147 |     ("N ", True),
148 |     ("S ", True),
149 |     ("West ", True),
150 |     ("eASt ", True),
151 |     ("NW ", True),
152 |     ("SE ", True),
153 |     # negative assertions
154 |     ("NW.", False),
155 |     ("NW. ", False),
156 |     ("NS ", False),
157 |     ("EW ", False),
158 | ])
159 | def test_post_direction(input, expected):
160 |     ''' tests string match for a post_direction '''
161 |     is_found = utils.match(
162 |         data_ca.post_direction,
163 |         utils.unicode_str(input), re.VERBOSE)\
164 |         is not None
165 |     assert is_found == expected
166 | 
167 | 
168 | @pytest.mark.parametrize("input,expected", [
169 |     # positive assertions
170 |     ("Street ", True),
171 |     ("St. ", True),
172 |     ("St.", True),
173 |     ("Blvd.", True),
174 |     ("Blvd. ", True),
175 |     ("RD", True),
176 |     ("Cir", True),
177 |     ("Highway ", True),
178 |     ("Hwy ", True),
179 |     ("Ctr", True),
180 |     ("Sq.", True),
181 |     ("Street route 5 ", True),
182 |     ("blvd", True),
183 |     # negative assertions
184 |     # TODO
185 | ])
186 | def test_street_type(input, expected):
187 |     ''' tests string match for a street id '''
188 |     is_found = utils.match(
189 |         data_ca.street_type,
190 |         utils.unicode_str(input), re.VERBOSE)\
191 |         is not None
192 |     assert is_found == expected
193 | 
194 | 
195 | @pytest.mark.parametrize("input,expected", [
196 |     # positive assertions
197 |     ("floor 3 ", True),
198 |     ("floor 11 ", True),
199 |     ("floor 15 ", True),
200 |     ("1st floor ", True),
201 |     ("2nd floor ", True),
202 |     ("15th floor ", True),
203 |     ("16th. floor ", True),
204 |     # negative assertions
205 |     ("16th.floor ", False),
206 |     ("1stfloor ", False),
207 | 
208 | ])
209 | def test_floor(input, expected):
210 |     ''' tests string match for a floor '''
211 |     is_found = utils.match(
212 |         data_ca.floor,
213 |         utils.unicode_str(input), re.VERBOSE)\
214 |         is not None
215 |     assert is_found == expected
216 | 
217 | 
218 | @pytest.mark.parametrize("input,expected", [
219 |     # positive assertions
220 |     ("bldg m ", True),
221 |     ("Building F ", True),
222 |     ("bldg 2 ", True),
223 |     ("building 3 ", True),
224 |     ("building 100 ", True),
225 |     ("Building ", True),
226 |     ("building one ", True),
227 |     ("Building three ", True),
228 |     # negative assertions
229 |     ("bldg", False),
230 |     ("bldgm", False),
231 |     ("bldg100 ", False),
232 | 
233 | ])
234 | def test_building(input, expected):
235 |     ''' tests string match for a building '''
236 |     is_found = utils.match(
237 |         data_ca.building,
238 |         utils.unicode_str(input), re.VERBOSE)\
239 |         is not None
240 |     assert is_found == expected
241 | 
242 | 
243 | @pytest.mark.parametrize("input,expected", [
244 |     # positive assertions
245 |     ("suite 900 ", True),
246 |     ("Suite #2 ", True),
247 |     ("suite #218 ", True),
248 |     ("suite J7 ", True),
249 |     ("suite 102A ", True),
250 |     ("suite a&b ", True),
251 |     ("Suite J#200 ", True),
252 |     ("suite 710-327 ", True),
253 |     ("Suite A ", True),
254 |     ("Unit B ", True),
255 |     ("ste A ", True),
256 |     ("Ste 101 ", True),
257 |     ("ste 502b ", True),
258 |     ("ste 14-15 ", True),
259 |     ("ste E ", True),
260 |     ("ste 9E ", True),
261 |     ("Suite 1800 ", True),
262 |     ("Apt 1B ", True),
263 |     ("Rm. 52 ", True),
264 |     ("#2b ", True),
265 | ])
266 | def test_occupancy_positive(input, expected):
267 |     ''' tests exact string match for a place id '''
268 |     match = utils.match(
269 |         data_ca.occupancy,
270 |         utils.unicode_str(input), re.VERBOSE)
271 |     is_found = match is not None
272 |     assert (is_found == expected) and\
273 |            (match.group(0) == utils.unicode_str(input))
274 | 
275 | 
276 | @pytest.mark.parametrize("input,expected", [
277 |     # positive assertions
278 |     ("suite900 ", False),
279 |     ("Suite#2", False),
280 |     ("suite218 ", False),
281 | ])
282 | def test_occupancy_negative(input, expected):
283 |     ''' tests string match for a place id '''
284 |     match = utils.match(
285 |         data_ca.occupancy,
286 |         utils.unicode_str(input), re.VERBOSE)
287 |     is_found = match is not None
288 |     assert (is_found == expected)
289 | 
290 | 
291 | @pytest.mark.parametrize("input,expected", [
292 |     # positive assertions
293 |     ("po box 108", True),
294 |     ("Po Box 53485", True),
295 |     ("P.O. box 119", True),
296 |     ("PO box 1070", True),
297 | ])
298 | def test_po_box_positive(input, expected):
299 |     ''' tests exact string match for a po box '''
300 |     match = utils.match(
301 |         data_ca.po_box,
302 |         utils.unicode_str(input), re.VERBOSE)
303 |     is_found = match is not None
304 |     assert (is_found == expected) and\
305 |            (match.group(0) == utils.unicode_str(input))
306 | 
307 | 
308 | @pytest.mark.parametrize("input,expected", [
309 |     # positive assertions
310 |     ("po box108 ", False),
311 |     ("PoBox53485 ", False),
312 |     ("P.O. box119", False),
313 |     ("POb ox1070 ", False),
314 | ])
315 | def test_po_box_negative(input, expected):
316 |     ''' tests string match for a po box '''
317 |     match = utils.match(
318 |         data_ca.po_box,
319 |         utils.unicode_str(input), re.VERBOSE)
320 |     is_found = match is not None
321 |     assert (is_found == expected)
322 | 
323 | """
324 | NOTE:
325 | Testing for 'full_street' below is meaningless
326 | since "full_street_b" regexp is based on positive
327 | lookahead assertion and lazy regex before it,
328 | so it will fail most of the tests below, while
329 | still finding correct matches in full_address
330 | """
331 | 
332 | 
333 | @pytest.mark.parametrize("input,expected", [
334 |     # positive assertions
335 |     ("15979 Bow Bottom Trail SE, Calgary, AB T2J 6T5", True),
336 |     ("1730 McPherson Crt. Unit 35, Pickering, ON",
337 |         True),
338 |     ("20 Fleeceline Road, Toronto, Ontario M8V 2K3", True),
339 |     ("7034 Gilliespie Lane, Mississauga, ON L5W1E8", True),
340 |     ("12991 Keele Street King City, Ontario L7B 1G2 CANADA", True),
341 |     ("15979 Bow Bottom Trail SE, Calgary, AB T2J 6T5", True),
342 |     ("718 - 8th Avenue SW Calgary, AB T2P 1H3", True),
343 |     ("67 Lougheed Rd Unit B Barrie, Ontario L4N 8G1", True),
344 |     ("200 - 5050 Kingsway Ave. Burnaby, BC. Canada", True),
345 |     ("202-121 14th Street NW Calgary, AB T2N 1Z6", True),
346 |     ("108 - 1550 Hartley Avenue Coquitlam, B.C. V3K 7A1", True),
347 |     ("1555 Walkley Road Unit 3, Ottawa, ON, K1V 6P4 Canada", True),
348 |     ("238 Jarvis Ave, Winnipeg MB R2W 3A2", True),
349 |     ("104-18663 52 AVE SURREY, BC V3S 8E5", True),
350 |     ("14952 121a Ave NW, Edmonton, AB T5V 1A3, Canada", True),
351 |     ("8623 Granville Street Unit 143 Vancouver, BC V6P 5A2", True),
352 |     ("40 Ferrier St. Markham, ON L3R 2Z5", True),
353 |     ("13009 239b St. Maple Ridge, BC V4R 0A5", True),
354 |     ("40, Rue Ruskin, Ottawa (Ontario) K1Y 4W7 Canada", True),
355 |     ("25 Bethridge Road Toronto, Ontario, Canada", True),
356 |     ("3000 Steeles Avenue East, Suite 700 Markham, Ontario Canada", True),
357 |     ("30 Titan Road Unit 17 Toronto, Ontario M8Z 5Y2", True),
358 |     ("405, rue Sainte Montreal Québec", True),
359 |     ("405, rue Sainte-Catherine Est Montréal (Québec) H2L 2C4", True),
360 |     ("5800, rue Saint-Denis, bureau 1105 Montréal (Québec) H2S 3L5 Canada",
361 |         True),
362 |     ("3744, rue Jean-Brillant Bureau 490 Montréal (Québec)", True),
363 |     ("2275, rue Holt Montréal (Québec) H2G 3H1", True),
364 |     ("475, boulevard De Maisonneuve Est Montréal (Québec) H2L 5C4", True),
365 |     ("133 Ilsley Avenue, Unit A Dartmouth (Nova Scotia) B3B 1S9", True),
366 |     ("5205 Satellite Drive Mississauga (Ontario) L4W 5J7", True),
367 |     ("400 Main Street, Bureau 2080 Saint John (New Brunswick) E2K 4N5", True),
368 |     ("16, Place du Commerce Île des Soeurs Verdun (Québec) H3E 2A5", True),
369 |     ("4260, Still Creek Drive Burnaby (Colombie-Britannique) V5C 6C6", True),
370 |     ("201, avenue Portage, Bureau 1750 Winnipeg (Manitoba)", True),
371 |     ("555, boulevard de l'Université Chicoutimi (Québec) Canada", True),
372 |     ("283, boulevard Alexandre-Taché Gatineau (Québec) Canada J9A 1L8", True),
373 |     ("5, rue Saint-Joseph Saint-Jérôme (Québec) J7Z 0B7", True),
374 |     ("58, rue Principale Ripon (Québec) J0V 1V0", True),
375 |     ("33771 George Ferguson Way Abbotsford, BC V2S 2M5", True),
376 |     ("33771 George Ferguson Way Suite 668 Abbotsford, BC V2S 2M5", True),
377 |     ("11, rue Notre-Dame Ouest Montréal (Québec) H2Y 4A7", True),
378 |     ("775, rue Saint-Viateur Québec (Québec) G2L 2Z3", True),
379 |     ("2275, rue Holt Montréal (Québec) H2G 3H1", True),
380 |     ("475, boulevard De Maisonneuve Est Montréal (Québec) H2L 5C4", True),
381 |     ("1050, chemin Sainte-Foy Québec (Québec) G1S 4L8", True),
382 |     ("1401, 18e rue Québec (Québec) G1J 1Z4", True),
383 |     ("1050, chemin Sainte-Foy Québec (Québec) G1S 4L8", True),
384 |     ("101, rue Saint-Jean-Bosco Gatineau (Québec) Canada J8Y 3G5", True),
385 |     ("205, avenue de la Cathédrale Case postale 710 Rimouski (Québec) G5L 7C7",
386 |         True),
387 |     ("3351, boul. des Forges C.P. 500, Trois-Rivières (Québec)"
388 |         " Canada, G9A 5H7", True),
389 |     ("3264 Mainway Burlington L7M 1A7 Ontario, Canada", True),
390 | ])
391 | def test_full_address_positive(input, expected):
392 |     ''' tests exact string match for a full address '''
393 |     match = utils.match(
394 |         data_ca.full_address,
395 |         utils.unicode_str(input), re.VERBOSE | re.U)
396 |     is_found = match is not None
397 |     assert (is_found == expected) and\
398 |            (match.group(0) == utils.unicode_str(input))
399 | 
400 | 
401 | @pytest.mark.parametrize("input,expected", [
402 |     # positive assertions
403 |     ("T2P 1H3", True),
404 |     ("T2P1H3", True),
405 |     ("L1W3E6", True),
406 |     ("L4N 8G1", True),
407 |     ("J8Y 3G5", True),
408 |     ("J9A 1L8", True),
409 | ])
410 | def test_postal_code_positive(input, expected):
411 |     ''' test exact string match for postal code '''
412 |     match = utils.match(
413 |         data_ca.postal_code,
414 |         utils.unicode_str(input), re.VERBOSE)
415 |     is_found = match is not None
416 |     assert is_found == expected and\
417 |         match.group(0) == utils.unicode_str(input)
418 | 
419 | 
420 | @pytest.mark.parametrize("input,expected", [
421 |     # positive assertions
422 |     ("1", False),
423 |     ("23", False),
424 |     ("456", False),
425 |     ("4567", False),
426 |     ("750621", False),
427 |     ("95130-642", False),
428 |     ("95130-64212", False),
429 | ])
430 | def test_postal_code_negative(input, expected):
431 |     ''' test exact string match for postal code '''
432 |     match = utils.match(
433 |         data_ca.postal_code,
434 |         utils.unicode_str(input), re.VERBOSE)
435 |     is_found = match is not None
436 |     assert (is_found == expected) or\
437 |            (match.group(0) != utils.unicode_str(input))
438 | 
439 | 
440 | @pytest.mark.parametrize("input,expected", [
441 |     # positive assertions
442 |     ("Quebec", True),
443 |     ("Nova Scotia", True),
444 |     ("Colombie-Britannique", True),
445 |     ("New Brunswick", True),
446 |     ("Quebec", True),
447 |     ("Québec", True),
448 |     ("Territoires Du Nord-Ouest", True),
449 | ])
450 | def test_region1(input, expected):
451 |     ''' test exact string match for province '''
452 |     match = utils.match(data_ca.region1, input, re.VERBOSE)
453 |     is_found = match is not None
454 |     assert is_found == expected and \
455 |         match.group(0) == utils.unicode_str(input)
456 | 
457 | 
458 | @pytest.mark.parametrize("input,expected", [
459 |     # positive assertions
460 |     ("CANADA", True),
461 |     ("Canada", True),
462 | ])
463 | def test_country(input, expected):
464 |     ''' test exact string match for country '''
465 |     match = utils.match(data_ca.country, input, re.VERBOSE)
466 |     is_found = match is not None
467 |     assert is_found == expected and match.group(0) == input
468 | 


--------------------------------------------------------------------------------
/test_parser_gb.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | """ Test for USA address parser """
  4 | 
  5 | import re
  6 | import pytest
  7 | import itertools
  8 | import pyap
  9 | import pyap.parser
 10 | from pyap import utils
 11 | from pyap.packages import six
 12 | import pyap.source_GB.data as data_gb
 13 | 
 14 | 
 15 | def execute_matching_test(input, expected, pattern):
 16 |     match = utils.match(pattern, input, re.VERBOSE)
 17 |     is_found = match is not None
 18 |     if expected:
 19 |         assert is_found == expected and match.group(0) == input
 20 |     else:
 21 |         assert (is_found == expected) or (match.group(0) != input)
 22 | 
 23 | 
 24 | @pytest.mark.parametrize("input,expected", [
 25 |     # positive assertions
 26 |     ("ZERO ", True),
 27 |     ("one ", True),
 28 |     ("two ", True),
 29 |     ("Three ", True),
 30 |     ("FoUr ", True),
 31 |     ("FivE ", True),
 32 |     ("six ", True),
 33 |     ("SEvEn ", True),
 34 |     ("Eight ", True),
 35 |     ("Nine ", True),
 36 |     # negative assertions
 37 |     ("Nidnes", False),
 38 |     ("One", False),
 39 |     ("two", False),
 40 |     ("onetwothree ", False),
 41 | ])
 42 | def test_zero_to_nine(input, expected):
 43 |     ''' test string match for zero_to_nine '''
 44 |     execute_matching_test(input, expected, data_gb.zero_to_nine)
 45 | 
 46 | 
 47 | @pytest.mark.parametrize("input,expected", [
 48 |     # positive assertions
 49 |     ("tEN ", True),
 50 |     ("TWENTY ", True),
 51 |     ("tHirtY ", True),
 52 |     ("FOUrty ", True),
 53 |     ("fifty ", True),
 54 |     ("sixty ", True),
 55 |     ("seventy ", True),
 56 |     ("eighty ", True),
 57 |     ("NINety ", True),
 58 |     # negative assertions
 59 |     ("ten", False),
 60 |     ("twenTY", False),
 61 |     ("sixtysixsty ", False),
 62 |     ("one twenty ", False),
 63 | ])
 64 | def test_ten_to_ninety(input, expected):
 65 |     ''' test string match for ten_to_ninety '''
 66 |     execute_matching_test(input, expected, data_gb.ten_to_ninety)
 67 | 
 68 | 
 69 | @pytest.mark.parametrize("input,expected", [
 70 |     # positive assertions
 71 |     ("Hundred ", True),
 72 |     ("HuNdred ", True),
 73 |     # negative assertions
 74 |     ("HuNDdred", False),
 75 |     ("HuNDdred hundred ", False),
 76 | ])
 77 | def test_hundred(input, expected):
 78 |     ''' tests string match for a hundred '''
 79 |     execute_matching_test(input, expected, data_gb.hundred)
 80 | 
 81 | 
 82 | @pytest.mark.parametrize("input,expected", [
 83 |     # positive assertions
 84 |     ("Thousand ", True),
 85 |     ("thOUSAnd ", True),
 86 |     # negative assertions
 87 |     ("thousand", False),
 88 |     ("THoussand ", False),
 89 |     ("THoussand", False),
 90 |     ("THOUssand THoussand ", False),
 91 | ])
 92 | def test_thousand(input, expected):
 93 |     ''' tests string match for a thousand '''
 94 |     execute_matching_test(input, expected, data_gb.thousand)
 95 | 
 96 | 
 97 | @pytest.mark.parametrize("input,expected", [
 98 |     # positive assertions (words)
 99 |     ("One Thousand And Fifty Nine ", True),
100 |     ("Two hundred and fifty ", True),
101 |     ("Three hundred four ", True),
102 |     ("Thirty seven ", True),
103 |     ("FIFTY One ", True),
104 |     ("Three hundred Ten ", True),
105 |     # positive assertions (numbers)
106 |     ("1 ", True),
107 |     ("15 ", True),
108 |     ("44 ", True),
109 |     ("256 ", True),
110 |     ("256 ", True),
111 |     ("1256 ", True),
112 |     ("32457 ", True),
113 |     ("32457", True),
114 |     ("9652", True),
115 |     ("Number 32457 ", True),
116 |     ("NO. 32457 ", True),
117 |     ("Num. 256 ", True),
118 |     # negative assertions (words)
119 |     ("ONE THousszz22and FIFTY and four onde", False),
120 |     ("ONE one oNe and onE Three", False),
121 |     # negative assertions (numbers)
122 |     ("536233", False),
123 |     ("111111", False),
124 |     ("1111ss11", False),
125 |     ("123 456", False),
126 | ])
127 | def test_street_number(input, expected):
128 |     ''' tests positive exact string match for a street number '''
129 |     execute_matching_test(input, expected, data_gb.street_number)
130 | 
131 | 
132 | @pytest.mark.parametrize("input,expected", [
133 |     # positive assertions
134 |     ("Northeast Kentucky Industrial ", True),
135 |     ("One ", True),
136 |     ("First ", True),
137 |     ("Ave 123 ", True),
138 |     ("Northeast 5 ", True),
139 |     ("Loiret Boulevard", True),
140 |     # negative assertions
141 |     ("Northeast Kentucky Industrial Maple ", False),
142 |     ("a", False),
143 |     ("1", False),
144 |     ("ab", False),
145 | ])
146 | def test_street_name(input, expected):
147 |     ''' tests positive exact string match for a street name '''
148 |     # The `street_name` pattern refers to the `street_number` pattern and so I've inserted
149 |     # a fake `street_number` pattern that matches to the space between characters `\b\B`
150 |     fake_street_number_pattern = r'(?P<street_number>fake_street_number)'
151 |     execute_matching_test("fake_street_number" + input, expected, fake_street_number_pattern + data_gb.street_name)
152 | 
153 | 
154 | @pytest.mark.parametrize("input,expected", [
155 |     # positive assertions
156 |     ("N. ", True),
157 |     ("N ", True),
158 |     ("S ", True),
159 |     ("West ", True),
160 |     ("eASt ", True),
161 |     ("NW ", True),
162 |     ("SE ", True),
163 |     # negative assertions
164 |     ("NW.", False),
165 |     ("NW. ", False),
166 |     ("NS ", False),
167 |     ("EW ", False),
168 | ])
169 | def test_post_direction(input, expected):
170 |     ''' tests string match for a post_direction '''
171 |     execute_matching_test(input, expected, data_gb.post_direction)
172 | 
173 | 
174 | @pytest.mark.parametrize("input,expected", [
175 |     # positive assertions
176 |     ("Street", True),
177 |     ("St.", True),
178 |     ("St.", True),
179 |     ("Blvd.", True),
180 |     ("Blvd.", True),
181 |     ("LN", True),
182 |     ("RD", True),
183 |     ("Cir", True),
184 |     ("Highway", True),
185 |     ("Hwy", True),
186 |     ("Ct", True),
187 |     ("Sq.", True),
188 |     ("LP.", True),
189 |     ("LP.", True),
190 |     ("Street", True),
191 |     ("blvd", True),
192 |     # negative assertions
193 |     # TODO
194 | 
195 | ])
196 | def test_street_type(input, expected):
197 |     ''' tests string match for a street id '''
198 |     execute_matching_test(input, expected, data_gb.street_type)
199 | 
200 | 
201 | @pytest.mark.parametrize("input,expected", [
202 |     # positive assertions
203 |     ("floor 3 ", True),
204 |     ("floor 11 ", True),
205 |     ("floor 15 ", True),
206 |     ("1st floor ", True),
207 |     ("2nd floor ", True),
208 |     ("15th floor ", True),
209 |     ("16th. floor ", True),
210 |     # negative assertions
211 |     ("16th.floor ", False),
212 |     ("1stfloor ", False),
213 | 
214 | ])
215 | def test_floor(input, expected):
216 |     ''' tests string match for a floor '''
217 |     execute_matching_test(input, expected, data_gb.floor)
218 | 
219 | 
220 | @pytest.mark.parametrize("input,expected", [
221 |     # positive assertions
222 |     ("bldg m ", True),
223 |     ("Building F ", True),
224 |     ("bldg 2 ", True),
225 |     ("building 3 ", True),
226 |     ("building 100 ", True),
227 |     ("Building ", True),
228 |     ("building one ", True),
229 |     ("Building three ", True),
230 |     # negative assertions
231 |     ("bldg", False),
232 |     ("bldgm", False),
233 |     ("bldg100 ", False),
234 | 
235 | ])
236 | def test_building(input, expected):
237 |     ''' tests string match for a building '''
238 |     execute_matching_test(input, expected, data_gb.building)
239 | 
240 | 
241 | @pytest.mark.parametrize("input,expected", [
242 |     # positive assertions
243 |     ("suite 900 ", True),
244 |     ("Suite #2 ", True),
245 |     ("suite #218 ", True),
246 |     ("suite J7 ", True),
247 |     ("suite 102A ", True),
248 |     ("suite a&b ", True),
249 |     ("Suite J#200 ", True),
250 |     ("suite 710-327 ", True),
251 |     ("Suite A ", True),
252 |     ("ste A ", True),
253 |     ("Ste 101 ", True),
254 |     ("ste 502b ", True),
255 |     ("ste 14-15 ", True),
256 |     ("ste E ", True),
257 |     ("ste 9E ", True),
258 |     ("Suite 1800 ", True),
259 |     ("Apt 1B ", True),
260 |     ("Rm. 52 ", True),
261 |     ("Flat 2C ", True),
262 |     ("Flat 81b ", True),
263 |     ("Flat 52 ", True),
264 |     ("Flat 546 ", True),
265 |     ("Flat 14 ", True),
266 |     ("Suite#2", True),
267 |     ("suite900 ", True),
268 |     ("suite218 ", True),
269 |     ("1 ", False),
270 |     ("1A ", False),
271 |     ("12 ", False),
272 |     ("123 ", False),
273 | ])
274 | def test_occupancy(input, expected):
275 |     ''' tests exact string match for a place id '''
276 |     execute_matching_test(input, expected, data_gb.occupancy)
277 | 
278 | 
279 | @pytest.mark.parametrize("input,expected", [
280 |     # positive assertions
281 |     ("po box 108", True),
282 |     ("Po Box 53485", True),
283 |     ("P.O. box 119", True),
284 |     ("PO box 1070", True),
285 |     ("po box108", True),
286 |     ("PoBox53485", True),  # While not correctly formatted, this is clearly a PO Box
287 |     ("P.O. box119", True),
288 |     # negitive assertions
289 |     ("POb ox1070", False),
290 |     ("boxer 123", False),
291 | ])
292 | def test_po_box_negative(input, expected):
293 |     ''' tests string match for a po box '''
294 |     execute_matching_test(input, expected, data_gb.po_box)
295 | 
296 | 
297 | @pytest.mark.parametrize("input,expected", [
298 |     # positive assertions
299 |     ("9652 Loiret Boulevard", True),
300 |     ("101 MacIntosh Boulevard", True),
301 |     ("1 West Hegeler Lane", True),
302 |     ("1270 Leeds Avenue", True),
303 |     ("85-1190 Ranchview Rd. NW ", True),
304 |     ("62 Portland Road", True),
305 |     ("Suite 514, 200 N. Pine Avenue ", True),
306 |     ("200 S. Alloy Drive", True),
307 |     ("Two Hundred S. Alloy Drive", True),
308 |     ("Two Hundred South Alloy Drive", True),
309 |     ("Two Hundred South Alloy Dr.", True),
310 |     ("11001 Fondren Rd.", True),
311 |     ("Suite 500, 9606 North Mopac Expressway", True),
312 |     ("9692 East Arapahoe Road", True),
313 |     ("Building 2, 9 Grand Avenue", True),
314 |     ("9C Grand Avenue", True),
315 |     ("Flat 2, 9 Grand Avenue", True),
316 |     ("Suite 1800 233 Richmond Highway", True),
317 |     ("P.O. Box 472, 354 Eisenhower Parkway ", True),
318 |     ("PO Box 2243, 6645 N Ensign St", True),
319 |     ("POBox 2243, 6645 N Ensign St", True),
320 |     ("1200 Old Fairhaven Pkwy", True),
321 |     ("1659 Scott Blvd", True),
322 |     ("377 Fisher Rd", True),
323 |     ("1833 Stearman Ave", True),
324 |     ("1737 S Lumpkin St ", True),
325 |     ("101 N Court Sq", True),
326 |     ("1790 Yardley Langhorne Rd", True),
327 |     ("280 West Main Street", True),
328 |     ("701 Tennessee Walk", True),
329 |     ("7457 Harwin Dr", True),
330 |     ("700 Davis Avenue", True),
331 |     ("1 W 47th St", True),
332 |     ("832 Seward St", True),
333 |     ("2740 Timber Ridge Lane", True),
334 |     ("810 E Western Ave", True),
335 |     ("6223 Richmond Ave", True),
336 |     ("400 Middle Street", True),
337 |     ("81 N Main St", True),
338 |     ("3705 West Memorial Road", True),
339 |     ("4911 Matterhorn Dr", True),
340 |     ("5830 Yahl Street", True),
341 |     ("9400 Doliver Dr", True),
342 |     ("10701 Stirling Road", True),
343 |     ("1865 Corporate Dr", True),
344 |     ("80 Beaman Rd", True),
345 |     ("9691 Spratley Ave", True),
346 |     ("10835 New Haven Rd NW ", True),
347 |     ("320 W Broussard Rd", True),
348 |     ("9001 Any Old Way", True),
349 |     ("8967 Market St.", True),
350 |     ("3724 Oxford Blvd.", True),
351 |     ("901 Rainier Ave S ", True),
352 |     ("01 Brett Street", True),
353 |     ("Flat 14, Hilary road", True),
354 |     ("049 Maurice island", True),
355 |     ("Flat 81b, Abbie estate", True),
356 |     ("SHEPPEY WAY", True),
357 |     ("185-187 OXFORD STREET", True),
358 |     ("32 London Bridge St", True),
359 |     ("Marlborough Rd", True),
360 |     ("Gresham Street", True),
361 |     ("Corn St", True),
362 |     ("223 30th Ave.", True),
363 |     ("No. 22 The Light", True),
364 |     ("55 Glenfada Park", True),
365 | 
366 | ])
367 | def test_full_street(input, expected):
368 |     ''' tests exact string match for a full street '''
369 |     execute_matching_test(input, expected, data_gb.full_street)
370 | 
371 | 
372 | @pytest.mark.parametrize("input,expected", [
373 |     # positive assertions
374 |     ("BX1 1LT", True),
375 |     ("sw1A 0AA", True),
376 |     ("EC2V 7hh", True),
377 |     ("M25DB", True),
378 |     ("eh12ng", True),
379 |     ("BT1 5GS", True),
380 | 
381 |     # negative assertions
382 |     ("1", False),
383 |     ("23", False),
384 |     ("456", False),
385 |     ("4567", False),
386 |     ("750621", False),
387 |     ("95130-642", False),
388 |     ("95130-64212", False),
389 | ])
390 | def test_postal_code(input, expected):
391 |     ''' test exact string match for postal code '''
392 |     execute_matching_test(input, expected, data_gb.postal_code)
393 | 
394 | 
395 | @pytest.mark.parametrize("input,expected", [
396 |     # positive assertions
397 |     ("Montana", True),
398 |     ("Nebraska", True),
399 |     ("NJ", True),
400 |     ("DC", True),
401 |     ("PuErTO RIco", True),
402 |     ("oregon", True),
403 |     ("Surrey", True),
404 |     ("Middlesex", True),
405 |     ("Greater London", True),
406 | ])
407 | def test_region1(input, expected):
408 |     ''' test exact string match for province '''
409 |     execute_matching_test(input, expected, data_gb.region1)
410 | 
411 | 
412 | @pytest.mark.parametrize("input,expected", [
413 |     # positive assertions
414 |     ("England", True),
415 |     ("ScoTlAnd", True),
416 |     ("wales", True),
417 |     ("CYMRU", True),
418 |     ("United Kingdom", True),
419 |     ("Great Britain", True),
420 |     ("Britain", True),
421 |     ("Britain and Northern Ireland", True),
422 |     ("Great Britain and Northern Ireland", True),
423 |     ("The United Kingdom of Great Britain and Northern Ireland", True),
424 |     ("United States", False),
425 | ])
426 | def test_country(input, expected):
427 |     ''' test exact string match for country '''
428 |     execute_matching_test(input, expected, data_gb.country)
429 | 
430 | 
431 | @pytest.mark.parametrize("input,expected", [
432 |     # positive assertions
433 |     ("11-59 High Road, East Finchley London, N2 8AW", True),
434 |     ("88 White parkway, Stanleyton, L2 3DB", True),
435 |     ("Studio 96D, Graham roads, Westtown, L1A 3GP, Great Britain", True),
436 |     ("01 Brett mall, Lake Donna, W02 3JQ", True),
437 |     ("Flat 05, Byrne shores, Howardshire, GL6 8EA, UK", True),
438 |     ("12 Henry route, Clementsborough, W2 5DQ", True),
439 |     ("195 Jill hollow, Harryside, TF6 4YD, England", True),
440 |     ("195 Jill hollow, TF6 4YD", True),
441 |     ("SHEPPEY WAY, SITTINGBOURNE, ME9 8RZ", True),
442 |     ("185-187 OXFORD STREET, WESTMINSTER, W1D 2JU", True),
443 |     ("32 London Bridge St, London SE1 9SG", True),
444 |     ("Marlborough Rd, St. James's, London SW1A 1BQ", True),
445 |     ("Guildhall, Gresham Street, London, EC2V 7HH", True),
446 |     ("The Corn Exchange, Corn St, Bristol BS1 1JQ", True),
447 |     ("No. 22 The Light, The Headrow, Leeds LS1 8TL", True),
448 |     ("55 Glenfada Park, Londonderry BT48 9DR", True),
449 |     ("Studio 53, Harrison cove, Smithbury, G88 4US", True),
450 |     # negative assertions
451 |     ("85 STEEL REGULAR SHAFT - NE", False),
452 |     ("3 STRUCTURE WITH PE", False),
453 |     ("2013 Courtesy of DONNA LUPI, PR", False),
454 |     ("44 sq. ft. 000 Columbia Ave. See Remarks, Newfield, NJ 08344", False),
455 |     ("7901 SILVER CONDUCTIVE HOLE FILL MA", False),
456 |     ("3 THIRD PARTY LIST IN", False),
457 |     ("9 STORAGE OF INDIVIDUAL IN", False),
458 |     ("4 BODY WAVE MODEL MO", False),
459 |     ("4060 AUTOMATIC STRAPPING MACHINE KZB-II STRAPPING MA", False),
460 |     ("130 AUTOMATIC STRAPPING MACHINE CO", False),
461 |     ("6060 AUTOMATIC STRAPPING MACHINE SK", False),
462 |     ("500 AUTO BLISTER PACKING SEALING MA", False),
463 |     ("23 ELECTRICAL COLOURED-TAPE PR", False),
464 |     ("1900 TRANSISTOR ELECTROMAGNETIC INDUCTION AL", False),
465 |     ("3131 DR. MATTHEW WI", False),
466 |     ("ONE FOR ANY DIRECT, INDIRECT, IN", False),
467 |     ("2 TRACTOR HEAD Actros MP", False),
468 |     ("00 Straight Fit Jean, USA", False),
469 | ])
470 | def test_full_address(input, expected):
471 |     ''' tests exact string match for a full address '''
472 |     execute_matching_test(input, expected, data_gb.full_address)
473 | 
474 | def test_full_address_parts():
475 |     """Tests that the right parts of the address are picked up by the right regex"""
476 |     example_addresses = [
477 |         {
478 |             'full_address': '9 Shaun glen, East Joan, LN4 1LE',
479 |             'street_name': 'Shaun glen',
480 |             'street_number': '9',
481 |             'postal_code': 'LN4 1LE',
482 |         },
483 |         {
484 |             'full_address': '11-59 High Road\nEast Finchley London\nN2 8AW, UK',
485 |             'street_name': 'High Road',
486 |             'street_number': '11-59',
487 |             'postal_code': 'N2 8AW',
488 |             'country': 'UK',
489 |         },
490 |         {
491 |             'full_address': 'Studio 53, Harrison cove, Smithbury, G88 4US, United Kingdom',
492 |             'occupancy': 'Studio 53',
493 |             'street_name': 'Harrison cove',
494 |             'postal_code': 'G88 4US',
495 |             'country': 'United Kingdom',
496 |         },
497 |     ]
498 |     filler_text = "This is filler text that can be inserted both before and after addresses"
499 |     punctuation = ["\n", ", ", ". ", " "]
500 | 
501 |     # Test each of the above addresses
502 |     for address_parts in example_addresses:
503 |         # Test with filler text before and after the address
504 |         for filler_before, filler_after in itertools.product([False, True], [False, True]):
505 |             # Use the following punctuation to join the filler text and the address
506 |             for join_string in punctuation:
507 |                 filler_text_before = (filler_text + join_string) if filler_before else ''
508 |                 filler_text_after = (join_string + filler_text) if filler_after else ''
509 |                 address_text = filler_text_before + address_parts['full_address'] + filler_text_after
510 | 
511 |                 parsed = pyap.parse(address_text, country='GB')
512 |                 print (pyap.parser.AddressParser._normalize_string(address_text))
513 |                 # Ensure that only one address is found
514 |                 assert len(parsed) == 1
515 |                 for k, v in six.iteritems(address_parts):
516 |                     if k == 'full_address':
517 |                         assert parsed[0].full_address == pyap.parser.AddressParser._normalize_string(v)
518 |                     else:
519 |                         # assert that every item in the above address dictionaries match the parsed address
520 |                         assert parsed[0].__getattribute__(k) == v
521 | 
522 | 


--------------------------------------------------------------------------------
/test_parser_us.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | 
  3 | """ Test for USA address parser """
  4 | 
  5 | import re
  6 | import pytest
  7 | from pyap import utils
  8 | from pyap.packages import six
  9 | import pyap.source_US.data as data_us
 10 | 
 11 | 
 12 | def execute_matching_test(input, expected, pattern):
 13 |     match = utils.match(pattern, input, re.VERBOSE)
 14 |     is_found = match is not None
 15 |     if expected:
 16 |         assert is_found == expected and match.group(0) == input
 17 |     else:
 18 |         """we check that:
 19 |            - input should not to match our regex
 20 |            - our match should be partial if regex matches some part of string
 21 |         """
 22 |         assert (is_found == expected) or (match.group(0) != input)
 23 | 
 24 | 
 25 | @pytest.mark.parametrize("input,expected", [
 26 |     # positive assertions
 27 |     ("ZERO ", True),
 28 |     ("one ", True),
 29 |     ("two ", True),
 30 |     ("Three ", True),
 31 |     ("FoUr ", True),
 32 |     ("FivE ", True),
 33 |     ("six ", True),
 34 |     ("SEvEn ", True),
 35 |     ("Eight ", True),
 36 |     ("Nine ", True),
 37 |     # negative assertions
 38 |     ("Nidnes", False),
 39 |     ("One", False),
 40 |     ("two", False),
 41 |     ("onetwothree ", False),
 42 | ])
 43 | def test_zero_to_nine(input, expected):
 44 |     ''' test string match for zero_to_nine '''
 45 |     execute_matching_test(input, expected, data_us.zero_to_nine)
 46 | 
 47 | 
 48 | @pytest.mark.parametrize("input,expected", [
 49 |     # positive assertions
 50 |     ("tEN ", True),
 51 |     ("TWENTY ", True),
 52 |     ("tHirtY ", True),
 53 |     ("FOUrty ", True),
 54 |     ("fifty ", True),
 55 |     ("sixty ", True),
 56 |     ("seventy ", True),
 57 |     ("eighty ", True),
 58 |     ("NINety ", True),
 59 |     # negative assertions
 60 |     ("ten", False),
 61 |     ("twenTY", False),
 62 |     ("sixtysixsty ", False),
 63 |     ("one twenty ", False),
 64 | ])
 65 | def test_ten_to_ninety(input, expected):
 66 |     ''' test string match for ten_to_ninety '''
 67 |     execute_matching_test(input, expected, data_us.ten_to_ninety)
 68 | 
 69 | 
 70 | @pytest.mark.parametrize("input,expected", [
 71 |     # positive assertions
 72 |     ("Hundred ", True),
 73 |     ("HuNdred ", True),
 74 |     # negative assertions
 75 |     ("HuNDdred", False),
 76 |     ("HuNDdred hundred ", False),
 77 | ])
 78 | def test_hundred(input, expected):
 79 |     ''' tests string match for a hundred '''
 80 |     execute_matching_test(input, expected, data_us.hundred)
 81 | 
 82 | 
 83 | @pytest.mark.parametrize("input,expected", [
 84 |     # positive assertions
 85 |     ("Thousand ", True),
 86 |     ("thOUSAnd ", True),
 87 |     # negative assertions
 88 |     ("thousand", False),
 89 |     ("THoussand ", False),
 90 |     ("THoussand", False),
 91 |     ("THOUssand THoussand ", False),
 92 | ])
 93 | def test_thousand(input, expected):
 94 |     ''' tests string match for a thousand '''
 95 |     execute_matching_test(input, expected, data_us.thousand)
 96 | 
 97 | 
 98 | @pytest.mark.parametrize("input,expected", [
 99 |     # positive assertions (words)
100 |     ("One Thousand And Fifty Nine ", True),
101 |     ("Two hundred and fifty ", True),
102 |     ("Three hundred four ", True),
103 |     ("Thirty seven ", True),
104 |     ("FIFTY One ", True),
105 |     ("Three hundred Ten ", True),
106 |     # positive assertions (numbers)
107 |     ("1 ", True),
108 |     ("15 ", True),
109 |     ("44 ", True),
110 |     ("256 ", True),
111 |     ("256 ", True),
112 |     ("1256 ", True),
113 |     ("32457 ", True),
114 |     # negative assertions (words)
115 |     ("ONE THousszz22and FIFTY and four onde", False),
116 |     ("ONE one oNe and onE Three", False),
117 |     # negative assertions (numbers)
118 |     ("536233", False),
119 |     ("111111", False),
120 |     ("1111ss11", False),
121 |     ("123 456", False),
122 | ])
123 | def test_street_number(input, expected):
124 |     ''' tests string match for a street number '''
125 |     execute_matching_test(input, expected, data_us.street_number)
126 | 
127 | 
128 | @pytest.mark.parametrize("input,expected", [
129 |     # positive assertions
130 |     ("Northeast Kentucky Industrial ", True),
131 |     ("One ", True),
132 |     ("First ", True),
133 |     ("Ave 123 ", True),
134 |     ("Northeast 5 ", True),
135 |     # negative assertions
136 |     ("Northeast Kentucky Industrial Maple ", False),
137 |     ("a", False),
138 |     ("ab", False),
139 | ])
140 | def test_street_name(input, expected):
141 |     ''' tests positive string match for a street name '''
142 |     execute_matching_test(input, expected, data_us.street_name)
143 | 
144 | 
145 | @pytest.mark.parametrize("input,expected", [
146 |     # positive assertions
147 |     ("N. ", True),
148 |     ("N ", True),
149 |     ("S ", True),
150 |     ("West ", True),
151 |     ("eASt ", True),
152 |     ("NW ", True),
153 |     ("SE ", True),
154 |     # negative assertions
155 |     ("NW.", False),
156 |     ("NW. ", False),
157 |     ("NS ", False),
158 |     ("EW ", False),
159 | ])
160 | def test_post_direction(input, expected):
161 |     ''' tests string match for a post_direction '''
162 |     execute_matching_test(input, expected, data_us.post_direction)
163 | 
164 | 
165 | @pytest.mark.parametrize("input,expected", [
166 |     # positive assertions
167 |     ("Street ", True),
168 |     ("St. ", True),
169 |     ("St.", True),
170 |     ("Blvd.", True),
171 |     ("Blvd. ", True),
172 |     ("LN ", True),
173 |     ("RD", True),
174 |     ("Cir", True),
175 |     ("Highway ", True),
176 |     ("Hwy ", True),
177 |     ("Ct", True),
178 |     ("Sq.", True),
179 |     ("LP. ", True),
180 |     ("LP. (Route A1 )", True),
181 |     ("Street route 5 ", True),
182 |     ("blvd", True),
183 |     ("Estate", True),
184 |     ("Manor", True),
185 |     # negative assertions
186 |     # TODO
187 | 
188 | ])
189 | def test_street_type(input, expected):
190 |     ''' tests string match for a street id '''
191 |     execute_matching_test(input, expected, data_us.street_type)
192 | 
193 | 
194 | @pytest.mark.parametrize("input,expected", [
195 |     # positive assertions
196 |     ("floor 3 ", True),
197 |     ("floor 11 ", True),
198 |     ("floor 15 ", True),
199 |     ("1st floor ", True),
200 |     ("2nd floor ", True),
201 |     ("15th floor ", True),
202 |     ("16th. floor ", True),
203 |     # negative assertions
204 |     ("16th.floor ", False),
205 |     ("1stfloor ", False),
206 | 
207 | ])
208 | def test_floor(input, expected):
209 |     ''' tests string match for a floor '''
210 |     execute_matching_test(input, expected, data_us.floor)
211 | 
212 | 
213 | @pytest.mark.parametrize("input,expected", [
214 |     # positive assertions
215 |     ("bldg m ", True),
216 |     ("Building F ", True),
217 |     ("bldg 2 ", True),
218 |     ("building 3 ", True),
219 |     ("building 100 ", True),
220 |     ("building 1000 ", True),
221 |     ("Building ", True),
222 |     ("building one ", True),
223 |     ("Building three ", True),
224 |     # negative assertions
225 |     ("bldg", False),
226 |     ("bldgm", False),
227 |     ("bldg100 ", False),
228 |     ("building 10000 ", False),
229 | 
230 | ])
231 | def test_building(input, expected):
232 |     ''' tests string match for a building '''
233 |     execute_matching_test(input, expected, data_us.building)
234 | 
235 | 
236 | @pytest.mark.parametrize("input,expected", [
237 |     # positive assertions
238 |     ("suite 900 ", True),
239 |     ("Suite #2 ", True),
240 |     ("suite #218 ", True),
241 |     ("suite J7 ", True),
242 |     ("suite 102A ", True),
243 |     ("suite a&b ", True),
244 |     ("Suite J#200 ", True),
245 |     ("suite 710-327 ", True),
246 |     ("Suite A ", True),
247 |     ("ste A ", True),
248 |     ("Ste 101 ", True),
249 |     ("ste 502b ", True),
250 |     ("ste 14-15 ", True),
251 |     ("ste E ", True),
252 |     ("ste 9E ", True),
253 |     ("Suite 1800 ", True),
254 |     ("Apt 1B ", True),
255 |     ("Rm. 52 ", True),
256 |     ("#2b ", True),
257 |     # positive assertions
258 |     ("suite900 ", False),
259 |     ("Suite#2", False),
260 |     ("suite218 ", False),
261 | ])
262 | def test_occupancy(input, expected):
263 |     ''' tests string match for a place id '''
264 |     execute_matching_test(input, expected, data_us.occupancy)
265 | 
266 | 
267 | @pytest.mark.parametrize("input,expected", [
268 |     # positive assertions
269 |     ("po box 108", True),
270 |     ("Po Box 53485", True),
271 |     ("P.O. box 119", True),
272 |     ("PO box 1070", True),
273 |     # negative assertions
274 |     ("po box108 ", False),
275 |     ("PoBox53485 ", False),
276 |     ("P.O. box119", False),
277 |     ("POb ox1070 ", False),
278 | ])
279 | def test_po_box_positive(input, expected):
280 |     ''' tests exact string match for a po box '''
281 |     execute_matching_test(input, expected, data_us.po_box)
282 | 
283 | 
284 | @pytest.mark.parametrize("input,expected", [
285 |     # positive assertions
286 |     ("9652 Loiret Boulevard", True),
287 |     ("101 MacIntosh Boulevard", True),
288 |     ("1 West Hegeler Lane", True),
289 |     ("1270 Leeds Avenue", True),
290 |     ("85-1190 Ranchview Rd. NW ", True),
291 |     ("62 Portland Road (Route 1)", True),
292 |     ("200 N. Pine Avenue Suite 514", True),
293 |     ("200 S. Alloy Drive", True),
294 |     ("Two Hundred S. Alloy Drive", True),
295 |     ("Two Hundred South Alloy Drive", True),
296 |     ("Two Hundred South Alloy Dr.", True),
297 |     ("11001 Fondren Rd,", True),
298 |     ("9606 North Mopac Expressway Suite 500", True),
299 |     ("9692 East Arapahoe Road,", True),
300 |     ("9 Grand Avenue, Suite 2", True),
301 |     ("9 Grand Avenue Building 2, Suite 2", True),
302 |     ("9 Grand Avenue Building 2, Suite 2A", True),
303 |     ("233 Richmond Highway Suite 1800", True),
304 |     ("354 Eisenhower Parkway P.O. Box 472", True),
305 |     ("6645 N Ensign St", True),
306 |     ("1200 Old Fairhaven Pkwy Apt 106", True),
307 |     ("1659 Scott Blvd Ste 26", True),
308 |     ("377 Fisher Rd Ste C", True),
309 |     ("1833 Stearman Ave", True),
310 |     ("1737 S Lumpkin St Ste B", True),
311 |     ("101 N Court Sq Ste 16", True),
312 |     ("1790 Yardley Langhorne Rd, Suite #205", True),
313 |     ("280 West Main Street", True),
314 |     ("701 Tennessee Walk", True),
315 |     ("7457 Harwin Dr", True),
316 |     ("700 Davis Avenue", True),
317 |     ("1 W 47th St", True),
318 |     ("832 Seward St", True),
319 |     ("2740 Timber Ridge Lane", True),
320 |     ("810 E Western Ave", True),
321 |     ("6223 Richmond Ave Ste 105", True),
322 |     ("400 Middle Street", True),
323 |     ("81 N Main St", True),
324 |     ("3705 West Memorial Road", True),
325 |     ("4911 Matterhorn Dr", True),
326 |     ("5830 Yahl Street, #2b", True),
327 |     ("9400 Doliver Dr Apt 13", True),
328 |     ("10701 Stirling Road", True),
329 |     ("1865 Corporate Dr Ste 225", True),
330 |     ("80 Beaman Rd", True),
331 |     ("9691 Spratley Ave", True),
332 |     ("10835 New Haven Rd NW ", True),
333 |     ("320 W Broussard Rd", True),
334 |     ("9001 Any Old Way", True),
335 |     ("8967 Market St.", True),
336 |     ("3724 Oxford Blvd.", True),
337 |     ("901 Rainier Ave S ", True),
338 | ])
339 | def test_full_street_positive(input, expected):
340 |     ''' tests exact string match for a full street '''
341 |     execute_matching_test(input, expected, data_us.full_street)
342 | 
343 | 
344 | @pytest.mark.parametrize("input,expected", [
345 |     # positive assertions
346 |     ("0 OLD MILL RD, Maynard, MA 01754", True),
347 |     ("103 Morgan Lane, Suite 102 Plainsboro, NJ 08536", True),
348 |     ("3409 16th St Metairie, LA 70002", True),
349 |     ("1505 NW 14th Street Miami, FL 33125", True),
350 |     ("01 Main Rd. Newfield, NJ", True),
351 |     ("28 Gorgo Lane Newfield, NJ", True),
352 |     ("1720 HARDING HWY NEWFIELD, NJ", True),
353 |     ("4409 N DELSEA DR NEWFIELD, NJ", True),
354 |     ("742 FORSYTHIA DR NEWFIELD, NJ", True),
355 |     ("9 N EAST BLVD NEWFIELD, NJ 10000", True),
356 |     ("1640 Harding Hwy Newfield, NJ", True),
357 |     ("1720 Harding Highway NEWFIELD, NJ", True),
358 |     ("1014 CATAWBA AVE NEWFIELD, NJ", True),
359 |     ("11 ARCH AVE NEWFIELD, NJ", True),
360 |     ("133 TAYLOR RD NEWFIELD, NJ", True),
361 |     ("4409 N Delsea Drive Newfield, NJ", True),
362 |     ("8 TAYLOR RD NEWFIELD, NJ", True),
363 |     ("28 GORGO LN NEWFIELD, NJ", True),
364 |     ("900 COLUMBIA AVE. NEWFIELD, NJ", True),
365 |     ("3201 MAIN RD NEWFIELD, NJ", True),
366 |     ("4421 N DELSEA DR NEWFIELD, NJ", True),
367 |     ("742 Forsythia Drive Newfield, NJ", True),
368 |     ("1450 E. Chestnut Avenue, Vineland NJ,", True),
369 |     ("50 Harry S Truman Parkway Annapolis, MD 21401", True),
370 |     ("420 Crompton Street Charlotte , North Carolina 28273", True),
371 |     ("204 East 3rd Ave Cheyenne, WY 82001", True),
372 |     ("1806 Dominion Way Ste B Colorado Spgs, CO 80918-8409", True),
373 |     ("2600 South Shore Blvd Ste. 300 League City, TX 77573", True),
374 |     ("2675 Antler Drive Carson City, NV 89701-1451", True),
375 |     ("3719 Lockwood Dr., Houston, TX 77026", True),
376 |     ("154 Grand Street New York, NY 10013", True),
377 |     ("3655 Torrance Blvd Suite 230 Torrance CA 90503", True),
378 |     ("800 Sixth Ave #31A New York, NY 10001", True),
379 |     ("8861 Research Drive, Ste. 200, Irvine, CA 92618", True),
380 |     ("317 N. Mission St. Ste. 200 Wenatchee, WA 98801", True),
381 |     ("2709 Bickford Avenue, Suite A Snohomish, WA 98290", True),
382 |     ("7307 N. Division Street, Suite 102 Spokane, WA 99208", True),
383 |     ("1530 South Union Avenue, Suite 7 Tacoma, WA 98405", True),
384 |     ("3131 Smokey Point Drive, Suite 14 A Arlington, WA 98223", True),
385 |     ("1603 Grove Street Marysville, WA 98270", True),
386 |     ("15701 E. Sprague Avenue, Suite F Spokane Valley, WA 99037", True),
387 |     ("18204 Bothell Everett Hwy, Suite E Bothell, WA 98012", True),
388 |     ("3505 188th Street SW Lynnwood, WA 98037", True),
389 |     ("3218 NE 12th Street, Suite B Renton, WA 98056", True),
390 |     ("22035 SE Wax Road, Suite 5 Maple Valley, WA 98038", True),
391 |     ("8861 Research Drive, Ste. 200 Irvine, CA 92618", True),
392 |     ("4031 University Drive Suite 200 Fairfax, Virginia 22030", True),
393 |     ("586 W. 207 St. New York, NY 10034", True),
394 |     ("85 Newbury St, Boston, MA 02116", True),
395 |     ("1827 Union St, San Francisco, CA 94123", True),
396 |     ("1636 Main St Sarasota, FL 34236", True),
397 |     ("1015 South Western Avenue, Chicago, IL 60649", True),
398 |     ("510 W 7th St. Los Angeles, CA 90014", True),
399 |     ("225 North Larchmont Blvd Los Angeles, CA 90004", True),
400 |     ("3760 E. Tremont Ave. Throgsneck, NY 10465", True),
401 |     ("8126 S. Stony Island Ave Chicago, IL 60617", True),
402 |     ("68116 HEM 908 B WEST 12th St. Austin, TX 78703", True),
403 |     ("546 West Colorado Street Glendale CA 91204", True),
404 |     ("2210 N Halsted St, Chicago, IL 60614", True),
405 |     ("4090 Westown Pkwy Ste B2 Chicago, IL 60614", True),
406 |     ("7000 Peachtree Dunwoody Rd NE Bldg 7, Miami, FL, USA", True),
407 |     ("98-025 Hekaha St Ste 221A, Cityville, Arizona", True),
408 |     ("225 E. John Carpenter Freeway, Suite 1500 Irving, Texas 75062 U.S.A.", True),
409 |     ("643 Lincoln Rd. Miami Beach, FL 33139", True),
410 |     ("300 Market St. Harrisburg, PA 17101", True),
411 |     ("2 Kings Hwy Shreveport, LA 71104", True),
412 |     ("1500 Westlake Avenue North Suite 108 Seattle, WA 98109", True),
413 |     ("840 Garrison Brooks Suite 985, New Sarah, OH 38255", True),
414 |     ("840 Garrison Brooks Suite 985 New Sarah, OH 38255", True),
415 |     # negative assertions
416 |     ("85 STEEL REGULAR SHAFT - NE", False),
417 |     ("3 STRUCTURE WITH PE", False),
418 |     ("2013 Courtesy of DONNA LUPI, PR", False),
419 |     ("44 sq. ft. 000 Columbia Ave. See Remarks, Newfield, NJ 08344", False),
420 |     ("7901 SILVER CONDUCTIVE HOLE FILL MA", False),
421 |     ("3 THIRD PARTY LIST IN", False),
422 |     ("9 STORAGE OF INDIVIDUAL IN", False),
423 |     ("4 BODY WAVE MODEL MO", False),
424 |     ("4060 AUTOMATIC STRAPPING MACHINE KZB-II STRAPPING MA", False),
425 |     ("130 AUTOMATIC STRAPPING MACHINE CO", False),
426 |     ("6060 AUTOMATIC STRAPPING MACHINE SK", False),
427 |     ("500 AUTO BLISTER PACKING SEALING MA", False),
428 |     ("23 ELECTRICAL COLOURED-TAPE PR", False),
429 |     ("1900 TRANSISTOR ELECTROMAGNETIC INDUCTION AL", False),
430 |     ("3131 DR. MATTHEW WI", False),
431 |     ("ONE FOR ANY DIRECT, INDIRECT, IN", False),
432 |     ("2 TRACTOR HEAD Actros MP", False),
433 |     ("00 Straight Fit Jean, USA", False),
434 | ])
435 | def test_full_address(input, expected):
436 |     ''' tests exact string match for a full address '''
437 |     execute_matching_test(input, expected, data_us.full_address)
438 | 
439 | 
440 | @pytest.mark.parametrize("input,expected", [
441 |     # positive assertions
442 |     ("75062", True),
443 |     ("15032", True),
444 |     ("95130-6482", True),
445 |     # negative assertions
446 |     ("1", False),
447 |     ("23", False),
448 |     ("456", False),
449 |     ("4567", False),
450 |     ("750621", False),
451 |     ("95130-642", False),
452 |     ("95130-64212", False),
453 | ])
454 | def test_postal_code(input, expected):
455 |     ''' test exact string match for postal code '''
456 |     execute_matching_test(input, expected, data_us.postal_code)
457 | 
458 | 
459 | @pytest.mark.parametrize("input,expected", [
460 |     # positive assertions
461 |     ("Montana", True),
462 |     ("Nebraska", True),
463 |     ("NJ", True),
464 |     ("DC", True),
465 |     ("PuErTO RIco", True),
466 |     ("oregon", True),
467 | ])
468 | def test_region1(input, expected):
469 |     ''' test exact string match for province '''
470 |     execute_matching_test(input, expected, data_us.region1)
471 | 
472 | 
473 | @pytest.mark.parametrize("input,expected", [
474 |     # positive assertions
475 |     ("USA", True),
476 |     ("U.S.A", True),
477 |     ("United States", True),
478 | ])
479 | def test_country(input, expected):
480 |     ''' test exact string match for country '''
481 |     execute_matching_test(input, expected, data_us.country)
482 | 


--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | # Tox (http://tox.testrun.org/) is a tool for running tests
 2 | # in multiple virtualenvs. This configuration file will run the
 3 | # test suite on all supported python versions. To use it, "pip install tox"
 4 | # and then run "tox" from this directory.
 5 | 
 6 | [tox]
 7 | envlist = py27, py38
 8 | 
 9 | [testenv]
10 | commands = py.test \
11 | 			test_parser.py \
12 | 			test_parser_ca.py \
13 | 			test_parser_us.py \
14 | 			test_parser_gb.py
15 | deps =
16 |     pytest
17 | 


--------------------------------------------------------------------------------