├── .gitignore ├── LICENSE ├── README.md ├── setup.py ├── streetaddress ├── __init__.py └── addressconf.py └── test.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2012-2013 Mike Jensen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | python-streetaddress 2 | ==================== 3 | 4 | A Python port of the Perl address parser available at http://search.cpan.org/~timb/Geo-StreetAddress-US-1.03/US.pm -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | setup(name='streetaddress', 4 | version='0.1.0', 5 | description='A Python port of the Perl address parser.', 6 | author='Mike Jensen', 7 | url='https://github.com/jjensenmike/python-streetaddress', 8 | keywords='streetaddress', 9 | packages=['streetaddress'], 10 | classifiers=[ 11 | 'Development Status :: 5 - Production/Stable', 12 | 'Environment :: Other Environment', 13 | 'Intended Audience :: Developers', 14 | 'Natural Language :: English', 15 | 'Operating System :: MacOS :: MacOS X', 16 | 'Operating System :: Microsoft :: Windows', 17 | 'Operating System :: POSIX', 18 | 'Programming Language :: Python :: 3', 19 | 'Topic :: Security', 20 | ], 21 | ) 22 | -------------------------------------------------------------------------------- /streetaddress/__init__.py: -------------------------------------------------------------------------------- 1 | from .addressconf import Directions, Streets, States, Regexes 2 | import re 3 | 4 | def parse(location): 5 | if Regexes.corner.search(location): 6 | return parse_intersection(location) 7 | else: 8 | return parse_address(location) 9 | 10 | def parse_intersection(inter): 11 | match = Regexes.intersection.match(inter) 12 | if not match: 13 | return 14 | match_data = match.groups() 15 | return normalize_address({'street':match_data[3] or match_data[8], 16 | 'street_type':match_data[4], 17 | 'suffix':match_data[5], 18 | 'prefix':match_data[2], 19 | 'street2':match_data[14] or match_data[19], 20 | 'street_type2':match_data[15], 21 | 'suffix2':match_data[16], 22 | 'prefix2':match_data[13], 23 | 'city':match_data[22], 24 | 'state':match_data[23], 25 | 'postal_code':match_data[24]}) 26 | 27 | def parse_address(addr): 28 | match = Regexes.address.match(addr) 29 | if not match: 30 | return 31 | match_data = match.groups() 32 | return normalize_address({'number':match_data[0], 33 | 'street':match_data[4] or match_data[9] or match_data[1], 34 | 'type':match_data[5] or match_data[2], 35 | 'sec_unit_num':match_data[13], 36 | 'sec_unit_type':match_data[12], 37 | 'suffix':match_data[6] or match_data[11], 38 | 'prefix':match_data[3], 39 | 'city':match_data[14], 40 | 'state':match_data[15], 41 | 'zip':match_data[16], 42 | 'zip_ext':match_data[17]}) 43 | 44 | def normalize_address(addr): 45 | addr['state'] = normalize_state(addr.get('state', None)) 46 | addr['city'] = normalize_city(addr.get('city', None)) 47 | 48 | addr['type'] = normalize_street_type(addr.get('type', None)) 49 | 50 | addr['street_type2'] = normalize_street_type(addr.get('street_type2', None)) 51 | 52 | addr['prefix'] = normalize_directional(addr.get('prefix', None)) 53 | addr['prefix2'] = normalize_directional(addr.get('prefix2', None)) 54 | addr['suffix'] = normalize_directional(addr.get('suffix', None)) 55 | addr['suffix2'] = normalize_directional(addr.get('suffix2', None)) 56 | 57 | addr['unit_prefix'] = _upper_if_exists(addr.get('unit_prefix', None)) 58 | 59 | addr = dict((k,v) for k,v in addr.items() if v) 60 | 61 | return addr 62 | 63 | def normalize_city(city) : 64 | if not city : 65 | return None 66 | city_list = [] 67 | for word in city.split() : 68 | if word in Directions.DIRECTIONAL_CODES : 69 | city_list.append(Directions.DIRECTIONAL_CODES[word].title()) 70 | else : 71 | city_list.append(word) 72 | return ' '.join(city_list) 73 | 74 | def normalize_state(state): 75 | if not state : 76 | return None 77 | if len(state) < 3: 78 | return state.upper() 79 | else: 80 | return States.STATE_CODES[state.lower()] 81 | 82 | def normalize_street_type(s_type): 83 | if not s_type : 84 | return None 85 | if s_type.lower() in Streets.STREET_TYPES: 86 | return Streets.STREET_TYPES[s_type.lower()].title() 87 | elif s_type.lower() in Streets.STREET_TYPES_LIST: 88 | return s_type.title() 89 | 90 | def normalize_directional(direction): 91 | if not direction : 92 | return None 93 | if len(direction) < 3: 94 | return direction.upper() 95 | else: 96 | return Directions.DIRECTIONAL[direction.lower()] 97 | 98 | def _upper_if_exists(field) : 99 | if not field : 100 | return None 101 | else : 102 | return field.upper() 103 | -------------------------------------------------------------------------------- /streetaddress/addressconf.py: -------------------------------------------------------------------------------- 1 | import re 2 | class Directions(object): 3 | DIRECTIONAL = { 4 | "north":"N", 5 | "northeast":"NE", 6 | "east":"E", 7 | "southeast":"SE", 8 | "south":"S", 9 | "southwest":"SW", 10 | "west":"W", 11 | "northwest":"NW"} 12 | DIRECTIONAL_CODES = dict((v,k) for k,v in DIRECTIONAL.items()) 13 | 14 | class Streets(object): 15 | STREET_TYPES = { 16 | "allee":"aly", 17 | "alley":"aly", 18 | "ally":"aly", 19 | "anex":"anx", 20 | "annex":"anx", 21 | "annx":"anx", 22 | "arcade":"arc", 23 | "av":"ave", 24 | "aven":"ave", 25 | "avenu":"ave", 26 | "avenue":"ave", 27 | "avn":"ave", 28 | "avnue":"ave", 29 | "bayoo":"byu", 30 | "bayou":"byu", 31 | "beach":"bch", 32 | "bend":"bnd", 33 | "bluf":"blf", 34 | "bluff":"blf", 35 | "bluffs":"blfs", 36 | "bot":"btm", 37 | "bottm":"btm", 38 | "bottom":"btm", 39 | "boul":"blvd", 40 | "boulevard":"blvd", 41 | "boulv":"blvd", 42 | "branch":"br", 43 | "brdge":"brg", 44 | "bridge":"brg", 45 | "brnch":"br", 46 | "brook":"brk", 47 | "brooks":"brks", 48 | "burg":"bg", 49 | "burgs":"bgs", 50 | "bypa":"byp", 51 | "bypas":"byp", 52 | "bypass":"byp", 53 | "byps":"byp", 54 | "camp":"cp", 55 | "canyn":"cyn", 56 | "canyon":"cyn", 57 | "cape":"cpe", 58 | "causeway":"cswy", 59 | "causway":"cswy", 60 | "cen":"ctr", 61 | "cent":"ctr", 62 | "center":"ctr", 63 | "centers":"ctrs", 64 | "centr":"ctr", 65 | "centre":"ctr", 66 | "circ":"cir", 67 | "circl":"cir", 68 | "circle":"cir", 69 | "circles":"cirs", 70 | "ck":"crk", 71 | "cliff":"clf", 72 | "cliffs":"clfs", 73 | "club":"clb", 74 | "cmp":"cp", 75 | "cnter":"ctr", 76 | "cntr":"ctr", 77 | "cnyn":"cyn", 78 | "common":"cmn", 79 | "corner":"cor", 80 | "corners":"cors", 81 | "course":"crse", 82 | "court":"ct", 83 | "courts":"cts", 84 | "cove":"cv", 85 | "coves":"cvs", 86 | "cr":"crk", 87 | "crcl":"cir", 88 | "crcle":"cir", 89 | "crecent":"cres", 90 | "creek":"crk", 91 | "crescent":"cres", 92 | "cresent":"cres", 93 | "crest":"crst", 94 | "crossing":"xing", 95 | "crossroad":"xrd", 96 | "crscnt":"cres", 97 | "crsent":"cres", 98 | "crsnt":"cres", 99 | "crssing":"xing", 100 | "crssng":"xing", 101 | "crt":"ct", 102 | "curve":"curv", 103 | "dale":"dl", 104 | "dam":"dm", 105 | "div":"dv", 106 | "divide":"dv", 107 | "driv":"dr", 108 | "drive":"dr", 109 | "drives":"drs", 110 | "drv":"dr", 111 | "dvd":"dv", 112 | "estate":"est", 113 | "estates":"ests", 114 | "exp":"expy", 115 | "expr":"expy", 116 | "express":"expy", 117 | "expressway":"expy", 118 | "expw":"expy", 119 | "extension":"ext", 120 | "extensions":"exts", 121 | "extn":"ext", 122 | "extnsn":"ext", 123 | "falls":"fls", 124 | "ferry":"fry", 125 | "field":"fld", 126 | "fields":"flds", 127 | "flat":"flt", 128 | "flats":"flts", 129 | "ford":"frd", 130 | "fords":"frds", 131 | "forest":"frst", 132 | "forests":"frst", 133 | "forg":"frg", 134 | "forge":"frg", 135 | "forges":"frgs", 136 | "fork":"frk", 137 | "forks":"frks", 138 | "fort":"ft", 139 | "freeway":"fwy", 140 | "freewy":"fwy", 141 | "frry":"fry", 142 | "frt":"ft", 143 | "frway":"fwy", 144 | "frwy":"fwy", 145 | "garden":"gdn", 146 | "gardens":"gdns", 147 | "gardn":"gdn", 148 | "gateway":"gtwy", 149 | "gatewy":"gtwy", 150 | "gatway":"gtwy", 151 | "glen":"gln", 152 | "glens":"glns", 153 | "grden":"gdn", 154 | "grdn":"gdn", 155 | "grdns":"gdns", 156 | "green":"grn", 157 | "greens":"grns", 158 | "grov":"grv", 159 | "grove":"grv", 160 | "groves":"grvs", 161 | "gtway":"gtwy", 162 | "harb":"hbr", 163 | "harbor":"hbr", 164 | "harbors":"hbrs", 165 | "harbr":"hbr", 166 | "haven":"hvn", 167 | "havn":"hvn", 168 | "height":"hts", 169 | "heights":"hts", 170 | "hgts":"hts", 171 | "highway":"hwy", 172 | "highwy":"hwy", 173 | "hill":"hl", 174 | "hills":"hls", 175 | "hiway":"hwy", 176 | "hiwy":"hwy", 177 | "hllw":"holw", 178 | "hollow":"holw", 179 | "hollows":"holw", 180 | "holws":"holw", 181 | "hrbor":"hbr", 182 | "ht":"hts", 183 | "hway":"hwy", 184 | "inlet":"inlt", 185 | "island":"is", 186 | "islands":"iss", 187 | "isles":"isle", 188 | "islnd":"is", 189 | "islnds":"iss", 190 | "jction":"jct", 191 | "jctn":"jct", 192 | "jctns":"jcts", 193 | "junction":"jct", 194 | "junctions":"jcts", 195 | "junctn":"jct", 196 | "juncton":"jct", 197 | "key":"ky", 198 | "keys":"kys", 199 | "knol":"knl", 200 | "knoll":"knl", 201 | "knolls":"knls", 202 | "la":"ln", 203 | "lake":"lk", 204 | "lakes":"lks", 205 | "landing":"lndg", 206 | "lane":"ln", 207 | "lanes":"ln", 208 | "ldge":"ldg", 209 | "light":"lgt", 210 | "lights":"lgts", 211 | "lndng":"lndg", 212 | "loaf":"lf", 213 | "lock":"lck", 214 | "locks":"lcks", 215 | "lodg":"ldg", 216 | "lodge":"ldg", 217 | "loops":"loop", 218 | "manor":"mnr", 219 | "manors":"mnrs", 220 | "meadow":"mdw", 221 | "meadows":"mdws", 222 | "medows":"mdws", 223 | "mill":"ml", 224 | "mills":"mls", 225 | "mission":"msn", 226 | "missn":"msn", 227 | "mnt":"mt", 228 | "mntain":"mtn", 229 | "mntn":"mtn", 230 | "mntns":"mtns", 231 | "motorway":"mtwy", 232 | "mount":"mt", 233 | "mountain":"mtn", 234 | "mountains":"mtns", 235 | "mountin":"mtn", 236 | "mssn":"msn", 237 | "mtin":"mtn", 238 | "neck":"nck", 239 | "orchard":"orch", 240 | "orchrd":"orch", 241 | "overpass":"opas", 242 | "ovl":"oval", 243 | "parks":"park", 244 | "parkway":"pkwy", 245 | "parkways":"pkwy", 246 | "parkwy":"pkwy", 247 | "passage":"psge", 248 | "paths":"path", 249 | "pikes":"pike", 250 | "pine":"pne", 251 | "pines":"pnes", 252 | "pk":"park", 253 | "pkway":"pkwy", 254 | "pkwys":"pkwy", 255 | "pky":"pkwy", 256 | "place":"pl", 257 | "plain":"pln", 258 | "plaines":"plns", 259 | "plains":"plns", 260 | "plaza":"plz", 261 | "plza":"plz", 262 | "point":"pt", 263 | "points":"pts", 264 | "port":"prt", 265 | "ports":"prts", 266 | "prairie":"pr", 267 | "prarie":"pr", 268 | "prk":"park", 269 | "prr":"pr", 270 | "rad":"radl", 271 | "radial":"radl", 272 | "radiel":"radl", 273 | "ranch":"rnch", 274 | "ranches":"rnch", 275 | "rapid":"rpd", 276 | "rapids":"rpds", 277 | "rdge":"rdg", 278 | "rest":"rst", 279 | "ridge":"rdg", 280 | "ridges":"rdgs", 281 | "river":"riv", 282 | "rivr":"riv", 283 | "rnchs":"rnch", 284 | "road":"rd", 285 | "roads":"rds", 286 | "route":"rte", 287 | "rvr":"riv", 288 | "shoal":"shl", 289 | "shoals":"shls", 290 | "shoar":"shr", 291 | "shoars":"shrs", 292 | "shore":"shr", 293 | "shores":"shrs", 294 | "skyway":"skwy", 295 | "spng":"spg", 296 | "spngs":"spgs", 297 | "spring":"spg", 298 | "springs":"spgs", 299 | "sprng":"spg", 300 | "sprngs":"spgs", 301 | "spurs":"spur", 302 | "sqr":"sq", 303 | "sqre":"sq", 304 | "sqrs":"sqs", 305 | "squ":"sq", 306 | "square":"sq", 307 | "squares":"sqs", 308 | "station":"sta", 309 | "statn":"sta", 310 | "stn":"sta", 311 | "str":"st", 312 | "strav":"stra", 313 | "strave":"stra", 314 | "straven":"stra", 315 | "stravenue":"stra", 316 | "stravn":"stra", 317 | "stream":"strm", 318 | "street":"st", 319 | "streets":"sts", 320 | "streme":"strm", 321 | "strt":"st", 322 | "strvn":"stra", 323 | "strvnue":"stra", 324 | "sumit":"smt", 325 | "sumitt":"smt", 326 | "summit":"smt", 327 | "terr":"ter", 328 | "terrace":"ter", 329 | "throughway":"trwy", 330 | "tpk":"tpke", 331 | "tr":"trl", 332 | "trace":"trce", 333 | "traces":"trce", 334 | "track":"trak", 335 | "tracks":"trak", 336 | "trafficway":"trfy", 337 | "trail":"trl", 338 | "trails":"trl", 339 | "trk":"trak", 340 | "trks":"trak", 341 | "trls":"trl", 342 | "trnpk":"tpke", 343 | "trpk":"tpke", 344 | "tunel":"tunl", 345 | "tunls":"tunl", 346 | "tunnel":"tunl", 347 | "tunnels":"tunl", 348 | "tunnl":"tunl", 349 | "turnpike":"tpke", 350 | "turnpk":"tpke", 351 | "underpass":"upas", 352 | "union":"un", 353 | "unions":"uns", 354 | "valley":"vly", 355 | "valleys":"vlys", 356 | "vally":"vly", 357 | "vdct":"via", 358 | "viadct":"via", 359 | "viaduct":"via", 360 | "view":"vw", 361 | "views":"vws", 362 | "vill":"vlg", 363 | "villag":"vlg", 364 | "village":"vlg", 365 | "villages":"vlgs", 366 | "ville":"vl", 367 | "villg":"vlg", 368 | "villiage":"vlg", 369 | "vist":"vis", 370 | "vista":"vis", 371 | "vlly":"vly", 372 | "vst":"vis", 373 | "vsta":"vis", 374 | "walks":"walk", 375 | "well":"wl", 376 | "wells":"wls", 377 | "wy":"way" 378 | } 379 | 380 | STREET_TYPES_LIST = set(list(STREET_TYPES) + list(STREET_TYPES.values())) 381 | 382 | class States(object): 383 | STATE_CODES = { 384 | "alabama":"AL", 385 | "alaska":"AK", 386 | "american samoa":"AS", 387 | "arizona":"AZ", 388 | "arkansas":"AR", 389 | "california":"CA", 390 | "colorado":"CO", 391 | "connecticut":"CT", 392 | "delaware":"DE", 393 | "district of columbia":"DC", 394 | "federated states of micronesia":"FM", 395 | "florida":"FL", 396 | "georgia":"GA", 397 | "guam":"GU", 398 | "hawaii":"HI", 399 | "idaho":"ID", 400 | "illinois":"IL", 401 | "indiana":"IN", 402 | "iowa":"IA", 403 | "kansas":"KS", 404 | "kentucky":"KY", 405 | "louisiana":"LA", 406 | "maine":"ME", 407 | "marshall islands":"MH", 408 | "maryland":"MD", 409 | "massachusetts":"MA", 410 | "michigan":"MI", 411 | "minnesota":"MN", 412 | "mississippi":"MS", 413 | "missouri":"MO", 414 | "montana":"MT", 415 | "nebraska":"NE", 416 | "nevada":"NV", 417 | "new hampshire":"NH", 418 | "new jersey":"NJ", 419 | "new mexico":"NM", 420 | "new york":"NY", 421 | "north carolina":"NC", 422 | "north dakota":"ND", 423 | "northern mariana islands":"MP", 424 | "ohio":"OH", 425 | "oklahoma":"OK", 426 | "oregon":"OR", 427 | "palau":"PW", 428 | "pennsylvania":"PA", 429 | "puerto rico":"PR", 430 | "rhode island":"RI", 431 | "south carolina":"SC", 432 | "south dakota":"SD", 433 | "tennessee":"TN", 434 | "texas":"TX", 435 | "utah":"UT", 436 | "vermont":"VT", 437 | "virgin islands":"VI", 438 | "virginia":"VA", 439 | "washington":"WA", 440 | "west virginia":"WV", 441 | "wisconsin":"WI", 442 | "wyoming":"WY" 443 | } 444 | 445 | STATE_NAMES = dict((v,k) for k,v in STATE_CODES.items()) 446 | 447 | STATE_FIPS = { 448 | "01":"AL", 449 | "02":"AK", 450 | "04":"AZ", 451 | "05":"AR", 452 | "06":"CA", 453 | "08":"CO", 454 | "09":"CT", 455 | "10":"DE", 456 | "11":"DC", 457 | "12":"FL", 458 | "13":"GA", 459 | "15":"HI", 460 | "16":"ID", 461 | "17":"IL", 462 | "18":"IN", 463 | "19":"IA", 464 | "20":"KS", 465 | "21":"KY", 466 | "22":"LA", 467 | "23":"ME", 468 | "24":"MD", 469 | "25":"MA", 470 | "26":"MI", 471 | "27":"MN", 472 | "28":"MS", 473 | "29":"MO", 474 | "30":"MT", 475 | "31":"NE", 476 | "32":"NV", 477 | "33":"NH", 478 | "34":"NJ", 479 | "35":"NM", 480 | "36":"NY", 481 | "37":"NC", 482 | "38":"ND", 483 | "39":"OH", 484 | "40":"OK", 485 | "41":"OR", 486 | "42":"PA", 487 | "44":"RI", 488 | "45":"SC", 489 | "46":"SD", 490 | "47":"TN", 491 | "48":"TX", 492 | "49":"UT", 493 | "50":"VT", 494 | "51":"VA", 495 | "53":"WA", 496 | "54":"WV", 497 | "55":"WI", 498 | "56":"WY", 499 | "72":"PR", 500 | "78":"VI" 501 | } 502 | 503 | FIPS_STATES = dict((v,k) for k,v in STATE_FIPS.items()) 504 | 505 | class Regexes(object): 506 | street_type = re.compile('|'.join(Streets.STREET_TYPES_LIST), re.IGNORECASE) 507 | number = re.compile(r'\d+-?\d*') 508 | fraction = re.compile(r'\d+\/\d+') 509 | state = re.compile('|'.join([v.replace(' ', '\\s') for v in (list(States.STATE_CODES.values()) + list(States.STATE_CODES))]), re.IGNORECASE) 510 | direct = re.compile('|'.join(Directions.DIRECTIONAL.keys()) + '|' + '|'.join([(''.join([n+'\\.' for n in v])+'|'+v) for v in sorted(Directions.DIRECTIONAL.values(), key=len, reverse=True)]), re.IGNORECASE) 511 | zip_code = re.compile(r'(\d{5})(?:-(\d{4}))?') 512 | corner = re.compile(r'(?:\band\b|\bat\b|&|\@)', re.IGNORECASE) 513 | unit = re.compile(r'(?:(su?i?te|p\W*[om]\W*b(?:ox)?|dept|apt|trlr|lot|rm|ste|apartment|ro*m|fl|unit|box)\W+|\#\W*)([\w-]+)', re.IGNORECASE) 514 | street = re.compile(r'(?:(?:({0})\W+({1})\b)|(?:({0})\W+)?(?:([^,]+)(?:[^\w,]+({1})\b)(?:[^\w,]+({0})\b)?|([^,]*\d)({0})\b|([^,]+?)(?:[^\w,]+({1})\b)?(?:[^\w,]+({0})\b)?))'.format(direct.pattern, street_type.pattern), re.IGNORECASE) 515 | place = re.compile(r'(?:([^\d,]+?)\W+(${0})\W*)?(?:{1})?'.format(state.pattern,zip_code.pattern), re.IGNORECASE) 516 | address = re.compile(r'\A\W*({0})\W*(?:{1}\W*)?{2}\W+(?:{3}\W+)?{4}\W*\Z'.format(number.pattern, fraction.pattern, street.pattern, unit.pattern, place.pattern), re.IGNORECASE) 517 | intersection = re.compile('\A\W*{0}\W*?\s+{1}\s+{0}\W+{2}\W*\Z'.format(street.pattern, corner.pattern, place.pattern), re.IGNORECASE) 518 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import streetaddress as sa 2 | import unittest 3 | 4 | class addressTests(unittest.TestCase) : 5 | def test_all(self) : 6 | 7 | assert sa.parse('1005 Gravenstein Hwy 95472') == {'number' :'1005', 8 | 'street' : 'Gravenstein', 9 | 'zip' : '95472', 10 | 'type' :'Hwy'} 11 | 12 | assert sa.parse('1005 Gravenstein Hwy, 95472') == {'number' :'1005', 13 | 'street' : 'Gravenstein', 14 | 'zip' : '95472', 15 | 'type' :'Hwy'} 16 | 17 | assert sa.parse('1005 Gravenstein Hwy N, 95472') == {'number' :'1005', 18 | 'street' : 'Gravenstein', 19 | 'zip' : '95472', 20 | 'type' :'Hwy', 21 | 'suffix' : 'N'} 22 | assert sa.parse('1005 Gravenstein Highway North, 95472') == {'number' :'1005', 23 | 'street' : 'Gravenstein', 24 | 'zip' : '95472', 25 | 'type' :'Hwy', 26 | 'suffix' : 'N'} 27 | 28 | assert sa.parse('1005 N Gravenstein Highway, Sebastopol, CA') == {'number' :'1005', 29 | 'street' : 'Gravenstein', 30 | 31 | 'type' :'Hwy', 32 | 'prefix' : 'N', 33 | 'city' : 'Sebastopol', 34 | 'state' : 'CA'} 35 | 36 | 37 | assert sa.parse("1005 N Gravenstein Highway, Suite 500, Sebastopol, CA") == { 38 | 'number' : '1005', 39 | 'street' : 'Gravenstein', 40 | 'state' : 'CA', 41 | 'city' : 'Sebastopol', 42 | 'type' : 'Hwy', 43 | 'prefix' : 'N', 44 | 'sec_unit_type' : 'Suite', 45 | 'sec_unit_num' : '500', 46 | } 47 | 48 | 49 | 50 | 51 | 52 | assert sa.parse("1005 N Gravenstein Highway, Suite 500, Sebastopol, CA") == { 53 | 'number' : '1005', 54 | 'street' : 'Gravenstein', 55 | 'state' : 'CA', 56 | 'city' : 'Sebastopol', 57 | 'type' : 'Hwy', 58 | 'prefix' : 'N', 59 | 'sec_unit_type' : 'Suite', 60 | 'sec_unit_num' : '500', 61 | } 62 | assert sa.parse("1005 N Gravenstein Hwy Suite 500 Sebastopol, CA") == { 63 | 'number' : '1005', 64 | 'street' : 'Gravenstein', 65 | 'state' : 'CA', 66 | 'city' : 'Sebastopol', 67 | 'type' : 'Hwy', 68 | 'prefix' : 'N', 69 | 'sec_unit_type' : 'Suite', 70 | 'sec_unit_num' : '500', 71 | } 72 | 73 | assert sa.parse("1005 N Gravenstein Highway, Sebastopol, CA, 95472") == { 74 | 'number' : '1005', 75 | 'street' : 'Gravenstein', 76 | 'state' : 'CA', 77 | 'city' : 'Sebastopol', 78 | 'zip' : '95472', 79 | 'type' : 'Hwy', 80 | 'prefix' : 'N' 81 | } 82 | assert sa.parse("1005 N Gravenstein Highway Sebastopol CA 95472") == { 83 | 'number' : '1005', 84 | 'street' : 'Gravenstein', 85 | 'state' : 'CA', 86 | 'city' : 'Sebastopol', 87 | 'zip' : '95472', 88 | 'type' : 'Hwy', 89 | 'prefix' : 'N' 90 | } 91 | assert sa.parse("1005 Gravenstein Hwy N Sebastopol CA") == { 92 | 'number' : '1005', 93 | 'street' : 'Gravenstein', 94 | 'state' : 'CA', 95 | 'city' : 'Sebastopol', 96 | 'suffix' : 'N', 97 | 'type' : 'Hwy', 98 | } 99 | assert sa.parse("1005 Gravenstein Hwy N, Sebastopol CA") == { 100 | 'number' : '1005', 101 | 'street' : 'Gravenstein', 102 | 'state' : 'CA', 103 | 'city' : 'Sebastopol', 104 | 'suffix' : 'N', 105 | 'type' : 'Hwy', 106 | } 107 | 108 | assert sa.parse("1005 Gravenstein Hwy, N Sebastopol CA") == { 109 | 'number' : '1005', 110 | 'street' : 'Gravenstein', 111 | 'state' : 'CA', 112 | 'city' : 'North Sebastopol', 113 | 'type' : 'Hwy', 114 | } 115 | assert sa.parse("1005 Gravenstein Hwy, North Sebastopol CA") == { 116 | 'number' : '1005', 117 | 'street' : 'Gravenstein', 118 | 'state' : 'CA', 119 | 'city' : 'North Sebastopol', 120 | 'type' : 'Hwy', 121 | } 122 | assert sa.parse("1005 Gravenstein Hwy Sebastopol CA") == { 123 | 'number' : '1005', 124 | 'street' : 'Gravenstein', 125 | 'state' : 'CA', 126 | 'city' : 'Sebastopol', 127 | 'type' : 'Hwy', 128 | } 129 | assert sa.parse("115 Broadway San Francisco CA") == { 130 | 'number' : '115', 131 | 'street' : 'Broadway', 132 | 'state' : 'CA', 133 | 'city' : 'San Francisco', 134 | } 135 | assert sa.parse("7800 Mill Station Rd, Sebastopol, CA 95472") == { 136 | 'number' : '7800', 137 | 'street' : 'Mill Station', 138 | 'state' : 'CA', 139 | 'city' : 'Sebastopol', 140 | 'zip' : '95472', 141 | 'type' : 'Rd', 142 | } 143 | assert sa.parse("7800 Mill Station Rd Sebastopol CA 95472") == { 144 | 'number' : '7800', 145 | 'street' : 'Mill Station', 146 | 'state' : 'CA', 147 | 'city' : 'Sebastopol', 148 | 'zip' : '95472', 149 | 'type' : 'Rd', 150 | } 151 | 152 | assert sa.parse("1005 State Highway 116 Sebastopol CA 95472") == { 153 | 'number' : '1005', 154 | 'street' : 'State Highway 116', 155 | 'state' : 'CA', 156 | 'city' : 'Sebastopol', 157 | 'zip' : '95472', 158 | 'type' : 'Hwy', 159 | } 160 | assert sa.parse("1600 Pennsylvania Ave. Washington DC") == { 161 | 'number' : '1600', 162 | 'street' : 'Pennsylvania', 163 | 'state' : 'DC', 164 | 'city' : 'Washington', 165 | 'type' : 'Ave', 166 | } 167 | assert sa.parse("1600 Pennsylvania Avenue Washington DC") == { 168 | 'number' : '1600', 169 | 'street' : 'Pennsylvania', 170 | 'state' : 'DC', 171 | 'city' : 'Washington', 172 | 'type' : 'Ave', 173 | } 174 | assert sa.parse("48S 400E, Salt Lake City UT") == { 175 | 'type' : '', 176 | 'number' : '48', 177 | 'street' : '400', 178 | 'state' : 'UT', 179 | 'city' : 'Salt Lake City', 180 | 'suffix' : 'E', 181 | 'prefix' : 'S' 182 | } 183 | assert sa.parse("550 S 400 E #3206, Salt Lake City UT 84111") == { 184 | 'number' : '550', 185 | 'street' : '400', 186 | 'state' : 'UT', 187 | 'sec_unit_num' : '3206', 188 | 'zip' : '84111', 189 | 'city' : 'Salt Lake City', 190 | 'suffix' : 'E', 191 | 'type' : '', 192 | 'sec_unit_type' : '#', 193 | 'prefix' : 'S' 194 | } 195 | assert sa.parse("6641 N 2200 W Apt D304 Park City, UT 84098") == { 196 | 'number' : '6641', 197 | 'street' : '2200', 198 | 'state' : 'UT', 199 | 'sec_unit_num' : 'D304', 200 | 'zip' : '84098', 201 | 'city' : 'Park City', 202 | 'suffix' : 'W', 203 | 'type' : '', 204 | 'sec_unit_type' : 'Apt', 205 | 'prefix' : 'N' 206 | } 207 | assert sa.parse("100 South St, Philadelphia, PA") == { 208 | 'number' : '100', 209 | 'street' : 'South', 210 | 'state' : 'PA', 211 | 'city' : 'Philadelphia', 212 | 'type' : 'St', 213 | } 214 | assert sa.parse("100 S.E. Washington Ave, Minneapolis, MN") == { 215 | 'number' : '100', 216 | 'street' : 'Washington', 217 | 'state' : 'MN', 218 | 'city' : 'Minneapolis', 219 | 'type' : 'Ave', 220 | 'prefix' : 'SE' 221 | } 222 | assert sa.parse("3813 1/2 Some Road, Los Angeles, CA") == { 223 | 'number' : '3813', 224 | 'street' : 'Some', 225 | 'state' : 'CA', 226 | 'city' : 'Los Angeles', 227 | 'type' : 'Rd', 228 | } 229 | assert sa.parse("Mission & Valencia San Francisco CA") == { 230 | 'type1' : '', 231 | 'type2' : '', 232 | 'street1' : 'Mission', 233 | 'state' : 'CA', 234 | 'city' : 'San Francisco', 235 | 'street2' : 'Valencia' 236 | } 237 | assert sa.parse("Mission & Valencia, San Francisco CA") == { 238 | 'type1' : '', 239 | 'type2' : '', 240 | 'street1' : 'Mission', 241 | 'state' : 'CA', 242 | 'city' : 'San Francisco', 243 | 'street2' : 'Valencia' 244 | } 245 | assert sa.parse("Mission St and Valencia St San Francisco CA") == { 246 | 'type1' : 'St', 247 | 'type2' : 'St', 248 | 'street1' : 'Mission', 249 | 'state' : 'CA', 250 | 'city' : 'San Francisco', 251 | 'street2' : 'Valencia' 252 | } 253 | assert sa.parse("Mission St & Valencia St San Francisco CA") == { 254 | 'type1' : 'St', 255 | 'type2' : 'St', 256 | 'street1' : 'Mission', 257 | 'state' : 'CA', 258 | 'city' : 'San Francisco', 259 | 'street2' : 'Valencia' 260 | } 261 | assert sa.parse("Mission and Valencia Sts San Francisco CA") == { 262 | 'type1' : 'St', 263 | 'type2' : 'St', 264 | 'street1' : 'Mission', 265 | 'state' : 'CA', 266 | 'city' : 'San Francisco', 267 | 'street2' : 'Valencia' 268 | } 269 | assert sa.parse("Mission & Valencia Sts. San Francisco CA") == { 270 | 'type1' : 'St', 271 | 'type2' : 'St', 272 | 'street1' : 'Mission', 273 | 'state' : 'CA', 274 | 'city' : 'San Francisco', 275 | 'street2' : 'Valencia' 276 | } 277 | assert sa.parse("Mission & Valencia Streets San Francisco CA") == { 278 | 'type1' : 'St', 279 | 'type2' : 'St', 280 | 'street1' : 'Mission', 281 | 'state' : 'CA', 282 | 'city' : 'San Francisco', 283 | 'street2' : 'Valencia' 284 | } 285 | assert sa.parse("Mission Avenue and Valencia Street San Francisco CA") == { 286 | 'type1' : 'Ave', 287 | 'type2' : 'St', 288 | 'street1' : 'Mission', 289 | 'state' : 'CA', 290 | 'city' : 'San Francisco', 291 | 'street2' : 'Valencia' 292 | } 293 | assert sa.parse("1 First St, e San Jose CA") == { # lower case city direction 294 | 'number' : '1', 295 | 'street' : 'First', 296 | 'state' : 'CA', 297 | 'city' : 'East San Jose', 298 | 'type' : 'St', 299 | } 300 | assert sa.parse("123 Maple Rochester, New York") == { # space in state name 301 | 'type' : '', 302 | 'number' : '123', 303 | 'street' : 'Maple', 304 | 'state' : 'NY', 305 | 'city' : 'Rochester', 306 | } 307 | assert sa.parse("233 S Wacker Dr 60606-6306") == { # zip+4 with hyphen 308 | 'number' : '233', 309 | 'street' : 'Wacker', 310 | 'zip' : '60606', 311 | 'type' : 'Dr', 312 | 'prefix' : 'S' 313 | } 314 | assert sa.parse("233 S Wacker Dr 606066306") == { # zip+4 without hyphen 315 | 'number' : '233', 316 | 'street' : 'Wacker', 317 | 'zip' : '60606', 318 | 'type' : 'Dr', 319 | 'prefix' : 'S' 320 | } 321 | assert sa.parse("233 S Wacker Dr lobby 60606") == { # unnumbered secondary unit type 322 | 'number' : '233', 323 | 'street' : 'Wacker', 324 | 'zip' : '60606', 325 | 'type' : 'Dr', 326 | 'prefix' : 'S', 327 | 'sec_unit_type' : 'lobby', 328 | } 329 | assert sa.parse("(233 S Wacker Dr lobby 60606)") == { # surrounding punctuation 330 | 'number' : '233', 331 | 'street' : 'Wacker', 332 | 'zip' : '60606', 333 | 'type' : 'Dr', 334 | 'prefix' : 'S', 335 | 'sec_unit_type' : 'lobby', 336 | } 337 | assert sa.parse("#42 233 S Wacker Dr 60606") == { # leading numbered secondary unit type 338 | 'sec_unit_num' : '42', 339 | 'zip' : '60606', 340 | 'number' : '233', 341 | 'street' : 'Wacker', 342 | 'sec_unit_type' : '#', 343 | 'type' : 'Dr', 344 | 'prefix' : 'S' 345 | } 346 | assert sa.parse("lt42 99 Some Road, Some City LA") == { # no space before sec_unit_num 347 | 'sec_unit_num' : '42', 348 | 'city' : 'Some City', 349 | 'number' : '99', 350 | 'street' : 'Some', 351 | 'sec_unit_type' : 'lt', 352 | 'type' : 'Rd', 353 | 'state' : 'LA' 354 | } 355 | assert sa.parse("36401 County Road 43, Eaton, CO 80615") == { # numbered County Road 356 | 'city' : 'Eaton', 357 | 'zip' : '80615', 358 | 'number' : '36401', 359 | 'street' : 'County Road 43', 360 | 'type' : 'Rd', 361 | 'state' : 'CO' 362 | } 363 | assert sa.parse("1234 COUNTY HWY 60E, Town, CO 12345") == { 364 | 'city' : 'Town', 365 | 'zip' : '12345', 366 | 'number' : '1234', 367 | 'street' : 'COUNTY HWY 60', 368 | 'suffix' : 'E', 369 | 'type' : '', # ? 370 | 'state' : 'CO' 371 | } 372 | 373 | if __name__ == '__main__': 374 | unittest.main() 375 | --------------------------------------------------------------------------------