├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README
├── README.rst
├── address
    ├── __init__.py
    ├── address.py
    ├── address_list.py
    ├── cities.csv
    ├── dstk.py
    ├── streets.csv
    ├── suffixes.csv
    └── test
    │   ├── __init__.py
    │   └── test_address.py
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | apartment_list.csv
 2 | output.txt
 3 | *.py[cod]
 4 | .idea/
 5 | MANIFEST
 6 | 
 7 | # C extensions
 8 | *.so
 9 | 
10 | # Packages
11 | *.egg
12 | *.egg-info
13 | dist
14 | build
15 | eggs
16 | parts
17 | bin
18 | var
19 | sdist
20 | develop-eggs
21 | .installed.cfg
22 | lib
23 | lib64
24 | 
25 | # Installer logs
26 | pip-log.txt
27 | 
28 | # Unit test / coverage reports
29 | .coverage
30 | .tox
31 | nosetests.xml
32 | 
33 | # Translations
34 | *.mo
35 | 
36 | # Mr Developer
37 | .mr.developer.cfg
38 | .project
39 | .pydevproject
40 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2013, Swoop Search LLC
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
 5 | 
 6 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
 7 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 8 | Neither the name of Swoop Search LLC nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
 9 | 
10 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
11 | AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
12 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
13 | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.rst
2 | 


--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
  1 | address
  2 | =========
  3 | 
  4 | address is an address parsing library, taking the guesswork out of using addresses in your applications. We use it as part of our apartment search and apartment spider applications.
  5 | 
  6 | Installation
  7 | ------------
  8 | 
  9 | `pip install address`
 10 | 
 11 | Example
 12 | -------
 13 | 
 14 | First, we create an AddressParser. AddressParser allows us to feed in lists of cities, streets, and address suffixes. Then we call
 15 | parse_address on our address string, which returns an Address instance with all the attributes filled out. From there, we can
 16 | print parts of the address, change them, validate them, create a database model to store them, or anything else.
 17 | 
 18 | ```python
 19 | from address import AddressParser, Address
 20 | 
 21 | ap = AddressParser()
 22 | address = ap.parse_address('123 West Mifflin Street, Madison, WI, 53703')
 23 | print "Address is: {0} {1} {2} {3}".format(address.house_number, address.street_prefix, address.street, address.street_suffix)
 24 | 
 25 | > Address is: 123 W. Mifflin St.
 26 | ```
 27 | 
 28 | AddressParser
 29 | -------------
 30 | 
 31 | `AddressParser(self, suffixes=None, cities=None, streets=None)`
 32 | 
 33 | suffixes, cities, and streets all accept lists as arguments. If you leave them as none, they will read default files
 34 | from the package, namely suffixes.csv, cities.csv, and streets.csv. Streets is intentionally blank.
 35 | 
 36 | You can provide lists of acceptable suffixes, cities, and streets to lower your false positives. If you know all
 37 | the addresses you are processing are in a small area, you can provide a list of the cities in the area and should
 38 | get more accurate results. If you are only doing one city, you could provide that single city in a list, and a list
 39 | of all streets in that city.
 40 | 
 41 | 
 42 | Address
 43 | -------
 44 | 
  45 | Addresses get returned by AddressParser.parse_address(). They have the following attributes:
 46 | 
 47 | `house_number`
 48 | 
 49 | The number on a house. This is required for all valid addresses. E.g. __123__ W. Mifflin St.
 50 | 
 51 | `street_prefix`
 52 | 
 53 | The direction before the street name. Always represented as one or two letters followed by a period. Not required.
 54 | E.g. 123 __W.__ Mifflin St.
 55 | 
 56 | `street`
 57 | 
 58 | The name of the street. Potentially multiple words. This is required for a valid address. E.g. 123 W. __Mifflin__ St.
 59 | 
 60 | `street_suffix`
 61 | 
 62 | The ending of a street. This will always be the USPS abbreviation followed by a period. Not required, but highly recommended.
 63 |  E.g. 123 W. Mifflin __St.__
 64 | 
 65 | `apartment`
 66 | 
 67 | Apartment number or unit style or any number of things signifying a specific part of an address. Not required. E.g. 123
 68 | W. Mifflin St. __Apt 10__
 69 | 
  70 | `building`
 71 | 
  72 | Sometimes addresses are grouped into buildings, or are more commonly known by building names. Not required, and often
  73 |  in parentheses. E.g. 123 W. Mifflin St. Apt 10 __(The Estates)__
 74 | 
 75 | `city`
 76 | 
 77 | The city part of the address, preferably following a comma. E.g. 123 W. Mifflin St., __Madison__, WI 53703
 78 | 
 79 | `state`
 80 | 
 81 | The state of the address, preferably following the city and a comma. Always two capitalized letters. E.g. 123 W. Mifflin St., Madison, __WI__ 53703
 82 | 
 83 | `zip`
 84 | 
 85 | The 5 digit zip code of the address, preferably following the state. 9 digit zips not yet supported. E.g. 123 W. Mifflin St., Madison, WI __53703__
 86 | 
 87 | `full_address()`
 88 | 
 89 | Returns a human readable version of the address for display. Follows the same style rules as the above attributes.
 90 | Example return: (The Estates) 123 W. Mifflin St. Apt 10, Madison, WI 53703
 91 | 
 92 | Todo
 93 | ----
 94 | 
 95 | * Add verification of an address through Google Maps API, given an API key.
 96 | 
 97 | * Allow custom validation conditions in AddressParser for what counts as a correct address or not.
 98 | 
 99 | * Add exceptions for incorrect addresses instead of silent failing and letting user validate.
100 | 
101 | GitHub
102 | ------
103 | 
104 | File support requests and obtain the source from https://github.com/SwoopSearch/pyaddress
105 | 
106 | Authors
107 | -------
108 | 
109 | * Josh Gachnang
110 | 
111 | * Rob Jauquet
112 | 
113 | License and Copyright
114 | -------
115 | 
116 | Copyright (c) 2013 Swoop Search LLC.
117 | 
118 | This library is released under the New BSD License.
119 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
  1 | address
  2 | =========
  3 | 
  4 | address is an address parsing library, taking the guesswork out of
  5 | using addresses in your applications. We use it as part of our apartment
  6 | search and apartment spider applications.
  7 | 
  8 | Installation
  9 | ------------
 10 | 
 11 | ::
 12 | 
 13 |     pip install address
 14 | 
 15 | Example
 16 | -------
 17 | 
 18 | First, we create an AddressParser. AddressParser allows us to feed in
 19 | lists of cities, streets, and address suffixes. Then we call
 20 | parse\_address on our address string, which returns an Address instance
 21 | with all the attributes filled out. From there, we can print parts of
 22 | the address, change them, validate them, create a database model to
 23 | store them, or anything else.
 24 | 
 25 | ::
 26 | 
 27 |     from address import AddressParser, Address
 28 | 
 29 |     ap = AddressParser()
 30 |     address = ap.parse_address('123 West Mifflin Street, Madison, WI, 53703')
 31 |     print "Address is: {0} {1} {2} {3}".format(address.house_number, address.street_prefix, address.street, address.street_suffix)
 32 | 
 33 |     > Address is: 123 W. Mifflin St.
 34 | 
 35 | AddressParser
 36 | -------------
 37 | 
 38 | ``AddressParser(self, suffixes=None, cities=None, streets=None)``
 39 | 
 40 | suffixes, cities, and streets all accept lists as arguments. If you
 41 | leave them as none, they will read default files from the package,
 42 | namely suffixes.csv, cities.csv, and streets.csv. Streets is
 43 | intentionally blank.
 44 | 
 45 | You can provide lists of acceptable suffixes, cities, and streets to
 46 | lower your false positives. If you know all the addresses you are
 47 | processing are in a small area, you can provide a list of the cities in
 48 | the area and should get more accurate results. If you are only doing one
 49 | city, you could provide that single city in a list, and a list of all
 50 | streets in that city.
 51 | 
 52 | Address
 53 | -------
 54 | 
  55 | Addresses get returned by AddressParser.parse\_address(). They have the
 56 | following attributes:
 57 | 
 58 | ``house_number``
 59 | 
 60 | The number on a house. This is required for all valid addresses. E.g.
 61 | **123** W. Mifflin St.
 62 | 
 63 | ``street_prefix``
 64 | 
 65 | The direction before the street name. Always represented as one or two
 66 | letters followed by a period. Not required. E.g. 123 **W.** Mifflin St.
 67 | 
 68 | ``street``
 69 | 
 70 | The name of the street. Potentially multiple words. This is required for
 71 | a valid address. E.g. 123 W. **Mifflin** St.
 72 | 
 73 | ``street_suffix``
 74 | 
 75 | The ending of a street. This will always be the USPS abbreviation
 76 | followed by a period. Not required, but highly recommended. E.g. 123 W.
 77 | Mifflin **St.**
 78 | 
 79 | ``apartment``
 80 | 
 81 | Apartment number or unit style or any number of things signifying a
 82 | specific part of an address. Not required. E.g. 123 W. Mifflin St. **Apt
 83 | 10**
 84 | 
  85 | ``building``
 86 | 
  87 | Sometimes addresses are grouped into buildings, or are more commonly
  88 | known by building names. Not required, and often in parentheses. E.g.
 89 | 123 W. Mifflin St. Apt 10 **(The Estates)**
 90 | 
 91 | ``city``
 92 | 
 93 | The city part of the address, preferably following a comma. E.g. 123 W.
 94 | Mifflin St., **Madison**, WI 53703
 95 | 
 96 | ``state``
 97 | 
 98 | The state of the address, preferably following the city and a comma.
 99 | Always two capitalized letters. E.g. 123 W. Mifflin St., Madison, **WI**
100 | 53703
101 | 
102 | ``zip``
103 | 
104 | The 5 digit zip code of the address, preferably following the state. 9
105 | digit zips not yet supported. E.g. 123 W. Mifflin St., Madison, WI
106 | **53703**
107 | 
108 | ``full_address()``
109 | 
110 | Returns a human readable version of the address for display. Follows the
111 | same style rules as the above attributes. Example return: (The Estates)
112 | 123 W. Mifflin St. Apt 10, Madison, WI 53703
113 | 
114 | Todo
115 | ----
116 | 
117 | -  Add verification of an address through Google Maps API, given an API
118 |    key.
119 | 
120 | -  Allow custom validation conditions in AddressParser for what counts
121 |    as a correct address or not.
122 | 
123 | -  Add exceptions for incorrect addresses instead of silent failing and
124 |    letting user validate.
125 | 
126 | GitHub
127 | ------
128 | 
129 | File support requests and obtain the source from
130 | https://github.com/SwoopSearch/pyaddress
131 | 
132 | Authors
133 | -------
134 | 
135 | -  Josh Gachnang
136 | 
137 | -  Rob Jauquet
138 | 
139 | License and Copyright
140 | ---------------------
141 | 
142 | Copyright (c) 2013 Swoop Search LLC.
143 | 
144 | This library is released under the New BSD License.
145 | 


--------------------------------------------------------------------------------
/address/__init__.py:
--------------------------------------------------------------------------------
1 | from .address import Address, AddressParser
2 | 


--------------------------------------------------------------------------------
/address/address.py:
--------------------------------------------------------------------------------
  1 | # Meant to parse out address lines, minus city,state,zip into a usable dict for address matching
  2 | # Ignores periods and commas, because no one cares.
  3 | 
  4 | import re
  5 | import csv
  6 | import os
  7 | import dstk
  8 | import sys
  9 | 
 10 | # Keep lowercase, no periods
 11 | # Requires numbers first, then option dash plus numbers.
 12 | street_num_regex = r'^(\d+)(-?)(\d*)$'
 13 | 
 14 | apartment_regex_number = r'(#?)(\d*)(\w*)'
 15 | cwd = os.path.dirname(os.path.realpath(__file__))
 16 | 
 17 | 
 18 | class AddressParser(object):
 19 |     """
 20 |     AddressParser will be use to create Address objects. It contains a list of preseeded cities, states, prefixes,
 21 |     suffixes, and street names that will help the Address object correctly parse the given string. It is loaded
 22 |     with defaults that work in the average case, but can be adjusted for specific cases.
 23 |     """
 24 |     suffixes = {}
 25 |     # Lower case list of cities, used as a hint
 26 |     cities = []
 27 |     # Lower case list of streets, used as a hint
 28 |     streets = []
 29 |     prefixes = {
 30 |         "n": "N.", "e": "E.", "s": "S.", "w": "W.", "ne": "NE.", "nw": "NW.", 'se': "SE.", 'sw': "SW.", 'north': "N.",
 31 |         'east': "E.", 'south': "S.",
 32 |         'west': "W.", 'northeast': "NE.", 'northwest': "NW.", 'southeast': "SE.", 'southwest': "SW."}
 33 |     states = {
 34 |         'Mississippi': 'MS', 'Oklahoma': 'OK', 'Delaware': 'DE', 'Minnesota': 'MN', 'Illinois': 'IL', 'Arkansas': 'AR',
 35 |         'New Mexico': 'NM', 'Indiana': 'IN', 'Maryland': 'MD', 'Louisiana': 'LA', 'Idaho': 'ID', 'Wyoming': 'WY',
 36 |         'Tennessee': 'TN', 'Arizona': 'AZ', 'Iowa': 'IA', 'Michigan': 'MI', 'Kansas': 'KS', 'Utah': 'UT',
 37 |         'Virginia': 'VA', 'Oregon': 'OR', 'Connecticut': 'CT', 'Montana': 'MT', 'California': 'CA',
 38 |         'Massachusetts': 'MA', 'West Virginia': 'WV', 'South Carolina': 'SC', 'New Hampshire': 'NH',
 39 |         'Wisconsin': 'WI', 'Vermont': 'VT', 'Georgia': 'GA', 'North Dakota': 'ND', 'Pennsylvania': 'PA',
 40 |         'Florida': 'FL', 'Alaska': 'AK', 'Kentucky': 'KY', 'Hawaii': 'HI', 'Nebraska': 'NE', 'Missouri': 'MO',
 41 |         'Ohio': 'OH', 'Alabama': 'AL', 'New York': 'NY', 'South Dakota': 'SD', 'Colorado': 'CO', 'New Jersey': 'NJ',
 42 |         'Washington': 'WA', 'North Carolina': 'NC', 'District of Columbia': 'DC', 'Texas': 'TX', 'Nevada': 'NV',
 43 |         'Maine': 'ME', 'Rhode Island': 'RI'}
 44 | 
 45 |     def __init__(self, suffixes=None, cities=None, streets=None, backend="default", dstk_api_base=None, logger=None, required_confidence=0.65):
 46 |         """
 47 |         suffixes, cities and streets provide a chance to use different lists than the provided lists.
 48 |         suffixes is probably good for most users, unless you have some suffixes not recognized by USPS.
 49 |         cities is a very expansive list that may lead to false positives in some cases. If you only have a few cities
 50 |         you know will show up, provide your own list for better accuracy. If you are doing addresses across the US,
 51 |         the provided list is probably better.
 52 |         streets can be used to limit the list of possible streets the address are on. It comes blank by default and
 53 |         uses positional clues instead. If you are instead just doing a couple cities, a list of all possible streets
 54 |         will decrease incorrect street names.
 55 |         Valid backends include "default" and "dstk". If backend is dstk, it requires a dstk_api_base. Example of
 56 |         dstk_api_base would be 'http://example.com'.
 57 |         """
 58 |         self.logger = logger
 59 |         self.backend = backend
 60 |         self.dstk_api_base = dstk_api_base
 61 |         self.required_confidence = required_confidence
 62 |         if suffixes:
 63 |             self.suffixes = suffixes
 64 |         else:
 65 |             self.load_suffixes(os.path.join(cwd, "suffixes.csv"))
 66 |         if cities:
 67 |             self.cities = cities
 68 |         else:
 69 |             self.load_cities(os.path.join(cwd, "cities.csv"))
 70 |         if streets:
 71 |             self.streets = streets
 72 |         else:
 73 |             self.load_streets(os.path.join(cwd, "streets.csv"))
 74 |         if backend == "dstk":
 75 |             if dstk_api_base is None:
 76 |                 raise ValueError("dstk_api_base is required for dstk backend.")
 77 |             self.dstk = dstk.DSTK({'apiBase': dstk_api_base})
 78 |         elif backend == "default":
 79 |             pass
 80 |         else:
 81 |             raise ValueError("backend must be either 'default' or 'dstk'.")
 82 | 
 83 |     def parse_address(self, address, line_number=-1):
 84 |         """
 85 |         Return an Address object from the given address. Passes itself to the Address constructor to use all the custom
 86 |         loaded suffixes, cities, etc.
 87 |         """
 88 |         return Address(address, self, line_number, self.logger)
 89 | 
 90 |     def dstk_multi_address(self, address_list):
 91 |         if self.backend != "dstk":
 92 |             raise ValueError("Only allowed for DSTK backends.")
 93 |         if self.logger: self.logger.debug("Sending {0} possible addresses to DSTK".format(len(address_list)))
 94 |         multi_address = self.dstk.street2coordinates(address_list)
 95 |         if self.logger: self.logger.debug("Received {0} addresses from DSTK".format(len(multi_address)))
 96 |         # if self.logger: self.logger.debug("End street2coords")
 97 |         addresses = []
 98 |         # if self.logger: self.logger.debug("Multi Addresses: {0}".format(multi_address))
 99 |         for address, dstk_return in multi_address.items():
100 |             try:
101 |                 if dstk_return is None:
102 |                     # if self.logger: self.logger.debug("DSTK None return for: {0}".format(address))
103 |                     continue
104 |                 addresses.append(Address(address, self, -1, self.logger, dstk_pre_parse=dstk_return))
105 |                 if self.logger: self.logger.debug("DSTK Address Appended: {0}".format(dstk_return))
106 |             except InvalidAddressException as e:
107 |                 # if self.logger: self.logger.debug("Error from dstk Address: {0}".format(e.message))
108 |                 continue
109 |             except DSTKConfidenceTooLowException as e:
110 |                 continue
111 |         return addresses
112 | 
113 | 
114 |     def load_suffixes(self, filename):
115 |         """
116 |         Build the suffix dictionary. The keys will be possible long versions, and the values will be the
117 |         accepted abbreviations. Everything should be stored using the value version, and you can search all
118 |         by using building a set of self.suffixes.keys() and self.suffixes.values().
119 |         """
120 |         with open(filename, 'r') as f:
121 |             for line in f:
122 |                 # Make sure we have key and value
123 |                 if len(line.split(',')) != 2:
124 |                     continue
125 |                     # Strip off newlines.
126 |                 self.suffixes[line.strip().split(',')[0]] = line.strip().split(',')[1]
127 | 
128 |     def load_cities(self, filename):
129 |         """
130 |         Load up all cities in lowercase for easier matching. The file should have one city per line, with no extra
131 |         characters. This isn't strictly required, but will vastly increase the accuracy.
132 |         """
133 |         with open(filename, 'r') as f:
134 |             for line in f:
135 |                 self.cities.append(line.strip().lower())
136 | 
137 |     def load_streets(self, filename):
138 |         """
139 |         Load up all streets in lowercase for easier matching. The file should have one street per line, with no extra
140 |         characters. This isn't strictly required, but will vastly increase the accuracy.
141 |         """
142 |         with open(filename, 'r') as f:
143 |             for line in f:
144 |                 self.streets.append(line.strip().lower())
145 | 
146 | 
147 | # Procedure: Go through backwards. First check for apartment number, then
148 | # street suffix, street name, street prefix, then building. For each sub,
149 | # check if that spot is already filled in the dict.
150 | class Address:
151 |     unmatched = False
152 |     house_number = None
153 |     street_prefix = None
154 |     street = None
155 |     street_suffix = None
156 |     apartment = None
157 |     # building = None
158 |     city = None
159 |     state = None
160 |     zip = None
161 |     original = None
162 |     # Only set for dstk
163 |     lat = None
164 |     lng = None
165 |     last_matched = None
166 |     unmatched = False
167 |     # Only used for debug
168 |     line_number = -1
169 |     # Confidence value from DSTK. 0 - 1, -1 for not set.
170 |     confidence = -1
171 | 
172 |     def __init__(self, address, parser, line_number=-1, logger=None, dstk_pre_parse=None):
173 |         """
174 |         @dstk_pre_parse: a single value from a dstk multiple street2coordinates return. @address would be the key then.
175 |         """
176 |         self.parser = parser
177 |         self.line_number = line_number
178 |         self.original = self._clean(address)
179 |         self.logger = logger
180 |         if address is None:
181 |             return
182 |         address = self.preprocess_address(address)
183 |         if parser.backend == "dstk":
184 |             # if self.logger: self.logger.debug("Preparsed: {0}".format(dstk_pre_parse))
185 |             self.dstk_parse(address, parser, pre_parsed_address=dstk_pre_parse)
186 |         elif parser.backend == "default":
187 |             self.parse_address(address)
188 |         else:
189 |             raise ValueError("Parser gave invalid backend, must be either 'default' or 'dstk'.")
190 | 
191 |         if self.house_number is None or self.house_number <= 0:
192 |             raise InvalidAddressException("Addresses must have house numbers.")
193 |         elif self.street is None or self.street == "":
194 |             raise InvalidAddressException("Addresses must have streets.")
195 |             # if self.house_number is None or self.street is None or self.street_suffix is None:
196 |             # raise ValueError("Street addresses require house_number, street, and street_suffix")
197 | 
198 |     def parse_address(self, address):
199 |         # print "YOU ARE PARSING AN ADDRESS"
200 |         # Save the original string
201 | 
202 |         # Get rid of periods and commas, split by spaces, reverse.
203 |         # Periods should not exist, remove them. Commas separate tokens. It's possible we can use commas for better guessing.
204 |         address = address.strip().replace('.', '')
205 |         # We'll use this for guessing.
206 |         self.comma_separated_address = address.split(',')
207 |         address = address.replace(',', '')
208 | 
209 |         # First, do some preprocessing
210 |         # address = self.preprocess_address(address)
211 | 
212 |         # Try all our address regexes. USPS says parse from the back.
213 |         address = reversed(address.split())
214 |         # Save unmatched to process after the rest is processed.
215 |         unmatched = []
216 |         # Use for contextual data
217 |         for token in address:
218 |         #            print token, self
219 |             # Check zip code first
220 |             if self.check_zip(token):
221 |                 continue
222 |             if self.check_state(token):
223 |                 continue
224 |             if self.check_city(token):
225 |                 continue
226 |             if self.check_street_suffix(token):
227 |                 continue
228 |             if self.check_house_number(token):
229 |                 continue
230 |             if self.check_street_prefix(token):
231 |                 continue
232 |             if self.check_street(token):
233 |                 continue
234 |                 # if self.check_building(token):
235 |             #     continue
236 |             if self.guess_unmatched(token):
237 |                 continue
238 |             unmatched.append(token)
239 | 
240 |         # Post processing
241 | 
242 |         for token in unmatched:
243 |         #            print "Unmatched token: ", token
244 |             if self.check_apartment_number(token):
245 |                 continue
246 |                 # print "Unmatched token: ", token
247 |             #            print "Original address: ", self.original
248 |             self.unmatched = True
249 | 
250 |     def preprocess_address(self, address):
251 |         """
252 |         Takes a basic address and attempts to clean it up, extract reasonably assured bits that may throw off the
253 |         rest of the parsing, and return the cleaned address.
254 |         """
255 |         # Run some basic cleaning
256 |         address = address.replace("# ", "#")
257 |         address = address.replace(" & ", "&")
258 |         # Clear the address of things like 'X units', which shouldn't be in an address anyway. We won't save this for now.
259 |         if re.search(r"-?-?\w+ units", address, re.IGNORECASE):
260 |             address = re.sub(r"-?-?\w+ units", "", address, flags=re.IGNORECASE)
261 |             # Sometimes buildings are put in parantheses.
262 |         # building_match = re.search(r"\(.*\)", address, re.IGNORECASE)
263 |         # if building_match:
264 |         #     self.building = self._clean(building_match.group().replace('(', '').replace(')', ''))
265 |         #     address = re.sub(r"\(.*\)", "", address, flags=re.IGNORECASE)
266 |         # Now let's get the apartment stuff out of the way. Using only sure match regexes, delete apartment parts from
267 |         # the address. This prevents things like "Unit" being the street name.
268 |         apartment_regexes = [r'#\w+ & \w+', '#\w+ rm \w+', "#\w+-\w", r'apt #{0,1}\w+', r'apartment #{0,1}\w+', r'#\w+',
269 |                              r'# \w+', r'rm \w+', r'unit #?\w+', r'units #?\w+', r'- #{0,1}\w+', r'no\s?\d+\w*',
270 |                              r'style\s\w{1,2}', r'townhouse style\s\w{1,2}']
271 |         for regex in apartment_regexes:
272 |             apartment_match = re.search(regex, address, re.IGNORECASE)
273 |             if apartment_match:
274 |             #                print "Matched regex: ", regex, apartment_match.group()
275 |                 self.apartment = self._clean(apartment_match.group())
276 |                 address = re.sub(regex, "", address, flags=re.IGNORECASE)
277 |             # Now check for things like ",  ," which throw off dstk
278 |         address = re.sub(r"\,\s*\,", ",", address)
279 |         return address
280 | 
281 |     def check_zip(self, token):
282 |         """
283 |         Returns true if token is matches a zip code (5 numbers). Zip code must be the last token in an address (minus anything
284 |         removed during preprocessing such as --2 units.
285 |         """
286 |         if self.zip is None:
287 |             # print "last matched", self.last_matched
288 |             if self.last_matched is not None:
289 |                 return False
290 |                 # print "zip check", len(token) == 5, re.match(r"\d{5}", token)
291 |             if len(token) == 5 and re.match(r"\d{5}", token):
292 |                 self.zip = self._clean(token)
293 | 
294 |                 return True
295 |         return False
296 | 
297 |     def check_state(self, token):
298 |         """
299 |         Check if state is in either the keys or values of our states list. Must come before the suffix.
300 |         """
301 |         # print "zip", self.zip
302 |         if len(token) == 2 and self.state is None:
303 |             if token.capitalize() in self.parser.states.keys():
304 |                 self.state = self._clean(self.parser.states[token.capitalize()])
305 |                 return True
306 |             elif token.upper() in self.parser.states.values():
307 |                 self.state = self._clean(token.upper())
308 |                 return True
309 |         if self.state is None and self.street_suffix is None and len(self.comma_separated_address) > 1:
310 |             if token.capitalize() in self.parser.states.keys():
311 |                 self.state = self._clean(self.parser.states[token.capitalize()])
312 |                 return True
313 |             elif token.upper() in self.parser.states.values():
314 |                 self.state = self._clean(token.upper())
315 |                 return True
316 |         return False
317 | 
318 |     def check_city(self, token):
319 |         """
320 |         Check if there is a known city from our city list. Must come before the suffix.
321 |         """
322 |         shortened_cities = {'saint': 'st.'}
323 |         if self.city is None and self.state is not None and self.street_suffix is None:
324 |             if token.lower() in self.parser.cities:
325 |                 self.city = self._clean(token.capitalize())
326 |                 return True
327 |             return False
328 |             # Check that we're in the correct location, and that we have at least one comma in the address
329 |         if self.city is None and self.apartment is None and self.street_suffix is None and len(
330 |                 self.comma_separated_address) > 1:
331 |             if token.lower() in self.parser.cities:
332 |                 self.city = self._clean(token.capitalize())
333 |                 return True
334 |             return False
335 |         # Multi word cities
336 |         if self.city is not None and self.street_suffix is None and self.street is None:
337 |             print "Checking for multi part city", token.lower(), token.lower() in shortened_cities.keys()
338 |             if token.lower() + ' ' + self.city in self.parser.cities:
339 |                 self.city = self._clean((token.lower() + ' ' + self.city).capitalize())
340 |                 return True
341 |             if token.lower() in shortened_cities.keys():
342 |                 token = shortened_cities[token.lower()]
343 |                 print "Checking for shorted multi part city", token.lower() + ' ' + self.city
344 |                 if token.lower() + ' ' + self.city.lower() in self.parser.cities:
345 |                     self.city = self._clean(token.capitalize() + ' ' + self.city.capitalize())
346 |                     return True
347 | 
348 |     def check_apartment_number(self, token):
349 |         """
350 |         Finds apartment, unit, #, etc, regardless of spot in string. This needs to come after everything else has been ruled out,
351 |         because it has a lot of false positives.
352 |         """
353 |         apartment_regexes = [r'#\w+ & \w+', '#\w+ rm \w+', "#\w+-\w", r'apt #{0,1}\w+', r'apartment #{0,1}\w+', r'#\w+',
354 |                              r'# \w+', r'rm \w+', r'unit #?\w+', r'units #?\w+', r'- #{0,1}\w+', r'no\s?\d+\w*',
355 |                              r'style\s\w{1,2}', r'\d{1,4}/\d{1,4}', r'\d{1,4}', r'\w{1,2}']
356 |         for regex in apartment_regexes:
357 |             if re.match(regex, token.lower()):
358 |                 self.apartment = self._clean(token)
359 |                 return True
360 |             #        if self.apartment is None and re.match(apartment_regex_number, token.lower()):
361 |             ##            print "Apt regex"
362 |             #            self.apartment = token
363 |             #            return True
364 |             ## If we come on apt or apartment and already have an apartment number, add apt or apartment to the front
365 |         if self.apartment and token.lower() in ['apt', 'apartment']:
366 |         #            print "Apt in a_n"
367 |             self.apartment = self._clean(token + ' ' + self.apartment)
368 |             return True
369 | 
370 |         if not self.street_suffix and not self.street and not self.apartment:
371 |         #            print "Searching for unmatched term: ", token, token.lower(),
372 |             if re.match(r'\d?\w?', token.lower()):
373 |                 self.apartment = self._clean(token)
374 |                 return True
375 |         return False
376 | 
377 |     def check_street_suffix(self, token):
378 |         """
379 |         Attempts to match a street suffix. If found, it will return the abbreviation, with the first letter capitalized
380 |         and a period after it. E.g. "St." or "Ave."
381 |         """
382 |         # Suffix must come before street
383 |         # print "Suffix check", token, "suffix", self.street_suffix, "street", self.street
384 |         if self.street_suffix is None and self.street is None:
385 |             # print "upper", token.upper()
386 |             if token.upper() in self.parser.suffixes.keys():
387 |                 suffix = self.parser.suffixes[token.upper()]
388 |                 self.street_suffix = self._clean(suffix.capitalize() + '.')
389 |                 return True
390 |             elif token.upper() in self.parser.suffixes.values():
391 |                 self.street_suffix = self._clean(token.capitalize() + '.')
392 |                 return True
393 |         return False
394 | 
395 |     def check_street(self, token):
396 |         """
397 |         Let's assume a street comes before a prefix and after a suffix. This isn't always the case, but we'll deal
398 |         with that in our guessing game. Also, two word street names...well...
399 | 
400 |         This check must come after the checks for house_number and street_prefix to help us deal with multi word streets.
401 |         """
402 |         # First check for single word streets between a prefix and a suffix
403 |         if self.street is None and self.street_suffix is not None and self.street_prefix is None and self.house_number is None:
404 |             self.street = self._clean(token.capitalize())
405 |             return True
406 |         # Now check for multiple word streets. This check must come after the check for street_prefix and house_number for this reason.
407 |         elif self.street is not None and self.street_suffix is not None and self.street_prefix is None and self.house_number is None:
408 |             self.street = self._clean(token.capitalize() + ' ' + self.street)
409 |             return True
410 |         if not self.street_suffix and not self.street and token.lower() in self.parser.streets:
411 |             self.street = self._clean(token)
412 |             return True
413 |         return False
414 | 
415 |     def check_street_prefix(self, token):
416 |         """
417 |         Finds street prefixes, such as N. or Northwest, before a street name. Standardizes to 1 or two letters, followed
418 |         by a period.
419 |         """
420 |         if self.street and not self.street_prefix and token.lower().replace('.', '') in self.parser.prefixes.keys():
421 |             self.street_prefix = self._clean(self.parser.prefixes[token.lower().replace('.', '')])
422 |             return True
423 |         return False
424 | 
425 |     def check_house_number(self, token):
426 |         """
427 |         Attempts to find a house number, generally the first thing in an address. If anything is in front of it,
428 |         we assume it is a building name.
429 |         """
430 |         if self.street and self.house_number is None and re.match(street_num_regex, token.lower()):
431 |             if '/' in token:
432 |                 token = token.split('/')[0]
433 |             if '-' in token:
434 |                 token = token.split('-')[0]
435 |             self.house_number = self._clean(str(token))
436 |             return True
437 |         return False
438 | 
439 |     def check_building(self, token):
440 |         """
441 |         Building name check. If we have leftover and everything else is set, probably building names.
442 |         Allows for multi word building names.
443 |         """
444 |         if self.street and self.house_number:
445 |             if not self.building:
446 |                 self.building = self._clean(token)
447 |             else:
448 |                 self.building = self._clean(token + ' ' + self.building)
449 |             return True
450 |         return False
451 | 
452 |     def guess_unmatched(self, token):
453 |         """
454 |         When we find something that doesn't match, we can make an educated guess and log it as such.
455 |         """
456 |         # Check if this is probably an apartment:
457 |         if token.lower() in ['apt', 'apartment']:
458 |             return False
459 |             # Stray dashes are likely useless
460 |         if token.strip() == '-':
461 |             return True
462 |             # Almost definitely not a street if it is one or two characters long.
463 |         if len(token) <= 2:
464 |             return False
465 |             # Let's check for a suffix-less street.
466 |         if self.street_suffix is None and self.street is None and self.street_prefix is None and self.house_number is None:
467 |             # Streets will just be letters
468 |             if re.match(r"[A-Za-z]", token):
469 |                 if self.line_number >= 0:
470 |                     pass
471 |                 #                    print "{0}: Guessing suffix-less street: ".format(self.line_number), token
472 |                 else:
473 |                 #                    print "Guessing suffix-less street: ", token
474 |                     pass
475 |                 self.street = self._clean(token.capitalize())
476 |                 return True
477 |         return False
478 | 
479 |     def full_address(self):
480 |         """
481 |         Print the address in a human readable format
482 |         """
483 |         addr = ""
484 |         # if self.building:
485 |         #     addr = addr + "(" + self.building + ") "
486 |         if self.house_number:
487 |             addr = addr + self.house_number
488 |         if self.street_prefix:
489 |             addr = addr + " " + self.street_prefix
490 |         if self.street:
491 |             addr = addr + " " + self.street
492 |         if self.street_suffix:
493 |             addr = addr + " " + self.street_suffix
494 |         if self.apartment:
495 |             addr = addr + " " + self.apartment
496 |         if self.city:
497 |             addr = addr + ", " + self.city
498 |         if self.state:
499 |             addr = addr + ", " + self.state
500 |         if self.zip:
501 |             addr = addr + " " + self.zip
502 |         return addr
503 | 
504 |     def _clean(self, item):
505 |         if item is None:
506 |             return None
507 |         else:
508 |             return item.encode("utf-8", "replace")
509 | 
    def __repr__(self):
        """
        Return the same text as __unicode__ (obtained through the unicode() built-in, so Python 2 only).
        """
        # NOTE(review): Python 2 implicitly encodes a unicode result of __repr__ with the default codec,
        # so non-ASCII address parts may raise UnicodeEncodeError here -- confirm.
        return unicode(self)
512 | 
    def __str__(self):
        """
        Return the same text as __unicode__ (obtained through the unicode() built-in, so Python 2 only).
        """
        # NOTE(review): Python 2 implicitly encodes a unicode result of __str__ with the default codec,
        # so non-ASCII address parts may raise UnicodeEncodeError here -- confirm.
        return unicode(self)
515 | 
516 |     def __unicode__(self):
517 |         address_dict = {
518 |             "house_number": self.house_number,
519 |             "street_prefix": self.street_prefix,
520 |             "street": self.street,
521 |             "street_suffix": self.street_suffix,
522 |             "apartment": self.apartment,
523 |             # "building": self.building,
524 |             "city": self.city,
525 |             "state": self.state,
526 |             "zip": self.zip
527 |         }
528 |         # print "Address Dict", address_dict
529 |         return u"Address - House number: {house_number} Prefix: {street_prefix} Street: {street} Suffix: {street_suffix}" \
530 |                u" Apartment: {apartment} City,State,Zip: {city}, {state} {zip}".format(**address_dict)
531 | 
532 |     def dstk_parse(self, address, parser, pre_parsed_address=None):
533 |         """
534 |         Given an address string, use DSTK to parse the address and then coerce it to a normal Address object.
535 |         pre_parsed_address for multi parsed string. Gives the value part for single dstk return value. If
536 |         pre_parsed_address is None, parse it via dstk on its own.
537 |         """
538 |         if pre_parsed_address:
539 |             dstk_address = pre_parsed_address
540 |         else:
541 |             if self.logger: self.logger.debug("Asking DSTK for address parse {0}".format(address.encode("ascii", "ignore")))
542 |             dstk_address = parser.dstk.street2coordinates(address)
543 |             # if self.logger: self.logger.debug("dstk return: {0}".format(dstk_address))
544 |         if 'confidence' not in dstk_address:
545 |             raise InvalidAddressException("Could not deal with DSTK return: {0}".format(dstk_address))
546 |         if dstk_address['street_address'] == "":
547 |             raise InvalidAddressException("Empty street address in DSTK return: {0}".format(dstk_address))
548 |         if dstk_address['street_number']  is None or dstk_address['street_name'] is None:
549 |             raise InvalidAddressException("House number or street name was Non in DSTK return: {0}".format(dstk_address))
550 |         if dstk_address['confidence'] < parser.required_confidence:
551 |             raise DSTKConfidenceTooLowException("Required confidence: {0}. Got confidence: {1}. Address: {2}. Return: {3}.".format(parser.required_confidence, dstk_address['confidence'], address.encode("ascii", "ignore"), dstk_address))
552 |         self.confidence = dstk_address['confidence']
553 |         if 'street_address' in dstk_address:
554 |             intersections = self._get_dstk_intersections(address, dstk_address['street_address'])
555 |         if self.logger: self.logger.debug("Confidence: {0}.".format(dstk_address['confidence']))
556 |         if self.logger: self.logger.debug("Address: {0}.".format(address))
557 |         if self.logger: self.logger.debug("Return: {0}.".format(dstk_address))
558 |         # if self.logger: self.logger.debug("")
559 | 
560 |         addr = dstk_address
561 |         if addr is None:
562 |             raise InvalidAddressException("DSTK could not parse address: {0}".format(self.original))
563 |         if "street_number" in addr:
564 |             if addr["street_number"] not in address:
565 |                 raise InvalidAddressException("DSTK returned a house number not in the original address: {0}".format(addr))
566 |             self.house_number = addr["street_number"]
567 |         else:
568 |             raise InvalidAddressException("(dstk) Addresses must have house numbers: {0}".format(addr))
569 | 
570 |         if "locality" in addr:
571 |             self.city = addr["locality"]
572 |             # DSTK shouldn't be returning unknown cities
573 |             if addr["locality"] not in address:
574 |                 raise InvalidAddressException("DSTK returned a city not in the address. City: {0}, Address: {1}.".format(self.city, address))
575 |         if "region" in addr:
576 |             self.state = addr["region"]
577 |             # if "fips_county" in addr:
578 |             # self.zip = addr["fips_county"]
579 |         if "latitude" in addr:
580 |             self.lat = addr["latitude"]
581 |         if "longitude" in addr:
582 |             self.lng = addr["longitude"]
583 |             # Try and find the apartment
584 |         # First remove the street_address (this doesn't include apartment)
585 |         if "street_address" in addr:
586 |             apartment = address.replace(addr["street_address"], '')
587 |         # Make sure the city doesn't somehow come before the street in the original string.
588 | 
589 |             # try:
590 |             #     end_pos = re.search("(" + addr["locality"] + ")", apartment).start(1) - 1
591 |             #     # self.apartment = apartment[:end_pos]
592 |             # except Exception:
593 |             #     pass
594 |             # self.apartment = None
595 |         # Now that we have an address, try to parse out street suffix, prefix, and street
596 |         if self.apartment:
597 |             street_addr = addr["street_address"].replace(self.apartment, '')
598 |         else:
599 |             street_addr = addr["street_address"]
600 | 
601 |         # We should be left with only prefix, street, suffix. Go for suffix first.
602 |         split_addr = street_addr.split()
603 |         if len(split_addr) == 0:
604 |             if self.logger: self.logger.debug("Could not split street_address: {0}".format(addr))
605 |             raise InvalidAddressException("Could not split street_address: {0}".format(addr))
606 |         # Get rid of house_number
607 |         if split_addr[0] == self.house_number:
608 |             split_addr = split_addr[1:]
609 |         if self.logger: self.logger.debug("Checking {0} for suffixes".format(split_addr[-1].upper()))
610 |         if split_addr[-1].upper() in parser.suffixes.keys() or split_addr[-1].upper() in parser.suffixes.values():
611 |             self.street_suffix = split_addr[-1]
612 |             split_addr = split_addr[:-1]
613 |         if self.logger: self.logger.debug("Checking {0} for prefixes".format(split_addr[0].lower()))
614 |         if split_addr[0].lower() in parser.prefixes.keys() or split_addr[0].upper() in parser.prefixes.values() or \
615 |                                 split_addr[0].upper() + '.' in parser.prefixes.values():
616 |             if split_addr[0][-1] == '.':
617 |                 self.street_prefix = split_addr[0].upper()
618 |             else:
619 |                 self.street_prefix = split_addr[0].upper() + '.'
620 |             if self.logger: self.logger.debug("Saving prefix: {0}".format(self.street_prefix))
621 |             split_addr = split_addr[1:]
622 |         if self.logger: self.logger.debug("Saving street: {0}".format(split_addr))
623 |         self.street = " ".join(split_addr)
624 |         # DSTK shouldn't be guessing cities that come before streets.
625 |         match = re.search(self.street, address)
626 |         if match is None:
627 |             raise InvalidAddressException("DSTK picked a street not in the original address. Street: {0}. Address: {1}.".format(self.street, address))
628 |         street_position = match
629 |         match = re.search(self.city, address)
630 |         if match is None:
631 |             raise InvalidAddressException("DSTK picked a city not in the original address. City: {0}. Address: {1}.".format(self.city, address))
632 |         city_position = match
633 |         if city_position.start(0) < street_position.end(0):
634 |             raise InvalidAddressException("DSTK picked a street that comes after the city. Street: {0}. City: {1}. Address: {2}.".format(self.street, self.city, address))
635 |         if self.logger: self.logger.debug("Successful DSTK address: {0}, house: {1}, street: {2}\n".format(self.original, self.house_number, self.street))
636 | 
637 |     def _get_dstk_intersections(self, address, dstk_address):
638 |         """
639 |         Find the unique tokens in the original address and the returned address.
640 |         """
641 |         # Normalize both addresses
642 |         normalized_address = self._normalize(address)
643 |         normalized_dstk_address = self._normalize(dstk_address)
644 |         address_uniques = set(normalized_address) - set(normalized_dstk_address)
645 |         dstk_address_uniques = set(normalized_dstk_address) - set(normalized_address)
646 |         if self.logger: self.logger.debug("Address Uniques {0}".format(address_uniques))
647 |         if self.logger: self.logger.debug("DSTK Address Uniques {0}".format(dstk_address_uniques))
648 |         return (len(address_uniques), len(dstk_address_uniques))
649 | 
650 |     def _normalize(self, address):
651 |         """
652 |         Normalize prefixes, suffixes and other to make matching original to returned easier.
653 |         """
654 |         normalized_address = []
655 |         if self.logger: self.logger.debug("Normalizing Address: {0}".format(address))
656 |         for token in address.split():
657 |             if token.upper() in self.parser.suffixes.keys():
658 |                 normalized_address.append(self.parser.suffixes[token.upper()].lower())
659 |             elif token.upper() in self.parser.suffixes.values():
660 |                 normalized_address.append(token.lower())
661 |             elif token.upper().replace('.', '') in self.parser.suffixes.values():
662 |                 normalized_address.append(token.lower().replace('.', ''))
663 |             elif token.lower() in self.parser.prefixes.keys():
664 |                 normalized_address.append(self.parser.prefixes[token.lower()].lower())
665 |             elif token.upper() in self.parser.prefixes.values():
666 |                 normalized_address.append(token.lower()[:-1])
667 |             elif token.upper() + '.' in self.parser.prefixes.values():
668 |                 normalized_address.append(token.lower())
669 |             else:
670 |                 normalized_address.append(token.lower())
671 |         return normalized_address
672 | 
673 | 
def create_cities_csv(filename="places2k.txt", output="cities.csv"):
    """
    Takes the places2k.txt file and creates a simple file of all cities, one name per line.

    Lines starting with "PR" (Puerto Rico) are dropped, since only the 50 states are wanted.
    """
    with open(filename, 'r') as places, open(output, 'w') as cities:
        for record in places:
            if record.startswith("PR"):
                continue
            # Per census.gov, characters 9-72 are the name of the city or place. The last word
            # (city, town, etc.) is cut off.
            name_words = record[9:72].split()
            cities.write(" ".join(name_words[:-1]) + '\n')
687 | 
688 | 
class InvalidAddressException(Exception):
    """Raised when an address, or the DSTK return for it, cannot be turned into a valid Address."""
691 | 
class DSTKConfidenceTooLowException(Exception):
    """Raised when the confidence DSTK reports for an address is below the required confidence."""
694 | 
695 | if __name__ == "__main__":
696 |     ap = AddressParser()
697 |     print ap.parse_address(" ".join(sys.argv[1:]))
698 | 


--------------------------------------------------------------------------------
/address/address_list.py:
--------------------------------------------------------------------------------
 1 | from __future__ import division
 2 | import sys
 3 | import os
 4 | from address import Address, AddressParser
 5 | 
 6 | 
 7 | if __name__ == '__main__':
 8 |     # The mini test program takes a list of addresses, creates Address objects, and prints errors for each one
 9 |     # with unmatched terms. Takes a filename as the first and only argument. The file should be one address per line.
10 |     if len(sys.argv) != 2:
11 |         print "Usage: test_list.py filename"
12 |         sys.exit(1)
13 |     if not os.path.exists(sys.argv[1]):
14 |         print "File {0} does not exist".format(sys.argv[1])
15 |         sys.exit(2)
16 |     unmatched_count = 0
17 |     line_count = 0
18 |     ap = AddressParser()
19 |     with open(sys.argv[1]) as input:
20 |         for line in input:
21 |             addr = ap.parse_address(line.strip(), line_number=line_count)
22 | 
23 |             if addr.unmatched:
24 |                 print "Unmatched", addr, addr.line_number
25 |                 print ""
26 |                 unmatched_count = unmatched_count + 1
27 |             # All addresses have a house number and a street.
28 |             if addr.house_number is None:
29 |                 print "House number cannot be None: ", addr, addr.line_number
30 |             if addr.street is None:
31 |                 print "Street cannot be None: ", addr, addr.line_number
32 |             line_count = line_count + 1
33 |             print addr.full_address()
34 |             print addr.original
35 |             print ""
36 |     if unmatched_count == 0:
37 |         print "All {0} address matched! Huzzah!".format(line_count)
38 |     else:
39 |         print "{0} addresses of {1} ({2:.2%}) with unmatched terms. :(".format(unmatched_count, line_count, unmatched_count / line_count)
40 | 


--------------------------------------------------------------------------------
/address/dstk.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # Python interface to the Data Science Toolkit Plugin
  3 | # version: 1.30 (2011-03-16)
  4 | #
  5 | # See http://www.datasciencetoolkit.org/developerdocs#python for full details
  6 | #
  7 | # All code (C) Pete Warden, 2011
  8 | #
  9 | # This program is free software: you can redistribute it and/or modify
 10 | # it under the terms of the GNU General Public License as published by
 11 | # the Free Software Foundation, either version 3 of the License, or
 12 | # (at your option) any later version.
 13 | #
 14 | # This program is distributed in the hope that it will be useful,
 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 17 | # GNU General Public License for more details.
 18 | #
 19 | # You should have received a copy of the GNU General Public License
 20 | # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 21 | 
 22 | import urllib
 23 | try:
 24 |     import simplejson as json
 25 | except ImportError:
 26 |     import json
 27 | import os
 28 | import httplib
 29 | import mimetypes
 30 | import re
 31 | import csv
 32 | 
 33 | 
 34 | # This is the main interface class. You can see an example of it in use
 35 | # below, implementing a command-line tool, but you basically just instantiate
 36 | # dstk = DSTK()
 37 | # and then call the method you want
 38 | # coordinates = dstk.ip2coordinates('12.34.56.78')
 39 | # The full documentation is at http://www.datasciencetoolkit.org/developerdocs
 40 | class DSTK:
 41 | 
 42 |     api_base = None
 43 | 
 44 |     def __init__(self, options=None):
 45 |         if options is None:
 46 |             options = {}
 47 | 
 48 |         defaultOptions = {
 49 |             'apiBase': 'http://www.datasciencetoolkit.org',
 50 |             'checkVersion': True
 51 |         }
 52 | 
 53 |         if 'DSTK_API_BASE' in os.environ:
 54 |             defaultOptions['apiBase'] = os.environ['DSTK_API_BASE']
 55 | 
 56 |         for key, value in defaultOptions.items():
 57 |             if key not in options:
 58 |                 options[key] = value
 59 | 
 60 |         self.api_base = options['apiBase']
 61 | 
 62 |         if options['checkVersion']:
 63 |             self.check_version()
 64 | 
 65 |     def check_version(self):
 66 | 
 67 |         required_version = 40
 68 | 
 69 |         api_url = self.api_base+'/info'
 70 | 
 71 |         try:
 72 |             response_string = urllib.urlopen(api_url).read()
 73 |             response = json.loads(response_string)
 74 |         except:
 75 |             raise Exception('The server at "'+self.api_base+'" doesn\'t seem to be running DSTK, no version information found.')
 76 | 
 77 |         actual_version = response['version']
 78 |         if actual_version < required_version:
 79 |             raise Exception('DSTK: Version '+str(actual_version)+' found at "'+api_url+'" but '+str(required_version)+' is required')
 80 | 
 81 |     def ip2coordinates(self, ips):
 82 | 
 83 |         if not isinstance(ips, (list, tuple)):
 84 |             ips = [ips]
 85 | 
 86 |         api_url = self.api_base+'/ip2coordinates'
 87 |         api_body = json.dumps(ips)
 88 |         response_string = urllib.urlopen(api_url, api_body).read()
 89 | 
 90 |         response = json.loads(response_string)
 91 | 
 92 |         if 'error' in response:
 93 |             raise Exception(response['error'])
 94 | 
 95 |         return response
 96 | 
 97 |     def street2coordinates(self, addresses):
 98 | 
 99 |         if not isinstance(addresses, (list, tuple)):
100 |             addresses = [addresses]
101 | 
102 |         api_url = self.api_base+'/street2coordinates'
103 |         api_body = json.dumps(addresses)
104 |         response_string = urllib.urlopen(api_url, api_body).read()
105 |         response = json.loads(response_string)
106 | 
107 |         if 'error' in response:
108 |             raise Exception(response['error'])
109 | 
110 |         return response
111 | 
112 |     def coordinates2politics(self, coordinates):
113 | 
114 |         api_url = self.api_base+'/coordinates2politics'
115 |         api_body = json.dumps(coordinates)
116 |         response_string = urllib.urlopen(api_url, api_body).read()
117 |         response = json.loads(response_string)
118 | 
119 |         if 'error' in response:
120 |             raise Exception(response['error'])
121 | 
122 |         return response
123 | 
124 |     def text2places(self, text):
125 | 
126 |         api_url = self.api_base+'/text2places'
127 |         api_body = text
128 |         response_string = urllib.urlopen(api_url, api_body).read()
129 |         response = json.loads(response_string)
130 | 
131 |         if 'error' in response:
132 |             raise Exception(response['error'])
133 | 
134 |         return response
135 | 
136 |     def file2text(self, file_name, file_data):
137 | 
138 |         host = self.api_base.replace('http://', '')
139 | 
140 |         response = post_multipart(host,
141 |                                   '/file2text',[],[('inputfile', file_name, file_data)])
142 | 
143 |         return response
144 | 
145 |     def text2sentences(self, text):
146 | 
147 |         api_url = self.api_base+'/text2sentences'
148 |         api_body = text
149 |         response_string = urllib.urlopen(api_url, api_body).read()
150 |         response = json.loads(response_string)
151 | 
152 |         if 'error' in response:
153 |             raise Exception(response['error'])
154 | 
155 |         return response
156 | 
157 |     def html2text(self, html):
158 | 
159 |         api_url = self.api_base+'/html2text'
160 |         api_body = html
161 |         response_string = urllib.urlopen(api_url, api_body).read()
162 |         response = json.loads(response_string)
163 | 
164 |         if 'error' in response:
165 |             raise Exception(response['error'])
166 | 
167 |         return response
168 | 
169 |     def html2story(self, html):
170 | 
171 |         api_url = self.api_base+'/html2story'
172 |         api_body = html
173 |         response_string = urllib.urlopen(api_url, api_body).read()
174 |         response = json.loads(response_string)
175 | 
176 |         if 'error' in response:
177 |             raise Exception(response['error'])
178 | 
179 |         return response
180 | 
181 |     def text2people(self, text):
182 | 
183 |         api_url = self.api_base+'/text2people'
184 |         api_body = text
185 |         response_string = urllib.urlopen(api_url, api_body).read()
186 |         response = json.loads(response_string)
187 | 
188 |         if 'error' in response:
189 |             raise Exception(response['error'])
190 | 
191 |         return response
192 | 
193 |     def text2times(self, text):
194 | 
195 |         api_url = self.api_base+'/text2times'
196 |         api_body = text
197 |         response_string = urllib.urlopen(api_url, api_body).read()
198 |         response = json.loads(response_string)
199 | 
200 |         if 'error' in response:
201 |             raise Exception(response['error'])
202 | 
203 |         return response
204 | 
205 | # We need to post files as multipart form data, and Python has no native function for
206 | # that, so these utility functions implement what we need.
207 | # See http://code.activestate.com/recipes/146306/
def post_multipart(host, selector, fields, files):
    """
    POST fields and files to selector on an http host as multipart/form-data.

    fields is a sequence of (name, value) elements for regular form fields.
    files is a sequence of (name, filename, value) elements for data to be uploaded as files.
    Returns the body of the server's response; status and headers are read but not inspected.
    """
    content_type, body = encode_multipart_formdata(fields, files)
    connection = httplib.HTTP(host)
    connection.putrequest('POST', selector)
    request_headers = (
        ('content-type', content_type),
        ('content-length', str(len(body))),
    )
    for header_name, header_value in request_headers:
        connection.putheader(header_name, header_value)
    connection.endheaders()
    connection.send(body)
    _status, _reason, _response_headers = connection.getreply()
    return connection.file.read()
224 | 
def encode_multipart_formdata(fields, files):
    """
    Build a multipart/form-data request body.

    fields is a sequence of (name, value) elements for regular form fields.
    files is a sequence of (name, filename, value) elements for data to be uploaded as files.
    Returns (content_type, body) ready for an httplib.HTTP instance.
    """
    BOUNDARY = '----------ThIs_Is_tHe_bouNdaRY_$'
    delimiter = '--' + BOUNDARY
    parts = []
    for name, value in fields:
        parts.extend([
            delimiter,
            'Content-Disposition: form-data; name="%s"' % name,
            '',
            value,
        ])
    for name, filename, value in files:
        parts.extend([
            delimiter,
            'Content-Disposition: form-data; name="%s"; filename="%s"' % (name, filename),
            'Content-Type: %s' % guess_content_type(filename),
            '',
            value,
        ])
    # Closing delimiter, followed by an empty part so the body ends with a line break.
    parts.extend([delimiter + '--', ''])
    return 'multipart/form-data; boundary=%s' % BOUNDARY, '\r\n'.join(parts)
250 | 
def guess_content_type(filename):
    """Return the MIME type guessed from filename, or 'application/octet-stream' when none can be guessed."""
    guessed_type, _encoding = mimetypes.guess_type(filename)
    return guessed_type or 'application/octet-stream'
253 | 
254 | # End of the interface. The rest of this file is an example implementation of a
255 | # command line client.
256 | 
def ip2coordinates_cli(dstk, options, inputs, output):
    """
    Command line handler: look up the IP address at the start of each input line and write the results as CSV.

    dstk: client whose ip2coordinates(list_of_ips) returns a dict mapping each IP to a dict of values (or None).
    options: dict; when options['showHeaders'] is true a header row is written first.
    inputs: iterable of strings. Lines that do not start with an IP address are reported with "No match".
    output: file-like object that all output is written to.
    """
    writer = csv.writer(output)
    ip_pattern = re.compile(r'[12]?\d?\d\.[12]?\d?\d\.[12]?\d?\d\.[12]?\d?\d')

    input_ips = []
    for input_line in inputs:
        ip_match = ip_pattern.match(input_line)
        if ip_match is not None:
            input_ips.append(ip_match.group(0))
        else:
            output.write('No match\n')

    result = dstk.ip2coordinates(input_ips)

    if options['showHeaders']:
        # The header is taken from the keys of the first IP that has any information.
        for info in result.values():
            if info is None:
                continue
            writer.writerow(['ip_address'] + [str(key) for key in info])
            break

    for ip, info in result.items():
        if info is None:
            info = {}
        writer.writerow([ip] + [str(value) for value in info.values()])
293 | 
def street2coordinates_cli(dstk, options, inputs, output):
    """Look up coordinates for street addresses and write the results as CSV.

    dstk    -- client object providing street2coordinates(inputs), which
               returns a dict mapping each address to an info dict (or None).
    options -- dict; options['showHeaders'] enables a header row.
    inputs  -- the addresses, passed to the client unchanged.
    output  -- file-like object that receives the CSV rows.
    """
    # Write to the stream the caller handed us rather than to sys.stdout.
    writer = csv.writer(output)

    result = dstk.street2coordinates(inputs)

    if options['showHeaders']:
        # The header is built from the keys of the first result that has data.
        for info in result.values():
            if info is None:
                continue
            writer.writerow(['address'] + [str(key) for key in info])
            break

    for address, info in result.items():
        # Addresses with no data still get a row containing just the address.
        if info is None:
            info = {}
        writer.writerow([address] + [str(value) for value in info.values()])

    return
322 | 
def coordinates2politics_cli(dstk, options, inputs, output):
    """Write the political areas containing each coordinate pair as CSV.

    Every entry of inputs must be a 'latitude,longitude' string; anything
    else writes an error message to output and exits with status -1.
    Each area returned by dstk.coordinates2politics becomes one CSV row on
    output, preceded by a header row when options['showHeaders'] is set.
    """
    writer = csv.writer(output)

    coordinates_list = []
    for pair in inputs:
        parts = pair.split(',')
        if len(parts) != 2:
            output.write('You must enter coordinates as a series of comma-separated pairs, eg 37.76,-122.42')
            exit(-1)
        latitude, longitude = parts
        coordinates_list.append([latitude, longitude])

    result = dstk.coordinates2politics(coordinates_list)

    if options['showHeaders']:
        writer.writerow(['latitude', 'longitude', 'name', 'code', 'type', 'friendly_type'])

    for info in result:
        location = info['location']
        politics = info['politics']

        # One row per political area, each repeating the location it belongs to.
        for politic in politics:
            writer.writerow([
                location['latitude'],
                location['longitude'],
                politic['name'],
                politic['code'],
                politic['type'],
                politic['friendly_type'],
            ])

    return
357 | 
def file2text_cli(dstk, options, inputs, output):
    """Convert each input file to text and print the result.

    dstk    -- client object providing file2text(file_name, file_data).
    options -- dict; options['showHeaders'] writes a '--File--: <name>'
               line to output before each file's text.
    inputs  -- file names, URLs or directories; directories are expanded
               to their entries and processed recursively.
    output  -- file-like object used for the per-file header lines.
    """
    for file_name in inputs:
        if os.path.isdir(file_name):
            full_children = [os.path.join(file_name, child)
                             for child in os.listdir(file_name)]
            # output has to be forwarded: the function takes four arguments,
            # so recursing without it fails on every directory input.
            file2text_cli(dstk, options, full_children, output)
        else:
            file_data = get_file_or_url_contents(file_name)
            if options['showHeaders']:
                output.write('--File--: '+file_name+"\n")
            result = dstk.file2text(file_name, file_data)

            print(result)
    return
375 | 
def text2places_cli(dstk, options, inputs, output):
    """Write the places that dstk.text2places finds in the inputs as CSV.

    With options['from_stdin'] set, inputs are lines of text that are joined
    and analysed as a single document labelled 'stdin'. Otherwise each input
    is a file name or URL whose contents are analysed, or a directory whose
    entries are processed recursively.
    """
    writer = csv.writer(output)

    if options['showHeaders']:
        writer.writerow(['latitude', 'longitude', 'name', 'type', 'start_index', 'end_index', 'matched_string', 'file_name'])
    # Cleared on the shared options dict so the recursive calls made for
    # directories do not write the header again.
    options['showHeaders'] = False

    if options['from_stdin']:
        text2places_format(dstk.text2places("\n".join(inputs)), 'stdin', writer)
        return

    for file_name in inputs:
        if not os.path.isdir(file_name):
            file_data = get_file_or_url_contents(file_name)
            text2places_format(dstk.text2places(file_data), file_name, writer)
            continue

        full_children = [os.path.join(file_name, child)
                         for child in os.listdir(file_name)]
        text2places_cli(dstk, options, full_children, output)

    return
403 | 
def text2places_format(result, file_name, writer):
    """Write one CSV row per place in result, with file_name as the last column."""
    columns = ('latitude', 'longitude', 'name', 'type',
               'start_index', 'end_index', 'matched_string')
    for info in result:
        row = [info[column] for column in columns]
        row.append(file_name)
        writer.writerow(row)
    return
418 | 
def html2text_cli(dstk, options, inputs, output):
    """Print the 'text' field that dstk.html2text returns for the inputs.

    With options['from_stdin'] set, inputs are lines that are joined and
    converted as one document. Otherwise each input is a file name or URL
    to convert, or a directory whose entries are processed recursively;
    options['showHeaders'] writes a '--File--: <name>' line to output
    before each file's text.
    """
    if options['from_stdin']:
        print(dstk.html2text("\n".join(inputs))['text'])
        return

    for file_name in inputs:
        if os.path.isdir(file_name):
            full_children = [os.path.join(file_name, child)
                             for child in os.listdir(file_name)]
            html2text_cli(dstk, options, full_children, output)
            continue

        file_data = get_file_or_url_contents(file_name)
        if options['showHeaders']:
            output.write('--File--: '+file_name+"\n")
        print(dstk.html2text(file_data)['text'])
    return
440 | 
def text2sentences_cli(dstk, options, inputs, output):
    """Print the 'sentences' field that dstk.text2sentences returns for the inputs.

    With options['from_stdin'] set, inputs are lines that are joined and
    analysed as one document. Otherwise each input is a file name or URL to
    analyse, or a directory whose entries are processed recursively;
    options['showHeaders'] writes a '--File--: <name>' line to output
    before each file's result.
    """
    if options['from_stdin']:
        print(dstk.text2sentences("\n".join(inputs))['sentences'])
        return

    for file_name in inputs:
        if os.path.isdir(file_name):
            full_children = [os.path.join(file_name, child)
                             for child in os.listdir(file_name)]
            text2sentences_cli(dstk, options, full_children, output)
            continue

        file_data = get_file_or_url_contents(file_name)
        if options['showHeaders']:
            output.write('--File--: '+file_name+"\n")
        print(dstk.text2sentences(file_data)['sentences'])

    return
463 | 
def html2story_cli(dstk, options, inputs, output):
    """Print the 'story' field that dstk.html2story returns for the inputs.

    With options['from_stdin'] set, inputs are lines that are joined and
    converted as one document. Otherwise each input is a file name or URL
    to convert, or a directory whose entries are processed recursively;
    options['showHeaders'] writes a '--File--: <name>' line to output
    before each file's story.
    """
    if options['from_stdin']:
        print(dstk.html2story("\n".join(inputs))['story'])
        return

    for file_name in inputs:
        if os.path.isdir(file_name):
            full_children = [os.path.join(file_name, child)
                             for child in os.listdir(file_name)]
            html2story_cli(dstk, options, full_children, output)
            continue

        file_data = get_file_or_url_contents(file_name)
        if options['showHeaders']:
            output.write('--File--: '+file_name+"\n")
        print(dstk.html2story(file_data)['story'])

    return
486 | 
def text2people_cli(dstk, options, inputs, output):
    """Write the people that dstk.text2people finds in the inputs as CSV.

    dstk    -- client object providing text2people(text).
    options -- dict; 'showHeaders' enables a header row, 'from_stdin'
               selects how inputs are interpreted.
    inputs  -- with from_stdin set, lines of text that are joined and
               analysed as one document labelled 'stdin'; otherwise file
               names, URLs or directories (expanded recursively).
    output  -- file-like object that receives the CSV rows.
    """
    # Write to the stream the caller handed us rather than to sys.stdout.
    writer = csv.writer(output)

    if options['showHeaders']:
        row = ['matched_string', 'first_name', 'surnames', 'title', 'gender', 'start_index', 'end_index', 'file_name']
        writer.writerow(row)
    # Cleared on the shared options dict so the recursive calls made for
    # directories do not write the header again.
    options['showHeaders'] = False

    if options['from_stdin']:
        result = dstk.text2people("\n".join(inputs))
        text2people_format(result, 'stdin', writer)
        return

    for file_name in inputs:
        if os.path.isdir(file_name):
            full_children = [os.path.join(file_name, child)
                             for child in os.listdir(file_name)]
            # Recurse into this command; handing directory entries to the
            # places command would emit place rows under the people header.
            text2people_cli(dstk, options, full_children, output)
        else:
            file_data = get_file_or_url_contents(file_name)
            result = dstk.text2people(file_data)
            text2people_format(result, file_name, writer)

    return
514 | 
def text2people_format(result, file_name, writer):
    """Write one CSV row per person in result, with file_name as the last column."""
    text_columns = ('matched_string', 'first_name', 'surnames', 'title', 'gender')
    for info in result:
        row = [info[column] for column in text_columns]
        # The two offsets are written in their str() form.
        row.append(str(info['start_index']))
        row.append(str(info['end_index']))
        row.append(file_name)
        writer.writerow(row)
    return
530 | 
def text2times_cli(dstk, options, inputs, output):
    """Write the times and dates that dstk.text2times finds in the inputs as CSV.

    dstk    -- client object providing text2times(text).
    options -- dict; 'showHeaders' enables a header row, 'from_stdin'
               selects how inputs are interpreted.
    inputs  -- with from_stdin set, lines of text that are joined and
               analysed as one document labelled 'stdin'; otherwise file
               names, URLs or directories (expanded recursively).
    output  -- file-like object that receives the CSV rows.
    """
    # Write to the stream the caller handed us rather than to sys.stdout.
    writer = csv.writer(output)

    if options['showHeaders']:
        row = ['matched_string', 'time_string', 'time_seconds', 'is_relative', 'start_index', 'end_index', 'file_name']
        writer.writerow(row)
    # Cleared on the shared options dict so the recursive calls made for
    # directories do not write the header again.
    options['showHeaders'] = False

    if options['from_stdin']:
        result = dstk.text2times("\n".join(inputs))
        text2times_format(result, 'stdin', writer)
        return

    for file_name in inputs:
        if os.path.isdir(file_name):
            full_children = [os.path.join(file_name, child)
                             for child in os.listdir(file_name)]
            text2times_cli(dstk, options, full_children, output)
        else:
            file_data = get_file_or_url_contents(file_name)
            result = dstk.text2times(file_data)
            text2times_format(result, file_name, writer)

    return
558 | 
def text2times_format(result, file_name, writer):
    """Write one CSV row per time match in result, with file_name as the last column."""
    leading_columns = ('matched_string', 'time_string', 'time_seconds', 'is_relative')
    for info in result:
        row = [info[column] for column in leading_columns]
        # The two offsets are written in their str() form.
        row.append(str(info['start_index']))
        row.append(str(info['end_index']))
        row.append(file_name)
        writer.writerow(row)
    return
573 | 
def get_file_or_url_contents(file_name):
    """Return the full contents of a local file or of an http(s) URL.

    Names starting with 'http://' or 'https://' are fetched with
    urllib.urlopen; anything else is opened as a local path. The handle is
    always closed before returning, including when the read fails.
    """
    if re.match(r'https?://', file_name):
        handle = urllib.urlopen(file_name)
    else:
        handle = open(file_name)
    try:
        return handle.read()
    finally:
        handle.close()
580 | 
def print_usage(message=''):
    """Print message followed by the command-line help, then exit with status -1."""
    usage_lines = (
        message,
        "Usage:",
        "python dstk.py <command> [-a/--api_base 'http://yourhost.com'] [-h/--show_headers] <inputs>",
        "Where <command> is one of:",
        "  ip2coordinates        (lat/lons for IP addresses)",
        "  street2coordinates    (lat/lons for postal addresses)",
        "  coordinates2politics  (country/state/county/constituency/etc for lat/lon)",
        "  text2places           (lat/lons for places mentioned in unstructured text)",
        "  file2text             (PDF/Excel/Word to text, and OCR on PNG/Jpeg/Tiff images)",
        "  text2sentences        (parts of the text that look like proper sentences)",
        "  html2text             (text version of the HTML document)",
        "  html2story            (text version of the HTML with no boilerplate)",
        "  text2people           (gender for people mentioned in unstructured text)",
        "  text2times            (times and dates mentioned in unstructured text)",
        "If no inputs are specified, then standard input will be read and used",
        "See http://www.datasciencetoolkit.org/developerdocs for more details",
        "Examples:",
        "python dstk.py ip2coordinates 67.169.73.113",
        "python dstk.py street2coordinates \"2543 Graystone Place, Simi Valley, CA 93065\"",
        "python dstk.py file2text scanned.jpg",
    )
    for line in usage_lines:
        print(line)

    exit(-1)
605 | 
# Command-line entry point: parse the arguments, then dispatch to the handler
# registered for the chosen command.
if __name__ == '__main__':

    import sys

    commands = {
        'ip2coordinates': { 'handler': ip2coordinates_cli },
        'street2coordinates': { 'handler': street2coordinates_cli },
        'coordinates2politics': { 'handler': coordinates2politics_cli },
        'text2places': { 'handler': text2places_cli },
        'file2text': { 'handler': file2text_cli },
        'text2sentences': { 'handler': text2sentences_cli },
        'html2text': { 'handler': html2text_cli },
        'html2story': { 'handler': html2story_cli },
        'text2people': { 'handler': text2people_cli },
        'text2times': { 'handler': text2times_cli },
        }
    switches = {
        'api_base': True,
        'show_headers': True
    }
    # Single-letter spellings of the switches above.
    short_switches = {
        'a': 'api_base',
        'h': 'show_headers',
    }

    command = None
    options = {'showHeaders': False}
    inputs = []

    ignore_next = False
    for index, arg in enumerate(sys.argv[1:]):
        if ignore_next:
            ignore_next = False
            continue

        # startswith() rather than arg[0] so an empty-string argument is
        # treated as an ordinary input instead of raising IndexError.
        if arg.startswith('-') and len(arg) > 1:
            if len(arg) == 2:
                # Unknown letters map to None, which the switches check below
                # rejects, instead of leaving option unset or stale.
                option = short_switches.get(arg[1])
            else:
                option = arg[2:]

            if option not in switches:
                print_usage('Unknown option "'+arg+'"')

            if option == 'api_base':
                # index counts from sys.argv[1:], so the value that follows
                # this switch lives at sys.argv[index+2].
                if (index+2) >= len(sys.argv):
                    print_usage('Missing argument for option "'+arg+'"')
                options['apiBase'] = sys.argv[index+2]
                ignore_next = True
            elif option == 'show_headers':
                options['showHeaders'] = True

        else:
            # The first bare argument is the command; the rest are inputs.
            if command is None:
                command = arg
                if command not in commands:
                    print_usage('Unknown command "'+arg+'"')
            else:
                inputs.append(arg)

    if command is None:
        print_usage('No command specified')

    # With no explicit inputs, the lines of standard input are used instead.
    if len(inputs)<1:
        options['from_stdin'] = True
        inputs = sys.stdin.readlines()
    else:
        options['from_stdin'] = False

    command_info = commands[command]

    dstk = DSTK(options)

    command_info['handler'](dstk, options, inputs, sys.stdout)


--------------------------------------------------------------------------------
/address/streets.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SwoopSearch/pyaddress/62ebb07a6840e710d256406a8ec1d06abec0e1c4/address/streets.csv


--------------------------------------------------------------------------------
/address/suffixes.csv:
--------------------------------------------------------------------------------
  1 | ALLEE,ALY
  2 | ALLEY,ALY
  3 | ALLY,ALY
  4 | ALY,ALY
  5 | ANEX,ANX
  6 | ANNEX,ANX
  7 | ANNX,ANX
  8 | ANX,ANX
  9 | ARCADE,ARC
 10 | ARC,ARC
 11 | AV,AVE
 12 | AVE,AVE
 13 | AVEN,AVE
 14 | AVENU,AVE
 15 | AVENUE,AVE
 16 | AVN,AVE
 17 | AVNU,AVE
 18 | AVNUE,AVE
 19 | BAYOO,BYU
 20 | BAYOU,BYU
 21 | BCH,BCH
 22 | BEACH,BCH
 23 | BEND,BND
 24 | BG,BG
 25 | BL,BLVD
 26 | BLF,BLF
 27 | BLUF,BLF
 28 | BLUFF,BLF
 29 | BLUFFS,BLF
 30 | BLV,BLVD
 31 | BLVD,BLVD
 32 | BND,BND
 33 | BOT,BTM
 34 | BOTTM,BTM
 35 | BOTTOM,BTM
 36 | BOUL,BLVD
 37 | BOULEVARD,BLVD
 38 | BOULV,BLVD
 39 | BPS,BYP
 40 | BPSS,BYP
 41 | BRANCH,BR
 42 | BR,BR
 43 | BRDGE,BRG
 44 | BRG,BRG
 45 | BRIDGE,BRG
 46 | BRK,BRK
 47 | BRNCH,BR
 48 | BROOK,BRK
 49 | BROOKS,BRK
 50 | BTM,BTM
 51 | BULEVARD,BLVD
 52 | BURG,BG
 53 | BURGS,BG
 54 | BV,BLVD
 55 | BVD,BLVD
 56 | BYPA,BYP
 57 | BYPAS,BYP
 58 | BY,PASS
 59 | BYPASS,BYP
 60 | BYP,BYP
 61 | BYPS,BYP
 62 | BYPSS,BYP
 63 | BYU,BYU
 64 | CAMP,CP
 65 | CAN,CYN
 66 | CANYN,CYN
 67 | CANYON,CYN
 68 | CAPE,CPE
 69 | CAUSEWAY,CSWY
 70 | CAUSWAY,CSWY
 71 | CEN,CTR
 72 | CENT,CTR
 73 | CENTER,CTR
 74 | CENTERS,CTR
 75 | CENTR,CTR
 76 | CENTRE,CTR
 77 | CIRC,CIR
 78 | CIR,CIR
 79 | CIRCL,CIR
 80 | CIRCLE,CIR
 81 | CIRCLES,CIR
 82 | CK,CRK
 83 | CLB,CLB
 84 | CLF,CLFS
 85 | CLFS,CLFS
 86 | CLIFF,CLFS
 87 | CLIFFS,CLFS
 88 | CLUB,CLB
 89 | CMP,CP
 90 | CNTER,CTR
 91 | CNTR,CTR
 92 | CNYN,CYN
 93 | COR,COR
 94 | CORNER,COR
 95 | CORNERS,CORS
 96 | CORS,CORS
 97 | COURSE,CRSE
 98 | COURT,CT
 99 | COURTS,CTS
100 | COVE,CV
101 | COVES,CV
102 | CP,CP
103 | CPE,CPE
104 | CRCL,CIR
105 | CRCLE,CIR
106 | CR,CRK
107 | CRECENT,CRES
108 | CREEK,CRK
109 | CRESCENT,CRES
110 | CRES,CRES
111 | CRESENT,CRES
112 | CRK,CRK
113 | CROSSING,XING
114 | CRSCNT,CRES
115 | CRSE,CRSE
116 | CRSENT,CRES
117 | CRSNT,CRES
118 | CRSSING,XING
119 | CRSSNG,XING
120 | CRT,CT
121 | CSWY,CSWY
122 | CT,CT
123 | CTR,CTR
124 | CTS,CTS
125 | CV,CV
126 | CYN,CYN
127 | DALE,DL
128 | DAM,DM
129 | DIV,DV
130 | DIVIDE,DV
131 | DL,DL
132 | DM,DM
133 | DR,DR
134 | DRIV,DR
135 | DRIVE,DR
136 | DRIVES,DR
137 | DRV,DR
138 | DVD,DV
139 | DV,DV
140 | ESTATE,EST
141 | ESTATES,EST
142 | EST,EST
143 | ESTS,EST
144 | EXP,EXPY
145 | EXPRESS,EXPY
146 | EXPRESSWAY,EXPY
147 | EXPR,EXPY
148 | EXPW,EXPY
149 | EXPY,EXPY
150 | EXTENSION,EXT
151 | EXT,EXT
152 | EXTN,EXT
153 | EXTNSN,EXT
154 | EXTS,EXT
155 | FALL,FALL
156 | FALLS,FLS
157 | FERRY,FRY
158 | FIELD,FLD
159 | FIELDS,FLDS
160 | FLAT,FLT
161 | FLATS,FLT
162 | FLD,FLD
163 | FLDS,FLDS
164 | FLS,FLS
165 | FLT,FLT
166 | FLTS,FLT
167 | FORD,FRD
168 | FORDS,FRD
169 | FOREST,FRST
170 | FORESTS,FRST
171 | FORGE,FRG
172 | FORGES,FRG
173 | FORG,FRG
174 | FORK,FRK
175 | FORKS,FRKS
176 | FORT,FT
177 | FRD,FRD
178 | FREEWAY,FWY
179 | FREEWY,FWY
180 | FRG,FRG
181 | FRK,FRK
182 | FRKS,FRKS
183 | FRRY,FRY
184 | FRST,FRST
185 | FRT,FT
186 | FRWAY,FWY
187 | FRWY,FWY
188 | FRY,FRY
189 | FT,FT
190 | FWY,FWY
191 | GARDEN,GDNS
192 | GARDENS,GDNS
193 | GARDN,GDNS
194 | GATEWAY,GTWY
195 | GATEWY,GTWY
196 | GATWAY,GTWY
197 | GDN,GDNS
198 | GDNS,GDNS
199 | GLEN,GLN
200 | GLENS,GLN
201 | GLN,GLN
202 | GRDEN,GDNS
203 | GRDN,GDNS
204 | GRDNS,GDNS
205 | GREEN,GRN
206 | GREENS,GRN
207 | GRN,GRN
208 | GROVE,GRV
209 | GROVES,GRV
210 | GROV,GRV
211 | GRV,GRV
212 | GTWAY,GTWY
213 | GTWY,GTWY
214 | HARB,HBR
215 | HARBOR,HBR
216 | HARBORS,HBR
217 | HARBR,HBR
218 | HAVEN,HVN
219 | HAVN,HVN
220 | HBR,HBR
221 | HEIGHT,HTS
222 | HEIGHTS,HTS
223 | HGTS,HTS
224 | HIGHWAY,HWY
225 | HIGHWY,HWY
226 | HILL,HL
227 | HILLS,HLS
228 | HIWAY,HWY
229 | HIWY,HWY
230 | HL,HL
231 | HLLW,HOLW
232 | HLS,HLS
233 | HOLLOW,HOLW
234 | HOLLOWS,HOLW
235 | HOLW,HOLW
236 | HOLWS,HOLW
237 | HRBOR,HBR
238 | HT,HTS
239 | HTS,HTS
240 | HVN,HVN
241 | HWAY,HWY
242 | HWY,HWY
243 | INLET,INLT
244 | INLT,INLT
245 | IS,IS
246 | ISLAND,IS
247 | ISLANDS,ISS
248 | ISLE,ISLE
249 | ISLES,ISLE
250 | ISLND,IS
251 | ISLNDS,ISS
252 | ISS,ISS
253 | JCTION,JCT
254 | JCT,JCT
255 | JCTN,JCT
256 | JCTNS,JCT
257 | JCTS,JCT
258 | JUNCTION,JCT
259 | JUNCTIONS,JCT
260 | JUNCTN,JCT
261 | JUNCTON,JCT
262 | KEY,KY
263 | KEYS,KY
264 | KNL,KNLS
265 | KNLS,KNLS
266 | KNOL,KNLS
267 | KNOLL,KNLS
268 | KNOLLS,KNLS
269 | KY,KY
270 | KYS,KY
271 | LAKE,LK
272 | LAKES,LKS
273 | LANDING,LNDG
274 | LANE,LN
275 | LANES,LN
276 | LCK,LCKS
277 | LCKS,LCKS
278 | LDGE,LDG
279 | LDG,LDG
280 | LF,LF
281 | LGT,LGT
282 | LIGHT,LGT
283 | LIGHTS,LGT
284 | LK,LK
285 | LKS,LKS
286 | LNDG,LNDG
287 | LNDNG,LNDG
288 | LN,LN
289 | LOAF,LF
290 | LOCK,LCKS
291 | LOCKS,LCKS
292 | LODGE,LDG
293 | LODG,LDG
294 | LOOP,LOOP
295 | LOOPS,LOOP
296 | MALL,MALL
297 | MANOR,MNR
298 | MANORS,MNR
299 | MDW,MDWS
300 | MDWS,MDWS
301 | MEADOW,MDWS
302 | MEADOWS,MDWS
303 | MEDOWS,MDWS
304 | MILL,ML
305 | MILLS,MLS
306 | MISSION,MSN
307 | MISSN,MSN
308 | ML,ML
309 | MLS,MLS
310 | MNR,MNR
311 | MNRS,MNR
312 | MNTAIN,MTN
313 | MNT,MT
314 | MNTN,MTN
315 | MNTNS,MTN
316 | MOUNTAIN,MTN
317 | MOUNTAINS,MTN
318 | MOUNTIN,MTN
319 | MOUNT,MT
320 | MSN,MSN
321 | MSSN,MSN
322 | MTIN,MTN
323 | MT,MT
324 | MTN,MTN
325 | NCK,NCK
326 | NECK,NCK
327 | ORCHARD,ORCH
328 | ORCH,ORCH
329 | ORCHRD,ORCH
330 | OVAL,OVAL
331 | OVL,OVAL
332 | PARK,PARK
333 | PARKS,PARK
334 | PARKWAY,PKY
335 | PARKWAYS,PKY
336 | PARKWY,PKY
337 | PASS,PASS
338 | PATH,PATH
339 | PATHS,PATH
340 | PIKE,PIKE
341 | PIKES,PIKE
342 | PINE,PNES
343 | PINES,PNES
344 | PK,PARK
345 | PKWAY,PKY
346 | PKWY,PKY
347 | PKWYS,PKY
348 | PKY,PKY
349 | PLACE,PL
350 | PLAINES,PLNS
351 | PLAIN,PLN
352 | PLAINS,PLNS
353 | PLAZA,PLZ
354 | PLN,PLN
355 | PLNS,PLNS
356 | PL,PL
357 | PLZA,PLZ
358 | PLZ,PLZ
359 | PNES,PNES
360 | POINT,PT
361 | POINTS,PT
362 | PORT,PRT
363 | PORTS,PRT
364 | PRAIRIE,PR
365 | PRARIE,PR
366 | PRK,PARK
367 | PR,PR
368 | PRR,PR
369 | PRT,PRT
370 | PRTS,PRT
371 | PT,PT
372 | PTS,PT
373 | RADIAL,RADL
374 | RADIEL,RADL
375 | RADL,RADL
376 | RAD,RADL
377 | RANCHES,RNCH
378 | RANCH,RNCH
379 | RAPID,RPDS
380 | RAPIDS,RPDS
381 | RDGE,RDG
382 | RDG,RDG
383 | RDGS,RDG
384 | RD,RD
385 | RDS,RD
386 | REST,RST
387 | RIDGE,RDG
388 | RIDGES,RDG
389 | RIVER,RIV
390 | RIV,RIV
391 | RIVR,RIV
392 | RNCH,RNCH
393 | RNCHS,RNCH
394 | ROAD,RD
395 | ROADS,RD
396 | ROW,ROW
397 | RPD,RPDS
398 | RPDS,RPDS
399 | RST,RST
400 | RUN,RUN
401 | RVR,RIV
402 | SHL,SHL
403 | SHLS,SHLS
404 | SHOAL,SHL
405 | SHOALS,SHLS
406 | SHOAR,SHR
407 | SHOARS,SHRS
408 | SHORE,SHR
409 | SHORES,SHRS
410 | SHR,SHR
411 | SHRS,SHRS
412 | SMT,SMT
413 | SPG,SPG
414 | SPGS,SPGS
415 | SPNG,SPG
416 | SPNGS,SPGS
417 | SPRING,SPG
418 | SPRINGS,SPGS
419 | SPRNG,SPG
420 | SPRNGS,SPGS
421 | SPUR,SPUR
422 | SPURS,SPUR
423 | SQRE,SQ
424 | SQR,SQ
425 | SQ,SQ
426 | SQUARE,SQ
427 | SQUARES,SQ
428 | SQU,SQ
429 | STA,STA
430 | STATION,STA
431 | STATN,STA
432 | STN,STA
433 | STRA,STRA
434 | STRAVEN,STRA
435 | STRAVENUE,STRA
436 | STRAVE,STRA
437 | STRAVN,STRA
438 | STRAV,STRA
439 | STREAM,STRM
440 | STREETS,ST
441 | STREET,ST
442 | STREME,STRM
443 | STRM,STRM
444 | STR,ST
445 | STRT,ST
446 | STRVN,STRA
447 | STRVNUE,STRA
448 | ST,ST
449 | SUMIT,SMT
450 | SUMITT,SMT
451 | SUMMIT,SMT
452 | TERRACE,TER
453 | TERR,TER
454 | TER,TER
455 | TPKE,TPKE
456 | TPK,TPKE
457 | TRACES,TRCE
458 | TRACE,TRCE
459 | TRACKS,TRAK
460 | TRACK,TRAK
461 | TRAFFICWAY,TRFY
462 | TRAILS,TRL
463 | TRAIL,TRL
464 | TRAK,TRAK
465 | TRCE,TRCE
466 | TRFY,TRFY
467 | TRKS,TRAK
468 | TRK,TRAK
469 | TRLS,TRL
470 | TRL,TRL
471 | TRNPK,TPKE
472 | TR,TRL
473 | TUNEL,TUNL
474 | TUNLS,TUNL
475 | TUNL,TUNL
476 | TUNNELS,TUNL
477 | TUNNEL,TUNL
478 | TUNNL,TUNL
479 | TURNPIKE,TPKE
480 | TURNPK,TPKE
481 | UNIONS,UN
482 | UNION,UN
483 | UN,UN
484 | VALLEYS,VLY
485 | VALLEY,VLY
486 | VALLY,VLY
487 | VDCT,VIA
488 | VIADCT,VIA
489 | VIADUCT,VIA
490 | VIA,VIA
491 | VIEWS,VW
492 | VIEW,VW
493 | VILLAGES,VLG
494 | VILLAGE,VLG
495 | VILLAG,VLG
496 | VILLE,VL
497 | VILLG,VLG
498 | VILLIAGE,VLG
499 | VILL,VLG
500 | VISTA,VIS
501 | VIST,VIS
502 | VIS,VIS
503 | VLGS,VLG
504 | VLG,VLG
505 | VLLY,VLY
506 | VL,VL
507 | VLYS,VLY
508 | VLY,VLY
509 | VSTA,VIS
510 | VST,VIS
511 | VWS,VW
512 | VW,VW
513 | WALKS,WALK
514 | WALK,WALK
515 | WAYS,WAY
516 | WAY,WAY
517 | WELLS,WLS
518 | WELL,WLS
519 | WLS,WLS
520 | WY,WAY
521 | XING,XING
522 | 


--------------------------------------------------------------------------------
/address/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SwoopSearch/pyaddress/62ebb07a6840e710d256406a8ec1d06abec0e1c4/address/test/__init__.py


--------------------------------------------------------------------------------
/address/test/test_address.py:
--------------------------------------------------------------------------------
  1 | import unittest
  2 | from ..address import Address, AddressParser
  3 | 
  4 | 
  5 | class AddressTest(unittest.TestCase):
  6 |     parser = None
  7 | 
  8 |     def setUp(self):
  9 |         self.parser = AddressParser()
 10 | 
 11 |     def test_basic_full_address(self):
 12 |         addr = Address("2 N. Park Street, Madison, WI 53703", self.parser)
 13 | #        print addr
 14 |         self.assertTrue(addr.house_number == "2")
 15 |         self.assertTrue(addr.street_prefix == "N.")
 16 |         self.assertTrue(addr.street == "Park")
 17 |         self.assertTrue(addr.street_suffix == "St.")
 18 |         self.assertTrue(addr.city == "Madison")
 19 |         self.assertTrue(addr.state == "WI")
 20 |         self.assertTrue(addr.zip == "53703")
 21 |         self.assertTrue(addr.apartment == None)
 22 |         # self.assertTrue(addr.building == None)
 23 | 
 24 |     def test_multi_address(self):
 25 |         addr = Address("416/418 N. Carroll St.", self.parser)
 26 | #        print addr
 27 |         self.assertTrue(addr.house_number == "416")
 28 |         self.assertTrue(addr.street_prefix == "N.")
 29 |         self.assertTrue(addr.street == "Carroll")
 30 |         self.assertTrue(addr.street_suffix == "St.")
 31 |         self.assertTrue(addr.city == None)
 32 |         self.assertTrue(addr.state == None)
 33 |         self.assertTrue(addr.zip == None)
 34 |         self.assertTrue(addr.apartment == None)
 35 |         # self.assertTrue(addr.building == None)
 36 | 
 37 |     def test_no_suffix(self):
 38 |         addr = Address("230 Lakelawn", self.parser)
 39 | #        print addr
 40 |         self.assertTrue(addr.house_number == "230")
 41 |         self.assertTrue(addr.street_prefix == None)
 42 |         self.assertTrue(addr.street == "Lakelawn")
 43 |         self.assertTrue(addr.street_suffix == None)
 44 |         self.assertTrue(addr.city == None)
 45 |         self.assertTrue(addr.state == None)
 46 |         self.assertTrue(addr.zip == None)
 47 |         self.assertTrue(addr.apartment == None)
 48 |         # self.assertTrue(addr.building == None)
 49 | 
 50 | #     def test_building_in_front(self):
 51 | #         addr = Address("Roundhouse Apartments 626 Langdon", self.parser)
 52 | # #        print addr
 53 | #         self.assertTrue(addr.house_number == "626")
 54 | #         self.assertTrue(addr.street_prefix == None)
 55 | #         self.assertTrue(addr.street == "Langdon")
 56 | #         self.assertTrue(addr.street_suffix == None)
 57 | #         self.assertTrue(addr.city == None)
 58 | #         self.assertTrue(addr.state == None)
 59 | #         self.assertTrue(addr.zip == None)
 60 | #         self.assertTrue(addr.apartment == None)
 61 | #         # self.assertTrue(addr.building == "Roundhouse Apartments")
 62 | 
 63 |     def test_streets_named_after_states(self):
 64 |         addr = Address("504 W. Washington Ave.", self.parser)
 65 | #        print addr
 66 |         self.assertTrue(addr.house_number == "504")
 67 |         self.assertTrue(addr.street_prefix == "W.")
 68 |         self.assertTrue(addr.street == "Washington")
 69 |         self.assertTrue(addr.street_suffix == "Ave.")
 70 |         self.assertTrue(addr.city == None)
 71 |         self.assertTrue(addr.state == None)
 72 |         self.assertTrue(addr.zip == None)
 73 |         self.assertTrue(addr.apartment == None)
 74 |         # self.assertTrue(addr.building == None)
 75 | 
 76 |     def test_hash_apartment(self):
 77 |         addr = Address("407 West Doty St. #2", self.parser)
 78 | #        print addr
 79 |         self.assertTrue(addr.house_number == "407")
 80 |         self.assertTrue(addr.street_prefix == "W.")
 81 |         self.assertTrue(addr.street == "Doty")
 82 |         self.assertTrue(addr.street_suffix == "St.")
 83 |         self.assertTrue(addr.city == None)
 84 |         self.assertTrue(addr.state == None)
 85 |         self.assertTrue(addr.zip == None)
 86 |         self.assertTrue(addr.apartment == "#2")
 87 |         # self.assertTrue(addr.building == None)
 88 | 
 89 |     def test_stray_dash_apartment(self):
 90 |         addr = Address("407 West Doty St. - #2", self.parser)
 91 |         #        print addr
 92 |         self.assertTrue(addr.house_number == "407")
 93 |         self.assertTrue(addr.street_prefix == "W.")
 94 |         self.assertTrue(addr.street == "Doty")
 95 |         self.assertTrue(addr.street_suffix == "St.")
 96 |         self.assertTrue(addr.city == None)
 97 |         self.assertTrue(addr.state == None)
 98 |         self.assertTrue(addr.zip == None)
 99 |         self.assertTrue(addr.apartment == "#2")
100 |         # self.assertTrue(addr.building == None)
101 | 
102 |     def test_suffixless_street_with_city(self):
103 |         addr = Address("431 West Johnson, Madison, WI", self.parser)
104 | #        print addr
105 |         self.assertTrue(addr.house_number == "431")
106 |         self.assertTrue(addr.street_prefix == "W.")
107 |         self.assertTrue(addr.street == "Johnson")
108 |         self.assertTrue(addr.street_suffix == None)
109 |         self.assertTrue(addr.city == "Madison")
110 |         self.assertTrue(addr.state == "WI")
111 |         self.assertTrue(addr.zip == None)
112 |         self.assertTrue(addr.apartment == None)
113 |         # self.assertTrue(addr.building == None)
114 | 
115 | 
class AddressParserTest(unittest.TestCase):
    """Checks the lookup tables exposed by a freshly built AddressParser."""

    ap = None

    def setUp(self):
        self.ap = AddressParser()

    def test_load_suffixes(self):
        self.assertEqual(self.ap.suffixes["ALLEY"], "ALY")

    def test_load_cities(self):
        self.assertIn("wisconsin rapids", self.ap.cities)

    def test_load_states(self):
        self.assertEqual(self.ap.states["Wisconsin"], "WI")

    # Not using preloaded streets any more.
#    def test_load_streets(self):
#        self.assertTrue("mifflin" in self.ap.streets)
134 | 
# Run every test in this module when the file is executed as a script.
# NOTE(review): the relative import at the top of this module presumably needs
# package context (e.g. `python -m address.test.test_address`) -- confirm.
if __name__ == '__main__':
    unittest.main()
137 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from distutils.core import setup
 2 | 
 3 | setup(
 4 |     name='address',
 5 |     version='0.1.1',
 6 |     url='https://github.com/SwoopSearch/pyaddress',
 7 |     author='Swoop Search LLC, Josh Gachnang, Rob Jauquet',
 8 |     author_email='Josh@SwoopSrch.com',
 9 |     description='address is an address parsing library, taking the guesswork out of using addresses in your applications.',
10 |     long_description=open('README.rst', 'rt').read(),
11 |     #data_files=[('', ['README.rst','pyaddress/cities.csv', 'pyaddress/suffixes.csv', 'pyaddress/streets.csv', 'pyaddress/tests.py', 'pyaddress/test_list.py'])],
12 |     packages=['address'],
13 |     package_dir={'address': 'address'},
14 |     package_data={'address': ['cities.csv', 'streets.csv', 'suffixes.csv']},
15 |     classifiers=[
16 |         "License :: OSI Approved :: BSD License",
17 |         "Natural Language :: English",
18 |         "Programming Language :: Python :: 2 :: Only",
19 |         "Topic :: Software Development :: Libraries",
20 |         "Topic :: Text Processing",
21 |     ],
22 |     keywords = "example documentation tutorial",
23 |     maintainer="Swoop Search LLC, Josh Gachnang, Rob Jauquet",
24 |     maintainer_email="Josh@SwoopSrch.com",
25 | )
26 | 


--------------------------------------------------------------------------------