├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README
├── README.rst
├── address
├── __init__.py
├── address.py
├── address_list.py
├── cities.csv
├── dstk.py
├── streets.csv
├── suffixes.csv
└── test
│ ├── __init__.py
│ └── test_address.py
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | apartment_list.csv
2 | output.txt
3 | *.py[cod]
4 | .idea/
5 | MANIFEST
6 |
7 | # C extensions
8 | *.so
9 |
10 | # Packages
11 | *.egg
12 | *.egg-info
13 | dist
14 | build
15 | eggs
16 | parts
17 | bin
18 | var
19 | sdist
20 | develop-eggs
21 | .installed.cfg
22 | lib
23 | lib64
24 |
25 | # Installer logs
26 | pip-log.txt
27 |
28 | # Unit test / coverage reports
29 | .coverage
30 | .tox
31 | nosetests.xml
32 |
33 | # Translations
34 | *.mo
35 |
36 | # Mr Developer
37 | .mr.developer.cfg
38 | .project
39 | .pydevproject
40 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2013, Swoop Search LLC
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
5 |
6 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
7 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
8 | Neither the name of Swoop Search LLC nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
9 |
10 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
11 | AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
12 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
13 | IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.rst
2 |
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
1 | address
2 | =========
3 |
4 | address is an address parsing library, taking the guesswork out of using addresses in your applications. We use it as part of our apartment search and apartment spider applications.
5 |
6 | Installation
7 | ------------
8 |
9 | `pip install address`
10 |
11 | Example
12 | -------
13 |
14 | First, we create an AddressParser. AddressParser allows us to feed in lists of cities, streets, and address suffixes. Then we call
15 | parse_address on our address string, which returns an Address instance with all the attributes filled out. From there, we can
16 | print parts of the address, change them, validate them, create a database model to store them, or anything else.
17 |
18 | ```python
19 | from address import AddressParser, Address
20 |
21 | ap = AddressParser()
22 | address = ap.parse_address('123 West Mifflin Street, Madison, WI, 53703')
23 | print "Address is: {0} {1} {2} {3}".format(address.house_number, address.street_prefix, address.street, address.street_suffix)
24 |
25 | > Address is: 123 W. Mifflin St.
26 | ```
27 |
28 | AddressParser
29 | -------------
30 |
31 | `AddressParser(self, suffixes=None, cities=None, streets=None)`
32 |
33 | suffixes, cities, and streets all accept lists as arguments. If you leave them as none, they will read default files
34 | from the package, namely suffixes.csv, cities.csv, and streets.csv. Streets is intentionally blank.
35 |
36 | You can provide lists of acceptable suffixes, cities, and streets to lower your false positives. If you know all
37 | the addresses you are processing are in a small area, you can provide a list of the cities in the area and should
38 | get more accurate results. If you are only doing one city, you could provide that single city in a list, and a list
39 | of all streets in that city.
40 |
41 |
42 | Address
43 | -------
44 |
45 | Addresses get returned by AddressParser.parse_address(). They have the following attributes:
46 |
47 | `house_number`
48 |
49 | The number on a house. This is required for all valid addresses. E.g. __123__ W. Mifflin St.
50 |
51 | `street_prefix`
52 |
53 | The direction before the street name. Always represented as one or two letters followed by a period. Not required.
54 | E.g. 123 __W.__ Mifflin St.
55 |
56 | `street`
57 |
58 | The name of the street. Potentially multiple words. This is required for a valid address. E.g. 123 W. __Mifflin__ St.
59 |
60 | `street_suffix`
61 |
62 | The ending of a street. This will always be the USPS abbreviation followed by a period. Not required, but highly recommended.
63 | E.g. 123 W. Mifflin __St.__
64 |
65 | `apartment`
66 |
67 | Apartment number or unit style or any number of things signifying a specific part of an address. Not required. E.g. 123
68 | W. Mifflin St. __Apt 10__
69 |
70 | `building`
71 |
72 | Sometimes addresses are grouped into buildings, or are more commonly known by building names. Not required, and often
73 | in parentheses. E.g. 123 W. Mifflin St. Apt 10 __(The Estates)__
74 |
75 | `city`
76 |
77 | The city part of the address, preferably following a comma. E.g. 123 W. Mifflin St., __Madison__, WI 53703
78 |
79 | `state`
80 |
81 | The state of the address, preferably following the city and a comma. Always two capitalized letters. E.g. 123 W. Mifflin St., Madison, __WI__ 53703
82 |
83 | `zip`
84 |
85 | The 5 digit zip code of the address, preferably following the state. 9 digit zips not yet supported. E.g. 123 W. Mifflin St., Madison, WI __53703__
86 |
87 | `full_address()`
88 |
89 | Returns a human readable version of the address for display. Follows the same style rules as the above attributes.
90 | Example return: (The Estates) 123 W. Mifflin St. Apt 10, Madison, WI 53703
91 |
92 | Todo
93 | ----
94 |
95 | * Add verification of an address through Google Maps API, given an API key.
96 |
97 | * Allow custom validation conditions in AddressParser for what counts as a correct address or not.
98 |
99 | * Add exceptions for incorrect addresses instead of silent failing and letting user validate.
100 |
101 | GitHub
102 | ------
103 |
104 | File support requests and obtain the source from https://github.com/SwoopSearch/pyaddress
105 |
106 | Authors
107 | -------
108 |
109 | * Josh Gachnang
110 |
111 | * Rob Jauquet
112 |
113 | License and Copyright
114 | -------
115 |
116 | Copyright (c) 2013 Swoop Search LLC.
117 |
118 | This library is released under the New BSD License.
119 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | address
2 | =========
3 |
4 | address is an address parsing library, taking the guesswork out of
5 | using addresses in your applications. We use it as part of our apartment
6 | search and apartment spider applications.
7 |
8 | Installation
9 | ------------
10 |
11 | ::
12 |
13 | pip install address
14 |
15 | Example
16 | -------
17 |
18 | First, we create an AddressParser. AddressParser allows us to feed in
19 | lists of cities, streets, and address suffixes. Then we call
20 | parse\_address on our address string, which returns an Address instance
21 | with all the attributes filled out. From there, we can print parts of
22 | the address, change them, validate them, create a database model to
23 | store them, or anything else.
24 |
25 | ::
26 |
27 | from address import AddressParser, Address
28 |
29 | ap = AddressParser()
30 | address = ap.parse_address('123 West Mifflin Street, Madison, WI, 53703')
31 | print "Address is: {0} {1} {2} {3}".format(address.house_number, address.street_prefix, address.street, address.street_suffix)
32 |
33 | > Address is: 123 W. Mifflin St.
34 |
35 | AddressParser
36 | -------------
37 |
38 | ``AddressParser(self, suffixes=None, cities=None, streets=None)``
39 |
40 | suffixes, cities, and streets all accept lists as arguments. If you
41 | leave them as none, they will read default files from the package,
42 | namely suffixes.csv, cities.csv, and streets.csv. Streets is
43 | intentionally blank.
44 |
45 | You can provide lists of acceptable suffixes, cities, and streets to
46 | lower your false positives. If you know all the addresses you are
47 | processing are in a small area, you can provide a list of the cities in
48 | the area and should get more accurate results. If you are only doing one
49 | city, you could provide that single city in a list, and a list of all
50 | streets in that city.
51 |
52 | Address
53 | -------
54 |
55 | Addresses get returned by AddressParser.parse\_address(). They have the
56 | following attributes:
57 |
58 | ``house_number``
59 |
60 | The number on a house. This is required for all valid addresses. E.g.
61 | **123** W. Mifflin St.
62 |
63 | ``street_prefix``
64 |
65 | The direction before the street name. Always represented as one or two
66 | letters followed by a period. Not required. E.g. 123 **W.** Mifflin St.
67 |
68 | ``street``
69 |
70 | The name of the street. Potentially multiple words. This is required for
71 | a valid address. E.g. 123 W. **Mifflin** St.
72 |
73 | ``street_suffix``
74 |
75 | The ending of a street. This will always be the USPS abbreviation
76 | followed by a period. Not required, but highly recommended. E.g. 123 W.
77 | Mifflin **St.**
78 |
79 | ``apartment``
80 |
81 | Apartment number or unit style or any number of things signifying a
82 | specific part of an address. Not required. E.g. 123 W. Mifflin St. **Apt
83 | 10**
84 |
85 | ``building``
86 |
87 | Sometimes addresses are grouped into buildings, or are more commonly
88 | known by building names. Not required, and often in parentheses. E.g.
89 | 123 W. Mifflin St. Apt 10 **(The Estates)**
90 |
91 | ``city``
92 |
93 | The city part of the address, preferably following a comma. E.g. 123 W.
94 | Mifflin St., **Madison**, WI 53703
95 |
96 | ``state``
97 |
98 | The state of the address, preferably following the city and a comma.
99 | Always two capitalized letters. E.g. 123 W. Mifflin St., Madison, **WI**
100 | 53703
101 |
102 | ``zip``
103 |
104 | The 5 digit zip code of the address, preferably following the state. 9
105 | digit zips not yet supported. E.g. 123 W. Mifflin St., Madison, WI
106 | **53703**
107 |
108 | ``full_address()``
109 |
110 | Returns a human readable version of the address for display. Follows the
111 | same style rules as the above attributes. Example return: (The Estates)
112 | 123 W. Mifflin St. Apt 10, Madison, WI 53703
113 |
114 | Todo
115 | ----
116 |
117 | - Add verification of an address through Google Maps API, given an API
118 | key.
119 |
120 | - Allow custom validation conditions in AddressParser for what counts
121 | as a correct address or not.
122 |
123 | - Add exceptions for incorrect addresses instead of silent failing and
124 | letting user validate.
125 |
126 | GitHub
127 | ------
128 |
129 | File support requests and obtain the source from
130 | https://github.com/SwoopSearch/pyaddress
131 |
132 | Authors
133 | -------
134 |
135 | - Josh Gachnang
136 |
137 | - Rob Jauquet
138 |
139 | License and Copyright
140 | ---------------------
141 |
142 | Copyright (c) 2013 Swoop Search LLC.
143 |
144 | This library is released under the New BSD License.
145 |
--------------------------------------------------------------------------------
/address/__init__.py:
--------------------------------------------------------------------------------
1 | from .address import Address, AddressParser
2 |
--------------------------------------------------------------------------------
/address/address.py:
--------------------------------------------------------------------------------
1 | # Meant to parse out address lines, minus city,state,zip into a usable dict for address matching
2 | # Ignores periods and commas, because no one cares.
3 |
4 | import re
5 | import csv
6 | import os
7 | import dstk
8 | import sys
9 |
# Keep lowercase, no periods
# House-number pattern: one or more digits, then an optional dash, then
# optional further digits (matches "123", "123-125" and "123-").
street_num_regex = r'^(\d+)(-?)(\d*)$'

# Optional '#', then optional digits, then optional word characters. Every
# group is optional, so this pattern also matches the empty string.
apartment_regex_number = r'(#?)(\d*)(\w*)'
# Directory containing this module; AddressParser joins it with the names of
# the bundled csv files.
cwd = os.path.dirname(os.path.realpath(__file__))
16 |
17 |
class AddressParser(object):
    """
    Factory for Address objects.

    Holds the lookup data (street suffixes, cities, streets, street prefixes
    and states) that an Address consults while parsing. Defaults are loaded
    from the csv files that sit next to this module, but each list can be
    replaced through the constructor.
    """
    # Class-level defaults. They are kept for backward compatibility but are
    # never mutated: the load_* methods always bind a fresh container on the
    # instance, so parsers do not leak data into each other.
    suffixes = {}
    # Lower case list of cities, used as a hint
    cities = []
    # Lower case list of streets, used as a hint
    streets = []
    prefixes = {
        "n": "N.", "e": "E.", "s": "S.", "w": "W.", "ne": "NE.", "nw": "NW.", 'se': "SE.", 'sw': "SW.", 'north': "N.",
        'east': "E.", 'south': "S.",
        'west': "W.", 'northeast': "NE.", 'northwest': "NW.", 'southeast': "SE.", 'southwest': "SW."}
    states = {
        'Mississippi': 'MS', 'Oklahoma': 'OK', 'Delaware': 'DE', 'Minnesota': 'MN', 'Illinois': 'IL', 'Arkansas': 'AR',
        'New Mexico': 'NM', 'Indiana': 'IN', 'Maryland': 'MD', 'Louisiana': 'LA', 'Idaho': 'ID', 'Wyoming': 'WY',
        'Tennessee': 'TN', 'Arizona': 'AZ', 'Iowa': 'IA', 'Michigan': 'MI', 'Kansas': 'KS', 'Utah': 'UT',
        'Virginia': 'VA', 'Oregon': 'OR', 'Connecticut': 'CT', 'Montana': 'MT', 'California': 'CA',
        'Massachusetts': 'MA', 'West Virginia': 'WV', 'South Carolina': 'SC', 'New Hampshire': 'NH',
        'Wisconsin': 'WI', 'Vermont': 'VT', 'Georgia': 'GA', 'North Dakota': 'ND', 'Pennsylvania': 'PA',
        'Florida': 'FL', 'Alaska': 'AK', 'Kentucky': 'KY', 'Hawaii': 'HI', 'Nebraska': 'NE', 'Missouri': 'MO',
        'Ohio': 'OH', 'Alabama': 'AL', 'New York': 'NY', 'South Dakota': 'SD', 'Colorado': 'CO', 'New Jersey': 'NJ',
        'Washington': 'WA', 'North Carolina': 'NC', 'District of Columbia': 'DC', 'Texas': 'TX', 'Nevada': 'NV',
        'Maine': 'ME', 'Rhode Island': 'RI'}

    def __init__(self, suffixes=None, cities=None, streets=None, backend="default", dstk_api_base=None, logger=None,
                 required_confidence=0.65):
        """
        Build a parser, optionally overriding the bundled lookup data.

        suffixes: dict mapping long suffix spellings to their abbreviations. When empty or None,
            suffixes.csv from the package directory is loaded.
        cities: list of lower case city names. When empty or None, cities.csv is loaded. The bundled
            list is expansive and may cause false positives; pass a short list if only a few cities
            can occur.
        streets: list of lower case street names. When empty or None, streets.csv is loaded.
        backend: "default" or "dstk".
        dstk_api_base: base URL of the DSTK server, e.g. 'http://example.com'. Required for "dstk".
        logger: optional logger; only its debug() method is used here.
        required_confidence: stored on the instance as-is.

        Raises ValueError for an unknown backend, or for the dstk backend without dstk_api_base.
        """
        self.logger = logger
        self.backend = backend
        self.dstk_api_base = dstk_api_base
        self.required_confidence = required_confidence
        if suffixes:
            self.suffixes = suffixes
        else:
            self.load_suffixes(os.path.join(cwd, "suffixes.csv"))
        if cities:
            self.cities = cities
        else:
            self.load_cities(os.path.join(cwd, "cities.csv"))
        if streets:
            self.streets = streets
        else:
            self.load_streets(os.path.join(cwd, "streets.csv"))
        if backend == "dstk":
            if dstk_api_base is None:
                raise ValueError("dstk_api_base is required for dstk backend.")
            self.dstk = dstk.DSTK({'apiBase': dstk_api_base})
        elif backend != "default":
            raise ValueError("backend must be either 'default' or 'dstk'.")

    def parse_address(self, address, line_number=-1):
        """
        Return an Address object built from the given address string. The parser passes itself to
        the Address constructor so the custom suffixes, cities, etc. are used.
        """
        return Address(address, self, line_number, self.logger)

    def dstk_multi_address(self, address_list):
        """
        Send address_list to DSTK's street2coordinates and return a list of Address objects, one for
        every entry that DSTK answered with a non-None value.

        Entries whose Address construction raises InvalidAddressException or
        DSTKConfidenceTooLowException are skipped silently.

        Raises ValueError when the parser was not created with the dstk backend.
        """
        if self.backend != "dstk":
            raise ValueError("Only allowed for DSTK backends.")
        if self.logger:
            self.logger.debug("Sending {0} possible addresses to DSTK".format(len(address_list)))
        multi_address = self.dstk.street2coordinates(address_list)
        if self.logger:
            self.logger.debug("Received {0} addresses from DSTK".format(len(multi_address)))
        addresses = []
        for address, dstk_return in multi_address.items():
            if dstk_return is None:
                continue
            try:
                parsed = Address(address, self, -1, self.logger, dstk_pre_parse=dstk_return)
            except (InvalidAddressException, DSTKConfidenceTooLowException):
                # Best effort: drop addresses DSTK could not resolve well enough.
                continue
            addresses.append(parsed)
            if self.logger:
                self.logger.debug("DSTK Address Appended: {0}".format(dstk_return))
        return addresses

    def load_suffixes(self, filename):
        """
        Add the suffixes found in filename to this parser's suffix dictionary. Keys are the possible
        long versions and values are the accepted abbreviations. Lines that do not consist of
        exactly two comma separated fields are ignored.
        """
        # Copy first so the class-level default (or a dict shared with another parser) is untouched.
        loaded = dict(self.suffixes)
        with open(filename, 'r') as f:
            for line in f:
                fields = line.strip().split(',')
                # Make sure we have key and value
                if len(fields) != 2:
                    continue
                loaded[fields[0]] = fields[1]
        self.suffixes = loaded

    def load_cities(self, filename):
        """
        Add every line of filename, stripped and lower-cased, to this parser's city list. The file
        should have one city per line, with no extra characters.
        """
        # Copy first so the class-level default list is never mutated.
        loaded = list(self.cities)
        with open(filename, 'r') as f:
            loaded.extend(line.strip().lower() for line in f)
        self.cities = loaded

    def load_streets(self, filename):
        """
        Add every line of filename, stripped and lower-cased, to this parser's street list. The file
        should have one street per line, with no extra characters.
        """
        # Copy first so the class-level default list is never mutated.
        loaded = list(self.streets)
        with open(filename, 'r') as f:
            loaded.extend(line.strip().lower() for line in f)
        self.streets = loaded
145 |
146 |
147 | # Procedure: Go through backwards. First check for apartment number, then
148 | # street suffix, street name, street prefix, then building. For each sub,
149 | # check if that spot is already filled in the dict.
150 | class Address:
151 | unmatched = False
152 | house_number = None
153 | street_prefix = None
154 | street = None
155 | street_suffix = None
156 | apartment = None
157 | # building = None
158 | city = None
159 | state = None
160 | zip = None
161 | original = None
162 | # Only set for dstk
163 | lat = None
164 | lng = None
165 | last_matched = None
166 | unmatched = False
167 | # Only used for debug
168 | line_number = -1
169 | # Confidence value from DSTK. 0 - 1, -1 for not set.
170 | confidence = -1
171 |
def __init__(self, address, parser, line_number=-1, logger=None, dstk_pre_parse=None):
    """
    Parse address with the backend selected on parser and store the parts on this instance.

    @address: the raw address string. When it is None nothing is parsed and no validation runs.
    @parser: the AddressParser supplying lookup data and the backend name.
    @line_number: stored for debugging only.
    @logger: optional logger, stored on the instance.
    @dstk_pre_parse: a single value from a dstk multiple street2coordinates return. @address would be the key then.

    Raises ValueError when parser.backend is neither 'default' nor 'dstk', and
    InvalidAddressException when parsing produced no house number or no street.
    """
    self.parser = parser
    self.line_number = line_number
    self.original = self._clean(address)
    self.logger = logger
    if address is None:
        return
    address = self.preprocess_address(address)
    if parser.backend == "dstk":
        self.dstk_parse(address, parser, pre_parsed_address=dstk_pre_parse)
    elif parser.backend == "default":
        self.parse_address(address)
    else:
        raise ValueError("Parser gave invalid backend, must be either 'default' or 'dstk'.")

    house_number = self.house_number
    if house_number is not None:
        # The default backend stores the house number as a string. Ordering a string against 0
        # raises TypeError on Python 3 (and is always False on Python 2), so only numeric values
        # are really checked for being non-positive.
        try:
            non_positive = house_number <= 0
        except TypeError:
            non_positive = False
    if house_number is None or non_positive:
        raise InvalidAddressException("Addresses must have house numbers.")
    if self.street is None or self.street == "":
        raise InvalidAddressException("Addresses must have streets.")
197 |
def parse_address(self, address):
    """
    Tokenize address and run every token, last to first, through the check_* methods.

    Periods are dropped and commas only separate tokens; the comma separated pieces are kept on
    self.comma_separated_address for later guessing. Tokens that no check accepts are offered to
    check_apartment_number afterwards; if that also declines, self.unmatched is set to True.
    """
    without_periods = address.strip().replace('.', '')
    # We'll use this for guessing.
    self.comma_separated_address = without_periods.split(',')
    words = without_periods.replace(',', '').split()

    # Order matters: each token goes to the first check that accepts it.
    checks = (
        self.check_zip,
        self.check_state,
        self.check_city,
        self.check_street_suffix,
        self.check_house_number,
        self.check_street_prefix,
        self.check_street,
        self.guess_unmatched,
    )

    # USPS says parse from the back.
    leftovers = [word for word in reversed(words)
                 if not any(check(word) for check in checks)]

    # Post processing: whatever is left may still be an apartment number.
    for word in leftovers:
        if not self.check_apartment_number(word):
            self.unmatched = True
249 |
def preprocess_address(self, address):
    """
    Clean up a raw address string before tokenizing and return the cleaned string.

    Steps, in order: join '# ' and ' & ' with their neighbours, drop 'X units' phrases, cut out
    every apartment-looking fragment (storing the cleaned match on self.apartment; a later
    pattern's match overwrites an earlier one), and collapse ', ,' into a single comma.
    """
    # Run some basic cleaning
    address = address.replace("# ", "#")
    address = address.replace(" & ", "&")
    # Clear the address of things like 'X units', which shouldn't be in an address anyway. We won't save this for now.
    address = re.sub(r"-?-?\w+ units", "", address, flags=re.IGNORECASE)
    # Now let's get the apartment stuff out of the way. Using only sure match regexes, delete apartment parts from
    # the address. This prevents things like "Unit" being the street name.
    # All patterns are raw strings so that \w reaches the regex engine untouched.
    apartment_regexes = [r'#\w+ & \w+', r'#\w+ rm \w+', r"#\w+-\w", r'apt #{0,1}\w+', r'apartment #{0,1}\w+', r'#\w+',
                         r'# \w+', r'rm \w+', r'unit #?\w+', r'units #?\w+', r'- #{0,1}\w+', r'no\s?\d+\w*',
                         r'style\s\w{1,2}', r'townhouse style\s\w{1,2}']
    for regex in apartment_regexes:
        apartment_match = re.search(regex, address, re.IGNORECASE)
        if apartment_match:
            self.apartment = self._clean(apartment_match.group())
            address = re.sub(regex, "", address, flags=re.IGNORECASE)
    # Now check for things like ", ," which throw off dstk
    address = re.sub(r"\,\s*\,", ",", address)
    return address
280 |
def check_zip(self, token):
    """
    Store token as the zip code and return True when it is exactly five digits.

    Only attempted while no zip has been stored and self.last_matched is still None; in every
    other situation the token is rejected with False.
    """
    if self.zip is not None:
        return False
    if self.last_matched is not None:
        return False
    five_digits = len(token) == 5 and re.match(r"\d{5}", token)
    if not five_digits:
        return False
    self.zip = self._clean(token)
    return True
296 |
def check_state(self, token):
    """
    Recognize token as a state, by full name (key of parser.states) or abbreviation (value).

    Two-character tokens are always tried while no state is stored. Longer tokens are tried only
    while no street suffix has been found and the address contained at least one comma. On a
    match the abbreviation is stored on self.state and True is returned.
    """
    def store_if_state():
        # Full state names are keyed capitalized; abbreviations are upper case values.
        as_name = token.capitalize()
        if as_name in self.parser.states.keys():
            self.state = self._clean(self.parser.states[as_name])
            return True
        as_abbreviation = token.upper()
        if as_abbreviation in self.parser.states.values():
            self.state = self._clean(as_abbreviation)
            return True
        return False

    if len(token) == 2 and self.state is None and store_if_state():
        return True
    if self.state is None and self.street_suffix is None and len(self.comma_separated_address) > 1:
        return store_if_state()
    return False
317 |
def check_city(self, token):
    """
    Check if token is a known city from the parser's (lower case) city list, or extends the city
    found so far into a multi word city. Must come before the suffix.

    Returns True and updates self.city on a match, False otherwise.
    """
    shortened_cities = {'saint': 'st.'}
    logger = getattr(self, 'logger', None)
    lowered = token.lower()
    if self.city is None and self.state is not None and self.street_suffix is None:
        if lowered in self.parser.cities:
            self.city = self._clean(token.capitalize())
            return True
        return False
    # Check that we're in the correct location, and that we have at least one comma in the address
    if self.city is None and self.apartment is None and self.street_suffix is None and len(
            self.comma_separated_address) > 1:
        if lowered in self.parser.cities:
            self.city = self._clean(token.capitalize())
            return True
        return False
    # Multi word cities: tokens arrive back to front, so the new token goes in front.
    if self.city is not None and self.street_suffix is None and self.street is None:
        # The city list is lower case while self.city is stored capitalized, so compare lowered.
        known_city = self.city.lower()
        if logger:
            logger.debug("Checking for multi part city: {0}".format(lowered + ' ' + known_city))
        if lowered + ' ' + known_city in self.parser.cities:
            self.city = self._clean(token.capitalize() + ' ' + self.city)
            return True
        if lowered in shortened_cities:
            shortened = shortened_cities[lowered]
            if logger:
                logger.debug("Checking for shortened multi part city: {0}".format(shortened + ' ' + known_city))
            if shortened + ' ' + known_city in self.parser.cities:
                self.city = self._clean(shortened.capitalize() + ' ' + self.city)
                return True
    return False
347 |
def check_apartment_number(self, token):
    """
    Finds apartment, unit, #, etc, regardless of spot in string. This needs to come after everything else has been
    ruled out, because it has a lot of false positives.

    Stores the cleaned token on self.apartment and returns True on a match, False otherwise.

    NOTE(review): the last pattern, \\w{1,2}, accepts any token that starts with a word character,
    so the two fallbacks below are only reached for tokens starting with punctuation. Kept as-is
    to preserve behaviour.
    """
    # Raw strings throughout so that \w and \d reach the regex engine untouched.
    apartment_regexes = [r'#\w+ & \w+', r'#\w+ rm \w+', r"#\w+-\w", r'apt #{0,1}\w+', r'apartment #{0,1}\w+', r'#\w+',
                         r'# \w+', r'rm \w+', r'unit #?\w+', r'units #?\w+', r'- #{0,1}\w+', r'no\s?\d+\w*',
                         r'style\s\w{1,2}', r'\d{1,4}/\d{1,4}', r'\d{1,4}', r'\w{1,2}']
    lowered = token.lower()
    if any(re.match(regex, lowered) for regex in apartment_regexes):
        self.apartment = self._clean(token)
        return True
    # If we come on apt or apartment and already have an apartment number, add apt or apartment to the front
    if self.apartment and lowered in ['apt', 'apartment']:
        self.apartment = self._clean(token + ' ' + self.apartment)
        return True

    if not self.street_suffix and not self.street and not self.apartment:
        # Every part of this pattern is optional, so it matches any token.
        if re.match(r'\d?\w?', lowered):
            self.apartment = self._clean(token)
            return True
    return False
376 |
def check_street_suffix(self, token):
    """
    Try to recognize token as a street suffix. On success the abbreviation is stored on
    self.street_suffix with only its first letter capitalized and a trailing period
    (e.g. "St." or "Ave.") and True is returned.

    The suffix must be found before the street, so nothing is attempted once either is set.
    """
    if self.street_suffix is not None or self.street is not None:
        return False
    upper_token = token.upper()
    known = self.parser.suffixes
    if upper_token in known.keys():
        # Long spelling: look up its abbreviation.
        abbreviation = known[upper_token].capitalize()
    elif upper_token in known.values():
        # Already an accepted abbreviation.
        abbreviation = token.capitalize()
    else:
        return False
    self.street_suffix = self._clean(abbreviation + '.')
    return True
394 |
def check_street(self, token):
    """
    Take token as (part of) the street name.

    While a suffix has been found but neither prefix nor house number, every token is treated as
    street name: the first one starts the name, later ones are put in front of it (tokens arrive
    back to front), which handles multi word streets. For that reason this check must run after
    the house number and street prefix checks.

    Without a suffix and without a street, the token is accepted only if it appears in the
    parser's street list; it is then stored without changing its case.
    """
    between_suffix_and_prefix = (self.street_suffix is not None
                                 and self.street_prefix is None
                                 and self.house_number is None)
    if between_suffix_and_prefix:
        name = token.capitalize()
        if self.street is not None:
            name = name + ' ' + self.street
        self.street = self._clean(name)
        return True
    listed = not self.street_suffix and not self.street and token.lower() in self.parser.streets
    if listed:
        self.street = self._clean(token)
        return True
    return False
414 |
def check_street_prefix(self, token):
    """
    Recognize a direction such as N. or Northwest in front of the street name and store its
    standardized form (one or two letters plus a period) on self.street_prefix.

    Only attempted once a street has been found and no prefix is stored yet.
    """
    if not self.street or self.street_prefix:
        return False
    key = token.lower().replace('.', '')
    if key not in self.parser.prefixes.keys():
        return False
    self.street_prefix = self._clean(self.parser.prefixes[key])
    return True
424 |
def check_house_number(self, token):
    """
    Try to record ``token`` as the house number, generally the first thing in an address.

    Only attempted once a street is known and no house number has been stored. Anything that
    later shows up in front of the house number is assumed to be a building name.

    Returns True when the token was consumed as the house number, False otherwise.
    """
    looks_like_number = (self.street and self.house_number is None
                         and re.match(street_num_regex, token.lower()))
    if not looks_like_number:
        return False
    # For fractions ("12/3") and ranges ("12-14") keep only the leading part.
    for separator in ('/', '-'):
        if separator in token:
            token = token.split(separator)[0]
    self.house_number = self._clean(str(token))
    return True
438 |
def check_building(self, token):
    """
    Treat a leftover token as (part of) a building name.

    Only applies once both the street and the house number are set; what remains at that
    point is probably a building name. Each new token is placed in front of the words
    collected so far, which allows multi word building names.

    Returns True when the token was consumed, False otherwise.
    """
    if not (self.street and self.house_number):
        return False
    name = token if not self.building else token + ' ' + self.building
    self.building = self._clean(name)
    return True
451 |
def guess_unmatched(self, token):
    """
    Make an educated guess about a token that no other check recognised.

    Returns True when the token was handled (either discarded as noise or stored as a
    suffix-less street), False when it should stay unmatched.
    """
    # Probably an apartment marker: leave it for the apartment handling.
    if token.lower() in ['apt', 'apartment']:
        return False
    # Stray dashes are likely useless, so swallow them.
    if token.strip() == '-':
        return True
    # Almost definitely not a street if it is one or two characters long.
    if len(token) <= 2:
        return False
    # Let's check for a suffix-less street: nothing street related has been parsed yet.
    nothing_parsed_yet = (self.street_suffix is None and self.street is None
                          and self.street_prefix is None and self.house_number is None)
    # NOTE: re.match without an end anchor only requires the token to START with a letter.
    if nothing_parsed_yet and re.match(r"[A-Za-z]", token):
        self.street = self._clean(token.capitalize())
        return True
    return False
478 |
def full_address(self):
    """
    Return the address in a human readable format.

    The street line is "house_number prefix street suffix apartment" joined by single
    spaces, followed by ", city", ", state" and " zip". Components that are unset are left
    out together with their separator, so a partial address never starts with a stray space
    or comma.
    """
    street_line = (self.house_number, self.street_prefix, self.street,
                   self.street_suffix, self.apartment)
    addr = " ".join(part for part in street_line if part)
    # The building name is deliberately not part of the printed address.
    for separator, value in ((", ", self.city), (", ", self.state), (" ", self.zip)):
        if value:
            # Only emit the separator when there is something in front of it.
            addr = addr + separator + value if addr else value
    return addr
503 |
def _clean(self, item):
    """Return ``item`` encoded as UTF-8 (unencodable characters replaced); None stays None."""
    return None if item is None else item.encode("utf-8", "replace")
509 |
def __repr__(self):
    """
    Return the textual form built by ``__unicode__``.

    Python 2 requires ``__repr__`` to return a byte string; a unicode return value is
    encoded implicitly as ASCII and raises UnicodeEncodeError for non-ASCII addresses, so
    the text is encoded as UTF-8 explicitly there.
    """
    text = self.__unicode__()
    # ``str is bytes`` is only true on Python 2, where the encoding step is needed.
    return text.encode("utf-8") if str is bytes else text
512 |
def __str__(self):
    """
    Return the textual form built by ``__unicode__``.

    Python 2 requires ``__str__`` to return a byte string; a unicode return value is
    encoded implicitly as ASCII and raises UnicodeEncodeError for non-ASCII addresses, so
    the text is encoded as UTF-8 explicitly there.
    """
    text = self.__unicode__()
    # ``str is bytes`` is only true on Python 2, where the encoding step is needed.
    return text.encode("utf-8") if str is bytes else text
515 |
def __unicode__(self):
    """
    Return a one-line unicode summary of the parsed address components.

    Byte-string components are decoded from UTF-8 before formatting: formatting raw
    non-ASCII bytes into a unicode template would otherwise trigger an implicit ASCII
    decode and fail. The building name is intentionally not included.
    """
    def as_text(value):
        # Decode byte strings; None and unicode values pass through unchanged.
        if isinstance(value, bytes):
            return value.decode("utf-8", "replace")
        return value

    address_dict = {
        "house_number": as_text(self.house_number),
        "street_prefix": as_text(self.street_prefix),
        "street": as_text(self.street),
        "street_suffix": as_text(self.street_suffix),
        "apartment": as_text(self.apartment),
        "city": as_text(self.city),
        "state": as_text(self.state),
        "zip": as_text(self.zip)
    }
    return u"Address - House number: {house_number} Prefix: {street_prefix} Street: {street} Suffix: {street_suffix}" \
           u" Apartment: {apartment} City,State,Zip: {city}, {state} {zip}".format(**address_dict)
531 |
def dstk_parse(self, address, parser, pre_parsed_address=None):
    """
    Given an address string, use DSTK to parse the address and then coerce it to a normal Address object.

    address: the original address string.
    parser: object providing ``dstk``, ``required_confidence``, ``suffixes`` and ``prefixes``.
    pre_parsed_address: the value part of a single DSTK return (used when many addresses were
        looked up in one request). If it is None, the address is looked up via DSTK here.

    Sets confidence, house_number, city, state, lat, lng, street_suffix, street_prefix and
    street on this object.

    Raises InvalidAddressException when the DSTK answer is missing, incomplete or inconsistent
    with the original string, and DSTKConfidenceTooLowException when the reported confidence
    is below ``parser.required_confidence``.
    """
    if pre_parsed_address:
        dstk_address = pre_parsed_address
    else:
        if self.logger: self.logger.debug("Asking DSTK for address parse {0}".format(address.encode("ascii", "ignore")))
        dstk_address = parser.dstk.street2coordinates(address)
        # street2coordinates answers with {queried address: info or None}; unwrap that
        # mapping so the checks below see the per-address info like the pre-parsed case.
        if isinstance(dstk_address, dict) and 'confidence' not in dstk_address:
            if address in dstk_address:
                dstk_address = dstk_address[address]
            elif len(dstk_address) == 1:
                dstk_address = list(dstk_address.values())[0]
    if dstk_address is None:
        raise InvalidAddressException("DSTK could not parse address: {0}".format(self.original))
    if 'confidence' not in dstk_address:
        raise InvalidAddressException("Could not deal with DSTK return: {0}".format(dstk_address))
    street_address = dstk_address.get('street_address')
    # A missing street_address is treated like an empty one instead of raising KeyError.
    if not street_address:
        raise InvalidAddressException("Empty street address in DSTK return: {0}".format(dstk_address))
    if dstk_address.get('street_number') is None or dstk_address.get('street_name') is None:
        raise InvalidAddressException("House number or street name was Non in DSTK return: {0}".format(dstk_address))
    if dstk_address['confidence'] < parser.required_confidence:
        raise DSTKConfidenceTooLowException("Required confidence: {0}. Got confidence: {1}. Address: {2}. Return: {3}.".format(parser.required_confidence, dstk_address['confidence'], address.encode("ascii", "ignore"), dstk_address))
    self.confidence = dstk_address['confidence']
    # Called for its debug logging of the differing tokens; the counts are not used.
    self._get_dstk_intersections(address, street_address)
    if self.logger: self.logger.debug("Confidence: {0}.".format(dstk_address['confidence']))
    if self.logger: self.logger.debug("Address: {0}.".format(address))
    if self.logger: self.logger.debug("Return: {0}.".format(dstk_address))

    addr = dstk_address
    if addr["street_number"] not in address:
        raise InvalidAddressException("DSTK returned a house number not in the original address: {0}".format(addr))
    self.house_number = addr["street_number"]

    if "locality" in addr:
        self.city = addr["locality"]
        # DSTK shouldn't be returning unknown cities
        if addr["locality"] not in address:
            raise InvalidAddressException("DSTK returned a city not in the address. City: {0}, Address: {1}.".format(self.city, address))
    if "region" in addr:
        self.state = addr["region"]
    if "latitude" in addr:
        self.lat = addr["latitude"]
    if "longitude" in addr:
        self.lng = addr["longitude"]

    # Now that we have an address, try to parse out street suffix, prefix, and street
    if self.apartment:
        street_addr = street_address.replace(self.apartment, '')
    else:
        street_addr = street_address

    # We should be left with only prefix, street, suffix. Go for suffix first.
    split_addr = street_addr.split()
    if len(split_addr) == 0:
        if self.logger: self.logger.debug("Could not split street_address: {0}".format(addr))
        raise InvalidAddressException("Could not split street_address: {0}".format(addr))
    # Get rid of house_number
    if split_addr[0] == self.house_number:
        split_addr = split_addr[1:]
    # Nothing but a house number: report it instead of failing with IndexError below.
    if not split_addr:
        raise InvalidAddressException("Could not split street_address: {0}".format(addr))
    if self.logger: self.logger.debug("Checking {0} for suffixes".format(split_addr[-1].upper()))
    if split_addr[-1].upper() in parser.suffixes or split_addr[-1].upper() in parser.suffixes.values():
        self.street_suffix = split_addr[-1]
        split_addr = split_addr[:-1]
    # Nothing but a suffix: same reasoning as above.
    if not split_addr:
        raise InvalidAddressException("Could not split street_address: {0}".format(addr))
    first_word = split_addr[0]
    if self.logger: self.logger.debug("Checking {0} for prefixes".format(first_word.lower()))
    prefix_values = parser.prefixes.values()
    if first_word.lower() in parser.prefixes or first_word.upper() in prefix_values or \
            first_word.upper() + '.' in prefix_values:
        # Standardize to upper case with exactly one trailing period.
        if first_word[-1] == '.':
            self.street_prefix = first_word.upper()
        else:
            self.street_prefix = first_word.upper() + '.'
        if self.logger: self.logger.debug("Saving prefix: {0}".format(self.street_prefix))
        split_addr = split_addr[1:]
    if self.logger: self.logger.debug("Saving street: {0}".format(split_addr))
    self.street = " ".join(split_addr)
    # DSTK shouldn't be guessing cities that come before streets.
    # re.escape: street and city are literal text, not patterns ("A+B", "St. (Old)" ...).
    street_position = re.search(re.escape(self.street), address)
    if street_position is None:
        raise InvalidAddressException("DSTK picked a street not in the original address. Street: {0}. Address: {1}.".format(self.street, address))
    # Without a city there is nothing to compare the street position against.
    if self.city is not None:
        city_position = re.search(re.escape(self.city), address)
        if city_position is None:
            raise InvalidAddressException("DSTK picked a city not in the original address. City: {0}. Address: {1}.".format(self.city, address))
        if city_position.start(0) < street_position.end(0):
            raise InvalidAddressException("DSTK picked a street that comes after the city. Street: {0}. City: {1}. Address: {2}.".format(self.street, self.city, address))
    if self.logger: self.logger.debug("Successful DSTK address: {0}, house: {1}, street: {2}\n".format(self.original, self.house_number, self.street))
636 |
def _get_dstk_intersections(self, address, dstk_address):
    """
    Count the tokens that appear in only one of the two addresses.

    Both strings are normalized first. Returns a tuple
    (tokens only in the original address, tokens only in the DSTK address) and logs both
    sets at debug level when a logger is configured.
    """
    original_tokens = set(self._normalize(address))
    returned_tokens = set(self._normalize(dstk_address))
    address_uniques = original_tokens - returned_tokens
    dstk_address_uniques = returned_tokens - original_tokens
    if self.logger:
        self.logger.debug("Address Uniques {0}".format(address_uniques))
        self.logger.debug("DSTK Address Uniques {0}".format(dstk_address_uniques))
    return (len(address_uniques), len(dstk_address_uniques))
649 |
def _normalize(self, address):
    """
    Normalize prefixes, suffixes and other tokens to make matching original to returned easier.

    Returns the whitespace separated tokens of ``address`` as a list of lower-case strings in
    which every spelling of a suffix or prefix is reduced to its abbreviation without a
    period (e.g. "North", "N" and "N." all become "n"), so equivalent tokens compare equal.
    """
    suffixes = self.parser.suffixes
    prefixes = self.parser.prefixes
    # Looked up once instead of rebuilding the value lists for every token.
    suffix_abbreviations = suffixes.values()
    prefix_abbreviations = prefixes.values()
    normalized_address = []
    if self.logger: self.logger.debug("Normalizing Address: {0}".format(address))
    for token in address.split():
        upper, lower = token.upper(), token.lower()
        if upper in suffixes:
            normalized = suffixes[upper].lower()
        elif upper in suffix_abbreviations:
            normalized = lower
        elif upper.replace('.', '') in suffix_abbreviations:
            normalized = lower.replace('.', '')
        elif lower in prefixes:
            # Drop the trailing period so the result agrees with the two branches below.
            normalized = prefixes[lower].lower().rstrip('.')
        elif upper in prefix_abbreviations:
            normalized = lower.rstrip('.')
        else:
            # Covers both a prefix abbreviation written without its period and plain words.
            normalized = lower
        normalized_address.append(normalized)
    return normalized_address
672 |
673 |
def create_cities_csv(filename="places2k.txt", output="cities.csv"):
    """
    Build a simple one-city-per-line file from a places2k.txt style listing.

    filename: path of the fixed-width input file.
    output: path of the file to write; it is overwritten.
    """
    with open(filename, 'r') as city_file, open(output, 'w') as out:
        for record in city_file:
            # Drop Puerto Rico (just looking for the 50 states)
            if record.startswith("PR"):
                continue
            # Per census.gov, characters 9-72 are the name of the city or place. The last
            # word of that field is the kind of place (city, town, etc.) and is cut off.
            name_words = record[9:72].split()
            out.write(" ".join(name_words[:-1]) + '\n')
687 |
688 |
class InvalidAddressException(Exception):
    """Raised when an address (or the DSTK answer for it) cannot be turned into a valid Address."""
691 |
class DSTKConfidenceTooLowException(Exception):
    """Raised when DSTK reports a confidence below the parser's required confidence."""
694 |
# Command line use: parse the address given as arguments and print the resulting Address.
if __name__ == "__main__":
    ap = AddressParser()
    # Re-join the argument words so an unquoted address is parsed as one string.
    print ap.parse_address(" ".join(sys.argv[1:]))
698 |
--------------------------------------------------------------------------------
/address/address_list.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | import sys
3 | import os
4 | from address import Address, AddressParser
5 |
6 |
7 | if __name__ == '__main__':
8 | # The mini test program takes a list of addresses, creates Address objects, and prints errors for each one
9 | # with unmatched terms. Takes a filename as the first and only argument. The file should be one address per line.
10 | if len(sys.argv) != 2:
11 | print "Usage: test_list.py filename"
12 | sys.exit(1)
13 | if not os.path.exists(sys.argv[1]):
14 | print "File {0} does not exist".format(sys.argv[1])
15 | sys.exit(2)
16 | unmatched_count = 0
17 | line_count = 0
18 | ap = AddressParser()
19 | with open(sys.argv[1]) as input:
20 | for line in input:
21 | addr = ap.parse_address(line.strip(), line_number=line_count)
22 |
23 | if addr.unmatched:
24 | print "Unmatched", addr, addr.line_number
25 | print ""
26 | unmatched_count = unmatched_count + 1
27 | # All addresses have a house number and a street.
28 | if addr.house_number is None:
29 | print "House number cannot be None: ", addr, addr.line_number
30 | if addr.street is None:
31 | print "Street cannot be None: ", addr, addr.line_number
32 | line_count = line_count + 1
33 | print addr.full_address()
34 | print addr.original
35 | print ""
36 | if unmatched_count == 0:
37 | print "All {0} address matched! Huzzah!".format(line_count)
38 | else:
39 | print "{0} addresses of {1} ({2:.2%}) with unmatched terms. :(".format(unmatched_count, line_count, unmatched_count / line_count)
40 |
--------------------------------------------------------------------------------
/address/dstk.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Python interface to the Data Science Toolkit Plugin
3 | # version: 1.30 (2011-03-16)
4 | #
5 | # See http://www.datasciencetoolkit.org/developerdocs#python for full details
6 | #
7 | # All code (C) Pete Warden, 2011
8 | #
9 | # This program is free software: you can redistribute it and/or modify
10 | # it under the terms of the GNU General Public License as published by
11 | # the Free Software Foundation, either version 3 of the License, or
12 | # (at your option) any later version.
13 | #
14 | # This program is distributed in the hope that it will be useful,
15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 | # GNU General Public License for more details.
18 | #
19 | # You should have received a copy of the GNU General Public License
20 | # along with this program. If not, see <http://www.gnu.org/licenses/>.
21 |
22 | import urllib
23 | try:
24 | import simplejson as json
25 | except ImportError:
26 | import json
27 | import os
28 | import httplib
29 | import mimetypes
30 | import re
31 | import csv
32 |
33 |
34 | # This is the main interface class. You can see an example of it in use
35 | # below, implementing a command-line tool, but you basically just instantiate
36 | # dstk = DSTK()
37 | # and then call the method you want
38 | # coordinates = dstk.ip2coordinates('12.34.56.78')
39 | # The full documentation is at http://www.datasciencetoolkit.org/developerdocs
class DSTK:
    """
    Client for the Data Science Toolkit HTTP API.

    Every public conversion method sends its input to the endpoint of the same name below
    ``api_base`` and returns the decoded JSON response; a response containing an 'error'
    entry is turned into an Exception carrying that error.
    """

    api_base = None

    def __init__(self, options=None):
        """
        options: optional dict with 'apiBase' (server URL) and 'checkVersion' (bool).
        Missing entries fall back to the defaults; the DSTK_API_BASE environment variable
        overrides the default server. The dict passed in is not modified.
        """
        settings = {
            'apiBase': 'http://www.datasciencetoolkit.org',
            'checkVersion': True
        }

        if 'DSTK_API_BASE' in os.environ:
            settings['apiBase'] = os.environ['DSTK_API_BASE']

        # Explicit options win over defaults; merging into our own dict leaves the
        # caller's dict untouched.
        if options:
            settings.update(options)

        self.api_base = settings['apiBase']

        if settings['checkVersion']:
            self.check_version()

    def check_version(self):
        """Raise an Exception unless the server reports at least the required API version."""
        required_version = 40

        api_url = self.api_base+'/info'

        try:
            response_string = urllib.urlopen(api_url).read()
            response = json.loads(response_string)
            # Kept inside the try: a reply without 'version' is "no version information" too.
            actual_version = response['version']
        except Exception:
            raise Exception('The server at "'+self.api_base+'" doesn\'t seem to be running DSTK, no version information found.')

        if actual_version < required_version:
            raise Exception('DSTK: Version '+str(actual_version)+' found at "'+api_url+'" but '+str(required_version)+' is required')

    def _post(self, endpoint, api_body):
        """POST ``api_body`` to ``endpoint`` and return the decoded JSON, raising on 'error'."""
        api_url = self.api_base+endpoint
        response_string = urllib.urlopen(api_url, api_body).read()
        response = json.loads(response_string)

        if 'error' in response:
            raise Exception(response['error'])

        return response

    def ip2coordinates(self, ips):
        """Look up one IP address or a list/tuple of them."""
        if not isinstance(ips, (list, tuple)):
            ips = [ips]
        return self._post('/ip2coordinates', json.dumps(ips))

    def street2coordinates(self, addresses):
        """Look up one postal address or a list/tuple of them."""
        if not isinstance(addresses, (list, tuple)):
            addresses = [addresses]
        return self._post('/street2coordinates', json.dumps(addresses))

    def coordinates2politics(self, coordinates):
        """Send ``coordinates`` JSON encoded to the coordinates2politics endpoint."""
        return self._post('/coordinates2politics', json.dumps(coordinates))

    def text2places(self, text):
        """Send ``text`` as-is to the text2places endpoint."""
        return self._post('/text2places', text)

    def file2text(self, file_name, file_data):
        """Upload a file as multipart form data to /file2text and return the raw response."""
        host = self.api_base.replace('http://', '')

        response = post_multipart(host,
            '/file2text',[],[('inputfile', file_name, file_data)])

        return response

    def text2sentences(self, text):
        """Send ``text`` as-is to the text2sentences endpoint."""
        return self._post('/text2sentences', text)

    def html2text(self, html):
        """Send ``html`` as-is to the html2text endpoint."""
        return self._post('/html2text', html)

    def html2story(self, html):
        """Send ``html`` as-is to the html2story endpoint."""
        return self._post('/html2story', html)

    def text2people(self, text):
        """Send ``text`` as-is to the text2people endpoint."""
        return self._post('/text2people', text)

    def text2times(self, text):
        """Send ``text`` as-is to the text2times endpoint."""
        return self._post('/text2times', text)
204 |
205 | # We need to post files as multipart form data, and Python has no native function for
206 | # that, so these utility functions implement what we need.
207 | # See http://code.activestate.com/recipes/146306/
def post_multipart(host, selector, fields, files):
    """
    Send fields and files to an http host in a multipart/form-data POST.

    fields: sequence of (name, value) pairs for regular form fields.
    files: sequence of (name, filename, value) triples for data uploaded as files.
    Returns the body of the server's response.
    """
    content_type, body = encode_multipart_formdata(fields, files)
    connection = httplib.HTTP(host)
    connection.putrequest('POST', selector)
    connection.putheader('content-type', content_type)
    connection.putheader('content-length', str(len(body)))
    connection.endheaders()
    connection.send(body)
    # The reply triple must be fetched before the body can be read; its values are not used.
    _status, _reason, _headers = connection.getreply()
    return connection.file.read()
224 |
def encode_multipart_formdata(fields, files):
    """
    Build a multipart/form-data request body.

    fields: sequence of (name, value) pairs for regular form fields.
    files: sequence of (name, filename, value) triples for data uploaded as files.
    Returns (content_type, body) ready for an httplib.HTTP instance.
    """
    boundary = '----------ThIs_Is_tHe_bouNdaRY_$'
    part_start = '--' + boundary
    lines = []
    for (key, value) in fields:
        lines.extend([
            part_start,
            'Content-Disposition: form-data; name="%s"' % key,
            '',
            value,
        ])
    for (key, filename, value) in files:
        lines.extend([
            part_start,
            'Content-Disposition: form-data; name="%s"; filename="%s"' % (key, filename),
            'Content-Type: %s' % guess_content_type(filename),
            '',
            value,
        ])
    # Closing delimiter, then an empty entry so the body ends with a line break.
    lines.extend([part_start + '--', ''])
    return 'multipart/form-data; boundary=%s' % boundary, '\r\n'.join(lines)
250 |
def guess_content_type(filename):
    """Return the MIME type guessed from ``filename``, or 'application/octet-stream' if unknown."""
    guessed_type, _encoding = mimetypes.guess_type(filename)
    return guessed_type or 'application/octet-stream'
253 |
254 | # End of the interface. The rest of this file is an example implementation of a
255 | # command line client.
256 |
def ip2coordinates_cli(dstk, options, inputs, output):
    """
    Command line handler: look up the IP addresses in ``inputs`` and write CSV rows.

    dstk: client offering ``ip2coordinates``.
    options: dict; a header row is written when options['showHeaders'] is true.
    inputs: iterable of strings; each must start with a dotted IP address, otherwise
        'No match' is printed and the string is skipped.
    output: file-like object that receives the CSV, as in the other handlers.
    """
    # Write to the stream we were handed; ``sys`` is only imported when this file is run
    # as a script, so relying on sys.stdout broke every other caller.
    writer = csv.writer(output)
    ip_pattern = re.compile(r'[12]?\d?\d\.[12]?\d?\d\.[12]?\d?\d\.[12]?\d?\d')

    input_ips = []
    for input_line in inputs:
        ip_match = ip_pattern.match(input_line)
        if ip_match is not None:
            input_ips.append(ip_match.group(0))
        else:
            print('No match')

    result = dstk.ip2coordinates(input_ips)

    if options['showHeaders']:
        # The column names come from the first address that produced any information.
        for info in result.values():
            if info is None:
                continue
            writer.writerow(['ip_address'] + [str(key) for key in info])
            break

    for ip, info in result.items():
        # Addresses without information still get a row holding just the address.
        values = info.values() if info is not None else []
        writer.writerow([ip] + [str(value) for value in values])

    return
293 |
def street2coordinates_cli(dstk, options, inputs, output):
    """
    Command line handler: look up the postal addresses in ``inputs`` and write CSV rows.

    dstk: client offering ``street2coordinates``.
    options: dict; a header row is written when options['showHeaders'] is true.
    inputs: the addresses to look up.
    output: file-like object that receives the CSV, as in the other handlers.
    """
    # Write to the stream we were handed; ``sys`` is only imported when this file is run
    # as a script, so relying on sys.stdout broke every other caller.
    writer = csv.writer(output)

    result = dstk.street2coordinates(inputs)

    if options['showHeaders']:
        # The column names come from the first address that produced any information.
        for info in result.values():
            if info is None:
                continue
            writer.writerow(['address'] + [str(key) for key in info])
            break

    for address, info in result.items():
        # Addresses without information still get a row holding just the address.
        values = info.values() if info is not None else []
        writer.writerow([address] + [str(value) for value in values])

    return
322 |
def coordinates2politics_cli(dstk, options, inputs, output):
    """
    Command line handler: look up "lat,lon" pairs and write one CSV row per political area.

    Each input must split into exactly two comma separated parts; otherwise a message is
    written to ``output`` and the program exits with -1. A header row is written when
    options['showHeaders'] is true.
    """
    writer = csv.writer(output)

    coordinates_list = []
    for pair_text in inputs:
        parts = pair_text.split(',')
        if len(parts)!=2:
            output.write('You must enter coordinates as a series of comma-separated pairs, eg 37.76,-122.42')
            exit(-1)
        coordinates_list.append([parts[0], parts[1]])

    result = dstk.coordinates2politics(coordinates_list)

    if options['showHeaders']:
        writer.writerow(['latitude', 'longitude', 'name', 'code', 'type', 'friendly_type'])

    politic_columns = ('name', 'code', 'type', 'friendly_type')
    for info in result:
        location = info['location']
        politics = info['politics']
        for politic in politics:
            position = [location['latitude'], location['longitude']]
            writer.writerow(position + [politic[column] for column in politic_columns])

    return
357 |
def file2text_cli(dstk, options, inputs, output):
    """
    Command line handler: convert each input file to text and print the result.

    Directories are processed recursively. When options['showHeaders'] is true a
    '--File--: name' line is written to ``output`` before each file's text.
    """
    for file_name in inputs:
        if os.path.isdir(file_name):
            full_children = [os.path.join(file_name, child) for child in os.listdir(file_name)]
            # ``output`` has to be passed along; leaving it out made every directory
            # input fail with a TypeError.
            file2text_cli(dstk, options, full_children, output)
        else:
            file_data = get_file_or_url_contents(file_name)
            if options['showHeaders']:
                output.write('--File--: '+file_name+"\n")
            result = dstk.file2text(file_name, file_data)

            print(result)
    return
375 |
def text2places_cli(dstk, options, inputs, output):
    """
    Command line handler: find places mentioned in text and write them as CSV to ``output``.

    With options['from_stdin'] the inputs are joined with newlines and analysed as one text;
    otherwise every input is a file (or URL) name, and directories are walked recursively.
    The header row is written at most once: options['showHeaders'] is switched off after it.
    """
    writer = csv.writer(output)

    if options['showHeaders']:
        writer.writerow(['latitude', 'longitude', 'name', 'type', 'start_index', 'end_index', 'matched_string', 'file_name'])
        # Recursive calls share ``options``, so this keeps them from repeating the header.
        options['showHeaders'] = False

    if options['from_stdin']:
        text2places_format(dstk.text2places("\n".join(inputs)), 'stdin', writer)
        return

    for file_name in inputs:
        if not os.path.isdir(file_name):
            file_data = get_file_or_url_contents(file_name)
            text2places_format(dstk.text2places(file_data), file_name, writer)
            continue
        full_children = [os.path.join(file_name, child) for child in os.listdir(file_name)]
        text2places_cli(dstk, options, full_children, output)

    return
403 |
def text2places_format(result, file_name, writer):
    """Write one CSV row per place in ``result``, with ``file_name`` as the last column."""
    columns = ('latitude', 'longitude', 'name', 'type',
               'start_index', 'end_index', 'matched_string')
    for info in result:
        writer.writerow([info[column] for column in columns] + [file_name])
    return
418 |
def html2text_cli(dstk, options, inputs, output):
    """
    Command line handler: print the text version of HTML documents.

    With options['from_stdin'] the inputs are joined with newlines and converted as one
    document; otherwise every input is a file (or URL) name, and directories are walked
    recursively. When options['showHeaders'] is true a '--File--: name' line is written to
    ``output`` before each file's text.
    """
    if options['from_stdin']:
        print(dstk.html2text("\n".join(inputs))['text'])
        return

    for file_name in inputs:
        if os.path.isdir(file_name):
            full_children = [os.path.join(file_name, child) for child in os.listdir(file_name)]
            html2text_cli(dstk, options, full_children, output)
            continue
        file_data = get_file_or_url_contents(file_name)
        if options['showHeaders']:
            output.write('--File--: '+file_name+"\n")
        print(dstk.html2text(file_data)['text'])
    return
440 |
def text2sentences_cli(dstk, options, inputs, output):
    """
    Command line handler: print the parts of a text that look like proper sentences.

    With options['from_stdin'] the inputs are joined with newlines and analysed as one text;
    otherwise every input is a file (or URL) name, and directories are walked recursively.
    When options['showHeaders'] is true a '--File--: name' line is written to ``output``
    before each file's sentences.
    """
    if options['from_stdin']:
        print(dstk.text2sentences("\n".join(inputs))['sentences'])
        return

    for file_name in inputs:
        if os.path.isdir(file_name):
            full_children = [os.path.join(file_name, child) for child in os.listdir(file_name)]
            text2sentences_cli(dstk, options, full_children, output)
            continue
        file_data = get_file_or_url_contents(file_name)
        if options['showHeaders']:
            output.write('--File--: '+file_name+"\n")
        print(dstk.text2sentences(file_data)['sentences'])

    return
463 |
def html2story_cli(dstk, options, inputs, output):
    """
    Command line handler: print the story text of HTML documents (HTML without boilerplate).

    With options['from_stdin'] the inputs are joined with newlines and converted as one
    document; otherwise every input is a file (or URL) name, and directories are walked
    recursively. When options['showHeaders'] is true a '--File--: name' line is written to
    ``output`` before each file's story.
    """
    if options['from_stdin']:
        print(dstk.html2story("\n".join(inputs))['story'])
        return

    for file_name in inputs:
        if os.path.isdir(file_name):
            full_children = [os.path.join(file_name, child) for child in os.listdir(file_name)]
            html2story_cli(dstk, options, full_children, output)
            continue
        file_data = get_file_or_url_contents(file_name)
        if options['showHeaders']:
            output.write('--File--: '+file_name+"\n")
        print(dstk.html2story(file_data)['story'])

    return
486 |
def text2people_cli(dstk, options, inputs, output):
    """
    Command line handler: find people mentioned in text and write them as CSV to ``output``.

    With options['from_stdin'] the inputs are joined with newlines and analysed as one text;
    otherwise every input is a file (or URL) name, and directories are walked recursively.
    The header row is written at most once: options['showHeaders'] is switched off after it.
    """
    # Write to the stream we were handed; ``sys`` is only imported when this file is run
    # as a script, so relying on sys.stdout broke every other caller.
    writer = csv.writer(output)

    if options['showHeaders']:
        row = ['matched_string', 'first_name', 'surnames', 'title', 'gender', 'start_index', 'end_index', 'file_name']
        writer.writerow(row)
        # Recursive calls share ``options``, so this keeps them from repeating the header.
        options['showHeaders'] = False

    if options['from_stdin']:
        result = dstk.text2people("\n".join(inputs))
        text2people_format(result, 'stdin', writer)
        return

    for file_name in inputs:
        if os.path.isdir(file_name):
            full_children = [os.path.join(file_name, child) for child in os.listdir(file_name)]
            # Recurse into this handler; the files in a directory used to be handed to
            # text2places_cli, which produced place rows instead of people rows.
            text2people_cli(dstk, options, full_children, output)
        else:
            file_data = get_file_or_url_contents(file_name)
            result = dstk.text2people(file_data)
            text2people_format(result, file_name, writer)

    return
514 |
def text2people_format(result, file_name, writer):
    """Write one CSV row per person in ``result``, with ``file_name`` as the last column."""
    for info in result:
        person = [info[column] for column in
                  ('matched_string', 'first_name', 'surnames', 'title', 'gender')]
        span = [str(info['start_index']), str(info['end_index'])]
        writer.writerow(person + span + [file_name])
    return
530 |
def text2times_cli(dstk, options, inputs, output):
    """
    Command line handler: find times and dates mentioned in text and write them as CSV to ``output``.

    With options['from_stdin'] the inputs are joined with newlines and analysed as one text;
    otherwise every input is a file (or URL) name, and directories are walked recursively.
    The header row is written at most once: options['showHeaders'] is switched off after it.
    """
    # Write to the stream we were handed; ``sys`` is only imported when this file is run
    # as a script, so relying on sys.stdout broke every other caller.
    writer = csv.writer(output)

    if options['showHeaders']:
        row = ['matched_string', 'time_string', 'time_seconds', 'is_relative', 'start_index', 'end_index', 'file_name']
        writer.writerow(row)
        # Recursive calls share ``options``, so this keeps them from repeating the header.
        options['showHeaders'] = False

    if options['from_stdin']:
        result = dstk.text2times("\n".join(inputs))
        text2times_format(result, 'stdin', writer)
        return

    for file_name in inputs:
        if os.path.isdir(file_name):
            full_children = [os.path.join(file_name, child) for child in os.listdir(file_name)]
            text2times_cli(dstk, options, full_children, output)
        else:
            file_data = get_file_or_url_contents(file_name)
            result = dstk.text2times(file_data)
            text2times_format(result, file_name, writer)

    return
558 |
def text2times_format(result, file_name, writer):
    """Write one CSV row per time expression in ``result``, with ``file_name`` as the last column."""
    for info in result:
        timing = [info[column] for column in
                  ('matched_string', 'time_string', 'time_seconds', 'is_relative')]
        span = [str(info['start_index']), str(info['end_index'])]
        writer.writerow(timing + span + [file_name])
    return
573 |
def get_file_or_url_contents(file_name):
    """
    Return the contents of ``file_name``.

    Names starting with http:// or https:// are fetched with urllib; everything else is
    read as a local file, which is closed again before returning.
    """
    if re.match(r'https?://', file_name):
        file_data = urllib.urlopen(file_name).read()
    else:
        # ``with`` closes the handle even when reading fails.
        with open(file_name) as local_file:
            file_data = local_file.read()
    return file_data
580 |
def print_usage(message=''):
    """
    Print ``message`` followed by the command line help, then exit with status -1.
    """
    usage_lines = [
        message,
        "Usage:",
        # The <command> and <inputs> placeholders were missing, which left the synopsis
        # and the "Where ... is one of" line without the thing they describe.
        "python dstk.py <command> [-a/--api_base 'http://yourhost.com'] [-h/--show_headers] <inputs>",
        "Where <command> is one of:",
        "  ip2coordinates       (lat/lons for IP addresses)",
        "  street2coordinates   (lat/lons for postal addresses)",
        "  coordinates2politics (country/state/county/constituency/etc for lat/lon)",
        "  text2places          (lat/lons for places mentioned in unstructured text)",
        "  file2text            (PDF/Excel/Word to text, and OCR on PNG/Jpeg/Tiff images)",
        "  text2sentences       (parts of the text that look like proper sentences)",
        "  html2text            (text version of the HTML document)",
        "  html2story           (text version of the HTML with no boilerplate)",
        "  text2people          (gender for people mentioned in unstructured text)",
        "  text2times           (times and dates mentioned in unstructured text)",
        "If no inputs are specified, then standard input will be read and used",
        "See http://www.datasciencetoolkit.org/developerdocs for more details",
        "Examples:",
        "python dstk.py ip2coordinates 67.169.73.113",
        "python dstk.py street2coordinates \"2543 Graystone Place, Simi Valley, CA 93065\"",
        "python dstk.py file2text scanned.jpg",
    ]
    for usage_line in usage_lines:
        print(usage_line)

    exit(-1)
605 |
if __name__ == '__main__':

    import sys

    # Command name -> CLI handler implementing it.
    commands = {
        'ip2coordinates': { 'handler': ip2coordinates_cli },
        'street2coordinates': { 'handler': street2coordinates_cli },
        'coordinates2politics': { 'handler': coordinates2politics_cli },
        'text2places': { 'handler': text2places_cli },
        'file2text': { 'handler': file2text_cli },
        'text2sentences': { 'handler': text2sentences_cli },
        'html2text': { 'handler': html2text_cli },
        'html2story': { 'handler': html2story_cli },
        'text2people': { 'handler': text2people_cli },
        'text2times': { 'handler': text2times_cli },
    }
    # Long option names accepted on the command line.
    switches = {
        'api_base': True,
        'show_headers': True
    }
    # Single-letter aliases for the long option names.
    short_switches = {
        'a': 'api_base',
        'h': 'show_headers',
    }

    command = None
    options = {'showHeaders': False}
    inputs = []

    # Set after an option that consumes the following argument as its value.
    ignore_next = False
    for index, arg in enumerate(sys.argv[1:]):
        if ignore_next:
            ignore_next = False
            continue

        # startswith() is safe for an empty-string argument, where arg[0]
        # would raise IndexError; a lone "-" is still treated as an input.
        if arg.startswith('-') and len(arg) > 1:
            if len(arg) == 2:
                # Unknown letters resolve to None so they are rejected below
                # instead of leaving `option` unbound or reusing the value
                # left over from a previous iteration.
                option = short_switches.get(arg[1])
            else:
                option = arg[2:]

            if option not in switches:
                print_usage('Unknown option "'+arg+'"')

            if option == 'api_base':
                # `index` counts from sys.argv[1], so the option's value
                # lives at sys.argv[index+2].
                if (index+2) >= len(sys.argv):
                    print_usage('Missing argument for option "'+arg+'"')
                options['apiBase'] = sys.argv[index+2]
                ignore_next = True
            elif option == 'show_headers':
                options['showHeaders'] = True

        else:
            # The first bare argument is the command; the rest are inputs.
            if command is None:
                command = arg
                if command not in commands:
                    print_usage('Unknown command "'+arg+'"')
            else:
                inputs.append(arg)

    if command is None:
        print_usage('No command specified')

    # With no explicit inputs, the lines of standard input become the inputs.
    if len(inputs)<1:
        options['from_stdin'] = True
        inputs = sys.stdin.readlines()
    else:
        options['from_stdin'] = False

    command_info = commands[command]

    dstk = DSTK(options)

    command_info['handler'](dstk, options, inputs, sys.stdout)
--------------------------------------------------------------------------------
/address/streets.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SwoopSearch/pyaddress/62ebb07a6840e710d256406a8ec1d06abec0e1c4/address/streets.csv
--------------------------------------------------------------------------------
/address/suffixes.csv:
--------------------------------------------------------------------------------
1 | ALLEE,ALY
2 | ALLEY,ALY
3 | ALLY,ALY
4 | ALY,ALY
5 | ANEX,ANX
6 | ANNEX,ANX
7 | ANNX,ANX
8 | ANX,ANX
9 | ARCADE,ARC
10 | ARC,ARC
11 | AV,AVE
12 | AVE,AVE
13 | AVEN,AVE
14 | AVENU,AVE
15 | AVENUE,AVE
16 | AVN,AVE
17 | AVNU,AVE
18 | AVNUE,AVE
19 | BAYOO,BYU
20 | BAYOU,BYU
21 | BCH,BCH
22 | BEACH,BCH
23 | BEND,BND
24 | BG,BG
25 | BL,BLVD
26 | BLF,BLF
27 | BLUF,BLF
28 | BLUFF,BLF
29 | BLUFFS,BLF
30 | BLV,BLVD
31 | BLVD,BLVD
32 | BND,BND
33 | BOT,BTM
34 | BOTTM,BTM
35 | BOTTOM,BTM
36 | BOUL,BLVD
37 | BOULEVARD,BLVD
38 | BOULV,BLVD
39 | BPS,BYP
40 | BPSS,BYP
41 | BRANCH,BR
42 | BR,BR
43 | BRDGE,BRG
44 | BRG,BRG
45 | BRIDGE,BRG
46 | BRK,BRK
47 | BRNCH,BR
48 | BROOK,BRK
49 | BROOKS,BRK
50 | BTM,BTM
51 | BULEVARD,BLVD
52 | BURG,BG
53 | BURGS,BG
54 | BV,BLVD
55 | BVD,BLVD
56 | BYPA,BYP
57 | BYPAS,BYP
58 | BY,PASS
59 | BYPASS,BYP
60 | BYP,BYP
61 | BYPS,BYP
62 | BYPSS,BYP
63 | BYU,BYU
64 | CAMP,CP
65 | CAN,CYN
66 | CANYN,CYN
67 | CANYON,CYN
68 | CAPE,CPE
69 | CAUSEWAY,CSWY
70 | CAUSWAY,CSWY
71 | CEN,CTR
72 | CENT,CTR
73 | CENTER,CTR
74 | CENTERS,CTR
75 | CENTR,CTR
76 | CENTRE,CTR
77 | CIRC,CIR
78 | CIR,CIR
79 | CIRCL,CIR
80 | CIRCLE,CIR
81 | CIRCLES,CIR
82 | CK,CRK
83 | CLB,CLB
84 | CLF,CLFS
85 | CLFS,CLFS
86 | CLIFF,CLFS
87 | CLIFFS,CLFS
88 | CLUB,CLB
89 | CMP,CP
90 | CNTER,CTR
91 | CNTR,CTR
92 | CNYN,CYN
93 | COR,COR
94 | CORNER,COR
95 | CORNERS,CORS
96 | CORS,CORS
97 | COURSE,CRSE
98 | COURT,CT
99 | COURTS,CTS
100 | COVE,CV
101 | COVES,CV
102 | CP,CP
103 | CPE,CPE
104 | CRCL,CIR
105 | CRCLE,CIR
106 | CR,CRK
107 | CRECENT,CRES
108 | CREEK,CRK
109 | CRESCENT,CRES
110 | CRES,CRES
111 | CRESENT,CRES
112 | CRK,CRK
113 | CROSSING,XING
114 | CRSCNT,CRES
115 | CRSE,CRSE
116 | CRSENT,CRES
117 | CRSNT,CRES
118 | CRSSING,XING
119 | CRSSNG,XING
120 | CRT,CT
121 | CSWY,CSWY
122 | CT,CT
123 | CTR,CTR
124 | CTS,CTS
125 | CV,CV
126 | CYN,CYN
127 | DALE,DL
128 | DAM,DM
129 | DIV,DV
130 | DIVIDE,DV
131 | DL,DL
132 | DM,DM
133 | DR,DR
134 | DRIV,DR
135 | DRIVE,DR
136 | DRIVES,DR
137 | DRV,DR
138 | DVD,DV
139 | DV,DV
140 | ESTATE,EST
141 | ESTATES,EST
142 | EST,EST
143 | ESTS,EST
144 | EXP,EXPY
145 | EXPRESS,EXPY
146 | EXPRESSWAY,EXPY
147 | EXPR,EXPY
148 | EXPW,EXPY
149 | EXPY,EXPY
150 | EXTENSION,EXT
151 | EXT,EXT
152 | EXTN,EXT
153 | EXTNSN,EXT
154 | EXTS,EXT
155 | FALL,FALL
156 | FALLS,FLS
157 | FERRY,FRY
158 | FIELD,FLD
159 | FIELDS,FLDS
160 | FLAT,FLT
161 | FLATS,FLT
162 | FLD,FLD
163 | FLDS,FLDS
164 | FLS,FLS
165 | FLT,FLT
166 | FLTS,FLT
167 | FORD,FRD
168 | FORDS,FRD
169 | FOREST,FRST
170 | FORESTS,FRST
171 | FORGE,FRG
172 | FORGES,FRG
173 | FORG,FRG
174 | FORK,FRK
175 | FORKS,FRKS
176 | FORT,FT
177 | FRD,FRD
178 | FREEWAY,FWY
179 | FREEWY,FWY
180 | FRG,FRG
181 | FRK,FRK
182 | FRKS,FRKS
183 | FRRY,FRY
184 | FRST,FRST
185 | FRT,FT
186 | FRWAY,FWY
187 | FRWY,FWY
188 | FRY,FRY
189 | FT,FT
190 | FWY,FWY
191 | GARDEN,GDNS
192 | GARDENS,GDNS
193 | GARDN,GDNS
194 | GATEWAY,GTWY
195 | GATEWY,GTWY
196 | GATWAY,GTWY
197 | GDN,GDNS
198 | GDNS,GDNS
199 | GLEN,GLN
200 | GLENS,GLN
201 | GLN,GLN
202 | GRDEN,GDNS
203 | GRDN,GDNS
204 | GRDNS,GDNS
205 | GREEN,GRN
206 | GREENS,GRN
207 | GRN,GRN
208 | GROVE,GRV
209 | GROVES,GRV
210 | GROV,GRV
211 | GRV,GRV
212 | GTWAY,GTWY
213 | GTWY,GTWY
214 | HARB,HBR
215 | HARBOR,HBR
216 | HARBORS,HBR
217 | HARBR,HBR
218 | HAVEN,HVN
219 | HAVN,HVN
220 | HBR,HBR
221 | HEIGHT,HTS
222 | HEIGHTS,HTS
223 | HGTS,HTS
224 | HIGHWAY,HWY
225 | HIGHWY,HWY
226 | HILL,HL
227 | HILLS,HLS
228 | HIWAY,HWY
229 | HIWY,HWY
230 | HL,HL
231 | HLLW,HOLW
232 | HLS,HLS
233 | HOLLOW,HOLW
234 | HOLLOWS,HOLW
235 | HOLW,HOLW
236 | HOLWS,HOLW
237 | HRBOR,HBR
238 | HT,HTS
239 | HTS,HTS
240 | HVN,HVN
241 | HWAY,HWY
242 | HWY,HWY
243 | INLET,INLT
244 | INLT,INLT
245 | IS,IS
246 | ISLAND,IS
247 | ISLANDS,ISS
248 | ISLE,ISLE
249 | ISLES,ISLE
250 | ISLND,IS
251 | ISLNDS,ISS
252 | ISS,ISS
253 | JCTION,JCT
254 | JCT,JCT
255 | JCTN,JCT
256 | JCTNS,JCT
257 | JCTS,JCT
258 | JUNCTION,JCT
259 | JUNCTIONS,JCT
260 | JUNCTN,JCT
261 | JUNCTON,JCT
262 | KEY,KY
263 | KEYS,KY
264 | KNL,KNLS
265 | KNLS,KNLS
266 | KNOL,KNLS
267 | KNOLL,KNLS
268 | KNOLLS,KNLS
269 | KY,KY
270 | KYS,KY
271 | LAKE,LK
272 | LAKES,LKS
273 | LANDING,LNDG
274 | LANE,LN
275 | LANES,LN
276 | LCK,LCKS
277 | LCKS,LCKS
278 | LDGE,LDG
279 | LDG,LDG
280 | LF,LF
281 | LGT,LGT
282 | LIGHT,LGT
283 | LIGHTS,LGT
284 | LK,LK
285 | LKS,LKS
286 | LNDG,LNDG
287 | LNDNG,LNDG
288 | LN,LN
289 | LOAF,LF
290 | LOCK,LCKS
291 | LOCKS,LCKS
292 | LODGE,LDG
293 | LODG,LDG
294 | LOOP,LOOP
295 | LOOPS,LOOP
296 | MALL,MALL
297 | MANOR,MNR
298 | MANORS,MNR
299 | MDW,MDWS
300 | MDWS,MDWS
301 | MEADOW,MDWS
302 | MEADOWS,MDWS
303 | MEDOWS,MDWS
304 | MILL,ML
305 | MILLS,MLS
306 | MISSION,MSN
307 | MISSN,MSN
308 | ML,ML
309 | MLS,MLS
310 | MNR,MNR
311 | MNRS,MNR
312 | MNTAIN,MTN
313 | MNT,MT
314 | MNTN,MTN
315 | MNTNS,MTN
316 | MOUNTAIN,MTN
317 | MOUNTAINS,MTN
318 | MOUNTIN,MTN
319 | MOUNT,MT
320 | MSN,MSN
321 | MSSN,MSN
322 | MTIN,MTN
323 | MT,MT
324 | MTN,MTN
325 | NCK,NCK
326 | NECK,NCK
327 | ORCHARD,ORCH
328 | ORCH,ORCH
329 | ORCHRD,ORCH
330 | OVAL,OVAL
331 | OVL,OVAL
332 | PARK,PARK
333 | PARKS,PARK
334 | PARKWAY,PKY
335 | PARKWAYS,PKY
336 | PARKWY,PKY
337 | PASS,PASS
338 | PATH,PATH
339 | PATHS,PATH
340 | PIKE,PIKE
341 | PIKES,PIKE
342 | PINE,PNES
343 | PINES,PNES
344 | PK,PARK
345 | PKWAY,PKY
346 | PKWY,PKY
347 | PKWYS,PKY
348 | PKY,PKY
349 | PLACE,PL
350 | PLAINES,PLNS
351 | PLAIN,PLN
352 | PLAINS,PLNS
353 | PLAZA,PLZ
354 | PLN,PLN
355 | PLNS,PLNS
356 | PL,PL
357 | PLZA,PLZ
358 | PLZ,PLZ
359 | PNES,PNES
360 | POINT,PT
361 | POINTS,PT
362 | PORT,PRT
363 | PORTS,PRT
364 | PRAIRIE,PR
365 | PRARIE,PR
366 | PRK,PARK
367 | PR,PR
368 | PRR,PR
369 | PRT,PRT
370 | PRTS,PRT
371 | PT,PT
372 | PTS,PT
373 | RADIAL,RADL
374 | RADIEL,RADL
375 | RADL,RADL
376 | RAD,RADL
377 | RANCHES,RNCH
378 | RANCH,RNCH
379 | RAPID,RPDS
380 | RAPIDS,RPDS
381 | RDGE,RDG
382 | RDG,RDG
383 | RDGS,RDG
384 | RD,RD
385 | RDS,RD
386 | REST,RST
387 | RIDGE,RDG
388 | RIDGES,RDG
389 | RIVER,RIV
390 | RIV,RIV
391 | RIVR,RIV
392 | RNCH,RNCH
393 | RNCHS,RNCH
394 | ROAD,RD
395 | ROADS,RD
396 | ROW,ROW
397 | RPD,RPDS
398 | RPDS,RPDS
399 | RST,RST
400 | RUN,RUN
401 | RVR,RIV
402 | SHL,SHL
403 | SHLS,SHLS
404 | SHOAL,SHL
405 | SHOALS,SHLS
406 | SHOAR,SHR
407 | SHOARS,SHRS
408 | SHORE,SHR
409 | SHORES,SHRS
410 | SHR,SHR
411 | SHRS,SHRS
412 | SMT,SMT
413 | SPG,SPG
414 | SPGS,SPGS
415 | SPNG,SPG
416 | SPNGS,SPGS
417 | SPRING,SPG
418 | SPRINGS,SPGS
419 | SPRNG,SPG
420 | SPRNGS,SPGS
421 | SPUR,SPUR
422 | SPURS,SPUR
423 | SQRE,SQ
424 | SQR,SQ
425 | SQ,SQ
426 | SQUARE,SQ
427 | SQUARES,SQ
428 | SQU,SQ
429 | STA,STA
430 | STATION,STA
431 | STATN,STA
432 | STN,STA
433 | STRA,STRA
434 | STRAVEN,STRA
435 | STRAVENUE,STRA
436 | STRAVE,STRA
437 | STRAVN,STRA
438 | STRAV,STRA
439 | STREAM,STRM
440 | STREETS,ST
441 | STREET,ST
442 | STREME,STRM
443 | STRM,STRM
444 | STR,ST
445 | STRT,ST
446 | STRVN,STRA
447 | STRVNUE,STRA
448 | ST,ST
449 | SUMIT,SMT
450 | SUMITT,SMT
451 | SUMMIT,SMT
452 | TERRACE,TER
453 | TERR,TER
454 | TER,TER
455 | TPKE,TPKE
456 | TPK,TPKE
457 | TRACES,TRCE
458 | TRACE,TRCE
459 | TRACKS,TRAK
460 | TRACK,TRAK
461 | TRAFFICWAY,TRFY
462 | TRAILS,TRL
463 | TRAIL,TRL
464 | TRAK,TRAK
465 | TRCE,TRCE
466 | TRFY,TRFY
467 | TRKS,TRAK
468 | TRK,TRAK
469 | TRLS,TRL
470 | TRL,TRL
471 | TRNPK,TPKE
472 | TR,TRL
473 | TUNEL,TUNL
474 | TUNLS,TUNL
475 | TUNL,TUNL
476 | TUNNELS,TUNL
477 | TUNNEL,TUNL
478 | TUNNL,TUNL
479 | TURNPIKE,TPKE
480 | TURNPK,TPKE
481 | UNIONS,UN
482 | UNION,UN
483 | UN,UN
484 | VALLEYS,VLY
485 | VALLEY,VLY
486 | VALLY,VLY
487 | VDCT,VIA
488 | VIADCT,VIA
489 | VIADUCT,VIA
490 | VIA,VIA
491 | VIEWS,VW
492 | VIEW,VW
493 | VILLAGES,VLG
494 | VILLAGE,VLG
495 | VILLAG,VLG
496 | VILLE,VL
497 | VILLG,VLG
498 | VILLIAGE,VLG
499 | VILL,VLG
500 | VISTA,VIS
501 | VIST,VIS
502 | VIS,VIS
503 | VLGS,VLG
504 | VLG,VLG
505 | VLLY,VLY
506 | VL,VL
507 | VLYS,VLY
508 | VLY,VLY
509 | VSTA,VIS
510 | VST,VIS
511 | VWS,VW
512 | VW,VW
513 | WALKS,WALK
514 | WALK,WALK
515 | WAYS,WAY
516 | WAY,WAY
517 | WELLS,WLS
518 | WELL,WLS
519 | WLS,WLS
520 | WY,WAY
521 | XING,XING
522 |
--------------------------------------------------------------------------------
/address/test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SwoopSearch/pyaddress/62ebb07a6840e710d256406a8ec1d06abec0e1c4/address/test/__init__.py
--------------------------------------------------------------------------------
/address/test/test_address.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from ..address import Address, AddressParser
3 |
4 |
class AddressTest(unittest.TestCase):
    """Parse sample address strings and check every extracted component.

    Each test names only the components it expects to be set; all other
    components listed in FIELDS are expected to equal None. The `building`
    attribute is not checked (those assertions were commented out in the
    original tests).
    """

    # Components verified for every address, in the order they are checked.
    FIELDS = (
        'house_number',
        'street_prefix',
        'street',
        'street_suffix',
        'city',
        'state',
        'zip',
        'apartment',
    )

    parser = None

    def setUp(self):
        self.parser = AddressParser()

    def _check_address(self, text, **expected):
        """Parse `text` and compare each field in FIELDS with `expected`.

        Fields missing from `expected` must compare equal to None.
        assertEqual is used so a failure reports both values.
        """
        addr = Address(text, self.parser)
        for field in self.FIELDS:
            self.assertEqual(
                getattr(addr, field),
                expected.get(field),
                "unexpected %s for %r" % (field, text),
            )

    def test_basic_full_address(self):
        self._check_address(
            "2 N. Park Street, Madison, WI 53703",
            house_number="2",
            street_prefix="N.",
            street="Park",
            street_suffix="St.",
            city="Madison",
            state="WI",
            zip="53703",
        )

    def test_multi_address(self):
        self._check_address(
            "416/418 N. Carroll St.",
            house_number="416",
            street_prefix="N.",
            street="Carroll",
            street_suffix="St.",
        )

    def test_no_suffix(self):
        self._check_address(
            "230 Lakelawn",
            house_number="230",
            street="Lakelawn",
        )

    # Disabled in the original file; presumably building names are not
    # parsed yet -- confirm before re-enabling.
    # def test_building_in_front(self):
    #     self._check_address(
    #         "Roundhouse Apartments 626 Langdon",
    #         house_number="626",
    #         street="Langdon",
    #     )
    #     # self.assertEqual(addr.building, "Roundhouse Apartments")

    def test_streets_named_after_states(self):
        self._check_address(
            "504 W. Washington Ave.",
            house_number="504",
            street_prefix="W.",
            street="Washington",
            street_suffix="Ave.",
        )

    def test_hash_apartment(self):
        self._check_address(
            "407 West Doty St. #2",
            house_number="407",
            street_prefix="W.",
            street="Doty",
            street_suffix="St.",
            apartment="#2",
        )

    def test_stray_dash_apartment(self):
        self._check_address(
            "407 West Doty St. - #2",
            house_number="407",
            street_prefix="W.",
            street="Doty",
            street_suffix="St.",
            apartment="#2",
        )

    def test_suffixless_street_with_city(self):
        self._check_address(
            "431 West Johnson, Madison, WI",
            house_number="431",
            street_prefix="W.",
            street="Johnson",
            city="Madison",
            state="WI",
        )
114 |
115 |
class AddressParserTest(unittest.TestCase):
    """Check the suffix, city and state tables exposed by AddressParser."""

    ap = None

    def setUp(self):
        self.ap = AddressParser()

    def test_load_suffixes(self):
        # assertEqual reports both values on failure, unlike assertTrue(a == b).
        self.assertEqual(self.ap.suffixes["ALLEY"], "ALY")

    def test_load_cities(self):
        self.assertIn("wisconsin rapids", self.ap.cities)

    def test_load_states(self):
        self.assertEqual(self.ap.states["Wisconsin"], "WI")

    # Not using preloaded streets any more.
    # def test_load_streets(self):
    #     self.assertIn("mifflin", self.ap.streets)
134 |
# Run the tests in this module when the file is executed directly.
# NOTE(review): this module uses a relative import (`from ..address import ...`),
# which presumably fails when the file is run as a plain script -- confirm, and
# prefer running the suite through the package (e.g. `python -m unittest`).
if __name__ == '__main__':
    unittest.main()
137 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from distutils.core import setup
2 |
# Read the long description up front so the README file handle is closed
# deterministically instead of being left open for the garbage collector.
with open('README.rst', 'rt') as readme_file:
    long_description = readme_file.read()

setup(
    name='address',
    version='0.1.1',
    url='https://github.com/SwoopSearch/pyaddress',
    author='Swoop Search LLC, Josh Gachnang, Rob Jauquet',
    author_email='Josh@SwoopSrch.com',
    description='address is an address parsing library, taking the guesswork out of using addresses in your applications.',
    long_description=long_description,
    #data_files=[('', ['README.rst','pyaddress/cities.csv', 'pyaddress/suffixes.csv', 'pyaddress/streets.csv', 'pyaddress/tests.py', 'pyaddress/test_list.py'])],
    packages=['address'],
    package_dir={'address': 'address'},
    # CSV lookup tables installed alongside the package.
    package_data={'address': ['cities.csv', 'streets.csv', 'suffixes.csv']},
    classifiers=[
        "License :: OSI Approved :: BSD License",
        "Natural Language :: English",
        "Programming Language :: Python :: 2 :: Only",
        "Topic :: Software Development :: Libraries",
        "Topic :: Text Processing",
    ],
    # Replaces the tutorial placeholder keywords with ones that describe
    # what the package actually does.
    keywords="address parser parsing street postal",
    maintainer="Swoop Search LLC, Josh Gachnang, Rob Jauquet",
    maintainer_email="Josh@SwoopSrch.com",
)
26 |
--------------------------------------------------------------------------------