├── .flake8 ├── .github ├── ISSUE_TEMPLATE │ ├── feature_request.md │ └── improve-parser.md ├── PULL_REQUEST_TEMPLATE.md └── workflows │ └── python-publish.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── MANIFEST.in ├── README.md ├── docs ├── Makefile ├── conf.py └── index.rst ├── measure_performance └── test_data │ ├── labeled.xml │ ├── multi_word_state_addresses.xml │ ├── simple_address_patterns.xml │ ├── synthetic_clean_osm_data.xml │ ├── synthetic_osm_data.xml │ └── us50_test_tagged.xml ├── parse_scripts ├── import_osm.py ├── parse.py └── parse_openaddress.py ├── pyproject.toml ├── raw ├── LICENSE.md ├── openaddresses │ └── us-ia-linn.json ├── osm_data.xml ├── osm_data_full_addr.xml ├── osm_data_street.xml ├── us50.test.raw ├── us50.test.tagged ├── us50.train.raw └── us50.train.tagged ├── setup.py ├── tests ├── test_labeling.py ├── test_tagging.py ├── test_token_features.py └── test_tokenizing.py ├── training ├── README.md ├── example_training.xml ├── labeled.xml ├── multi_word_state_addresses.xml ├── openaddress_us_ia_linn.xml ├── synthetic_clean_osm_data.xml ├── synthetic_osm_data_xml.xml ├── unparseable.csv ├── us50_messiest_manual_label.xml └── us50_train_tagged.xml └── usaddress └── __init__.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length=160 3 | extend-ignore = E203 -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: features 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 
12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/improve-parser.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Improve Parser 3 | about: Is a parse not looking the way you expected? Let us know here! 4 | title: '' 5 | labels: bad parse 6 | assignees: '' 7 | 8 | --- 9 | 10 | **The Input Address** 11 | What string are you having an issue with? 12 | 13 | (ex. 123 Main St. Chicago, Illinois) 14 | 15 | **Current Output** 16 | What is the parser currently returning? 17 | 18 | 123 - AddressNumber 19 | Main - AddressNumber 20 | St. - StreetNamePostType 21 | Chicago - PlaceName 22 | Illinois - PlaceName 23 | 24 | **Expected Output** 25 | What are you expecting the parser to return? 26 | 27 | 123 - AddressNumber 28 | Main - StreetName 29 | St. - StreetNamePostType 30 | Chicago - PlaceName 31 | Illinois - StateName 32 | 33 | **Examples** 34 | Preferably 8-12 real world examples with a similar pattern that we can use to train the parser. This can be from your dataset if you're comfortable sharing some. 35 | - 456 Second St. Chicago, Illinois 36 | 37 | **Additional context** 38 | Optional. Add any other context here. 39 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ## Overview 2 | 3 | Brief description of what this PR does, and why it is needed. 
4 | 5 | If this pr closes an issue, make note of it here 👇 6 | Closes #XXX 7 | 8 | ### Demo 9 | 10 | Optional. Screenshots, `curl` examples, etc. 11 | 12 | ### Notes 13 | 14 | Optional. Ancillary topics, caveats, alternative strategies that didn't work out, anything else. 15 | 16 | ## Testing Instructions 17 | 18 | * How to test this PR 19 | * Prefer bulleted description 20 | * Start after checking out this branch 21 | * Include any setup required, such as bundling scripts, restarting services, etc. 22 | * Include test case, and expected output 23 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | name: Test and Publish Python Package 2 | 3 | on: [push, pull_request] 4 | 5 | permissions: 6 | contents: read 7 | 8 | jobs: 9 | lint: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v2 13 | - uses: actions/setup-python@v2 14 | with: 15 | python-version: "3.12" 16 | - name: Install dependencies 17 | run: | 18 | pip install --upgrade pip 19 | pip install .[dev] 20 | - name: flake8 21 | run: flake8 usaddress tests 22 | - name: isort 23 | if: always() 24 | run: isort --check-only . 25 | - name: black 26 | if: always() 27 | run: black . 
--check 28 | - name: mypy 29 | if: always() 30 | run: mypy 31 | test: 32 | timeout-minutes: 40 33 | runs-on: ${{ matrix.os }} 34 | strategy: 35 | fail-fast: false 36 | matrix: 37 | os: [windows-latest, macos-latest, ubuntu-latest] 38 | python-version: [3.9, "3.10", "3.11", "3.12", "3.13-dev"] 39 | 40 | steps: 41 | - uses: actions/checkout@v2 42 | - name: Set up Python ${{ matrix.python-version }} 43 | uses: actions/setup-python@v2 44 | with: 45 | python-version: ${{ matrix.python-version }} 46 | - name: Install dependencies 47 | run: | 48 | pip install --upgrade pip 49 | pip install -e .[dev] 50 | - name: pytest 51 | run: pytest 52 | 53 | deploy: 54 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') 55 | needs: [test, lint] 56 | 57 | runs-on: ubuntu-latest 58 | 59 | name: Upload release to PyPI 60 | environment: 61 | name: pypi 62 | url: https://pypi.org/p/usaddress 63 | permissions: 64 | id-token: write 65 | steps: 66 | - uses: actions/checkout@v4 67 | - name: Set up Python 68 | uses: actions/setup-python@v3 69 | with: 70 | python-version: '3.x' 71 | - name: Install dependencies 72 | run: | 73 | python -m pip install --upgrade pip 74 | pip install build 75 | - name: Build package 76 | run: python -m build 77 | - name: Publish package distributions to PyPI 78 | uses: pypa/gh-action-pypi-publish@release/v1 79 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | bin/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # Installer logs 26 | pip-log.txt 27 | pip-delete-this-directory.txt 28 | 29 | # Unit test / coverage reports 30 | htmlcov/ 
31 | .tox/ 32 | .coverage 33 | .cache 34 | nosetests.xml 35 | coverage.xml 36 | 37 | # Translations 38 | *.mo 39 | 40 | # Mr Developer 41 | .mr.developer.cfg 42 | .project 43 | .pydevproject 44 | 45 | # Rope 46 | .ropeproject 47 | 48 | # Django stuff: 49 | *.log 50 | *.pot 51 | 52 | # Sphinx documentation 53 | docs/_build/ 54 | 55 | *.crfsuite 56 | *.csv 57 | *.txt 58 | 59 | # temp files 60 | *~ 61 | *# 62 | 63 | .DS_Store 64 | .venv 65 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: 24.8.0 4 | hooks: 5 | - id: black 6 | - repo: https://github.com/pycqa/isort 7 | rev: 5.13.2 8 | hooks: 9 | - id: isort 10 | name: isort (python) 11 | - repo: https://github.com/pycqa/flake8 12 | rev: "7.1.1" 13 | hooks: 14 | - id: flake8 15 | args: [--config=.flake8] 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 The Atlanta Journal Constitution 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | Permission is hereby granted, free of charge, to any person obtaining a copy 24 | of this software and associated documentation files (the "Software"), to deal 25 | in the Software without restriction, including without limitation the rights 26 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 27 | copies of the Software, and to permit persons to whom the Software is 28 | furnished to do so, subject to the following conditions: 29 | 30 | The above copyright notice and this permission notice shall be included in all 31 | copies or substantial portions of the Software. 32 | 33 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 34 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 35 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 36 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 37 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 38 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 39 | SOFTWARE. 
40 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include training/* -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | usaddress 2 | ================= 3 | usaddress is a Python library for parsing unstructured United States address strings into address components, using advanced NLP methods. 4 | 5 | **What this can do:** Using a probabilistic model, it makes (very educated) guesses in identifying address components, even in tricky cases where rule-based parsers typically break down. 6 | 7 | **What this cannot do:** It cannot identify address components with perfect accuracy, nor can it verify that a given address is correct/valid. 8 | 9 | It also does not normalize the address. However, [this library built on top of usaddress does](https://github.com/GreenBuildingRegistry/usaddress-scourgify). 10 | 11 | 12 | ## Tools built with usaddress 13 | 14 | ### [Parserator API](https://parserator.datamade.us/) 15 | A RESTful API built on top of usaddress for programmers who don't use python. Requires an API key and the first 1,000 parses are free. 16 | 17 | ### [Parserator Google Sheets App](https://workspace.google.com/u/0/marketplace/app/parserator_parse_and_split_addresses/945974620840) 18 | Parserator: Parse and Split Addresses allows you to easily split addresses into separate columns by street, city, state, zipcode and more right in Google Sheets. 19 | 20 | ## How to use the usaddress python library 21 | 22 | 1. Install usaddress with [pip](https://pip.readthedocs.io/en/latest/quickstart.html), a tool for installing and managing python packages ([beginner's guide here](http://www.dabapps.com/blog/introduction-to-pip-and-virtualenv-python/)). 
23 | 24 | In the terminal, 25 | 26 | ```bash 27 | pip install usaddress 28 | ``` 29 | 2. Parse some addresses! 30 | 31 | ![usaddress](https://cloud.githubusercontent.com/assets/1406537/7869001/65c6ae62-0545-11e5-8b65-5d9e71dface5.gif) 32 | 33 | Note that `parse` and `tag` are different methods: 34 | ```python 35 | import usaddress 36 | addr='123 Main St. Suite 100 Chicago, IL' 37 | 38 | # The parse method will split your address string into components, and label each component. 39 | # expected output: [(u'123', 'AddressNumber'), (u'Main', 'StreetName'), (u'St.', 'StreetNamePostType'), (u'Suite', 'OccupancyType'), (u'100', 'OccupancyIdentifier'), (u'Chicago,', 'PlaceName'), (u'IL', 'StateName')] 40 | usaddress.parse(addr) 41 | 42 | # The tag method will try to be a little smarter 43 | # it will merge consecutive components, strip commas, & return an address type 44 | # expected output: (OrderedDict([('AddressNumber', u'123'), ('StreetName', u'Main'), ('StreetNamePostType', u'St.'), ('OccupancyType', u'Suite'), ('OccupancyIdentifier', u'100'), ('PlaceName', u'Chicago'), ('StateName', u'IL')]), 'Street Address') 45 | usaddress.tag(addr) 46 | ``` 47 | 48 | ## How to use this development code (for the nerds) 49 | usaddress uses [parserator](https://github.com/datamade/parserator), a library for making and improving probabilistic parsers - specifically, parsers that use [python-crfsuite](https://github.com/tpeng/python-crfsuite)'s implementation of conditional random fields. Parserator allows you to train the usaddress parser's model (a .crfsuite settings file) on labeled training data, and provides tools for adding new labeled training data. 
50 | 51 | ### Building & testing the code in this repo 52 | 53 | To build a development version of usaddress on your machine, run the following code in your command line: 54 | 55 | ``` 56 | git clone https://github.com/datamade/usaddress.git 57 | cd usaddress 58 | pip install -e ."[dev]" 59 | ``` 60 | 61 | Then run the testing suite to confirm that everything is working properly: 62 | 63 | ``` 64 | pytest 65 | ``` 66 | 67 | Having trouble building the code? [Open an issue](https://github.com/datamade/usaddress/issues/new) and we'd be glad to help you troubleshoot. 68 | 69 | ### Adding new training data 70 | 71 | If usaddress is consistently failing on particular address patterns, you can adjust the parser's behavior by adding new training data to the model. [Follow our guide in the training directory](./training/README.md), and be sure to make a pull request so that we can incorporate your contribution into our next release! 72 | 73 | ## Important links 74 | 75 | * Web Interface: https://parserator.datamade.us/usaddress 76 | * Python Package Distribution: https://pypi.python.org/pypi/usaddress 77 | * Python Package Documentation: https://usaddress.readthedocs.io/ 78 | * API Documentation: https://parserator.datamade.us/api-docs 79 | * Repository: https://github.com/datamade/usaddress 80 | * Issues: https://github.com/datamade/usaddress/issues 81 | * Blog post: http://datamade.us/blog/parsing-addresses-with-usaddress 82 | 83 | ## Team 84 | 85 | * [Forest Gregg](https://github.com/fgregg), DataMade 86 | * [Cathy Deng](https://github.com/cathydeng), DataMade 87 | * [Miroslav Batchkarov](http://mbatchkarov.github.io), University of Sussex 88 | * [Jean Cochrane](https://github.com/jeancochrane), DataMade 89 | 90 | ## Bad Parses / Bugs 91 | 92 | Report issues in the [issue tracker](https://github.com/datamade/usaddress/issues) 93 | 94 | If an address was parsed incorrectly, please let us know! 
You can either [open an issue](https://github.com/datamade/usaddress/issues/new) or (if you're adventurous) [add new training data to improve the parser's model.](./training/README.md) When possible, please send over a few real-world examples of similar address patterns, along with some info about the source of the data - this will help us train the parser and improve its performance. 95 | 96 | If something in the library is not behaving intuitively, it is a bug, and should be reported. 97 | 98 | ## Note on Patches/Pull Requests 99 | 100 | * Fork the project. 101 | * Make your feature addition or bug fix. 102 | * Send us a pull request. Bonus points for topic branches! 103 | 104 | ## Copyright 105 | 106 | Copyright (c) 2025 Atlanta Journal Constitution. Released under the [MIT License](./LICENSE). 107 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make <target>' where <target> is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | rm -rf $(BUILDDIR)/* 51 | 52 | html: 53 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 54 | @echo 55 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 
56 | 57 | dirhtml: 58 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 59 | @echo 60 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 61 | 62 | singlehtml: 63 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 64 | @echo 65 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 66 | 67 | pickle: 68 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 69 | @echo 70 | @echo "Build finished; now you can process the pickle files." 71 | 72 | json: 73 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 74 | @echo 75 | @echo "Build finished; now you can process the JSON files." 76 | 77 | htmlhelp: 78 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 79 | @echo 80 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 81 | ".hhp project file in $(BUILDDIR)/htmlhelp." 82 | 83 | qthelp: 84 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 85 | @echo 86 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 87 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 88 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/usaddress.qhcp" 89 | @echo "To view the help file:" 90 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/usaddress.qhc" 91 | 92 | devhelp: 93 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 94 | @echo 95 | @echo "Build finished." 96 | @echo "To view the help file:" 97 | @echo "# mkdir -p $$HOME/.local/share/devhelp/usaddress" 98 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/usaddress" 99 | @echo "# devhelp" 100 | 101 | epub: 102 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 103 | @echo 104 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 105 | 106 | latex: 107 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 108 | @echo 109 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 
110 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 111 | "(use \`make latexpdf' here to do that automatically)." 112 | 113 | latexpdf: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo "Running LaTeX files through pdflatex..." 116 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 117 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 118 | 119 | latexpdfja: 120 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 121 | @echo "Running LaTeX files through platex and dvipdfmx..." 122 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 123 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 124 | 125 | text: 126 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 127 | @echo 128 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 129 | 130 | man: 131 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 132 | @echo 133 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 134 | 135 | texinfo: 136 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 137 | @echo 138 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 139 | @echo "Run \`make' in that directory to run these through makeinfo" \ 140 | "(use \`make info' here to do that automatically)." 141 | 142 | info: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo "Running Texinfo files through makeinfo..." 145 | make -C $(BUILDDIR)/texinfo info 146 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 147 | 148 | gettext: 149 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 150 | @echo 151 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 152 | 153 | changes: 154 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 155 | @echo 156 | @echo "The overview file is in $(BUILDDIR)/changes." 
157 | 158 | linkcheck: 159 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 160 | @echo 161 | @echo "Link check complete; look for any errors in the above output " \ 162 | "or in $(BUILDDIR)/linkcheck/output.txt." 163 | 164 | doctest: 165 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 166 | @echo "Testing of doctests in the sources finished, look at the " \ 167 | "results in $(BUILDDIR)/doctest/output.txt." 168 | 169 | xml: 170 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 171 | @echo 172 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 173 | 174 | pseudoxml: 175 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 176 | @echo 177 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 178 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # usaddress documentation build configuration file, created by 4 | # sphinx-quickstart on Thu Oct 2 15:12:14 2014. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | # If extensions (or modules to document with autodoc) are in another directory, 16 | # add these directories to sys.path here. If the directory is relative to the 17 | # documentation root, use os.path.abspath to make it absolute, like shown here. 18 | # sys.path.insert(0, os.path.abspath('.')) 19 | 20 | # -- General configuration ------------------------------------------------ 21 | 22 | # If your documentation needs a minimal Sphinx version, state it here. 
23 | # needs_sphinx = '1.0' 24 | 25 | # Add any Sphinx extension module names here, as strings. They can be 26 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 27 | # ones. 28 | extensions = [] 29 | 30 | # Add any paths that contain templates here, relative to this directory. 31 | templates_path = ["_templates"] 32 | 33 | # The suffix of source filenames. 34 | source_suffix = ".rst" 35 | 36 | # The encoding of source files. 37 | # source_encoding = 'utf-8-sig' 38 | 39 | # The master toctree document. 40 | master_doc = "index" 41 | 42 | # General information about the project. 43 | project = "usaddress" 44 | copyright = "2014, Cathy Deng, Forest Gregg" 45 | 46 | # The version info for the project you're documenting, acts as replacement for 47 | # |version| and |release|, also used in various other places throughout the 48 | # built documents. 49 | # 50 | # The short X.Y version. 51 | version = "0.5.4" 52 | # The full version, including alpha/beta/rc tags. 53 | release = "0.5.4" 54 | 55 | # The language for content autogenerated by Sphinx. Refer to documentation 56 | # for a list of supported languages. 57 | # language = None 58 | 59 | # There are two options for replacing |today|: either, you set today to some 60 | # non-false value, then it is used: 61 | # today = '' 62 | # Else, today_fmt is used as the format for a strftime call. 63 | # today_fmt = '%B %d, %Y' 64 | 65 | # List of patterns, relative to source directory, that match files and 66 | # directories to ignore when looking for source files. 67 | exclude_patterns = ["_build"] 68 | 69 | # The reST default role (used for this markup: `text`) to use for all 70 | # documents. 71 | # default_role = None 72 | 73 | # If true, '()' will be appended to :func: etc. cross-reference text. 74 | # add_function_parentheses = True 75 | 76 | # If true, the current module name will be prepended to all description 77 | # unit titles (such as .. function::). 
78 | # add_module_names = True 79 | 80 | # If true, sectionauthor and moduleauthor directives will be shown in the 81 | # output. They are ignored by default. 82 | # show_authors = False 83 | 84 | # The name of the Pygments (syntax highlighting) style to use. 85 | pygments_style = "sphinx" 86 | 87 | # A list of ignored prefixes for module index sorting. 88 | # modindex_common_prefix = [] 89 | 90 | # If true, keep warnings as "system message" paragraphs in the built documents. 91 | # keep_warnings = False 92 | 93 | 94 | # -- Options for HTML output ---------------------------------------------- 95 | 96 | # The theme to use for HTML and HTML Help pages. See the documentation for 97 | # a list of builtin themes. 98 | html_theme = "default" 99 | 100 | # Theme options are theme-specific and customize the look and feel of a theme 101 | # further. For a list of options available for each theme, see the 102 | # documentation. 103 | # html_theme_options = {} 104 | 105 | # Add any paths that contain custom themes here, relative to this directory. 106 | # html_theme_path = [] 107 | 108 | # The name for this set of Sphinx documents. If None, it defaults to 109 | # "<project> v<release> documentation". 110 | # html_title = None 111 | 112 | # A shorter title for the navigation bar. Default is the same as html_title. 113 | # html_short_title = None 114 | 115 | # The name of an image file (relative to this directory) to place at the top 116 | # of the sidebar. 117 | # html_logo = None 118 | 119 | # The name of an image file (within the static path) to use as favicon of the 120 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 121 | # pixels large. 122 | # html_favicon = None 123 | 124 | # Add any paths that contain custom static files (such as style sheets) here, 125 | # relative to this directory. They are copied after the builtin static files, 126 | # so a file named "default.css" will overwrite the builtin "default.css". 
127 | html_static_path = ["_static"] 128 | 129 | # Add any extra paths that contain custom files (such as robots.txt or 130 | # .htaccess) here, relative to this directory. These files are copied 131 | # directly to the root of the documentation. 132 | # html_extra_path = [] 133 | 134 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 135 | # using the given strftime format. 136 | # html_last_updated_fmt = '%b %d, %Y' 137 | 138 | # If true, SmartyPants will be used to convert quotes and dashes to 139 | # typographically correct entities. 140 | # html_use_smartypants = True 141 | 142 | # Custom sidebar templates, maps document names to template names. 143 | # html_sidebars = {} 144 | 145 | # Additional templates that should be rendered to pages, maps page names to 146 | # template names. 147 | # html_additional_pages = {} 148 | 149 | # If false, no module index is generated. 150 | # html_domain_indices = True 151 | 152 | # If false, no index is generated. 153 | # html_use_index = True 154 | 155 | # If true, the index is split into individual pages for each letter. 156 | # html_split_index = False 157 | 158 | # If true, links to the reST sources are added to the pages. 159 | # html_show_sourcelink = True 160 | 161 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 162 | # html_show_sphinx = True 163 | 164 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 165 | # html_show_copyright = True 166 | 167 | # If true, an OpenSearch description file will be output, and all pages will 168 | # contain a <link> tag referring to it. The value of this option must be the 169 | # base URL from which the finished HTML is served. 170 | # html_use_opensearch = '' 171 | 172 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 173 | # html_file_suffix = None 174 | 175 | # Output file base name for HTML help builder. 
176 | htmlhelp_basename = "usaddressdoc" 177 | 178 | 179 | # -- Options for LaTeX output --------------------------------------------- 180 | 181 | latex_elements = { 182 | # The paper size ('letterpaper' or 'a4paper'). 183 | # 'papersize': 'letterpaper', 184 | # The font size ('10pt', '11pt' or '12pt'). 185 | # 'pointsize': '10pt', 186 | # Additional stuff for the LaTeX preamble. 187 | # 'preamble': '', 188 | } 189 | 190 | # Grouping the document tree into LaTeX files. List of tuples 191 | # (source start file, target name, title, 192 | # author, documentclass [howto, manual, or own class]). 193 | latex_documents = [ 194 | ( 195 | "index", 196 | "usaddress.tex", 197 | "usaddress Documentation", 198 | "Cathy Deng, Forest Gregg", 199 | "manual", 200 | ), 201 | ] 202 | 203 | # The name of an image file (relative to this directory) to place at the top of 204 | # the title page. 205 | # latex_logo = None 206 | 207 | # For "manual" documents, if this is true, then toplevel headings are parts, 208 | # not chapters. 209 | # latex_use_parts = False 210 | 211 | # If true, show page references after internal links. 212 | # latex_show_pagerefs = False 213 | 214 | # If true, show URL addresses after external links. 215 | # latex_show_urls = False 216 | 217 | # Documents to append as an appendix to all manuals. 218 | # latex_appendices = [] 219 | 220 | # If false, no module index is generated. 221 | # latex_domain_indices = True 222 | 223 | 224 | # -- Options for manual page output --------------------------------------- 225 | 226 | # One entry per manual page. List of tuples 227 | # (source start file, name, description, authors, manual section). 228 | man_pages = [ 229 | ("index", "usaddress", "usaddress Documentation", ["Cathy Deng, Forest Gregg"], 1) 230 | ] 231 | 232 | # If true, show URL addresses after external links. 
233 | # man_show_urls = False 234 | 235 | 236 | # -- Options for Texinfo output ------------------------------------------- 237 | 238 | # Grouping the document tree into Texinfo files. List of tuples 239 | # (source start file, target name, title, author, 240 | # dir menu entry, description, category) 241 | texinfo_documents = [ 242 | ( 243 | "index", 244 | "usaddress", 245 | "usaddress Documentation", 246 | "Cathy Deng, Forest Gregg", 247 | "usaddress", 248 | "One line description of project.", 249 | "Miscellaneous", 250 | ), 251 | ] 252 | 253 | # Documents to append as an appendix to all manuals. 254 | # texinfo_appendices = [] 255 | 256 | # If false, no module index is generated. 257 | # texinfo_domain_indices = True 258 | 259 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 260 | # texinfo_show_urls = 'footnote' 261 | 262 | # If true, do not generate a @detailmenu in the "Top" node's menu. 263 | # texinfo_no_detailmenu = False 264 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. usaddress documentation master file, created by 2 | sphinx-quickstart on Thu Oct 2 15:12:14 2014. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | =================== 7 | usaddress |release| 8 | =================== 9 | 10 | usaddress is a python library for parsing unstructured address strings into address components, using advanced NLP methods. 11 | 12 | Installation 13 | ============ 14 | 15 | .. code-block:: bash 16 | 17 | pip install usaddress 18 | 19 | Usage 20 | ===== 21 | The ``parse`` method will split your address string into components, and label each component. 22 | .. 
code:: python 23 | 24 | >>> import usaddress 25 | >>> usaddress.parse('Robie House, 5757 South Woodlawn Avenue, Chicago, IL 60637') 26 | [('Robie', 'BuildingName'), 27 | ('House,', 'BuildingName'), 28 | ('5757', 'AddressNumber'), 29 | ('South', 'StreetNamePreDirectional'), 30 | ('Woodlawn', 'StreetName'), 31 | ('Avenue,', 'StreetNamePostType'), 32 | ('Chicago,', 'PlaceName'), 33 | ('IL', 'StateName'), 34 | ('60637', 'ZipCode')] 35 | 36 | The ``tag`` method will try to be a little smarter - it will merge consecutive components & strip commas, as well as return an address type (``Street Address``, ``Intersection``, ``PO Box``, or ``Ambiguous``) 37 | .. code:: python 38 | 39 | >>> import usaddress 40 | >>> usaddress.tag('Robie House, 5757 South Woodlawn Avenue, Chicago, IL 60637') 41 | (OrderedDict([ 42 | ('BuildingName', 'Robie House'), 43 | ('AddressNumber', '5757'), 44 | ('StreetNamePreDirectional', 'South'), 45 | ('StreetName', 'Woodlawn'), 46 | ('StreetNamePostType', 'Avenue'), 47 | ('PlaceName', 'Chicago'), 48 | ('StateName', 'IL'), 49 | ('ZipCode', '60637')]), 50 | 'Street Address') 51 | >>> usaddress.tag('State & Lake, Chicago') 52 | (OrderedDict([ 53 | ('StreetName', 'State'), 54 | ('IntersectionSeparator', '&'), 55 | ('SecondStreetName', 'Lake'), 56 | ('PlaceName', 'Chicago')]), 57 | 'Intersection') 58 | >>> usaddress.tag('P.O. Box 123, Chicago, IL') 59 | (OrderedDict([ 60 | ('USPSBoxType', 'P.O. Box'), 61 | ('USPSBoxID', '123'), 62 | ('PlaceName', 'Chicago'), 63 | ('StateName', 'IL')]), 64 | 'PO Box') 65 | 66 | Because the ``tag`` method returns an OrderedDict with labels as keys, it will throw a ``RepeatedLabelError`` error when multiple areas of an address have the same label, and thus can't be concatenated. When ``RepeatedLabelError`` is raised, it is likely that either (1) the input string is not a valid address, or (2) some tokens were labeled incorrectly. 
67 | 68 | ``RepeatedLabelError`` has the attributes ``original_string`` (the input string) and ``parsed_string`` (the output of the ``parse`` method on the input string). You can use these attributes to write custom exception handling, for example: 69 | .. code:: python 70 | 71 | try: 72 | tagged_address, address_type = usaddress.tag(string) 73 | except usaddress.RepeatedLabelError as e: 74 | some_special_instructions(e.parsed_string, e.original_string) 75 | 76 | It is also possible to pass a mapping dict to the ``tag`` method to remap the labels to your own format. For example: 77 | 78 | .. code:: python 79 | 80 | >>> import usaddress 81 | >>> address = 'Robie House, 5757 South Woodlawn Avenue, Chicago, IL 60637' 82 | >>> usaddress.tag(address, tag_mapping={ 83 | 'Recipient': 'recipient', 84 | 'AddressNumber': 'address1', 85 | 'AddressNumberPrefix': 'address1', 86 | 'AddressNumberSuffix': 'address1', 87 | 'StreetName': 'address1', 88 | 'StreetNamePreDirectional': 'address1', 89 | 'StreetNamePreModifier': 'address1', 90 | 'StreetNamePreType': 'address1', 91 | 'StreetNamePostDirectional': 'address1', 92 | 'StreetNamePostModifier': 'address1', 93 | 'StreetNamePostType': 'address1', 94 | 'CornerOf': 'address1', 95 | 'IntersectionSeparator': 'address1', 96 | 'LandmarkName': 'address1', 97 | 'USPSBoxGroupID': 'address1', 98 | 'USPSBoxGroupType': 'address1', 99 | 'USPSBoxID': 'address1', 100 | 'USPSBoxType': 'address1', 101 | 'BuildingName': 'address2', 102 | 'OccupancyType': 'address2', 103 | 'OccupancyIdentifier': 'address2', 104 | 'SubaddressIdentifier': 'address2', 105 | 'SubaddressType': 'address2', 106 | 'PlaceName': 'city', 107 | 'StateName': 'state', 108 | 'ZipCode': 'zip_code', 109 | }) 110 | (OrderedDict([ 111 | ('address2', 'Robie House'), 112 | ('address1', '5757 South Woodlawn Avenue'), 113 | ('city', 'Chicago'), 114 | ('state', 'IL'), 115 | ('zip_code', '60637')] 116 | ), 117 | 'Street Address') 118 | 119 | Details 120 | ======= 121 | 122 | The
address components are based upon the `United States Thoroughfare, Landmark, and Postal Address Data Standard `__, and usaddress knows about the following types of components: 123 | 124 | * **AddressNumber** - address number 125 | * **AddressNumberPrefix** - a modifier before an address number, e.g. 'Mile', '#' 126 | * **AddressNumberSuffix** - a modifier after an address number, e.g. 'B', '1/2' 127 | * **BuildingName** - the name of a building, e.g. 'Atlanta Financial Center' 128 | * **CornerOf** - words indicating that an address is a corner, e.g. 'Junction', 'corner of' 129 | * **IntersectionSeparator** - a conjunction connecting parts of an intersection, e.g. 'and', '&' 130 | * **LandmarkName** - the name of a landmark, e.g. 'Wrigley Field', 'Union Station' 131 | * **NotAddress** - a non-address component that doesn't refer to a recipient 132 | * **OccupancyType** - a type of occupancy within a building, e.g. 'Suite', 'Apt', 'Floor' 133 | * **OccupancyIdentifier** - the identifier of an occupancy, often a number or letter 134 | * **PlaceName** - city 135 | * **Recipient** - a non-address recipient, e.g. the name of a person/organization 136 | * **StateName** - state 137 | * **StreetName** - street name, excluding type & direction 138 | * **StreetNamePreDirectional** - a direction before a street name, e.g. 'North', 'S' 139 | * **StreetNamePreModifier** - a modifier before a street name that is not a direction, e.g. 'Old' 140 | * **StreetNamePreType** - a street type that comes before a street name, e.g. 'Route', 'Ave' 141 | * **StreetNamePostDirectional** - a direction after a street name, e.g. 'North', 'S' 142 | * **StreetNamePostModifier** - a modifier after a street name, e.g. 'Ext' 143 | * **StreetNamePostType** - a street type that comes after a street name, e.g.
'Avenue', 'Rd' 144 | * **SubaddressIdentifier** - the name/identifier of a subaddress component 145 | * **SubaddressType** - a level of detail in an address that is not an occupancy within a building, e.g. 'Building', 'Tower' 146 | * **USPSBoxGroupID** - the identifier of a USPS box group, usually a number 147 | * **USPSBoxGroupType** - a name for a group of USPS boxes, e.g. 'RR' 148 | * **USPSBoxID** - the identifier of a USPS box, usually a number 149 | * **USPSBoxType** - a USPS box, e.g. 'P.O. Box' 150 | * **ZipCode** - zip code 151 | 152 | 153 | Important links 154 | =============== 155 | 156 | * Documentation: https://usaddress.readthedocs.io/ 157 | * Repository: https://github.com/datamade/usaddress 158 | * Issues: https://github.com/datamade/usaddress/issues 159 | * Distribution: https://pypi.python.org/pypi/usaddress 160 | * Blog Post: http://datamade.us/blog/parsing-addresses-with-usaddress/ 161 | * Web Interface: http://parserator.datamade.us/usaddress 162 | 163 | Indices and tables 164 | ================== 165 | 166 | * :ref:`genindex` 167 | * :ref:`modindex` 168 | * :ref:`search` 169 | 170 | -------------------------------------------------------------------------------- /measure_performance/test_data/labeled.xml: -------------------------------------------------------------------------------- 1 | 2 | 431 Marietta St NW Fl. 3 3 | 1234 West U.S. Hwy 50 4 | 1234 S Martin Luther King Dr Chicago, IL 60637 5 | Apt 1B 626 E Kilbourn Ave Milwaukee, WI 53202 6 | P.O. Box 123456 7 | N165 W2123 Tartan Ct Jackson, WI 53037 8 | Box 123456 9 | 1234 W US Highway 50 10 | 1234 West U.S. Highway 50 11 | 2002 FM 544 Wylie, TX 75098 12 | 519 PR 462 Leonard, TX 75452 13 | 431 Marietta St NW Room 303 14 | 20 Benton Pl St Louis, MO 63104 15 | 431 Marietta St NW Floor 3 16 | 271 spring st nw attn: regus 17 | 120 N. Michigan Ave. 
Ste 1605, Chicago, IL 60601 18 | 1234 W US Hwy 50 19 | 1234 West US Hwy 50 20 | 1234 West US Highway 50 21 | Box # 123456 22 | 431 Marietta St NW 3rd Floor 23 | 232 Dubei Ct Salt Lake City UT 84111 24 | 123 W Hastings St 123456789 25 | PO Box 123456 26 | CORPORATE TRAVEL 27 | MICHAEL A CASOLO 28 | ATTN SHELIA LEWIS 29 | 77 w WACKER DR SUITE 1800 30 | CHICAGO IL 60601 31 | MARTIN DAVIS 32 | SOS SECURITY IL CHICAGO BRANCH 33 | 30 S WACKER DR 34 | STE 2200 35 | CHICAGO IL 60606 36 | Zendesk 37 | 38 | Christian Golding 39 | 1019 Market St 40 | 41 | San Francisco CA 94103 42 | LEXI HAGENSON 43 | 860 w Blackhawk 305 44 | CHICAGO ll. 60642—2534 45 | Erin Johnson 46 | c/o DTZ 47 | Suite 1800 48 | 77 W Wacker Dr 49 | Chicago, IL 606001 50 | 100 Gold Street, 2nd Fl., New York, NY 10038 51 | 59-17 Junction Boulevard, 17th Floor Conference Room, Flushing, New York 52 | 830 Fifth Avenue, Room 313, New York, NY 10065 53 | 10 Country Road 3668 Splendora TX 77372 54 | 1954 Country Road 43 Greenville TX 75401 55 | 3419 SAINT JOHN W ST 56 | 3301 NEW MEXICO NW AV 57 | 2802 BALLAST PT W BLVD 58 | 2350 WASHINGTON NE PL 59 | 10 POLLY DRUMMOND CTR 60 | PO Box 9580 Rancho Santa Fe CA 92067 61 | 123 E Marcy St Ste 201 Santa Fe NM 87501 62 | 130 Grant Ave Ste 203 Santa Fe NM 87501 63 | 12631 Imperial Hgwy Santa Fe CA 90670 64 | 1000 Cordova Place, # 234 Santa Fe NM 87505 65 | 12145 NW Grand Ave El Mirage, AZ 85335 66 | church street station po box 6793 new york ny 10249 67 | mail code 5021 p o box 660367 dallas tx 75266 68 | rr # 1 box 54 loami il 62661 69 | 1555 palm beach lakes blvd. ste. 406 w. palm beach fl 33401 70 | 2100 n. florida mango rd. w. 
palm beach fl 33409 71 | rr # 3 box 212 mcleansboro il 62859 72 | 16781 chagrin blvd # 124 shaker heights oh 44120-3721 73 | 3121 187th street crt n east moline il 61244 74 | bin # 11 p o box 9201 minneapolis mn 55480 75 | dept 3234 p o box 123234 dallas tx 75312 76 | 18530 mack ave # 445 grosse pointe farms mi 48236-3254 77 | 1122 2nd st n saint petersburg fl 33701-1710 78 | file 74155 po box 60000 san francisco ca 94160 79 | 10502 shadow ridge ln apt 103 louisville ky 80 | msc 410833 p.o. box 415000 nashville tn 37241 81 | customer # 55-0044943 lock box 427 jamison pa 18929 82 | 2207 d gault ave n ft payne al 35967 83 | po box 33701 dept 33701 sn francisco ca 94139 84 | 1950 n. stemmons freeway 85 | 93 s jackson st # 75108 seattle wa 98104-2818 86 | 701 u.s. highway 1 ste 402 n. palm beach fl 33408 87 | 428 westwind dr north palm beach fl 33408 88 | 3424 camp robinson rd. north little rock ar 72118 89 | po box 2303 department 130 indianapolis in 46206 90 | 9457 s university blvd # 252 highlands ranch co 80126-4976 91 | 62835 JESUS MARIA RD MOKELUMNE HILL CA 95245-9658 92 | 9275 NE 48TH CT APT 105 LIGHTHOUSE POINT FL 33064-7908 93 | 9208 COUNTY LINE RD HUNTINGDON VALLEY PA 19006-1701 94 | 895 W MILLER ST FRUITLAND PK FL 34731-2244 95 | 18 SLEEPY LAGOON WAY FLOWERY BRANCH GA 30542-7556 96 | 1080 BUCK HILL DR HUNTINGDON VY PA 19006-7910 97 | 85 MARTIN TRL FLOWERY BR GA 30542-3549 98 | 10095 TALBOT AVE HUNTINGTON WOODS MI 48070-1134 99 | 709 FOXWOOD CIR LAFAYETTE HL PA 19444-1646 100 | 981 CATHAY CIR HUNTINGTON BH CA 92646-4817 101 | 93 TWINBROOK RD BEAR CREEK TWP PA 18702-8415 102 | 612 N CATHERINE AVE LA GRANGE PARK IL 60526-1511 103 | 38415 BORGMAN AVE HUNTINGTON WD MI 48070-1104 104 | 7810 JORDAN RD GLOUCESTER PT VA 23062-2222 105 | 1503 BEAR CREEK BLVD BEAR CREEK TW PA 18702-9441 106 | 5418 RIVER RD LIBERTY GROVE RD NORTH WILKESBORO 28659 107 | 396 GEORGE W LILES PKWY NW CONCORD 28027 108 | 0 E WARDELL DR # APT 3 PEMBROKE 28372 109 | 406 North Highway 71 
Business Lowell AR 72745 110 | 2500 S AND W FARM RD HICKORY 28602 111 | 4315 WEBSTER AVENUE LH 112 | 4483 MANHATTAN COLLEGE PY 113 | 188 ROUTE 690 SPRINGBROOK TOWNSHIP, PA 18444 114 | 333 STATE ROUTE 590 ROARING BROOK TWP, PA 18444 115 | 3750 PRIORITY WAY SOUTH DR 116 | 250 JOHN W MORROW JR PKWY 117 | 900 Business 150 STE 3, Mansfield, PA 19402 118 | 83 Business 15, Mansfield, PA 16933 119 | 900 Business 150 STE 3, Mansfield, PA 19402 120 | 100 Business 40, Kernersville, NC 27284 121 | 400 Calm Lake Circle, Rochester, NY, 14612 122 | 37 Jefferson Crt, Fairport, NY 123 | 4 Cypress Ci, Fairport, NY 124 | 1646 Red Leaf Drive Fort Mill, South Carolina 29715 United States 125 | 15 Bridge Street Providence, Rhode Island 02903 United States 126 | 150 Citizens Circle Little River, South Carolina 29566 United States 127 | 4079 U.S. 17 Business Murrells Inlet, South Carolina 29576 United States 128 | 43 South Broadway Pitman, New Jersey 08071 United States 129 | HC 2333 Box 85 130 | HC 284 Box 27 131 | HC 7326 Box 66 132 | HC 992 Box 88 133 | HC R 32 Box # e3 134 | HC ROUTE 72 BOX 1A 135 | HIGHWAY CONTRACT rte # 46 BOX # 992 136 | HIGHWAY CONtraCT ROUTE 56 BOX 45C 137 | StaR ROUTE 75 BOX 5Z 138 | HCR 4e box # 32 139 | HCR 88 bOX 76E 140 | HWY CONTRACT ROUTE 102 BOX 255A 141 | 4510 COUNTY ROAD GV, APPLETON, WI 54913 142 | 7575 COUNTY ROAD ZZZ, MILWAUKEE, WI 54567 143 | 123A E COUNTY ROAD DV, WAUPACA, WI 54981 144 | 1331 COUNTY ROAD AA NE, AMHERST JUNCTION, WI 54407 145 | 133 W COUNTY ROAD LL, AMHERST, WI 54406 146 | 123 COUNTY ROAD ABC, APT 12, IOLA, WI 54445 147 | 200 EAST ELM, DENVER, COLORADO 148 | 55 WINDSOR PLACE, CHAMPAIGN, ILLINOIS 149 | 5 NORTH MAIN, VAN NUYS, CALIFORNIA 150 | 2609 BAYVIEW, FORT LAUDERDALE, FL 151 | 12855 6TH AVE, N. MIAMI, FL 33161 152 | 783 HOPE ST, PROVIDENCE, RHODE ISLAND 02906 153 | 200 EAST ELM, DENVER, COLORADO 154 | 977 PLEASANT STREET, N. 
ORANGE, NJ 07052 155 | 610 EAST MAIN MARION KANSAS 156 | 10 EAST LAKE, DENVER, COLORADO 157 | 2735 PAWTUCKET AVE EAST PROVIDENCE RHODE ISLAND 02914 158 | 5548 ELMER AVENUE, N. HOLLYWOOD, CA 91601 159 | 160 | -------------------------------------------------------------------------------- /measure_performance/test_data/multi_word_state_addresses.xml: -------------------------------------------------------------------------------- 1 | 2 | 1646 Red Leaf Drive Fort Mill, South Carolina 29715 United States 3 | 15 Bridge Street Providence, Rhode Island 02903 United States 4 | 150 Citizens Circle Little River, South Carolina 29566 United States 5 | 4079 U.S. 17 Business Murrells Inlet, South Carolina 29576 United States 6 | 43 South Broadway Pitman, New Jersey 08071 United States 7 | 8 | -------------------------------------------------------------------------------- /measure_performance/test_data/simple_address_patterns.xml: -------------------------------------------------------------------------------- 1 | 2 | 9112 Mendenhall Mall Road, Juneau, AK 99801 3 | 2701 Thayer Street, Evanston, 60201 4 | 34115 Sterling Highway, Anchor Point, AK 99556 5 | 2222 Silverside Road, Wilmington, DE 19810 6 | 111 Banyan Drive, Hilo, HI 96720 7 | Route Box # 201, Bingham, ME 04920 8 | 3555 68th Street Southeast, Caledonia, MI 49316 9 | 318 East 51st Street, Kansas City, MO 64112 10 | -------------------------------------------------------------------------------- /parse_scripts/import_osm.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | 3 | import requests 4 | 5 | query1 = """ 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | """ % ( 51 | (-70.000000, 50.000000, 25.000000, -125.000000) * 6 52 | ) 53 | r1 = requests.post("http://overpass-api.de/api/interpreter/", 
data=query1) 54 | r1.encoding = "utf-8" 55 | 56 | f = codecs.open("data/osm_data.xml", encoding="utf-8", mode="w+") 57 | f.write(r1.text) 58 | 59 | 60 | query2 = """ 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | """ % ( 78 | (-87.61309146881104, 41.890042371392965, 41.87234107841773, -87.64235973358154) * 2 79 | ) 80 | # r2 = requests.post('http://overpass-api.de/api/interpreter/', data=query2) 81 | 82 | # f = codecs.open("data/osm_data_street.xml", "wb", "utf-8") 83 | # r2.encoding = 'utf-8' 84 | # f.write(r2.text) 85 | 86 | 87 | query3 = """ 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | """ % ( 100 | (-70.000000, 50.000000, 25.000000, -125.000000) * 2 101 | ) 102 | 103 | if __name__ == "__main__": 104 | r3 = requests.post("http://overpass-api.de/api/interpreter/", data=query3) 105 | 106 | f = codecs.open("data/osm_data_full_addr.xml", "wb", "utf-8") 107 | r3.encoding = "utf-8" 108 | f.write(r3.text) 109 | -------------------------------------------------------------------------------- /parse_scripts/parse.py: -------------------------------------------------------------------------------- 1 | import ast 2 | import random 3 | import re 4 | 5 | from lxml import etree 6 | 7 | 8 | def xmlToAddrList(xml_file): 9 | # osm xml data -> list of dicts representing osm addresses 10 | tree = etree.parse(xml_file) 11 | root = tree.getroot() 12 | addr_list = [] 13 | for element in root: 14 | if element.tag == "node" or element.tag == "way": 15 | address = {} 16 | for x in element.iter("tag"): 17 | addr = ast.literal_eval(str(x.attrib)) 18 | address[addr["k"]] = addr["v"] 19 | addr_list.append(address) 20 | return addr_list 21 | 22 | 23 | def osmNaturalToTraining(xml_file): 24 | # natural addresses (in addr:full from osm xml data) -> training file (xml) 25 | address_list = xmlToAddrList(xml_file) 26 | train_addr_list = etree.Element("AddressCollection") 27 | trainFileName = "../training_data/" + re.sub(r"\W+", "_", xml_file) + 
".xml" 28 | punc_list = ",." 29 | # only the osm tags below will end up in training data; others will be 30 | # ignored 31 | osm_tags_to_addr_tags = { 32 | "addr:housenumber": "AddressNumber", 33 | "addr:street:prefix": "StreetNamePreDirectional", 34 | "addr:street:name": "StreetName", 35 | "addr:street:type": "StreetNamePostType", 36 | "addr:city": "PlaceName", 37 | "addr:state": "StateName", 38 | "addr:postcode": "ZipCode", 39 | } 40 | for address in address_list: 41 | addr_tokens = address["addr:full"].split() 42 | train_addr = etree.Element("AddressString") 43 | is_addr_taggable = True 44 | # loop through tokens & find tags for each 45 | for token in addr_tokens: 46 | is_token_taggable = False 47 | for key, value in list(address.items()): 48 | if all( 49 | [ 50 | key in list(osm_tags_to_addr_tags.keys()), 51 | key != "addr:full", 52 | token in value.split(), 53 | ] 54 | ): 55 | token_xml = etree.Element(osm_tags_to_addr_tags[key]) 56 | # check for punctuation 57 | token_xml.text = token 58 | if token[-1] in punc_list: 59 | token_xml.text = token[0:-1] 60 | token_xml.tail = token[-1] 61 | train_addr.append(token_xml) 62 | if is_token_taggable is False: 63 | is_addr_taggable = False 64 | if is_addr_taggable is True: 65 | train_addr_list.append(train_addr) 66 | output = etree.tostring(train_addr_list, pretty_print=True) 67 | with open(trainFileName, "w") as f: 68 | f.write(output) 69 | 70 | 71 | def osmSyntheticToTraining(xml_file): 72 | # osm xml data -> synthetic addresses -> training & test files (xml) 73 | address_list = xmlToAddrList(xml_file) 74 | train_addr_list = [] 75 | 76 | trainFileName = ( 77 | "training/training_data/synthetic_" 78 | + re.sub(r"\W+", "_", re.sub(r".*/", "", xml_file)) 79 | + ".xml" 80 | ) 81 | testFileName = ( 82 | "training/test_data/synthetic_" 83 | + re.sub(r"\W+", "_", re.sub(r".*/", "", xml_file)) 84 | + ".xml" 85 | ) 86 | 87 | synthetic_order = [ 88 | ("addr:housenumber", "AddressNumber", "Street"), 89 | ("addr:street:prefix", 
"StreetNamePreDirectional", "Street"), 90 | ("addr:street:name", "StreetName", "Street"), 91 | ("addr:street:type", "StreetNamePostType", "Street"), 92 | ("addr:city", "PlaceName", "City"), 93 | ("addr:state", "StateName", "Area"), 94 | ("addr:postcode", "ZipCode", "Area"), 95 | ] 96 | 97 | for address in address_list: 98 | train_addr = etree.Element("AddressString") 99 | components = {"Street": [], "City": [], "Area": []} 100 | for source_tag, target_tag, tag_type in synthetic_order: 101 | if source_tag in list(address.keys()): 102 | words = address[source_tag].split() 103 | for word in words: 104 | token_xml = etree.Element(target_tag) 105 | token_xml.text = word 106 | token_xml.tail = " " 107 | components[tag_type].append(token_xml) 108 | 109 | for tag_type in ("Street", "City", "Area"): 110 | label = components[tag_type] 111 | if label: 112 | label[-1].text += "," 113 | 114 | address_xml = components["Street"] + components["City"] + components["Area"] 115 | 116 | address_xml[-1].text = address_xml[-1].text[:-1] 117 | address_xml[-1].tail = None 118 | 119 | for xml_element in address_xml: 120 | train_addr.append(xml_element) 121 | 122 | train_addr_list.append(train_addr) 123 | 124 | random.shuffle(train_addr_list) 125 | percent_20 = int(len(train_addr_list) * 0.2) 126 | 127 | test_data = etree.Element("AddressCollection") 128 | test_data.extend(train_addr_list[:percent_20]) 129 | 130 | train_data = etree.Element("AddressCollection") 131 | train_data.extend(train_addr_list[percent_20:]) 132 | 133 | with open(trainFileName, "w") as f: 134 | f.write(etree.tostring(train_data, pretty_print=True)) 135 | 136 | with open(testFileName, "w") as f: 137 | f.write(etree.tostring(test_data, pretty_print=True)) 138 | 139 | 140 | def trainFileFromLines(addr_file, is_train=True): 141 | # us50 data -> training or test file (xml) 142 | lines = open(addr_file) 143 | if is_train is True: 144 | outputFileName = ( 145 | "training/training_data/" 146 | + re.sub(r"\W+", "_", 
re.sub(r".*/", "", addr_file)) 147 | + ".xml" 148 | ) 149 | else: 150 | outputFileName = ( 151 | "training/test_data/" 152 | + re.sub(r"\W+", "_", re.sub(r".*/", "", addr_file)) 153 | + ".xml" 154 | ) 155 | 156 | tag_list = [ 157 | None, 158 | "AddressNumber", 159 | "USPSBox", 160 | "StreetName", 161 | "StreetNamePostType", 162 | "PlaceName", 163 | "StateName", 164 | "ZipCode", 165 | "suffix", 166 | ] 167 | addr_list = etree.Element("AddressCollection") 168 | addr = etree.Element("AddressString") 169 | for line in lines: 170 | if line == "\n": # add addr to list & reset addr 171 | addr[-1].tail = None 172 | addr_list.append(addr) 173 | addr = etree.Element("AddressString") 174 | else: 175 | split = line.split(" |") 176 | addr_line = split[0] 177 | addr_tokens = addr_line.split() 178 | token_num = int(split[1].rstrip()) 179 | token_tag = tag_list[token_num] 180 | for token in addr_tokens: 181 | token_xml = etree.Element(token_tag) 182 | token_xml.text = token 183 | token_xml.tail = " " 184 | addr.append(token_xml) 185 | 186 | output = etree.tostring(addr_list, pretty_print=True) 187 | with open(outputFileName, "w") as f: 188 | f.write(output) 189 | 190 | 191 | if __name__ == "__main__": 192 | osmSyntheticToTraining("training/data/osm_data.xml") 193 | # trainFileFromLines('training/data/us50.train.tagged') 194 | # trainFileFromLines('training/data/us50.test.tagged', False) 195 | -------------------------------------------------------------------------------- /parse_scripts/parse_openaddress.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from lxml import etree 4 | 5 | from usaddress import tokenize 6 | 7 | 8 | def json2trainingxml(infile, outfile, tagmapping): 9 | with open(infile) as f: 10 | data = json.load(f) 11 | addr_list = json2addrlist(data, tagmapping) 12 | list2xml(addr_list, outfile) 13 | 14 | 15 | def json2addrlist(data, tagmapping): 16 | addr_list = [] 17 | for raw_addr in data["features"]: 18 | 
addr = [] 19 | for tagset in tagmapping: 20 | if tagset[1]: 21 | addr.append([tagset[0], raw_addr["properties"][tagset[2]]]) 22 | else: 23 | addr.append([tagset[0], tagset[2]]) 24 | addr_list.append(addr) 25 | return addr_list 26 | 27 | 28 | def list2xml(addr_list, outfile): 29 | xml_addr_list = etree.Element("AddressCollection") 30 | for addr in addr_list: 31 | xml_addr = etree.Element("AddressString") 32 | # handle commas? 33 | for component in addr: 34 | if component[1]: 35 | for token in tokenize(component[1]): 36 | token_xml = etree.Element(component[0]) 37 | token_xml.text = token 38 | token_xml.tail = " " 39 | xml_addr.append(token_xml) 40 | xml_addr[-1].tail = None 41 | xml_addr_list.append(xml_addr) 42 | 43 | output = etree.tostring(xml_addr_list, pretty_print=True) 44 | with open(outfile, "w") as f: 45 | f.write(output) 46 | 47 | 48 | # this determines the ordering of training xml tags, & the mapping of address 49 | # strings xml address tag, whether raw data has this tag, corresponding json 50 | # tag in raw data or predetermined value 51 | tag_mapping = [ 52 | ["AddressNumber", True, "HOUSENO"], 53 | ["StreetNamePreDirectional", True, "PREDIR"], 54 | ["StreetNamePreType", True, "PRETYPE"], 55 | ["StreetName", True, "NAME"], 56 | ["StreetNamePostType", True, "SUFTYPE"], 57 | ["StreetNamePostDirectional", True, "SUFDIR"], 58 | ["OccupancyType", True, "UNITTYPE"], 59 | ["OccupancyIdentifier", True, "UNITNO"], 60 | ["PlaceName", True, "CITY"], 61 | ["StateName", False, "IA"], 62 | ["ZipCode", True, "ZIP"], 63 | ] 64 | 65 | 66 | infile = "../data/openaddresses/us-ia-linn.json" 67 | outfile = "../training_data/openaddress_us_ia_linn.xml" 68 | 69 | json2trainingxml(infile, outfile, tag_mapping) 70 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "usaddress" 3 | version = "0.5.14" 4 | description = "Parse 
US addresses using conditional random fields" 5 | readme = "README.md" 6 | license = {text = "MIT License", url = "http://www.opensource.org/licenses/mit-license.php"} 7 | requires-python = ">=3.9" 8 | dependencies = [ 9 | "python-crfsuite>=0.7", 10 | "probableparsing" 11 | ] 12 | classifiers = [ 13 | "Development Status :: 3 - Alpha", 14 | "Intended Audience :: Developers", 15 | "Intended Audience :: Science/Research", 16 | "License :: OSI Approved :: MIT License", 17 | "Natural Language :: English", 18 | "Operating System :: MacOS :: MacOS X", 19 | "Operating System :: Microsoft :: Windows", 20 | "Operating System :: POSIX", 21 | "Topic :: Software Development :: Libraries :: Python Modules", 22 | "Topic :: Scientific/Engineering", 23 | "Topic :: Scientific/Engineering :: Information Analysis", 24 | ] 25 | 26 | [project.urls] 27 | Homepage = "https://github.com/datamade/usaddress" 28 | 29 | [project.optional-dependencies] 30 | dev = ["pytest", 31 | "black", 32 | "isort", 33 | "mypy", 34 | "flake8", 35 | "parserator" 36 | ] 37 | 38 | [build-system] 39 | requires = ["setuptools>=42", "wheel", "parserator", "probableparsing"] 40 | build-backend = "setuptools.build_meta" 41 | 42 | 43 | [tool.setuptools.packages.find] 44 | include = ["usaddress"] 45 | 46 | 47 | [tool.setuptools.package-data] 48 | usaddress = ['usaddr.crfsuite'] 49 | 50 | 51 | [tool.pytest.ini_options] 52 | addopts = [ 53 | "--import-mode=importlib", 54 | ] 55 | testpaths = [ 56 | "tests", 57 | ] 58 | 59 | [tool.mypy] 60 | files = ["usaddress"] 61 | show_error_codes = true 62 | ignore_missing_imports = true 63 | check_untyped_defs = true 64 | implicit_reexport = false 65 | 66 | [tool.isort] 67 | profile = "black" 68 | src_paths = ["usaddress", "tests"] 69 | -------------------------------------------------------------------------------- /raw/LICENSE.md: -------------------------------------------------------------------------------- 1 | The files us50.test.raw, us50.test.tagged, us50.train.raw 
us50.train.tagged are 2 | Copyright (c) <2004> 3 | All rights reserved. 4 | 5 | Developed by: 6 | 7 | 8 | 9 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal with the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 10 | 11 | * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimers. 12 | * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimers in the documentation and/or other materials provided with the distribution. 13 | * Neither the names of , nor the names of its contributors may be used to endorse or promote products derived from this Software without specific prior written permission. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE. 16 | 17 | [This is an instance of the University of Illinois/NCSA Open Source 18 | agreement, obtained from http://www.opensource.org/licenses/UoI-NCSA.php] 19 | 20 | See the file third-party-license.txt for license information of the third 21 | party software used in this package. 
22 | 23 | test.py is 24 | Copyright (c) 2012-2013 Mike Jensen 25 | 26 | Permission is hereby granted, free of charge, to any person obtaining a copy 27 | of this software and associated documentation files (the "Software"), to deal 28 | in the Software without restriction, including without limitation the rights 29 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 30 | copies of the Software, and to permit persons to whom the Software is 31 | furnished to do so, subject to the following conditions: 32 | 33 | The above copyright notice and this permission notice shall be included in 34 | all copies or substantial portions of the Software. 35 | 36 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 37 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 38 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 39 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 40 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 41 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 42 | THE SOFTWARE. 
43 | -------------------------------------------------------------------------------- /raw/us50.test.raw: -------------------------------------------------------------------------------- 1 | Soldotna, AK 99669 2 | 9112 Mendenhall Mall Road, Juneau, AK 99801 3 | Mile K Beach Road # 1, Kenai, AK 99611 4 | Mile K Beach Road # 1, Kenai, AK 99611 5 | Mi K Beach Road # 2, Kenai, AK 99611 6 | Sand Point, AK 99661 7 | Ridgecrest Drive, Bethel, AK 99559 8 | Bethel, AK 99559 9 | Box # 63, Cordova, AK 99574 10 | Cordova, AK 99574 11 | 32-233 M Street, Elmendorf Afb, AK 99506 12 | 2932 Boniface Parkway, Anchorage, AK 99504 13 | 34115 Sterling Highway, Anchor Point, AK 99556 14 | Anchor Point, AK 99556 15 | 2101 1st Avenue North, Pell City, AL 35125 16 | 8483 Al Highway 157, Moulton, AL 35650 17 | 64 South Water Street, Mobile, AL 36602 18 | 30653 Highway 278, Addison, AL 35540 19 | 325 Copeland Street, Eufaula, AL 36027 20 | 1405 Sunset Drive, Guntersville, AL 35976 21 | 131 Cove Avenue, Gulf Shores, AL 36542 22 | 1014 Market Street, Greensboro, AL 36744 23 | Dthn Arprt Trmnl, Midland City, AL 36350 24 | 1720 North Eufaula Avenue, Eufaula, AL 36027 25 | 88175 US Highway 278, Altoona, AL 35952 26 | 1361 Federal Drive, Montgomery, AL 36107 27 | 1400 Lloyd Street, Montgomery, AL 36107 28 | 120 Oxmoor Boulevard, Birmingham, AL 35209 29 | 7104 Highway 62 West, Gassville, AR 72635 30 | 2112 Linwood Drive, Paragould, AR 72450 31 | 408 North Walnut Street, Searcy, AR 72143 32 | 3209 Northeast 11th Street, Bentonville, AR 72712 33 | Perryville, AR 72126 34 | 302 Veneer Street, Malvern, AR 72104 35 | Highway 71 B, Fayetteville, AR 72701 36 | Springdale, AR 72762 37 | Springdale, AR 72762 38 | 198 South Poplar Street, Marianna, AR 72360 39 | 703 East Race Avenue, Searcy, AR 72143 40 | Lawson, Urbana, AR 71768 41 | 3000 Kavanaugh Boulevard, Little Rock, AR 72205 42 | 406 North Highway 71 Business, Lowell, AR 72745 43 | 1445 West Southern Avenue, Mesa, AZ 85202 44 | 1690 East Ash Street, 
Globe, AZ 85501 45 | 910 South Beeline Highway, Payson, AZ 85541 46 | 6500 Highway 179, Sedona, AZ 86351 47 | Southgate Mall, Yuma, AZ 85364 48 | 1202 A Avenue, Douglas, AZ 85607 49 | 2890 East Andy Devine Avenue, Kingman, AZ 86401 50 | 11291 East Viaduct Linda, Scottsdale, AZ 85259 51 | Elfrida, AZ 85610 52 | 300 West Apache Trail, Apache Junction, AZ 85220 53 | 5235 North Highway 64, Williams, AZ 86046 54 | 1015 North Stone Avenue, Tucson, AZ 85705 55 | 6300 East Golf Links Road, Tucson, AZ 85730 56 | 2616 South Mission Road, Tucson, AZ 85713 57 | 8501 West Bowles Avenue, Littleton, CO 80123 58 | 2700 Colorado Boulevard, Idaho Springs, CO 80452 59 | 926 Main Street, Longmont, CO 80501 60 | 680 Oak Avenue, Eaton, CO 80615 61 | 1606 West Eisenhower Boulevard, Loveland, CO 80537 62 | 525 North Avenue, Grand Junction, CO 81501 63 | 2255 Sheridan Boulevard, Denver, CO 80214 64 | 1001 West New York Avenue, Gunnison, CO 81230 65 | 132 West 6th Street, Glenwood Springs, CO 81601 66 | 321 South Denver Avenue, Fort Lupton, CO 80621 67 | 390 South McCaslin Boulevard, Louisville, CO 80027 68 | 1298 North College Avenue, Fort Collins, CO 80524 69 | 771 South Santa Fe Avenue, Pueblo, CO 81006 70 | 100 Greyrock Place, Stamford, CT 06901 71 | Meriden Square, Meriden, CT 06450 72 | 228 Main Street, Winsted, CT 06098 73 | 287 New Britain Avenue, Hartford, CT 06106 74 | 316 Main Street, Cromwell, CT 06416 75 | 621 Washington Avenue, Bridgeport, CT 06604 76 | 377 South Center Street, Windsor Locks, CT 06096 77 | 20 West Pine Way, Plainville, CT 06062 78 | 360 Watertown Road, Thomaston, CT 06787 79 | 53 Quinnipiac Avenue, North Haven, CT 06473 80 | 26 Killingworth Road, Higganum, CT 06441 81 | 942 Main Street, Hartford, CT 06103 82 | West Route Box West # 4, Goshen, CT 06756 83 | 907 Boston Post Road, Old Saybrook, CT 06475 84 | 222 7th Street Southeast, Washington, DC 20003 85 | 3320 M Street Northwest, Washington, DC 20007 86 | 1517 Connecticut Avenue Northwest, Washington, DC 
20036 87 | 4115 Wisconsin Avenue Northwest, Washington, DC 20016 88 | 613 K Street Northwest, Washington, DC 20001 89 | 2014 P Street Northwest, Washington, DC 20036 90 | 1606 7th Street Northwest, Washington, DC 20001 91 | 1063 31st Street Northwest, Washington, DC 20007 92 | 3500 Connecticut Avenue Northwest, Washington, DC 20008 93 | 1654 Columbia Road Northwest, Washington, DC 20009 94 | 4615 Wisconsin Avenue Northwest, Washington, DC 20016 95 | Union Station, Washington, DC 20001 96 | 499 South Capitol Street Southwest, Washington, DC 20003 97 | 5532 Connecticut Avenue Northwest, Washington, DC 20015 98 | 1720 West Gilpin Drive, Wilmington, DE 19805 99 | 2222 Silverside Road, Wilmington, DE 19810 100 | 213 Anglers Road, Lewes, DE 19958 101 | 500 Greenhill Avenue, Wilmington, DE 19805 102 | 12 East Lea Boulevard, Wilmington, DE 19802 103 | 1270 Ocean Outlt, Rehoboth Beach, DE 19971 104 | 791 North Dual Highway, Seaford, DE 19973 105 | 909 North Dupont Highway, Dover, DE 19901 106 | 900 Churchmans Road, Newark, DE 19713 107 | 2354 Glasgow Avenue, Newark, DE 19702 108 | 134 North Dupont Highway, New Castle, DE 19720 109 | 4000 Concord Pike, Wilmington, DE 19803 110 | 2719 Philadelphia Pike, Claymont, DE 19703 111 | 3211 Kirkwood Highway, Wilmington, DE 19808 112 | 8751 Himes Avenue North, Tampa, FL 33614 113 | 13690 Stoneybrook Drive, Clearwater, FL 33762 114 | 1733 Tamiami Trail South, Venice, FL 34293 115 | 1812 South Osprey Avenue, Sarasota, FL 34239 116 | 6600 Gulf Drive, Bradenton Beach, FL 34217 117 | 1287 1st Street, Sarasota, FL 34236 118 | 5401 West Kennedy Boulevard, Tampa, FL 33609 119 | 301 Gulf Boulevard, Belleair Beach, FL 33786 120 | 1601 Englewood Road Route 776, Englewood, FL 34223 121 | 4023 Water Avenue West, Tampa, FL 33614 122 | 285 West Dearborn, Englewood, FL 34223 123 | 1330 9th Avenue, Tampa, FL 33605 124 | 844 6th Street Northwest, Winter Haven, FL 33881 125 | 1411 49th Street South, Saint Petersburg, FL 33707 126 | Highway 80, 
Allentown, GA 31003 127 | 108 South Park Avenue, Calhoun, GA 30701 128 | 122 Cedar Street, Arlington, GA 31713 129 | 110 1st Street Southwest, Moultrie, GA 31768 130 | 2567 Main Street West, Snellville, GA 30078 131 | Pineville Road, Buena Vista, GA 31803 132 | 2601 Dawson Road, Albany, GA 31707 133 | Northlake Mall, Tucker, GA 30084 134 | 1000 North Point Circle, Alpharetta, GA 30022 135 | 419 Spaulding Road, Montezuma, GA 31063 136 | 615 Newton Road, Camilla, GA 31730 137 | 932 McDaniel Street Southwest, Atlanta, GA 30310 138 | Buford Street, Shellman, GA 31786 139 | 614 North Hutchinson Avenue, Adel, GA 31620 140 | 1750 Kalakaua Avenue Fl 3rd, Honolulu, HI 96826 141 | 1215 South Kihei Road Unit F, Kihei, HI 96753 142 | 111 Banyan Drive, Hilo, HI 96720 143 | 217 Waianuenue Avenue, Hilo, HI 96720 144 | 1050 Ala Moana Boulevard, Honolulu, HI 96813 145 | 2239 South King Street, Honolulu, HI 96826 146 | 2463 South Kihei Road, Kihei, HI 96753 147 | 935 California Avenue, Wahiawa, HI 96786 148 | 94-210 Leokane, Suite 105, Waipahu, HI 96797 149 | 2500 Kalakaua Avenue Unit A, Honolulu, HI 96815 150 | 1314 North King Street, Honolulu, HI 96817 151 | 120 Hekili, Kailua, HI 96734 152 | 1279 South Kihei Road Suite 122, Kihei, HI 96753 153 | 1121 Nuuanu Avenue Suite 105, Honolulu, HI 96817 154 | Highway 34 East, Albia, IA 52531 155 | Valley West Mall, West Des Moines, IA 50266 156 | 2801 Main Street, Keokuk, IA 52632 157 | 1804 Highway Boulevard, Spencer, IA 51301 158 | 1106 Highway 69 North, Forest City, IA 50436 159 | 3070 Highway 22, Riverside, IA 52327 160 | 9125 Atlantic Drive Southwest, Cedar Rapids, IA 52404 161 | 101 West Taylor Street, Creston, IA 50801 162 | 1302 US Highway 71 North, Carroll, IA 51401 163 | 115 Main Street, Ames, IA 50010 164 | 1905 Cottage Grove Avenue, Des Moines, IA 50314 165 | 127 East 18th Street, Cedar Falls, IA 50613 166 | 812 Illinois Street, Sidney, IA 51652 167 | 215 East Walnut Street, Des Moines, IA 50309 168 | 978 East Main Street, 
Burley, ID 83318 169 | 348 Addison Avenue West, Twin Falls, ID 83301 170 | 295 South 250 East, Burley, ID 83318 171 | 423 South Oneida Street, Rupert, ID 83350 172 | 123 West 5th North, Burley, ID 83318 173 | 317 Happy Day Boulevard, Caldwell, ID 83607 174 | 524 Airport Loop, Twin Falls, ID 83301 175 | 111 Broadway Avenue, Boise, ID 83702 176 | 6536 Main Street, Bonners Ferry, ID 83805 177 | Highway 77, Albion, ID 83311 178 | 220 6th Street, Wallace, ID 83873 179 | 150 1st Street, Idaho Falls, ID 83401 180 | 160 West Main Street, Oakley, ID 83346 181 | Almo, ID 83312 182 | 2256 Northbrook Court, Northbrook, IL 60062 183 | 1060 Spring Hill Mall, Dundee, IL 60118 184 | 6 West South Water Market, Chicago, IL 60608 185 | 14510 South Indiana Avenue, Riverdale, IL 60827 186 | 719 West Elm Street, Hoopeston, IL 60942 187 | 1612 North Sedgwick Street, Chicago, IL 60614 188 | 3359 West 115th Street, Alsip, IL 60803 189 | 425 West 115th Street, Chicago, IL 60628 190 | 4730 West 103rd Street, Oak Lawn, IL 60453 191 | 6943 Roosevelt Road, Berwyn, IL 60402 192 | 1330 West Madison Street, Chicago, IL 60607 193 | 4128 14th Avenue, Rock Island, IL 61201 194 | 829 Moen Avenue, Joliet, IL 60436 195 | 8922 North 1/2 Greenwood Avenue, Niles, IL 60714 196 | 40 East Main Street, Hagerstown, IN 47346 197 | 313-317 Broadway, Madison, IN 47250 198 | 2371 North 26th Street, Lafayette, IN 47904 199 | 10 West Market Street, Indianapolis, IN 46204 200 | 3906 North Indianapolis Road, Columbus, IN 47201 201 | 1100 West 11th Street, Bloomington, IN 47404 202 | 12510 North Meridian Street, Carmel, IN 46032 203 | 1753 East 12th Street, Mishawaka, IN 46544 204 | 1900 South Mock Avenue, Muncie, IN 47302 205 | 1800 East Market Street, Logansport, IN 46947 206 | 1621 East 10th Street, Jeffersonville, IN 47130 207 | 239 West Monroe Street, Decatur, IN 46733 208 | US 6 Ind 15, Milford, IN 46542 209 | 1000 Indiana Highway 212, Michigan City, IN 46360 210 | 324 West Washington Street, Oskaloosa, KS 66066 
211 | 8 East 6th Street, Lawrence, KS 66044 212 | 1044 West 29th Street North, Wichita, KS 67204 213 | 8533 Southwest 21st Street, Topeka, KS 66615 214 | 6249 East 21st Street North, Wichita, KS 67208 215 | 707 North Main Street, Newton, KS 67114 216 | 1135 College Drive, Garden City, KS 67846 217 | 2615 1/2 Gary Avenue, Dodge City, KS 67801 218 | 103 West 5th Street, Admire, KS 66830 219 | 13964 Santa Fe Trail Drive, Shawnee Mission, KS 66215 220 | 11120 Antioch Road, Shawnee Mission, KS 66210 221 | 311 West Holme Street, Norton, KS 67654 222 | 3002 East Central Avenue, Wichita, KS 67214 223 | Municipal Airport, Hutchinson, KS 67501 224 | 211 Clover Lane, Louisville, KY 40207 225 | Crsrds Highway 139 A, Cadiz, KY 42211 226 | 440 East Main Street, Bowling Green, KY 42101 227 | 600 West 9th Street, Russellville, KY 42276 228 | Alvy Prk And Hghwy # 54, Owensboro, KY 42301 229 | 433 Johnson Street, Covington, KY 41011 230 | 610 West Magnolia Avenue, Louisville, KY 40208 231 | 445 Highway 44 East, Shepherdsville, KY 40165 232 | 134 Spout Springs Road, Irvine, KY 40336 233 | 820 Madison Avenue, Covington, KY 41011 234 | 9464 Brownsboro Road, Louisville, KY 40241 235 | 41 East Washington Street, Sebree, KY 42455 236 | 410 West Vine Street, Lexington, KY 40507 237 | 500 Winchester Avenue, Ashland, KY 41101 238 | 1587 Crowley Rayne Highway, Crowley, LA 70526 239 | 101 Cheyenne Drive, Scott, LA 70583 240 | 1003 Hugh Wallis Road South, Lafayette, LA 70508 241 | 724 Iberville Street, New Orleans, LA 70130 242 | 724 Iberville Street, New Orleans, LA 70130 243 | 305 Constitution Drive, West Monroe, LA 71292 244 | 8523 Highway 23, Belle Chasse, LA 70037 245 | 8523 Highway 23, Belle Chasse, LA 70037 246 | 611 Frenchmen Street, New Orleans, LA 70116 247 | New Iberia, LA 70560 248 | 401 Highway 1 North, Oil City, LA 71061 249 | 4396 Natchitoches Highway, Many, LA 71449 250 | 3810 East Texas Street, Bossier City, LA 71111 251 | 6764 Airline Highway, Baton Rouge, LA 70805 252 | 1665 
Beacon Street, Brookline, MA 02445 253 | 212 Stuart Street, Boston, MA 02116 254 | 529 King Street, Littleton, MA 01460 255 | 72 Bigelow Avenue, Watertown, MA 02472 256 | 234 John Wise Avenue, Essex, MA 01929 257 | 1655 Lakeview Avenue, Dracut, MA 01826 258 | 30 Boltwood Avenue, Amherst, MA 01002 259 | 45 School Street, Boston, MA 02108 260 | 2465 Massachusetts Avenue, Cambridge, MA 02140 261 | 313 Littleton Road, Westford, MA 01886 262 | 472 Massachusetts Avenue, Cambridge, MA 02139 263 | 752 Plain Street, Marshfield, MA 02050 264 | 259 Union Street, Holbrook, MA 02343 265 | 17503 Redland Road, Derwood, MD 20855 266 | 996 West Patrick Street, Frederick, MD 21703 267 | 6339 Allentown Road, Temple Hills, MD 20748 268 | 190 Thomas Jefferson Drive, Frederick, MD 21702 269 | 58 South Potomac Street, Hagerstown, MD 21740 270 | Pikesville, MD 21208 271 | Drum Point Road, Deale, MD 20751 272 | 1678 Village Green, Crofton, MD 21114 273 | 2700 Turf Valley Road, Ellicott City, MD 21042 274 | 1011 Baltimore Boulevard, Westminster, MD 21157 275 | 580 Ritchie Highway, Severna Park, MD 21146 276 | 106 Reisterstown Road, Pikesville, MD 21208 277 | Kennebunkport, ME 04046 278 | 55 Western Avenue, Kennebunk, ME 04043 279 | Main, Mars Hill, ME 04758 280 | Route Box # 201, Bingham, ME 04920 281 | 21 North Main Street, Rockland, ME 04841 282 | 71 India Street, Portland, ME 04101 283 | 30 Fair Street, Norway, ME 04268 284 | 469 Main Street, Saco, ME 04072 285 | Midtown Mall, Sanford, ME 04073 286 | 458 Center Street, Auburn, ME 04210 287 | 148 Pleasant Street, Brunswick, ME 04011 288 | 9 Dana Street, Portland, ME 04101 289 | Anchor Inn Road, Round Pond, ME 04564 290 | 12 Bridge Street, Boothbay Harbor, ME 04538 291 | 9793 Telegraph Road, Erie, MI 48133 292 | 621 East Michigan Avenue, Lansing, MI 48912 293 | 3555 68th Street Southeast, Caledonia, MI 49316 294 | 84 East 8th Street, Holland, MI 49423 295 | 4346 South Division Avenue, Grand Rapids, MI 49548 296 | 19214 Joy Road, Detroit, 
MI 48228 297 | 9405 West Fort Street, Detroit, MI 48209 298 | 2229 18 Mile Road, Sterling Heights, MI 48314 299 | 510 North Telegraph Road, Pontiac, MI 48341 300 | 12200 Market Place Drive, Birch Run, MI 48415 301 | 18440 West Warren Avenue, Detroit, MI 48228 302 | 304 North State Street, Big Rapids, MI 49307 303 | 301 Lafayette Avenue, Bay City, MI 48708 304 | 200 South Cherry Street, Flushing, MI 48433 305 | 510 Groveland Avenue, Minneapolis, MN 55403 306 | Highway 59 North, Detroit Lakes, MN 56501 307 | 1364 West 71 Street, Saint Paul, MN 55102 308 | 1285 Highway 7 East, Hutchinson, MN 55350 309 | Minneapolis, MN 55401 310 | 3131 West 3rd Street, Duluth, MN 55806 311 | 1200 12th Street Southwest, Rochester, MN 55902 312 | 103 Main Street South, Aurora, MN 55705 313 | 17280 Highway 23 Northeast, New London, MN 56273 314 | Hwys 16 And 63 North, Spring Valley, MN 55975 315 | 910 4th Street Northwest, Austin, MN 55912 316 | 1529 Robert Street South, Saint Paul, MN 55118 317 | Highway 28 And 55, Glenwood, MN 56334 318 | 510 1st Street West, Fosston, MN 56542 319 | 510 Southwest 3rd Street, Lees Summit, MO 64063 320 | Junction Highway 76 37 & 86, Cassville, MO 65625 321 | 803 Franklin Avenue, Washington, MO 63090 322 | 405 West Main Street, Branson, MO 65616 323 | Tipton, MO 65081 324 | 318 East 51st Street, Kansas City, MO 64112 325 | 609 West Newton Street, Versailles, MO 65084 326 | Hghwy 54 And East Hckry, Nevada, MO 64772 327 | 2003 Southridge Drive, Jefferson City, MO 65109 328 | 303 Northeast Englewood Road, Kansas City, MO 64118 329 | 5801 North Highway 763, Columbia, MO 65202 330 | Highway 63 South, Edgar Springs, MO 65462 331 | 17315 Highway 87, Boonville, MO 65233 332 | 19 Charlestowne Plaza, Saint Charles, MO 63303 333 | 789 Vieux Marche Mall, Biloxi, MS 39530 334 | 130 Airport Cafe Circle, Lucedale, MS 39452 335 | Glfprt Blx Rgnl Arpr, Gulfport, MS 39501 336 | 118 Courthouse Square, Oxford, MS 38655 337 | 2039 Highway 82 East, Greenville, MS 38703 338 | 
2028 Beach Boulevard, Biloxi, MS 39531 339 | Highway 27 South, Monticello, MS 39654 340 | 107 East Jackson Street, Belzoni, MS 39038 341 | 646 Highway 48 West, Tylertown, MS 39667 342 | 25357 Highway 330, Oakland, MS 38948 343 | 741 West Broadway Street, Yazoo City, MS 39194 344 | 134 North Front Street, Senatobia, MS 38668 345 | 511 East Waldron Street, Corinth, MS 38834 346 | 3307 Mlk Jr Drive, Jackson, MS 39213 347 | 142 South Pratten, Columbus, MT 59019 348 | US Highway 22, Miles City, MT 59301 349 | 5225 West Broadway Street, Missoula, MT 59808 350 | Kalispell, MT 59901 351 | 419 South Ellery Avenue, Fairview, MT 59221 352 | Sheridan, MT 59749 353 | 22 1st Street East, Kalispell, MT 59901 354 | West Fork Pls Mdw Village, Big Sky, MT 59716 355 | 600 Central Avenue, Great Falls, MT 59401 356 | 101 Main Street, Kalispell, MT 59901 357 | Columbia Falls, MT 59912 358 | 1425 Broadwater Avenue, Billings, MT 59102 359 | 7353 Goddard Drive, Malmstrom A F B, MT 59402 360 | Gldn Egl Lodge Mdw Village, Big Sky, MT 59716 361 | 1 The Square, Lillington, NC 27546 362 | 1205 Shelby Highway, Cherryville, NC 28021 363 | 710 East 10th Street, Roanoke Rapids, NC 27870 364 | 118 South Hancock Street, Rockingham, NC 28379 365 | 121 South Trade Street, Tryon, NC 28782 366 | 1222 North Alston Avenue, Durham, NC 27701 367 | 935 College Street, Oxford, NC 27565 368 | Queen Elizabeth Avenue, Manteo, NC 27954 369 | Tranquil House Inn, Manteo, NC 27954 370 | 123 Landmark Alley, Statesville, NC 28677 371 | 1025 Blue Ridge Road, Raleigh, NC 27607 372 | 443 2nd Avenue Southwest, Hickory, NC 28602 373 | 6500 Aviation Parkway, Morrisville, NC 27560 374 | 200 North Davie Street, Greensboro, NC 27401 375 | 106 5th Avenue Southwest, Valley City, ND 58072 376 | 2302 15th Street Southwest, Minot, ND 58701 377 | 2851 South Columbia Road, Grand Forks, ND 58201 378 | 434 South 3rd Street, Bismarck, ND 58504 379 | 289 15th Street West, Dickinson, ND 58601 380 | 2802 13th Avenue South, Fargo, ND 58103 
381 | 106 26th Street West, Williston, ND 58801 382 | 2400 10th Street Southwest, Minot, ND 58701 383 | 1419 South Columbia Road, Grand Forks, ND 58201 384 | 2815 13th Avenue South, Fargo, ND 58103 385 | 1415 42nd Street South, Fargo, ND 58103 386 | 1117 38th Street North, Fargo, ND 58102 387 | 251 14th Street West, Dickinson, ND 58601 388 | 1801 8th Avenue Southwest, Jamestown, ND 58401 389 | 1824 North 120th Street, Omaha, NE 68154 390 | 220 West 2nd Street, Grand Island, NE 68801 391 | 122 East 16th Street, Schuyler, NE 68661 392 | 1006 Howard Street, Omaha, NE 68102 393 | Municipal Airport, Lincoln, NE 68524 394 | Lee Bird Fld, North Platte, NE 69101 395 | 11036 Elm Street, Omaha, NE 68144 396 | 106 East Front Street, Alda, NE 68810 397 | 8021 Blondo Street, Omaha, NE 68134 398 | Plainview, NE 68769 399 | 10220 Regency Circle, Omaha, NE 68114 400 | 432 East Douglas Street, Oneill, NE 68763 401 | 10405 Calhoun Road, Omaha, NE 68112 402 | 520 South Main, Ohiowa, NE 68416 403 | 90 Front Street, Exeter, NH 03833 404 | 17 Newmarket Road, Durham, NH 03824 405 | 9 Depot Street, Hillsboro, NH 03244 406 | Tecumseh Road, Waterville Valley, NH 03215 407 | Westside Road, North Conway, NH 03860 408 | Lee's Mill Road, Moultonborough, NH 03254 409 | 176 Main Street, Keene, NH 03431 410 | Route 16, North Conway, NH 03860 411 | Woodbury Avenue, Portsmouth, NH 03801 412 | 21 Front Street, Manchester, NH 03102 413 | 837 Brattleboro Road, Hinsdale, NH 03451 414 | 11 Main Street, Bennington, NH 03442 415 | Main, Ashland, NH 03217 416 | 667 Palisade Avenue, Cliffside Park, NJ 07010 417 | 2 Bay Avenue, Highlands, NJ 07732 418 | 7 Grove Street, Wanaque, NJ 07465 419 | 1101 U.S. Highway No. 130 North, Burlington, NJ 08016 420 | Main Street, Medford, NJ 08055 421 | 2011 Route 70 West, Cherry Hill, NJ 08002 422 | 185 3rd Avenue, Long Branch, NJ 07740 423 | 301 Howard Street, Cape May, NJ 08204 424 | 254 U.S. 
Highway No 202, Pluckemin, NJ 07978 425 | State Highway No 33, Hightstown, NJ 08520 426 | 2104 State Highway No 70, Lakehurst, NJ 08733 427 | 118 Berkshire Avenue, Paterson, NJ 07502 428 | 2 Broadway, Somers Point, NJ 08244 429 | 1451 Mechem Drive, Ruidoso, NM 88345 430 | 62 Camino Azul, Jemez Springs, NM 87025 431 | 205 Central Avenue, Tularosa, NM 88352 432 | Los Ojos, NM 87551 433 | 7400 North US Highway 54-70, Tularosa, NM 88352 434 | 12999 Central Avenue Northeast, Albuquerque, NM 87123 435 | 1015 Rio Grande Boulevard Northwest, Albuquerque, NM 87104 436 | 2101 Louisiana Boulevard Northeast, Albuquerque, NM 87110 437 | 500 Marquette Avenue Northwest Suite 1500, Albuquerque, NM 87102 438 | 3297 Cerrillos Road, Santa Fe, NM 87505 439 | Mora, NM 87732 440 | 801 Delaware Avenue, Alamogordo, NM 88310 441 | Box # 209, Glorieta, NM 87535 442 | 1341 State Road 75, Penasco, NM 87553 443 | 515 South Virginia Street, Reno, NV 89501 444 | 442 Flint Street, Reno, NV 89501 445 | 5030 Paradise Road, Las Vegas, NV 89119 446 | 3999 Las Vegas Boulevard South, Las Vegas, NV 89119 447 | 1950 East Greg Street, Sparks, NV 89431 448 | 3763 Las Vegas Boulevard South, Las Vegas, NV 89109 449 | 3100 Needles Highway, Laughlin, NV 89029 450 | 3100 Needles Highway, Laughlin, NV 89029 451 | 3466 South Decatur Boulevard, Las Vegas, NV 89102 452 | 3025 Las Vegas Boulevard South, Las Vegas, NV 89109 453 | 5550 West Charleston Boulevard, Las Vegas, NV 89146 454 | 3400 Las Vegas Boulevard South, Las Vegas, NV 89109 455 | 375 East Harmon Avenue, Las Vegas, NV 89109 456 | 2845 Las Vegas Boulevard South, Las Vegas, NV 89109 457 | 1011 Avn Of Th Amrcs, New York, NY 10018 458 | 361 1st Avenue, New York, NY 10010 459 | 1511 Lexington Avenue, New York, NY 10029 460 | 770 Washington Avenue, Brooklyn, NY 11238 461 | 1007 Clarkson Avenue, Brooklyn, NY 11212 462 | 101 West 23rd Street, New York, NY 10011 463 | 10018 4th Avenue, Brooklyn, NY 11209 464 | 101 East 161st Street, Bronx, NY 10451 465 | 1010 
Northern Boulevard, Great Neck, NY 11021 466 | 103 2nd Avenue, New York, NY 10003 467 | 229 East 9th Street, New York, NY 10003 468 | 725 10th Avenue, New York, NY 10019 469 | 104 West 57th Street, New York, NY 10019 470 | 107 Forest Avenue, Locust Valley, NY 11560 471 | 356 Main Street, Wellsville, OH 43968 472 | 4919 Mount Pleasant Street Northwest, Canton, OH 44720 473 | 4919 Mount Pleasant Street Northwest, Green, OH 44232 474 | 389 South Green Road, Cleveland, OH 44121 475 | 123 North Paint Street, Chillicothe, OH 45601 476 | 4200 Kettering Boulevard, Dayton, OH 45439 477 | 2241 East 42nd Street, Lorain, OH 44055 478 | 33587 Aurora Road, Solon, OH 44139 479 | T703 State Route 66, Archbold, OH 43502 480 | T703 State Route 66, Archbold, OH 43502 481 | 3257 West Siebenthaler Avenue, Dayton, OH 45406 482 | 1130 US Highway 52, Aberdeen, OH 45101 483 | 4016 Glenway Avenue, Cincinnati, OH 45205 484 | 33 East Schtock Road, New Albany, OH 43054 485 | 201 East Main, Binger, OK 73009 486 | 2014 West Gary Boulevard, Clinton, OK 73601 487 | 1002 West Jackson Street, Hugo, OK 74743 488 | 119 East 10, Weleetka, OK 74880 489 | 1254 North Eastern Avenue, Oklahoma City, OK 73160 490 | 2118 West Edmond Road, Edmond, OK 73003 491 | 401 Grant, Agra, OK 74824 492 | 722 Asp Avenue, Norman, OK 73069 493 | 8500 South Elwood, Jenks, OK 74037 494 | 11919 North Pennsylvania Avenue, Oklahoma City, OK 73120 495 | 4734 Southeast 29th Street, Oklahoma City, OK 73115 496 | 1809 South Air Depot Boulevard, Oklahoma City, OK 73110 497 | 2917 South Douglas Boulevard, Oklahoma City, OK 73130 498 | 2748 South Harvard Avenue, Tulsa, OK 74114 499 | 4021 Southwest 117th Avenue Suite C, Beaverton, OR 97005 500 | 1341 Northeast Orenco Station Pakway, Hillsboro, OR 97124 501 | 4 South Main Street, Joseph, OR 97846 502 | 13500 Southwest Pacific Highway, Portland, OR 97223 503 | 16165 Southwest Regatta Lane, Suite 1000, Beaverton, OR 97006 504 | 2628 North Highway 99 West, Mcminnville, OR 97128 505 | 2525 
Southeast Clinton, Portland, OR 97202 506 | 17210 Southwest Shaw Street, Aloha, OR 97007 507 | 15700 Northwest Blue Ridge Street, Beaverton, OR 97006 508 | 17455 Southwest Farmington Road, Aloha, OR 97007 509 | 500 Southeast Butler Road, Gresham, OR 97080 510 | Portland, OR 97210 511 | 1025 Chetco Avenue, Brookings, OR 97415 512 | 4805 Southwest 229th, Aloha, OR 97007 513 | 12 East Lancaster Avenue, Ardmore, PA 19003 514 | 709 Olde Hickory Road, Lancaster, PA 17601 515 | 2300 Route 309, Wilkes Barre, PA 18702 516 | 4034 Easton Avenue, Bethlehem, PA 18020 517 | 422 East State Street, Sharon, PA 16146 518 | 4431 Easton Avenue, Bethlehem, PA 18020 519 | RR 422 Box, Douglassville, PA 19518 520 | 1591 Big Oak Rd, Yardley, PA 19067 521 | Rt 10, Parkesburg, PA 19365 522 | 522 McKean Avenue, Donora, PA 15033 523 | 800 Spring Mill Avenue, Conshohocken, PA 19428 524 | West Business Center, Wayne, PA 19087 525 | 1358 Columbia Ave, Lancaster, PA 17603 526 | 1 Broadway, Newport, RI 02840 527 | Bristol Harbor, Bristol, RI 02809 528 | 11 John Street, Bristol, RI 02809 529 | 28 Market Street, Warren, RI 02885 530 | 99 Fortin Road, Kingston, RI 02881 531 | 1715 Stafford Road, Tiverton, RI 02878 532 | 130 Chapel Street, Harrisville, RI 02830 533 | 577 South Main Street, Providence, RI 02903 534 | 800 Greenwich Avenue, Warwick, RI 02886 535 | 449 West Shore Road, Warwick, RI 02889 536 | 7570 Post Road, North Kingstown, RI 02852 537 | 134 Providence Street, Woonsocket, RI 02895 538 | 80 River Street, Woonsocket, RI 02895 539 | 54 Smithfield Avenue, Pawtucket, RI 02860 540 | 101 Aiken Road, Graniteville, SC 29829 541 | 2701 David H McLeod Boulevard, Florence, SC 29501 542 | 4634 Factory Stores Boulevard, Myrtle Beach, SC 29579 543 | 1600 Marina Road, Irmo, SC 29063 544 | 931 Senate Street, Columbia, SC 29201 545 | 338 King Street, Charleston, SC 29401 546 | 1010 Ocean Boulevard, Isle Of Palms, SC 29451 547 | 961 West Union Road, West Union, SC 29696 548 | 17166 Highway 72 West, 
Waterloo, SC 29384 549 | 2401 Reidville Road, Spartanburg, SC 29301 550 | 12188 Old Number 6 Highway, Eutawville, SC 29048 551 | 1215 Saint Andrews Road, Columbia, SC 29210 552 | 702 North Greenwood Avenue, Ware Shoals, SC 29692 553 | 213 Meeting Street, Charleston, SC 29401 554 | 3800 South Louise Avenue Frnt, Sioux Falls, SD 57106 555 | 2160 North Haines Avenue, Rapid City, SD 57701 556 | 333 West Jackson Boulevard, Spearfish, SD 57783 557 | 5050 North Cliff Avenue, Sioux Falls, SD 57104 558 | 3820 West 41st Street, Sioux Falls, SD 57106 559 | 1308 East 60th Street North, Sioux Falls, SD 57104 560 | 915 8th Avenue Northwest, Aberdeen, SD 57401 561 | 535 Mountain View Road, Rapid City, SD 57702 562 | Rushmore Mall, Rapid City, SD 57701 563 | 1500 South Burr Street, Mitchell, SD 57301 564 | The Empire, Sioux Falls, SD 57103 565 | 2729 East 10th Street, Sioux Falls, SD 57103 566 | 1620 Cambell Street, Rapid City, SD 57701 567 | 111 Iowa Street, Alcester, SD 57001 568 | 835 Foothills Mall Drive, Maryville, TN 37801 569 | 929 Dolly Parton Parkway, Sevierville, TN 37862 570 | 1824 East Stone Drive, Kingsport, TN 37660 571 | 4014 Hillsboro Circle, Nashville, TN 37215 572 | 3100 North Roan Street, Johnson City, TN 37601 573 | 1001 Kingston Street, Lenoir City, TN 37771 574 | 1718 Madison Avenue, Memphis, TN 38104 575 | 1021 Jackson Avenue, Memphis, TN 38107 576 | 7409 Middlebrook Pike, Knoxville, TN 37909 577 | 2204 Whitten Road, Memphis, TN 38133 578 | 209 Main Street, Petros, TN 37845 579 | 3445 Poplar Avenue, Memphis, TN 38111 580 | 3445 Poplar Avenue, Memphis, TN 38111 581 | 24 Front Street, Big Sandy, TN 38221 582 | 304 North Llano, Fredericksburg, TX 78624 583 | 1328 West McDermott Suite 200, Allen, TX 75013 584 | 820 East IH35 South Suite A, Denton, TX 76205 585 | 220 East Las Colinas Boulevard, Irving, TX 75039 586 | 2969 West 15th Street, Plano, TX 75075 587 | 4501 Travis Street, Dallas, TX 75205 588 | 807 South Central Expressway, Richardson, TX 75080 589 | 
3068 Forest Lane, Dallas, TX 75234 590 | 2029 East Highway 356 (Irving Boulevard), Irving, TX 75038 591 | 3948 Legacy Drive, Plano, TX 75023 592 | 120 South Main, Grapevine, TX 76051 593 | 2001 Coit Road, Plano, TX 75075 594 | 2300 Cross Timbers, Flower Mound, TX 75028 595 | 1045 Hidden Ridge, Irving, TX 75038 596 | 4140 West 5415 South, Salt Lake City, UT 84118 597 | 1500 Kearns Boulevard, Park City, UT 84060 598 | 164 South 100 West, Cedar City, UT 84720 599 | Sandy, UT 84092 600 | 239 South 500 East, Salt Lake City, UT 84102 601 | 3513 Riverdale Road, Ogden, UT 84405 602 | 2901 East 3300 South, Salt Lake City, UT 84109 603 | 1754 West 7800 South, West Jordan, UT 84088 604 | 3765 West 5400 South, Kearns, UT 84118 605 | 4931 South State Street, Salt Lake City, UT 84107 606 | 1168 32nd Street, Ogden, UT 84403 607 | 1499 South State Street, Salt Lake City, UT 84115 608 | 1516 North Freedom Boulevard, Provo, UT 84604 609 | 6065 Jefferson Avenue, Newport News, VA 23605 610 | 232 South Armistead Avenue, Hampton, VA 23669 611 | 6723 Richmond Highway, Alexandria, VA 22306 612 | 4300 Colley Avenue, Norfolk, VA 23508 613 | 2887 Poindexter Road, Trevilians, VA 23170 614 | 422 William Street, Fredericksburg, VA 22401 615 | 1402 Richmond Road, Williamsburg, VA 23185 616 | 516 South Van Dorn Street, Alexandria, VA 22304 617 | 7279 Arlington Boulevard, Falls Church, VA 22042 618 | 8701 Midlothian Turnpike, Richmond, VA 23235 619 | 2926 Franklin Road Southwest, Roanoke, VA 24000 620 | 2500 Riverside Drive, Danville, VA 24540 621 | 116 S. 
Independence Blvd., Virginia Beach, VA 23462 622 | Route Box # 100, West Dover, VT 05356 623 | 15 Barre Street Suite 1, Montpelier, VT 05602 624 | Route Box # 15, Cambridge, VT 05444 625 | 18 Lake Street, Saint Albans, VT 05478 626 | 4 Fairground Road, Brattleboro, VT 05301 627 | 50 Railroad Street, Montpelier, VT 05602 628 | 20 Susie Wilson Road, Essex Junction, VT 05452 629 | South Route Box South # 7, Bennington, VT 05201 630 | 225 Woodstock Avenue, Rutland, VT 05701 631 | Okemo Market Place, Ludlow, VT 05149 632 | Route Box # 65, Brookfield, VT 05036 633 | Route 7 RR 7 Box, Arlington, VT 05250 634 | 3904 Route # A, Arlington, VT 05250 635 | Route 313 RR 313 Box, Arlington, VT 05250 636 | 31595 SR 20, Oak Harbor, WA 98277 637 | 5694 Third Avenue, Ferndale, WA 98248 638 | 20221 Aurora Avenue North, Seattle, WA 98133 639 | 630 Southeast Pioneer Way, Oak Harbor, WA 98277 640 | 6251 Northeast Bothell Way, Kenmore, WA 98028 641 | 1427 228th Street Suite 82, Bothell, WA 98021 642 | 1351 Southwest Barlow Street, Oak Harbor, WA 98277 643 | 21210 44th Avenue West, Mountlake Terrace, WA 98043 644 | 225 Highway 150, Chelan, WA 98816 645 | 15740 Aurora Avenue North, Seattle, WA 98133 646 | 1114 First Street, Snohomish, WA 98290 647 | 546 5th Avenue South, Edmonds, WA 98020 648 | 12025 Highway 99, Everett, WA 98204 649 | 4520 200th Street Southwest, Lynnwood, WA 98036 650 | 231 East Towne Mall, Madison, WI 53704 651 | 108 West Towne Mall, Madison, WI 53719 652 | 2500 North Mayfair Road, Milwaukee, WI 53226 653 | 4301 West Wisconsin Avenue, Appleton, WI 54913 654 | 2401 South Oneida Street, Green Bay, WI 54304 655 | 325 Central Avenue, Owen, WI 54460 656 | 6189 US Highway 18 And 15, Dodgeville, WI 53533 657 | 2559 South Howell Avenue, Milwaukee, WI 53207 658 | 9067 State Highway 70 West, Saint Germain, WI 54558 659 | Wausau, WI 54403 660 | 635 South Main Street, Shawano, WI 54166 661 | 3201 East Main Street, Merrill, WI 54452 662 | 7441 East County Road Y, Gordon, WI 54838 
663 | 102 Old Abe Road, Lac Du Flambeau, WI 54538 664 | 1193 Pineview Drive, Morgantown, WV 26505 665 | 345 High Street, Morgantown, WV 26505 666 | 512 West Main Street, West Union, WV 26456 667 | 78 Main Street, Shinnston, WV 26431 668 | 256 Carolina Avenue, Chester, WV 26034 669 | Keyser, WV 26726 670 | 50 Carmel Road, Wheeling, WV 26003 671 | Terra Alta, WV 26764 672 | 102 Forever Green Drive, Falling Waters, WV 25419 673 | Fairmont, WV 26554 674 | Cortland Road, Davis, WV 26260 675 | 205 West Maple Avenue, Fayetteville, WV 25840 676 | 120 East German, Shepherdstown, WV 25443 677 | 669 Main Street, Follansbee, WV 26037 678 | 1401 Dell Range Boulevard, Cheyenne, WY 82009 679 | 3209 East Grand Avenue, Laramie, WY 82070 680 | 128 West Valley Road, Torrington, WY 82240 681 | 1777 Coffeen Avenue, Sheridan, WY 82801 682 | 2720 South Douglas Highway, Gillette, WY 82718 683 | 1140 West Main Street, Riverton, WY 82501 684 | 91 16th Street, Wheatland, WY 82201 685 | 310 East Lincolnway, Cheyenne, WY 82001 686 | 1500 Big Horn Avenue, Worland, WY 82401 687 | 1503 Sheridan Avenue, Cody, WY 82414 688 | 2148 East Grand Avenue, Laramie, WY 82070 689 | 355 North Russell Avenue, Douglas, WY 82633 690 | 1899 Dewar Drive, Rock Springs, WY 82901 691 | -------------------------------------------------------------------------------- /raw/us50.train.raw: -------------------------------------------------------------------------------- 1 | Homer Spit Road, Homer, AK 99603 2 | Lnlck Shopping Center, Anniston, AL 36201 3 | Center Ridge, AR 72027 4 | 9878 North Metro Parkway East, Phoenix, AZ 85051 5 | 2896 Fairfax Street, Denver, CO 80207 6 | Mesa Mall, Grand Junction, CO 81501 7 | 168 Hillside Avenue, Hartford, CT 06106 8 | 1025 Vermont Avenue Northwest, Washington, DC 20005 9 | 697 North Dupont Boulevard, Milford, DE 19963 10 | 1915 North Republic De Cuba Avenue, Tampa, FL 33602 11 | 2406 North Slappey Boulevard, Albany, GA 31701 12 | 98-1247 Kaahumanu, Aiea, HI 96701 13 | 103 West Main, 
Ute, IA 51060 14 | 335 Deinhard Lane, Mc Call, ID 83638 15 | 8922 South 1/2 Greenwood Avenue, Chicago, IL 60619 16 | 239 West Monroe Street, Decatur, IN 46733 17 | 827 Frontage Road, Agra, KS 67621 18 | 508 West 6th Street, Lexington, KY 40508 19 | 5103 Hollywood Avenue, Shreveport, LA 71109 20 | 79 Power Road, Westford, MA 01886 21 | 5105 Berwyn Road, College Park, MD 20740 22 | 47 Broad Street, Auburn, ME 04210 23 | 470 South Street, Ortonville, MI 48462 24 | 404 Wilson Avenue, Faribault, MN 55021 25 | 5933 Mc Donnell Boulevard, Hazelwood, MO 63042 26 | 918 East Main Avenue, Lumberton, MS 39455 27 | 107 A Street East, Poplar, MT 59255 28 | Village Shps Of Bnr, Banner Elk, NC 28604 29 | 2601 State Street, Bismarck, ND 58501 30 | 207 South Bell Street, Fremont, NE 68025 31 | 107 State Street, Portsmouth, NH 03801 32 | 1413 State Highway #50, Mays Landing, NJ 08330 33 | I-25 Highway 87, Raton, NM 87740 34 | 516 West Goldfield Avenue, Yerington, NV 89447 35 | 2787 Bway Way, New York, NY 10001 36 | 1380 Bethel Road, Columbus, OH 43220 37 | 305 Main, Fort Cobb, OK 73038 38 | 17375 Southwest Tualatin Valley Hwy, Beaverton, OR 97006 39 | 114 Market Street, Philadelphia, PA 19106 40 | 169 Main Street, Westerly, RI 02891 41 | 70 State Street, Charleston, SC 29401 42 | East Highway 212, Watertown, SD 57201 43 | 615 North 8th Avenue, Humboldt, TN 38343 44 | 5435 North MacArthur Boulevard, Irving, TX 75038 45 | 511 South 300 West, Salt Lake City, UT 84101 46 | 2457 North Harrison Street, Arlington, VA 22207 47 | 1 Lamere Avenue, Ludlow, VT 05149 48 | 5300 South 76th Street, Greendale, WI 53129 49 | 200 Monroe Street, Alderson, WV 24910 50 | 973 US Highway 16, Worland, WY 82401 51 | 30653 Highway 278, Addison, AL 35540 52 | -------------------------------------------------------------------------------- /raw/us50.train.tagged: -------------------------------------------------------------------------------- 1 | Homer Spit |3 2 | Road, |4 3 | Homer, |5 4 | AK |6 5 | 99603 |7 6 | 
7 | Lnlck Shopping Center, |3 8 | Anniston, |5 9 | AL |6 10 | 36201 |7 11 | 12 | Center Ridge, |5 13 | AR |6 14 | 72027 |7 15 | 16 | 9878 |1 17 | North Metro |3 18 | Parkway |4 19 | East, |8 20 | Phoenix, |5 21 | AZ |6 22 | 85051 |7 23 | 24 | 2896 |1 25 | Fairfax |3 26 | Street, |4 27 | Denver, |5 28 | CO |6 29 | 80207 |7 30 | 31 | Mesa Mall, |3 32 | Grand Junction, |5 33 | CO |6 34 | 81501 |7 35 | 36 | 168 |1 37 | Hillside |3 38 | Avenue, |4 39 | Hartford, |5 40 | CT |6 41 | 06106 |7 42 | 43 | 1025 |1 44 | Vermont |3 45 | Avenue |4 46 | Northwest, |8 47 | Washington, |5 48 | DC |6 49 | 20005 |7 50 | 51 | 697 |1 52 | North Dupont |3 53 | Boulevard, |4 54 | Milford, |5 55 | DE |6 56 | 19963 |7 57 | 58 | 1915 |1 59 | North Republic De Cuba |3 60 | Avenue, |4 61 | Tampa, |5 62 | FL |6 63 | 33602 |7 64 | 65 | 2406 |1 66 | North Slappey |3 67 | Boulevard, |4 68 | Albany, |5 69 | GA |6 70 | 31701 |7 71 | 72 | 98-1247 |1 73 | Kaahumanu, |3 74 | Aiea, |5 75 | HI |6 76 | 96701 |7 77 | 78 | 103 |1 79 | West Main, |3 80 | Ute, |5 81 | IA |6 82 | 51060 |7 83 | 84 | 335 |1 85 | Deinhard |3 86 | Lane, |4 87 | Mc Call, |5 88 | ID |6 89 | 83638 |7 90 | 91 | 8922 |1 92 | South 1/2 Greenwood |3 93 | Avenue, |4 94 | Chicago, |5 95 | IL |6 96 | 60619 |7 97 | 98 | 239 |1 99 | West Monroe |3 100 | Street, |4 101 | Decatur, |5 102 | IN |6 103 | 46733 |7 104 | 105 | 827 |1 106 | Frontage |3 107 | Road, |4 108 | Agra, |5 109 | KS |6 110 | 67621 |7 111 | 112 | 508 |1 113 | West 6th |3 114 | Street, |4 115 | Lexington, |5 116 | KY |6 117 | 40508 |7 118 | 119 | 5103 |1 120 | Hollywood |3 121 | Avenue, |4 122 | Shreveport, |5 123 | LA |6 124 | 71109 |7 125 | 126 | 79 |1 127 | Power |3 128 | Road, |4 129 | Westford, |5 130 | MA |6 131 | 01886 |7 132 | 133 | 5105 |1 134 | Berwyn |3 135 | Road, |4 136 | College Park, |5 137 | MD |6 138 | 20740 |7 139 | 140 | 47 |1 141 | Broad |3 142 | Street, |4 143 | Auburn, |5 144 | ME |6 145 | 04210 |7 146 | 147 | 470 |1 148 | South |3 149 | Street, |4 150 | 
Ortonville, |5 151 | MI |6 152 | 48462 |7 153 | 154 | 404 |1 155 | Wilson |3 156 | Avenue, |4 157 | Faribault, |5 158 | MN |6 159 | 55021 |7 160 | 161 | 5933 |1 162 | Mc Donnell |3 163 | Boulevard, |4 164 | Hazelwood, |5 165 | MO |6 166 | 63042 |7 167 | 168 | 918 |1 169 | East Main |3 170 | Avenue, |4 171 | Lumberton, |5 172 | MS |6 173 | 39455 |7 174 | 175 | 107 |1 176 | A Street |3 177 | East, |4 178 | Poplar, |5 179 | MT |6 180 | 59255 |7 181 | 182 | Village Shps Of Bnr, |3 183 | Banner Elk, |5 184 | NC |6 185 | 28604 |7 186 | 187 | 2601 |1 188 | State |3 189 | Street, |4 190 | Bismarck, |5 191 | ND |6 192 | 58501 |7 193 | 194 | 207 |1 195 | South Bell |3 196 | Street, |4 197 | Fremont, |5 198 | NE |6 199 | 68025 |7 200 | 201 | 107 |1 202 | State |3 203 | Street, |4 204 | Portsmouth, |5 205 | NH |6 206 | 03801 |7 207 | 208 | 1413 |1 209 | State Highway #50, |3 210 | Mays Landing, |5 211 | NJ |6 212 | 08330 |7 213 | 214 | I-25 Highway 87, |3 215 | Raton, |5 216 | NM |6 217 | 87740 |7 218 | 219 | 516 |1 220 | West Goldfield |3 221 | Avenue, |4 222 | Yerington, |5 223 | NV |6 224 | 89447 |7 225 | 226 | 2787 |1 227 | Bway |3 228 | Way, |4 229 | New York, |5 230 | NY |6 231 | 10001 |7 232 | 233 | 1380 |1 234 | Bethel |3 235 | Road, |4 236 | Columbus, |5 237 | OH |6 238 | 43220 |7 239 | 240 | 305 |1 241 | Main, |3 242 | Fort Cobb, |5 243 | OK |6 244 | 73038 |7 245 | 246 | 17375 |1 247 | Southwest Tualatin Valley |3 248 | Hwy, |4 249 | Beaverton, |5 250 | OR |6 251 | 97006 |7 252 | 253 | 114 |1 254 | Market |3 255 | Street, |4 256 | Philadelphia, |5 257 | PA |6 258 | 19106 |7 259 | 260 | 169 |1 261 | Main |3 262 | Street, |4 263 | Westerly, |5 264 | RI |6 265 | 02891 |7 266 | 267 | 70 |1 268 | State |3 269 | Street, |4 270 | Charleston, |5 271 | SC |6 272 | 29401 |7 273 | 274 | East Highway 212, |3 275 | Watertown, |5 276 | SD |6 277 | 57201 |7 278 | 279 | 615 |1 280 | North 8th |3 281 | Avenue, |4 282 | Humboldt, |5 283 | TN |6 284 | 38343 |7 285 | 286 | 5435 |1 287 | 
"""Build script that trains the usaddress model before packaging it."""

import os
import subprocess

# distutils was removed from the standard library in Python 3.12;
# setuptools re-exports an equivalent Command base class.
from setuptools import Command, setup
from setuptools.command.build_py import build_py as _build_py


class TrainModel(Command):
    """Custom ``train_model`` command that runs ``parserator train``."""

    description = "Training the model before building the package"
    user_options = []

    def initialize_options(self):
        """Required by the Command interface; this command has no options."""

    def finalize_options(self):
        """Required by the Command interface; this command has no options."""

    def run(self):
        """Train the model from ``training/labeled.xml``.

        Raises:
            subprocess.CalledProcessError: if ``parserator train`` exits with
                a non-zero status, so a failed training run aborts the build
                instead of being silently ignored.
        """
        python_path = os.environ.get("PYTHONPATH", "")
        subprocess.run(
            ["parserator", "train", "training/labeled.xml", "usaddress"],
            # Put the current directory first on PYTHONPATH; presumably so
            # the subprocess imports the local ``usaddress`` package.
            env=dict(os.environ, PYTHONPATH=f".{os.pathsep}{python_path}"),
            check=True,
        )


class build_py(_build_py):
    """``build_py`` variant that trains the model before building."""

    def run(self):
        """Run ``train_model`` first, then the standard ``build_py`` step."""
        self.run_command("train_model")  # Run the custom command
        super().run()


# Standard setup configuration
setup(
    cmdclass={
        "build_py": build_py,  # Override build_py
        "train_model": TrainModel,  # Register custom command
    },
)
def fuzzyEquals(labels_pred, labels_true):
    """Assert that predicted labels equal the expected labels after coarsening.

    Each predicted label is mapped to a coarser tag before comparison:

    * any label starting with ``"StreetName"`` becomes ``"StreetName"``
    * any label starting with ``"AddressNumber"`` becomes ``"AddressNumber"``
    * ``"Null"`` becomes ``"NotAddress"``
    * every other label is kept unchanged

    The expected labels are compared as given, without any mapping.

    Args:
        labels_pred: iterable of predicted label strings.
        labels_true: iterable of expected label strings.

    Raises:
        AssertionError: if the coarsened predictions differ from the
            expected labels.
    """

    def coarsen(label):
        # Collapse fine-grained variants onto their shared prefix.
        for prefix in ("StreetName", "AddressNumber"):
            if label.startswith(prefix):
                return prefix
        if label == "Null":
            return "NotAddress"
        return label

    fuzzy_labels = [coarsen(label) for label in labels_pred]
    labels = list(labels_true)

    assert fuzzy_labels == labels, (
        f"coarsened prediction {fuzzy_labels} != expected {labels}"
    )
self.assertEqual( 13 | tokenize("box# 1 abc st"), 14 | ["box", "#", "1", "abc", "st"], 15 | ) 16 | self.assertEqual(tokenize("box#1 abc st"), ["box", "#", "1", "abc", "st"]) 17 | 18 | def test_split_on_punc(self): 19 | self.assertEqual( 20 | tokenize("1 abc st,suite 1"), ["1", "abc", "st,", "suite", "1"] 21 | ) 22 | self.assertEqual( 23 | tokenize("1 abc st;suite 1"), ["1", "abc", "st;", "suite", "1"] 24 | ) 25 | self.assertEqual( 26 | tokenize("1-5 abc road"), 27 | ["1-5", "abc", "road"], 28 | ) 29 | 30 | def test_spaces(self): 31 | self.assertEqual(tokenize("1 abc st"), ["1", "abc", "st"]) 32 | self.assertEqual( 33 | tokenize("1 abc st"), 34 | ["1", "abc", "st"], 35 | ) 36 | self.assertEqual(tokenize("1 abc st "), ["1", "abc", "st"]) 37 | self.assertEqual( 38 | tokenize(" 1 abc st"), 39 | ["1", "abc", "st"], 40 | ) 41 | 42 | def test_capture_punc(self): 43 | self.assertEqual( 44 | tokenize("222 W. Merchandise Mart Plaza"), 45 | ["222", "W.", "Merchandise", "Mart", "Plaza"], 46 | ) 47 | self.assertEqual( 48 | tokenize("222 W Merchandise Mart Plaza, Chicago, IL"), 49 | ["222", "W", "Merchandise", "Mart", "Plaza,", "Chicago,", "IL"], 50 | ) 51 | self.assertEqual(tokenize("123 Monroe- St"), ["123", "Monroe-", "St"]) 52 | 53 | def test_nums(self): 54 | self.assertEqual( 55 | tokenize("222 W Merchandise Mart Plaza Chicago IL 60654"), 56 | ["222", "W", "Merchandise", "Mart", "Plaza", "Chicago", "IL", "60654"], 57 | ) 58 | 59 | def test_ampersand(self): 60 | self.assertEqual(tokenize("123 & 456"), ["123", "&", "456"]) 61 | self.assertEqual(tokenize("123&456"), ["123", "&", "456"]) 62 | self.assertEqual(tokenize("123& 456"), ["123", "&", "456"]) 63 | self.assertEqual(tokenize("123 &456"), ["123", "&", "456"]) 64 | self.assertEqual(tokenize("123 & 456"), ["123", "&", "456"]) 65 | self.assertEqual(tokenize("123&456"), ["123", "&", "456"]) 66 | self.assertEqual(tokenize("123& 456"), ["123", "&", "456"]) 67 | self.assertEqual(tokenize("123 &456"), ["123", "&", "456"]) 68 | 
self.assertEqual(tokenize("123 & 456"), ["123", "&", "456"]) 69 | self.assertEqual(tokenize("123&456"), ["123", "&", "456"]) 70 | self.assertEqual(tokenize("123& 456"), ["123", "&", "456"]) 71 | self.assertEqual(tokenize("123 &456"), ["123", "&", "456"]) 72 | 73 | def test_paren(self): 74 | self.assertEqual( 75 | tokenize("222 W Merchandise Mart Plaza (1871) Chicago IL 60654"), 76 | [ 77 | "222", 78 | "W", 79 | "Merchandise", 80 | "Mart", 81 | "Plaza", 82 | "(1871)", 83 | "Chicago", 84 | "IL", 85 | "60654", 86 | ], 87 | ) 88 | self.assertEqual( 89 | tokenize("222 W Merchandise Mart Plaza (1871), Chicago IL 60654"), 90 | [ 91 | "222", 92 | "W", 93 | "Merchandise", 94 | "Mart", 95 | "Plaza", 96 | "(1871),", 97 | "Chicago", 98 | "IL", 99 | "60654", 100 | ], 101 | ) 102 | self.assertEqual( 103 | tokenize("222 W Merchandise Mart Plaza(1871) Chicago IL 60654"), 104 | [ 105 | "222", 106 | "W", 107 | "Merchandise", 108 | "Mart", 109 | "Plaza", 110 | "(1871)", 111 | "Chicago", 112 | "IL", 113 | "60654", 114 | ], 115 | ) 116 | 117 | 118 | if __name__ == "__main__": 119 | unittest.main() 120 | -------------------------------------------------------------------------------- /training/README.md: -------------------------------------------------------------------------------- 1 | Training your own model 2 | ======================= 3 | 4 | You can always install a stable version of usaddress from [the Python Package Index](https://pypi.python.org/pypi/usaddress) by running `pip install usaddress`. But what if you want to **train your own version** of the model to do things that the current release isn't capable of? By labelling and training your own data, you'll be able to: 5 | 6 | 1. parse addresses that aren't yet supported 7 | 2. push usaddress beyond its limits 8 | 3. help make this library work better for everyone 9 | 10 | What are we doing? 11 | -------- 12 | 13 | Creating a new model involves three core steps: 14 | 15 | 1. 
**Labeling** addresses – help the machine understand them 16 | 2. **Training** the model – let usaddress see patterns in the data 17 | 3. **Testing** the model - make sure your changes made things better 18 | 19 | Sound interesting? Let's look at the steps in more detail. 20 | 21 | How it's done 22 | ----- 23 | 24 | **0. Create a local version of the repo.** 25 | 26 | The first step in contributing to any open-source project is to fork the repository. If this is your first time, GitHub has a [nice guide to forking and contributing](https://help.github.com/articles/fork-a-repo/) that you should take a look at. (We also encourage contributors to [make a separate branch](https://help.github.com/articles/creating-and-deleting-branches-within-your-repository/) for their work, which makes things easier on our end.) When you've forked the repo and you're ready to roll, come back here to get started developing. 27 | 28 | After forking the repo, you'll need to get usaddress running on your machine. Running the following commands in the command line will install the proper dependencies and initialize a development version of usaddress: 29 | 30 | ``` 31 | cd usaddress 32 | pip install setuptools 33 | python setup.py develop 34 | parserator train training/labeled.xml usaddress 35 | ``` 36 | 37 | If you run into problems building your own copy of usaddress, don't hesitate to [open an issue](https://github.com/datamade/usaddress/issues/new) and the DataMade team will help you get started. 38 | 39 | **1. Collect the addresses that are making usaddress fail.** 40 | 41 | Once you have a local version of usaddress up and running, you're ready to start collecting addresses. 
42 | 43 | For each pattern that fails, you'll want to collect a handful of examples to make into **training data.** These examples should correspond to real world addresses that make usaddress fail - it's important to make sure that you're not influencing usaddress with how you *think* addresses work, as opposed to how they *really* work. To get started collecting your examples, make a new CSV file in the `training/` directory. For this guide, we'll call the file `new_addresses.csv`. 44 | 45 | Since usaddress is smart, it usually only needs 4-6 examples to understand any given pattern. Grab a few examples and copy the addresses to `training/new_addresses.csv`, separating each address with a new line: 46 | 47 | ``` 48 | training/new_addresses.csv 49 | -------------------------- 50 | 51 | 2822 HENRIETTA AVE HUNTINGDON VY PA 19006-8504 52 | 6625 HUNTINGDON PIKE HUNTINGDON VY PA 19006-8307 53 | 3555 HILLVIEW TURN HUNTINGDON VY PA 19006-2816 54 | 47 AMES CIR UNIT F4 HUNTINGDON VALLEY PA 19006-7976 55 | ``` 56 | 57 | (While CSV files are most often represented as spreadsheets, they're just plain text, so you can make them in any standard text editor. Just make sure to save your file with the .csv extension.) 58 | 59 | Remember that CSV files interpret **commas** as delimiters between table cells. For usaddress to understand your addresses, each line needs to be only one cell. That means that if any of your addresses include commas, you'll need to encapsulate them in quotes: 60 | 61 | ``` 62 | training/new_addresses.csv 63 | -------------------------- 64 | 65 | "JASON BOURNE, 123 MAIN ST, HUNTINGDON VY PA 19006-8504" 66 | ``` 67 | 68 | For each pattern, you'll also want to make **testing data**. Whereas training data helps your model make new connections, testing data makes sure that your model is actually learning the patterns that you want it to learn (and isn't overriding patterns that it has already learned through past training data). 
Make a new CSV file in the directory `measure_performance/test_data/` - we'll call the file `new_tests.csv` - and add an address or two for each new pattern you've identified: 69 | 70 | ``` 71 | measure_performance/test_data/new_tests.csv 72 | -------------------------------------------- 73 | 74 | 1080 BUCK HILL DR HUNTINGDON VY PA 19006-7910 75 | "Barack Obama, 1600 Pennsylvania Ave NW, Washington, DC 20500" 76 | ``` 77 | 78 | Resist the urge to make your testing data identical to your training data. For the most robust results, testing and training data should be **different instances of the same pattern.** This ensures that usaddress is learning to see new patterns in addresses, and not merely learning to regurgitate the information you've fed into it. 79 | 80 | **2. Label your addresses so that usaddress can understand them.** 81 | 82 | Unfortunately, usaddress doesn't read text the same way that humans do. It needs to have **labeled data** to help it make sense of the address patterns you're feeding into it. For our training format, we use XML tagged strings corresponding to the [United States Thoroughfare, Landmark, and Postal Address Data Standard](http://www.urisa.org/advocacy/united-states-thoroughfare-landmark-and-postal-address-data-standard/). 83 | 84 | After it's been labeled, training data looks something like this: 85 | 86 | ```xml 87 | 88 | 89 | Soldotna , AK 996699 90 | 9112 Mendenhall Mall Road , Juneau , AK 99801 91 | Box # 63 , Cardova , AK 99574 92 | 32 - 233 M Street , Elmendorf Afb , AK 99506 93 | Ridgecrest Drive , Bethel , AK 99559 94 | 123 E. Main Road , Suite A. , Juneau , AK 99801 95 | 96 | ``` 97 | 98 | Thankfully, you don't have to write this code by hand! This repo comes with a built-in labelling program to help you generate tagged XML strings quickly and easily.
The labelling program runs in the command line, and you can start it with the following command: 99 | 100 | ``` 101 | parserator label <input csv filepath> <output xml filepath> usaddress 102 | ``` 103 | 104 | The **output** filepath can be anything you want, since usaddress will make a new file with the name and location described by the path, but it's good practice to give it a similar name and location as the input CSV file. For our example, the command for labeling our training data will look like this: 105 | 106 | ``` 107 | parserator label training/new_addresses.csv training/new_addresses.xml usaddress 108 | ``` 109 | 110 | Run this command and the labeling program will launch in the command line. It will start by printing some information to describe the commands that you can use to label addresses: 111 | 112 | ``` 113 | Start console labeling! 114 | 115 | ************************************************** 116 | These are the tags available for labeling: 117 | 0 : AddressNumberPrefix 118 | 1 : AddressNumber 119 | 2 : AddressNumberSuffix 120 | 3 : StreetNamePreModifier 121 | 4 : StreetNamePreDirectional 122 | 5 : StreetNamePreType 123 | 6 : StreetName 124 | 7 : StreetNamePostType 125 | 8 : StreetNamePostDirectional 126 | 9 : SubaddressType 127 | 10 : SubaddressIdentifier 128 | 11 : BuildingName 129 | 12 : OccupancyType 130 | 13 : OccupancyIdentifier 131 | 14 : CornerOf 132 | 15 : LandmarkName 133 | 16 : PlaceName 134 | 17 : StateName 135 | 18 : ZipCode 136 | 19 : USPSBoxType 137 | 20 : USPSBoxID 138 | 21 : USPSBoxGroupType 139 | 22 : USPSBoxGroupID 140 | 23 : IntersectionSeparator 141 | 24 : Recipient 142 | 25 : NotAddress 143 | 144 | type 'help' at any time to see labels 145 | type 'oops' if you make a labeling error 146 | 147 | ************************************************** 148 | ``` 149 | 150 | During the labeling process, the program will ask you to match portions of the address to the **tags** that you can see above.
Tagging an address is like diagramming a sentence: it breaks down the address into its smallest components and describes how each part relates to the whole. 151 | 152 | Our tagging standard can take some time to get used to if you're not familiar with it. If you're confused about how to tag certain parts of an address, [follow the short guidelines in our documentation](http://usaddress.readthedocs.io/en/latest/#details) or consult the [official data standard](http://www.urisa.org/clientuploads/directory/GMI/Professional%20Practice/Address%20Standard/AddressStandard_Approved_Apr11_02Content.pdf). For more complicated questions, feel free to [open an issue in this repo](https://github.com/datamade/usaddress/issues/new) and the DataMade team can weigh in on your problem. 153 | 154 | After the instructions print, the program will begin prompting you to label addresses. Each prompt starts by using the current model to make an educated guess about the proper labels: 155 | 156 | ``` 157 | -------------------------------------------------- 158 | STRING: 2822 HENRIETTA AVE HUNTINGDON VY PA 19006-8504 159 | | 2822 | AddressNumber | 160 | | HENRIETTA | StreetName | 161 | | AVE | StreetNamePostType | 162 | | HUNTINGDON | PlaceName | 163 | | VY | StateName | 164 | | PA | StateName | 165 | | 19006-8504 | ZipCode | 166 | Is this correct? (y)es / (n)o / (s)kip / (f)inish tagging / (h)elp 167 | ``` 168 | 169 | In this case, usaddress got this address mostly right, but mislabelled `VY` as a `StateName` instead of a `PlaceName`. 
Enter `n` to tell it that the labels aren't correct, and then enter `return` (or `enter`) to accept all of the labels up to `VY`: 170 | 171 | ``` 172 | -------------------------------------------------- 173 | STRING: 2822 HENRIETTA AVE HUNTINGDON VY PA 19006-8504 174 | | 2822 | AddressNumber | 175 | | HENRIETTA | StreetName | 176 | | AVE | StreetNamePostType | 177 | | HUNTINGDON | PlaceName | 178 | | VY | StateName | 179 | | PA | StateName | 180 | | 19006-8504 | ZipCode | 181 | Is this correct? (y)es / (n)o / (s)kip / (f)inish tagging / (h)elp 182 | n 183 | What is '2822' ? If AddressNumber hit return 184 | 185 | What is 'HENRIETTA' ? If StreetName hit return 186 | 187 | What is 'AVE' ? If StreetNamePostType hit return 188 | 189 | What is 'HUNTINGDON' ? If PlaceName hit return 190 | 191 | What is 'VY' ? If StateName hit return 192 | ``` 193 | 194 | Based on the tag list above, we can see that `PlaceName` corresponds to the input `16` in the program. So add the appropriate label: 195 | 196 | ``` 197 | What is 'VY' ? If StateName hit return 198 | 16 199 | ``` 200 | 201 | Then accept the rest of the labels, since the model guessed them correctly: 202 | 203 | ``` 204 | What is 'PA' ? If StateName hit return 205 | 206 | What is '19006-8504' ? If ZipCode hit return 207 | 208 | ``` 209 | 210 | Once you've evaluated every portion of the address, the program will move on to another example and the process will start over. 211 | 212 | Note that you can make use of helper commands to speed up the labeling process. If you decide that an address is not representative and you want to skip it, you can enter `s`; or if you want to quit labeling entirely, you can enter `f` and the program will stop, saving your progress in a secondary file. If you make a mistake during the labeling process itself, you can always enter `oops` to restart the labelling of the current address or `help` to see a list of possible labels. 
213 | 214 | After the program has prompted you to label every address, navigate to the target directory that you specified and confirm that a new XML file has been created (in this case, `training/new_addresses.xml`). 215 | 216 | **3. Train the model.** 217 | 218 | So you've got a labeled XML file for our training data. Great! Now it's time to use it to teach the model to parse new patterns. 219 | 220 | The training command for usaddress uses the following format: 221 | 222 | ``` 223 | parserator train <training data filepath(s)> usaddress 224 | ``` 225 | 226 | For stable releases, the DataMade team collects canonical training data in the file `training/labeled.xml`. Recall that when you initialized usaddress on your machine, you ran the command like this: 227 | 228 | ``` 229 | parserator train training/labeled.xml usaddress 230 | ``` 231 | 232 | But usaddress can also accept *multiple* files to use as training data. As you develop a new model, you should enforce separation between new and canonical training data to make debugging easier. So you can feed the model both files as input, separated by a comma: 233 | 234 | ``` 235 | parserator train training/labeled.xml,training/new_addresses.xml usaddress 236 | ``` 237 | 238 | After running the command, you should see output that looks something like this: 239 | 240 | ``` 241 | renaming old model: usaddress/usaddr.crfsuite -> usaddress/usaddr_2016_12_19_21286.crfsuite 242 | 243 | training model on 1359 training examples from ['training/labeled.xml', 'training/new_addresses.xml'] 244 | 245 | done training! model file created: usaddress/usaddr.crfsuite 246 | ``` 247 | 248 | This output confirms that usaddress has learned from the new training data. Nice! 249 | 250 | **4. Test the model.** 251 | 252 | It's certainly exciting to know that you've added new training data to usaddress and changed the model, but it won't be very helpful unless we can verify that parsing behavior has actually *improved* based on the changes.
To do that, you can check the model against a set of **testing data**. 253 | 254 | Recall that you set aside a small portion of your addresses for testing in the CSV file `measure_performance/test_data/new_tests.csv`. Now that usaddress has (hopefully) learned to parse your new patterns, you can spot check it by labeling the testing data: 255 | 256 | ``` 257 | parserator label measure_performance/test_data/new_tests.csv measure_performance/test_data/new_tests.xml usaddress 258 | ``` 259 | 260 | The labeling program will launch, and if usaddress can suggest the proper labels for your testing data, you'll know that it has correctly learned the patterns you identified. (If, on the other hand, usaddress seems to fail on those patterns, you'll have to go back and add more examples of that pattern to your training data and retrain the model following steps 2 and 3.) 261 | 262 | But it's not good enough to confirm that usaddress has learned new patterns – you also need to confirm that it hasn't *unlearned* old patterns in the process of incorporating your new training data. To do that, run the usaddress testing suite with the following command: 263 | 264 | ``` 265 | pytest 266 | ``` 267 | 268 | The output will fill your screen with a big block of dots (.) and/or Fs (F). Each dot corresponds to a test that *passed* (meaning that usaddress produced the expected parse for an address) while each F corresponds to a test that *failed* (meaning that usaddress failed to properly parse the address). 269 | 270 | If all the tests passed, look below the results block and you'll see a short confirmation output: 271 | 272 | ``` 273 | -------------------------------- 274 | Ran 4896 tests in 2.158s 275 | 276 | OK 277 | ``` 278 | 279 | Congratulations! The model has officially improved. You can safely move on to step 5b, where you'll get your work ready to be shared. 280 | 281 | If any of our tests failed, however, things become more complicated. 
The output will break down the tests that failed, showing you the parse that the model produced (labeled `pred`) and the parse that the test expected (labeled `true`). In this case, jump to step 5a to debug your errors. 282 | 283 | If you'd like to additionally spot check singular addresses in the python shell, install a virtual environment, activate it, install your WIP version of this package, and open a shell. 284 | ```bash 285 | python3 -m venv .venv 286 | source .venv/bin/activate 287 | pip install -e ".[dev]" -v 288 | python 289 | # shell starts up 290 | >>> 291 | ``` 292 | 293 | Then import usaddress and start parsing! 294 | ```python 295 | >>> import usaddress 296 | >>> usaddress.parse("a funky address") 297 | ``` 298 | 299 | **5a. Repeat steps 1-4 until the tests pass.** 300 | 301 | If you've arrived at this step, it means that some of your tests failed. Uh oh! 302 | 303 | To cut a new release of usaddress, all canonical tests need to pass. That means before sharing your work, you'll have to go back and retrain the model to properly parse the addresses that it's failing on. 304 | 305 | Take the failing addresses and try to find real-world addresses that match the pattern. Mapping software like [Open Street Map](http://www.openstreetmap.org/#map=5/51.500/-0.100) and [Google Maps](https://www.google.com/maps) can be helpful for searching for similar address patterns. Collect new addresses and repeat steps 1-4 until all of the tests in the testing suite pass. 306 | 307 | Once all of the tests are passing, you're safe to move on to step 5b. 308 | 309 | **5b. Add your training and testing data.** 310 | 311 | If you've arrived at this step, it means that all of your new and old tests passed and your model is good to go. Fantastic! Next up in order to have the public package trained and tested on your data, you'll need to add it to the canonical data. 
312 | 313 | To do this, just copy everything within the `<AddressCollection>` tags of your `new_addresses.xml` file, and paste it towards the end of the same tags within the `labeled.xml` file found in the `training/` directory. Repeat the same steps for the testing data and the `test_data/` directory. 314 | 315 | **5c. Make a pull request.** 316 | 317 | Now it's time to share your work. GitHub provides a powerful way of sharing code through the *pull request* feature (and has a [really nice guide](https://help.github.com/articles/creating-a-pull-request/) for first-timers explaining how it works). Open up a new pull request and give us a short description of what you changed: What address patterns did you fix? Where did you store your training data? How many new examples/tests did you add? The clearer your description of your work, the easier it will be for the DataMade team to determine whether it's ready to go. 318 | 319 | If you made it this far, **great job!** We appreciate your dedication to making usaddress better for the whole community. Drop us a line on [GitHub](https://github.com/datamade) or on [Twitter](https://twitter.com/DataMadeCo) and let us know how you're using usaddress. 320 | 321 | Need help? 322 | ---------- 323 | 324 | We want contributing to usaddress to be as painless as possible. If you run into problems following any of our documentation, feel free to [open an issue](https://github.com/datamade/usaddress/issues/new) describing your problem and the DataMade team would be glad to help.
325 | -------------------------------------------------------------------------------- /training/example_training.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Soldotna 5 | , 6 | AK 7 | 996699 8 | 9 | 10 | 9112 11 | Mendenhall Mall 12 | Road 13 | , 14 | Juneau 15 | , 16 | AK 17 | 99801 18 | 19 | 20 | Box 21 | # 22 | 63 23 | , 24 | Cardova 25 | , 26 | AK 27 | 99574 28 | 29 | 30 | 32 31 | - 32 | 233 33 | M 34 | Street 35 | , 36 | Elmendorf Afb 37 | , 38 | AK 39 | 99506 40 | 41 | 42 | Ridgecrest 43 | Drive 44 | , 45 | Bethel 46 | , 47 | AK 48 | 99559 49 | 50 | 51 | 123 52 | E. 53 | Main 54 | Road 55 | , 56 | Suite 57 | A. 58 | , 59 | Juneau 60 | , 61 | AK 62 | 99801 63 | 64 | -------------------------------------------------------------------------------- /training/multi_word_state_addresses.xml: -------------------------------------------------------------------------------- 1 | 2 | 84 Social Street Woonsocket, Rhode Island 02895 United States 3 | 3481 Kingstown Road South Kingstown, Rhode Island 02892 United States 4 | 209 4th Avenue Asbury Park, New Jersey 07712 United States 5 | 600 E Boulevard Ave, Dept 301 Bismarck, North Dakota 58505 United States 6 | 510 U.S. 
17 Business Surfside Beach, South Carolina 29575 United States 7 | 3110 West 12th Street Sioux Falls, South Dakota 57104 United States 8 | 42 Water Street New Shoreham, Rhode Island 02807 United States 9 | 291 Dairy Barn Lane Fort Mill, South Carolina 29715 United States 10 | 11 | -------------------------------------------------------------------------------- /training/us50_messiest_manual_label.xml: -------------------------------------------------------------------------------- 1 | 2 | Mile K Beach Road# 1, Kenai, AK 99611 3 | Mile K Beach Road# 1, Kenai, AK 99611 4 | Mi K Beach Road# 2, Kenai, AK 99611 5 | Bethel, AK 99559 6 | Box# 63, Cordova, AK 99574 7 | 88175 US Highway 278, Altoona, AL 35952 8 | 222 7th Street Southeast, Washington, DC 20003 9 | 613 K Street Northwest, Washington, DC 20001 10 | 1063 31st Street Northwest, Washington, DC 20007 11 | 844 6th Street Northwest, Winter Haven, FL 33881 12 | 1411 49th Street South, Saint Petersburg, FL 33707 13 | 110 1st Street Southwest, Moultrie, GA 31768 14 | 94-210 Leokane, Suite 105, Waipahu, HI 96797 15 | Highway 34 East, Albia, IA 52531 16 | 2615 1/2 Gary Avenue, Dodge City, KS 67801 17 | New Iberia, LA 70560 18 | Drum Point Road, Deale, MD 20751 19 | Main, Mars Hill, ME 04758 20 | Anchor Inn Road, Round Pond, ME 04564 21 | 3555 68th Street Southeast, Caledonia, MI 49316 22 | Highway 59 North, Detroit Lakes, MN 56501 23 | 1200 12th Street Southwest, Rochester, MN 55902 24 | 910 4th Street Northwest, Austin, MN 55912 25 | 510 1st Street West, Fosston, MN 56542 26 | Tipton, MO 65081 27 | Highway 63 South, Edgar Springs, MO 65462 28 | Highway 27 South, Monticello, MS 39654 29 | Kalispell, MT 59901 30 | 22 1st Street East, Kalispell, MT 59901 31 | Columbia Falls, MT 59912 32 | Queen Elizabeth Avenue, Manteo, NC 27954 33 | 443 2nd Avenue Southwest, Hickory, NC 28602 34 | 106 5th Avenue Southwest, Valley City, ND 58072 35 | 2302 15th Street Southwest, Minot, ND 58701 36 | 289 15th Street West, Dickinson, ND 58601 
37 | 2802 13th Avenue South, Fargo, ND 58103 38 | 106 26th Street West, Williston, ND 58801 39 | 2400 10th Street Southwest, Minot, ND 58701 40 | 2815 13th Avenue South, Fargo, ND 58103 41 | 1415 42nd Street South, Fargo, ND 58103 42 | 1117 38th Street North, Fargo, ND 58102 43 | 251 14th Street West, Dickinson, ND 58601 44 | Plainview, NE 68769 45 | Lee's Mill Road, Moultonborough, NH 03254 46 | Main, Ashland, NH 03217 47 | 2 Broadway, Somers Point, NJ 08244 48 | Los Ojos, NM 87551 49 | Box # 209, Glorieta, NM 87535 50 | 16165 Southwest Regatta Lane, Suite 1000, Beaverton, OR 97006 51 | 1 Broadway, Newport, RI 02840 52 | 1010 Ocean Boulevard, Isle Of Palms, SC 29451 53 | 12188 Old Number 6 Highway, Eutawville, SC 29048 54 | 915 8th Avenue Northwest, Aberdeen, SD 57401 55 | Sandy, UT 84092 56 | 4931 South State Street, Salt Lake City, UT 84107 57 | 1499 South State Street, Salt Lake City, UT 84115 58 | 116 S. Independence Blvd., Virginia Beach, VA 23462 59 | 21210 44th Avenue West, Mountlake Terrace, WA 98043 60 | 546 5th Avenue South, Edmonds, WA 98020 61 | Wausau, WI 54403 62 | 102 Old Abe Road, Lac Du Flambeau, WI 54538 63 | Keyser, WV 26726 64 | Terra Alta, WV 26764 65 | 66 | -------------------------------------------------------------------------------- /training/us50_train_tagged.xml: -------------------------------------------------------------------------------- 1 | 2 | Homer Spit Road, Homer, AK 99603 3 | Lnlck Shopping Center, Anniston, AL 36201 4 | Center Ridge, AR 72027 5 | 9878 North Metro Parkway East, Phoenix, AZ 85051 6 | 2896 Fairfax Street, Denver, CO 80207 7 | Mesa Mall, Grand Junction, CO 81501 8 | 168 Hillside Avenue, Hartford, CT 06106 9 | 1025 Vermont Avenue Northwest, Washington, DC 20005 10 | 697 North Dupont Boulevard, Milford, DE 19963 11 | 1915 North Republic De Cuba Avenue, Tampa, FL 33602 12 | 2406 North Slappey Boulevard, Albany, GA 31701 13 | 98-1247 Kaahumanu, Aiea, HI 96701 14 | 103 West Main, Ute, IA 51060 15 | 335 Deinhard Lane, 
Mc Call, ID 83638 16 | 8922 South 1/2 Greenwood Avenue, Chicago, IL 60619 17 | 239 West Monroe Street, Decatur, IN 46733 18 | 827 Frontage Road, Agra, KS 67621 19 | 508 West 6th Street, Lexington, KY 40508 20 | 5103 Hollywood Avenue, Shreveport, LA 71109 21 | 79 Power Road, Westford, MA 01886 22 | 5105 Berwyn Road, College Park, MD 20740 23 | 47 Broad Street, Auburn, ME 04210 24 | 470 South Street, Ortonville, MI 48462 25 | 404 Wilson Avenue, Faribault, MN 55021 26 | 5933 Mc Donnell Boulevard, Hazelwood, MO 63042 27 | 918 East Main Avenue, Lumberton, MS 39455 28 | 107 A Street East, Poplar, MT 59255 29 | Village Shps Of Bnr, Banner Elk, NC 28604 30 | 2601 State Street, Bismarck, ND 58501 31 | 207 South Bell Street, Fremont, NE 68025 32 | 107 State Street, Portsmouth, NH 03801 33 | 1413 State Highway #50, Mays Landing, NJ 08330 34 | I-25 Highway 87, Raton, NM 87740 35 | 516 West Goldfield Avenue, Yerington, NV 89447 36 | 2787 Bway Way, New York, NY 10001 37 | 1380 Bethel Road, Columbus, OH 43220 38 | 305 Main, Fort Cobb, OK 73038 39 | 17375 Southwest Tualatin Valley Hwy, Beaverton, OR 97006 40 | 114 Market Street, Philadelphia, PA 19106 41 | 169 Main Street, Westerly, RI 02891 42 | 70 State Street, Charleston, SC 29401 43 | East Highway 212, Watertown, SD 57201 44 | 615 North 8th Avenue, Humboldt, TN 38343 45 | 5435 North MacArthur Boulevard, Irving, TX 75038 46 | 511 South 300 West, Salt Lake City, UT 84101 47 | 2457 North Harrison Street, Arlington, VA 22207 48 | 1 Lamere Avenue, Ludlow, VT 05149 49 | 5300 South 76th Street, Greendale, WI 53129 50 | 200 Monroe Street, Alderson, WV 24910 51 | 973 US Highway 16, Worland, WY 82401 52 | 53 | -------------------------------------------------------------------------------- /usaddress/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import string 4 | import typing 5 | import warnings 6 | 7 | import probableparsing 8 | import pycrfsuite 9 | 10 | 
# The address components are based upon the `United States Thoroughfare,
# Landmark, and Postal Address Data Standard`
# http://www.urisa.org/advocacy/united-states-thoroughfare-landmark-and-postal-address-data-standard

# Every label the tagger can assign to a token.
LABELS = [
    "AddressNumberPrefix",
    "AddressNumber",
    "AddressNumberSuffix",
    "StreetNamePreModifier",
    "StreetNamePreDirectional",
    "StreetNamePreType",
    "StreetName",
    "StreetNamePostType",
    "StreetNamePostDirectional",
    "SubaddressType",
    "SubaddressIdentifier",
    "BuildingName",
    "OccupancyType",
    "OccupancyIdentifier",
    "CornerOf",
    "LandmarkName",
    "PlaceName",
    "StateName",
    "ZipCode",
    "USPSBoxType",
    "USPSBoxID",
    "USPSBoxGroupType",
    "USPSBoxGroupID",
    "IntersectionSeparator",
    "Recipient",
    "NotAddress",
]

PARENT_LABEL = "AddressString"
GROUP_LABEL = "AddressCollection"

MODEL_FILE = "usaddr.crfsuite"
MODEL_PATH = os.path.split(os.path.abspath(__file__))[0] + "/" + MODEL_FILE

# Lower-cased, period-free directional words; feeds the "directional" feature.
DIRECTIONS = {
    "n",
    "s",
    "e",
    "w",
    "ne",
    "nw",
    "se",
    "sw",
    "north",
    "south",
    "east",
    "west",
    "northeast",
    "northwest",
    "southeast",
    "southwest",
}

# Lower-cased, period-free street-type words and abbreviations; feeds the
# "street_name" feature.
STREET_NAMES = {
    "allee",
    "alley",
    "ally",
    "aly",
    "anex",
    "annex",
    "annx",
    "anx",
    "arc",
    "arcade",
    "av",
    "ave",
    "aven",
    "avenu",
    "avenue",
    "avn",
    "avnue",
    "bayoo",
    "bayou",
    "bch",
    "beach",
    "bend",
    "bg",
    "bgs",
    "bl",
    "blf",
    "blfs",
    "bluf",
    "bluff",
    "bluffs",
    "blvd",
    "bnd",
    "bot",
    "bottm",
    "bottom",
    "boul",
    "boulevard",
    "boulv",
    "br",
    "branch",
    "brdge",
    "brg",
    "bridge",
    "brk",
    "brks",
    "brnch",
    "brook",
    "brooks",
    "btm",
    "burg",
    "burgs",
    "byp",
    "bypa",
    "bypas",
    "bypass",
    "byps",
    "byu",
    "camp",
    "canyn",
    "canyon",
    "cape",
    "causeway",
    "causwa",
    "causway",
    "cen",
    "cent",
    "center",
    "centers",
    "centr",
    "centre",
    "ci",
    "cir",
    "circ",
    "circl",
    "circle",
    "circles",
    "cirs",
    "ck",
    "clb",
    "clf",
    "clfs",
    "cliff",
    "cliffs",
    "club",
    "cmn",
    "cmns",
    "cmp",
    "cnter",
    "cntr",
    "cnyn",
    "common",
    "commons",
    "cor",
    "corner",
    "corners",
    "cors",
    "course",
    "court",
    "courts",
    "cove",
    "coves",
    "cp",
    "cpe",
    "cr",
    "crcl",
    "crcle",
    "crecent",
    "creek",
    "cres",
    "crescent",
    "cresent",
    "crest",
    "crk",
    "crossing",
    "crossroad",
    "crossroads",
    "crscnt",
    "crse",
    "crsent",
    "crsnt",
    "crssing",
    "crssng",
    "crst",
    "crt",
    "cswy",
    "ct",
    "ctr",
    "ctrs",
    "cts",
    "curv",
    "curve",
    "cv",
    "cvs",
    "cyn",
    "dale",
    "dam",
    "div",
    "divide",
    "dl",
    "dm",
    "dr",
    "driv",
    "drive",
    "drives",
    "drs",
    "drv",
    "dv",
    "dvd",
    "est",
    "estate",
    "estates",
    "ests",
    "ex",
    "exp",
    "expr",
    "express",
    "expressway",
    "expw",
    "expy",
    "ext",
    "extension",
    "extensions",
    "extn",
    "extnsn",
    "exts",
    "fall",
    "falls",
    "ferry",
    "field",
    "fields",
    "flat",
    "flats",
    "fld",
    "flds",
    "fls",
    "flt",
    "flts",
    "ford",
    "fords",
    "forest",
    "forests",
    "forg",
    "forge",
    "forges",
    "fork",
    "forks",
    "fort",
    "frd",
    "frds",
    "freeway",
    "freewy",
    "frg",
    "frgs",
    "frk",
    "frks",
    "frry",
    "frst",
    "frt",
    "frway",
    "frwy",
    "fry",
    "ft",
    "fwy",
    "garden",
    "gardens",
    "gardn",
    "gateway",
    "gatewy",
    "gatway",
    "gdn",
    "gdns",
    "glen",
    "glens",
    "gln",
    "glns",
    "grden",
    "grdn",
    "grdns",
    "green",
    "greens",
    "grn",
    "grns",
    "grov",
    "grove",
    "groves",
    "grv",
    "grvs",
    "gtway",
    "gtwy",
    "harb",
    "harbor",
    "harbors",
    "harbr",
    "haven",
    "havn",
    "hbr",
    "hbrs",
    "height",
    "heights",
    "hgts",
    "highway",
    "highwy",
    "hill",
    "hills",
    "hiway",
    "hiwy",
    "hl",
    "hllw",
    "hls",
    "hollow",
    "hollows",
    "holw",
    "holws",
    "hrbor",
    "ht",
    "hts",
    "hvn",
    "hway",
    "hwy",
    "inlet",
    "inlt",
    "is",
    "island",
    "islands",
    "isle",
    "isles",
    "islnd",
    "islnds",
    "iss",
    "jct",
    "jction",
    "jctn",
    "jctns",
    "jcts",
    "junction",
    "junctions",
    "junctn",
    "juncton",
    "key",
    "keys",
    "knl",
    "knls",
    "knol",
    "knoll",
    "knolls",
    "ky",
    "kys",
    "la",
    "lake",
    "lakes",
    "land",
    "landing",
    "lane",
    "lanes",
    "lck",
    "lcks",
    "ldg",
    "ldge",
    "lf",
    "lgt",
    "lgts",
    "light",
    "lights",
    "lk",
    "lks",
    "ln",
    "lndg",
    "lndng",
    "loaf",
    "lock",
    "locks",
    "lodg",
    "lodge",
    "loop",
    "loops",
    "lp",
    "mall",
    "manor",
    "manors",
    "mdw",
    "mdws",
    "meadow",
    "meadows",
    "medows",
    "mews",
    "mi",
    "mile",
    "mill",
    "mills",
    "mission",
    "missn",
    "ml",
    "mls",
    "mn",
    "mnr",
    "mnrs",
    "mnt",
    "mntain",
    "mntn",
    "mntns",
    "motorway",
    "mount",
    "mountain",
    "mountains",
    "mountin",
    "msn",
    "mssn",
    "mt",
    "mtin",
    "mtn",
    "mtns",
    "mtwy",
    "nck",
    "neck",
    "opas",
    "orch",
    "orchard",
    "orchrd",
    "oval",
    "overlook",
    "overpass",
    "ovl",
    "ovlk",
    "park",
    "parks",
    "parkway",
    "parkways",
    "parkwy",
    "pass",
    "passage",
    "path",
    "paths",
    "pike",
    "pikes",
    "pine",
    "pines",
    "pk",
    "pkway",
    "pkwy",
    "pkwys",
    "pky",
    "pl",
    "place",
    "plain",
    "plaines",
    "plains",
    "plaza",
    "pln",
    "plns",
    "plz",
    "plza",
    "pne",
    "pnes",
    "point",
    "points",
    "port",
    "ports",
    "pr",
    "prairie",
    "prarie",
    "prk",
    "prr",
    "prt",
    "prts",
    "psge",
    "pt",
    "pts",
    "pw",
    "pwy",
    "rad",
    "radial",
    "radiel",
    "radl",
    "ramp",
    "ranch",
    "ranches",
    "rapid",
    "rapids",
    "rd",
    "rdg",
    "rdge",
    "rdgs",
    "rds",
    "rest",
    "ri",
    "ridge",
    "ridges",
    "rise",
    "riv",
    "river",
    "rivr",
    "rn",
    "rnch",
    "rnchs",
    "road",
    "roads",
    "route",
    "row",
    "rpd",
    "rpds",
    "rst",
    "rte",
    "rue",
    "run",
    "rvr",
    "shl",
    "shls",
    "shoal",
    "shoals",
    "shoar",
    "shoars",
    "shore",
    "shores",
    "shr",
    "shrs",
    "skwy",
    "skyway",
    "smt",
    "spg",
    "spgs",
    "spng",
    "spngs",
    "spring",
    "springs",
    "sprng",
    "sprngs",
    "spur",
    "spurs",
    "sq",
    "sqr",
    "sqre",
    "sqrs",
    "sqs",
    "squ",
    "square",
    "squares",
    "st",
    "sta",
    "station",
    "statn",
    "stn",
    "str",
    "stra",
    "strav",
    "strave",
    "straven",
    "stravenue",
    "stravn",
    "stream",
    "street",
    "streets",
    "streme",
    "strm",
    "strt",
    "strvn",
    "strvnue",
    "sts",
    "sumit",
    "sumitt",
    "summit",
    "te",
    "ter",
    "terr",
    "terrace",
    "throughway",
    "tl",
    "tpk",
    "tpke",
    "tr",
    "trace",
    "traces",
    "track",
    "tracks",
    "trafficway",
    "trail",
    "trailer",
    "trails",
    "trak",
    "trce",
    "trfy",
    "trk",
    "trks",
    "trl",
    "trlr",
    "trlrs",
    "trls",
    "trnpk",
    "trpk",
    "trwy",
    "tunel",
    "tunl",
    "tunls",
    "tunnel",
    "tunnels",
    "tunnl",
    "turn",
    "turnpike",
    "turnpk",
    "un",
    "underpass",
    "union",
    "unions",
    "uns",
    "upas",
    "valley",
    "valleys",
    "vally",
    "vdct",
    "via",
    "viadct",
    "viaduct",
    "view",
    "views",
    "vill",
    "villag",
    "village",
    "villages",
    "ville",
    "villg",
    "villiage",
    "vis",
    "vist",
    "vista",
    "vl",
    "vlg",
    "vlgs",
    "vlly",
    "vly",
    "vlys",
    "vst",
    "vsta",
    "vw",
    "vws",
    "walk",
    "walks",
    "wall",
    "way",
    "ways",
    "well",
    "wells",
    "wl",
    "wls",
    "wy",
    "xc",
    "xg",
    "xing",
    "xrd",
    "xrds",
}

# HTML-escaped ampersands that should be read as a plain "&".
_AMPERSAND_ENTITY_RE = re.compile(r"&#38;|&amp;")

# Compiled once at import time instead of on every tokenize() call.
_TOKEN_RE = re.compile(
    r"""
    \(*\b[^\s,;#&()]+[.,;)\n]*   # ['ab. cd,ef '] -> ['ab.', 'cd,', 'ef']
    |
    [#&]                       # [^'#abc'] -> ['#']
    """,
    re.VERBOSE | re.UNICODE,
)


# Load the trained CRF model at import time. A missing model file only
# produces a warning here so that the package can still be imported.
try:
    TAGGER = pycrfsuite.Tagger()
    TAGGER.open(MODEL_PATH)
except OSError:
    warnings.warn(
        "You must train the model (parserator train --trainfile "
        "FILES) to create the %s file before you can use the parse "
        "and tag methods" % MODEL_FILE
    )


def parse(address_string: str) -> list[tuple[str, str]]:
    """Split an address into tokens and label each one.

    Returns a list of ``(token, label)`` pairs in input order, or an empty
    list when the string yields no tokens.
    """
    tokens = tokenize(address_string)

    if not tokens:
        return []

    features = tokens2features(tokens)

    tags = TAGGER.tag(features)
    return list(zip(tokens, tags))


def tag(address_string: str, tag_mapping=None) -> tuple[dict[str, str], str]:
    """Parse an address and merge consecutive same-label tokens.

    Args:
        address_string: The address to tag.
        tag_mapping: Optional mapping from a label to a replacement label.
            Labels that are missing from the mapping, or mapped to a falsy
            value, are kept as they are.

    Returns:
        A ``(tagged_address, address_type)`` pair. ``tagged_address`` maps
        each (possibly remapped) label to its tokens joined by spaces, with
        surrounding spaces, commas and semicolons stripped. ``address_type``
        is one of ``"Street Address"``, ``"Intersection"``, ``"PO Box"`` or
        ``"Ambiguous"``.

    Raises:
        RepeatedLabelError: If a label shows up again after a different
            label has been seen in between.
    """
    # Parse once and reuse the result if an error has to be reported.
    parsed = parse(address_string)

    tagged_components: dict[str, list] = {}

    last_label = None
    is_intersection = False
    og_labels = []

    for token, label in parsed:
        if label == "IntersectionSeparator":
            is_intersection = True
        # Street-name labels after the separator belong to the second street.
        if "StreetName" in label and is_intersection:
            label = "Second" + label

        # Remember the label as it was before any remapping; the address type
        # below is decided on these.
        og_labels.append(label)
        if tag_mapping:
            label = tag_mapping.get(label) or label

        if label == last_label:
            tagged_components[label].append(token)
        elif label not in tagged_components:
            tagged_components[label] = [token]
        else:
            raise RepeatedLabelError(address_string, parsed, label)

        last_label = label

    tagged_address: dict[str, str] = {
        label: " ".join(tokens).strip(" ,;")
        for label, tokens in tagged_components.items()
    }

    if "AddressNumber" in og_labels and not is_intersection:
        address_type = "Street Address"
    elif is_intersection and "AddressNumber" not in og_labels:
        address_type = "Intersection"
    elif "USPSBoxID" in og_labels:
        address_type = "PO Box"
    else:
        address_type = "Ambiguous"

    return tagged_address, address_type


def tokenize(address_string: str) -> list[str]:
    """Split an address string into tokens.

    ``bytes`` input is decoded as UTF-8 first. HTML-escaped ampersands
    (``&#38;`` and ``&amp;``) are turned into ``&`` so that they come out as a
    single ``&`` token. Trailing ``. , ; )`` and newline characters stay
    attached to the token they follow; ``#`` and ``&`` are tokens of their own.

    Returns an empty list when nothing matches.
    """
    if isinstance(address_string, bytes):
        address_string = str(address_string, encoding="utf-8")
    address_string = _AMPERSAND_ENTITY_RE.sub("&", address_string)
    return _TOKEN_RE.findall(address_string)


Feature = dict[str, typing.Union[str, bool, "Feature"]]


def tokenFeatures(token: str) -> Feature:
    """Build the feature dict for a single token.

    The token is first stripped of leading non-word characters and of trailing
    characters other than word characters and periods (``&``, ``#`` and ``½``
    are used as they are); the features are computed from that cleaned form
    and from its lower-cased, period-free variant.
    """
    if token in ("&", "#", "½"):
        token_clean = token
    else:
        token_clean = re.sub(r"(^[\W]*)|([^.\w]*$)", "", token, flags=re.UNICODE)

    token_abbrev = re.sub(r"[.]", "", token_clean.lower())
    is_number = token_abbrev.isdigit()

    features = {
        # endswith() rather than [-1] so that a token that cleans down to the
        # empty string does not raise IndexError.
        "abbrev": token_clean.endswith("."),
        "digits": digits(token_clean),
        "word": (token_abbrev if not is_number else False),
        "trailing.zeros": (trailingZeros(token_abbrev) if is_number else False),
        "length": (
            "d:" + str(len(token_abbrev))
            if is_number
            else "w:" + str(len(token_abbrev))
        ),
        # NOTE(review): this pattern is not anchored at the end, so it is true
        # for any token with a character other than "." or a word character
        # after its first character. Left untouched because the trained model
        # was built on these exact feature values.
        "endsinpunc": (
            token[-1] if bool(re.match(r".+[^.\w]", token, flags=re.UNICODE)) else False
        ),
        "directional": token_abbrev in DIRECTIONS,
        "street_name": token_abbrev in STREET_NAMES,
        "has.vowels": bool(set(token_abbrev[1:]) & set("aeiou")),
    }

    return features


def tokens2features(address: list[str]) -> list[Feature]:
    """Build the feature sequence for a list of tokens.

    Each entry holds the token's own features plus a ``"previous"`` and/or
    ``"next"`` entry carrying the neighbouring token's own features. The first
    and last entries are flagged with ``"address.start"`` / ``"address.end"``,
    and so are the neighbour copies that refer to them.

    Returns an empty list for an empty token list.
    """
    if not address:
        return []

    feature_sequence = [tokenFeatures(address[0])]
    # Neighbour entries are copies taken before "previous"/"next" are added,
    # so they hold only the token's own features and never nest further.
    previous_features = feature_sequence[-1].copy()

    for token in address[1:]:
        token_features = tokenFeatures(token)
        current_features = token_features.copy()

        feature_sequence[-1]["next"] = current_features
        token_features["previous"] = previous_features

        feature_sequence.append(token_features)

        previous_features = current_features

    feature_sequence[0]["address.start"] = True
    feature_sequence[-1]["address.end"] = True

    if len(feature_sequence) > 1:
        feature_sequence[1]["previous"]["address.start"] = True  # type: ignore [index]
        feature_sequence[-2]["next"]["address.end"] = True  # type: ignore [index]

    return feature_sequence


def digits(token: str) -> typing.Literal["all_digits", "some_digits", "no_digits"]:
    """Classify a token by how much of it is made of digits."""
    if token.isdigit():
        return "all_digits"
    elif set(token) & set(string.digits):
        return "some_digits"
    else:
        return "no_digits"


# NOTE: the return type is left unannotated on purpose; as of 10/2024 mypy did
# not accept that this returns a str.
def trailingZeros(token):
    """Return the run of zeros at the end of ``token``, or "" if there is none."""
    match = re.search(r"0+$", token)
    return match.group() if match else ""


class RepeatedLabelError(probableparsing.RepeatedLabelError):
    """Raised by ``tag`` when a label recurs after a different label."""

    REPO_URL = "https://github.com/datamade/usaddress/issues/new"
    DOCS_URL = "https://usaddress.readthedocs.io/"