├── .gitignore
├── .travis.yml
├── LICENSE
├── MANIFEST.in
├── README.md
├── docs
    ├── Makefile
    ├── how_to_build.txt
    ├── make.bat
    └── source
    │   ├── README.rst
    │   ├── conf.py
    │   ├── geoparse.rst
    │   ├── index.rst
    │   ├── modules.rst
    │   ├── mordecai.rst
    │   └── mordecai.tests.rst
├── examples
    ├── README.md
    ├── geocode_cities.csv
    ├── geocode_cities.py
    └── out.csv
├── mordecai
    ├── MANIFEST.in
    ├── __init__.py
    ├── data
    │   ├── admin1CodesASCII.json
    │   ├── countries.json
    │   ├── feature_codes.txt
    │   ├── nat_df.csv
    │   └── stopword_country_names.json
    ├── geoparse.py
    ├── models
    │   ├── country_model.h5
    │   ├── country_model_multi.h5
    │   └── rank_model.h5
    ├── tests
    │   ├── __init__.py
    │   ├── conftest.py
    │   └── test_mordecai.py
    └── utilities.py
├── paper
    ├── mordecai_geoparsing.png
    ├── paper.bib
    └── paper.md
├── requirements.txt
├── setup.cfg
├── setup.py
└── train
    ├── train_country_model.py
    └── train_ranker.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | .DS_Store
 2 | *.swp
 3 | .ropeproject
 4 | *.pyc
 5 | *.bin.gz
 6 | *.tar.bz2
 7 | *data/MITIE-models
 8 | *.ipynb
 9 | .cache
10 | mordecai/.cache
11 | build/
12 | dist/
13 | mordecai.egg-info
14 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | sudo: required
 2 | 
 3 | language: python
 4 | 
 5 | python:
 6 |   - 3.6
 7 | 
 8 | dist: trusty
 9 | 
10 | services:
11 |   - docker
12 | 
13 | before_install:
14 |   - docker pull elasticsearch:5.5.2
15 |   - wget 	https://s3.amazonaws.com/ahalterman-geo/geonames_index.tar.gz --output-file=wget_log.txt
16 |   - tar -xzf geonames_index.tar.gz
17 |   - docker run -d -p 127.0.0.1:9200:9200 -v $(pwd)/geonames_index/:/usr/share/elasticsearch/data elasticsearch:5.5.2
18 | 
19 | install:
20 |   - travis_wait pip install -r requirements.txt --quiet
21 | 
22 | script:
23 |   - pytest 
24 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2017 Andy Halterman, 2015 Caerus Associates
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include mordecai/data/ *
2 | recursive-include mordecai/models/ *
3 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ![](paper/mordecai_geoparsing.png)
  2 | 
  3 | Full text geoparsing as a Python library. Extract the place names from a
  4 | piece of English-language text, resolve them to the correct place, and return
  5 | their coordinates and structured geographic information.
  6 | 
  7 | **Mordecai is ready for an upgrade!** Please take the user survey [here](https://z0l4ihmu0ud.typeform.com/to/b8FmCfMt) 
  8 | to help shape what v3 will look like.
  9 | 
 10 | Example usage
 11 | -------------
 12 | 
 13 | ```
 14 | >>> from mordecai import Geoparser
 15 | >>> geo = Geoparser()
 16 | >>> geo.geoparse("I traveled from Oxford to Ottawa.")
 17 | 
 18 | [{'country_conf': 0.96474487,
 19 |   'country_predicted': 'GBR',
 20 |   'geo': {'admin1': 'England',
 21 |    'country_code3': 'GBR',
 22 |    'feature_class': 'P',
 23 |    'feature_code': 'PPLA2',
 24 |    'geonameid': '2640729',
 25 |    'lat': '51.75222',
 26 |    'lon': '-1.25596',
 27 |    'place_name': 'Oxford'},
 28 |   'spans': [{'end': 22, 'start': 16}],
 29 |   'word': 'Oxford'},
 30 |  {'country_conf': 0.83302397,
 31 |   'country_predicted': 'CAN',
 32 |   'geo': {'admin1': 'Ontario',
 33 |    'country_code3': 'CAN',
 34 |    'feature_class': 'P',
 35 |    'feature_code': 'PPLC',
 36 |    'geonameid': '6094817',
 37 |    'lat': '45.41117',
 38 |    'lon': '-75.69812',
 39 |    'place_name': 'Ottawa'},
 40 |   'spans': [{'end': 32, 'start': 26}],
 41 |   'word': 'Ottawa'}]
 42 | ```
 43 | 
 44 | Mordecai requires a running Elasticsearch service with Geonames in it. See
 45 | "Installation" below for instructions.
 46 | 
 47 | 
 48 | Installation and Requirements
 49 | --------------------
 50 | 
 51 | 1. Mordecai is on PyPI and can be installed for Python 3 with pip:
 52 | 
 53 | ```
 54 | pip install mordecai
 55 | ```
 56 | 
 57 | **Note**: It's *strongly* recommended that you run Mordecai in a virtual
 58 | environment. The libraries that Mordecai depends on are not always the most
 59 | recent versions and using a virtual environment prevents libraries from being
 60 | downgraded or running into other issues:
 61 | 
 62 | ```
 63 | python -m venv mordecai-env
 64 | source mordecai-env/bin/activate
 65 | pip install mordecai
 66 | ```
 67 | 
 68 | 2. You should then download the required spaCy NLP model:
 69 | 
 70 | ```
 71 | python -m spacy download en_core_web_lg
 72 | ```
 73 | 
 74 | 3. In order to work, Mordecai needs access to a Geonames gazetteer running in
 75 | Elasticsearch. The easiest way to set it up is by running the following
 76 | commands (you must have [Docker](https://docs.docker.com/engine/installation/)
 77 | installed first).
 78 | 
 79 | ```
 80 | docker pull elasticsearch:5.5.2
 81 | wget https://andrewhalterman.com/files/geonames_index.tar.gz --output-file=wget_log.txt
 82 | tar -xzf geonames_index.tar.gz
 83 | docker run -d -p 127.0.0.1:9200:9200 -v $(pwd)/geonames_index/:/usr/share/elasticsearch/data elasticsearch:5.5.2
 84 | ```
 85 | 
 86 | See the [es-geonames](https://github.com/openeventdata/es-geonames) repository for the code
 87 | used to produce this index.
 88 | 
 89 | To update the index, simply shut down the old container, re-download the index
 90 | from the URL above, and restart the container with the new index.
 91 | 
 92 | Citing
 93 | ------
 94 | 
 95 | If you use this software in academic work, please cite as 
 96 | 
 97 | ```
 98 | @article{halterman2017mordecai,
 99 |   title={Mordecai: Full Text Geoparsing and Event Geocoding},
100 |   author={Halterman, Andrew},
101 |   journal={The Journal of Open Source Software},
102 |   volume={2},
103 |   number={9},
104 |   year={2017},
105 |   doi={10.21105/joss.00091}
106 | }
107 | ```
108 | 
109 | How does it work?
110 | -----------------
111 | 
112 | Mordecai takes in unstructured text and returns structured geographic information extracted
113 | from it. 
114 | 
115 | - It uses [spaCy](https://github.com/explosion/spaCy/)'s named entity recognition to
116 |   extract placenames from the text.
117 | 
118 | - It uses the [geonames](http://www.geonames.org/)
119 |   gazetteer in an [Elasticsearch](https://www.elastic.co/products/elasticsearch) index 
120 |   (with some custom logic) to find the potential coordinates of
121 |   extracted place names.
122 | 
123 | - It uses neural networks implemented in [Keras](https://keras.io/) and trained on new annotated
124 |   English-language data labeled with [Prodigy](https://prodi.gy/) to infer the correct country and correct gazetteer entries for each
125 |   placename. 
126 | 
127 | The training data for the two models includes copyrighted text so cannot be
128 | shared freely. Applying Mordecai to non-English language text would require labeling data
129 | in the target language and retraining.
130 | 
131 | API and Configuration
132 | ---------------------
133 | 
134 | When instantiating the `Geoparser()` module, the following options can be changed:
135 | 
136 | - `es_hosts` : List of hosts where the Geonames Elasticsearch service is
137 |     running. Defaults to `['localhost']`, which is where it runs if you're using
138 |     the default Docker setup described above.
139 | - `es_port` : What port the Geonames Elasticsearch service is running on.
140 |     Defaults to `9200`, which is where the Docker setup has it
141 | - `es_ssl` : Whether Elasticsearch requires an SSL connection.
142 |     Defaults to `False`.
143 | - `es_auth` : Optional HTTP auth parameters to use with ES.
144 |     If provided, it should be a two-tuple of `(user, password)`.
145 | - `country_confidence` : Set the country model confidence below which no
146 |     geolocation will be returned. If it's really low, the model's probably
147 |     wrong and will return weird results. Defaults to `0.6`. 
148 | - `verbose` : Return all the features used in the country picking model?
149 |     Defaults to `False`. 
150 | - `threads`: whether to use threads to make parallel queries to the
151 |     Elasticsearch database. Defaults to `True`, which gives a ~6x speedup.
152 | 
153 | `geoparse` is the primary endpoint and the only one that most users will need.
154 | Other, mostly internal, methods may be useful in some cases:
155 | 
156 | - `lookup_city` takes a city name, country, and (optionally) ADM1/state/governorate and 
157 |     does a rule-based lookup for the city.
158 | - `infer_country` takes a document and attempts to infer the most probable
159 |     country for each.
160 | - `query_geonames` and `query_geonames_country` can be used for performing a
161 |     search over Geonames in Elasticsearch
162 | - methods with the `_feature` prefix are internal methods for
163 |     calculating country picking features from text.
164 | 
165 | `batch_geoparse` takes in a list of documents and uses spaCy's `nlp.pipe`
166 | method to process them more efficiently in the NLP step. 
167 | 
168 | Advanced users on large machines can increase the `lru_cache` parameter from 250
169 | to 1000. This will use more memory but will increase parsing speed.
170 | 
171 | Tests
172 | -----
173 | 
174 | Mordecai includes unit tests. To run the tests, `cd` into the
175 | `mordecai` directory and run:
176 | 
177 | ```
178 | pytest
179 | ```
180 | 
181 | The tests require access to a running Elastic/Geonames service to
182 | complete. The tests are currently failing on TravisCI with an unexplained
183 | segfault but run fine locally. Mordecai has only been tested with Python 3.
184 | 
185 | 
186 | Acknowledgements
187 | ----------------
188 | 
189 | An earlier version of this software was donated to the Open Event Data Alliance
190 | by Caerus Associates.  See [Releases](https://github.com/openeventdata/mordecai/releases) 
191 | or the [legacy-docker](https://github.com/openeventdata/mordecai/tree/legacy-docker) branch for the
192 | 2015-2016 and the 2016-2017 production versions of Mordecai.
193 | 
194 | This work was funded in part by DARPA's XDATA program, the U.S. Army Research
195 | Laboratory and the U.S. Army Research Office through the Minerva Initiative
196 | under grant number W911NF-13-0332, and the National Science Foundation under
197 | award number SBE-SMA-1539302. Any opinions, findings, and conclusions or
198 | recommendations expressed in this material are those of the authors and do not
199 | necessarily reflect the views of DARPA, ARO, Minerva, NSF, or the U.S.
200 | government.
201 | 
202 | 
203 | Contributing
204 | ------------
205 | 
206 | Contributions via pull requests are welcome. Please make sure that changes
207 | pass the unit tests. Any bugs and problems can be reported
208 | on the repo's [issues page](https://github.com/openeventdata/mordecai/issues).
209 | 
210 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line.
 5 | SPHINXOPTS    =
 6 | SPHINXBUILD   = sphinx-build
 7 | SPHINXPROJ    = mordecai
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)


--------------------------------------------------------------------------------
/docs/how_to_build.txt:
--------------------------------------------------------------------------------
1 | # convert docstrings to restructured text
2 | sphinx-apidoc -f -o source/ ../mordecai
3 | # build the docs
4 | make html
5 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | set SPHINXPROJ=mordecai
13 | 
14 | if "%1" == "" goto help
15 | 
16 | %SPHINXBUILD% >NUL 2>NUL
17 | if errorlevel 9009 (
18 | 	echo.
19 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
20 | 	echo.installed, then set the SPHINXBUILD environment variable to point
21 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
22 | 	echo.may add the Sphinx directory to PATH.
23 | 	echo.
24 | 	echo.If you don't have Sphinx installed, grab it from
25 | 	echo.http://sphinx-doc.org/
26 | 	exit /b 1
27 | )
28 | 
29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
30 | goto end
31 | 
32 | :help
33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
34 | 
35 | :end
36 | popd
37 | 


--------------------------------------------------------------------------------
/docs/source/README.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../../README.md
2 | 


--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | # -*- coding: utf-8 -*-
  3 | #
  4 | # mordecai documentation build configuration file, created by
  5 | # sphinx-quickstart on Mon Nov 20 12:24:51 2017.
  6 | #
  7 | # This file is execfile()d with the current directory set to its
  8 | # containing dir.
  9 | #
 10 | # Note that not all possible configuration values are present in this
 11 | # autogenerated file.
 12 | #
 13 | # All configuration values have a default; values that are commented out
 14 | # serve to show the default.
 15 | 
 16 | # If extensions (or modules to document with autodoc) are in another directory,
 17 | # add these directories to sys.path here. If the directory is relative to the
 18 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 19 | #
 20 | import os
 21 | import sys
 22 | sys.path.insert(0, os.path.abspath('../../mordecai'))
 23 | 
 24 | 
 25 | # -- General configuration ------------------------------------------------
 26 | 
 27 | # If your documentation needs a minimal Sphinx version, state it here.
 28 | #
 29 | # needs_sphinx = '1.0'
 30 | 
 31 | # Add any Sphinx extension module names here, as strings. They can be
 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 33 | # ones.
 34 | extensions = ['sphinx.ext.autodoc', 'sphinxcontrib.napoleon']
 35 | 
 36 | # Add any paths that contain templates here, relative to this directory.
 37 | templates_path = ['_templates']
 38 | 
 39 | # The suffix(es) of source filenames.
 40 | # You can specify multiple suffix as a list of string:
 41 | #
 42 | # source_suffix = ['.rst', '.md']
 43 | source_suffix = '.rst'
 44 | 
 45 | # The master toctree document.
 46 | master_doc = 'index'
 47 | 
 48 | # General information about the project.
 49 | project = 'mordecai'
 50 | copyright = '2017, Andy Halterman'
 51 | author = 'Andy Halterman'
 52 | 
 53 | # The version info for the project you're documenting, acts as replacement for
 54 | # |version| and |release|, also used in various other places throughout the
 55 | # built documents.
 56 | #
 57 | # The short X.Y version.
 58 | version = '2.0.0'
 59 | # The full version, including alpha/beta/rc tags.
 60 | release = '2.0.0a1'
 61 | 
 62 | # The language for content autogenerated by Sphinx. Refer to documentation
 63 | # for a list of supported languages.
 64 | #
 65 | # This is also used if you do content translation via gettext catalogs.
 66 | # Usually you set "language" from the command line for these cases.
 67 | language = None
 68 | 
 69 | # List of patterns, relative to source directory, that match files and
 70 | # directories to ignore when looking for source files.
 71 | # This patterns also effect to html_static_path and html_extra_path
 72 | exclude_patterns = []
 73 | 
 74 | # The name of the Pygments (syntax highlighting) style to use.
 75 | pygments_style = 'sphinx'
 76 | 
 77 | # If true, `todo` and `todoList` produce output, else they produce nothing.
 78 | todo_include_todos = False
 79 | 
 80 | 
 81 | # -- Options for HTML output ----------------------------------------------
 82 | 
 83 | # The theme to use for HTML and HTML Help pages.  See the documentation for
 84 | # a list of builtin themes.
 85 | #
 86 | html_theme = 'alabaster'
 87 | 
 88 | # Theme options are theme-specific and customize the look and feel of a theme
 89 | # further.  For a list of options available for each theme, see the
 90 | # documentation.
 91 | #
 92 | # html_theme_options = {}
 93 | 
 94 | # Add any paths that contain custom static files (such as style sheets) here,
 95 | # relative to this directory. They are copied after the builtin static files,
 96 | # so a file named "default.css" will overwrite the builtin "default.css".
 97 | html_static_path = ['_static']
 98 | 
 99 | 
100 | # -- Options for HTMLHelp output ------------------------------------------
101 | 
102 | # Output file base name for HTML help builder.
103 | htmlhelp_basename = 'mordecaidoc'
104 | 
105 | 
106 | # -- Options for LaTeX output ---------------------------------------------
107 | 
108 | latex_elements = {
109 |     # The paper size ('letterpaper' or 'a4paper').
110 |     #
111 |     # 'papersize': 'letterpaper',
112 | 
113 |     # The font size ('10pt', '11pt' or '12pt').
114 |     #
115 |     # 'pointsize': '10pt',
116 | 
117 |     # Additional stuff for the LaTeX preamble.
118 |     #
119 |     # 'preamble': '',
120 | 
121 |     # Latex figure (float) alignment
122 |     #
123 |     # 'figure_align': 'htbp',
124 | }
125 | 
126 | # Grouping the document tree into LaTeX files. List of tuples
127 | # (source start file, target name, title,
128 | #  author, documentclass [howto, manual, or own class]).
129 | latex_documents = [
130 |     (master_doc, 'mordecai.tex', 'mordecai Documentation',
131 |      'Andy Halterman', 'manual'),
132 | ]
133 | 
134 | 
135 | # -- Options for manual page output ---------------------------------------
136 | 
137 | # One entry per manual page. List of tuples
138 | # (source start file, name, description, authors, manual section).
139 | man_pages = [
140 |     (master_doc, 'mordecai', 'mordecai Documentation',
141 |      [author], 1)
142 | ]
143 | 
144 | 
145 | # -- Options for Texinfo output -------------------------------------------
146 | 
147 | # Grouping the document tree into Texinfo files. List of tuples
148 | # (source start file, target name, title, author,
149 | #  dir menu entry, description, category)
150 | texinfo_documents = [
151 |     (master_doc, 'mordecai', 'mordecai Documentation',
152 |      author, 'mordecai', 'One line description of project.',
153 |      'Miscellaneous'),
154 | ]
155 | 
156 | 
157 | 
158 | 


--------------------------------------------------------------------------------
/docs/source/geoparse.rst:
--------------------------------------------------------------------------------
1 | Geoparse documentation
2 | ======================
3 | 
4 | .. automodule:: mordecai.geoparse
5 |    :members:
6 | 


--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | .. mordecai documentation master file, created by
 2 |    sphinx-quickstart on Mon Nov 20 12:24:51 2017.
 3 |    You can adapt this file completely to your liking, but it should at least
 4 |    contain the root `toctree` directive.
 5 | 
 6 | Welcome to mordecai's documentation!
 7 | ====================================
 8 | 
 9 | .. toctree::
10 |    :maxdepth: 2
11 |    :caption: Contents:
12 | 
13 |    geoparse
14 | 
15 | 
16 | Indices and tables
17 | ==================
18 | 
19 | * :ref:`genindex`
20 | * :ref:`modindex`
21 | * :ref:`search`
22 | 


--------------------------------------------------------------------------------
/docs/source/modules.rst:
--------------------------------------------------------------------------------
1 | mordecai
2 | ========
3 | 
4 | .. toctree::
5 |    :maxdepth: 4
6 | 
7 |    mordecai
8 | 


--------------------------------------------------------------------------------
/docs/source/mordecai.rst:
--------------------------------------------------------------------------------
 1 | mordecai package
 2 | ================
 3 | 
 4 | Subpackages
 5 | -----------
 6 | 
 7 | .. toctree::
 8 | 
 9 |     mordecai.tests
10 | 
11 | Submodules
12 | ----------
13 | 
14 | mordecai.geoparse module
15 | ------------------------
16 | 
17 | .. automodule:: mordecai.geoparse
18 |     :members:
19 |     :undoc-members:
20 |     :show-inheritance:
21 | 
22 | mordecai.utilities module
23 | -------------------------
24 | 
25 | .. automodule:: mordecai.utilities
26 |     :members:
27 |     :undoc-members:
28 |     :show-inheritance:
29 | 
30 | 
31 | Module contents
32 | ---------------
33 | 
34 | .. automodule:: mordecai
35 |     :members:
36 |     :undoc-members:
37 |     :show-inheritance:
38 | 


--------------------------------------------------------------------------------
/docs/source/mordecai.tests.rst:
--------------------------------------------------------------------------------
 1 | mordecai.tests package
 2 | ======================
 3 | 
 4 | Submodules
 5 | ----------
 6 | 
 7 | mordecai.tests.conftest module
 8 | ------------------------------
 9 | 
10 | .. automodule:: mordecai.tests.conftest
11 |     :members:
12 |     :undoc-members:
13 |     :show-inheritance:
14 | 
15 | mordecai.tests.test_mordecai module
16 | -----------------------------------
17 | 
18 | .. automodule:: mordecai.tests.test_mordecai
19 |     :members:
20 |     :undoc-members:
21 |     :show-inheritance:
22 | 
23 | 
24 | Module contents
25 | ---------------
26 | 
27 | .. automodule:: mordecai.tests
28 |     :members:
29 |     :undoc-members:
30 |     :show-inheritance:
31 | 


--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
 1 | # Mordecai examples
 2 | 
 3 | ## Geocoding cities
 4 | 
 5 | This script is an example use of `geo.lookup_city()`. It takes a CSV with
 6 | columns for city names, (optionally) state/ADM1 names, and 3-letter country
 7 | codes. If the columns are named (respectively) `city`, `adm1`, and
 8 | `country`, you can run it like this:
 9 | 
10 | ```
11 | python geocode_cities.py geocode_cities.csv out.csv
12 | ```
13 | 
14 | Otherwise, you'll have to specify the column names as part of the call. The
15 | geocoder returns lat/lon and Geonames information, as well as providing the
16 | reason for why it selected a particular location and cautions when the results
17 | were ambiguous. 


--------------------------------------------------------------------------------
/examples/geocode_cities.csv:
--------------------------------------------------------------------------------
1 | city,adm1,country
2 | Norman,OK,USA
3 | College Park,MD,USA
4 | Cambridge,MA,USA
5 | Whaugbggoan,OK,USA
6 | Columbia Heights,DC,USA
7 | Aleppo,Aleppo,SYR
8 | 


--------------------------------------------------------------------------------
/examples/geocode_cities.py:
--------------------------------------------------------------------------------
 1 | import plac
 2 | import pandas as pd
 3 | from mordecai import Geoparser
 4 | from tqdm import tqdm
 5 | 
 6 | 
 7 | def main(in_file: ("input CSV file"), 
 8 |         out_file: ("filename to write ouput to"), 
 9 |         city_col: ("column in CSV with city col") = "city",
10 |          adm1_col: ("column in CSV with state/governorate/ADM1") = "adm1", 
11 |          country_col: ("column in CSV with country name") = "country"):
12 |     """Geocode a csv with a city, ADM1, and country columns."""
13 |     print("Loading Mordecai...")
14 |     geo = Geoparser() 
15 |     df = pd.read_csv(in_file)
16 |     geocoded = []
17 |     print("Geocoding...")
18 |     for i in tqdm(df.iterrows()):
19 |         row = i[1]
20 |         if pd.isnull(row[adm1_col]):
21 |             # Elasticsearch doesn't like NaN, change to None
22 |             adm1 = None
23 |         else:
24 |             adm1 = row[adm1_col] 
25 |         res = geo.lookup_city(city = row[city_col], 
26 |                               adm1 = adm1, 
27 |                               country = row[country_col])
28 |         try:
29 |             gc = {"admin1_code" : res['geo']['admin1_code'],
30 |                   "admin2_code": res['geo']['admin2_code'],
31 |                   "asciiname": res['geo']['asciiname'],
32 |                   "name": res['geo']['name'],
33 |                   "geonameid": res['geo']['geonameid'],
34 |                   "feature_class": res['geo']['feature_class'],
35 |                   "feature_code": res['geo']['feature_code'],
36 |                   "country_code3": res['geo']['country_code3'],
37 |                   "lat": float(res['geo']['coordinates'].split(",")[0]),
38 |                   "lon": float(res['geo']['coordinates'].split(",")[1])}
39 |         except TypeError:
40 |             gc = {"admin1_code" : "",
41 |                   "admin2_code": "",
42 |                   "asciiname": "",
43 |                   "name": "",
44 |                   "geonameid": "",
45 |                   "feature_class": "",
46 |                   "feature_code": "", 
47 |                   "country_code3": "",
48 |                   "lat": "",
49 |                   "lon": ""}
50 |         gc['search_city'] = row[city_col]
51 |         gc['search_adm1'] = row[adm1_col]
52 |         gc['search_country'] = row[country_col]
53 |         gc["info"] = res['info']
54 |         gc["reason"] = res['reason']
55 |         geocoded.append(gc)
56 |     geo_df = pd.DataFrame(geocoded)
57 |     geo_df.to_csv(out_file)
58 |     print("Wrote file out to ", out_file)
59 | 
60 |     
61 | if __name__ == '__main__':
62 |     plac.call(main)


--------------------------------------------------------------------------------
/examples/out.csv:
--------------------------------------------------------------------------------
1 | ,admin1_code,admin2_code,asciiname,name,geonameid,feature_class,feature_code,country_code3,lat,lon,search_city,search_adm1,search_country,info,reason
2 | 0,OK,027,Norman,Norman,4543762,P,PPLA2,USA,35.22257,-97.43948,Norman,OK,USA,50 total results of all types,"Single match for city in Elasticsearch with name, ADM1, country."
3 | 1,MD,033,College Park,College Park,4351977,P,PPL,USA,38.98067,-76.93692,College Park,MD,USA,2 elasticsearch matches for cities out of 37 total results of all types,Exact name match for city.
4 | 2,ID,005,Cambridge,Cambridge,5587778,P,PPL,USA,42.45047,-112.11663,Cambridge,MA,USA,33 entries within minimum edit distance. Picking closest average distance: 2.25.,CAUTION: Best of several edit distance matches.
5 | 3,,,,,,,,,,,Whaugbggoan,OK,USA,0 total results of all types.,FAILURE: No fuzzy match for city or neighborhood.
6 | 4,DC,001,Columbia Heights,Columbia Heights,4138102,P,PPL,USA,38.92567,-77.02942,Columbia Heights,DC,USA,6 total results of all types,"Single match for city in Elasticsearch with name, ADM1, country."
7 | 5,09,,Aleppo,Aleppo,170063,P,PPLA,SYR,36.20124,37.16117,Aleppo,Aleppo,SYR,9 total results of all types,"Single match for city in Elasticsearch with name, ADM1, country."
8 | 


--------------------------------------------------------------------------------
/mordecai/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include data/nat_df.csv
2 | 


--------------------------------------------------------------------------------
/mordecai/__init__.py:
--------------------------------------------------------------------------------
1 | from .geoparse import Geoparser
2 | 
3 | __version__ = "2.1.0"
4 | 


--------------------------------------------------------------------------------
/mordecai/data/countries.json:
--------------------------------------------------------------------------------
1 | "{\"Afghanistan\":\"AFG\", \"\u00c5land Islands\":\"ALA\", \"Albania\":\"ALB\", \"Algeria\":\"DZA\", \"American Samoa\":\"ASM\", \"Andorra\":\"AND\", \"Angola\":\"AGO\", \"Anguilla\":\"AIA\", \"Antarctica\":\"ATA\", \"Antigua Barbuda\":\"ATG\", \"Argentina\":\"ARG\", \"Armenia\":\"ARM\", \"Aruba\":\"ABW\", \"Ascension_Island\":\"NA\", \"Australia\":\"AUS\", \"Austria\":\"AUT\", \"Azerbaijan\":\"AZE\", \"Bahamas\":\"BHS\", \"Bahrain\":\"BHR\", \"Bangladesh\":\"BGD\", \"Barbados\":\"BRB\", \"Belarus\":\"BLR\", \"Belgium\":\"BEL\", \"Belize\":\"BLZ\", \"Benin\":\"BEN\", \"Bermuda\":\"BMU\", \"Bhutan\":\"BTN\", \"Bolivia\":\"BOL\", \"Bosnia_Herzegovina\":\"BIH\", \"Botswana\":\"BWA\", \"Bouvet Island\":\"BVT\", \"Brazil\":\"BRA\", \"Britain\":\"GBR\", \"Great_Britain\":\"GBR\", \"British Virgin Islands\":\"VGB\", \"Brunei\":\"BRN\", \"Bulgaria\":\"BGR\", \"Burkina_Faso\":\"BFA\", \"Burundi\":\"BDI\", \"Cambodia\":\"KHM\", \"Cameroon\":\"CMR\", \"Canada\":\"CAN\",\"Cape Verde\":\"CPV\", \"Cayman_Islands\":\"CYM\", \"Central African Republic\":\"CAF\", \"Chad\":\"TCD\", \"Chile\":\"CHL\", \"China\":\"CHN\", \"Cocos_Islands\":\"CCK\", \"Colombia\":\"COL\", \"Comoros\":\"COM\", \"Congo Brazzaville\":\"COG\", \"Congo Kinshasa\":\"COD\", \"Congo\":\"COG\", \"Cook_Islands\":\"COK\", \"Costa_Rica\":\"CRI\", \"C Kinshasa\":\"COD\", \"Congo\":\"COG\", \"Cook_Islands\":\"COK\", \"Costa_Rica\":\"CRI\",ur Kinshasa\":\"COD\", \"Congo\":\"COG\", \"Cook_Islands\":\"COK\", \"Costa_R:\" Kinshasa\":\"COD\", \"Congo\":\"COG\", \"Cook_Islands\":\"COK\", \"Costa_Rica\":\"CRI\"Ecua Kinshasa\":\"COD\", \"Congo\":\"COG\", \"Cook_Islands\":\"COK\", \"Costa_Rica\":\"GNQ\", \"Eritrea\":\"ERI\", \"Estonia\":\"EST\", \"Ethiopia\":\"ETH\", \"Falkland_Islands \"Eritrea\":\"ERI\", \"Estonia\":\"EST\", \"Ethiopia\":\"ETH\", \"Falkland, \"France\":\"FRA\", \"French_Guiana\":\"GUF\", \"French_Polynesia\":\"PYF\",\"Gabon\": \"French_Guiana\":\"GUF\", \"French_Polynesia\":\"PYF\",\"Gabon\": 
\"French_Guiana\":\"GUa\":\"GHA\", \"Gibraltar\":\"GIB\", \"Greece\":\"GRC\", \"Greenland\":\"GRL\", \"Grenada\":\"GRD\", \"Guadeloupe\":\"GLP\", \"Guam\":\"GUM\", \"Guatemala\":\"GTM\", \"Guernsey\":\"GGY\", \"Guinea\":\"GIN\", \"Guinea_Bissau\":\"GNB\", \"Guyana\":\"GUY\", \"Haiti\":\"HTI\",\"Honduras\":\"HND\", \"Hong_Kong\":\"HKG\", \"Hungary\":\"HUN\", \"Iceland\":\"ISL\", \"India\":\"IND\", \"Indonesia\":\"IDN\", \"Iran\":\"IRN\", \"Iraq\":\"IRQ\", \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"IreD\" \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ire\", \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\"G\" \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Iue\" \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ir\", \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Irelandtil \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\":\":\" \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": th Mayen\":\"SJM\", \"Swaziland\":\"SWZ\",\"Sweden\":\"SWE\", \"Switzerland\":\"CHE\", \"Syria\":\"SYR\", \"Taiwan\":\"TWN\",\"Tajikistan\":\"TJK\", \"Tanzania\":\"TZA\", \"Thailand\":\"THA\", \"Timor Leste\":\"TLS\",\"East_Timor\":\"TLS\",\"Togo\":\"TGO\", \"Tokelau\":\"TKL\", \"Tonga\":\"TON\", \"TrinidadTobago\":\"TTO\", 
\"Tunisia\":\"TUN\", \"Turkey\":\"TUR\", \"Turkmenistan\":\"TKM\", \"TurksCaicos Islands\":\"TCA\", \"Tuvalu\":\"TUV\", \"U.S. Minor Outlying Islands\":\"UMI\",\"Virgin_Islands\":\"VIR\", \"Uganda\":\"UGA\", \"Ukraine\":\"UKR\",\"United_Arab_Emirates\":\"ARE\", \"United_Kingdom\":\"GBR\", \"UK\":\"GBR\",\"United_States\":\"USA\", \"USA\":\"USA\", \"America\":\"USA\", \"Uruguay\":\"URY\",\"Uzbekistan\":\"UZB\", \"Vanuatu\":\"VUT\", \"Vatican\":\"VAT\", \"Venezuela\":\"VEN\",\"Vietnam\":\"VNM\", \"Wallis Futuna\":\"WLF\", \"Western_Sahara\":\"ESH\", \"Yemen\":\"YEM\",\"Zambia\":\"ZMB\", \"Zimbabwe\":\"ZWE\"}"
2 | 


--------------------------------------------------------------------------------
/mordecai/data/feature_codes.txt:
--------------------------------------------------------------------------------
  1 | A	ADM1	A.ADM1	first-order administrative division	a primary administrative division of a country, such as a state in the United States
  2 | A	ADM2	A.ADM2	second-order administrative division	a subdivision of a first-order administrative division
  3 | A	ADM3	A.ADM3	third-order administrative division	a subdivision of a second-order administrative division
  4 | A	ADM4	A.ADM4	fourth-order administrative division	a subdivision of a third-order administrative division
  5 | A	ADM5	A.ADM5	fifth-order administrative division	a subdivision of a fourth-order administrative division
  6 | A	ADMD	A.ADMD	administrative division	an administrative division of a country, undifferentiated as to administrative level
  7 | A	LTER	A.LTER	leased area	a tract of land leased to another country, usually for military installations
  8 | A	PCL	A.PCL	political entity
  9 | A	PCLD	A.PCLD	dependent political entity
 10 | A	PCLF	A.PCLF	freely associated state
 11 | A	PCLI	A.PCLI	independent political entity
 12 | A	PCLIX	A.PCLIX	section of independent political entity
 13 | A	PCLS	A.PCLS	semi-independent political entity
 14 | A	PRSH	A.PRSH	parish	an ecclesiastical district
 15 | A	TERR	A.TERR	territory
 16 | A	ZN	A.ZN	zone
 17 | A	ZNB	A.ZNB	buffer zone	a zone recognized as a buffer between two nations in which military presence is minimal or absent
 18 | H	AIRS	H.AIRS	seaplane landing area	a place on a waterbody where floatplanes land and take off
 19 | H	ANCH	H.ANCH	anchorage	an area where vessels may anchor
 20 | H	BAY	H.BAY	bay	a coastal indentation between two capes or headlands, larger than a cove but smaller than a gulf
 21 | H	BAYS	H.BAYS	bays	coastal indentations between two capes or headlands, larger than a cove but smaller than a gulf
 22 | H	BGHT	H.BGHT	bight(s)	an open body of water forming a slight recession in a coastline
 23 | H	BNK	H.BNK	bank(s)	an elevation, typically located on a shelf, over which the depth of water is relatively shallow but sufficient for most surface navigation
 24 | H	BNKR	H.BNKR	stream bank	a sloping margin of a stream channel which normally confines the stream to its channel on land
 25 | H	BNKX	H.BNKX	section of bank
 26 | H	BOG	H.BOG	bog(s)	a wetland characterized by peat forming sphagnum moss, sedge, and other acid-water plants
 27 | H	CAPG	H.CAPG	icecap	a dome-shaped mass of glacial ice covering an area of mountain summits or other high lands; smaller than an ice sheet
 28 | H	CHN	H.CHN	channel	the deepest part of a stream, bay, lagoon, or strait, through which the main current flows
 29 | H	CHNL	H.CHNL	lake channel(s)	that part of a lake having water deep enough for navigation between islands, shoals, etc.
 30 | H	CHNM	H.CHNM	marine channel	that part of a body of water deep enough for navigation through an area otherwise not suitable
 31 | H	CHNN	H.CHNN	navigation channel	a buoyed channel of sufficient depth for the safe navigation of vessels
 32 | H	CNFL	H.CNFL	confluence	a place where two or more streams or intermittent streams flow together
 33 | H	CNL	H.CNL	canal	an artificial watercourse
 34 | H	CNLA	H.CNLA	aqueduct	a conduit used to carry water
 35 | H	CNLB	H.CNLB	canal bend	a conspicuously curved or bent section of a canal
 36 | H	CNLD	H.CNLD	drainage canal	an artificial waterway carrying water away from a wetland or from drainage ditches
 37 | H	CNLI	H.CNLI	irrigation canal	a canal which serves as a main conduit for irrigation water
 38 | H	CNLN	H.CNLN	navigation canal(s)	a watercourse constructed for navigation of vessels
 39 | H	CNLQ	H.CNLQ	abandoned canal
 40 | H	CNLSB	H.CNLSB	underground irrigation canal(s)	a gently inclined underground tunnel bringing water for irrigation from aquifers
 41 | H	CNLX	H.CNLX	section of canal
 42 | H	COVE	H.COVE	cove(s)	a small coastal indentation, smaller than a bay
 43 | H	CRKT	H.CRKT	tidal creek(s)	a meandering channel in a coastal wetland subject to bi-directional tidal currents
 44 | H	CRNT	H.CRNT	current	a horizontal flow of water in a given direction with uniform velocity
 45 | H	CUTF	H.CUTF	cutoff	a channel formed as a result of a stream cutting through a meander neck
 46 | H	DCK	H.DCK	dock(s)	a waterway between two piers, or cut into the land for the berthing of ships
 47 | H	DCKB	H.DCKB	docking basin	a part of a harbor where ships dock
 48 | H	DOMG	H.DOMG	icecap dome	a comparatively elevated area on an icecap
 49 | H	DPRG	H.DPRG	icecap depression	a comparatively depressed area on an icecap
 50 | H	DTCH	H.DTCH	ditch	a small artificial watercourse dug for draining or irrigating the land
 51 | H	DTCHD	H.DTCHD	drainage ditch	a ditch which serves to drain the land
 52 | H	DTCHI	H.DTCHI	irrigation ditch	a ditch which serves to distribute irrigation water
 53 | H	DTCHM	H.DTCHM	ditch mouth(s)	an area where a drainage ditch enters a lagoon, lake or bay
 54 | H	ESTY	H.ESTY	estuary	a funnel-shaped stream mouth or embayment where fresh water mixes with sea water under tidal influences
 55 | H	FISH	H.FISH	fishing area	a fishing ground, bank or area where fishermen go to catch fish
 56 | H	FJD	H.FJD	fjord	a long, narrow, steep-walled, deep-water arm of the sea at high latitudes, usually along mountainous coasts
 57 | H	FJDS	H.FJDS	fjords	long, narrow, steep-walled, deep-water arms of the sea at high latitudes, usually along mountainous coasts
 58 | H	FLLS	H.FLLS	waterfall(s)	a perpendicular or very steep descent of the water of a stream
 59 | H	FLLSX	H.FLLSX	section of waterfall(s)
 60 | H	FLTM	H.FLTM	mud flat(s)	a relatively level area of mud either between high and low tide lines, or subject to flooding
 61 | H	FLTT	H.FLTT	tidal flat(s)	a large flat area of mud or sand attached to the shore and alternately covered and uncovered by the tide
 62 | H	GLCR	H.GLCR	glacier(s)	a mass of ice, usually at high latitudes or high elevations, with sufficient thickness to flow away from the source area in lobes, tongues, or masses
 63 | H	GULF	H.GULF	gulf	a large recess in the coastline, larger than a bay
 64 | H	GYSR	H.GYSR	geyser	a type of hot spring with intermittent eruptions of jets of hot water and steam
 65 | H	HBR	H.HBR	harbor(s)	a haven or space of deep water so sheltered by the adjacent land as to afford a safe anchorage for ships
 66 | H	HBRX	H.HBRX	section of harbor
 67 | H	INLT	H.INLT	inlet	a narrow waterway extending into the land, or connecting a bay or lagoon with a larger body of water
 68 | H	INLTQ	H.INLTQ	former inlet	an inlet which has been filled in, or blocked by deposits
 69 | H	LBED	H.LBED	lake bed(s)	a dried up or drained area of a former lake
 70 | H	LGN	H.LGN	lagoon	a shallow coastal waterbody, completely or partly separated from a larger body of water by a barrier island, coral reef or other depositional feature
 71 | H	LGNS	H.LGNS	lagoons	shallow coastal waterbodies, completely or partly separated from a larger body of water by a barrier island, coral reef or other depositional feature
 72 | H	LGNX	H.LGNX	section of lagoon
 73 | H	LK	H.LK	lake	a large inland body of standing water
 74 | H	LKC	H.LKC	crater lake	a lake in a crater or caldera
 75 | H	LKI	H.LKI	intermittent lake
 76 | H	LKN	H.LKN	salt lake	an inland body of salt water with no outlet
 77 | H	LKNI	H.LKNI	intermittent salt lake
 78 | H	LKO	H.LKO	oxbow lake	a crescent-shaped lake commonly found adjacent to meandering streams
 79 | H	LKOI	H.LKOI	intermittent oxbow lake
 80 | H	LKS	H.LKS	lakes	large inland bodies of standing water
 81 | H	LKSB	H.LKSB	underground lake	a standing body of water in a cave
 82 | H	LKSC	H.LKSC	crater lakes	lakes in a crater or caldera
 83 | H	LKSI	H.LKSI	intermittent lakes
 84 | H	LKSN	H.LKSN	salt lakes	inland bodies of salt water with no outlet
 85 | H	LKSNI	H.LKSNI	intermittent salt lakes
 86 | H	LKX	H.LKX	section of lake
 87 | H	MFGN	H.MFGN	salt evaporation ponds	diked salt ponds used in the production of solar evaporated salt
 88 | H	MGV	H.MGV	mangrove swamp	a tropical tidal mud flat characterized by mangrove vegetation
 89 | H	MOOR	H.MOOR	moor(s)	an area of open ground overlaid with wet peaty soils
 90 | H	MRSH	H.MRSH	marsh(es)	a wetland dominated by grass-like vegetation
 91 | H	MRSHN	H.MRSHN	salt marsh	a flat area, subject to periodic salt water inundation, dominated by grassy salt-tolerant plants
 92 | H	NRWS	H.NRWS	narrows	a navigable narrow part of a bay, strait, river, etc.
 93 | H	OCN	H.OCN	ocean	one of the major divisions of the vast expanse of salt water covering part of the earth
 94 | H	OVF	H.OVF	overfalls	an area of breaking waves caused by the meeting of currents or by waves moving against the current
 95 | H	PND	H.PND	pond	a small standing waterbody
 96 | H	PNDI	H.PNDI	intermittent pond
 97 | H	PNDN	H.PNDN	salt pond	a small standing body of salt water often in a marsh or swamp, usually along a seacoast
 98 | H	PNDNI	H.PNDNI	intermittent salt pond(s)
 99 | H	PNDS	H.PNDS	ponds	small standing waterbodies
100 | H	PNDSF	H.PNDSF	fishponds	ponds or enclosures in which fish are kept or raised
101 | H	PNDSI	H.PNDSI	intermittent ponds
102 | H	PNDSN	H.PNDSN	salt ponds	small standing bodies of salt water often in a marsh or swamp, usually along a seacoast
103 | H	POOL	H.POOL	pool(s)	a small and comparatively still, deep part of a larger body of water such as a stream or harbor; or a small body of standing water
104 | H	POOLI	H.POOLI	intermittent pool
105 | H	RCH	H.RCH	reach	a straight section of a navigable stream or channel between two bends
106 | H	RDGG	H.RDGG	icecap ridge	a linear elevation on an icecap
107 | H	RDST	H.RDST	roadstead	an open anchorage affording less protection than a harbor
108 | H	RF	H.RF	reef(s)	a surface-navigation hazard composed of consolidated material
109 | H	RFC	H.RFC	coral reef(s)	a surface-navigation hazard composed of coral
110 | H	RFX	H.RFX	section of reef
111 | H	RPDS	H.RPDS	rapids	a turbulent section of a stream associated with a steep, irregular stream bed
112 | H	RSV	H.RSV	reservoir(s)	an artificial pond or lake
113 | H	RSVI	H.RSVI	intermittent reservoir
114 | H	RSVT	H.RSVT	water tank	a contained pool or tank of water at, below, or above ground level
115 | H	RVN	H.RVN	ravine(s)	a small, narrow, deep, steep-sided stream channel, smaller than a gorge
116 | H	SBKH	H.SBKH	sabkha(s)	a salt flat or salt encrusted plain subject to periodic inundation from flooding or high tides
117 | H	SD	H.SD	sound	a long arm of the sea forming a channel between the mainland and an island or islands; or connecting two larger bodies of water
118 | H	SEA	H.SEA	sea	a large body of salt water more or less confined by continuous land or chains of islands forming a subdivision of an ocean
119 | H	SHOL	H.SHOL	shoal(s)	a surface-navigation hazard composed of unconsolidated material
120 | H	SILL	H.SILL	sill	the low part of an underwater gap or saddle separating basins, including a similar feature at the mouth of a fjord
121 | H	SPNG	H.SPNG	spring(s)	a place where ground water flows naturally out of the ground
122 | H	SPNS	H.SPNS	sulphur spring(s)	a place where sulphur ground water flows naturally out of the ground
123 | H	SPNT	H.SPNT	hot spring(s)	a place where hot ground water flows naturally out of the ground
124 | H	STM	H.STM	stream	a body of running water moving to a lower level in a channel on land
125 | H	STMA	H.STMA	anabranch	a diverging branch flowing out of a main stream and rejoining it downstream
126 | H	STMB	H.STMB	stream bend	a conspicuously curved or bent segment of a stream
127 | H	STMC	H.STMC	canalized stream	a stream that has been substantially ditched, diked, or straightened
128 | H	STMD	H.STMD	distributary(-ies)	a branch which flows away from the main stream, as in a delta or irrigation canal
129 | H	STMH	H.STMH	headwaters	the source and upper part of a stream, including the upper drainage basin
130 | H	STMI	H.STMI	intermittent stream
131 | H	STMIX	H.STMIX	section of intermittent stream
132 | H	STMM	H.STMM	stream mouth(s)	a place where a stream discharges into a lagoon, lake, or the sea
133 | H	STMQ	H.STMQ	abandoned watercourse	a former stream or distributary no longer carrying flowing water, but still evident due to lakes, wetland, topographic or vegetation patterns
134 | H	STMS	H.STMS	streams	bodies of running water moving to a lower level in a channel on land
135 | H	STMSB	H.STMSB	lost river	a surface stream that disappears into an underground channel, or dries up in an arid area
136 | H	STMX	H.STMX	section of stream
137 | H	STRT	H.STRT	strait	a relatively narrow waterway, usually narrower and less extensive than a sound, connecting two larger bodies of water
138 | H	SWMP	H.SWMP	swamp	a wetland dominated by tree vegetation
139 | H	SYSI	H.SYSI	irrigation system	a network of ditches and one or more of the following elements: water supply, reservoir, canal, pump, well, drain, etc.
140 | H	TNLC	H.TNLC	canal tunnel	a tunnel through which a canal passes
141 | H	WAD	H.WAD	wadi	a valley or ravine, bounded by relatively steep banks, which in the rainy season becomes a watercourse; found primarily in North Africa and the Middle East
142 | H	WADB	H.WADB	wadi bend	a conspicuously curved or bent segment of a wadi
143 | H	WADJ	H.WADJ	wadi junction	a place where two or more wadies join
144 | H	WADM	H.WADM	wadi mouth	the lower terminus of a wadi where it widens into an adjoining floodplain, depression, or waterbody
145 | H	WADS	H.WADS	wadies	valleys or ravines, bounded by relatively steep banks, which in the rainy season become watercourses; found primarily in North Africa and the Middle East
146 | H	WADX	H.WADX	section of wadi
147 | H	WHRL	H.WHRL	whirlpool	a turbulent, rotating movement of water in a stream
148 | H	WLL	H.WLL	well	a cylindrical hole, pit, or tunnel drilled or dug down to a depth from which water, oil, or gas can be pumped or brought to the surface
149 | H	WLLQ	H.WLLQ	abandoned well
150 | H	WLLS	H.WLLS	wells	cylindrical holes, pits, or tunnels drilled or dug down to a depth from which water, oil, or gas can be pumped or brought to the surface
151 | H	WTLD	H.WTLD	wetland	an area subject to inundation, usually characterized by bog, marsh, or swamp vegetation
152 | H	WTLDI	H.WTLDI	intermittent wetland
153 | H	WTRC	H.WTRC	watercourse	a natural, well-defined channel produced by flowing water, or an artificial channel designed to carry flowing water
154 | H	WTRH	H.WTRH	waterhole(s)	a natural hole, hollow, or small depression that contains water, used by man and animals, especially in arid areas
155 | L	AGRC	L.AGRC	agricultural colony	a tract of land set aside for agricultural settlement
156 | L	AMUS	L.AMUS	amusement park	Amusement parks are theme parks and adventure parks offering entertainment, similar to funfairs but with a fixed location
157 | L	AREA	L.AREA	area	a tract of land without homogeneous character or boundaries
158 | L	BSND	L.BSND	drainage basin	an area drained by a stream
159 | L	BSNP	L.BSNP	petroleum basin	an area underlain by an oil-rich structural basin
160 | L	BTL	L.BTL	battlefield	a site of a land battle of historical importance
161 | L	CLG	L.CLG	clearing	an area in a forest with trees removed
162 | L	CMN	L.CMN	common	a park or pasture for community use
163 | L	CNS	L.CNS	concession area	a lease of land by a government for economic development, e.g., mining, forestry
164 | L	COLF	L.COLF	coalfield	a region in which coal deposits of possible economic value occur
165 | L	CONT	L.CONT	continent	continent: Europe, Africa, Asia, North America, South America, Oceania, Antarctica
166 | L	CST	L.CST	coast	a zone of variable width straddling the shoreline
167 | L	CTRB	L.CTRB	business center	a place where a number of businesses are located
168 | L	DEVH	L.DEVH	housing development	a tract of land on which many houses of similar design are built according to a development plan
169 | L	FLD	L.FLD	field(s)	an open as opposed to wooded area
170 | L	FLDI	L.FLDI	irrigated field(s)	a tract of level or terraced land which is irrigated
171 | L	GASF	L.GASF	gasfield	an area containing a subterranean store of natural gas of economic value
172 | L	GRAZ	L.GRAZ	grazing area	an area of grasses and shrubs used for grazing
173 | L	GVL	L.GVL	gravel area	an area covered with gravel
174 | L	INDS	L.INDS	industrial area	an area characterized by industrial activity
175 | L	LAND	L.LAND	arctic land	a tract of land in the Arctic
176 | L	LCTY	L.LCTY	locality	a minor area or place of unspecified or mixed character and indefinite boundaries
177 | L	MILB	L.MILB	military base	a place used by an army or other armed service for storing arms and supplies, and for accommodating and training troops, a base from which operations can be initiated
178 | L	MNA	L.MNA	mining area	an area of mine sites where minerals and ores are extracted
179 | L	MVA	L.MVA	maneuver area	a tract of land where military field exercises are carried out
180 | L	NVB	L.NVB	naval base	an area used to store supplies, provide barracks for troops and naval personnel, a port for naval vessels, and from which operations are initiated
181 | L	OAS	L.OAS	oasis(-es)	an area in a desert made productive by the availability of water
182 | L	OILF	L.OILF	oilfield	an area containing a subterranean store of petroleum of economic value
183 | L	PEAT	L.PEAT	peat cutting area	an area where peat is harvested
184 | L	PRK	L.PRK	park	an area, often of forested land, maintained as a place of beauty, or for recreation
185 | L	PRT	L.PRT	port	a place provided with terminal and transfer facilities for loading and discharging waterborne cargo or passengers, usually located in a harbor
186 | L	QCKS	L.QCKS	quicksand	an area where loose sand with water moving through it may become unstable when heavy objects are placed at the surface, causing them to sink
187 | L	RES	L.RES	reserve	a tract of public land reserved for future use or restricted as to use
188 | L	RESA	L.RESA	agricultural reserve	a tract of land reserved for agricultural reclamation and/or development
189 | L	RESF	L.RESF	forest reserve	a forested area set aside for preservation or controlled use
190 | L	RESH	L.RESH	hunting reserve	a tract of land used primarily for hunting
191 | L	RESN	L.RESN	nature reserve	an area reserved for the maintenance of a natural habitat
192 | L	RESP	L.RESP	palm tree reserve	an area of palm trees where use is controlled
193 | L	RESV	L.RESV	reservation	a tract of land set aside for aboriginal, tribal, or native populations
194 | L	RESW	L.RESW	wildlife reserve	a tract of public land reserved for the preservation of wildlife
195 | L	RGN	L.RGN	region	an area distinguished by one or more observable physical or cultural characteristics
196 | L	RGNE	L.RGNE	economic region	a region of a country established for economic development or for statistical purposes
197 | L	RGNL	L.RGNL	lake region	a tract of land distinguished by numerous lakes
198 | L	RNGA	L.RNGA	artillery range	a tract of land used for artillery firing practice
199 | L	SALT	L.SALT	salt area	a shallow basin or flat where salt accumulates after periodic inundation
200 | L	SNOW	L.SNOW	snowfield	an area of permanent snow and ice forming the accumulation area of a glacier
201 | L	TRB	L.TRB	tribal area	a tract of land used by nomadic or other tribes
202 | P	PPL	P.PPL	populated place	a city, town, village, or other agglomeration of buildings where people live and work
203 | P	PPLA	P.PPLA	seat of a first-order administrative division	seat of a first-order administrative division (PPLC takes precedence over PPLA)
204 | P	PPLA2	P.PPLA2	seat of a second-order administrative division
205 | P	PPLA3	P.PPLA3	seat of a third-order administrative division
206 | P	PPLA4	P.PPLA4	seat of a fourth-order administrative division
207 | P	PPLC	P.PPLC	capital of a political entity
208 | P	PPLF	P.PPLF	farm village	a populated place where the population is largely engaged in agricultural activities
209 | P	PPLG	P.PPLG	seat of government of a political entity
210 | P	PPLL	P.PPLL	populated locality	an area similar to a locality but with a small group of dwellings or other buildings
211 | P	PPLQ	P.PPLQ	abandoned populated place
212 | P	PPLR	P.PPLR	religious populated place	a populated place whose population is largely engaged in religious occupations
213 | P	PPLS	P.PPLS	populated places	cities, towns, villages, or other agglomerations of buildings where people live and work
214 | P	PPLW	P.PPLW	destroyed populated place	a village, town or city destroyed by a natural disaster, or by war
215 | P	PPLX	P.PPLX	section of populated place
216 | P	STLMT	P.STLMT	israeli settlement
217 | R	CSWY	R.CSWY	causeway	a raised roadway across wet ground or shallow water
218 | R	OILP	R.OILP	oil pipeline	a pipeline used for transporting oil
219 | R	PRMN	R.PRMN	promenade	a place for public walking, usually along a beach front
220 | R	PTGE	R.PTGE	portage	a place where boats, goods, etc., are carried overland between navigable waters
221 | R	RD	R.RD	road	an open way with improved surface for transportation of animals, people and vehicles
222 | R	RDA	R.RDA	ancient road	the remains of a road used by ancient cultures
223 | R	RDB	R.RDB	road bend	a conspicuously curved or bent section of a road
224 | R	RDCUT	R.RDCUT	road cut	an excavation cut through a hill or ridge for a road
225 | R	RDJCT	R.RDJCT	road junction	a place where two or more roads join
226 | R	RJCT	R.RJCT	railroad junction	a place where two or more railroad tracks join
227 | R	RR	R.RR	railroad	a permanent twin steel-rail track on which freight and passenger cars move long distances
228 | R	RRQ	R.RRQ	abandoned railroad
229 | R	RTE	R.RTE	caravan route	the route taken by caravans
230 | R	RYD	R.RYD	railroad yard	a system of tracks used for the making up of trains, and switching and storing freight cars
231 | R	ST	R.ST	street	a paved urban thoroughfare
232 | R	STKR	R.STKR	stock route	a route taken by livestock herds
233 | R	TNL	R.TNL	tunnel	a subterranean passageway for transportation
234 | R	TNLN	R.TNLN	natural tunnel	a cave that is open at both ends
235 | R	TNLRD	R.TNLRD	road tunnel	a tunnel through which a road passes
236 | R	TNLRR	R.TNLRR	railroad tunnel	a tunnel through which a railroad passes
237 | R	TNLS	R.TNLS	tunnels	subterranean passageways for transportation
238 | R	TRL	R.TRL	trail	a path, track, or route used by pedestrians, animals, or off-road vehicles
239 | S	ADMF	S.ADMF	administrative facility	a government building
240 | S	AGRF	S.AGRF	agricultural facility	a building and/or tract of land used for improving agriculture
241 | S	AIRB	S.AIRB	airbase	an area used to store supplies, provide barracks for air force personnel, hangars and runways for aircraft, and from which operations are initiated
242 | S	AIRF	S.AIRF	airfield	a place on land where aircraft land and take off; no facilities provided for the commercial handling of passengers and cargo
243 | S	AIRH	S.AIRH	heliport	a place where helicopters land and take off
244 | S	AIRP	S.AIRP	airport	a place where aircraft regularly land and take off, with runways, navigational aids, and major facilities for the commercial handling of passengers and cargo
245 | S	AIRQ	S.AIRQ	abandoned airfield
246 | S	AMTH	S.AMTH	amphitheater	an oval or circular structure with rising tiers of seats about a stage or open space
247 | S	ANS	S.ANS	ancient site	a place where archeological remains, old structures, or cultural artifacts are located
248 | S	AQC	S.AQC	aquaculture facility	facility or area for the cultivation of aquatic animals and plants, especially fish, shellfish, and seaweed, in natural or controlled marine or freshwater environments; underwater agriculture
249 | S	ARCH	S.ARCH	arch	a natural or man-made structure in the form of an arch
250 | S	ASTR	S.ASTR	astronomical station	a point on the earth whose position has been determined by observations of celestial bodies
251 | S	ASYL	S.ASYL	asylum	a facility where the insane are cared for and protected
252 | S	ATHF	S.ATHF	athletic field	a tract of land used for playing team sports, and athletic track and field events
253 | S	ATM	S.ATM	automatic teller machine	An unattended electronic machine in a public place, connected to a data system and related equipment and activated by a bank customer to obtain cash withdrawals and other banking services.
254 | S	BANK	S.BANK	bank	A business establishment in which money is kept for saving or commercial purposes or is invested, supplied for loans, or exchanged.
255 | S	BCN	S.BCN	beacon	a fixed artificial navigation mark
256 | S	BDG	S.BDG	bridge	a structure erected across an obstacle such as a stream, road, etc., in order to carry roads, railroads, and pedestrians across
257 | S	BDGQ	S.BDGQ	ruined bridge	a destroyed or decayed bridge which is no longer functional
258 | S	BLDG	S.BLDG	building(s)	a structure built for permanent use, as a house, factory, etc.
259 | S	BLDO	S.BLDO	office building	commercial building where business and/or services are conducted
260 | S	BP	S.BP	boundary marker	a fixture marking a point along a boundary
261 | S	BRKS	S.BRKS	barracks	a building for lodging military personnel
262 | S	BRKW	S.BRKW	breakwater	a structure erected to break the force of waves at the entrance to a harbor or port
263 | S	BSTN	S.BSTN	baling station	a facility for baling agricultural products
264 | S	BTYD	S.BTYD	boatyard	a waterside facility for servicing, repairing, and building small vessels
265 | S	BUR	S.BUR	burial cave(s)	a cave used for human burials
266 | S	BUSTN	S.BUSTN	bus station	a facility comprising ticket office, platforms, etc. for loading and unloading passengers
267 | S	BUSTP	S.BUSTP	bus stop	a place lacking station facilities
268 | S	CARN	S.CARN	cairn	a heap of stones erected as a landmark or for other purposes
269 | S	CAVE	S.CAVE	cave(s)	an underground passageway or chamber, or cavity on the side of a cliff
270 | S	CH	S.CH	church	a building for public Christian worship
271 | S	CMP	S.CMP	camp(s)	a site occupied by tents, huts, or other shelters for temporary use
272 | S	CMPL	S.CMPL	logging camp	a camp used by loggers
273 | S	CMPLA	S.CMPLA	labor camp	a camp used by migrant or temporary laborers
274 | S	CMPMN	S.CMPMN	mining camp	a camp used by miners
275 | S	CMPO	S.CMPO	oil camp	a camp used by oilfield workers
276 | S	CMPQ	S.CMPQ	abandoned camp
277 | S	CMPRF	S.CMPRF	refugee camp	a camp used by refugees
278 | S	CMTY	S.CMTY	cemetery	a burial place or ground
279 | S	COMC	S.COMC	communication center	a facility, including buildings, antennae, towers and electronic equipment for receiving and transmitting information
280 | S	CRRL	S.CRRL	corral(s)	a pen or enclosure for confining or capturing animals
281 | S	CSNO	S.CSNO	casino	a building used for entertainment, especially gambling
282 | S	CSTL	S.CSTL	castle	a large fortified building or set of buildings
283 | S	CSTM	S.CSTM	customs house	a building in a port where customs and duties are paid, and where vessels are entered and cleared
284 | S	CTHSE	S.CTHSE	courthouse	a building in which courts of law are held
285 | S	CTRA	S.CTRA	atomic center	a facility where atomic research is carried out
286 | S	CTRCM	S.CTRCM	community center	a facility for community recreation and other activities
287 | S	CTRF	S.CTRF	facility center	a place where more than one facility is situated
288 | S	CTRM	S.CTRM	medical center	a complex of health care buildings including two or more of the following: hospital, medical school, clinic, pharmacy, doctor's offices, etc.
289 | S	CTRR	S.CTRR	religious center	a facility where more than one religious activity is carried out, e.g., retreat, school, monastery, worship
290 | S	CTRS	S.CTRS	space center	a facility for launching, tracking, or controlling satellites and space vehicles
291 | S	CVNT	S.CVNT	convent	a building where a community of nuns lives in seclusion
292 | S	DAM	S.DAM	dam	a barrier constructed across a stream to impound water
293 | S	DAMQ	S.DAMQ	ruined dam	a destroyed or decayed dam which is no longer functional
294 | S	DAMSB	S.DAMSB	sub-surface dam	a dam put down to bedrock in a sand river
295 | S	DARY	S.DARY	dairy	a facility for the processing, sale and distribution of milk or milk products
296 | S	DCKD	S.DCKD	dry dock	a dock providing support for a vessel, and means for removing the water so that the bottom of the vessel can be exposed
297 | S	DCKY	S.DCKY	dockyard	a facility for servicing, building, or repairing ships
298 | S	DIKE	S.DIKE	dike	an earth or stone embankment usually constructed for flood or stream control
299 | S	DIP	S.DIP	diplomatic facility	office, residence, or facility of a foreign government, which may include an embassy, consulate, chancery, office of charge d'affaires, or other diplomatic, economic, military, or cultural mission
300 | S	DPOF	S.DPOF	fuel depot	an area where fuel is stored
301 | S	EST	S.EST	estate(s)	a large commercialized agricultural landholding with associated buildings and other facilities
302 | S	ESTO	S.ESTO	oil palm plantation	an estate specializing in the cultivation of oil palm trees
303 | S	ESTR	S.ESTR	rubber plantation	an estate which specializes in growing and tapping rubber trees
304 | S	ESTSG	S.ESTSG	sugar plantation	an estate that specializes in growing sugar cane
305 | S	ESTT	S.ESTT	tea plantation	an estate which specializes in growing tea bushes
306 | S	ESTX	S.ESTX	section of estate
307 | S	FCL	S.FCL	facility	a building or buildings housing a center, institute, foundation, hospital, prison, mission, courthouse, etc.
308 | S	FNDY	S.FNDY	foundry	a building or works where metal casting is carried out
309 | S	FRM	S.FRM	farm	a tract of land with associated buildings devoted to agriculture
310 | S	FRMQ	S.FRMQ	abandoned farm
311 | S	FRMS	S.FRMS	farms	tracts of land with associated buildings devoted to agriculture
312 | S	FRMT	S.FRMT	farmstead	the buildings and adjacent service areas of a farm
313 | S	FT	S.FT	fort	a defensive structure or earthworks
314 | S	FY	S.FY	ferry	a boat or other floating conveyance and terminal facilities regularly used to transport people and vehicles across a waterbody
315 | S	GATE	S.GATE	gate	a controlled access entrance or exit
316 | S	GDN	S.GDN	garden(s)	an enclosure for displaying selected plant or animal life
317 | S	GHAT	S.GHAT	ghat	a set of steps leading to a river, which are of religious significance, and at their base is usually a platform for bathing
318 | S	GHSE	S.GHSE	guest house	a house used to provide lodging for paying guests
319 | S	GOSP	S.GOSP	gas-oil separator plant	a facility for separating gas from oil
320 | S	GOVL	S.GOVL	local government office	a facility housing local governmental offices, usually a city, town, or village hall
321 | S	GRVE	S.GRVE	grave	a burial site
322 | S	HERM	S.HERM	hermitage	a secluded residence, usually for religious sects
323 | S	HLT	S.HLT	halting place	a place where caravans stop for rest
324 | S	HSE	S.HSE	house(s)	a building used as a human habitation
325 | S	HSEC	S.HSEC	country house	a large house, mansion, or chateau, on a large estate
326 | S	HSP	S.HSP	hospital	a building in which sick or injured, especially those confined to bed, are medically treated
327 | S	HSPC	S.HSPC	clinic	a medical facility associated with a hospital for outpatients
328 | S	HSPD	S.HSPD	dispensary	a building where medical or dental aid is dispensed
329 | S	HSPL	S.HSPL	leprosarium	an asylum or hospital for lepers
330 | S	HSTS	S.HSTS	historical site	a place of historical importance
331 | S	HTL	S.HTL	hotel	a building providing lodging and/or meals for the public
332 | S	HUT	S.HUT	hut	a small primitive house
333 | S	HUTS	S.HUTS	huts	small primitive houses
334 | S	INSM	S.INSM	military installation	a facility for use of and control by armed forces
335 | S	ITTR	S.ITTR	research institute	a facility where research is carried out
336 | S	JTY	S.JTY	jetty	a structure built out into the water at a river mouth or harbor entrance to regulate currents and silting
337 | S	LDNG	S.LDNG	landing	a place where boats receive or discharge passengers and freight, but lacking most port facilities
338 | S	LEPC	S.LEPC	leper colony	a settled area inhabited by lepers in relative isolation
339 | S	LIBR	S.LIBR	library	A place in which information resources such as books are kept for reading, reference, or lending.
340 | S	LNDF	S.LNDF	landfill	a place for trash and garbage disposal in which the waste is buried between layers of earth to build up low-lying land
341 | S	LOCK	S.LOCK	lock(s)	a basin in a waterway with gates at each end by means of which vessels are passed from one water level to another
342 | S	LTHSE	S.LTHSE	lighthouse	a distinctive structure exhibiting a major navigation light
343 | S	MALL	S.MALL	mall	A large, often enclosed shopping complex containing various stores, businesses, and restaurants usually accessible by common passageways.
344 | S	MAR	S.MAR	marina	a harbor facility for small boats, yachts, etc.
345 | S	MFG	S.MFG	factory	one or more buildings where goods are manufactured, processed or fabricated
346 | S	MFGB	S.MFGB	brewery	one or more buildings where beer is brewed
347 | S	MFGC	S.MFGC	cannery	a building where food items are canned
348 | S	MFGCU	S.MFGCU	copper works	a facility for processing copper ore
349 | S	MFGLM	S.MFGLM	limekiln	a furnace in which limestone is reduced to lime
350 | S	MFGM	S.MFGM	munitions plant	a factory where ammunition is made
351 | S	MFGPH	S.MFGPH	phosphate works	a facility for producing fertilizer
352 | S	MFGQ	S.MFGQ	abandoned factory
353 | S	MFGSG	S.MFGSG	sugar refinery	a facility for converting raw sugar into refined sugar
354 | S	MKT	S.MKT	market	a place where goods are bought and sold at regular intervals
355 | S	ML	S.ML	mill(s)	a building housing machines for transforming, shaping, finishing, grinding, or extracting products
356 | S	MLM	S.MLM	ore treatment plant	a facility for improving the metal content of ore by concentration
357 | S	MLO	S.MLO	olive oil mill	a mill where oil is extracted from olives
358 | S	MLSG	S.MLSG	sugar mill	a facility where sugar cane is processed into raw sugar
359 | S	MLSGQ	S.MLSGQ	former sugar mill	a sugar mill no longer used as a sugar mill
360 | S	MLSW	S.MLSW	sawmill	a mill where logs or lumber are sawn to specified shapes and sizes
361 | S	MLWND	S.MLWND	windmill	a mill or water pump powered by wind
362 | S	MLWTR	S.MLWTR	water mill	a mill powered by running water
363 | S	MN	S.MN	mine(s)	a site where mineral ores are extracted from the ground by excavating surface pits and subterranean passages
364 | S	MNAU	S.MNAU	gold mine(s)	a mine where gold ore, or alluvial gold is extracted
365 | S	MNC	S.MNC	coal mine(s)	a mine where coal is extracted
366 | S	MNCR	S.MNCR	chrome mine(s)	a mine where chrome ore is extracted
367 | S	MNCU	S.MNCU	copper mine(s)	a mine where copper ore is extracted
368 | S	MNFE	S.MNFE	iron mine(s)	a mine where iron ore is extracted
369 | S	MNMT	S.MNMT	monument	a commemorative structure or statue
370 | S	MNN	S.MNN	salt mine(s)	a mine from which salt is extracted
371 | S	MNQ	S.MNQ	abandoned mine
372 | S	MNQR	S.MNQR	quarry(-ies)	a surface mine where building stone or gravel and sand, etc. are extracted
373 | S	MOLE	S.MOLE	mole	a massive structure of masonry or large stones serving as a pier or breakwater
374 | S	MSQE	S.MSQE	mosque	a building for public Islamic worship
375 | S	MSSN	S.MSSN	mission	a place characterized by dwellings, school, church, hospital and other facilities operated by a religious group for the purpose of providing charitable services and to propagate religion
376 | S	MSSNQ	S.MSSNQ	abandoned mission
377 | S	MSTY	S.MSTY	monastery	a building and grounds where a community of monks lives in seclusion
378 | S	MTRO	S.MTRO	metro station	metro station (Underground, Tube, or Métro)
379 | S	MUS	S.MUS	museum	a building where objects of permanent interest in one or more of the arts and sciences are preserved and exhibited
380 | S	NOV	S.NOV	novitiate	a religious house or school where novices are trained
381 | S	NSY	S.NSY	nursery(-ies)	a place where plants are propagated for transplanting or grafting
382 | S	OBPT	S.OBPT	observation point	a wildlife or scenic observation point
383 | S	OBS	S.OBS	observatory	a facility equipped for observation of atmospheric or space phenomena
384 | S	OBSR	S.OBSR	radio observatory	a facility equipped with an array of antennae for receiving radio waves from space
385 | S	OILJ	S.OILJ	oil pipeline junction	a section of an oil pipeline where two or more pipes join together
386 | S	OILQ	S.OILQ	abandoned oil well
387 | S	OILR	S.OILR	oil refinery	a facility for converting crude oil into refined petroleum products
388 | S	OILT	S.OILT	tank farm	a tract of land occupied by large, cylindrical, metal tanks in which oil or liquid petrochemicals are stored
389 | S	OILW	S.OILW	oil well	a well from which oil may be pumped
390 | S	OPRA	S.OPRA	opera house	A theater designed chiefly for the performance of operas.
391 | S	PAL	S.PAL	palace	a large stately house, often a royal or presidential residence
392 | S	PGDA	S.PGDA	pagoda	a tower-like storied structure, usually a Buddhist shrine
393 | S	PIER	S.PIER	pier	a structure built out into navigable water on piles providing berthing for ships and recreation
394 | S	PKLT	S.PKLT	parking lot	an area used for parking vehicles
395 | S	PMPO	S.PMPO	oil pumping station	a facility for pumping oil through a pipeline
396 | S	PMPW	S.PMPW	water pumping station	a facility for pumping water from a major well or through a pipeline
397 | S	PO	S.PO	post office	a public building in which mail is received, sorted and distributed
398 | S	PP	S.PP	police post	a building in which police are stationed
399 | S	PPQ	S.PPQ	abandoned police post
400 | S	PRKGT	S.PRKGT	park gate	a controlled access to a park
401 | S	PRKHQ	S.PRKHQ	park headquarters	a park administrative facility
402 | S	PRN	S.PRN	prison	a facility for confining prisoners
403 | S	PRNJ	S.PRNJ	reformatory	a facility for confining, training, and reforming young law offenders
404 | S	PRNQ	S.PRNQ	abandoned prison
405 | S	PS	S.PS	power station	a facility for generating electric power
406 | S	PSH	S.PSH	hydroelectric power station	a building where electricity is generated from water power
407 | S	PSTB	S.PSTB	border post	a post or station at an international boundary for the regulation of movement of people and goods
408 | S	PSTC	S.PSTC	customs post	a building at an international boundary where customs and duties are paid on goods
409 | S	PSTP	S.PSTP	patrol post	a post from which patrols are sent out
410 | S	PYR	S.PYR	pyramid	an ancient massive structure of square ground plan with four triangular faces meeting at a point and used for enclosing tombs
411 | S	PYRS	S.PYRS	pyramids	ancient massive structures of square ground plan with four triangular faces meeting at a point and used for enclosing tombs
412 | S	QUAY	S.QUAY	quay	a structure of solid construction along a shore or bank which provides berthing for ships and which generally provides cargo handling facilities
413 | S	RDCR	S.RDCR	traffic circle	a road junction formed around a central circle about which traffic moves in one direction only
414 | S	RECG	S.RECG	golf course	a recreation field where golf is played
415 | S	RECR	S.RECR	racetrack	a track where races are held
416 | S	REST	S.REST	restaurant	A place where meals are served to the public
417 | S	RET	S.RET	store	a building where goods and/or services are offered for sale
418 | S	RHSE	S.RHSE	resthouse	a structure maintained for the rest and shelter of travelers
419 | S	RKRY	S.RKRY	rookery	a breeding place of a colony of birds or seals
420 | S	RLG	S.RLG	religious site	an ancient site of significant religious importance
421 | S	RLGR	S.RLGR	retreat	a place of temporary seclusion, especially for religious groups
422 | S	RNCH	S.RNCH	ranch(es)	a large farm specializing in extensive grazing of livestock
423 | S	RSD	S.RSD	railroad siding	a short track parallel to and joining the main track
424 | S	RSGNL	S.RSGNL	railroad signal	a signal at the entrance of a particular section of track governing the movement of trains
425 | S	RSRT	S.RSRT	resort	a specialized facility for vacation, health, or participation sports activities
426 | S	RSTN	S.RSTN	railroad station	a facility comprising ticket office, platforms, etc. for loading and unloading train passengers and freight
427 | S	RSTNQ	S.RSTNQ	abandoned railroad station
428 | S	RSTP	S.RSTP	railroad stop	a place lacking station facilities where trains stop to pick up and unload passengers and freight
429 | S	RSTPQ	S.RSTPQ	abandoned railroad stop
430 | S	RUIN	S.RUIN	ruin(s)	a destroyed or decayed structure which is no longer functional
431 | S	SCH	S.SCH	school	building(s) where instruction in one or more branches of knowledge takes place
432 | S	SCHA	S.SCHA	agricultural school	a school with a curriculum focused on agriculture
433 | S	SCHC	S.SCHC	college	the grounds and buildings of an institution of higher learning
434 | S	SCHL	S.SCHL	language school	Language Schools & Institutions
435 | S	SCHM	S.SCHM	military school	a school at which military science forms the core of the curriculum
436 | S	SCHN	S.SCHN	maritime school	a school at which maritime sciences form the core of the curriculum
437 | S	SCHT	S.SCHT	technical school	post-secondary school with a specifically technical or vocational curriculum
438 | S	SECP	S.SECP	State Exam Prep Centre	state exam preparation centres
439 | S	SHPF	S.SHPF	sheepfold	a fence or wall enclosure for sheep and other small herd animals
440 | S	SHRN	S.SHRN	shrine	a structure or place memorializing a person or religious concept
441 | S	SHSE	S.SHSE	storehouse	a building for storing goods, especially provisions
442 | S	SLCE	S.SLCE	sluice	a conduit or passage for carrying off surplus water from a waterbody, usually regulated by means of a sluice gate
443 | S	SNTR	S.SNTR	sanatorium	a facility where victims of physical or mental disorders are treated
444 | S	SPA	S.SPA	spa	a resort area usually developed around a medicinal spring
445 | S	SPLY	S.SPLY	spillway	a passage or outlet through which surplus water flows over, around or through a dam
446 | S	SQR	S.SQR	square	a broad, open, public area near the center of a town or city
447 | S	STBL	S.STBL	stable	a building for the shelter and feeding of farm animals, especially horses
448 | S	STDM	S.STDM	stadium	a structure with an enclosure for athletic games with tiers of seats for spectators
449 | S	STNB	S.STNB	scientific research base	a scientific facility used as a base from which research is carried out or monitored
450 | S	STNC	S.STNC	coast guard station	a facility from which the coast is guarded by armed vessels
451 | S	STNE	S.STNE	experiment station	a facility for carrying out experiments
452 | S	STNF	S.STNF	forest station	a collection of buildings and facilities for carrying out forest management
453 | S	STNI	S.STNI	inspection station	a station at which vehicles, goods, and people are inspected
454 | S	STNM	S.STNM	meteorological station	a station at which weather elements are recorded
455 | S	STNR	S.STNR	radio station	a facility for producing and transmitting information by radio waves
456 | S	STNS	S.STNS	satellite station	a facility for tracking and communicating with orbiting satellites
457 | S	STNW	S.STNW	whaling station	a facility for butchering whales and processing train oil
458 | S	STPS	S.STPS	steps	stones or slabs placed for ease in ascending or descending a steep slope
459 | S	SWT	S.SWT	sewage treatment plant	facility for the processing of sewage and/or wastewater
460 | S	THTR	S.THTR	theater	A building, room, or outdoor structure for the presentation of plays, films, or other dramatic performances
461 | S	TMB	S.TMB	tomb(s)	a structure for interring bodies
462 | S	TMPL	S.TMPL	temple(s)	an edifice dedicated to religious worship
463 | S	TNKD	S.TNKD	cattle dipping tank	a small artificial pond used for immersing cattle in chemically treated water for disease control
464 | S	TOWR	S.TOWR	tower	a high conspicuous structure, typically much higher than its diameter
465 | S	TRANT	S.TRANT	transit terminal	facilities for the handling of vehicular freight and passengers
466 | S	TRIG	S.TRIG	triangulation station	a point on the earth whose position has been determined by triangulation
467 | S	TRMO	S.TRMO	oil pipeline terminal	a tank farm or loading facility at the end of an oil pipeline
468 | S	TWO	S.TWO	temp work office	Temporary Work Offices
469 | S	UNIP	S.UNIP	university prep school	University Preparation Schools & Institutions
470 | S	UNIV	S.UNIV	university	An institution for higher learning with teaching and research facilities constituting a graduate school and professional schools that award master's degrees and doctorates and an undergraduate division that awards bachelor's degrees.
471 | S	USGE	S.USGE	united states government establishment	a facility operated by the United States Government in Panama
472 | S	VETF	S.VETF	veterinary facility	a building or camp at which veterinary services are available
473 | S	WALL	S.WALL	wall	a thick masonry structure, usually enclosing a field or building, or forming the side of a structure
474 | S	WALLA	S.WALLA	ancient wall	the remains of a linear defensive stone structure
475 | S	WEIR	S.WEIR	weir(s)	a small dam in a stream, designed to raise the water level or to divert stream flow through a desired channel
476 | S	WHRF	S.WHRF	wharf(-ves)	a structure of open rather than solid construction along a shore or a bank which provides berthing for ships and cargo-handling facilities
477 | S	WRCK	S.WRCK	wreck	the site of the remains of a wrecked vessel
478 | S	WTRW	S.WTRW	waterworks	a facility for supplying potable water through a water source and a system of pumps and filtration beds
479 | S	ZNF	S.ZNF	free trade zone	an area, usually a section of a port, where goods may be received and shipped free of customs duty and of most customs regulations
480 | S	ZOO	S.ZOO	zoo	a zoological garden or park where wild animals are kept for exhibition
481 | T	ASPH	T.ASPH	asphalt lake	a small basin containing naturally occurring asphalt
482 | T	ATOL	T.ATOL	atoll(s)	a ring-shaped coral reef which has closely spaced islands on it encircling a lagoon
483 | T	BAR	T.BAR	bar	a shallow ridge or mound of coarse unconsolidated material in a stream channel, at the mouth of a stream, estuary, or lagoon and in the wave-break zone along coasts
484 | T	BCH	T.BCH	beach	a shore zone of coarse unconsolidated sediment that extends from the low-water line to the highest reach of storm waves
485 | T	BCHS	T.BCHS	beaches	a shore zone of coarse unconsolidated sediment that extends from the low-water line to the highest reach of storm waves
486 | T	BDLD	T.BDLD	badlands	an area characterized by a maze of very closely spaced, deep, narrow, steep-sided ravines, and sharp crests and pinnacles
487 | T	BLDR	T.BLDR	boulder field	a high altitude or high latitude bare, flat area covered with large angular rocks
488 | T	BLHL	T.BLHL	blowhole(s)	a hole in coastal rock through which sea water is forced by a rising tide or waves and spurted through an outlet into the air
489 | T	BLOW	T.BLOW	blowout(s)	a small depression in sandy terrain, caused by wind erosion
490 | T	BNCH	T.BNCH	bench	a long, narrow bedrock platform bounded by steeper slopes above and below, usually overlooking a waterbody
491 | T	BUTE	T.BUTE	butte(s)	a small, isolated, usually flat-topped hill with steep sides
492 | T	CAPE	T.CAPE	cape	a land area, more prominent than a point, projecting into the sea and marking a notable change in coastal direction
493 | T	CFT	T.CFT	cleft(s)	a deep narrow slot, notch, or groove in a coastal cliff
494 | T	CLDA	T.CLDA	caldera	a depression measuring kilometers across formed by the collapse of a volcanic mountain
495 | T	CLF	T.CLF	cliff(s)	a high, steep to perpendicular slope overlooking a waterbody or lower area
496 | T	CNYN	T.CNYN	canyon	a deep, narrow valley with steep sides cutting into a plateau or mountainous area
497 | T	CONE	T.CONE	cone(s)	a conical landform composed of mud or volcanic material
498 | T	CRDR	T.CRDR	corridor	a strip or area of land having significance as an access way
499 | T	CRQ	T.CRQ	cirque	a bowl-like hollow partially surrounded by cliffs or steep slopes at the head of a glaciated valley
500 | T	CRQS	T.CRQS	cirques	bowl-like hollows partially surrounded by cliffs or steep slopes at the head of a glaciated valley
501 | T	CRTR	T.CRTR	crater(s)	a generally circular saucer or bowl-shaped depression caused by volcanic or meteorite explosive action
502 | T	CUET	T.CUET	cuesta(s)	an asymmetric ridge formed on tilted strata
503 | T	DLTA	T.DLTA	delta	a flat plain formed by alluvial deposits at the mouth of a stream
504 | T	DPR	T.DPR	depression(s)	a low area surrounded by higher land and usually characterized by interior drainage
505 | T	DSRT	T.DSRT	desert	a large area with little or no vegetation due to extreme environmental conditions
506 | T	DUNE	T.DUNE	dune(s)	a wave form, ridge or star shape feature composed of sand
507 | T	DVD	T.DVD	divide	a line separating adjacent drainage basins
508 | T	ERG	T.ERG	sandy desert	an extensive tract of shifting sand and sand dunes
509 | T	FAN	T.FAN	fan(s)	a fan-shaped wedge of coarse alluvium with apex merging with a mountain stream bed and the fan spreading out at a low angle slope onto an adjacent plain
510 | T	FORD	T.FORD	ford	a shallow part of a stream which can be crossed on foot or by land vehicle
511 | T	FSR	T.FSR	fissure	a crack associated with volcanism
512 | T	GAP	T.GAP	gap	a low place in a ridge, not used for transportation
513 | T	GRGE	T.GRGE	gorge(s)	a short, narrow, steep-sided section of a stream valley
514 | T	HDLD	T.HDLD	headland	a high projection of land extending into a large body of water beyond the line of the coast
515 | T	HLL	T.HLL	hill	a rounded elevation of limited extent rising above the surrounding land with local relief of less than 300m
516 | T	HLLS	T.HLLS	hills	rounded elevations of limited extent rising above the surrounding land with local relief of less than 300m
517 | T	HMCK	T.HMCK	hammock(s)	a patch of ground, distinct from and slightly above the surrounding plain or wetland. Often occurs in groups
518 | T	HMDA	T.HMDA	rock desert	a relatively sand-free, high bedrock plateau in a hot desert, with or without a gravel veneer
519 | T	INTF	T.INTF	interfluve	a relatively undissected upland between adjacent stream valleys
520 | T	ISL	T.ISL	island	a tract of land, smaller than a continent, surrounded by water at high water
521 | T	ISLET	T.ISLET	islet	small island, bigger than rock, smaller than island.
522 | T	ISLF	T.ISLF	artificial island	an island created by landfill or diking and filling in a wetland, bay, or lagoon
523 | T	ISLM	T.ISLM	mangrove island	a mangrove swamp surrounded by a waterbody
524 | T	ISLS	T.ISLS	islands	tracts of land, smaller than a continent, surrounded by water at high water
525 | T	ISLT	T.ISLT	land-tied island	a coastal island connected to the mainland by barrier beaches, levees or dikes
526 | T	ISLX	T.ISLX	section of island
527 | T	ISTH	T.ISTH	isthmus	a narrow strip of land connecting two larger land masses and bordered by water
528 | T	KRST	T.KRST	karst area	a distinctive landscape developed on soluble rock such as limestone characterized by sinkholes, caves, disappearing streams, and underground drainage
529 | T	LAVA	T.LAVA	lava area	an area of solidified lava
530 | T	LEV	T.LEV	levee	a natural low embankment bordering a distributary or meandering stream; often built up artificially to control floods
531 | T	MESA	T.MESA	mesa(s)	a flat-topped, isolated elevation with steep slopes on all sides, less extensive than a plateau
532 | T	MND	T.MND	mound(s)	a low, isolated, rounded hill
533 | T	MRN	T.MRN	moraine	a mound, ridge, or other accumulation of glacial till
534 | T	MT	T.MT	mountain	an elevation standing high above the surrounding area with small summit area, steep slopes and local relief of 300m or more
535 | T	MTS	T.MTS	mountains	a mountain range or a group of mountains or high ridges
536 | T	NKM	T.NKM	meander neck	a narrow strip of land between the two limbs of a meander loop at its narrowest point
537 | T	NTK	T.NTK	nunatak	a rock or mountain peak protruding through glacial ice
538 | T	NTKS	T.NTKS	nunataks	rocks or mountain peaks protruding through glacial ice
539 | T	PAN	T.PAN	pan	a near-level shallow, natural depression or basin, usually containing an intermittent lake, pond, or pool
540 | T	PANS	T.PANS	pans	a near-level shallow, natural depression or basin, usually containing an intermittent lake, pond, or pool
541 | T	PASS	T.PASS	pass	a break in a mountain range or other high obstruction, used for transportation from one side to the other [See also gap]
542 | T	PEN	T.PEN	peninsula	an elongate area of land projecting into a body of water and nearly surrounded by water
543 | T	PENX	T.PENX	section of peninsula
544 | T	PK	T.PK	peak	a pointed elevation atop a mountain, ridge, or other hypsographic feature
545 | T	PKS	T.PKS	peaks	pointed elevations atop a mountain, ridge, or other hypsographic features
546 | T	PLAT	T.PLAT	plateau	an elevated plain with steep slopes on one or more sides, and often with incised streams
547 | T	PLATX	T.PLATX	section of plateau
548 | T	PLDR	T.PLDR	polder	an area reclaimed from the sea by diking and draining
549 | T	PLN	T.PLN	plain(s)	an extensive area of comparatively level to gently undulating land, lacking surface irregularities, and usually adjacent to a higher area
550 | T	PLNX	T.PLNX	section of plain
551 | T	PROM	T.PROM	promontory(-ies)	a bluff or prominent hill overlooking or projecting into a lowland
552 | T	PT	T.PT	point	a tapering piece of land projecting into a body of water, less prominent than a cape
553 | T	PTS	T.PTS	points	tapering pieces of land projecting into a body of water, less prominent than a cape
554 | T	RDGB	T.RDGB	beach ridge	a ridge of sand just inland and parallel to the beach, usually in series
555 | T	RDGE	T.RDGE	ridge(s)	a long narrow elevation with steep sides, and a more or less continuous crest
556 | T	REG	T.REG	stony desert	a desert plain characterized by a surface veneer of gravel and stones
557 | T	RK	T.RK	rock	a conspicuous, isolated rocky mass
558 | T	RKFL	T.RKFL	rockfall	an irregular mass of fallen rock at the base of a cliff or steep slope
559 | T	RKS	T.RKS	rocks	conspicuous, isolated rocky masses
560 | T	SAND	T.SAND	sand area	a tract of land covered with sand
561 | T	SBED	T.SBED	dry stream bed	a channel formerly containing the water of a stream
562 | T	SCRP	T.SCRP	escarpment	a long line of cliffs or steep slopes separating level surfaces above and below
563 | T	SDL	T.SDL	saddle	a broad, open pass crossing a ridge or between hills or mountains
564 | T	SHOR	T.SHOR	shore	a narrow zone bordering a waterbody which covers and uncovers at high and low water, respectively
565 | T	SINK	T.SINK	sinkhole	a small crater-shape depression in a karst area
566 | T	SLID	T.SLID	slide	a mound of earth material, at the base of a slope and the associated scoured area
567 | T	SLP	T.SLP	slope(s)	a surface with a relatively uniform slope angle
568 | T	SPIT	T.SPIT	spit	a narrow, straight or curved continuation of a beach into a waterbody
569 | T	SPUR	T.SPUR	spur(s)	a subordinate ridge projecting outward from a hill, mountain or other elevation
570 | T	TAL	T.TAL	talus slope	a steep concave slope formed by an accumulation of loose rock fragments at the base of a cliff or steep slope
571 | T	TRGD	T.TRGD	interdune trough(s)	a long wind-swept trough between parallel longitudinal dunes
572 | T	TRR	T.TRR	terrace	a long, narrow alluvial platform bounded by steeper slopes above and below, usually overlooking a waterbody
573 | T	UPLD	T.UPLD	upland	an extensive interior region of high land with low to moderate surface relief
574 | T	VAL	T.VAL	valley	an elongated depression usually traversed by a stream
575 | T	VALG	T.VALG	hanging valley	a valley the floor of which is notably higher than the valley or shore to which it leads; most common in areas that have been glaciated
576 | T	VALS	T.VALS	valleys	elongated depressions usually traversed by a stream
577 | T	VALX	T.VALX	section of valley
578 | T	VLC	T.VLC	volcano	a conical elevation composed of volcanic materials with a crater at the top
579 | U	APNU	U.APNU	apron	a gentle slope, with a generally smooth surface, particularly found around groups of islands and seamounts
580 | U	ARCU	U.ARCU	arch	a low bulge around the southeastern end of the island of Hawaii
581 | U	ARRU	U.ARRU	arrugado	an area of subdued corrugations off Baja California
582 | U	BDLU	U.BDLU	borderland	a region adjacent to a continent, normally occupied by or bordering a shelf, that is highly irregular with depths well in excess of those typical of a shelf
583 | U	BKSU	U.BKSU	banks	elevations, typically located on a shelf, over which the depth of water is relatively shallow but sufficient for safe surface navigation
584 | U	BNKU	U.BNKU	bank	an elevation, typically located on a shelf, over which the depth of water is relatively shallow but sufficient for safe surface navigation
585 | U	BSNU	U.BSNU	basin	a depression more or less equidimensional in plan and of variable extent
586 | U	CDAU	U.CDAU	cordillera	an entire mountain system including the subordinate ranges, interior plateaus, and basins
587 | U	CNSU	U.CNSU	canyons	relatively narrow, deep depressions with steep sides, the bottom of which generally has a continuous slope
588 | U	CNYU	U.CNYU	canyon	a relatively narrow, deep depression with steep sides, the bottom of which generally has a continuous slope
589 | U	CRSU	U.CRSU	continental rise	a gentle slope rising from oceanic depths towards the foot of a continental slope
590 | U	DEPU	U.DEPU	deep	a localized deep area within the confines of a larger feature, such as a trough, basin or trench
591 | U	EDGU	U.EDGU	shelf edge	a line along which there is a marked increase of slope at the outer margin of a continental shelf or island shelf
592 | U	ESCU	U.ESCU	escarpment (or scarp)	an elongated and comparatively steep slope separating flat or gently sloping areas
593 | U	FANU	U.FANU	fan	a relatively smooth feature normally sloping away from the lower termination of a canyon or canyon system
594 | U	FLTU	U.FLTU	flat	a small level or nearly level area
595 | U	FRZU	U.FRZU	fracture zone	an extensive linear zone of irregular topography of the sea floor, characterized by steep-sided or asymmetrical ridges, troughs, or escarpments
596 | U	FURU	U.FURU	furrow	a closed, linear, narrow, shallow depression
597 | U	GAPU	U.GAPU	gap	a narrow break in a ridge or rise
598 | U	GLYU	U.GLYU	gully	a small valley-like feature
599 | U	HLLU	U.HLLU	hill	an elevation rising generally less than 500 meters
600 | U	HLSU	U.HLSU	hills	elevations rising generally less than 500 meters
601 | U	HOLU	U.HOLU	hole	a small depression of the sea floor
602 | U	KNLU	U.KNLU	knoll	an elevation rising generally more than 500 meters and less than 1,000 meters and of limited extent across the summit
603 | U	KNSU	U.KNSU	knolls	elevations rising generally more than 500 meters and less than 1,000 meters and of limited extent across the summits
604 | U	LDGU	U.LDGU	ledge	a rocky projection or outcrop, commonly linear and near shore
605 | U	LEVU	U.LEVU	levee	an embankment bordering a canyon, valley, or seachannel
606 | U	MESU	U.MESU	mesa	an isolated, extensive, flat-topped elevation on the shelf, with relatively steep sides
607 | U	MNDU	U.MNDU	mound	a low, isolated, rounded hill
608 | U	MOTU	U.MOTU	moat	an annular depression that may not be continuous, located at the base of many seamounts, islands, and other isolated elevations
609 | U	MTU	U.MTU	mountain	a well-delineated subdivision of a large and complex positive feature
610 | U	PKSU	U.PKSU	peaks	prominent elevations, part of a larger feature, either pointed or of very limited extent across the summit
611 | U	PKU	U.PKU	peak	a prominent elevation, part of a larger feature, either pointed or of very limited extent across the summit
612 | U	PLNU	U.PLNU	plain	a flat, gently sloping or nearly level region
613 | U	PLTU	U.PLTU	plateau	a comparatively flat-topped feature of considerable extent, dropping off abruptly on one or more sides
614 | U	PNLU	U.PNLU	pinnacle	a high tower or spire-shaped pillar of rock or coral, alone or cresting a summit
615 | U	PRVU	U.PRVU	province	a region identifiable by a group of similar physiographic features whose characteristics are markedly in contrast with surrounding areas
616 | U	RDGU	U.RDGU	ridge	a long narrow elevation with steep sides
617 | U	RDSU	U.RDSU	ridges	long narrow elevations with steep sides
618 | U	RFSU	U.RFSU	reefs	surface-navigation hazards composed of consolidated material
619 | U	RFU	U.RFU	reef	a surface-navigation hazard composed of consolidated material
620 | U	RISU	U.RISU	rise	a broad elevation that rises gently, and generally smoothly, from the sea floor
621 | U	SCNU	U.SCNU	seachannel	a continuously sloping, elongated depression commonly found in fans or plains and customarily bordered by levees on one or two sides
622 | U	SCSU	U.SCSU	seachannels	continuously sloping, elongated depressions commonly found in fans or plains and customarily bordered by levees on one or two sides
623 | U	SDLU	U.SDLU	saddle	a low part, resembling in shape a saddle, in a ridge or between contiguous seamounts
624 | U	SHFU	U.SHFU	shelf	a zone adjacent to a continent (or around an island) that extends from the low water line to a depth at which there is usually a marked increase of slope towards oceanic depths
625 | U	SHLU	U.SHLU	shoal	a surface-navigation hazard composed of unconsolidated material
626 | U	SHSU	U.SHSU	shoals	hazards to surface navigation composed of unconsolidated material
627 | U	SHVU	U.SHVU	shelf valley	a valley on the shelf, generally the shoreward extension of a canyon
628 | U	SILU	U.SILU	sill	the low part of a gap or saddle separating basins
629 | U	SLPU	U.SLPU	slope	the slope seaward from the shelf edge to the beginning of a continental rise or the point where there is a general reduction in slope
630 | U	SMSU	U.SMSU	seamounts	elevations rising generally more than 1,000 meters and of limited extent across the summit
631 | U	SMU	U.SMU	seamount	an elevation rising generally more than 1,000 meters and of limited extent across the summit
632 | U	SPRU	U.SPRU	spur	a subordinate elevation, ridge, or rise projecting outward from a larger feature
633 | U	TERU	U.TERU	terrace	a relatively flat horizontal or gently inclined surface, sometimes long and narrow, which is bounded by a steeper ascending slope on one side and by a steep descending slope on the opposite side
634 | U	TMSU	U.TMSU	tablemounts (or guyots)	seamounts having a comparatively smooth, flat top
635 | U	TMTU	U.TMTU	tablemount (or guyot)	a seamount having a comparatively smooth, flat top
636 | U	TNGU	U.TNGU	tongue	an elongate (tongue-like) extension of a flat sea floor into an adjacent higher feature
637 | U	TRGU	U.TRGU	trough	a long depression of the sea floor characteristically flat bottomed and steep sided, and normally shallower than a trench
638 | U	TRNU	U.TRNU	trench	a long, narrow, characteristically very deep and asymmetrical depression of the sea floor, with relatively steep sides
639 | U	VALU	U.VALU	valley	a relatively shallow, wide depression, the bottom of which usually has a continuous gradient
640 | U	VLSU	U.VLSU	valleys	a relatively shallow, wide depression, the bottom of which usually has a continuous gradient
641 | V	BUSH	V.BUSH	bush(es)	a small clump of conspicuous bushes in an otherwise bare area
642 | V	CULT	V.CULT	cultivated area	an area under cultivation
643 | V	FRST	V.FRST	forest(s)	an area dominated by tree vegetation
644 | V	FRSTF	V.FRSTF	fossilized forest	a forest fossilized by geologic processes and now exposed at the earth's surface
645 | V	GRSLD	V.GRSLD	grassland	an area dominated by grass vegetation
646 | V	GRVC	V.GRVC	coconut grove	a planting of coconut trees
647 | V	GRVO	V.GRVO	olive grove	a planting of olive trees
648 | V	GRVP	V.GRVP	palm grove	a planting of palm trees
649 | V	GRVPN	V.GRVPN	pine grove	a planting of pine trees
650 | V	HTH	V.HTH	heath	an upland moor or sandy area dominated by low shrubby vegetation including heather
651 | V	MDW	V.MDW	meadow	a small, poorly drained area dominated by grassy vegetation
652 | V	OCH	V.OCH	orchard(s)	a planting of fruit or nut trees
653 | V	SCRB	V.SCRB	scrubland	an area of low trees, bushes, and shrubs stunted by some environmental limitation
654 | V	TREE	V.TREE	tree(s)	a conspicuous tree used as a landmark
655 | V	TUND	V.TUND	tundra	a marshy, treeless, high latitude plain, dominated by mosses, lichens, and low shrub vegetation under permafrost conditions
656 | V	VIN	V.VIN	vineyard	a planting of grapevines
657 | V	VINS	V.VINS	vineyards	plantings of grapevines


--------------------------------------------------------------------------------
/mordecai/data/nat_df.csv:
--------------------------------------------------------------------------------
  1 | nationality,alpha_3_code
  2 | Afghan,AFG
  3 | Åland Island,ALA
  4 | Albanian,ALB
  5 | Algerian,DZA
  6 | American Samoan,ASM
  7 | Andorran,AND
  8 | Angolan,AGO
  9 | Anguillan,AIA
 10 | Antarctic,ATA
 11 | Antiguan,ATG
 12 | Barbudan,ATG
 13 | Argentine,ARG
 14 | Armenian,ARM
 15 | Aruban,ABW
 16 | Australian,AUS
 17 | Austrian,AUT
 18 | Azerbaijani,AZE
 19 | Azeri,AZE
 20 | Bahamian,BHS
 21 | Bahraini,BHR
 22 | Bangladeshi,BGD
 23 | Barbadian,BRB
 24 | Belarusian,BLR
 25 | Belgian,BEL
 26 | Belizean,BLZ
 27 | Beninese,BEN
 28 | Beninois,BEN
 29 | Bermudian,BMU
 30 | Bermudan,BMU
 31 | Bhutanese,BTN
 32 | Bolivian,BOL
 33 | Bonaire,BES
 34 | Bosnian,BIH
 35 | Motswana,BWA
 36 | Botswanan,BWA
 37 | Bouvet Island,BVT
 38 | Brazilian,BRA
 39 | Bruneian,BRN
 40 | Bulgarian,BGR
 41 | Burkinabé,BFA
 42 | Burundian,BDI
 43 | Cabo Verdean,CPV
 44 | Cambodian,KHM
 45 | Cameroonian,CMR
 46 | Canadian,CAN
 47 | Caymanian,CYM
 48 | Chadian,TCD
 49 | Chilean,CHL
 50 | Chinese,CHN
 51 | Christmas Islander,CXR
 52 | Cocos Island,CCK
 53 | Cocos Islander,CCK
 54 | Colombian,COL
 55 | Comoran,COM
 56 | Comorian,COM
 57 | Congolese,COG
 58 | Congolese,COD
 59 | Cook Islander,COK
 60 | Costa Rican,CRI
 61 | Ivorian,CIV
 62 | Croatian,HRV
 63 | Cuban,CUB
 64 | Curaçaoan,CUW
 65 | Cypriot,CYP
 66 | Czech,CZE
 67 | Danish,DNK
 68 | Djiboutian,DJI
 69 | Dominican,DMA
 70 | Dominican,DOM
 71 | Ecuadorian,ECU
 72 | Egyptian,EGY
 73 | Salvadoran,SLV
 74 | Equatorial Guinean,GNQ
 75 | Equatoguinean,GNQ
 76 | Eritrean,ERI
 77 | Estonian,EST
 78 | Ethiopian,ETH
 79 | Falkland Island,FLK
 80 | Faroese,FRO
 81 | Fijian,FJI
 82 | Finnish,FIN
 83 | French,FRA
 84 | French Guianese,GUF
 85 | French Polynesian,PYF
 86 | Gabonese,GAB
 87 | Gambian,GMB
 88 | Georgian,GEO
 89 | German,DEU
 90 | Ghanaian,GHA
 91 | Gibraltar,GIB
 92 | Greek,GRC
 93 | Hellenic,GRC
 94 | Greenlandic,GRL
 95 | Grenadian,GRD
 96 | Guadeloupe,GLP
 97 | Guamanian,GUM
 98 | Guambat,GUM
 99 | Guatemalan,GTM
100 | Channel Islander,GGY
101 | Guinean,GIN
102 | Bissau-Guinean,GNB
103 | Guyanese,GUY
104 | Haitian,HTI
105 | Honduran,HND
106 | Hong Kongese,HKG
107 | Hungarian,HUN
108 | Magyar,HUN
109 | Icelandic,ISL
110 | Indian,IND
111 | Indonesian,IDN
112 | Iranian,IRN
113 | Persian,IRN
114 | Iraqi,IRQ
115 | Irish,IRL
116 | Manx,IMN
117 | Israeli,ISR
118 | Italian,ITA
119 | Jamaican,JAM
120 | Japanese,JPN
121 | Channel Island,JEY
122 | Jordanian,JOR
123 | Kazakhstani,KAZ
124 | Kazakh,KAZ
125 | Kenyan,KEN
126 | I-Kiribati,KIR
127 | North Korean,PRK
128 | South Korean,KOR
129 | Kuwaiti,KWT
130 | Kyrgyzstani,KGZ
131 | Kyrgyz,KGZ
132 | Kirgiz,KGZ
133 | Kirghiz,KGZ
134 | Lao,LAO
135 | Laotian,LAO
136 | Latvian,LVA
137 | Lebanese,LBN
138 | Basotho,LSO
139 | Liberian,LBR
140 | Libyan,LBY
141 | Liechtenstein,LIE
142 | Lithuanian,LTU
143 | Luxembourgish,LUX
144 | Macanese,MAC
145 | Macedonian,MKD
146 | Malagasy,MDG
147 | Malawian,MWI
148 | Malaysian,MYS
149 | Maldivian,MDV
150 | Malian,MLI
151 | Malinese,MLI
152 | Maltese,MLT
153 | Marshallese,MHL
154 | Martiniquais,MTQ
155 | Martinican,MTQ
156 | Mauritanian,MRT
157 | Mauritian,MUS
158 | Mahoran,MYT
159 | Mexican,MEX
160 | Micronesian,FSM
161 | Moldovan,MDA
162 | Monégasque,MCO
163 | Monacan,MCO
164 | Mongolian,MNG
165 | Montenegrin,MNE
166 | Montserratian,MSR
167 | Moroccan,MAR
168 | Mozambican,MOZ
169 | Burmese,MMR
170 | Namibian,NAM
171 | Nauruan,NRU
172 | Nepali,NPL
173 | Nepalese,NPL
174 | Dutch,NLD
175 | Netherlandic,NLD
176 | New Caledonian,NCL
177 | New Zealander,NZL
178 | Nicaraguan,NIC
179 | Nigerien,NER
180 | Nigerian,NGA
181 | Niuean,NIU
182 | Norfolk Island,NFK
183 | Northern Marianan,MNP
184 | Norwegian,NOR
185 | Omani,OMN
186 | Pakistani,PAK
187 | Palauan,PLW
188 | Palestinian,PSE
189 | Panamanian,PAN
190 | Papua New Guinean,PNG
191 | Papuan,PNG
192 | Paraguayan,PRY
193 | Peruvian,PER
194 | Philippine,PHL
195 | Filipino,PHL
196 | Pitcairn Island,PCN
197 | Polish,POL
198 | Portuguese,PRT
199 | Puerto Rican,PRI
200 | Qatari,QAT
201 | Réunionese,REU
202 | Réunionnais,REU
203 | Romanian,ROU
204 | Russian,RUS
205 | Rwandan,RWA
206 | Barthélemois,BLM
207 | Saint Helenian,SHN
208 | Kittitian or Nevisian,KNA
209 | Saint Lucian,LCA
210 | Saint-Martinoise,MAF
211 | Saint-Pierrais,SPM
212 | Miquelonnais,SPM
213 | Saint Vincentian,VCT
214 | Vincentian,VCT
215 | Samoan,WSM
216 | Sammarinese,SMR
217 | São Toméan,STP
218 | Saudi,SAU
219 | Saudi Arabian,SAU
220 | Senegalese,SEN
221 | Serbian,SRB
222 | Serb,SRB
223 | Seychellois,SYC
224 | Sierra Leonean,SLE
225 | Singaporean,SGP
226 | Sint Maarten,SXM
227 | Slovak,SVK
228 | Slovenian,SVN
229 | Slovene,SVN
230 | Solomon Island,SLB
231 | Somali,SOM
232 | Somalian,SOM
233 | South African,ZAF
234 | South Sudanese,SSD
235 | Spanish,ESP
236 | Sri Lankan,LKA
237 | Sudanese,SDN
238 | Surinamese,SUR
239 | Svalbard,SJM
240 | Swazi,SWZ
241 | Swedish,SWE
242 | Swiss,CHE
243 | Syrian,SYR
244 | Taiwanese,TWN
245 | Tajikistani,TJK
246 | Tanzanian,TZA
247 | Thai,THA
248 | Timorese,TLS
249 | Togolese,TGO
250 | Tokelauan,TKL
251 | Tongan,TON
252 | Trinidadian,TTO
253 | Tobagonian,TTO
254 | Tunisian,TUN
255 | Turkish,TUR
256 | Turkmen,TKM
257 | Turkmeni,TKM
258 | Tuvaluan,TUV
259 | Ugandan,UGA
260 | Ukrainian,UKR
261 | Emirati,ARE
262 | Emirian,ARE
263 | Emiri,ARE
264 | British,GBR
265 | UK,GBR
266 | American,USA
267 | Uruguayan,URY
268 | Uzbekistani,UZB
269 | Uzbek,UZB
270 | Uzbeki,UZB
271 | Ni-Vanuatu,VUT
272 | Vanuatuan,VUT
273 | Venezuelan,VEN
274 | Vietnamese,VNM
275 | Wallisian,WLF
276 | Futunan,WLF
277 | Sahrawi,ESH
278 | Sahrawian,ESH
279 | Sahraouian,ESH
280 | Yemeni,YEM
281 | Zambian,ZMB
282 | Zimbabwean,ZWE
283 | 


--------------------------------------------------------------------------------
/mordecai/data/stopword_country_names.json:
--------------------------------------------------------------------------------
 1 | {"Afghanistan":"AFG", "Åland Islands":"ALA", "Albania":"ALB", "Algeria":"DZA",
 2 |     "American Samoa":"ASM", "Andorra":"AND", "Angola":"AGO", "Anguilla":"AIA",
 3 |     "Antarctica":"ATA", "Antigua Barbuda":"ATG", "Argentina":"ARG",
 4 |     "Armenia":"ARM", "Aruba":"ABW", "Ascension_Island":"NA", "Australia":"AUS",
 5 |     "Austria":"AUT", "Azerbaijan":"AZE", "Bahamas":"BHS", "Bahrain":"BHR",
 6 |     "Bangladesh":"BGD", "Barbados":"BRB", "Belarus":"BLR", "Belgium":"BEL",
 7 |     "Belize":"BLZ", "Benin":"BEN", "Bermuda":"BMU", "Bhutan":"BTN",
 8 |     "Bolivia":"BOL", "Bosnia_Herzegovina":"BIH",
 9 |     "Botswana":"BWA", "Bouvet Island":"BVT", "Brazil":"BRA",
10 |     "Britain":"GBR", "Great_Britain":"GBR",
11 |     "British Virgin Islands":"VGB", "Brunei":"BRN", "Bulgaria":"BGR", "Burkina_Faso":"BFA",
12 |     "Burundi":"BDI", "Cambodia":"KHM", "Cameroon":"CMR",
13 |     "Canada":"CAN","Cape Verde":"CPV", "Cayman_Islands":"CYM",
14 |     "Central African Republic":"CAF", "Chad":"TCD", "Chile":"CHL", "China":"CHN",
15 |     "Cocos_Islands":"CCK", "Colombia":"COL",
16 |     "Comoros":"COM", "Congo Brazzaville":"COG", "Congo Kinshasa":"COD",
17 |     "Congo":"COG", "Cook_Islands":"COK",
18 |     "Costa_Rica":"CRI", "Cote Ivoire":"CIV", "Ivory_Coast":"CIV","Croatia":"HRV", "Cuba":"CUB",
19 |     "Curaçao":"CUW", "Cyprus":"CYP", "Czech_Republic":"CZE", "Denmark":"DNK",
20 |     "Djibouti":"DJI", "Dominica":"DMA", "Dominican_Republic":"DOM",
21 |     "Ecuador":"ECU", "Egypt":"EGY", "El_Salvador":"SLV",
22 |     "Equatorial_Guinea":"GNQ", "Eritrea":"ERI", "Estonia":"EST", "Ethiopia":"ETH",
23 |     "Falkland_Islands":"FLK", "Faroe_Islands":"FRO",
24 |     "Fiji":"FJI", "Finland":"FIN", "France":"FRA", "French_Guiana":"GUF",
25 |     "French_Polynesia":"PYF","Gabon":"GAB",
26 |     "Gambia":"GMB", "Gaza":"PSE", "Georgia":"GEO", "Germany":"DEU", "Ghana":"GHA",
27 |     "Gibraltar":"GIB", "Greece":"GRC", "Greenland":"GRL", "Grenada":"GRD",
28 |     "Guadeloupe":"GLP", "Guam":"GUM", "Guatemala":"GTM", "Guernsey":"GGY",
29 |     "Guinea":"GIN", "Guinea_Bissau":"GNB", "Guyana":"GUY", "Haiti":"HTI","Honduras":"HND",
30 |     "Hong_Kong":"HKG",  "Hungary":"HUN", "Iceland":"ISL",
31 |     "India":"IND", "Indonesia":"IDN", "Iran":"IRN", "Iraq":"IRQ", "Ireland":"IRL",
32 |     "Israel":"ISR", "Italy":"ITA", "Jamaica":"JAM", "Japan":"JPN",
33 |     "Jordan":"JOR", "Kazakhstan":"KAZ", "Kenya":"KEN",
34 |     "Kiribati":"KIR", "Kuwait":"KWT", "Kyrgyzstan":"KGZ", "Laos":"LAO",
35 |     "Latvia":"LVA", "Lebanon":"LBN", "Lesotho":"LSO", "Liberia":"LBR",
36 |     "Libya":"LBY", "Liechtenstein":"LIE", "Lithuania":"LTU", "Luxembourg":"LUX",
37 |     "Macau":"MAC", "Macedonia":"MKD", "Madagascar":"MDG", "Malawi":"MWI",
38 |     "Malaysia":"MYS", "Maldives":"MDV", "Mali":"MLI", "Malta":"MLT", "Marshall_Islands":"MHL",
39 |     "Martinique":"MTQ", "Mauritania":"MRT", "Mauritius":"MUS",
40 |     "Mayotte":"MYT", "Mexico":"MEX", "Micronesia":"FSM", "Moldova":"MDA",
41 |     "Monaco":"MCO", "Mongolia":"MNG", "Montenegro":"MNE", "Montserrat":"MSR",
42 |     "Morocco":"MAR", "Mozambique":"MOZ", "Myanmar":"MMR", "Burma":"MMR", "Namibia":"NAM",
43 |     "Nauru":"NRU", "Nepal":"NPL", "Netherlands":"NLD", "Netherlands Antilles":"ANT",
44 |     "New Caledonia":"NCL", "New_Zealand":"NZL", "Nicaragua":"NIC",
45 |     "Niger":"NER", "Nigeria":"NGA", "Niue":"NIU", "North_Korea":"PRK",
46 |     "Northern Ireland":"IRL", "Northern Mariana Islands":"MNP",
47 |     "Norway":"NOR", "Oman":"OMN", "Pakistan":"PAK",
48 |     "Palau":"PLW", "Palestinian_Territories":"PSE", "Palestine":"PSE","Panama":"PAN", "Papua New Guinea":"PNG",
49 |     "Paraguay":"PRY", "Peru":"PER", "Philippines":"PHL", "Pitcairn_Islands":"PCN",
50 |     "Poland":"POL", "Portugal":"PRT", "Puerto_Rico":"PRI",
51 |     "Qatar":"QAT", "Réunion":"REU", "Romania":"ROU", "Russia":"RUS",
52 |     "Rwanda":"RWA", "Saint Barthélemy":"BLM", "Saint Helena":"SHN",
53 |     "Saint Kitts Nevis":"KNA", "Saint Lucia":"LCA",
54 |     "Saint Pierre Miquelon":"SPM", "Saint Vincent Grenadines":"VCT",
55 |     "Samoa":"WSM", "San_Marino":"SMR", "São Tomé Príncipe":"STP", "Saudi_Arabia":"SAU",
56 |     "Senegal":"SEN", "Serbia":"SRB",
57 |     "Seychelles":"SYC", "Sierra_Leone":"SLE", "Singapore":"SGP", "Sint Maarten":"SXM",
58 |     "Slovakia":"SVK", "Slovenia":"SVN", "Solomon_Islands":"SLB",
59 |     "Somalia":"SOM", "South_Africa":"ZAF",
60 |     "South_Korea":"KOR", "South Sudan":"SSD", "Spain":"ESP", "Sri_Lanka":"LKA", "Sudan":"SDN",
61 |     "Suriname":"SUR", "Svalbard Jan Mayen":"SJM",
62 |     "Swaziland":"SWZ", "Sweden":"SWE", "Switzerland":"CHE", "Syria":"SYR",
63 |     "Taiwan":"TWN", "Tajikistan":"TJK", "Tanzania":"TZA", "Thailand":"THA",
64 |     "Timor Leste":"TLS", "East_Timor":"TLS","Togo":"TGO", "Tokelau":"TKL", "Tonga":"TON", "Trinidad Tobago":"TTO",
65 |     "Tunisia":"TUN", "Turkey":"TUR",
66 |     "Turkmenistan":"TKM", "Turks Caicos Islands":"TCA", "Tuvalu":"TUV", "U.S. Minor Outlying Islands":"UMI",
67 |     "Virgin_Islands":"VIR", "Uganda":"UGA",
68 |     "Ukraine":"UKR", "United_Arab_Emirates":"ARE", "United_Kingdom":"GBR",
69 |     "UK":"GBR", "United_States":"USA", "USA":"USA", "America":"USA",
70 |     "Uruguay":"URY", "Uzbekistan":"UZB", "Vanuatu":"VUT", "Vatican":"VAT", "Venezuela":"VEN",
71 |     "Vietnam":"VNM", "Wallis Futuna":"WLF",
72 |     "Western_Sahara":"ESH", "Yemen":"YEM", "Zambia":"ZMB", "Zimbabwe":"ZWE"}
73 | 


--------------------------------------------------------------------------------
/mordecai/geoparse.py:
--------------------------------------------------------------------------------
   1 | from tensorflow import keras
   2 | import pandas as pd
   3 | import numpy as np
   4 | from collections import Counter
   5 | import editdistance
   6 | import pkg_resources
   7 | import spacy
   8 | from . import utilities
   9 | from multiprocessing.pool import ThreadPool
  10 | from elasticsearch.exceptions import ConnectionTimeout, ConnectionError
  11 | import multiprocessing
  12 | from tqdm import tqdm
  13 | import warnings
  14 | import re
  15 | 
  16 | import traceback
  17 | 
  18 | try:
  19 |     from functools import lru_cache
  20 | except ImportError:
  21 |     from backports.functools_lru_cache import lru_cache
  22 |     print("Mordecai requires Python 3 and seems to be running in Python 2.")
  23 | 
  24 | 
  25 | class Geoparser:
  26 |     def __init__(self, nlp=None, es_hosts=None, es_port=None, es_ssl=False, es_auth=None,
  27 |                  verbose=False, country_threshold=0.6, threads=True,
  28 |                  progress=True, training=None, models_path=None, **kwargs):
  29 |         DATA_PATH = pkg_resources.resource_filename('mordecai', 'data/')
  30 |         if not models_path:
  31 |             models_path = pkg_resources.resource_filename('mordecai', 'models/')
  32 |             print("Models path:", models_path)
  33 |         if nlp:
  34 |             self.nlp = nlp
  35 |         else:
  36 |             try:
  37 |                 self.nlp = spacy.load('en_core_web_lg', disable=['parser', 'tagger']) 
  38 |             except OSError:
  39 |                 print("""ERROR: No spaCy NLP model installed. Install with this command: 
  40 |                 `python -m spacy download en_core_web_lg`.""") 
  41 |         self._cts = utilities.country_list_maker()
  42 |         self._just_cts = utilities.country_list_maker()
  43 |         self._inv_cts = utilities.make_inv_cts(self._cts)
  44 |         country_state_city = utilities.other_vectors()
  45 |         self._cts.update(country_state_city)
  46 |         self._ct_nlp = utilities.country_list_nlp(self._cts)
  47 |         self._prebuilt_vec = [w.vector for w in self._ct_nlp]
  48 |         self._both_codes = utilities.make_country_nationality_list(self._cts, DATA_PATH + "nat_df.csv")
  49 |         self._admin1_dict = utilities.read_in_admin1(DATA_PATH + "admin1CodesASCII.json")
  50 |         self.conn = utilities.setup_es(es_hosts, es_port, es_ssl, es_auth)
  51 |         if not training:
  52 |             # when retraining models, don't load old models
  53 |             self.country_model = keras.models.load_model(models_path + "country_model.h5")
  54 |             self.rank_model = keras.models.load_model(models_path + "rank_model.h5")
  55 |         elif training == "ranker":
  56 |             self.country_model = keras.models.load_model(models_path + "country_model.h5")
  57 |         self._skip_list = utilities.make_skip_list(self._cts)
  58 |         self.training_setting = False  # make this true if you want training formatted
  59 |         # if the best country guess is below the country threshold, don't return anything at all
  60 |         self.country_threshold = country_threshold
  61 |         feature_codes = pd.read_csv(DATA_PATH + "feature_codes.txt", sep="\t", header=None)
  62 |         self._code_to_text = dict(zip(feature_codes[1], feature_codes[3]))  # human readable geonames IDs
  63 |         self.verbose = verbose  # return the full dictionary or just the good parts?
  64 |         self.progress = progress  # display progress bars?
  65 |         self.threads = threads
  66 |         if 'n_threads' in kwargs.keys():
  67 |             warnings.warn("n_threads is deprecated. Use threads=True instead.", DeprecationWarning)
  68 |         try:
  69 |             # https://www.reddit.com/r/Python/comments/3a2erd/exception_catch_not_catching_everything/
  70 |             # with nostderr():
  71 |             self.conn.count()
  72 |         except:
  73 |             raise ConnectionError("""Could not establish contact with Elasticsearch at {0} on port {1}.
  74 | Are you sure it's running?
  75 | Mordecai needs access to the Geonames/Elasticsearch gazetteer to function.
  76 | See https://github.com/openeventdata/mordecai#installation-and-requirements
  77 | for instructions on setting up Geonames/Elasticsearch""".format(es_hosts, es_port))
  78 |         es_date = utilities.check_geonames_date(self.conn)
  79 |         mod_date = "2020-07-11"
  80 |         if es_date != mod_date:
  81 |             print("""You may be using an outdated Geonames index/Mordecai version.
  82 | Your index is from {0}, while your Mordecai version is from {1}. Please see
  83 | https://github.com/openeventdata/mordecai/ for instructions on updating.""".format(es_date, mod_date))
  84 | 
  85 | 
  86 |     def _feature_country_mentions(self, doc):
  87 |         """
  88 |         Given a document, count how many times different country names and adjectives are mentioned.
  89 |         These are features used in the country picking phase.
  90 | 
  91 |         Parameters
  92 |         ---------
  93 |         doc: a spaCy nlp'ed piece of text
  94 | 
  95 |         Returns
  96 |         -------
  97 |         countries: dict
  98 |             the top two countries (ISO code) and their frequency of mentions.
  99 |         """
 100 |         c_list = []
 101 |         for i in doc.ents:
 102 |             try:
 103 |                 country = self._both_codes[i.text]
 104 |                 c_list.append(country)
 105 |             except KeyError:
 106 |                 pass
 107 |         count = Counter(c_list).most_common()
 108 |         try:
 109 |             top, top_count = count[0]
 110 |         except:
 111 |             top = ""
 112 |             top_count = 0
 113 |         try:
 114 |             two, two_count = count[1]
 115 |         except:
 116 |             two = ""
 117 |             two_count = 0
 118 | 
 119 |         countries = (top, top_count, two, two_count)
 120 |         return countries
 121 | 
 122 | 
 123 |     def clean_entity(self, ent):
 124 |         """
 125 |         Strip out extra words that often get picked up by spaCy's NER.
 126 | 
 127 |         To do: preserve info about what got stripped out to help with ES/Geonames
 128 |             resolution later.
 129 | 
 130 |         Parameters
 131 |         ---------
 132 |         ent: a spaCy named entity Span
 133 | 
 134 |         Returns
 135 |         -------
 136 |         new_ent: a spaCy Span, with extra words stripped out.
 137 | 
 138 |         """
 139 |         dump_list = ['province', 'the', 'area', 'airport', 'district', 'square',
 140 |                     'town', 'village', 'prison', "river", "valley", "provincial", "prison",
 141 |                     "region", "municipality", "state", "territory", "of", "in",
 142 |                     "county", "central"]
 143 |         keep_positions = []
 144 |         for word in ent:
 145 |             if word.text.lower() not in dump_list:
 146 |                 keep_positions.append(word.i)
 147 | 
 148 |         keep_positions = np.asarray(keep_positions)
 149 |         try:
 150 |             new_ent = ent.doc[keep_positions.min():keep_positions.max() + 1]
 151 |             # can't set directly
 152 |             #new_ent.label_.__set__(ent.label_)
 153 |         except ValueError:
 154 |             new_ent = ent
 155 |         return new_ent
 156 | 
 157 | 
 158 |     def _feature_most_common(self, results):
 159 |         """
 160 |         Find the most common country name in ES/Geonames results
 161 | 
 162 |         Paramaters
 163 |         ----------
 164 |         results: dict
 165 |             output of `query_geonames`
 166 | 
 167 |         Returns
 168 |         -------
 169 |         most_common: str
 170 |             ISO code of most common country, or empty string if none
 171 |         """
 172 |         try:
 173 |             country_count = Counter([i['country_code3'] for i in results['hits']['hits']])
 174 |             most_common = country_count.most_common()[0][0]
 175 |             return most_common
 176 |         except IndexError:
 177 |             return ""
 178 |         except TypeError:
 179 |             return ""
 180 | 
 181 | 
 182 |     def _feature_most_alternative(self, results, full_results=False):
 183 |         """
 184 |         Find the placename with the most alternative names and return its country.
 185 |         More alternative names are a rough measure of importance.
 186 | 
 187 |         Paramaters
 188 |         ----------
 189 |         results: dict
 190 |             output of `query_geonames`
 191 | 
 192 |         Returns
 193 |         -------
 194 |         most_alt: str
 195 |             ISO code of country of place with most alternative names,
 196 |             or empty string if none
 197 |         """
 198 |         try:
 199 |             alt_names = [len(i['alternativenames']) for i in results['hits']['hits']]
 200 |             most_alt = results['hits']['hits'][np.array(alt_names).argmax()]
 201 |             if full_results:
 202 |                 return most_alt
 203 |             else:
 204 |                 return most_alt['country_code3']
 205 |         except (IndexError, ValueError, TypeError):
 206 |             return ""
 207 | 
 208 | 
 209 |     def _feature_most_population(self, results):
 210 |         """
 211 |         Find the placename with the largest population and return its country.
 212 |         More population is a rough measure of importance.
 213 | 
 214 |         Paramaters
 215 |         ----------
 216 |         results: dict
 217 |             output of `query_geonames`
 218 | 
 219 |         Returns
 220 |         -------
 221 |         most_pop: str
 222 |             ISO code of country of place with largest population,
 223 |             or empty string if none
 224 |         """
 225 | 
 226 |         try:
 227 |             populations = [i['population'] for i in results['hits']['hits']]
 228 |             most_pop = results['hits']['hits'][np.array(populations).astype("int").argmax()]
 229 |             return most_pop['country_code3']
 230 |         except Exception as e:
 231 |             return ""
 232 | 
 233 | 
 234 |     def _feature_word_embedding(self, text):
 235 |         """
 236 |         Given a word, guess the appropriate country by word vector.
 237 | 
 238 |         Parameters
 239 |         ---------
 240 |         text: str
 241 |             the text to extract locations from.
 242 | 
 243 |         Returns
 244 |         -------
 245 |         country_picking: dict
 246 |             The top two countries (ISO codes) and two measures
 247 |             confidence for the first choice.
 248 |         """
 249 |         try:
 250 |             simils = np.dot(self._prebuilt_vec, text.vector)
 251 |         except Exception as e:
 252 |             #print("Vector problem, ", Exception, e)
 253 |             return {"country_1" : "",
 254 |                 "confid_a" : 0,
 255 |                 "confid_b" : 0,
 256 |                 "country_2" : ""}
 257 |         ranks = simils.argsort()[::-1]
 258 |         confid = simils.max()
 259 |         confid2 = simils[ranks[0]] - simils[ranks[1]]
 260 |         if confid == 0 or confid2 == 0:
 261 |             return ""
 262 |         country_code = self._cts[str(self._ct_nlp[ranks[0]])]
 263 |         country_picking = {"country_1" : country_code,
 264 |                 "confid_a" : confid,
 265 |                 "confid_b" : confid2,
 266 |                 "country_2" : self._cts[str(self._ct_nlp[ranks[1]])]}
 267 |         return country_picking
 268 | 
 269 | 
 270 |     def _feature_first_back(self, results):
 271 |         """
 272 |         Get the country of the first two results back from geonames.
 273 | 
 274 |         Parameters
 275 |         -----------
 276 |         results: dict
 277 |             elasticsearch results
 278 | 
 279 |         Returns
 280 |         -------
 281 |         top: tuple
 282 |             first and second results' country name (ISO)
 283 |         """
 284 |         try:
 285 |             first_back = results['hits']['hits'][0]['country_code3']
 286 |         except (TypeError, IndexError):
 287 |             # usually occurs if no Geonames result
 288 |             first_back = ""
 289 |         try:
 290 |             second_back = results['hits']['hits'][1]['country_code3']
 291 |         except (TypeError, IndexError):
 292 |             second_back = ""
 293 |         top = (first_back, second_back)
 294 |         return top
 295 | 
 296 | 
 297 |     def is_country(self, text):
 298 |         """Check if a piece of text is in the list of countries"""
 299 |         ct_list = self._just_cts.keys()
 300 |         if text in ct_list:
 301 |             return True
 302 |         else:
 303 |             return False
 304 | 
 305 | 
 306 |     @lru_cache(maxsize=250)
 307 |     def query_geonames(self, placename):
 308 |         """
 309 |         Wrap search parameters into an elasticsearch query to the geonames index
 310 |         and return results.
 311 | 
 312 |         Parameters
 313 |         ---------
 314 |         conn: an elasticsearch Search conn, like the one returned by `setup_es()`
 315 | 
 316 |         placename: str
 317 |             the placename text extracted by NER system
 318 | 
 319 |         Returns
 320 |         -------
 321 |         out: The raw results of the elasticsearch query
 322 |         """
 323 |         # first first, try for country name
 324 |         if self.is_country(placename):
 325 |             q = {"multi_match": {"query": placename,
 326 |                                  "fields": ['name', 'asciiname', 'alternativenames'],
 327 |                                 "type" : "phrase"}}
 328 |             res = self.conn.filter("term", feature_code='PCLI').query(q)[0:5].execute()  # always 5
 329 |         else:
 330 |             # second, try for an exact phrase match
 331 |             q = {"multi_match": {"query": placename,
 332 |                                  "fields": ['name^5', 'asciiname^5', 'alternativenames'],
 333 |                                 "type" : "phrase"}}
 334 |             res = self.conn.query(q)[0:50].execute()
 335 |             # if no results, use some fuzziness, but still require all terms to be present.
 336 |             # Fuzzy is not allowed in "phrase" searches.
 337 |             if res.hits.total == 0:
 338 |                 # tried wrapping this in a {"constant_score" : {"query": ... but made it worse
 339 |                 q = {"multi_match": {"query": placename,
 340 |                                      "fields": ['name', 'asciiname', 'alternativenames'],
 341 |                                          "fuzziness" : 1,
 342 |                                          "operator":   "and"
 343 |                                      }
 344 |                     }
 345 |                 res = self.conn.query(q)[0:50].execute()
 346 |         es_result = utilities.structure_results(res)
 347 |         return es_result
 348 | 
 349 | 
 350 |     #@lru_cache(maxsize=250)  # cache won't work with dictionary inputs
 351 |     def query_geonames_country(self, placename, country, filter_params=None):
 352 |         """
 353 |         Like query_geonames, but limited to a specified country or (optionally) another filter.
 354 | 
 355 |         The filter_params argument can be used to limit results to a particular adm1 (e.g.
 356 |         {"adm1" : "09"}) or feature type {"feature_code" : "adm1"}.
 357 | 
 358 |         Parameters
 359 |         ---------
 360 |         placename: str, the place name to search for
 361 |         country: str, country to limit search to in ISO 3 char code
 362 |         filter_params: dict, a further filter to apply, e.g. {"feature_code":"ADM1"}
 363 | 
 364 |         Returns
 365 |         ------
 366 |         out: dict, the structured geonames results
 367 |         """
 368 |         # first, try for an exact phrase match
 369 |         q = {"multi_match": {"query": placename,
 370 |                              "fields": ['name^5', 'asciiname^5', 'alternativenames'],
 371 |                             "type": "phrase"}}
 372 |         if filter_params:
 373 |             res = self.conn.filter("term", **filter_params).filter("term", country_code3=country).query(q)[0:50].execute()
 374 |         else:
 375 |             res = self.conn.filter("term", country_code3=country).query(q)[0:50].execute()
 376 | 
 377 |         # if no results, use some fuzziness, but still require all terms to be present.
 378 |         # Fuzzy is not allowed in "phrase" searches.
 379 |         if res.hits.total == 0:
 380 |             # tried wrapping this in a {"constant_score" : {"query": ... but made it worse
 381 |             q = {"multi_match": {"query": placename,
 382 |                                  "fields": ['name', 'asciiname', 'alternativenames'],
 383 |                                      "fuzziness": 2,
 384 |                                      "operator":   "and"}}
 385 |             if filter_params:
 386 |                 res = self.conn.filter("term", **filter_params).filter("term", country_code3=country).query(q)[0:50].execute()
 387 |             else:
 388 |                 res = self.conn.filter("term", country_code3=country).query(q)[0:50].execute()
 389 |         out = utilities.structure_results(res)
 390 |         return out
 391 | 
 392 |  
 393 | 
 394 |     # The following three lookup functions are used for the threaded queries.
 395 |     def proc_lookup(self, loc):
 396 |         try:
 397 |             loc = self.query_geonames(loc['word'])
 398 |         except ConnectionTimeout:
 399 |             loc = ""
 400 |         return loc
 401 | 
 402 | 
 403 |     def proc_lookup_country(self, loc):
 404 |         if loc['country_conf'] >= self.country_threshold:
 405 |             loc = self.query_geonames_country(loc['word'], loc['country_predicted'])
 406 |             return loc
 407 |         else:
 408 |             return ""
 409 | 
 410 | 
 411 |     def simple_lookup(self, word):
 412 |         try:
 413 |             loc = self.query_geonames(word)
 414 |         except ConnectionTimeout:
 415 |             loc = ""
 416 |         return loc
 417 | 
 418 | 
 419 |     def _feature_location_type_mention(self, ent):
 420 |         """
 421 |         Count forward 1 word from each entity, looking for defined terms that indicate
 422 |         geographic feature types (e.g. "village" = "P").
 423 | 
 424 |         Parameters
 425 |         -----------
 426 |         ent : spacy entity span
 427 |             It has to be an entity to handle indexing in the document
 428 | 
 429 |         Returns
 430 |         --------
 431 |         tuple (length 2)
 432 |             (feature_code, feature_class) derived from explicit word usage
 433 | 
 434 |         """
 435 | 
 436 |         P_list = ["city", "cities", "town", "towns", "villages", "village", "settlement",
 437 |                   "capital", "town", "towns", "neighborhood", "neighborhoods",
 438 |                  "municipality"]
 439 |         ADM1_list = ["province", "governorate", "state", "department", "oblast",
 440 |                      "changwat", "countryside"]
 441 |         ADM2_list = ["district", "rayon", "amphoe", "county"]
 442 |         A_other = ["region"]
 443 |         AIRPORT_list = ["airport"]
 444 |         TERRAIN_list = ["mountain", "mountains", "stream", "river"]
 445 |         FOREST_list = ["forest"]
 446 |         # TODO: incorporate positions, especially now that we don't split by
 447 |         # sentence
 448 |         feature_positions = []
 449 |         feature_class = feature_code = ""
 450 | 
 451 |         interest_words = ent.doc[ent.end - 1 : ent.end + 1]  # last word or next word following
 452 | 
 453 |         for word in interest_words:
 454 |             if ent.text in self._just_cts.keys():
 455 |                 feature_class = "A"
 456 |                 feature_code = "PCLI"
 457 |             elif word.text.lower() in P_list:
 458 |                 feature_class = "P"
 459 |                 feature_code = ""
 460 |             elif word.text.lower() in ADM1_list:
 461 |                 feature_class = "A"
 462 |                 feature_code = "ADM1"
 463 |             elif word.text.lower() in ADM2_list:
 464 |                 feature_class = "A"
 465 |                 feature_code = "ADM2"
 466 |             elif word.text.lower() in TERRAIN_list:
 467 |                 feature_class = "T"
 468 |                 feature_code = ""
 469 |             elif word.text.lower() in AIRPORT_list:
 470 |                 feature_class = "S"
 471 |                 feature_code = "AIRP"
 472 |             elif word.text.lower() in A_other:
 473 |                 feature_class = "A"
 474 |                 feature_code = ""
 475 |         return (feature_class, feature_code)
 476 | 
 477 | 
 478 |     def make_country_features(self, doc, require_maj=False):
 479 |         """
 480 |         Create features for the country picking model. Function where all the individual
 481 |         feature maker functions are called and aggregated. (Formerly "process_text")
 482 | 
 483 |         Parameters
 484 |         -----------
 485 |         doc : str or spaCy doc
 486 | 
 487 |         Returns
 488 |         -------
 489 |         task_list : list of dicts
 490 |             Each entry has the word, surrounding text, span, and the country picking features.
 491 |             This output can be put into Prodigy for labeling almost as-is (the "features" key needs
 492 |             to be renamed "meta" or be deleted.)
 493 |         """
 494 |         if not hasattr(doc, "ents"):
 495 |             doc = self.nlp(doc)
 496 |         # initialize the place to store finalized tasks
 497 |         task_list = []
 498 | 
 499 |         # get document vector
 500 |         #doc_vec = self._feature_word_embedding(text)['country_1']
 501 | 
 502 |         # get explicit counts of country names
 503 |         ct_mention, ctm_count1, ct_mention2, ctm_count2 = self._feature_country_mentions(doc)
 504 | 
 505 |         #  pull out the place names, skipping empty ones, countries, and known
 506 |         #  junk from the skip list (like "Atlanic Ocean"
 507 |         ents = []
 508 |         for ent in doc.ents:
 509 |             if not ent.text.strip():
 510 |                 continue
 511 |             if ent.label_ not in ["GPE", "LOC", "FAC"]:
 512 |                 continue
 513 |             # don't include country names (make a parameter)
 514 |             if ent.text.strip() in self._skip_list:
 515 |                 continue
 516 |             ents.append(ent)
 517 |         if not ents:
 518 |             return []
 519 |         # Look them up in geonames, either sequentially if no threading, or
 520 |         # in parallel if threads.
 521 |         if self.threads:
 522 |             pool = ThreadPool(len(ents))
 523 |             ent_text = [i.text for i in ents]
 524 |             ent_results = pool.map(self.simple_lookup, ent_text)
 525 |             pool.close()
 526 |             pool.join()
 527 |         else:
 528 |             ent_results = []
 529 |             for ent in ents:
 530 |                 try:
 531 |                     result = self.query_geonames(ent.text)
 532 |                 except ConnectionTimeout:
 533 |                     result = ""
 534 |                 ent_results.append(result)
 535 | 
 536 |         for n, ent in enumerate(ents):
 537 |             result = ent_results[n]
 538 |             #skip_list.add(ent.text.strip())
 539 |             ent_label = ent.label_  # destroyed by trimming
 540 |             ent = self.clean_entity(ent)
 541 | 
 542 |             # vector for just the solo word
 543 |             vp = self._feature_word_embedding(ent)
 544 |             try:
 545 |                 word_vec = vp['country_1']
 546 |                 wv_confid = float(vp['confid_a'])
 547 |             except TypeError:
 548 |                 # no idea why this comes up
 549 |                 word_vec = ""
 550 |                 wv_confid = "0"
 551 | 
 552 |             # look for explicit mentions of feature names
 553 |             class_mention, code_mention = self._feature_location_type_mention(ent)
 554 |             # build results-based features
 555 |             most_alt = self._feature_most_alternative(result)
 556 |             # TODO check if most_common feature really isn't that useful
 557 |             most_common = self._feature_most_common(result)
 558 |             most_pop = self._feature_most_population(result)
 559 |             first_back, second_back = self._feature_first_back(result)
 560 | 
 561 |             try:
 562 |                 maj_vote = Counter([word_vec, most_alt,
 563 |                                     first_back, most_pop,
 564 |                                     ct_mention
 565 |                                     #doc_vec_sent, doc_vec
 566 |                                     ]).most_common()[0][0]
 567 |             except Exception as e:
 568 |                 print("Problem taking majority vote: ", ent, e)
 569 |                 maj_vote = ""
 570 | 
 571 |             if not maj_vote:
 572 |                 maj_vote = ""
 573 |             # We only want all this junk for the labeling task. We just want to straight to features
 574 |             # and the model when in production.
 575 |             try:
 576 |                 start = ent.start_char
 577 |                 end = ent.end_char
 578 |                 iso_label = maj_vote
 579 |                 try:
 580 |                     text_label = self._inv_cts[iso_label]
 581 |                 except KeyError:
 582 |                     text_label = ""
 583 |                 task = {"text" : ent.text,
 584 |                         "label" : text_label,  # human-readable country name
 585 |                         "word" : ent.text,
 586 |                         "spans" : [{
 587 |                             "start" : start,
 588 |                             "end" : end,
 589 |                             }  # make sure to rename for Prodigy
 590 |                                 ],
 591 |                         "features" : {
 592 |                                 "maj_vote" : iso_label,
 593 |                                 "word_vec" : word_vec,
 594 |                                 "first_back" : first_back,
 595 |                                 #"doc_vec" : doc_vec,
 596 |                                 "most_alt" : most_alt,
 597 |                                 "most_pop" : most_pop,
 598 |                                 "ct_mention" : ct_mention,
 599 |                                 "ctm_count1" : ctm_count1,
 600 |                                 "ct_mention2" : ct_mention2,
 601 |                                 "ctm_count2" : ctm_count2,
 602 |                                 "wv_confid" : wv_confid,
 603 |                                 "class_mention" : class_mention,  # inferred geonames class from mentions
 604 |                                 "code_mention" : code_mention,
 605 |                                 #"places_vec" : places_vec,
 606 |                                 #"doc_vec_sent" : doc_vec_sent
 607 |                                 }
 608 |                         }
 609 |                 task_list.append(task)
 610 |             except Exception as e:
 611 |                 print(ent.text,)
 612 |                 print(e)
 613 |         return task_list  # rename this var
 614 |     # Two modules that call `make_country_features`:
 615 |     #  1. write out with majority vote for training
 616 |     #  2. turn into features, run model, return countries
 617 |     #  A third, standalone function will convert the labeled JSON from Prodigy into
 618 |     #    features for updating the model.
 619 | 
 620 | 
 621 |     def make_country_matrix(self, loc):
 622 |         """
 623 |         Create features for all possible country labels, return as matrix for keras.
 624 | 
 625 |         Parameters
 626 |         ----------
 627 |         loc: dict
 628 |             one entry from the list of locations and features that come out of make_country_features
 629 | 
 630 |         Returns
 631 |         --------
 632 |         keras_inputs: dict with two keys, "label" and "matrix"
 633 |         """
 634 | 
 635 |         top = loc['features']['ct_mention']
 636 |         top_count = loc['features']['ctm_count1']
 637 |         two = loc['features']['ct_mention2']
 638 |         two_count = loc['features']['ctm_count2']
 639 |         word_vec = loc['features']['word_vec']
 640 |         first_back = loc['features']['first_back']
 641 |         most_alt = loc['features']['most_alt']
 642 |         most_pop = loc['features']['most_pop']
 643 | 
 644 |         possible_labels = set([top, two, word_vec, first_back, most_alt, most_pop])
 645 |         possible_labels = [i for i in possible_labels if i]
 646 | 
 647 |         X_mat = []
 648 | 
 649 |         for label in possible_labels:
 650 |             inputs = np.array([word_vec, first_back, most_alt, most_pop])
 651 |             x = inputs == label
 652 |             x = np.asarray((x * 2) - 1) # convert to -1, 1
 653 | 
 654 |             # get missing values
 655 |             exists = inputs != ""
 656 |             exists = np.asarray((exists * 2) - 1)
 657 | 
 658 |             counts = np.asarray([top_count, two_count])  # cludgy, should be up with "inputs"
 659 |             right = np.asarray([top, two]) == label
 660 |             right = right * 2 - 1
 661 |             right[counts == 0] = 0
 662 | 
 663 |             # get correct values
 664 |             features = np.concatenate([x, exists, counts, right])
 665 |             X_mat.append(np.asarray(features))
 666 | 
 667 |         keras_inputs = {"labels": possible_labels,
 668 |                         "matrix": np.asmatrix(X_mat),
 669 |                         "word": loc['word']}
 670 |         return keras_inputs
 671 | 
 672 | 
 673 | 
 674 |     def infer_country(self, doc):
 675 |         """NLP a doc, find its entities, get their features, and return the model's country guess for each.
 676 |         Maybe use a better name.
 677 | 
 678 |         Parameters
 679 |         -----------
 680 |         doc: str or spaCy
 681 |             the document to country-resolve the entities in
 682 | 
 683 |         Returns
 684 |         -------
 685 |         proced: list of dict
 686 |             the feature output of "make_country_features" updated with the model's
 687 |             estimated country for each entity.
 688 |             E.g.:
 689 |                 {'all_confidence': array([ 0.95783567,  0.03769876,  0.00454875], dtype=float32),
 690 |                   'all_countries': array(['SYR', 'USA', 'JAM'], dtype='<U3'),
 691 |                   'country_conf': 0.95783567,
 692 |                   'country_predicted': 'SYR',
 693 |                   'features': {'ct_mention': '',
 694 |                        'ct_mention2': '',
 695 |                        'ctm_count1': 0,
 696 |                        'ctm_count2': 0,
 697 |                        'first_back': 'JAM',
 698 |                        'maj_vote': 'SYR',
 699 |                        'most_alt': 'USA',
 700 |                        'most_pop': 'SYR',
 701 |                        'word_vec': 'SYR',
 702 |                        'wv_confid': '29.3188'},
 703 |                   'label': 'Syria',
 704 |                   'spans': [{'end': 26, 'start': 20}],
 705 |                   'text': "There's fighting in Aleppo and Homs.",
 706 |                   'word': 'Aleppo'}
 707 | 
 708 |         """
 709 |         if not hasattr(doc, "ents"):
 710 |             doc = self.nlp(doc)
 711 |         proced = self.make_country_features(doc, require_maj=False)
 712 |         if not proced:
 713 |             pass
 714 |         feat_list = []
 715 | 
 716 |         for loc in proced:
 717 |             feat = self.make_country_matrix(loc)
 718 |             #print("feat:", feat)
 719 |             #labels = loc['labels']
 720 |             feat_list.append(feat)
 721 |             #try:
 722 |             # for each potential country...
 723 |             for n, i in enumerate(feat_list):
 724 |                 labels = i['labels']
 725 |                 try:
 726 |                     prediction = self.country_model.predict(i['matrix']).transpose()[0]
 727 |                     ranks = prediction.argsort()[::-1]
 728 |                     labels = np.asarray(labels)[ranks]
 729 |                     prediction = prediction[ranks]
 730 |                 except ValueError:
 731 |                     print(traceback.print_exc())
 732 |                     prediction = np.array([0])
 733 |                     labels = np.array([""])
 734 | 
 735 |             loc['country_predicted'] = labels[0]
 736 |             loc['country_conf'] = prediction[0]
 737 |             loc['all_countries'] = labels
 738 |             loc['all_confidence'] = prediction
 739 | 
 740 |         return proced
 741 | 
 742 |     def get_admin1(self, country_code2, admin1_code):
 743 |         """
 744 |         Convert a geonames admin1 code to the associated place name.
 745 | 
 746 |         Parameters
 747 |         ---------
 748 |         country_code2: string
 749 |                        The two character country code
 750 |         admin1_code: string
 751 |                      The admin1 code to be converted. (Admin1 is the highest
 752 |                      subnational political unit, state/region/provice/etc.
 753 |         admin1_dict: dictionary
 754 |                      The dictionary containing the country code + admin1 code
 755 |                      as keys and the admin1 names as values.
 756 | 
 757 |         Returns
 758 |         ------
 759 |         admin1_name: string
 760 |                      The admin1 name. If none is found, return "NA".
 761 |         """
 762 |         lookup_key = ".".join([country_code2, admin1_code])
 763 |         try:
 764 |             admin1_name = self._admin1_dict[lookup_key]
 765 |             return admin1_name
 766 |         except KeyError:
 767 |             #print("No admin code found for country {} and code {}".format(country_code2, admin1_code))
 768 |             return "NA"
 769 | 
 770 |     def features_for_rank(self, proc, results):
 771 |         """Compute features for ranking results from ES/geonames
 772 | 
 773 | 
 774 |         Parameters
 775 |         ----------
 776 |         proc : dict
 777 |             One dictionary from the list that comes back from geoparse or from make_country_features (doesn't matter)
 778 |         results : dict
 779 |             the response from a geonames query
 780 | 
 781 |         Returns
 782 |         --------
 783 |         X : numpy matrix
 784 |             holding the computed features
 785 | 
 786 |         meta: list of dicts
 787 |             including feature information
 788 |         """
 789 |         feature_list = []
 790 |         meta = []
 791 |         results = results['hits']['hits']
 792 |         search_name = proc['word']
 793 |         code_mention = proc['features']['code_mention']
 794 |         class_mention = proc['features']['class_mention']
 795 | 
 796 |         for rank, entry in enumerate(results):
 797 |             # go through the results and calculate some features
 798 |             # get population number and exists
 799 |             try:
 800 |                 pop = int(entry['population'])
 801 |                 has_pop = 1
 802 |             except Exception as e:
 803 |                 pop = 0
 804 |                 has_pop = 0
 805 |             if pop > 0:
 806 |                 logp = np.log(pop)
 807 |             else:
 808 |                 logp = 0
 809 |             ### order the results came back
 810 |             adj_rank = 1 / np.log(rank + 2)
 811 |             # alternative names
 812 |             len_alt = len(entry['alternativenames'])
 813 |             adj_alt = np.log(len_alt)
 814 |             ### feature class (just boost the good ones)
 815 |             if entry['feature_class'] == "A" or entry['feature_class'] == "P":
 816 |                 good_type = 1
 817 |             else:
 818 |                 good_type = 0
 819 |                 #fc_score = 3
 820 |             ### feature class/code matching
 821 |             if entry['feature_class'] == class_mention:
 822 |                 good_class_mention = 1
 823 |             else:
 824 |                 good_class_mention = 0
 825 |             if entry['feature_code'] == code_mention:
 826 |                 good_code_mention = 1
 827 |             else:
 828 |                 good_code_mention = 0
 829 |             ### edit distance
 830 |             ed = editdistance.eval(search_name, entry['name'])
 831 |             ed = ed  # shrug
 832 |             # maybe also get min edit distance to alternative names...
 833 | 
 834 |             features = [has_pop, pop, logp, adj_rank, len_alt, adj_alt,
 835 |                         good_type, good_class_mention, good_code_mention, ed]
 836 |             m = self.format_geonames(entry)
 837 | 
 838 |             feature_list.append(features)
 839 |             meta.append(m)
 840 | 
 841 |         #meta = geo.format_geonames(results)
 842 |         X = np.asmatrix(feature_list)
 843 |         return (X, meta)
 844 | 
 845 |     def ranker(self, X, meta):
 846 |         """
 847 |         Sort the place features list by the score of its relevance.
 848 |         """
 849 |         # total score is just a sum of each row
 850 |         total_score = X.sum(axis=1).transpose()
 851 |         total_score = np.squeeze(np.asarray(total_score))  # matrix to array
 852 |         ranks = total_score.argsort()
 853 |         ranks = ranks[::-1]
 854 |         # sort the list of dicts according to ranks
 855 |         sorted_meta = [meta[r] for r in ranks]
 856 |         sorted_X = X[ranks]
 857 |         return (sorted_X, sorted_meta)
 858 | 
 859 |     def format_for_prodigy(self, X, meta, placename, return_feature_subset=False):
 860 |         """
 861 |         Given a feature matrix, geonames data, and the original query,
 862 |         construct a prodigy task.
 863 | 
 864 |         Make meta nicely readable: "A town in Germany"
 865 | 
 866 |         Parameters
 867 |         ----------
 868 | 
 869 |         X: matrix
 870 |             vector of features for ranking. Output of features_for_rank()
 871 |         meta: list of dictionaries
 872 |             other place information. Output of features_for_rank(). Used to provide
 873 |             information like "city in Germany" to the coding task.
 874 |         placename: str
 875 |             The extracted place name from text
 876 | 
 877 | 
 878 |         Returns
 879 |         --------
 880 |         task_list: list of dicts
 881 |             Tasks ready to be written to JSONL and use in Prodigy. Each potential match includes
 882 |             a text description to the annotator can pick the right one.
 883 |         """
 884 | 
 885 |         all_tasks = []
 886 | 
 887 |         sorted_X, sorted_meta = self.ranker(X, meta)
 888 |         sorted_meta = sorted_meta[:4]
 889 |         sorted_X = sorted_X[:4]
 890 |         for n, i in enumerate(sorted_meta):
 891 |             feature_code = i['feature_code']
 892 |             try:
 893 |                 fc = self._code_to_text[feature_code]
 894 |             except KeyError:
 895 |                 fc = ''
 896 |             text = ''.join(['"', i['place_name'], '"',
 897 |                             ", a ", fc,
 898 |                             " in ", i['country_code3'],
 899 |                             ", id: ", i['geonameid']])
 900 |             d = {"id" : n + 1, "text" : text}
 901 |             all_tasks.append(d)
 902 | 
 903 |         if return_feature_subset:
 904 |             return (all_tasks, sorted_meta, sorted_X)
 905 |         else:
 906 |             return all_tasks
 907 | 
 908 | 
 909 |     def format_geonames(self, entry, searchterm=None):
 910 |         """
 911 |         Pull out just the fields we want from a geonames entry
 912 | 
 913 |         To do:
 914 |         - switch to model picking
 915 | 
 916 |         Parameters
 917 |         -----------
 918 |         res : dict
 919 |             ES/geonames result
 920 | 
 921 |         searchterm : str
 922 |             (not implemented). Needed for better results picking
 923 | 
 924 |         Returns
 925 |         --------
 926 |         new_res : dict
 927 |             containing selected fields from selected geonames entry
 928 |         """
 929 |         try:
 930 |             lat, lon = entry['coordinates'].split(",")
 931 |             new_res = {"admin1" : self.get_admin1(entry['country_code2'], entry['admin1_code']),
 932 |                   "lat" : lat,
 933 |                   "lon" : lon,
 934 |                   "country_code3" : entry["country_code3"],
 935 |                   "geonameid" : entry["geonameid"],
 936 |                   "place_name" : entry["name"],
 937 |                   "feature_class" : entry["feature_class"],
 938 |                    "feature_code" : entry["feature_code"]}
 939 |             return new_res
 940 |         except (IndexError, TypeError):
 941 |             # two conditions for these errors:
 942 |             # 1. there are no results for some reason (Index)
 943 |             # 2. res is set to "" because the country model was below the thresh
 944 |             new_res = {"admin1" : "",
 945 |                   "lat" : "",
 946 |                   "lon" : "",
 947 |                   "country_code3" : "",
 948 |                   "geonameid" : "",
 949 |                   "place_name" : "",
 950 |                   "feature_class" : "",
 951 |                    "feature_code" : ""}
 952 |             return new_res
 953 | 
 954 |     def _check_exact(self, placename, match_list):
 955 |         """Find Geonames entries that have an exact match place name.
 956 | 
 957 |         When multiple hits come back for a query, this looks to see if any of them have
 958 |         an exact place name match in the `alternative_names` field. If only one does,
 959 |         it returns that one. Otherwise it returns None.
 960 |         """
 961 |         exact_matches = []
 962 |         for m in match_list:
 963 |             all_names = m['alternativenames']
 964 |             all_names.append(m['name'])
 965 |             if placename in all_names:
 966 |                 exact_matches.append(m)
 967 |         if len(exact_matches) == 1:
 968 |             return exact_matches[0]
 969 |         else:
 970 |             None
 971 | 
 972 |     def _check_editdist(self, placename, matchlist, threshold=2):
 973 |         """
 974 |         Check canonical, alternative, and ascii names for a close match.
 975 | 
 976 |         Parameters
 977 |         ------------
 978 |         placename: str
 979 |           The placename being searched for
 980 |         matchlist: list
 981 |           The results from Elasticsearch
 982 |         threshold: int
 983 |           The maximum edits allowed (defaults to 2)
 984 | 
 985 |         Returns
 986 |         --------
 987 |         tuple, the edit distance and the actual match
 988 |         """
 989 |         min_dists = []
 990 |         avg_dists = []
 991 |         for m in matchlist:
 992 |             all_names = m['alternativenames']
 993 |             all_names.extend([m['asciiname'], m['name']])
 994 | 
 995 |             ds = [editdistance.eval(placename, i)  for i in all_names]
 996 |             min_dists.append(np.min(ds))
 997 |             avg_dists.append(np.mean(ds))
 998 | 
 999 |         if np.sum([i <= threshold for i in min_dists]) == 1:
1000 |             dist = round(np.min(min_dists), 2)
1001 |             m = matchlist[np.argmin(min_dists)]
1002 |             reason = "CAUTION: Single edit distance match."
1003 |             info = "One entry of {0} within minimum edit distance of {1}".format(len(matchlist), dist)
1004 |             return m, reason, info
1005 |         elif np.sum([i <= threshold for i in min_dists]) > 1:
1006 |             dist = round(np.min(min_dists), 2)
1007 |             m = matchlist[np.argmin(avg_dists)]
1008 |             reason = "CAUTION: Best of several edit distance matches."
1009 |             info = "{0} entries within minimum edit distance. Picking closest average distance: {1}.".format(len(matchlist), round(np.min(avg_dists), 2))
1010 |             return m, reason, info
1011 |         else:
1012 |             return None, None, None
1013 | 
1014 |     def lookup_city(self, city, country, adm1=None):
1015 |         """
1016 |         Return the "best" Geonames entry for a city name.
1017 | 
1018 |         Queries the ES-Geonames gazetteer for the the given city, province/state/ADM1, and country,
1019 |         and uses a set of  rules to determine the best result to return. If adm1 is supplied,
1020 |         only results from that ADM1 will be returned.
1021 | 
1022 |         This code was modified from Halterman's (2019) Syria casualties working paper.
1023 | 
1024 |         Parameters
1025 |         ----------
1026 |         placename: str
1027 |           The name of the city to look up
1028 |         country: str
1029 |           The three character country code (iso3c)
1030 |         adm1: str
1031 |           (Optional) the name of the state/governorate/province
1032 | 
1033 |         Returns
1034 |         -------
1035 |         match: dict or list
1036 |           The single entry from Geonames that best matches the query, or [] if no match at all.
1037 |         """
1038 |         adm_limit = None
1039 |         if adm1:
1040 |             adm_res = self.query_geonames_country(placename=adm1,
1041 |                                                  country=country,
1042 |                                                  filter_params={"feature_code": "ADM1"})
1043 |             adm_res = adm_res['hits']['hits']
1044 |             if len(adm_res) == 1:
1045 |                 adm1 = adm_res[0]['admin1_code']
1046 |                 adm_limit = {"admin1_code" : adm1}
1047 |         res = self.query_geonames_country(city, country, adm_limit)
1048 |         res = res['hits']['hits']
1049 | 
1050 |         # look for a city first
1051 |         match = [i for i in res if i['feature_code'] in ['PPL', 'PPLA', 'PPLC', 'PPLA2', 'PPLA3', 'PPLA3']]
1052 |         if match:
1053 |             if len(match) == 1:
1054 |                 return {"geo" : match[0],
1055 |                         "query" : city,
1056 |                         "info" : "{0} total results of all types".format(len(res)),
1057 |                         "reason" : "Single match for city in Elasticsearch with name, ADM1, country."}
1058 |             # if there's more than one match:
1059 |             m = self._check_exact(city, match)
1060 |             if m:
1061 |                 return {"geo" : m,
1062 |                         "query" : city,
1063 |                             "info": "{0} elasticsearch matches for cities out of {1} total results of all types".format(len(match), len(res)),
1064 |                         "reason" : "Exact name match for city."}
1065 |             # check the editdistance
1066 |             m, reason, info = self._check_editdist(city, match)
1067 |             if m:
1068 |                 return {"geo" : m,
1069 |                         "query" : city,
1070 |                         "info": info,
1071 |                         "reason" : reason}
1072 | 
1073 |         # if there's no city match, look for a neighborhood
1074 |         match = [i for i in res if i['feature_code'] in ['PPLX', 'LCTY', 'PPLL', 'AREA']]
1075 |         if match:
1076 |             #print("neighborhood")
1077 |             # if there's just a single match, we're done
1078 |             if len(match) == 1:
1079 |                 reason = "Single elasticsearch match for neighborhood."
1080 |                 info = "{0} total results of all types".format(len(res))
1081 |                 return {"geo" : match[0],
1082 |                     "query" : city,
1083 |                     "info" : info,
1084 |                     "reason" : reason}
1085 |             # if there are multiple matches, look for exact matches
1086 |             else:
1087 |                 m = self._check_exact(city, match)
1088 |                 if m:
1089 |                     reason = "Exact place name match for neighborhood."
1090 |                     info = "{0} elasticsearch matches out of {1} total results of all types".format(len(match), len(res))
1091 |                     return {"geo" : m,
1092 |                             "query" : city,
1093 |                             "info" : info,
1094 |                             "reason" : reason}
1095 | 
1096 |                 m, reason, info = self._check_editdist(city, match)
1097 |                 if m:
1098 |                     return {"geo" : m,
1099 |                         "query" : city,
1100 |                         "info": info,
1101 |                         "reason" : reason}
1102 | 
1103 |         if len(res) == 1:
1104 |             reason = "CAUTION: One fuzzy match, not a city-type location."
1105 |             return {"geo" : res[0],
1106 |                     "query" : city,
1107 |                     "reason" : reason,
1108 |                     "info" :  "{0} total results of all types.".format(len(res))}
1109 | 
1110 |         if len(res) == 0:
1111 |             reason = "FAILURE: No fuzzy match for city or neighborhood."
1112 |         else:
1113 |             reason = "FAILURE: Too many matches for city or neighborhood, none exact."
1114 |         return {"geo" : None,
1115 |                     "query" : city,
1116 |                     "reason" : reason,
1117 |                     "info" :  "{0} total results of all types.".format(len(res))}
1118 | 
1119 | 
1120 | 
1121 |     def clean_proced(self, proced):
1122 |         """Small helper function to delete the features from the final dictionary.
1123 |         These features are mostly interesting for debugging but won't be relevant for most users.
1124 |         """
1125 |         for loc in proced:
1126 |             try:
1127 |                 del loc['all_countries']
1128 |             except KeyError:
1129 |                 pass
1130 |             try:
1131 |                 del loc['matrix']
1132 |             except KeyError:
1133 |                 pass
1134 |             try:
1135 |                 del loc['all_confidence']
1136 |             except KeyError:
1137 |                 pass
1138 |             try:
1139 |                 del loc['place_confidence']
1140 |             except KeyError:
1141 |                 pass
1142 |             try:
1143 |                 del loc['text']
1144 |             except KeyError:
1145 |                 pass
1146 |             try:
1147 |                 del loc['label']
1148 |             except KeyError:
1149 |                 pass
1150 |             try:
1151 |                 del loc['features']
1152 |             except KeyError:
1153 |                 pass
1154 |         return proced
1155 | 
1156 |     def geoparse(self, doc, verbose=False):
1157 |         """Main geoparsing function. Text to extracted, resolved entities.
1158 | 
1159 |         Parameters
1160 |         ----------
1161 |         doc : str or spaCy
1162 |             The document to be geoparsed. Can be either raw text or already spacy processed.
1163 |             In some cases, it makes sense to bulk parse using spacy's .pipe() before sending
1164 |             through to Mordecai
1165 | 
1166 |         Returns
1167 |         -------
1168 |         proced : list of dicts
1169 |             Each entity gets an entry in the list, with the dictionary including geo info, spans,
1170 |             and optionally, the input features.
1171 |         """
1172 |         if not hasattr(doc, "ents"):
1173 |             doc = self.nlp(doc)
1174 |         proced = self.infer_country(doc)
1175 |         if not proced:
1176 |             return []
1177 |             # logging!
1178 |             #print("Nothing came back from infer_country...")
1179 |         if self.threads:
1180 |             pool = ThreadPool(len(proced))
1181 |             results = pool.map(self.proc_lookup_country, proced)
1182 |             pool.close()
1183 |             pool.join()
1184 |         else:
1185 |             results = []
1186 |             for loc in proced:
1187 |                 if self.is_country(loc['word']):
1188 |                     # if it's a country name, just query that
1189 |                     res = self.query_geonames_country(loc['word'], 
1190 |                                                       self._just_cts[loc['word']],
1191 |                                                       filter_params={"feature_code": "PCLI"}) 
1192 |                     results.append(res)
1193 |                 # if the confidence is too low, don't use the country info
1194 |                 elif loc['country_conf'] > self.country_threshold:
1195 |                     res = self.query_geonames_country(loc['word'], loc['country_predicted'])
1196 |                     results.append(res)
1197 |                 else:
1198 |                     results.append("")
1199 | 
1200 |         for n, loc in enumerate(proced):
1201 |             res = results[n]
1202 |             try:
1203 |                 _ = res['hits']['hits']
1204 |                 # If there's no geonames result, what to do?
1205 |                 # For now, just continue.
1206 |                 # In the future, delete? Or add an empty "loc" field?
1207 |             except (TypeError, KeyError):
1208 |                 continue
1209 |             # Pick the best place
1210 |             X, meta = self.features_for_rank(loc, res)
1211 |             if X.shape[1] == 0:
1212 |                 # This happens if there are no results...
1213 |                 continue
1214 |             all_tasks, sorted_meta, sorted_X = self.format_for_prodigy(X, meta, loc['word'], return_feature_subset=True)
1215 |             fl_pad = np.pad(sorted_X, ((0, 5 - sorted_X.shape[0]), (0, 0)), 'constant')
1216 |             fl_unwrap = np.asmatrix(fl_pad.flatten())
1217 |             prediction = self.rank_model.predict(fl_unwrap)
1218 |             place_confidence = prediction.max()
1219 |             loc['geo'] = sorted_meta[prediction.argmax()]
1220 |             loc['place_confidence'] = place_confidence
1221 |         if not self.verbose:
1222 |             proced = self.clean_proced(proced)
1223 |         return proced
1224 | 
1225 | 
1226 |                 #labels = np.pad(labels, (0, 5 - len(labels)), 'constant')
1227 |                 # pad the matrix with empty rows
1228 |                 #fl_pad = np.pad(fl_subset, ((0, 5 - fl_subset.shape[0]), (0, 0)), 'constant')
1229 | 
1230 |     def batch_geoparse(self, text_list):
1231 |         """
1232 |         Batch geoparsing function. Take in a list of text documents and return a list of lists
1233 |         of the geoparsed documents. The speed improvements come exclusively from using spaCy's `nlp.pipe`.
1234 | 
1235 |         Parameters
1236 |         ----------
1237 |         text_list : list of strs
1238 |             List of documents. The documents should not have been pre-processed by spaCy.
1239 | 
1240 |         Returns
1241 |         -------
1242 |         processed : list of list of dictionaries.
1243 |             The list is the same length as the input list of documents. Each element is a list of dicts, one for
1244 |             each geolocated entity.
1245 |         """
1246 |         if not self.threads:
1247 |             print("batch_geoparsed should be used with threaded searches. Please set `threads=True` when initializing the geoparser.")
1248 |         nlped_docs = list(self.nlp.pipe(text_list, as_tuples=False, n_threads=multiprocessing.cpu_count()))
1249 |         processed = []
1250 |         for i in tqdm(nlped_docs, disable=not self.progress):
1251 |             p = self.geoparse(i)
1252 |             processed.append(p)
1253 |         return processed
1254 | 
1255 | 


--------------------------------------------------------------------------------
/mordecai/models/country_model.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openeventdata/mordecai/9d37110f6cd1275852548fc53fd7a21bb77593f9/mordecai/models/country_model.h5


--------------------------------------------------------------------------------
/mordecai/models/country_model_multi.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openeventdata/mordecai/9d37110f6cd1275852548fc53fd7a21bb77593f9/mordecai/models/country_model_multi.h5


--------------------------------------------------------------------------------
/mordecai/models/rank_model.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openeventdata/mordecai/9d37110f6cd1275852548fc53fd7a21bb77593f9/mordecai/models/rank_model.h5


--------------------------------------------------------------------------------
/mordecai/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openeventdata/mordecai/9d37110f6cd1275852548fc53fd7a21bb77593f9/mordecai/tests/__init__.py


--------------------------------------------------------------------------------
/mordecai/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | from ..geoparse import Geoparser
 2 | import pytest
 3 | 
 4 | import spacy
 5 | nlp = spacy.load('en_core_web_lg', disable=['parser', 'tagger'])
 6 | 
 7 | @pytest.fixture(scope='session', autouse=True)
 8 | def geo():
 9 |     return Geoparser(nlp=nlp, threads=False)
10 | 
@pytest.fixture(scope='session', autouse=True)
def geo_thread():
    """Session-scoped Geoparser created with ``threads=True``."""
    parser = Geoparser(nlp=nlp, threads=True)
    return parser
14 | 


--------------------------------------------------------------------------------
/mordecai/tests/test_mordecai.py:
--------------------------------------------------------------------------------
  1 | from elasticsearch_dsl import Q
  2 | import numpy as np
  3 | from ..utilities import structure_results
  4 | 
  5 | import spacy
  6 | nlp = spacy.load('en_core_web_lg', disable=['parser', 'tagger'])
  7 | 
  8 | def test_issue_40_2_thread(geo_thread):
  9 |     doc_list = ["Government forces attacked the cities in Aleppo Governorate, while rebel leaders met in Geneva.",
 10 |                 "EULEX is based in Prishtina, Kosovo.",
 11 |                 "Clientelism may depend on brokers."]
 12 |     locs = geo_thread.batch_geoparse(doc_list)
 13 |     assert len(locs) == 3
 14 |     assert locs[0][0]['geo']['geonameid'] == '170063'
 15 |     assert locs[0][1]['country_predicted'] == 'CHE'
 16 |     assert locs[1][0]['geo']['feature_code'] == 'PPLC'
 17 |     assert locs[1][1]['geo']['country_code3'] == 'XKX'
 18 |     assert locs[2] == []
 19 | 
 20 | def test_fm_methods_exist(geo):
 21 |     assert hasattr(geo, "_feature_most_alternative")
 22 |     assert hasattr(geo, "_feature_first_back")
 23 |     assert hasattr(geo, "_feature_word_embedding")
 24 |     assert hasattr(geo, "clean_entity")
 25 | 
 26 | def test_fm_methods_exist_thread(geo_thread):
 27 |     assert hasattr(geo_thread, "_feature_most_alternative")
 28 |     assert hasattr(geo_thread, "_feature_first_back")
 29 |     assert hasattr(geo_thread, "_feature_word_embedding")
 30 |     assert hasattr(geo_thread, "clean_entity")
 31 | 
 32 | def test_cts(geo):
 33 |     assert "Kosovo" in geo._cts.keys()
 34 |     assert "Kosovo" not in geo._cts.values()
 35 |     assert "AFG" in geo._cts.values()
 36 | 
 37 | def test_cts_thread(geo_thread):
 38 |     assert "Kosovo" in geo_thread._cts.keys()
 39 |     assert "Kosovo" not in geo_thread._cts.values()
 40 |     assert "AFG" in geo_thread._cts.values()
 41 | 
 42 | def test_country_mentions(geo):
 43 |     doc = nlp("Puerto Cabello is a port city in Venezuela")
 44 |     f = geo._feature_country_mentions(doc)
 45 |     assert f == ('VEN', 1, '', 0)
 46 | 
 47 | def test_country_mentions_thread(geo_thread):
 48 |     doc = nlp("Puerto Cabello is a port city in Venezuela")
 49 |     f = geo_thread._feature_country_mentions(doc)
 50 |     assert f == ('VEN', 1, '', 0)
 51 | 
 52 | def test_vector_picking(geo):
 53 |     entity = nlp("Mosul")
 54 |     vp = geo._feature_word_embedding(entity)
 55 |     assert vp['country_1'] == "IRQ"
 56 | 
 57 | def test_vector_picking_thread(geo_thread):
 58 |     entity = nlp("Mosul")
 59 |     vp = geo_thread._feature_word_embedding(entity)
 60 |     assert vp['country_1'] == "IRQ"
 61 | 
 62 | def test_cts2(geo):
 63 |     out = geo._inv_cts['DEU']
 64 |     assert out == "Germany"
 65 | 
 66 | def test_cts2_thread(geo_thread):
 67 |     out = geo_thread._inv_cts['DEU']
 68 |     assert out == "Germany"
 69 | 
 70 | def test_lookup_city(geo):
 71 |     out = geo.lookup_city("Norman", country="USA", adm1="Oklahoma")
 72 |     assert out['geo']['geonameid'] == '4543762'
 73 |     assert out['reason'] == 'Single match for city in Elasticsearch with name, ADM1, country.'
 74 | 
 75 | def test_lookup_city2(geo):
 76 |     out = geo.lookup_city("Rukn al-Din", "SYR")
 77 |     assert out['geo']['geonameid'] == '7642446'
 78 |     assert out['reason'] ==  'CAUTION: Single edit distance match.'
 79 | 
 80 | def test_city_lookup3(geo):
 81 |     # two easy cases
 82 |     res = geo.lookup_city("Norman", adm1 = "OK", country = "USA")
 83 |     assert res['geo']['geonameid'] == '4543762'
 84 |     res = geo.lookup_city("College Park", adm1 = "MD", country = "USA") 
 85 |     assert res['geo']['geonameid'] == '4351977'
 86 |     res = geo.lookup_city("College Park", adm1 = "OK", country = "USA") 
 87 |     assert res['geo'] is None
 88 |     # for some reason, Cambridge neighborhoods are PPL, not PPLX.
 89 |     res =  geo.lookup_city("East Cambridge", adm1 = "MA", country = "USA")
 90 |     assert res['geo']['geonameid'] == '5152577'
 91 |     assert res['geo']['feature_code'] == 'PPL'
 92 |     # Non-US check
 93 |     res =  geo.lookup_city("Aleppo", adm1 = "Aleppo", country = "SYR")
 94 |     assert res['geo']['feature_code'] == 'PPLA'
 95 |     assert res['geo']['geonameid'] == '170063'
 96 |     res = geo.lookup_city("Munich", country = "DEU")
 97 |     assert res['geo']['geonameid'] == '2867714'
 98 |     # Another US check
 99 |     res =  geo.lookup_city("Aleppo", country = "USA")
100 |     assert res['geo']['geonameid'] == '4556251'
101 |     # test neighborhood
102 |     res = geo.lookup_city("Bustan al-Qasr", adm1 = "Aleppo", country = "SYR")
103 |     assert res['geo']['feature_code'] == 'PPLX'
104 |     assert res['geo']['geonameid'] == '7753543'
105 |     # check nonsense
106 |     res = geo.lookup_city("qwertyqwerty", adm1 = "Aleppo", country = "SYR")
107 |     assert res['geo'] is None
108 | 
def test_most_population(geo):
    """The most-population feature picks the expected country for three cities."""
    places = ("Berlin", "Oklahoma City", "Tripoli")
    expected = ("DEU", "USA", "LBY")
    # Run all queries first, then all feature computations, then compare.
    query_results = [geo.query_geonames(place) for place in places]
    winners = [geo._feature_most_population(res) for res in query_results]
    for winner, iso in zip(winners, expected):
        assert winner == iso
119 | 
def test_most_population_thread(geo_thread):
    """The most-population feature picks the expected country (threaded geoparser)."""
    places = ("Berlin", "Oklahoma City", "Tripoli")
    expected = ("DEU", "USA", "LBY")
    # Run all queries first, then all feature computations, then compare.
    query_results = [geo_thread.query_geonames(place) for place in places]
    winners = [geo_thread._feature_most_population(res) for res in query_results]
    for winner, iso in zip(winners, expected):
        assert winner == iso
130 | 
def test_is_country(geo):
    """"Senegal" is recognized as a country name."""
    assert geo.is_country("Senegal")
134 | 
def test_make_country_features(geo):
    """Both entities in the Kosovo sentence get Kosovo features and one span each."""
    parsed = nlp("EULEX is based in Prishtina, Kosovo.")
    feats = geo.make_country_features(parsed)
    for feature_name in ('most_alt', 'word_vec'):
        for idx in (0, 1):
            assert feats[idx]['features'][feature_name] == "XKX"
    for idx in (0, 1):
        assert feats[idx]['features']['wv_confid'] > 10
    for idx in (0, 1):
        assert len(feats[idx]['spans']) == 1
146 | 
def test_make_country_features_thread(geo_thread):
    """Kosovo features and one span per entity, using the threaded geoparser."""
    parsed = nlp("EULEX is based in Prishtina, Kosovo.")
    feats = geo_thread.make_country_features(parsed)
    for feature_name in ('most_alt', 'word_vec'):
        for idx in (0, 1):
            assert feats[idx]['features'][feature_name] == "XKX"
    for idx in (0, 1):
        assert feats[idx]['features']['wv_confid'] > 10
    for idx in (0, 1):
        assert len(feats[idx]['spans']) == 1
158 | 
159 | 
def test_infer_country1(geo):
    """Aleppo and Homs are both predicted to be in Syria."""
    inferred = geo.infer_country("There's fighting in Aleppo and Homs.")
    for idx in (0, 1):
        assert inferred[idx]['country_predicted'] == "SYR"
165 | 
def test_infer_country1_thread(geo_thread):
    """Aleppo and Homs are both predicted to be in Syria (threaded geoparser)."""
    inferred = geo_thread.infer_country("There's fighting in Aleppo and Homs.")
    for idx in (0, 1):
        assert inferred[idx]['country_predicted'] == "SYR"
171 | 
172 | 
def test_infer_country2(geo):
    """Berlin and Hamburg are both predicted to be in Germany."""
    inferred = geo.infer_country("There's fighting in Berlin and Hamburg.")
    for idx in (0, 1):
        assert inferred[idx]['country_predicted'] == "DEU"
178 | 
def test_infer_country2_thread(geo_thread):
    """Berlin and Hamburg are both predicted to be in Germany (threaded geoparser)."""
    inferred = geo_thread.infer_country("There's fighting in Berlin and Hamburg.")
    for idx in (0, 1):
        assert inferred[idx]['country_predicted'] == "DEU"
184 | 
def test_two_countries(geo):
    """Two places in one sentence can be assigned to different countries."""
    parsed = geo.geoparse("There's fighting in Aleppo and talking in Geneva.")
    for idx, iso in enumerate(("SYR", "CHE")):
        assert parsed[idx]['country_predicted'] == iso
190 | 
def test_two_countries_thread(geo_thread):
    """Two places in one sentence get different countries (threaded geoparser)."""
    parsed = geo_thread.geoparse("There's fighting in Aleppo and talking in Geneva.")
    for idx, iso in enumerate(("SYR", "CHE")):
        assert parsed[idx]['country_predicted'] == iso
196 | 
def test_US_city(geo):
    """"Norman, Oklahoma" resolves to the expected geonames ids, in order."""
    parsed = geo.geoparse("There's fighting in Norman, Oklahoma.")
    for idx, geonameid in enumerate(('4543762', '4544379')):
        assert parsed[idx]['geo']['geonameid'] == geonameid
202 | 
def test_US_city_thread(geo_thread):
    """"Norman, Oklahoma" resolves to the expected ids (threaded geoparser)."""
    parsed = geo_thread.geoparse("There's fighting in Norman, Oklahoma.")
    for idx, geonameid in enumerate(('4543762', '4544379')):
        assert parsed[idx]['geo']['geonameid'] == geonameid
208 | 
def test_admin1(geo):
    """The first resolved entity carries its ADM1 name."""
    first = geo.geoparse("There's fighting in Norman, Oklahoma.")[0]
    assert first['geo']['admin1'] == 'Oklahoma'
213 | 
def test_admin1_thread(geo_thread):
    """The first resolved entity carries its ADM1 name (threaded geoparser)."""
    first = geo_thread.geoparse("There's fighting in Norman, Oklahoma.")[0]
    assert first['geo']['admin1'] == 'Oklahoma'
218 | 
def test_weird_loc(geo):
    """A made-up place name gets a low country confidence."""
    parsed = geo.geoparse("There's fighting in Ajnsdgjb city.")
    confidence = parsed[0]['country_conf']
    assert confidence < 0.3
223 | 
def test_weird_loc_thread(geo_thread):
    """A made-up place name gets a low country confidence (threaded geoparser)."""
    parsed = geo_thread.geoparse("There's fighting in GOUOsabgoajwh city.")
    confidence = parsed[0]['country_conf']
    assert confidence < 0.3
228 | 
def test_no_loc(geo):
    """A sentence with no place names yields no entities."""
    parsed = geo.geoparse("The dog ran through the park.")
    assert len(parsed) == 0
233 | 
def test_no_loc_thread(geo_thread):
    """A sentence with no place names yields no entities (threaded geoparser)."""
    parsed = geo_thread.geoparse("The dog ran through the park.")
    assert len(parsed) == 0
238 | 
def test_query(geo):
    """A "Berlin" query returns at least 16 hits, each with a country code."""
    hits = geo.query_geonames("Berlin")['hits']['hits']
    assert hits[15]['country_code3']
242 | 
def test_query_thread(geo_thread):
    """A "Berlin" query returns at least 16 hits (threaded geoparser)."""
    hits = geo_thread.query_geonames("Berlin")['hits']['hits']
    assert hits[15]['country_code3']
246 | 
def test_missing_feature_code(geo):
    """Geoparsing a list of US states completes and returns something."""
    sentence = "Congress and in the legislatures of Alabama, California, Florida, and Michigan."
    assert geo.geoparse(sentence)
251 | 
def test_missing_feature_code_thread(geo_thread):
    """Geoparsing a list of US states returns something (threaded geoparser)."""
    sentence = "Congress and in the legislatures of Alabama, California, Florida, and Michigan."
    assert geo_thread.geoparse(sentence)
256 | 
def test_aleppo_geneva(geo):
    """Aleppo Governorate resolves to Syria and Geneva to Switzerland."""
    sentence = "Government forces attacked the cities in Aleppo Governorate, while rebel leaders met in Geneva."
    parsed = geo.geoparse(sentence)
    for idx, iso in enumerate(('SYR', 'CHE')):
        assert parsed[idx]['geo']['country_code3'] == iso
261 | 
def test_aleppo_geneva_thread(geo_thread):
    """Aleppo Governorate -> Syria, Geneva -> Switzerland (threaded geoparser)."""
    sentence = "Government forces attacked the cities in Aleppo Governorate, while rebel leaders met in Geneva."
    parsed = geo_thread.geoparse(sentence)
    for idx, iso in enumerate(('SYR', 'CHE')):
        assert parsed[idx]['geo']['country_code3'] == iso
266 | 
def test_issue_40(geo):
    """The long passage from issue 40 yields more than two entities."""
    passage = "In early 1938, the Prime Minister cut grants-in-aid to the provinces, effectively killing the relief project scheme. Premier Thomas Dufferin Pattullo closed the projects in April, claiming that British Columbia could not shoulder the burden alone. Unemployed men again flocked to Vancouver to protest government insensitivity and intransigence to their plight. The RCPU organized demonstrations and tin-canning (organized begging) in the city. Under the guidance of twenty-six-year-old Steve Brodie, the leader of the Youth Division who had cut his activist teeth during the 1935 relief camp strike, protesters occupied Hotel Georgia, the Vancouver Art Gallery (then located at 1145 West Georgia Street), and the main post office (now the Sinclair Centre)."
    entity_count = len(geo.geoparse(passage))
    assert entity_count > 2
271 | 
272 | 
def test_issue_45(geo):
    """The multi-line Santa Cruz passage from issue 45 yields at least one entity."""
    passage = """Santa Cruz is a first class municipality in
the province of Davao del Sur, Philippines. It has a population of 81,093
people as of 2010. The Municipality of Santa Cruz is part of Metropolitan
Davao. Santa Cruz is politically subdivided into 18 barangays. Of the 18
barangays, 7 are uplands, 9 are upland-lowland and coastal and 2 are
lowland-coastal. Pista sa Kinaiyahan A yearly activity conducted every last
week of April as a tribute to the Mother Nature through tree-growing, cleanup
activities and Boulder Face challenge. Araw ng Santa Cruz It is celebrated
every October 5 in commemoration of the legal creation of the municipality in
1884. Highlights include parades, field demonstrations, trade fairs, carnivals
and traditional festivities. Sinabbadan Festival A festival of ethnic ritual
and dances celebrated every September. Santa Cruz is accessible by land
transportation vehicles plying the Davao-Digos City, Davao-Kidapawan City,
Davao-Cotabato City, Davao-Koronadal City and Davao-Tacurong City routes
passing through the town's single, 27 kilometres (17 mi) stretch of national
highway that traverses its 11 barangays. From Davao City, the administrative
center of Region XI, it is 38 kilometres (24 mi) away within a 45-minute ride,
while it is 16 kilometres (9.9 mi) or about 15-minute ride from provincial
capital city of Digos."""
    entity_count = len(geo.geoparse(passage))
    assert entity_count > 0
295 | 
def test_issue_45_thread(geo_thread):
    """The Santa Cruz passage from issue 45 yields an entity (threaded geoparser)."""
    passage = """Santa Cruz is a first class municipality in
the province of Davao del Sur, Philippines. It has a population of 81,093
people as of 2010. The Municipality of Santa Cruz is part of Metropolitan
Davao. Santa Cruz is politically subdivided into 18 barangays. Of the 18
barangays, 7 are uplands, 9 are upland-lowland and coastal and 2 are
lowland-coastal. Pista sa Kinaiyahan A yearly activity conducted every last
week of April as a tribute to the Mother Nature through tree-growing, cleanup
activities and Boulder Face challenge. Araw ng Santa Cruz It is celebrated
every October 5 in commemoration of the legal creation of the municipality in
1884. Highlights include parades, field demonstrations, trade fairs, carnivals
and traditional festivities. Sinabbadan Festival A festival of ethnic ritual
and dances celebrated every September. Santa Cruz is accessible by land
transportation vehicles plying the Davao-Digos City, Davao-Kidapawan City,
Davao-Cotabato City, Davao-Koronadal City and Davao-Tacurong City routes
passing through the town's single, 27 kilometres (17 mi) stretch of national
highway that traverses its 11 barangays. From Davao City, the administrative
center of Region XI, it is 38 kilometres (24 mi) away within a 45-minute ride,
while it is 16 kilometres (9.9 mi) or about 15-minute ride from provincial
capital city of Digos."""
    entity_count = len(geo_thread.geoparse(passage))
    assert entity_count > 0
318 | 
def test_ohio(geo):
    """Looking up geonameid 5165418 directly returns Ohio first."""
    # This was a problem in issue 41
    by_id = Q("match", geonameid='5165418')
    raw = geo.conn.query(by_id).execute()
    hits = structure_results(raw)['hits']['hits']
    assert hits[0]['asciiname'] == "Ohio"
325 | 
def test_readme_example(geo):
    """The README example resolves Oxford (GBR) and Ottawa (CAN).

    The country confidences come out of a neural network, so they are
    compared with a small relative tolerance rather than exact float32
    equality. Every other key of each entity must still match exactly,
    which also checks that no debugging keys are left in the output.
    """
    output = geo.geoparse("I traveled from Oxford to Ottawa.")
    expected_conf = [np.float32(0.957188), np.float32(0.8799221)]
    expected = [{'country_predicted': 'GBR',
                 'geo': {'admin1': 'England',
                         'country_code3': 'GBR',
                         'feature_class': 'P',
                         'feature_code': 'PPLA2',
                         'geonameid': '2640729',
                         'lat': '51.75222',
                         'lon': '-1.25596',
                         'place_name': 'Oxford'},
                 'spans': [{'end': 22, 'start': 16}],
                 'word': 'Oxford'},
                {'country_predicted': 'CAN',
                 'geo': {'admin1': 'Ontario',
                         'country_code3': 'CAN',
                         'feature_class': 'P',
                         'feature_code': 'PPLC',
                         'geonameid': '6094817',
                         'lat': '45.41117',
                         'lon': '-75.69812',
                         'place_name': 'Ottawa'},
                 'spans': [{'end': 32, 'start': 26}],
                 'word': 'Ottawa'}]
    assert len(output) == len(expected)
    for loc, conf, rest in zip(output, expected_conf, expected):
        assert np.isclose(loc['country_conf'], conf, rtol=1e-4)
        assert {k: v for k, v in loc.items() if k != 'country_conf'} == rest
353 | 
def test_readme_example_thread(geo_thread):
    """The README example resolves Oxford and Ottawa with the threaded geoparser.

    The country confidences come out of a neural network, so they are
    compared with a small relative tolerance rather than exact float32
    equality. Every other key of each entity must still match exactly,
    which also checks that no debugging keys are left in the output.
    """
    output = geo_thread.geoparse("I traveled from Oxford to Ottawa.")
    expected_conf = [np.float32(0.957188), np.float32(0.8799221)]
    expected = [{'country_predicted': 'GBR',
                 'geo': {'admin1': 'England',
                         'country_code3': 'GBR',
                         'feature_class': 'P',
                         'feature_code': 'PPLA2',
                         'geonameid': '2640729',
                         'lat': '51.75222',
                         'lon': '-1.25596',
                         'place_name': 'Oxford'},
                 'spans': [{'end': 22, 'start': 16}],
                 'word': 'Oxford'},
                {'country_predicted': 'CAN',
                 'geo': {'admin1': 'Ontario',
                         'country_code3': 'CAN',
                         'feature_class': 'P',
                         'feature_code': 'PPLC',
                         'geonameid': '6094817',
                         'lat': '45.41117',
                         'lon': '-75.69812',
                         'place_name': 'Ottawa'},
                 'spans': [{'end': 32, 'start': 26}],
                 'word': 'Ottawa'}]
    assert len(output) == len(expected)
    for loc, conf, rest in zip(output, expected_conf, expected):
        assert np.isclose(loc['country_conf'], conf, rtol=1e-4)
        assert {k: v for k, v in loc.items() if k != 'country_conf'} == rest
381 | 
def test_issue_53(geo):
    """Character spans of the two entities match their positions in the text."""
    # the spans issue
    parsed = geo.geoparse("I traveled from Oxford to Ottawa.")
    oxford_span = parsed[0]['spans'][0]
    assert oxford_span['start'] == 16
    assert oxford_span['end'] == 22
    ottawa_span = parsed[1]['spans'][0]
    assert ottawa_span['start'] == 26
    assert ottawa_span['end'] == 32
389 | 
def test_issue_68_verbose(geo):
    """Geoparsing with ``verbose=True`` completes and returns something."""
    assert geo.geoparse("The ship entered Greenville from Tarboro", verbose=True)
393 | 
def test_issue_77(geo):
    """Country names resolve to the country-level (PCLI) geonames entry."""
    sentences = (
        "We traveled to the USA",
        "We traveled to the United States.",
        "We traveled to Germany.",
        "We traveled to France.",
    )
    for sentence in sentences:
        parsed = geo.geoparse(sentence)
        assert parsed[0]['geo']['feature_code'] == "PCLI"
403 | 
def test_issue_82(geo):
    """geoparse returns one result per spaCy entity, with or without parentheses."""
    for text in (" Wuppertal (remote-option) ", " Wuppertal remote-option "):
        spacy_ents = geo.nlp(text).ents
        parsed = geo.geoparse(text)
        assert len(spacy_ents) == len(parsed)


--------------------------------------------------------------------------------
/mordecai/utilities.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | from __future__ import unicode_literals
  3 | from __future__ import print_function
  4 | import os
  5 | import sys
  6 | import json
  7 | import numpy
  8 | import pandas as pd
  9 | from elasticsearch_dsl import Search, Q
 10 | from elasticsearch import Elasticsearch
 11 | 
 12 | import spacy
 13 | 
# Reuse an `nlp` object if one is already bound in this namespace; otherwise
# load the large English spaCy model. No earlier line in this module binds
# `nlp`, so on a plain import the NameError branch runs and the model is
# loaded as an import-time side effect.
try:
    nlp
except NameError:
    nlp = spacy.load('en_core_web_lg')
 18 | 
 19 | 
 20 | def country_list_maker():
 21 |     """
 22 |     Helper function to return dictionary of countries in {"country" : "iso"} form.
 23 |     """
 24 |     cts = {"Afghanistan":"AFG", "Åland Islands":"ALA", "Albania":"ALB", "Algeria":"DZA",
 25 |     "American Samoa":"ASM", "Andorra":"AND", "Angola":"AGO", "Anguilla":"AIA",
 26 |     "Antarctica":"ATA", "Antigua Barbuda":"ATG", "Argentina":"ARG",
 27 |     "Armenia":"ARM", "Aruba":"ABW", "Ascension Island":"NA", "Australia":"AUS",
 28 |     "Austria":"AUT", "Azerbaijan":"AZE", "Bahamas":"BHS", "Bahrain":"BHR",
 29 |     "Bangladesh":"BGD", "Barbados":"BRB", "Belarus":"BLR", "Belgium":"BEL",
 30 |     "Belize":"BLZ", "Benin":"BEN", "Bermuda":"BMU", "Bhutan":"BTN",
 31 |     "Bolivia":"BOL", "Bosnia Herzegovina":"BIH",
 32 |     "Botswana":"BWA", "Bouvet Island":"BVT", "Brazil":"BRA",
 33 |     "Britain":"GBR", "Great Britain":"GBR",
 34 |     "British Virgin Islands":"VGB", "Brunei":"BRN", "Bulgaria":"BGR", "Burkina Faso":"BFA",
 35 |     "Burundi":"BDI", "Cambodia":"KHM", "Cameroon":"CMR",
 36 |     "Canada":"CAN","Cape Verde":"CPV", "Cayman Islands":"CYM",
 37 |     "Central African Republic":"CAF", "Chad":"TCD", "Chile":"CHL", "China":"CHN",
 38 |     "Cocos Islands":"CCK", "Colombia":"COL",
 39 |     "Comoros":"COM",     "Republic of Congo":"COG", "Cook Islands":"COK",
 40 |     "Costa Rica":"CRI", "Cote Ivoire":"CIV", "Ivory Coast":"CIV","Croatia":"HRV", "Cuba":"CUB",
 41 |     "Curaçao":"CUW", "Cyprus":"CYP", "Czech Republic":"CZE", "Denmark":"DNK",
 42 |     "Djibouti":"DJI", "Dominica":"DMA", "Dominican Republic":"DOM", "Democratic Republic of Congo" : "COD",
 43 |     "Ecuador":"ECU", "Egypt":"EGY", "El Salvador":"SLV", "England" : "GBR",
 44 |     "Equatorial Guinea":"GNQ", "Eritrea":"ERI", "Estonia":"EST", "Ethiopia":"ETH",
 45 |     "Falkland Islands":"FLK", "Faroe Islands":"FRO",
 46 |     "Fiji":"FJI", "Finland":"FIN", "France":"FRA", "French Guiana":"GUF",
 47 |     "French Polynesia":"PYF","Gabon":"GAB",
 48 |     "Gambia":"GMB", "Georgia":"GEO", "Germany":"DEU", "Ghana":"GHA",
 49 |     "Gibraltar":"GIB", "Greece":"GRC", "Greenland":"GRL", "Grenada":"GRD",
 50 |     "Guadeloupe":"GLP", "Guam":"GUM", "Guatemala":"GTM", "Guernsey":"GGY",
 51 |     "Guinea":"GIN", "Guinea Bissau":"GNB", "Guyana":"GUY", "Haiti":"HTI","Honduras":"HND",
 52 |     "Hong Kong":"HKG",  "Hungary":"HUN", "Iceland":"ISL",
 53 |     "India":"IND", "Indonesia":"IDN", "Iran":"IRN", "Iraq":"IRQ", "Ireland":"IRL",
 54 |     "Israel":"ISR", "Italy":"ITA", "Jamaica":"JAM", "Japan":"JPN",
 55 |     "Jordan":"JOR", "Kazakhstan":"KAZ", "Kenya":"KEN",
 56 |     "Kiribati":"KIR", "Kosovo": "XKX", "Kuwait":"KWT", "Kyrgyzstan":"KGZ", "Laos":"LAO",
 57 |     "Latvia":"LVA", "Lebanon":"LBN", "Lesotho":"LSO", "Liberia":"LBR",
 58 |     "Libya":"LBY", "Liechtenstein":"LIE", "Lithuania":"LTU", "Luxembourg":"LUX",
 59 |     "Macau":"MAC", "Macedonia":"MKD", "Madagascar":"MDG", "Malawi":"MWI",
 60 |     "Malaysia":"MYS", "Maldives":"MDV", "Mali":"MLI", "Malta":"MLT", "Marshall Islands":"MHL",
 61 |     "Martinique":"MTQ", "Mauritania":"MRT", "Mauritius":"MUS",
 62 |     "Mayotte":"MYT", "Mexico":"MEX", "Micronesia":"FSM", "Moldova":"MDA",
 63 |     "Monaco":"MCO", "Mongolia":"MNG", "Montenegro":"MNE", "Montserrat":"MSR",
 64 |     "Morocco":"MAR", "Mozambique":"MOZ", "Myanmar":"MMR", "Burma":"MMR", "Namibia":"NAM",
 65 |     "Nauru":"NRU", "Nepal":"NPL", "Netherlands":"NLD", "Netherlands Antilles":"ANT",
 66 |     "New Caledonia":"NCL", "New Zealand":"NZL", "Nicaragua":"NIC",
 67 |     "Niger":"NER", "Nigeria":"NGA", "Niue":"NIU", "North Korea":"PRK",
 68 |     "Northern Ireland":"IRL", "Northern Mariana Islands":"MNP",
 69 |     "Norway":"NOR", "Oman":"OMN", "Pakistan":"PAK",
 70 |     "Palau":"PLW", "Palestine":"PSE","Panama":"PAN", "Papua New Guinea":"PNG",
 71 |     "Paraguay":"PRY", "Peru":"PER", "Philippines":"PHL", "Pitcairn Islands":"PCN",
 72 |     "Poland":"POL", "Portugal":"PRT", "Puerto Rico":"PRI",
 73 |     "Qatar":"QAT", "Réunion":"REU", "Romania":"ROU", "Russia":"RUS",
 74 |     "Rwanda":"RWA", "Saint Barthélemy":"BLM", "Saint Helena":"SHN",
 75 |     "Saint Kitts Nevis":"KNA", "Saint Lucia":"LCA",
 76 |     "Saint Pierre Miquelon":"SPM", "Saint Vincent Grenadines":"VCT",
 77 |     "Samoa":"WSM", "San Marino":"SMR", "São Tomé Príncipe":"STP", "Saudi Arabia":"SAU",
 78 |     "Senegal":"SEN", "Serbia":"SRB",
 79 |     "Seychelles":"SYC", "Sierra Leone":"SLE", "Singapore":"SGP", "Sint Maarten":"SXM",
 80 |     "Slovakia":"SVK", "Slovenia":"SVN", "Solomon Islands":"SLB",
 81 |     "Somalia":"SOM", "South Africa":"ZAF",
 82 |     "South Korea":"KOR", "South Sudan":"SSD", "Spain":"ESP", "Sri Lanka":"LKA", "Sudan":"SDN",
 83 |     "Suriname":"SUR", "Svalbard Jan Mayen":"SJM",
 84 |     "Swaziland":"SWZ", "Sweden":"SWE", "Switzerland":"CHE", "Syria":"SYR",
 85 |     "Taiwan":"TWN", "Tajikistan":"TJK", "Tanzania":"TZA", "Thailand":"THA",
 86 |     "Timor Leste":"TLS", "East Timor":"TLS","Togo":"TGO", "Tokelau":"TKL", "Tonga":"TON", "Trinidad Tobago":"TTO",
 87 |     "Tunisia":"TUN", "Turkey":"TUR",
 88 |     "Turkmenistan":"TKM", "Turks Caicos Islands":"TCA", "Tuvalu":"TUV", "U.S. Minor Outlying Islands":"UMI",
 89 |     "Virgin Islands":"VIR", "Uganda":"UGA",
 90 |     "Ukraine":"UKR", "United Arab Emirates":"ARE", "United Kingdom":"GBR",
 91 |     "United States":"USA",    "Uruguay":"URY", "Uzbekistan":"UZB", "Vanuatu":"VUT", "Vatican":"VAT",
 92 |     "Venezuela":"VEN",
 93 |     "Vietnam":"VNM", "Wallis Futuna":"WLF",
 94 |     "Western Sahara":"ESH", "Yemen":"YEM", "Zambia":"ZMB", "Zimbabwe":"ZWE",
 95 |     "UK":"GBR", "United States":"USA", "USA":"USA", "America":"USA",  "Palestinian Territories":"PSE",
 96 |     "Congo Brazzaville":"COG", "DRC":"COD", "Congo Kinshasa":"COD", "Wales" : "GBR",
 97 |     "Scotland" : "GBR", "Britain" : "GBR",}
 98 | 
 99 |     return cts
100 | 
101 | 
def other_vectors():
    """
    Define more {placename : iso} mappings to improve performance of vector-based
    country picking. An easy hack to force a placename to resolve to a defined country
    would be to add it to this list.

    Returns
    -------
    other_vecs: dictionary
                keys are place names (US states and a handful of cities),
                values are three-letter country codes.
                Example: "Texas" : "USA"
    """
    # We want the advantage of having more defined vector terms to help
    # matching, but we also want to make sure that when we invert the
    # dictionary for labeling, each ISO code gets resolved to a single country
    # name, as opposed to an alternative name, city, or state.
    other_vecs = {
        # US states
        "Alabama": "USA", "Alaska": "USA", "Arizona": "USA", "Arkansas": "USA",
        "California": "USA", "Colorado": "USA", "Connecticut": "USA", "Delaware": "USA",
        "Florida": "USA",
        # "Georgia" is intentionally absent: the name is ambiguous with the
        # country (note that "Tbilisi" below resolves to GEO).
        "Hawaii": "USA", "Idaho": "USA",
        "Illinois": "USA", "Indiana": "USA", "Iowa": "USA", "Kansas": "USA",
        "Kentucky": "USA", "Louisiana": "USA", "Maine": "USA",
        "Maryland": "USA", "Massachusetts": "USA", "Michigan": "USA",
        "Minnesota": "USA", "Mississippi": "USA", "Missouri": "USA",
        # The key used to be spelled with two spaces ("New  Hampshire"), which
        # could never match the actual place name.
        "Montana": "USA", "Nebraska": "USA", "Nevada": "USA", "New Hampshire": "USA",
        "New Jersey": "USA", "New Mexico": "USA", "New York": "USA",
        "North Carolina": "USA", "North Dakota": "USA", "Ohio": "USA",
        "Oklahoma": "USA", "Oregon": "USA", "Pennsylvania": "USA",
        "Rhode Island": "USA", "South Carolina": "USA", "South Dakota": "USA",
        "Tennessee": "USA", "Texas": "USA", "Utah": "USA",
        "Vermont": "USA", "Virginia": "USA", "Washington": "USA",
        "West Virginia": "USA", "Wisconsin": "USA", "Wyoming": "USA",
        # cities
        "Beijing": "CHN", "Chicago": "USA",
        "Tbilisi": "GEO", "Gaza": "PSE",
    }
    return other_vecs
136 | 
137 | 
def make_skip_list(cts):
    """
    Build the hand-curated set of place names that should be skipped rather
    than geolocated (continents, compass directions, bodies of water, ...).

    Parameters
    ----------
    cts: dictionary
         {country name : iso} mapping. It is not used at the moment; users who
         also want to exclude country names could add its keys to the result
         here.

    Returns
    -------
    skip_list: set
               the place names to skip
    """
    # maybe make these non-country searches but don't discard, at least for
    # some (esp. bodies of water)
    regions_and_directions = (
        "Europe", "West", "the West", "South Pacific", "Gulf of Mexico",
        "Atlantic", "the Black Sea", "Black Sea", "North America", "Mideast",
        "Middle East", "the Middle East", "Asia", "the Caucasus", "Africa",
        "Central Asia", "Balkans", "Eastern Europe", "Arctic",
        "Ottoman Empire", "Asia-Pacific", "East Asia", "Horn of Africa",
        "Americas", "North Africa", "the Strait of Hormuz", "Mediterranean",
        "East", "North", "South", "Latin America", "Southeast Asia",
        "Western Pacific", "South Asia", "Persian Gulf", "Central Europe",
        "Western Hemisphere", "Western Europe", "European Union (E.U.)",
        "EU", "European Union", "E.U.", "Caribbean", "US", "U.S.",
        "West Africa", "Western Countries",
    )

    # Some words are recurring spacy problems...
    spacy_problems = ("Kurd", "Qur'an")

    return set(regions_and_directions).union(spacy_problems)
165 | 
166 | 
def country_list_nlp(cts):
    """
    Run every country name in `cts` through the module-level `nlp` pipeline so
    the results can be used for vector comparisons.

    Parameters
    ----------
    cts: dictionary
         {country name : iso} mapping; only the keys are used

    Returns
    -------
    ct_nlp: list
            one `nlp(name)` result per key, in the dictionary's key order
    """
    return [nlp(country_name) for country_name in cts.keys()]
174 | 
175 | 
def make_country_nationality_list(cts, ct_file):
    """
    Merge the nationality -> code table stored in a CSV with the country list.

    Parameters
    ----------
    cts: dictionary
         {country name : iso} mapping
    ct_file: string
             path to a CSV file with `nationality` and `alpha_3_code` columns

    Returns
    -------
    both_codes: dictionary
                nationalities and country names mapped to their codes. When a
                key appears in both sources the value from `cts` wins.
    """
    table = pd.read_csv(ct_file)
    both_codes = dict(zip(table["nationality"], table["alpha_3_code"]))
    # Applied second so that country names override nationality entries.
    both_codes.update(cts)
    return both_codes
182 | 
183 | 
def make_inv_cts(cts):
    """
    Invert the country dictionary: {"Germany" : "DEU"} becomes {"DEU" : "Germany"}.

    Several names can share one code. The first name encountered for a code
    (in the dictionary's iteration order) is the one that is kept.
    """
    code_to_name = {}
    for name, code in cts.items():
        # setdefault leaves an already-registered code untouched
        code_to_name.setdefault(code, name)
    return code_to_name
193 | 
194 | 
def read_in_admin1(filepath):
    """
    Load the admin1 code <--> admin1 name lookup from a JSON file.

    Parameters
    ----------
    filepath: string
              path to the admin1 mapping JSON, e.g. the admin1CodesASCII.json
              file in the package's data directory

    Returns
    -------
    admin1_dict: dictionary
                 keys are country + admin1codes, values are names
                 Example: "US.OK" : "Oklahoma"
                 Example: "SE.21": "Uppsala"
    """
    with open(filepath) as handle:
        return json.load(handle)
215 | 
216 | 
217 | 
def structure_results(res):
    """
    Format Elasticsearch result as Python dictionary

    Parameters
    ----------
    res: iterable
         hits supporting `hit[field]` lookups for every Geonames field listed
         below

    Returns
    -------
    out: dictionary
         {'hits': {'hits': [...]}} where each entry is a plain dict holding
         only the listed fields
    """
    wanted_fields = (
        'admin1_code', 'admin2_code', 'admin3_code', 'admin4_code',
        'alternativenames', 'asciiname', 'coordinates',
        'country_code2', 'country_code3',
        'feature_class', 'feature_code', 'geonameid',
        'modification_date', 'name', 'population',
    )
    hits = [{field: hit[field] for field in wanted_fields} for hit in res]
    return {'hits': {'hits': hits}}
232 | 
def setup_es(hosts, port, use_ssl=False, auth=None):
    """
    Create the Elasticsearch search object for the "geonames" index.

    Parameters
    ----------
    hosts: list
            Hostnames / IP addresses for elasticsearch cluster; a falsy value
            falls back to ['localhost']
    port: string
            Port for elasticsearch cluster; a falsy value falls back to 9200
    use_ssl: boolean
            Whether to use SSL for the elasticsearch connection
    auth: tuple
            (username, password) to use with HTTP auth. Only forwarded to the
            client when truthy.
    Returns
    -------
    es_conn: an elasticsearch_dsl Search connection object.
    """
    client_options = {
        'hosts': hosts or ['localhost'],
        'port': port or 9200,
        'use_ssl': use_ssl,
    }
    if auth:
        client_options['http_auth'] = auth

    client = Elasticsearch(**client_options)
    return Search(using=client, index="geonames")
262 | 
def check_geonames_date(conn):
    """
    Look up the entry with geonameid 4943351 and return its modification date.

    Parameters
    ----------
    conn: search object exposing `.query(...).execute()`, such as the one
          returned by setup_es

    Returns
    -------
    the 'modification_date' field of the first hit
    """
    probe = Q("match", geonameid='4943351')
    response = conn.query(probe).execute()
    first_hit = structure_results(response)['hits']['hits'][0]
    return first_hit['modification_date']
268 | 


--------------------------------------------------------------------------------
/paper/mordecai_geoparsing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/openeventdata/mordecai/9d37110f6cd1275852548fc53fd7a21bb77593f9/paper/mordecai_geoparsing.png


--------------------------------------------------------------------------------
/paper/paper.bib:
--------------------------------------------------------------------------------
 1 | @article{mikolov2013efficient,
 2 |   title={Efficient estimation of word representations in vector space},
 3 |   author={Mikolov, Tomas and Chen, Kai and Corrado, Greg and Dean, Jeffrey},
 4 |   journal={arXiv preprint arXiv:1301.3781},
 5 |   year={2013}
 6 | }
 7 | 
 8 | 
 9 | @online{geonames,
10 |   author = {Geonames},
11 |   title = {Geonames},
12 |   year = 2016,
13 |   url = {http://geonames.org},
14 |   urldate = {2016-09-08}
15 | }
16 | 


--------------------------------------------------------------------------------
/paper/paper.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "Mordecai: Full Text Geoparsing and Event Geocoding"
 3 | tags:
 4 |   - geocoding
 5 |   - geoparsing
 6 |   - natural language processing
 7 |   - Python
 8 |   - word embeddings
 9 | authors:
10 |   - name: Andrew Halterman
11 |     orcid: 0000-0001-9716-9555
12 |     affiliation: 1
13 | affiliations:
14 |   - name: MIT
15 |     index: 1
16 | date: 8 December 2017
17 | bibliography: paper.bib
18 | ---
19 | 
20 | # Summary
21 | 
22 | Mordecai is a new full-text geoparsing system that extracts place names from
23 | text, resolves them to their correct entries in a gazetteer, and returns
24 | structured geographic information for the resolved place name. Geoparsing can
25 | be used in a number of tasks, including media monitoring, improved information
26 | extraction, document annotation for search, and geolocating text-derived event
 27 | data, which is the task for which it was built. Mordecai was created to
 28 | provide several features missing in existing geoparsers, including better
 29 | handling of non-US place names, easy and portable setup and use through a Docker
30 | REST architecture, and easy customization with Python and swappable named
31 | entity recognition systems. Mordecai's key technical innovations are in a
32 | language-agnostic architecture that uses word2vec [@mikolov2013efficient] for
33 | inferring the correct country for a set of locations in a piece of text and
34 | easily changed named entity recognition models. As a gazetteer, it uses
 35 | Geonames [@geonames] in a custom-built Elasticsearch database.
36 | 
37 | # References
38 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | editdistance>=0.5.3
 2 | elasticsearch==5.4.0
 3 | elasticsearch-dsl==5.3.0
 4 | h5py>=2.10.0
 5 | pandas>=0.24.2
 6 | spacy>=2.3,<3.0
 7 | tensorflow>=2.2.0
 8 | tqdm>=4.28.1
 9 | numpy>=1.12
10 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | setup(name='mordecai',
 4 |       version='2.1.0',
 5 |       description='Full text geoparsing and event geocoding',
 6 |       url='http://github.com/openeventdata/mordecai/',
 7 |       author='Andy Halterman',
 8 |       author_email='ahalterman0@gmail.com',
 9 |       license='MIT',
10 |       packages=['mordecai'],
11 |       keywords = ['geoparsing', 'nlp', 'geocoding', 'toponym resolution'],
12 |       install_requires = ['editdistance>=0.5.3',
13 |                           'elasticsearch==5.4.0',
14 |                           'elasticsearch-dsl==5.3.0',
15 |                           'h5py>=2.10.0',
16 |                           'pandas>=0.24.2',
17 |                           'spacy>=2.3,<3.0',
18 |                           'tensorflow>=2.2.0',
19 |                           'tqdm>=4.28.1',
20 |                           'numpy>=1.12'], 
21 |       dependency_links=['https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.3.1/en_core_web_lg-2.3.1.tar.gz'],
22 |       include_package_data=True,
23 |       package_data = {'data': ['admin1CodesASCII.json',
24 |                              'countries.json',
25 |                              'nat_df.csv',
26 |                              'stopword_country_names.json'],
27 |                     'models' : ['country_model.h5',
28 |                                 'rank_model.h5']}
29 |      )
30 | 


--------------------------------------------------------------------------------
/train/train_country_model.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import numpy as np
  3 | import jsonlines
  4 | from pandas import DataFrame
  5 | import os
  6 | import re
  7 | import keras
  8 | from keras.models import Sequential
  9 | from keras.layers import Dense, Dropout, Activation
 10 | from keras.optimizers import SGD
 11 | from collections import Counter
 12 | import sklearn
 13 | import pandas as pd
 14 | 
 15 | import spacy
# Module-level spaCy pipeline; entry_to_matrix() uses it to parse the text of
# each annotated example.
nlp = spacy.load('en_core_web_lg', parser=False)

from mordecai import geoparse

# Module-level geoparser; entry_to_matrix() uses it for entity processing,
# feature extraction and the country-name -> ISO lookup (geo.cts).
# NOTE(review): train_ranker.py builds its geoparser as `mordecai.Geoparser()`;
# confirm that `geoparse.Geoparse` is the intended class name here.
geo = geoparse.Geoparse(verbose = True)
 21 | 
 22 | def entry_to_matrix(prodigy_entry):
 23 |     """
 24 |     Take in a line from the labeled json and return a vector of labels and a matrix of features
 25 |     for training.
 26 | 
 27 |     Two ways to get 0s:
 28 |         - marked as false by user
 29 |         - generated automatically from other entries when guess is correct
 30 | 
 31 |     Rather than iterating through entities, just get the number of the correct entity directly.
 32 |     Then get one or two GPEs before and after.
 33 |     """
 34 |     doc = prodigy_entry['text']
 35 |     doc = nlp(doc)
 36 |     geo_proced = geo.process_text(doc, require_maj=False)
 37 | 
 38 |     # find the geoproced entity that matches the Prodigy entry
 39 |     ent_text = np.asarray([gp['word'] for gp in geo_proced]) # get mask for correct ent
 40 |     #print(ent_text)
 41 |     match = ent_text == entry['meta']['word']
 42 |     #print("match: ", match)
 43 |     anti_match = np.abs(match - 1)
 44 |     #print("Anti-match ", anti_match)
 45 |     match_position = match.argmax()
 46 | 
 47 |     geo_proc = geo_proced[match_position]
 48 | 
 49 |     iso = geo.cts[prodigy_entry['label']] # convert country text label to ISO
 50 |     feat = geo.features_to_matrix(geo_proc)
 51 |     answer_x = feat['matrix']
 52 |     label = np.asarray(feat['labels'])
 53 | 
 54 |     if prodigy_entry['answer'] == "accept":
 55 |         answer_binary = label == iso
 56 |         answer_binary = answer_binary.astype('int')
 57 |         #print(answer_x.shape)
 58 |         #print(answer_binary.shape)
 59 | 
 60 | 
 61 |     elif prodigy_entry['answer'] == "reject":
 62 |         # all we know is that the label that was presented is wrong.
 63 |         # just return the corresponding row in the feature matrix,
 64 |         #   and force the label to be 0
 65 |         answer_binary = label == iso
 66 |         answer_x = answer_x[answer_binary,:] # just take the row corresponding to the answer
 67 |         answer_binary = np.asarray([0]) # set the outcome to 0 because reject
 68 | 
 69 |     # NEED TO SHARE LABELS ACROSS! THE CORRECT ONE MIGHT NOT EVEN APPEAR FOR ALL ENTITIES
 70 | 
 71 |     x = feat['matrix']
 72 |     other_x = x[anti_match,:]
 73 |     #print(other_x)
 74 |     #print(label[anti_match])
 75 |     # here, need to get the rows corresponding to the correct label
 76 | 
 77 |     #    print(geo_proc['meta'])
 78 |         # here's where we get the other place name features.
 79 |         # Need to:
 80 |         #  1. do features_to_matrix but use the label of the current entity
 81 |         #     to determine 0/1 in the feature matrix
 82 |         #  2. put them all into one big feature matrix,
 83 |         #  3. ...ordering by distance? And need to decide max entity length
 84 |         #  4. also include these distances as one of the features
 85 | 
 86 |     #print(answer_x.shape[0])
 87 |     #print(answer_binary.shape[0])
 88 |     try:
 89 |         if answer_x.shape[0] == answer_binary.shape[0]:
 90 |             return (answer_x, answer_binary)
 91 |     except:
 92 |         pass
 93 | 
 94 |     #return (answer_x, answer_binary)
 95 | 
 96 |             # If it's accept, convert the label of the correct one to 1, the others to 0, return all
 97 |             # If it's reject, convert the label of the presented one to 0, and DELETE the rows in the
 98 |             #   matrix/vector. If the presented one is false, we don't know if the other, non-presented
 99 |             #   ones were correct or not.
100 | 
101 |             # return the text labels, too, so we can look at per-country accuracy later.
102 | 
103 |     #    feat_list.append(feat)
104 | 
# Build the training set from the annotated examples. Entries that cannot be
# converted are skipped and only counted (best-effort ingestion).
error_count = 0
with jsonlines.open('geo_annotated/geo_country_db.jsonl') as reader:
    X = []
    Y = []
    for obj in reader:
        if obj['answer'] == 'ignore':
            continue
        try:
            x, label = entry_to_matrix(obj)
        except Exception:
            error_count += 1
            continue
        X.append(x)
        Y.append(label)

# number of examples that had to be skipped
print(error_count)

# format numpy
Y = np.hstack(Y)
Y = np.asarray(Y).astype(int)

X = np.vstack(X)
X_df = DataFrame(X)

# train/test split (random 70/30 row mask).
# DataFrame.as_matrix() no longer exists in current pandas; to_numpy() is the
# replacement.
msk = np.random.rand(len(X_df)) < 0.7
X_train = X_df[msk].to_numpy()
X_test = X_df[~msk].to_numpy()
y_train = Y[msk]
y_test = Y[~msk]


# Three hidden layers with dropout, one sigmoid output: a binary
# "is this the right country" classifier over the feature rows.
model = Sequential()
model.add(Dense(512, activation='relu', input_dim=X_train.shape[1]))
model.add(Dropout(0.5))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

#sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.fit(X_train, y_train,
          epochs=15,
          batch_size=128)

score = model.evaluate(X_test, y_test, batch_size=12)
print(score)

y_predicted = model.predict(X_test)

# The metrics submodule is not guaranteed to be loaded by a bare
# `import sklearn`, so import it explicitly before use.
import sklearn.metrics

print(sklearn.metrics.classification_report(y_pred = y_predicted>0.5, y_true = y_test))

model.save("country_model_updated_script.h5")
162 | 
163 | 


--------------------------------------------------------------------------------
/train/train_ranker.py:
--------------------------------------------------------------------------------
  1 | import plac
  2 | import mordecai
  3 | import random
  4 | import jsonlines
  5 | from tqdm import tqdm
  6 | import re
  7 | import numpy as np
  8 | import editdistance
  9 | import pandas as pd
 10 | import os
 11 | import json
 12 | import pickle
 13 | 
 14 | import keras
 15 | from keras.models import Sequential
 16 | from keras.layers import Dense, Dropout, Activation
 17 | from keras.optimizers import SGD
 18 | from keras.callbacks import EarlyStopping, ModelCheckpoint
 19 | import sklearn
 20 | 
 21 | 
# Module-level geoparser. ingest_prodigy_ranks() calls its geoparse,
# query_geonames_country, features_for_rank and format_for_prodigy methods to
# rebuild the features behind each annotated example.
geo = mordecai.Geoparser()
# Here's the format of the Prodigy labeled place picking data:
# ```
# {"text":"On July 15, state security services in Idleb arrested Mahmoud Barish, an opposition activist, for his dealings with the Damascus Declaration.",
# "spans":[{"start":39,"end":44}],
# "options":[
#   {"id":1,"text":"\"Idlib District\", a second-order administrative division in SYR, id: 169388"},
#   {"id":2,"text":"\"Idlib\", a seat of a first-order administrative division in SYR, id: 169389"},
#   {"id":4,"text":"None/Other/Incorrect"}],
# "_input_hash":1212285619,"_task_hash":-1410881973,
# "accept":[2],
# "answer":"accept"}
# ```
 35 | 
 36 | def ingest_prodigy_ranks(filename):
 37 |     """
 38 |     Ingest Prodigy-labeled Mordecai data for place picking and produce training data
 39 |     for Keras.
 40 | 
 41 |     For each labeled example, match it to the output of Mordecai, and make sure there's an accepted answer
 42 |     from Prodigy.
 43 | 
 44 |     Parameters
 45 |     ----------
 46 |     filename: filepath, location of Prodigy data
 47 | 
 48 |     Returns
 49 |     -------
 50 |     X: list of matrices, Mordecai features.
 51 |       Each element in the list is a matrix of features for ranking (so 5 rows)
 52 |     Y: list of arrays of length 5, indicating correct location.
 53 |     """
 54 |     with jsonlines.open(filename) as reader:
 55 |         X = []
 56 |         Y = []
 57 |         i = 0
 58 |         accumulate = []
 59 |         for obj in reader:
 60 |             i = i+1
 61 |             if i % 250 == 0:
 62 |                 print(i)
 63 |             # run the text through mordecai
 64 |             proced = geo.geoparse(obj['text'], verbose = True,)
 65 |             for proc in proced:
 66 |                 # for each result, see if the spans overlap the labeled spans
 67 |                 if proc['spans'][0]['start'] != obj['spans'][0]['start']:
 68 |                     # make sure we have the right entity
 69 |                     continue
 70 |                 ent_word = proc['word']
 71 |                 if not ent_word:
 72 |                     continue
 73 |                 # if it all works, take the results.
 74 |                 results = geo.query_geonames_country(ent_word, proc['country_predicted'])
 75 | 
 76 |             if obj['answer'] == 'accept':
 77 |                 #start_char = obj['spans']['start']
 78 |                 # get the geonames ids of the options
 79 |                 geoids = [re.findall("id: (.+)", i['text']) for i in obj['options']]
 80 |                 geoids = [i[0] for i in geoids if i]
 81 |                 # get the correct of if any
 82 |                 try:
 83 |                     correct = obj['accept'][0]
 84 |                     correct_id = str(geoids[correct - 1])
 85 |                 except (KeyError, IndexError):
 86 |                     continue
 87 | 
 88 |             elif obj['answer'] != 'accept':
 89 |                 correct_id = 4
 90 | 
 91 |             try:
 92 |                 fl, meta = geo.features_for_rank(proc, results)
 93 |                 # just look at the top 4 results by deterministic rule
 94 |                 # This matches what went into the annotation task
 95 |                 choices, sorted_meta, fl_subset = geo.format_for_prodigy(fl, meta, ent_word, return_feature_subset=True)
 96 |                 result_ids = np.asarray([m['geonameid'] for m in sorted_meta])
 97 |                 if obj['answer'] == 'accept':
 98 |                     labels = result_ids == correct_id
 99 |                 elif obj['answer'] == 'reject':
100 |                     # give rejects their own special category
101 |                     # reject means the country was right but none of the options were.
102 |                     labels = np.asarray([0, 0, 0, 0, 1])
103 |                 else:
104 |                     # skip ignores
105 |                     continue
106 |                 #print(labels)
107 |                 if labels.sum() == 0:
108 |                     #print("No correct labels")
109 |                     pass
110 |                 # if fewer than 4 options were presented for tagging,
111 |                 #   pad it out with 0s to length 4 + 1 (1 for the all wrong reject answer)
112 |                 labels = np.pad(labels, (0, 5 - len(labels)), 'constant')
113 |                 # pad the matrix with empty rows
114 |                 fl_pad = np.pad(fl_subset, ((0, 5 - fl_subset.shape[0]), (0, 0)), 'constant')
115 |                 # turn the matrix into a vector
116 |                 fl_unwrap = fl_pad.flatten()
117 |                 Y.append(labels)
118 |                 X.append(fl_unwrap)
119 |             except Exception as e:
120 |                 print(e)
121 |                 #print(meta)
122 |                 continue
123 |     return X, Y
124 | 
def prep_data(X, Y, train_split):
    """
    Stack the per-example features/labels and split them into train and test.

    Parameters
    ----------
    X: list of 1-D feature arrays, all of the same length
    Y: list of 1-D label arrays, all of the same length
    train_split: float
        Each row lands in the training set when a uniform random draw is
        below this value, so it is the approximate training fraction.

    Returns
    -------
    X_train, X_test, y_train, y_test: numpy arrays
        Labels are cast to int.

    Notes
    -----
    Reseeds numpy's global random generator with a fixed seed so the split is
    reproducible between runs.
    """
    X_stack = np.vstack(X)
    Y_stack = np.vstack(Y).astype(int)

    print("Using a cutpoint of ", train_split)
    np.random.seed(73071)
    msk = np.random.rand(len(X_stack)) < train_split
    # Index the arrays directly with the boolean mask; the former round trip
    # through DataFrame.as_matrix() no longer works with current pandas.
    X_train = X_stack[msk]
    X_test = X_stack[~msk]
    y_train = Y_stack[msk]
    y_test = Y_stack[~msk]

    for part in (X_train, X_test, y_train, y_test):
        print(part.shape)
    return X_train, X_test, y_train, y_test
144 | 
def train_model(X_train, X_test, y_train, y_test, save_file):
    """
    Fit the ranking network and return it.

    The network has three 128-unit ReLU layers, each followed by dropout, and
    a softmax output with one unit per label column. A fifth of the training
    data is held out for validation; training stops early when the validation
    loss stalls and the best model seen so far is written to `save_file`.

    Parameters
    ----------
    X_train, y_train: training features and labels
    X_test, y_test: accepted for symmetry with prep_data but not used here
    save_file: path the best checkpoint is saved to

    Returns
    -------
    model: the fitted keras model
    """
    n_features = X_train.shape[1]
    n_classes = y_train.shape[1]

    model = Sequential()
    model.add(Dense(128, activation='relu', input_shape = (n_features,)))
    model.add(Dropout(0.3))
    for _ in range(2):
        model.add(Dense(128, activation='relu'))
        model.add(Dropout(0.3))
    model.add(Dense(n_classes, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])

    early_stop = EarlyStopping(monitor='val_loss', patience=50)
    checkpoint = ModelCheckpoint(save_file, monitor='val_loss',
                                 verbose=0, save_best_only=True,
                                 save_weights_only=False)

    model.fit(X_train, y_train,
              epochs=100,
              validation_split=0.2,
              callbacks = [early_stop, checkpoint],
              batch_size=16)

    return model
173 | 
174 | 
@plac.annotations(
    input_file=("Location of Prodigy labeled output", "option", "i", str),
    train_split=("Fraction of data to use for training vs. validation", "option", "s", float),
    use_cache=("Use cached data?", "flag", "c"))
def main(input_file, train_split, use_cache):
    """
    Train the ranking model from Prodigy annotations and report test metrics.

    Parameters
    ----------
    input_file: path to the Prodigy JSONL export (not read when use_cache is set)
    train_split: fraction of rows used for training. No default is declared,
        so it has to be passed explicitly.
    use_cache: reuse the features pickled by a previous run instead of
        re-running the geoparser over the annotations

    The best model is written to "rank_model_new.h5" by the checkpoint
    callback set up in train_model().
    """
    # `import sklearn` at the top of the file is not guaranteed to load the
    # metrics submodule, so import it explicitly.
    import sklearn.metrics

    save_file = "rank_model_new.h5"
    if use_cache:
        print("Using saved data...")
        # NOTE: pickle can execute arbitrary code on load; only use cache
        # files produced by this script.
        with open("ranker_X.pkl", "rb") as f:
            X = pickle.load(f)
        # Must be the same name that is written below; it used to be read as
        # "ranker_y.pkl", which does not exist on case-sensitive filesystems.
        with open("ranker_Y.pkl", "rb") as f:
            Y = pickle.load(f)
    else:
        print("Recalculating data...")
        X, Y = ingest_prodigy_ranks(input_file)
        with open("ranker_X.pkl", "wb") as f:
            pickle.dump(X, f)
        with open("ranker_Y.pkl", "wb") as f:
            pickle.dump(Y, f)
    X_train, X_test, y_train, y_test = prep_data(X, Y, train_split)
    model = train_model(X_train, X_test, y_train, y_test, save_file)
    score = model.evaluate(X_test, y_test)
    print(score)

    y_predicted = model.predict(X_test)
    print(sklearn.metrics.classification_report(y_pred = y_predicted>0.5, y_true = y_test))
    #model.save()

if __name__ == '__main__':
    plac.call(main)
207 | 


--------------------------------------------------------------------------------