├── .gitignore ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.md ├── docs ├── Makefile ├── how_to_build.txt ├── make.bat └── source │ ├── README.rst │ ├── conf.py │ ├── geoparse.rst │ ├── index.rst │ ├── modules.rst │ ├── mordecai.rst │ └── mordecai.tests.rst ├── examples ├── README.md ├── geocode_cities.csv ├── geocode_cities.py └── out.csv ├── mordecai ├── MANIFEST.in ├── __init__.py ├── data │ ├── admin1CodesASCII.json │ ├── countries.json │ ├── feature_codes.txt │ ├── nat_df.csv │ └── stopword_country_names.json ├── geoparse.py ├── models │ ├── country_model.h5 │ ├── country_model_multi.h5 │ └── rank_model.h5 ├── tests │ ├── __init__.py │ ├── conftest.py │ └── test_mordecai.py └── utilities.py ├── paper ├── mordecai_geoparsing.png ├── paper.bib └── paper.md ├── requirements.txt ├── setup.cfg ├── setup.py └── train ├── train_country_model.py └── train_ranker.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | *.swp 3 | .ropeproject 4 | *.pyc 5 | *.bin.gz 6 | *.tar.bz2 7 | *data/MITIE-models 8 | *.ipynb 9 | .cache 10 | mordecai/.cache 11 | build/ 12 | dist/ 13 | mordecai.egg-info 14 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | 3 | language: python 4 | 5 | python: 6 | - 3.6 7 | 8 | dist: trusty 9 | 10 | services: 11 | - docker 12 | 13 | before_install: 14 | - docker pull elasticsearch:5.5.2 15 | - wget https://s3.amazonaws.com/ahalterman-geo/geonames_index.tar.gz --output-file=wget_log.txt 16 | - tar -xzf geonames_index.tar.gz 17 | - docker run -d -p 127.0.0.1:9200:9200 -v $(pwd)/geonames_index/:/usr/share/elasticsearch/data elasticsearch:5.5.2 18 | 19 | install: 20 | - travis_wait pip install -r requirements.txt --quiet 21 | 22 | script: 23 | - pytest 24 | -------------------------------------------------------------------------------- 
/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2017 Andy Halterman, 2015 Caerus Associates 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include mordecai/data/ * 2 | recursive-include mordecai/models/ * 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![](paper/mordecai_geoparsing.png) 2 | 3 | Full text geoparsing as a Python library. Extract the place names from a 4 | piece of English-language text, resolve them to the correct place, and return 5 | their coordinates and structured geographic information. 
6 | 7 | **Mordecai is ready for an upgrade!** Please take the user survey [here](https://z0l4ihmu0ud.typeform.com/to/b8FmCfMt) 8 | to help shape what v3 will look like. 9 | 10 | Example usage 11 | ------------- 12 | 13 | ``` 14 | >>> from mordecai import Geoparser 15 | >>> geo = Geoparser() 16 | >>> geo.geoparse("I traveled from Oxford to Ottawa.") 17 | 18 | [{'country_conf': 0.96474487, 19 | 'country_predicted': 'GBR', 20 | 'geo': {'admin1': 'England', 21 | 'country_code3': 'GBR', 22 | 'feature_class': 'P', 23 | 'feature_code': 'PPLA2', 24 | 'geonameid': '2640729', 25 | 'lat': '51.75222', 26 | 'lon': '-1.25596', 27 | 'place_name': 'Oxford'}, 28 | 'spans': [{'end': 22, 'start': 16}], 29 | 'word': 'Oxford'}, 30 | {'country_conf': 0.83302397, 31 | 'country_predicted': 'CAN', 32 | 'geo': {'admin1': 'Ontario', 33 | 'country_code3': 'CAN', 34 | 'feature_class': 'P', 35 | 'feature_code': 'PPLC', 36 | 'geonameid': '6094817', 37 | 'lat': '45.41117', 38 | 'lon': '-75.69812', 39 | 'place_name': 'Ottawa'}, 40 | 'spans': [{'end': 32, 'start': 26}], 41 | 'word': 'Ottawa'}] 42 | ``` 43 | 44 | Mordecai requires a running Elasticsearch service with Geonames in it. See 45 | "Installation" below for instructions. 46 | 47 | 48 | Installation and Requirements 49 | -------------------- 50 | 51 | 1. Mordecai is on PyPI and can be installed for Python 3 with pip: 52 | 53 | ``` 54 | pip install mordecai 55 | ``` 56 | 57 | **Note**: It's *strongly* recommended that you run Mordecai in a virtual 58 | environment. The libraries that Mordecai depends on are not always the most 59 | recent versions and using a virtual environment prevents libraries from being 60 | downgraded or running into other issues: 61 | 62 | ``` 63 | python -m venv mordecai-env 64 | source mordecai-env/bin/activate 65 | pip install mordecai 66 | ``` 67 | 68 | 2. You should then download the required spaCy NLP model: 69 | 70 | ``` 71 | python -m spacy download en_core_web_lg 72 | ``` 73 | 74 | 3. 
In order to work, Mordecai needs access to a Geonames gazetteer running in 75 | Elasticsearch. The easiest way to set it up is by running the following 76 | commands (you must have [Docker](https://docs.docker.com/engine/installation/) 77 | installed first). 78 | 79 | ``` 80 | docker pull elasticsearch:5.5.2 81 | wget https://andrewhalterman.com/files/geonames_index.tar.gz --output-file=wget_log.txt 82 | tar -xzf geonames_index.tar.gz 83 | docker run -d -p 127.0.0.1:9200:9200 -v $(pwd)/geonames_index/:/usr/share/elasticsearch/data elasticsearch:5.5.2 84 | ``` 85 | 86 | See the [es-geonames](https://github.com/openeventdata/es-geonames) repository for the code used 87 | to produce this index. 88 | 89 | To update the index, simply shut down the old container, re-download the index 90 | with the `wget` command above, and restart the container with the new index. 91 | 92 | Citing 93 | ------ 94 | 95 | If you use this software in academic work, please cite as 96 | 97 | ``` 98 | @article{halterman2017mordecai, 99 | title={Mordecai: Full Text Geoparsing and Event Geocoding}, 100 | author={Halterman, Andrew}, 101 | journal={The Journal of Open Source Software}, 102 | volume={2}, 103 | number={9}, 104 | year={2017}, 105 | doi={10.21105/joss.00091} 106 | } 107 | ``` 108 | 109 | How does it work? 110 | ----------------- 111 | 112 | Mordecai takes in unstructured text and returns structured geographic information extracted 113 | from it. 114 | 115 | - It uses [spaCy](https://github.com/explosion/spaCy/)'s named entity recognition to 116 | extract placenames from the text. 117 | 118 | - It uses the [geonames](http://www.geonames.org/) 119 | gazetteer in an [Elasticsearch](https://www.elastic.co/products/elasticsearch) index 120 | (with some custom logic) to find the potential coordinates of 121 | extracted place names. 
122 | 123 | - It uses neural networks implemented in [Keras](https://keras.io/) and trained on new annotated 124 | English-language data labeled with [Prodigy](https://prodi.gy/) to infer the correct country and correct gazetteer entries for each 125 | placename. 126 | 127 | The training data for the two models includes copyrighted text so cannot be 128 | shared freely. Applying Mordecai to non-English language text would require labeling data 129 | in the target language and retraining. 130 | 131 | API and Configuration 132 | --------------------- 133 | 134 | When instantiating the `Geoparser()` module, the following options can be changed: 135 | 136 | - `es_hosts` : List of hosts where the Geonames Elasticsearch service is 137 | running. Defaults to `['localhost']`, which is where it runs if you're using 138 | the default Docker setup described above. 139 | - `es_port` : What port the Geonames Elasticsearch service is running on. 140 | Defaults to `9200`, which is where the Docker setup has it 141 | - `es_ssl` : Whether Elasticsearch requires an SSL connection. 142 | Defaults to `False`. 143 | - `es_auth` : Optional HTTP auth parameters to use with ES. 144 | If provided, it should be a two-tuple of `(user, password)`. 145 | - `country_confidence` : Set the country model confidence below which no 146 | geolocation will be returned. If it's really low, the model's probably 147 | wrong and will return weird results. Defaults to `0.6`. 148 | - `verbose` : Return all the features used in the country picking model? 149 | Defaults to `False`. 150 | - `threads`: whether to use threads to make parallel queries to the 151 | Elasticsearch database. Defaults to `True`, which gives a ~6x speedup. 152 | 153 | `geoparse` is the primary endpoint and the only one that most users will need. 
154 | Other, mostly internal, methods may be useful in some cases: 155 | 156 | - `lookup_city` takes a city name, country, and (optionally) ADM1/state/governorate and 157 | does a rule-based lookup for the city. 158 | - `infer_country` takes a document and attempts to infer the most probable 159 | country for each. 160 | - `query_geonames` and `query_geonames_country` can be used for performing a 161 | search over Geonames in Elasticsearch. 162 | - methods with the `_feature` prefix are internal methods for 163 | calculating country picking features from text. 164 | 165 | `batch_geoparse` takes in a list of documents and uses spaCy's `nlp.pipe` 166 | method to process them more efficiently in the NLP step. 167 | 168 | Advanced users on large machines can increase the `lru_cache` parameter from 250 169 | to 1000. This will use more memory but will increase parsing speed. 170 | 171 | Tests 172 | ----- 173 | 174 | Mordecai includes unit tests. To run the tests, `cd` into the 175 | `mordecai` directory and run: 176 | 177 | ``` 178 | pytest 179 | ``` 180 | 181 | The tests require access to a running Elastic/Geonames service to 182 | complete. The tests are currently failing on TravisCI with an unexplained 183 | segfault but run fine locally. Mordecai has only been tested with Python 3. 184 | 185 | 186 | Acknowledgements 187 | ---------------- 188 | 189 | An earlier version of this software was donated to the Open Event Data Alliance 190 | by Caerus Associates. See [Releases](https://github.com/openeventdata/mordecai/releases) 191 | or the [legacy-docker](https://github.com/openeventdata/mordecai/tree/legacy-docker) branch for the 192 | 2015-2016 and the 2016-2017 production versions of Mordecai. 193 | 194 | This work was funded in part by DARPA's XDATA program, the U.S. Army Research 195 | Laboratory and the U.S. 
Army Research Office through the Minerva Initiative 196 | under grant number W911NF-13-0332, and the National Science Foundation under 197 | award number SBE-SMA-1539302. Any opinions, findings, and conclusions or 198 | recommendations expressed in this material are those of the authors and do not 199 | necessarily reflect the views of DARPA, ARO, Minerva, NSF, or the U.S. 200 | government. 201 | 202 | 203 | Contributing 204 | ------------ 205 | 206 | Contributions via pull requests are welcome. Please make sure that changes 207 | pass the unit tests. Any bugs and problems can be reported 208 | on the repo's [issues page](https://github.com/openeventdata/mordecai/issues). 209 | 210 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = mordecai 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/how_to_build.txt: -------------------------------------------------------------------------------- 1 | # convert docstrings to restructured text 2 | sphinx-apidoc -f -o source/ ../mordecai 3 | # build the docs 4 | make html 5 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | set SPHINXPROJ=mordecai 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/source/README.rst: -------------------------------------------------------------------------------- 1 | .. 
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# mordecai documentation build configuration file, created by
# sphinx-quickstart on Mon Nov 20 12:24:51 2017.
#
# Sphinx executes this file with the current directory set to its
# containing dir (docs/source), so the relative paths below are resolved
# from there.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import sys

# The .rst sources reference the code as ``mordecai.geoparse``,
# ``mordecai.utilities`` and ``mordecai.tests.*``, so autodoc must be able to
# ``import mordecai``. That requires the directory that CONTAINS the package
# (the repository root, two levels up), not the package directory itself.
sys.path.insert(0, os.path.abspath('../..'))


# -- General configuration ------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
# Napoleon ships with Sphinx itself as ``sphinx.ext.napoleon``; the
# stand-alone ``sphinxcontrib.napoleon`` package is deprecated.
extensions = ['sphinx.ext.autodoc', 'sphinx.ext.napoleon']

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix(es) of source filenames.
# You can specify multiple suffixes as a list of strings:
#
# source_suffix = ['.rst', '.md']
source_suffix = '.rst'

# The master toctree document.
master_doc = 'index'

# General information about the project.
project = 'mordecai'
# ``copyright`` shadows a builtin, but this exact name is required by Sphinx.
copyright = '2017, Andy Halterman'
author = 'Andy Halterman'

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents. Keep both in sync with ``__version__`` in
# mordecai/__init__.py.
#
# The short X.Y version.
version = '2.1'
# The full version, including alpha/beta/rc tags.
release = '2.1.0'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
# An explicit 'en' is used because recent Sphinx releases warn on ``None``.
language = 'en'

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# These patterns also affect html_static_path and html_extra_path.
exclude_patterns = []

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'

# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False


# -- Options for HTML output ----------------------------------------------

# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.
#
html_theme = 'alabaster'

# Theme options are theme-specific and customize the look and feel of a theme
# further.  For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']


# -- Options for HTMLHelp output ------------------------------------------

# Output file base name for HTML help builder.
htmlhelp_basename = 'mordecaidoc'


# -- Options for LaTeX output ---------------------------------------------

latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #
    # 'papersize': 'letterpaper',

    # The font size ('10pt', '11pt' or '12pt').
    #
    # 'pointsize': '10pt',

    # Additional stuff for the LaTeX preamble.
    #
    # 'preamble': '',

    # Latex figure (float) alignment
    #
    # 'figure_align': 'htbp',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [
    (master_doc, 'mordecai.tex', 'mordecai Documentation',
     'Andy Halterman', 'manual'),
]


# -- Options for manual page output ---------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
    (master_doc, 'mordecai', 'mordecai Documentation',
     [author], 1)
]


# -- Options for Texinfo output -------------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
#  dir menu entry, description, category)
texinfo_documents = [
    (master_doc, 'mordecai', 'mordecai Documentation',
     author, 'mordecai', 'One line description of project.',
     'Miscellaneous'),
]
toctree:: 8 | 9 | mordecai.tests 10 | 11 | Submodules 12 | ---------- 13 | 14 | mordecai.geoparse module 15 | ------------------------ 16 | 17 | .. automodule:: mordecai.geoparse 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | mordecai.utilities module 23 | ------------------------- 24 | 25 | .. automodule:: mordecai.utilities 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | 30 | 31 | Module contents 32 | --------------- 33 | 34 | .. automodule:: mordecai 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | -------------------------------------------------------------------------------- /docs/source/mordecai.tests.rst: -------------------------------------------------------------------------------- 1 | mordecai.tests package 2 | ====================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | mordecai.tests.conftest module 8 | ------------------------------ 9 | 10 | .. automodule:: mordecai.tests.conftest 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | mordecai.tests.test_mordecai module 16 | ----------------------------------- 17 | 18 | .. automodule:: mordecai.tests.test_mordecai 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | 24 | Module contents 25 | --------------- 26 | 27 | .. automodule:: mordecai.tests 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Mordecai examples 2 | 3 | ## Geocoding cities 4 | 5 | This script is an example usage of `geo.lookup_city()`. It takes a CSV 6 | containing columns with city names, (optionally) state/ADM1 names, and 7 | country 3 letter codes. 
# Geonames fields copied verbatim from a lookup result into the output row.
# The order here determines the column order of the output CSV.
_GEO_FIELDS = ("admin1_code", "admin2_code", "asciiname", "name",
               "geonameid", "feature_class", "feature_code", "country_code3")


def _geo_record(res):
    """Flatten the ``geo`` part of a ``lookup_city`` result into one flat dict.

    Returns a dict with the keys in ``_GEO_FIELDS`` plus ``lat`` and ``lon``.
    If the ``geo`` entry cannot be subscripted (TypeError, e.g. it is ``None``),
    every field is set to the empty string so the row is still written out.
    """
    try:
        geo = res['geo']
        record = {field: geo[field] for field in _GEO_FIELDS}
        # "coordinates" is a single "lat,lon" string; split it only once.
        coords = geo['coordinates'].split(",")
        record["lat"] = float(coords[0])
        record["lon"] = float(coords[1])
    except TypeError:
        record = {field: "" for field in _GEO_FIELDS}
        record["lat"] = ""
        record["lon"] = ""
    return record


def main(in_file: ("input CSV file"),
         out_file: ("filename to write output to"),
         city_col: ("column in CSV with city col") = "city",
         adm1_col: ("column in CSV with state/governorate/ADM1") = "adm1",
         country_col: ("column in CSV with country name") = "country"):
    """Geocode a csv with a city, ADM1, and country columns."""
    # The parameter annotations are the per-argument help strings that plac
    # shows on the command line, so they are kept as plain strings.
    print("Loading Mordecai...")
    geo = Geoparser()
    df = pd.read_csv(in_file)
    geocoded = []
    print("Geocoding...")
    # iterrows() is a generator with no length, so pass the row count
    # explicitly; otherwise tqdm cannot show a percentage or an ETA.
    for _, row in tqdm(df.iterrows(), total=len(df)):
        # Elasticsearch doesn't like NaN, change to None
        adm1 = None if pd.isnull(row[adm1_col]) else row[adm1_col]
        res = geo.lookup_city(city=row[city_col],
                              adm1=adm1,
                              country=row[country_col])
        record = _geo_record(res)
        # Echo the search terms so each output row can be matched to its input.
        record['search_city'] = row[city_col]
        record['search_adm1'] = row[adm1_col]
        record['search_country'] = row[country_col]
        # Diagnostic text describing how (and how confidently) the match was made.
        record["info"] = res['info']
        record["reason"] = res['reason']
        geocoded.append(record)
    geo_df = pd.DataFrame(geocoded)
    geo_df.to_csv(out_file)
    print("Wrote file out to ", out_file)
5 | 3,,,,,,,,,,,Whaugbggoan,OK,USA,0 total results of all types.,FAILURE: No fuzzy match for city or neighborhood. 6 | 4,DC,001,Columbia Heights,Columbia Heights,4138102,P,PPL,USA,38.92567,-77.02942,Columbia Heights,DC,USA,6 total results of all types,"Single match for city in Elasticsearch with name, ADM1, country." 7 | 5,09,,Aleppo,Aleppo,170063,P,PPLA,SYR,36.20124,37.16117,Aleppo,Aleppo,SYR,9 total results of all types,"Single match for city in Elasticsearch with name, ADM1, country." 8 | -------------------------------------------------------------------------------- /mordecai/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include data/nat_df.csv 2 | -------------------------------------------------------------------------------- /mordecai/__init__.py: -------------------------------------------------------------------------------- 1 | from .geoparse import Geoparser 2 | 3 | __version__ = "2.1.0" 4 | -------------------------------------------------------------------------------- /mordecai/data/countries.json: -------------------------------------------------------------------------------- 1 | "{\"Afghanistan\":\"AFG\", \"\u00c5land Islands\":\"ALA\", \"Albania\":\"ALB\", \"Algeria\":\"DZA\", \"American Samoa\":\"ASM\", \"Andorra\":\"AND\", \"Angola\":\"AGO\", \"Anguilla\":\"AIA\", \"Antarctica\":\"ATA\", \"Antigua Barbuda\":\"ATG\", \"Argentina\":\"ARG\", \"Armenia\":\"ARM\", \"Aruba\":\"ABW\", \"Ascension_Island\":\"NA\", \"Australia\":\"AUS\", \"Austria\":\"AUT\", \"Azerbaijan\":\"AZE\", \"Bahamas\":\"BHS\", \"Bahrain\":\"BHR\", \"Bangladesh\":\"BGD\", \"Barbados\":\"BRB\", \"Belarus\":\"BLR\", \"Belgium\":\"BEL\", \"Belize\":\"BLZ\", \"Benin\":\"BEN\", \"Bermuda\":\"BMU\", \"Bhutan\":\"BTN\", \"Bolivia\":\"BOL\", \"Bosnia_Herzegovina\":\"BIH\", \"Botswana\":\"BWA\", \"Bouvet Island\":\"BVT\", \"Brazil\":\"BRA\", \"Britain\":\"GBR\", \"Great_Britain\":\"GBR\", \"British Virgin Islands\":\"VGB\", 
\"Brunei\":\"BRN\", \"Bulgaria\":\"BGR\", \"Burkina_Faso\":\"BFA\", \"Burundi\":\"BDI\", \"Cambodia\":\"KHM\", \"Cameroon\":\"CMR\", \"Canada\":\"CAN\",\"Cape Verde\":\"CPV\", \"Cayman_Islands\":\"CYM\", \"Central African Republic\":\"CAF\", \"Chad\":\"TCD\", \"Chile\":\"CHL\", \"China\":\"CHN\", \"Cocos_Islands\":\"CCK\", \"Colombia\":\"COL\", \"Comoros\":\"COM\", \"Congo Brazzaville\":\"COG\", \"Congo Kinshasa\":\"COD\", \"Congo\":\"COG\", \"Cook_Islands\":\"COK\", \"Costa_Rica\":\"CRI\", \"C Kinshasa\":\"COD\", \"Congo\":\"COG\", \"Cook_Islands\":\"COK\", \"Costa_Rica\":\"CRI\",ur Kinshasa\":\"COD\", \"Congo\":\"COG\", \"Cook_Islands\":\"COK\", \"Costa_R:\" Kinshasa\":\"COD\", \"Congo\":\"COG\", \"Cook_Islands\":\"COK\", \"Costa_Rica\":\"CRI\"Ecua Kinshasa\":\"COD\", \"Congo\":\"COG\", \"Cook_Islands\":\"COK\", \"Costa_Rica\":\"GNQ\", \"Eritrea\":\"ERI\", \"Estonia\":\"EST\", \"Ethiopia\":\"ETH\", \"Falkland_Islands \"Eritrea\":\"ERI\", \"Estonia\":\"EST\", \"Ethiopia\":\"ETH\", \"Falkland, \"France\":\"FRA\", \"French_Guiana\":\"GUF\", \"French_Polynesia\":\"PYF\",\"Gabon\": \"French_Guiana\":\"GUF\", \"French_Polynesia\":\"PYF\",\"Gabon\": \"French_Guiana\":\"GUa\":\"GHA\", \"Gibraltar\":\"GIB\", \"Greece\":\"GRC\", \"Greenland\":\"GRL\", \"Grenada\":\"GRD\", \"Guadeloupe\":\"GLP\", \"Guam\":\"GUM\", \"Guatemala\":\"GTM\", \"Guernsey\":\"GGY\", \"Guinea\":\"GIN\", \"Guinea_Bissau\":\"GNB\", \"Guyana\":\"GUY\", \"Haiti\":\"HTI\",\"Honduras\":\"HND\", \"Hong_Kong\":\"HKG\", \"Hungary\":\"HUN\", \"Iceland\":\"ISL\", \"India\":\"IND\", \"Indonesia\":\"IDN\", \"Iran\":\"IRN\", \"Iraq\":\"IRQ\", \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": 
\"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"IreD\" \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ire\", \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\"G\" \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Iue\" \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ir\", \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Irelandtil \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\":\":\" \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": \"Ireland\": th Mayen\":\"SJM\", \"Swaziland\":\"SWZ\",\"Sweden\":\"SWE\", \"Switzerland\":\"CHE\", \"Syria\":\"SYR\", \"Taiwan\":\"TWN\",\"Tajikistan\":\"TJK\", \"Tanzania\":\"TZA\", \"Thailand\":\"THA\", \"Timor Leste\":\"TLS\",\"East_Timor\":\"TLS\",\"Togo\":\"TGO\", \"Tokelau\":\"TKL\", \"Tonga\":\"TON\", \"TrinidadTobago\":\"TTO\", \"Tunisia\":\"TUN\", \"Turkey\":\"TUR\", \"Turkmenistan\":\"TKM\", \"TurksCaicos Islands\":\"TCA\", \"Tuvalu\":\"TUV\", \"U.S. 
Minor Outlying Islands\":\"UMI\",\"Virgin_Islands\":\"VIR\", \"Uganda\":\"UGA\", \"Ukraine\":\"UKR\",\"United_Arab_Emirates\":\"ARE\", \"United_Kingdom\":\"GBR\", \"UK\":\"GBR\",\"United_States\":\"USA\", \"USA\":\"USA\", \"America\":\"USA\", \"Uruguay\":\"URY\",\"Uzbekistan\":\"UZB\", \"Vanuatu\":\"VUT\", \"Vatican\":\"VAT\", \"Venezuela\":\"VEN\",\"Vietnam\":\"VNM\", \"Wallis Futuna\":\"WLF\", \"Western_Sahara\":\"ESH\", \"Yemen\":\"YEM\",\"Zambia\":\"ZMB\", \"Zimbabwe\":\"ZWE\"}" 2 | -------------------------------------------------------------------------------- /mordecai/data/feature_codes.txt: -------------------------------------------------------------------------------- 1 | A ADM1 A.ADM1 first-order administrative division a primary administrative division of a country, such as a state in the United States 2 | A ADM2 A.ADM2 second-order administrative division a subdivision of a first-order administrative division 3 | A ADM3 A.ADM3 third-order administrative division a subdivision of a second-order administrative division 4 | A ADM4 A.ADM4 fourth-order administrative division a subdivision of a third-order administrative division 5 | A ADM5 A.ADM5 fifth-order administrative division a subdivision of a fourth-order administrative division 6 | A ADMD A.ADMD administrative division an administrative division of a country, undifferentiated as to administrative level 7 | A LTER A.LTER leased area a tract of land leased to another country, usually for military installations 8 | A PCL A.PCL political entity 9 | A PCLD A.PCLD dependent political entity 10 | A PCLF A.PCLF freely associated state 11 | A PCLI A.PCLI independent political entity 12 | A PCLIX A.PCLIX section of independent political entity 13 | A PCLS A.PCLS semi-independent political entity 14 | A PRSH A.PRSH parish an ecclesiastical district 15 | A TERR A.TERR territory 16 | A ZN A.ZN zone 17 | A ZNB A.ZNB buffer zone a zone recognized as a buffer between two nations in which military presence is 
minimal or absent 18 | H AIRS H.AIRS seaplane landing area a place on a waterbody where floatplanes land and take off 19 | H ANCH H.ANCH anchorage an area where vessels may anchor 20 | H BAY H.BAY bay a coastal indentation between two capes or headlands, larger than a cove but smaller than a gulf 21 | H BAYS H.BAYS bays coastal indentations between two capes or headlands, larger than a cove but smaller than a gulf 22 | H BGHT H.BGHT bight(s) an open body of water forming a slight recession in a coastline 23 | H BNK H.BNK bank(s) an elevation, typically located on a shelf, over which the depth of water is relatively shallow but sufficient for most surface navigation 24 | H BNKR H.BNKR stream bank a sloping margin of a stream channel which normally confines the stream to its channel on land 25 | H BNKX H.BNKX section of bank 26 | H BOG H.BOG bog(s) a wetland characterized by peat forming sphagnum moss, sedge, and other acid-water plants 27 | H CAPG H.CAPG icecap a dome-shaped mass of glacial ice covering an area of mountain summits or other high lands; smaller than an ice sheet 28 | H CHN H.CHN channel the deepest part of a stream, bay, lagoon, or strait, through which the main current flows 29 | H CHNL H.CHNL lake channel(s) that part of a lake having water deep enough for navigation between islands, shoals, etc. 
30 | H CHNM H.CHNM marine channel that part of a body of water deep enough for navigation through an area otherwise not suitable 31 | H CHNN H.CHNN navigation channel a buoyed channel of sufficient depth for the safe navigation of vessels 32 | H CNFL H.CNFL confluence a place where two or more streams or intermittent streams flow together 33 | H CNL H.CNL canal an artificial watercourse 34 | H CNLA H.CNLA aqueduct a conduit used to carry water 35 | H CNLB H.CNLB canal bend a conspicuously curved or bent section of a canal 36 | H CNLD H.CNLD drainage canal an artificial waterway carrying water away from a wetland or from drainage ditches 37 | H CNLI H.CNLI irrigation canal a canal which serves as a main conduit for irrigation water 38 | H CNLN H.CNLN navigation canal(s) a watercourse constructed for navigation of vessels 39 | H CNLQ H.CNLQ abandoned canal 40 | H CNLSB H.CNLSB underground irrigation canal(s) a gently inclined underground tunnel bringing water for irrigation from aquifers 41 | H CNLX H.CNLX section of canal 42 | H COVE H.COVE cove(s) a small coastal indentation, smaller than a bay 43 | H CRKT H.CRKT tidal creek(s) a meandering channel in a coastal wetland subject to bi-directional tidal currents 44 | H CRNT H.CRNT current a horizontal flow of water in a given direction with uniform velocity 45 | H CUTF H.CUTF cutoff a channel formed as a result of a stream cutting through a meander neck 46 | H DCK H.DCK dock(s) a waterway between two piers, or cut into the land for the berthing of ships 47 | H DCKB H.DCKB docking basin a part of a harbor where ships dock 48 | H DOMG H.DOMG icecap dome a comparatively elevated area on an icecap 49 | H DPRG H.DPRG icecap depression a comparatively depressed area on an icecap 50 | H DTCH H.DTCH ditch a small artificial watercourse dug for draining or irrigating the land 51 | H DTCHD H.DTCHD drainage ditch a ditch which serves to drain the land 52 | H DTCHI H.DTCHI irrigation ditch a ditch which serves to distribute 
irrigation water 53 | H DTCHM H.DTCHM ditch mouth(s) an area where a drainage ditch enters a lagoon, lake or bay 54 | H ESTY H.ESTY estuary a funnel-shaped stream mouth or embayment where fresh water mixes with sea water under tidal influences 55 | H FISH H.FISH fishing area a fishing ground, bank or area where fishermen go to catch fish 56 | H FJD H.FJD fjord a long, narrow, steep-walled, deep-water arm of the sea at high latitudes, usually along mountainous coasts 57 | H FJDS H.FJDS fjords long, narrow, steep-walled, deep-water arms of the sea at high latitudes, usually along mountainous coasts 58 | H FLLS H.FLLS waterfall(s) a perpendicular or very steep descent of the water of a stream 59 | H FLLSX H.FLLSX section of waterfall(s) 60 | H FLTM H.FLTM mud flat(s) a relatively level area of mud either between high and low tide lines, or subject to flooding 61 | H FLTT H.FLTT tidal flat(s) a large flat area of mud or sand attached to the shore and alternately covered and uncovered by the tide 62 | H GLCR H.GLCR glacier(s) a mass of ice, usually at high latitudes or high elevations, with sufficient thickness to flow away from the source area in lobes, tongues, or masses 63 | H GULF H.GULF gulf a large recess in the coastline, larger than a bay 64 | H GYSR H.GYSR geyser a type of hot spring with intermittent eruptions of jets of hot water and steam 65 | H HBR H.HBR harbor(s) a haven or space of deep water so sheltered by the adjacent land as to afford a safe anchorage for ships 66 | H HBRX H.HBRX section of harbor 67 | H INLT H.INLT inlet a narrow waterway extending into the land, or connecting a bay or lagoon with a larger body of water 68 | H INLTQ H.INLTQ former inlet an inlet which has been filled in, or blocked by deposits 69 | H LBED H.LBED lake bed(s) a dried up or drained area of a former lake 70 | H LGN H.LGN lagoon a shallow coastal waterbody, completely or partly separated from a larger body of water by a barrier island, coral reef or other depositional 
feature 71 | H LGNS H.LGNS lagoons shallow coastal waterbodies, completely or partly separated from a larger body of water by a barrier island, coral reef or other depositional feature 72 | H LGNX H.LGNX section of lagoon 73 | H LK H.LK lake a large inland body of standing water 74 | H LKC H.LKC crater lake a lake in a crater or caldera 75 | H LKI H.LKI intermittent lake 76 | H LKN H.LKN salt lake an inland body of salt water with no outlet 77 | H LKNI H.LKNI intermittent salt lake 78 | H LKO H.LKO oxbow lake a crescent-shaped lake commonly found adjacent to meandering streams 79 | H LKOI H.LKOI intermittent oxbow lake 80 | H LKS H.LKS lakes large inland bodies of standing water 81 | H LKSB H.LKSB underground lake a standing body of water in a cave 82 | H LKSC H.LKSC crater lakes lakes in a crater or caldera 83 | H LKSI H.LKSI intermittent lakes 84 | H LKSN H.LKSN salt lakes inland bodies of salt water with no outlet 85 | H LKSNI H.LKSNI intermittent salt lakes 86 | H LKX H.LKX section of lake 87 | H MFGN H.MFGN salt evaporation ponds diked salt ponds used in the production of solar evaporated salt 88 | H MGV H.MGV mangrove swamp a tropical tidal mud flat characterized by mangrove vegetation 89 | H MOOR H.MOOR moor(s) an area of open ground overlaid with wet peaty soils 90 | H MRSH H.MRSH marsh(es) a wetland dominated by grass-like vegetation 91 | H MRSHN H.MRSHN salt marsh a flat area, subject to periodic salt water inundation, dominated by grassy salt-tolerant plants 92 | H NRWS H.NRWS narrows a navigable narrow part of a bay, strait, river, etc. 
93 | H OCN H.OCN ocean one of the major divisions of the vast expanse of salt water covering part of the earth 94 | H OVF H.OVF overfalls an area of breaking waves caused by the meeting of currents or by waves moving against the current 95 | H PND H.PND pond a small standing waterbody 96 | H PNDI H.PNDI intermittent pond 97 | H PNDN H.PNDN salt pond a small standing body of salt water often in a marsh or swamp, usually along a seacoast 98 | H PNDNI H.PNDNI intermittent salt pond(s) 99 | H PNDS H.PNDS ponds small standing waterbodies 100 | H PNDSF H.PNDSF fishponds ponds or enclosures in which fish are kept or raised 101 | H PNDSI H.PNDSI intermittent ponds 102 | H PNDSN H.PNDSN salt ponds small standing bodies of salt water often in a marsh or swamp, usually along a seacoast 103 | H POOL H.POOL pool(s) a small and comparatively still, deep part of a larger body of water such as a stream or harbor; or a small body of standing water 104 | H POOLI H.POOLI intermittent pool 105 | H RCH H.RCH reach a straight section of a navigable stream or channel between two bends 106 | H RDGG H.RDGG icecap ridge a linear elevation on an icecap 107 | H RDST H.RDST roadstead an open anchorage affording less protection than a harbor 108 | H RF H.RF reef(s) a surface-navigation hazard composed of consolidated material 109 | H RFC H.RFC coral reef(s) a surface-navigation hazard composed of coral 110 | H RFX H.RFX section of reef 111 | H RPDS H.RPDS rapids a turbulent section of a stream associated with a steep, irregular stream bed 112 | H RSV H.RSV reservoir(s) an artificial pond or lake 113 | H RSVI H.RSVI intermittent reservoir 114 | H RSVT H.RSVT water tank a contained pool or tank of water at, below, or above ground level 115 | H RVN H.RVN ravine(s) a small, narrow, deep, steep-sided stream channel, smaller than a gorge 116 | H SBKH H.SBKH sabkha(s) a salt flat or salt encrusted plain subject to periodic inundation from flooding or high tides 117 | H SD H.SD sound a long arm of the 
sea forming a channel between the mainland and an island or islands; or connecting two larger bodies of water 118 | H SEA H.SEA sea a large body of salt water more or less confined by continuous land or chains of islands forming a subdivision of an ocean 119 | H SHOL H.SHOL shoal(s) a surface-navigation hazard composed of unconsolidated material 120 | H SILL H.SILL sill the low part of an underwater gap or saddle separating basins, including a similar feature at the mouth of a fjord 121 | H SPNG H.SPNG spring(s) a place where ground water flows naturally out of the ground 122 | H SPNS H.SPNS sulphur spring(s) a place where sulphur ground water flows naturally out of the ground 123 | H SPNT H.SPNT hot spring(s) a place where hot ground water flows naturally out of the ground 124 | H STM H.STM stream a body of running water moving to a lower level in a channel on land 125 | H STMA H.STMA anabranch a diverging branch flowing out of a main stream and rejoining it downstream 126 | H STMB H.STMB stream bend a conspicuously curved or bent segment of a stream 127 | H STMC H.STMC canalized stream a stream that has been substantially ditched, diked, or straightened 128 | H STMD H.STMD distributary(-ies) a branch which flows away from the main stream, as in a delta or irrigation canal 129 | H STMH H.STMH headwaters the source and upper part of a stream, including the upper drainage basin 130 | H STMI H.STMI intermittent stream 131 | H STMIX H.STMIX section of intermittent stream 132 | H STMM H.STMM stream mouth(s) a place where a stream discharges into a lagoon, lake, or the sea 133 | H STMQ H.STMQ abandoned watercourse a former stream or distributary no longer carrying flowing water, but still evident due to lakes, wetland, topographic or vegetation patterns 134 | H STMS H.STMS streams bodies of running water moving to a lower level in a channel on land 135 | H STMSB H.STMSB lost river a surface stream that disappears into an underground channel, or dries up in an arid area 
136 | H STMX H.STMX section of stream 137 | H STRT H.STRT strait a relatively narrow waterway, usually narrower and less extensive than a sound, connecting two larger bodies of water 138 | H SWMP H.SWMP swamp a wetland dominated by tree vegetation 139 | H SYSI H.SYSI irrigation system a network of ditches and one or more of the following elements: water supply, reservoir, canal, pump, well, drain, etc. 140 | H TNLC H.TNLC canal tunnel a tunnel through which a canal passes 141 | H WAD H.WAD wadi a valley or ravine, bounded by relatively steep banks, which in the rainy season becomes a watercourse; found primarily in North Africa and the Middle East 142 | H WADB H.WADB wadi bend a conspicuously curved or bent segment of a wadi 143 | H WADJ H.WADJ wadi junction a place where two or more wadies join 144 | H WADM H.WADM wadi mouth the lower terminus of a wadi where it widens into an adjoining floodplain, depression, or waterbody 145 | H WADS H.WADS wadies valleys or ravines, bounded by relatively steep banks, which in the rainy season become watercourses; found primarily in North Africa and the Middle East 146 | H WADX H.WADX section of wadi 147 | H WHRL H.WHRL whirlpool a turbulent, rotating movement of water in a stream 148 | H WLL H.WLL well a cylindrical hole, pit, or tunnel drilled or dug down to a depth from which water, oil, or gas can be pumped or brought to the surface 149 | H WLLQ H.WLLQ abandoned well 150 | H WLLS H.WLLS wells cylindrical holes, pits, or tunnels drilled or dug down to a depth from which water, oil, or gas can be pumped or brought to the surface 151 | H WTLD H.WTLD wetland an area subject to inundation, usually characterized by bog, marsh, or swamp vegetation 152 | H WTLDI H.WTLDI intermittent wetland 153 | H WTRC H.WTRC watercourse a natural, well-defined channel produced by flowing water, or an artificial channel designed to carry flowing water 154 | H WTRH H.WTRH waterhole(s) a natural hole, hollow, or small depression that contains water, 
used by man and animals, especially in arid areas 155 | L AGRC L.AGRC agricultural colony a tract of land set aside for agricultural settlement 156 | L AMUS L.AMUS amusement park Amusement Park are theme parks, adventure parks offering entertainment, similar to funfairs but with a fix location 157 | L AREA L.AREA area a tract of land without homogeneous character or boundaries 158 | L BSND L.BSND drainage basin an area drained by a stream 159 | L BSNP L.BSNP petroleum basin an area underlain by an oil-rich structural basin 160 | L BTL L.BTL battlefield a site of a land battle of historical importance 161 | L CLG L.CLG clearing an area in a forest with trees removed 162 | L CMN L.CMN common a park or pasture for community use 163 | L CNS L.CNS concession area a lease of land by a government for economic development, e.g., mining, forestry 164 | L COLF L.COLF coalfield a region in which coal deposits of possible economic value occur 165 | L CONT L.CONT continent continent : Europe, Africa, Asia, North America, South America, Oceania,Antarctica 166 | L CST L.CST coast a zone of variable width straddling the shoreline 167 | L CTRB L.CTRB business center a place where a number of businesses are located 168 | L DEVH L.DEVH housing development a tract of land on which many houses of similar design are built according to a development plan 169 | L FLD L.FLD field(s) an open as opposed to wooded area 170 | L FLDI L.FLDI irrigated field(s) a tract of level or terraced land which is irrigated 171 | L GASF L.GASF gasfield an area containing a subterranean store of natural gas of economic value 172 | L GRAZ L.GRAZ grazing area an area of grasses and shrubs used for grazing 173 | L GVL L.GVL gravel area an area covered with gravel 174 | L INDS L.INDS industrial area an area characterized by industrial activity 175 | L LAND L.LAND arctic land a tract of land in the Arctic 176 | L LCTY L.LCTY locality a minor area or place of unspecified or mixed character and indefinite 
boundaries 177 | L MILB L.MILB military base a place used by an army or other armed service for storing arms and supplies, and for accommodating and training troops, a base from which operations can be initiated 178 | L MNA L.MNA mining area an area of mine sites where minerals and ores are extracted 179 | L MVA L.MVA maneuver area a tract of land where military field exercises are carried out 180 | L NVB L.NVB naval base an area used to store supplies, provide barracks for troops and naval personnel, a port for naval vessels, and from which operations are initiated 181 | L OAS L.OAS oasis(-es) an area in a desert made productive by the availability of water 182 | L OILF L.OILF oilfield an area containing a subterranean store of petroleum of economic value 183 | L PEAT L.PEAT peat cutting area an area where peat is harvested 184 | L PRK L.PRK park an area, often of forested land, maintained as a place of beauty, or for recreation 185 | L PRT L.PRT port a place provided with terminal and transfer facilities for loading and discharging waterborne cargo or passengers, usually located in a harbor 186 | L QCKS L.QCKS quicksand an area where loose sand with water moving through it may become unstable when heavy objects are placed at the surface, causing them to sink 187 | L RES L.RES reserve a tract of public land reserved for future use or restricted as to use 188 | L RESA L.RESA agricultural reserve a tract of land reserved for agricultural reclamation and/or development 189 | L RESF L.RESF forest reserve a forested area set aside for preservation or controlled use 190 | L RESH L.RESH hunting reserve a tract of land used primarily for hunting 191 | L RESN L.RESN nature reserve an area reserved for the maintenance of a natural habitat 192 | L RESP L.RESP palm tree reserve an area of palm trees where use is controlled 193 | L RESV L.RESV reservation a tract of land set aside for aboriginal, tribal, or native populations 194 | L RESW L.RESW wildlife reserve a tract of 
public land reserved for the preservation of wildlife 195 | L RGN L.RGN region an area distinguished by one or more observable physical or cultural characteristics 196 | L RGNE L.RGNE economic region a region of a country established for economic development or for statistical purposes 197 | L RGNL L.RGNL lake region a tract of land distinguished by numerous lakes 198 | L RNGA L.RNGA artillery range a tract of land used for artillery firing practice 199 | L SALT L.SALT salt area a shallow basin or flat where salt accumulates after periodic inundation 200 | L SNOW L.SNOW snowfield an area of permanent snow and ice forming the accumulation area of a glacier 201 | L TRB L.TRB tribal area a tract of land used by nomadic or other tribes 202 | P PPL P.PPL populated place a city, town, village, or other agglomeration of buildings where people live and work 203 | P PPLA P.PPLA seat of a first-order administrative division seat of a first-order administrative division (PPLC takes precedence over PPLA) 204 | P PPLA2 P.PPLA2 seat of a second-order administrative division 205 | P PPLA3 P.PPLA3 seat of a third-order administrative division 206 | P PPLA4 P.PPLA4 seat of a fourth-order administrative division 207 | P PPLC P.PPLC capital of a political entity 208 | P PPLF P.PPLF farm village a populated place where the population is largely engaged in agricultural activities 209 | P PPLG P.PPLG seat of government of a political entity 210 | P PPLL P.PPLL populated locality an area similar to a locality but with a small group of dwellings or other buildings 211 | P PPLQ P.PPLQ abandoned populated place 212 | P PPLR P.PPLR religious populated place a populated place whose population is largely engaged in religious occupations 213 | P PPLS P.PPLS populated places cities, towns, villages, or other agglomerations of buildings where people live and work 214 | P PPLW P.PPLW destroyed populated place a village, town or city destroyed by a natural disaster, or by war 215 | P PPLX P.PPLX 
section of populated place 216 | P STLMT P.STLMT israeli settlement 217 | R CSWY R.CSWY causeway a raised roadway across wet ground or shallow water 218 | R OILP R.OILP oil pipeline a pipeline used for transporting oil 219 | R PRMN R.PRMN promenade a place for public walking, usually along a beach front 220 | R PTGE R.PTGE portage a place where boats, goods, etc., are carried overland between navigable waters 221 | R RD R.RD road an open way with improved surface for transportation of animals, people and vehicles 222 | R RDA R.RDA ancient road the remains of a road used by ancient cultures 223 | R RDB R.RDB road bend a conspicuously curved or bent section of a road 224 | R RDCUT R.RDCUT road cut an excavation cut through a hill or ridge for a road 225 | R RDJCT R.RDJCT road junction a place where two or more roads join 226 | R RJCT R.RJCT railroad junction a place where two or more railroad tracks join 227 | R RR R.RR railroad a permanent twin steel-rail track on which freight and passenger cars move long distances 228 | R RRQ R.RRQ abandoned railroad 229 | R RTE R.RTE caravan route the route taken by caravans 230 | R RYD R.RYD railroad yard a system of tracks used for the making up of trains, and switching and storing freight cars 231 | R ST R.ST street a paved urban thoroughfare 232 | R STKR R.STKR stock route a route taken by livestock herds 233 | R TNL R.TNL tunnel a subterranean passageway for transportation 234 | R TNLN R.TNLN natural tunnel a cave that is open at both ends 235 | R TNLRD R.TNLRD road tunnel a tunnel through which a road passes 236 | R TNLRR R.TNLRR railroad tunnel a tunnel through which a railroad passes 237 | R TNLS R.TNLS tunnels subterranean passageways for transportation 238 | R TRL R.TRL trail a path, track, or route used by pedestrians, animals, or off-road vehicles 239 | S ADMF S.ADMF administrative facility a government building 240 | S AGRF S.AGRF agricultural facility a building and/or tract of land used for improving agriculture 
241 | S AIRB S.AIRB airbase an area used to store supplies, provide barracks for air force personnel, hangars and runways for aircraft, and from which operations are initiated 242 | S AIRF S.AIRF airfield a place on land where aircraft land and take off; no facilities provided for the commercial handling of passengers and cargo 243 | S AIRH S.AIRH heliport a place where helicopters land and take off 244 | S AIRP S.AIRP airport a place where aircraft regularly land and take off, with runways, navigational aids, and major facilities for the commercial handling of passengers and cargo 245 | S AIRQ S.AIRQ abandoned airfield 246 | S AMTH S.AMTH amphitheater an oval or circular structure with rising tiers of seats about a stage or open space 247 | S ANS S.ANS ancient site a place where archeological remains, old structures, or cultural artifacts are located 248 | S AQC S.AQC aquaculture facility facility or area for the cultivation of aquatic animals and plants, especially fish, shellfish, and seaweed, in natural or controlled marine or freshwater environments; underwater agriculture 249 | S ARCH S.ARCH arch a natural or man-made structure in the form of an arch 250 | S ASTR S.ASTR astronomical station a point on the earth whose position has been determined by observations of celestial bodies 251 | S ASYL S.ASYL asylum a facility where the insane are cared for and protected 252 | S ATHF S.ATHF athletic field a tract of land used for playing team sports, and athletic track and field events 253 | S ATM S.ATM automatic teller machine An unattended electronic machine in a public place, connected to a data system and related equipment and activated by a bank customer to obtain cash withdrawals and other banking services. 254 | S BANK S.BANK bank A business establishment in which money is kept for saving or commercial purposes or is invested, supplied for loans, or exchanged. 
255 | S BCN S.BCN beacon a fixed artificial navigation mark 256 | S BDG S.BDG bridge a structure erected across an obstacle such as a stream, road, etc., in order to carry roads, railroads, and pedestrians across 257 | S BDGQ S.BDGQ ruined bridge a destroyed or decayed bridge which is no longer functional 258 | S BLDG S.BLDG building(s) a structure built for permanent use, as a house, factory, etc. 259 | S BLDO S.BLDO office building commercial building where business and/or services are conducted 260 | S BP S.BP boundary marker a fixture marking a point along a boundary 261 | S BRKS S.BRKS barracks a building for lodging military personnel 262 | S BRKW S.BRKW breakwater a structure erected to break the force of waves at the entrance to a harbor or port 263 | S BSTN S.BSTN baling station a facility for baling agricultural products 264 | S BTYD S.BTYD boatyard a waterside facility for servicing, repairing, and building small vessels 265 | S BUR S.BUR burial cave(s) a cave used for human burials 266 | S BUSTN S.BUSTN bus station a facility comprising ticket office, platforms, etc. 
for loading and unloading passengers 267 | S BUSTP S.BUSTP bus stop a place lacking station facilities 268 | S CARN S.CARN cairn a heap of stones erected as a landmark or for other purposes 269 | S CAVE S.CAVE cave(s) an underground passageway or chamber, or cavity on the side of a cliff 270 | S CH S.CH church a building for public Christian worship 271 | S CMP S.CMP camp(s) a site occupied by tents, huts, or other shelters for temporary use 272 | S CMPL S.CMPL logging camp a camp used by loggers 273 | S CMPLA S.CMPLA labor camp a camp used by migrant or temporary laborers 274 | S CMPMN S.CMPMN mining camp a camp used by miners 275 | S CMPO S.CMPO oil camp a camp used by oilfield workers 276 | S CMPQ S.CMPQ abandoned camp 277 | S CMPRF S.CMPRF refugee camp a camp used by refugees 278 | S CMTY S.CMTY cemetery a burial place or ground 279 | S COMC S.COMC communication center a facility, including buildings, antennae, towers and electronic equipment for receiving and transmitting information 280 | S CRRL S.CRRL corral(s) a pen or enclosure for confining or capturing animals 281 | S CSNO S.CSNO casino a building used for entertainment, especially gambling 282 | S CSTL S.CSTL castle a large fortified building or set of buildings 283 | S CSTM S.CSTM customs house a building in a port where customs and duties are paid, and where vessels are entered and cleared 284 | S CTHSE S.CTHSE courthouse a building in which courts of law are held 285 | S CTRA S.CTRA atomic center a facility where atomic research is carried out 286 | S CTRCM S.CTRCM community center a facility for community recreation and other activities 287 | S CTRF S.CTRF facility center a place where more than one facility is situated 288 | S CTRM S.CTRM medical center a complex of health care buildings including two or more of the following: hospital, medical school, clinic, pharmacy, doctor's offices, etc. 
289 | S CTRR S.CTRR religious center a facility where more than one religious activity is carried out, e.g., retreat, school, monastery, worship 290 | S CTRS S.CTRS space center a facility for launching, tracking, or controlling satellites and space vehicles 291 | S CVNT S.CVNT convent a building where a community of nuns lives in seclusion 292 | S DAM S.DAM dam a barrier constructed across a stream to impound water 293 | S DAMQ S.DAMQ ruined dam a destroyed or decayed dam which is no longer functional 294 | S DAMSB S.DAMSB sub-surface dam a dam put down to bedrock in a sand river 295 | S DARY S.DARY dairy a facility for the processing, sale and distribution of milk or milk products 296 | S DCKD S.DCKD dry dock a dock providing support for a vessel, and means for removing the water so that the bottom of the vessel can be exposed 297 | S DCKY S.DCKY dockyard a facility for servicing, building, or repairing ships 298 | S DIKE S.DIKE dike an earth or stone embankment usually constructed for flood or stream control 299 | S DIP S.DIP diplomatic facility office, residence, or facility of a foreign government, which may include an embassy, consulate, chancery, office of charge d?affaires, or other diplomatic, economic, military, or cultural mission 300 | S DPOF S.DPOF fuel depot an area where fuel is stored 301 | S EST S.EST estate(s) a large commercialized agricultural landholding with associated buildings and other facilities 302 | S ESTO S.ESTO oil palm plantation an estate specializing in the cultivation of oil palm trees 303 | S ESTR S.ESTR rubber plantation an estate which specializes in growing and tapping rubber trees 304 | S ESTSG S.ESTSG sugar plantation an estate that specializes in growing sugar cane 305 | S ESTT S.ESTT tea plantation an estate which specializes in growing tea bushes 306 | S ESTX S.ESTX section of estate 307 | S FCL S.FCL facility a building or buildings housing a center, institute, foundation, hospital, prison, mission, courthouse, etc. 
308 | S FNDY S.FNDY foundry a building or works where metal casting is carried out 309 | S FRM S.FRM farm a tract of land with associated buildings devoted to agriculture 310 | S FRMQ S.FRMQ abandoned farm 311 | S FRMS S.FRMS farms tracts of land with associated buildings devoted to agriculture 312 | S FRMT S.FRMT farmstead the buildings and adjacent service areas of a farm 313 | S FT S.FT fort a defensive structure or earthworks 314 | S FY S.FY ferry a boat or other floating conveyance and terminal facilities regularly used to transport people and vehicles across a waterbody 315 | S GATE S.GATE gate a controlled access entrance or exit 316 | S GDN S.GDN garden(s) an enclosure for displaying selected plant or animal life 317 | S GHAT S.GHAT ghat a set of steps leading to a river, which are of religious significance, and at their base is usually a platform for bathing 318 | S GHSE S.GHSE guest house a house used to provide lodging for paying guests 319 | S GOSP S.GOSP gas-oil separator plant a facility for separating gas from oil 320 | S GOVL S.GOVL local government office a facility housing local governmental offices, usually a city, town, or village hall 321 | S GRVE S.GRVE grave a burial site 322 | S HERM S.HERM hermitage a secluded residence, usually for religious sects 323 | S HLT S.HLT halting place a place where caravans stop for rest 324 | S HSE S.HSE house(s) a building used as a human habitation 325 | S HSEC S.HSEC country house a large house, mansion, or chateau, on a large estate 326 | S HSP S.HSP hospital a building in which sick or injured, especially those confined to bed, are medically treated 327 | S HSPC S.HSPC clinic a medical facility associated with a hospital for outpatients 328 | S HSPD S.HSPD dispensary a building where medical or dental aid is dispensed 329 | S HSPL S.HSPL leprosarium an asylum or hospital for lepers 330 | S HSTS S.HSTS historical site a place of historical importance 331 | S HTL S.HTL hotel a building providing lodging 
and/or meals for the public 332 | S HUT S.HUT hut a small primitive house 333 | S HUTS S.HUTS huts small primitive houses 334 | S INSM S.INSM military installation a facility for use of and control by armed forces 335 | S ITTR S.ITTR research institute a facility where research is carried out 336 | S JTY S.JTY jetty a structure built out into the water at a river mouth or harbor entrance to regulate currents and silting 337 | S LDNG S.LDNG landing a place where boats receive or discharge passengers and freight, but lacking most port facilities 338 | S LEPC S.LEPC leper colony a settled area inhabited by lepers in relative isolation 339 | S LIBR S.LIBR library A place in which information resources such as books are kept for reading, reference, or lending. 340 | S LNDF S.LNDF landfill a place for trash and garbage disposal in which the waste is buried between layers of earth to build up low-lying land 341 | S LOCK S.LOCK lock(s) a basin in a waterway with gates at each end by means of which vessels are passed from one water level to another 342 | S LTHSE S.LTHSE lighthouse a distinctive structure exhibiting a major navigation light 343 | S MALL S.MALL mall A large, often enclosed shopping complex containing various stores, businesses, and restaurants usually accessible by common passageways. 344 | S MAR S.MAR marina a harbor facility for small boats, yachts, etc. 
345 | S MFG S.MFG factory one or more buildings where goods are manufactured, processed or fabricated 346 | S MFGB S.MFGB brewery one or more buildings where beer is brewed 347 | S MFGC S.MFGC cannery a building where food items are canned 348 | S MFGCU S.MFGCU copper works a facility for processing copper ore 349 | S MFGLM S.MFGLM limekiln a furnace in which limestone is reduced to lime 350 | S MFGM S.MFGM munitions plant a factory where ammunition is made 351 | S MFGPH S.MFGPH phosphate works a facility for producing fertilizer 352 | S MFGQ S.MFGQ abandoned factory 353 | S MFGSG S.MFGSG sugar refinery a facility for converting raw sugar into refined sugar 354 | S MKT S.MKT market a place where goods are bought and sold at regular intervals 355 | S ML S.ML mill(s) a building housing machines for transforming, shaping, finishing, grinding, or extracting products 356 | S MLM S.MLM ore treatment plant a facility for improving the metal content of ore by concentration 357 | S MLO S.MLO olive oil mill a mill where oil is extracted from olives 358 | S MLSG S.MLSG sugar mill a facility where sugar cane is processed into raw sugar 359 | S MLSGQ S.MLSGQ former sugar mill a sugar mill no longer used as a sugar mill 360 | S MLSW S.MLSW sawmill a mill where logs or lumber are sawn to specified shapes and sizes 361 | S MLWND S.MLWND windmill a mill or water pump powered by wind 362 | S MLWTR S.MLWTR water mill a mill powered by running water 363 | S MN S.MN mine(s) a site where mineral ores are extracted from the ground by excavating surface pits and subterranean passages 364 | S MNAU S.MNAU gold mine(s) a mine where gold ore, or alluvial gold is extracted 365 | S MNC S.MNC coal mine(s) a mine where coal is extracted 366 | S MNCR S.MNCR chrome mine(s) a mine where chrome ore is extracted 367 | S MNCU S.MNCU copper mine(s) a mine where copper ore is extracted 368 | S MNFE S.MNFE iron mine(s) a mine where iron ore is extracted 369 | S MNMT S.MNMT monument a commemorative 
structure or statue 370 | S MNN S.MNN salt mine(s) a mine from which salt is extracted 371 | S MNQ S.MNQ abandoned mine 372 | S MNQR S.MNQR quarry(-ies) a surface mine where building stone or gravel and sand, etc. are extracted 373 | S MOLE S.MOLE mole a massive structure of masonry or large stones serving as a pier or breakwater 374 | S MSQE S.MSQE mosque a building for public Islamic worship 375 | S MSSN S.MSSN mission a place characterized by dwellings, school, church, hospital and other facilities operated by a religious group for the purpose of providing charitable services and to propagate religion 376 | S MSSNQ S.MSSNQ abandoned mission 377 | S MSTY S.MSTY monastery a building and grounds where a community of monks lives in seclusion 378 | S MTRO S.MTRO metro station metro station (Underground, Tube, or Métro) 379 | S MUS S.MUS museum a building where objects of permanent interest in one or more of the arts and sciences are preserved and exhibited 380 | S NOV S.NOV novitiate a religious house or school where novices are trained 381 | S NSY S.NSY nursery(-ies) a place where plants are propagated for transplanting or grafting 382 | S OBPT S.OBPT observation point a wildlife or scenic observation point 383 | S OBS S.OBS observatory a facility equipped for observation of atmospheric or space phenomena 384 | S OBSR S.OBSR radio observatory a facility equipped with an array of antennae for receiving radio waves from space 385 | S OILJ S.OILJ oil pipeline junction a section of an oil pipeline where two or more pipes join together 386 | S OILQ S.OILQ abandoned oil well 387 | S OILR S.OILR oil refinery a facility for converting crude oil into refined petroleum products 388 | S OILT S.OILT tank farm a tract of land occupied by large, cylindrical, metal tanks in which oil or liquid petrochemicals are stored 389 | S OILW S.OILW oil well a well from which oil may be pumped 390 | S OPRA S.OPRA opera house A theater designed chiefly for the performance of operas.
391 | S PAL S.PAL palace a large stately house, often a royal or presidential residence 392 | S PGDA S.PGDA pagoda a tower-like storied structure, usually a Buddhist shrine 393 | S PIER S.PIER pier a structure built out into navigable water on piles providing berthing for ships and recreation 394 | S PKLT S.PKLT parking lot an area used for parking vehicles 395 | S PMPO S.PMPO oil pumping station a facility for pumping oil through a pipeline 396 | S PMPW S.PMPW water pumping station a facility for pumping water from a major well or through a pipeline 397 | S PO S.PO post office a public building in which mail is received, sorted and distributed 398 | S PP S.PP police post a building in which police are stationed 399 | S PPQ S.PPQ abandoned police post 400 | S PRKGT S.PRKGT park gate a controlled access to a park 401 | S PRKHQ S.PRKHQ park headquarters a park administrative facility 402 | S PRN S.PRN prison a facility for confining prisoners 403 | S PRNJ S.PRNJ reformatory a facility for confining, training, and reforming young law offenders 404 | S PRNQ S.PRNQ abandoned prison 405 | S PS S.PS power station a facility for generating electric power 406 | S PSH S.PSH hydroelectric power station a building where electricity is generated from water power 407 | S PSTB S.PSTB border post a post or station at an international boundary for the regulation of movement of people and goods 408 | S PSTC S.PSTC customs post a building at an international boundary where customs and duties are paid on goods 409 | S PSTP S.PSTP patrol post a post from which patrols are sent out 410 | S PYR S.PYR pyramid an ancient massive structure of square ground plan with four triangular faces meeting at a point and used for enclosing tombs 411 | S PYRS S.PYRS pyramids ancient massive structures of square ground plan with four triangular faces meeting at a point and used for enclosing tombs 412 | S QUAY S.QUAY quay a structure of solid construction along a shore or bank which provides berthing 
for ships and which generally provides cargo handling facilities 413 | S RDCR S.RDCR traffic circle a road junction formed around a central circle about which traffic moves in one direction only 414 | S RECG S.RECG golf course a recreation field where golf is played 415 | S RECR S.RECR racetrack a track where races are held 416 | S REST S.REST restaurant A place where meals are served to the public 417 | S RET S.RET store a building where goods and/or services are offered for sale 418 | S RHSE S.RHSE resthouse a structure maintained for the rest and shelter of travelers 419 | S RKRY S.RKRY rookery a breeding place of a colony of birds or seals 420 | S RLG S.RLG religious site an ancient site of significant religious importance 421 | S RLGR S.RLGR retreat a place of temporary seclusion, especially for religious groups 422 | S RNCH S.RNCH ranch(es) a large farm specializing in extensive grazing of livestock 423 | S RSD S.RSD railroad siding a short track parallel to and joining the main track 424 | S RSGNL S.RSGNL railroad signal a signal at the entrance of a particular section of track governing the movement of trains 425 | S RSRT S.RSRT resort a specialized facility for vacation, health, or participation sports activities 426 | S RSTN S.RSTN railroad station a facility comprising ticket office, platforms, etc. 
for loading and unloading train passengers and freight 427 | S RSTNQ S.RSTNQ abandoned railroad station 428 | S RSTP S.RSTP railroad stop a place lacking station facilities where trains stop to pick up and unload passengers and freight 429 | S RSTPQ S.RSTPQ abandoned railroad stop 430 | S RUIN S.RUIN ruin(s) a destroyed or decayed structure which is no longer functional 431 | S SCH S.SCH school building(s) where instruction in one or more branches of knowledge takes place 432 | S SCHA S.SCHA agricultural school a school with a curriculum focused on agriculture 433 | S SCHC S.SCHC college the grounds and buildings of an institution of higher learning 434 | S SCHL S.SCHL language school Language Schools & Institutions 435 | S SCHM S.SCHM military school a school at which military science forms the core of the curriculum 436 | S SCHN S.SCHN maritime school a school at which maritime sciences form the core of the curriculum 437 | S SCHT S.SCHT technical school post-secondary school with a specifically technical or vocational curriculum 438 | S SECP S.SECP State Exam Prep Centre state exam preparation centres 439 | S SHPF S.SHPF sheepfold a fence or wall enclosure for sheep and other small herd animals 440 | S SHRN S.SHRN shrine a structure or place memorializing a person or religious concept 441 | S SHSE S.SHSE storehouse a building for storing goods, especially provisions 442 | S SLCE S.SLCE sluice a conduit or passage for carrying off surplus water from a waterbody, usually regulated by means of a sluice gate 443 | S SNTR S.SNTR sanatorium a facility where victims of physical or mental disorders are treated 444 | S SPA S.SPA spa a resort area usually developed around a medicinal spring 445 | S SPLY S.SPLY spillway a passage or outlet through which surplus water flows over, around or through a dam 446 | S SQR S.SQR square a broad, open, public area near the center of a town or city 447 | S STBL S.STBL stable a building for the shelter and feeding of farm animals, 
especially horses 448 | S STDM S.STDM stadium a structure with an enclosure for athletic games with tiers of seats for spectators 449 | S STNB S.STNB scientific research base a scientific facility used as a base from which research is carried out or monitored 450 | S STNC S.STNC coast guard station a facility from which the coast is guarded by armed vessels 451 | S STNE S.STNE experiment station a facility for carrying out experiments 452 | S STNF S.STNF forest station a collection of buildings and facilities for carrying out forest management 453 | S STNI S.STNI inspection station a station at which vehicles, goods, and people are inspected 454 | S STNM S.STNM meteorological station a station at which weather elements are recorded 455 | S STNR S.STNR radio station a facility for producing and transmitting information by radio waves 456 | S STNS S.STNS satellite station a facility for tracking and communicating with orbiting satellites 457 | S STNW S.STNW whaling station a facility for butchering whales and processing train oil 458 | S STPS S.STPS steps stones or slabs placed for ease in ascending or descending a steep slope 459 | S SWT S.SWT sewage treatment plant facility for the processing of sewage and/or wastewater 460 | S THTR S.THTR theater A building, room, or outdoor structure for the presentation of plays, films, or other dramatic performances 461 | S TMB S.TMB tomb(s) a structure for interring bodies 462 | S TMPL S.TMPL temple(s) an edifice dedicated to religious worship 463 | S TNKD S.TNKD cattle dipping tank a small artificial pond used for immersing cattle in chemically treated water for disease control 464 | S TOWR S.TOWR tower a high conspicuous structure, typically much higher than its diameter 465 | S TRANT S.TRANT transit terminal facilities for the handling of vehicular freight and passengers 466 | S TRIG S.TRIG triangulation station a point on the earth whose position has been determined by triangulation 467 | S TRMO S.TRMO oil pipeline 
terminal a tank farm or loading facility at the end of an oil pipeline 468 | S TWO S.TWO temp work office Temporary Work Offices 469 | S UNIP S.UNIP university prep school University Preparation Schools & Institutions 470 | S UNIV S.UNIV university An institution for higher learning with teaching and research facilities constituting a graduate school and professional schools that award master's degrees and doctorates and an undergraduate division that awards bachelor's degrees. 471 | S USGE S.USGE united states government establishment a facility operated by the United States Government in Panama 472 | S VETF S.VETF veterinary facility a building or camp at which veterinary services are available 473 | S WALL S.WALL wall a thick masonry structure, usually enclosing a field or building, or forming the side of a structure 474 | S WALLA S.WALLA ancient wall the remains of a linear defensive stone structure 475 | S WEIR S.WEIR weir(s) a small dam in a stream, designed to raise the water level or to divert stream flow through a desired channel 476 | S WHRF S.WHRF wharf(-ves) a structure of open rather than solid construction along a shore or a bank which provides berthing for ships and cargo-handling facilities 477 | S WRCK S.WRCK wreck the site of the remains of a wrecked vessel 478 | S WTRW S.WTRW waterworks a facility for supplying potable water through a water source and a system of pumps and filtration beds 479 | S ZNF S.ZNF free trade zone an area, usually a section of a port, where goods may be received and shipped free of customs duty and of most customs regulations 480 | S ZOO S.ZOO zoo a zoological garden or park where wild animals are kept for exhibition 481 | T ASPH T.ASPH asphalt lake a small basin containing naturally occurring asphalt 482 | T ATOL T.ATOL atoll(s) a ring-shaped coral reef which has closely spaced islands on it encircling a lagoon 483 | T BAR T.BAR bar a shallow ridge or mound of coarse unconsolidated material in a stream channel, at the 
mouth of a stream, estuary, or lagoon and in the wave-break zone along coasts 484 | T BCH T.BCH beach a shore zone of coarse unconsolidated sediment that extends from the low-water line to the highest reach of storm waves 485 | T BCHS T.BCHS beaches a shore zone of coarse unconsolidated sediment that extends from the low-water line to the highest reach of storm waves 486 | T BDLD T.BDLD badlands an area characterized by a maze of very closely spaced, deep, narrow, steep-sided ravines, and sharp crests and pinnacles 487 | T BLDR T.BLDR boulder field a high altitude or high latitude bare, flat area covered with large angular rocks 488 | T BLHL T.BLHL blowhole(s) a hole in coastal rock through which sea water is forced by a rising tide or waves and spurted through an outlet into the air 489 | T BLOW T.BLOW blowout(s) a small depression in sandy terrain, caused by wind erosion 490 | T BNCH T.BNCH bench a long, narrow bedrock platform bounded by steeper slopes above and below, usually overlooking a waterbody 491 | T BUTE T.BUTE butte(s) a small, isolated, usually flat-topped hill with steep sides 492 | T CAPE T.CAPE cape a land area, more prominent than a point, projecting into the sea and marking a notable change in coastal direction 493 | T CFT T.CFT cleft(s) a deep narrow slot, notch, or groove in a coastal cliff 494 | T CLDA T.CLDA caldera a depression measuring kilometers across formed by the collapse of a volcanic mountain 495 | T CLF T.CLF cliff(s) a high, steep to perpendicular slope overlooking a waterbody or lower area 496 | T CNYN T.CNYN canyon a deep, narrow valley with steep sides cutting into a plateau or mountainous area 497 | T CONE T.CONE cone(s) a conical landform composed of mud or volcanic material 498 | T CRDR T.CRDR corridor a strip or area of land having significance as an access way 499 | T CRQ T.CRQ cirque a bowl-like hollow partially surrounded by cliffs or steep slopes at the head of a glaciated valley 500 | T CRQS T.CRQS cirques bowl-like 
hollows partially surrounded by cliffs or steep slopes at the head of a glaciated valley 501 | T CRTR T.CRTR crater(s) a generally circular saucer or bowl-shaped depression caused by volcanic or meteorite explosive action 502 | T CUET T.CUET cuesta(s) an asymmetric ridge formed on tilted strata 503 | T DLTA T.DLTA delta a flat plain formed by alluvial deposits at the mouth of a stream 504 | T DPR T.DPR depression(s) a low area surrounded by higher land and usually characterized by interior drainage 505 | T DSRT T.DSRT desert a large area with little or no vegetation due to extreme environmental conditions 506 | T DUNE T.DUNE dune(s) a wave form, ridge or star shape feature composed of sand 507 | T DVD T.DVD divide a line separating adjacent drainage basins 508 | T ERG T.ERG sandy desert an extensive tract of shifting sand and sand dunes 509 | T FAN T.FAN fan(s) a fan-shaped wedge of coarse alluvium with apex merging with a mountain stream bed and the fan spreading out at a low angle slope onto an adjacent plain 510 | T FORD T.FORD ford a shallow part of a stream which can be crossed on foot or by land vehicle 511 | T FSR T.FSR fissure a crack associated with volcanism 512 | T GAP T.GAP gap a low place in a ridge, not used for transportation 513 | T GRGE T.GRGE gorge(s) a short, narrow, steep-sided section of a stream valley 514 | T HDLD T.HDLD headland a high projection of land extending into a large body of water beyond the line of the coast 515 | T HLL T.HLL hill a rounded elevation of limited extent rising above the surrounding land with local relief of less than 300m 516 | T HLLS T.HLLS hills rounded elevations of limited extent rising above the surrounding land with local relief of less than 300m 517 | T HMCK T.HMCK hammock(s) a patch of ground, distinct from and slightly above the surrounding plain or wetland. 
Often occurs in groups 518 | T HMDA T.HMDA rock desert a relatively sand-free, high bedrock plateau in a hot desert, with or without a gravel veneer 519 | T INTF T.INTF interfluve a relatively undissected upland between adjacent stream valleys 520 | T ISL T.ISL island a tract of land, smaller than a continent, surrounded by water at high water 521 | T ISLET T.ISLET islet small island, bigger than rock, smaller than island. 522 | T ISLF T.ISLF artificial island an island created by landfill or diking and filling in a wetland, bay, or lagoon 523 | T ISLM T.ISLM mangrove island a mangrove swamp surrounded by a waterbody 524 | T ISLS T.ISLS islands tracts of land, smaller than a continent, surrounded by water at high water 525 | T ISLT T.ISLT land-tied island a coastal island connected to the mainland by barrier beaches, levees or dikes 526 | T ISLX T.ISLX section of island 527 | T ISTH T.ISTH isthmus a narrow strip of land connecting two larger land masses and bordered by water 528 | T KRST T.KRST karst area a distinctive landscape developed on soluble rock such as limestone characterized by sinkholes, caves, disappearing streams, and underground drainage 529 | T LAVA T.LAVA lava area an area of solidified lava 530 | T LEV T.LEV levee a natural low embankment bordering a distributary or meandering stream; often built up artificially to control floods 531 | T MESA T.MESA mesa(s) a flat-topped, isolated elevation with steep slopes on all sides, less extensive than a plateau 532 | T MND T.MND mound(s) a low, isolated, rounded hill 533 | T MRN T.MRN moraine a mound, ridge, or other accumulation of glacial till 534 | T MT T.MT mountain an elevation standing high above the surrounding area with small summit area, steep slopes and local relief of 300m or more 535 | T MTS T.MTS mountains a mountain range or a group of mountains or high ridges 536 | T NKM T.NKM meander neck a narrow strip of land between the two limbs of a meander loop at its narrowest point 537 | T NTK T.NTK 
nunatak a rock or mountain peak protruding through glacial ice 538 | T NTKS T.NTKS nunataks rocks or mountain peaks protruding through glacial ice 539 | T PAN T.PAN pan a near-level shallow, natural depression or basin, usually containing an intermittent lake, pond, or pool 540 | T PANS T.PANS pans a near-level shallow, natural depression or basin, usually containing an intermittent lake, pond, or pool 541 | T PASS T.PASS pass a break in a mountain range or other high obstruction, used for transportation from one side to the other [See also gap] 542 | T PEN T.PEN peninsula an elongate area of land projecting into a body of water and nearly surrounded by water 543 | T PENX T.PENX section of peninsula 544 | T PK T.PK peak a pointed elevation atop a mountain, ridge, or other hypsographic feature 545 | T PKS T.PKS peaks pointed elevations atop a mountain, ridge, or other hypsographic features 546 | T PLAT T.PLAT plateau an elevated plain with steep slopes on one or more sides, and often with incised streams 547 | T PLATX T.PLATX section of plateau 548 | T PLDR T.PLDR polder an area reclaimed from the sea by diking and draining 549 | T PLN T.PLN plain(s) an extensive area of comparatively level to gently undulating land, lacking surface irregularities, and usually adjacent to a higher area 550 | T PLNX T.PLNX section of plain 551 | T PROM T.PROM promontory(-ies) a bluff or prominent hill overlooking or projecting into a lowland 552 | T PT T.PT point a tapering piece of land projecting into a body of water, less prominent than a cape 553 | T PTS T.PTS points tapering pieces of land projecting into a body of water, less prominent than a cape 554 | T RDGB T.RDGB beach ridge a ridge of sand just inland and parallel to the beach, usually in series 555 | T RDGE T.RDGE ridge(s) a long narrow elevation with steep sides, and a more or less continuous crest 556 | T REG T.REG stony desert a desert plain characterized by a surface veneer of gravel and stones 557 | T RK T.RK rock a 
conspicuous, isolated rocky mass 558 | T RKFL T.RKFL rockfall an irregular mass of fallen rock at the base of a cliff or steep slope 559 | T RKS T.RKS rocks conspicuous, isolated rocky masses 560 | T SAND T.SAND sand area a tract of land covered with sand 561 | T SBED T.SBED dry stream bed a channel formerly containing the water of a stream 562 | T SCRP T.SCRP escarpment a long line of cliffs or steep slopes separating level surfaces above and below 563 | T SDL T.SDL saddle a broad, open pass crossing a ridge or between hills or mountains 564 | T SHOR T.SHOR shore a narrow zone bordering a waterbody which covers and uncovers at high and low water, respectively 565 | T SINK T.SINK sinkhole a small crater-shape depression in a karst area 566 | T SLID T.SLID slide a mound of earth material, at the base of a slope and the associated scoured area 567 | T SLP T.SLP slope(s) a surface with a relatively uniform slope angle 568 | T SPIT T.SPIT spit a narrow, straight or curved continuation of a beach into a waterbody 569 | T SPUR T.SPUR spur(s) a subordinate ridge projecting outward from a hill, mountain or other elevation 570 | T TAL T.TAL talus slope a steep concave slope formed by an accumulation of loose rock fragments at the base of a cliff or steep slope 571 | T TRGD T.TRGD interdune trough(s) a long wind-swept trough between parallel longitudinal dunes 572 | T TRR T.TRR terrace a long, narrow alluvial platform bounded by steeper slopes above and below, usually overlooking a waterbody 573 | T UPLD T.UPLD upland an extensive interior region of high land with low to moderate surface relief 574 | T VAL T.VAL valley an elongated depression usually traversed by a stream 575 | T VALG T.VALG hanging valley a valley the floor of which is notably higher than the valley or shore to which it leads; most common in areas that have been glaciated 576 | T VALS T.VALS valleys elongated depressions usually traversed by a stream 577 | T VALX T.VALX section of valley 578 | T VLC T.VLC 
volcano a conical elevation composed of volcanic materials with a crater at the top 579 | U APNU U.APNU apron a gentle slope, with a generally smooth surface, particularly found around groups of islands and seamounts 580 | U ARCU U.ARCU arch a low bulge around the southeastern end of the island of Hawaii 581 | U ARRU U.ARRU arrugado an area of subdued corrugations off Baja California 582 | U BDLU U.BDLU borderland a region adjacent to a continent, normally occupied by or bordering a shelf, that is highly irregular with depths well in excess of those typical of a shelf 583 | U BKSU U.BKSU banks elevations, typically located on a shelf, over which the depth of water is relatively shallow but sufficient for safe surface navigation 584 | U BNKU U.BNKU bank an elevation, typically located on a shelf, over which the depth of water is relatively shallow but sufficient for safe surface navigation 585 | U BSNU U.BSNU basin a depression more or less equidimensional in plan and of variable extent 586 | U CDAU U.CDAU cordillera an entire mountain system including the subordinate ranges, interior plateaus, and basins 587 | U CNSU U.CNSU canyons relatively narrow, deep depressions with steep sides, the bottom of which generally has a continuous slope 588 | U CNYU U.CNYU canyon a relatively narrow, deep depression with steep sides, the bottom of which generally has a continuous slope 589 | U CRSU U.CRSU continental rise a gentle slope rising from oceanic depths towards the foot of a continental slope 590 | U DEPU U.DEPU deep a localized deep area within the confines of a larger feature, such as a trough, basin or trench 591 | U EDGU U.EDGU shelf edge a line along which there is a marked increase of slope at the outer margin of a continental shelf or island shelf 592 | U ESCU U.ESCU escarpment (or scarp) an elongated and comparatively steep slope separating flat or gently sloping areas 593 | U FANU U.FANU fan a relatively smooth feature normally sloping away from the lower 
termination of a canyon or canyon system 594 | U FLTU U.FLTU flat a small level or nearly level area 595 | U FRZU U.FRZU fracture zone an extensive linear zone of irregular topography of the sea floor, characterized by steep-sided or asymmetrical ridges, troughs, or escarpments 596 | U FURU U.FURU furrow a closed, linear, narrow, shallow depression 597 | U GAPU U.GAPU gap a narrow break in a ridge or rise 598 | U GLYU U.GLYU gully a small valley-like feature 599 | U HLLU U.HLLU hill an elevation rising generally less than 500 meters 600 | U HLSU U.HLSU hills elevations rising generally less than 500 meters 601 | U HOLU U.HOLU hole a small depression of the sea floor 602 | U KNLU U.KNLU knoll an elevation rising generally more than 500 meters and less than 1,000 meters and of limited extent across the summit 603 | U KNSU U.KNSU knolls elevations rising generally more than 500 meters and less than 1,000 meters and of limited extent across the summits 604 | U LDGU U.LDGU ledge a rocky projection or outcrop, commonly linear and near shore 605 | U LEVU U.LEVU levee an embankment bordering a canyon, valley, or seachannel 606 | U MESU U.MESU mesa an isolated, extensive, flat-topped elevation on the shelf, with relatively steep sides 607 | U MNDU U.MNDU mound a low, isolated, rounded hill 608 | U MOTU U.MOTU moat an annular depression that may not be continuous, located at the base of many seamounts, islands, and other isolated elevations 609 | U MTU U.MTU mountain a well-delineated subdivision of a large and complex positive feature 610 | U PKSU U.PKSU peaks prominent elevations, part of a larger feature, either pointed or of very limited extent across the summit 611 | U PKU U.PKU peak a prominent elevation, part of a larger feature, either pointed or of very limited extent across the summit 612 | U PLNU U.PLNU plain a flat, gently sloping or nearly level region 613 | U PLTU U.PLTU plateau a comparatively flat-topped feature of considerable extent, dropping off abruptly 
on one or more sides 614 | U PNLU U.PNLU pinnacle a high tower or spire-shaped pillar of rock or coral, alone or cresting a summit 615 | U PRVU U.PRVU province a region identifiable by a group of similar physiographic features whose characteristics are markedly in contrast with surrounding areas 616 | U RDGU U.RDGU ridge a long narrow elevation with steep sides 617 | U RDSU U.RDSU ridges long narrow elevations with steep sides 618 | U RFSU U.RFSU reefs surface-navigation hazards composed of consolidated material 619 | U RFU U.RFU reef a surface-navigation hazard composed of consolidated material 620 | U RISU U.RISU rise a broad elevation that rises gently, and generally smoothly, from the sea floor 621 | U SCNU U.SCNU seachannel a continuously sloping, elongated depression commonly found in fans or plains and customarily bordered by levees on one or two sides 622 | U SCSU U.SCSU seachannels continuously sloping, elongated depressions commonly found in fans or plains and customarily bordered by levees on one or two sides 623 | U SDLU U.SDLU saddle a low part, resembling in shape a saddle, in a ridge or between contiguous seamounts 624 | U SHFU U.SHFU shelf a zone adjacent to a continent (or around an island) that extends from the low water line to a depth at which there is usually a marked increase of slope towards oceanic depths 625 | U SHLU U.SHLU shoal a surface-navigation hazard composed of unconsolidated material 626 | U SHSU U.SHSU shoals hazards to surface navigation composed of unconsolidated material 627 | U SHVU U.SHVU shelf valley a valley on the shelf, generally the shoreward extension of a canyon 628 | U SILU U.SILU sill the low part of a gap or saddle separating basins 629 | U SLPU U.SLPU slope the slope seaward from the shelf edge to the beginning of a continental rise or the point where there is a general reduction in slope 630 | U SMSU U.SMSU seamounts elevations rising generally more than 1,000 meters and of limited extent across the summit 631 | U 
SMU U.SMU seamount an elevation rising generally more than 1,000 meters and of limited extent across the summit 632 | U SPRU U.SPRU spur a subordinate elevation, ridge, or rise projecting outward from a larger feature 633 | U TERU U.TERU terrace a relatively flat horizontal or gently inclined surface, sometimes long and narrow, which is bounded by a steeper ascending slope on one side and by a steep descending slope on the opposite side 634 | U TMSU U.TMSU tablemounts (or guyots) seamounts having a comparatively smooth, flat top 635 | U TMTU U.TMTU tablemount (or guyot) a seamount having a comparatively smooth, flat top 636 | U TNGU U.TNGU tongue an elongate (tongue-like) extension of a flat sea floor into an adjacent higher feature 637 | U TRGU U.TRGU trough a long depression of the sea floor characteristically flat bottomed and steep sided, and normally shallower than a trench 638 | U TRNU U.TRNU trench a long, narrow, characteristically very deep and asymmetrical depression of the sea floor, with relatively steep sides 639 | U VALU U.VALU valley a relatively shallow, wide depression, the bottom of which usually has a continuous gradient 640 | U VLSU U.VLSU valleys a relatively shallow, wide depression, the bottom of which usually has a continuous gradient 641 | V BUSH V.BUSH bush(es) a small clump of conspicuous bushes in an otherwise bare area 642 | V CULT V.CULT cultivated area an area under cultivation 643 | V FRST V.FRST forest(s) an area dominated by tree vegetation 644 | V FRSTF V.FRSTF fossilized forest a forest fossilized by geologic processes and now exposed at the earth's surface 645 | V GRSLD V.GRSLD grassland an area dominated by grass vegetation 646 | V GRVC V.GRVC coconut grove a planting of coconut trees 647 | V GRVO V.GRVO olive grove a planting of olive trees 648 | V GRVP V.GRVP palm grove a planting of palm trees 649 | V GRVPN V.GRVPN pine grove a planting of pine trees 650 | V HTH V.HTH heath an upland moor or sandy area dominated by low 
shrubby vegetation including heather 651 | V MDW V.MDW meadow a small, poorly drained area dominated by grassy vegetation 652 | V OCH V.OCH orchard(s) a planting of fruit or nut trees 653 | V SCRB V.SCRB scrubland an area of low trees, bushes, and shrubs stunted by some environmental limitation 654 | V TREE V.TREE tree(s) a conspicuous tree used as a landmark 655 | V TUND V.TUND tundra a marshy, treeless, high latitude plain, dominated by mosses, lichens, and low shrub vegetation under permafrost conditions 656 | V VIN V.VIN vineyard a planting of grapevines 657 | V VINS V.VINS vineyards plantings of grapevines -------------------------------------------------------------------------------- /mordecai/data/nat_df.csv: -------------------------------------------------------------------------------- 1 | nationality,alpha_3_code 2 | Afghan,AFG 3 | Åland Island,ALA 4 | Albanian,ALB 5 | Algerian,DZA 6 | American Samoan,ASM 7 | Andorran,AND 8 | Angolan,AGO 9 | Anguillan,AIA 10 | Antarctic,ATA 11 | Antiguan,ATG 12 | Barbudan,ATG 13 | Argentine,ARG 14 | Armenian,ARM 15 | Aruban,ABW 16 | Australian,AUS 17 | Austrian,AUT 18 | Azerbaijani,AZE 19 | Azeri,AZE 20 | Bahamian,BHS 21 | Bahraini,BHR 22 | Bangladeshi,BGD 23 | Barbadian,BRB 24 | Belarusian,BLR 25 | Belgian,BEL 26 | Belizean,BLZ 27 | Beninese,BEN 28 | Beninois,BEN 29 | Bermudian,BMU 30 | Bermudan,BMU 31 | Bhutanese,BTN 32 | Bolivian,BOL 33 | Bonaire,BES 34 | Bosnian,BIH 35 | Motswana,BWA 36 | Botswanan,BWA 37 | Bouvet Island,BVT 38 | Brazilian,BRA 39 | Bruneian,BRN 40 | Bulgarian,BGR 41 | Burkinabé,BFA 42 | Burundian,BDI 43 | Cabo Verdean,CPV 44 | Cambodian,KHM 45 | Cameroonian,CMR 46 | Canadian,CAN 47 | Caymanian,CYM 48 | Chadian,TCD 49 | Chilean,CHL 50 | Chinese,CHN 51 | Christmas Islander,CXR 52 | Cocos Island,CCK 53 | Cocos Islander,CCK 54 | Colombian,COL 55 | Comoran,COM 56 | Comorian,COM 57 | Congolese,COG 58 | Congolese,COD 59 | Cook Islander,COK 60 | Costa Rican,CRI 61 | Ivorian,CIV 62 | Croatian,HRV 63 |
Cuban,CUB 64 | Curaçaoan,CUW 65 | Cypriot,CYP 66 | Czech,CZE 67 | Danish,DNK 68 | Djiboutian,DJI 69 | Dominican,DMA 70 | Dominican,DOM 71 | Ecuadorian,ECU 72 | Egyptian,EGY 73 | Salvadoran,SLV 74 | Equatorial Guinean,GNQ 75 | Equatoguinean,GNQ 76 | Eritrean,ERI 77 | Estonian,EST 78 | Ethiopian,ETH 79 | Falkland Island,FLK 80 | Faroese,FRO 81 | Fijian,FJI 82 | Finnish,FIN 83 | French,FRA 84 | French Guianese,GUF 85 | French Polynesian,PYF 86 | Gabonese,GAB 87 | Gambian,GMB 88 | Georgian,GEO 89 | German,DEU 90 | Ghanaian,GHA 91 | Gibraltar,GIB 92 | Greek,GRC 93 | Hellenic,GRC 94 | Greenlandic,GRL 95 | Grenadian,GRD 96 | Guadeloupe,GLP 97 | Guamanian,GUM 98 | Guambat,GUM 99 | Guatemalan,GTM 100 | Channel Islander,GGY 101 | Guinean,GIN 102 | Bissau-Guinean,GNB 103 | Guyanese,GUY 104 | Haitian,HTI 105 | Honduran,HND 106 | Hong Kongese,HKG 107 | Hungarian,HUN 108 | Magyar,HUN 109 | Icelandic,ISL 110 | Indian,IND 111 | Indonesian,IDN 112 | Iranian,IRN 113 | Persian,IRN 114 | Iraqi,IRQ 115 | Irish,IRL 116 | Manx,IMN 117 | Israeli,ISR 118 | Italian,ITA 119 | Jamaican,JAM 120 | Japanese,JPN 121 | Channel Island,JEY 122 | Jordanian,JOR 123 | Kazakhstani,KAZ 124 | Kazakh,KAZ 125 | Kenyan,KEN 126 | I-Kiribati,KIR 127 | North Korean,PRK 128 | South Korean,KOR 129 | Kuwaiti,KWT 130 | Kyrgyzstani,KGZ 131 | Kyrgyz,KGZ 132 | Kirgiz,KGZ 133 | Kirghiz,KGZ 134 | Lao,LAO 135 | Laotian,LAO 136 | Latvian,LVA 137 | Lebanese,LBN 138 | Basotho,LSO 139 | Liberian,LBR 140 | Libyan,LBY 141 | Liechtenstein,LIE 142 | Lithuanian,LTU 143 | Luxembourgish,LUX 144 | Macanese,MAC 145 | Macedonian,MKD 146 | Malagasy,MDG 147 | Malawian,MWI 148 | Malaysian,MYS 149 | Maldivian,MDV 150 | Malian,MLI 151 | Malinese,MLI 152 | Maltese,MLT 153 | Marshallese,MHL 154 | Martiniquais,MTQ 155 | Martinican,MTQ 156 | Mauritanian,MRT 157 | Mauritian,MUS 158 | Mahoran,MYT 159 | Mexican,MEX 160 | Micronesian,FSM 161 | Moldovan,MDA 162 | Monégasque,MCO 163 | Monacan,MCO 164 | Mongolian,MNG 165 | Montenegrin,MNE 166 | 
Montserratian,MSR 167 | Moroccan,MAR 168 | Mozambican,MOZ 169 | Burmese,MMR 170 | Namibian,NAM 171 | Nauruan,NRU 172 | Nepali,NPL 173 | Nepalese,NPL 174 | Dutch, NLD 175 | Netherlandic,NLD 176 | New Caledonian,NCL 177 | New Zealander,NZL 178 | Nicaraguan,NIC 179 | Nigerien,NER 180 | Nigerian,NGA 181 | Niuean,NIU 182 | Norfolk Island,NFK 183 | Northern Marianan,MNP 184 | Norwegian,NOR 185 | Omani,OMN 186 | Pakistani,PAK 187 | Palauan,PLW 188 | Palestinian,PSE 189 | Panamanian,PAN 190 | Papua New Guinean, PNG 191 | Papuan,PNG 192 | Paraguayan,PRY 193 | Peruvian,PER 194 | Philippine, PHL 195 | Filipino,PHL 196 | Pitcairn Island,PCN 197 | Polish,POL 198 | Portuguese,PRT 199 | Puerto Rican,PRI 200 | Qatari,QAT 201 | Réunionese, REU 202 | Réunionnais,REU 203 | Romanian,ROU 204 | Russian,RUS 205 | Rwandan,RWA 206 | Barthélemois,BLM 207 | Saint Helenian,SHN 208 | Kittitian or Nevisian,KNA 209 | Saint Lucian,LCA 210 | Saint-Martinoise,MAF 211 | Saint-Pierrais,SPM 212 | Miquelonnais,SPM 213 | Saint Vincentian, VCT 214 | Vincentian,VCT 215 | Samoan,WSM 216 | Sammarinese,SMR 217 | São Toméan,STP 218 | Saudi, SAU 219 | Saudi Arabian,SAU 220 | Senegalese,SEN 221 | Serbian,SRB 222 | Serb,SRB 223 | Seychellois,SYC 224 | Sierra Leonean,SLE 225 | Singaporean,SGP 226 | Sint Maarten,SXM 227 | Slovak,SVK 228 | Slovenian, SVN 229 | Slovene,SVN 230 | Solomon Island,SLB 231 | Somali,SOM 232 | Somalian,SOM 233 | South African,ZAF 234 | South Sudanese,SSD 235 | Spanish,ESP 236 | Sri Lankan,LKA 237 | Sudanese,SDN 238 | Surinamese,SUR 239 | Svalbard,SJM 240 | Swazi,SWZ 241 | Swedish,SWE 242 | Swiss,CHE 243 | Syrian,SYR 244 | Taiwanese,TWN 245 | Tajikistani,TJK 246 | Tanzanian,TZA 247 | Thai,THA 248 | Timorese,TLS 249 | Togolese,TGO 250 | Tokelauan,TKL 251 | Tongan,TON 252 | Trinidadian,TTO 253 | Tobagonian,TTO 254 | Tunisian,TUN 255 | Turkish,TUR 256 | Turkmen,TKM 257 | Turkmeni,TKM 258 | Tuvaluan,TUV 259 | Ugandan,UGA 260 | Ukrainian,UKR 261 | Emirati,ARE 262 | Emirian,ARE 263 | Emiri,ARE 
264 | British,GBR 265 | UK, GBR 266 | American,USA 267 | Uruguayan,URY 268 | Uzbekistani,UZB 269 | Uzbek,UZB 270 | Uzbeki,UZB 271 | Ni-Vanuatu,VUT 272 | Vanuatuan,VUT 273 | Venezuelan,VEN 274 | Vietnamese,VNM 275 | Wallisian,WLF 276 | Futunan,WLF 277 | Sahrawi,ESH 278 | Sahrawian,ESH 279 | Sahraouian,ESH 280 | Yemeni,YEM 281 | Zambian,ZMB 282 | Zimbabwean,ZWE 283 | -------------------------------------------------------------------------------- /mordecai/data/stopword_country_names.json: -------------------------------------------------------------------------------- 1 | {"Afghanistan":"AFG", "Åland Islands":"ALA", "Albania":"ALB", "Algeria":"DZA", 2 | "American Samoa":"ASM", "Andorra":"AND", "Angola":"AGO", "Anguilla":"AIA", 3 | "Antarctica":"ATA", "Antigua Barbuda":"ATG", "Argentina":"ARG", 4 | "Armenia":"ARM", "Aruba":"ABW", "Ascension_Island":"NA", "Australia":"AUS", 5 | "Austria":"AUT", "Azerbaijan":"AZE", "Bahamas":"BHS", "Bahrain":"BHR", 6 | "Bangladesh":"BGD", "Barbados":"BRB", "Belarus":"BLR", "Belgium":"BEL", 7 | "Belize":"BLZ", "Benin":"BEN", "Bermuda":"BMU", "Bhutan":"BTN", 8 | "Bolivia":"BOL", "Bosnia_Herzegovina":"BIH", 9 | "Botswana":"BWA", "Bouvet Island":"BVT", "Brazil":"BRA", 10 | "Britain":"GBR", "Great_Britain":"GBR", 11 | "British Virgin Islands":"VGB", "Brunei":"BRN", "Bulgaria":"BGR", "Burkina_Faso":"BFA", 12 | "Burundi":"BDI", "Cambodia":"KHM", "Cameroon":"CMR", 13 | "Canada":"CAN","Cape Verde":"CPV", "Cayman_Islands":"CYM", 14 | "Central African Republic":"CAF", "Chad":"TCD", "Chile":"CHL", "China":"CHN", 15 | "Cocos_Islands":"CCK", "Colombia":"COL", 16 | "Comoros":"COM", "Congo Brazzaville":"COG", "Congo Kinshasa":"COD", 17 | "Congo":"COG", "Cook_Islands":"COK", 18 | "Costa_Rica":"CRI", "Cote Ivoire":"CIV", "Ivory_Coast":"CIV","Croatia":"HRV", "Cuba":"CUB", 19 | "Curaçao":"CUW", "Cyprus":"CYP", "Czech_Republic":"CZE", "Denmark":"DNK", 20 | "Djibouti":"DJI", "Dominica":"DMA", "Dominican_Republic":"DOM", 21 | "Ecuador":"ECU", "Egypt":"EGY", 
"El_Salvador":"SLV", 22 | "Equatorial_Guinea":"GNQ", "Eritrea":"ERI", "Estonia":"EST", "Ethiopia":"ETH", 23 | "Falkland_Islands":"FLK", "Faroe_Islands":"FRO", 24 | "Fiji":"FJI", "Finland":"FIN", "France":"FRA", "French_Guiana":"GUF", 25 | "French_Polynesia":"PYF","Gabon":"GAB", 26 | "Gambia":"GMB", "Gaza":"PSE", "Georgia":"GEO", "Germany":"DEU", "Ghana":"GHA", 27 | "Gibraltar":"GIB", "Greece":"GRC", "Greenland":"GRL", "Grenada":"GRD", 28 | "Guadeloupe":"GLP", "Guam":"GUM", "Guatemala":"GTM", "Guernsey":"GGY", 29 | "Guinea":"GIN", "Guinea_Bissau":"GNB", "Guyana":"GUY", "Haiti":"HTI","Honduras":"HND", 30 | "Hong_Kong":"HKG", "Hungary":"HUN", "Iceland":"ISL", 31 | "India":"IND", "Indonesia":"IDN", "Iran":"IRN", "Iraq":"IRQ", "Ireland":"IRL", 32 | "Israel":"ISR", "Italy":"ITA", "Jamaica":"JAM", "Japan":"JPN", 33 | "Jordan":"JOR", "Kazakhstan":"KAZ", "Kenya":"KEN", 34 | "Kiribati":"KIR", "Kuwait":"KWT", "Kyrgyzstan":"KGZ", "Laos":"LAO", 35 | "Latvia":"LVA", "Lebanon":"LBN", "Lesotho":"LSO", "Liberia":"LBR", 36 | "Libya":"LBY", "Liechtenstein":"LIE", "Lithuania":"LTU", "Luxembourg":"LUX", 37 | "Macau":"MAC", "Macedonia":"MKD", "Madagascar":"MDG", "Malawi":"MWI", 38 | "Malaysia":"MYS", "Maldives":"MDV", "Mali":"MLI", "Malta":"MLT", "Marshall_Islands":"MHL", 39 | "Martinique":"MTQ", "Mauritania":"MRT", "Mauritius":"MUS", 40 | "Mayotte":"MYT", "Mexico":"MEX", "Micronesia":"FSM", "Moldova":"MDA", 41 | "Monaco":"MCO", "Mongolia":"MNG", "Montenegro":"MNE", "Montserrat":"MSR", 42 | "Morocco":"MAR", "Mozambique":"MOZ", "Myanmar":"MMR", "Burma":"MMR", "Namibia":"NAM", 43 | "Nauru":"NRU", "Nepal":"NPL", "Netherlands":"NLD", "Netherlands Antilles":"ANT", 44 | "New Caledonia":"NCL", "New_Zealand":"NZL", "Nicaragua":"NIC", 45 | "Niger":"NER", "Nigeria":"NGA", "Niue":"NIU", "North_Korea":"PRK", 46 | "Northern Ireland":"IRL", "Northern Mariana Islands":"MNP", 47 | "Norway":"NOR", "Oman":"OMN", "Pakistan":"PAK", 48 | "Palau":"PLW", "Palestinian_Territories":"PSE", 
"Palestine":"PSE","Panama":"PAN", "Papua New Guinea":"PNG", 49 | "Paraguay":"PRY", "Peru":"PER", "Philippines":"PHL", "Pitcairn_Islands":"PCN", 50 | "Poland":"POL", "Portugal":"PRT", "Puerto_Rico":"PRI", 51 | "Qatar":"QAT", "Réunion":"REU", "Romania":"ROU", "Russia":"RUS", 52 | "Rwanda":"RWA", "Saint Barthélemy":"BLM", "Saint Helena":"SHN", 53 | "Saint Kitts Nevis":"KNA", "Saint Lucia":"LCA", 54 | "Saint Pierre Miquelon":"SPM", "Saint Vincent Grenadines":"VCT", 55 | "Samoa":"WSM", "San_Marino":"SMR", "São Tomé Príncipe":"STP", "Saudi_Arabia":"SAU", 56 | "Senegal":"SEN", "Serbia":"SRB", 57 | "Seychelles":"SYC", "Sierra_Leone":"SLE", "Singapore":"SGP", "Sint Maarten":"SXM", 58 | "Slovakia":"SVK", "Slovenia":"SVN", "Solomon_Islands":"SLB", 59 | "Somalia":"SOM", "South_Africa":"ZAF", 60 | "South_Korea":"KOR", "South Sudan":"SSD", "Spain":"ESP", "Sri_Lanka":"LKA", "Sudan":"SDN", 61 | "Suriname":"SUR", "Svalbard Jan Mayen":"SJM", 62 | "Swaziland":"SWZ", "Sweden":"SWE", "Switzerland":"CHE", "Syria":"SYR", 63 | "Taiwan":"TWN", "Tajikistan":"TJK", "Tanzania":"TZA", "Thailand":"THA", 64 | "Timor Leste":"TLS", "East_Timor":"TLS","Togo":"TGO", "Tokelau":"TKL", "Tonga":"TON", "Trinidad Tobago":"TTO", 65 | "Tunisia":"TUN", "Turkey":"TUR", 66 | "Turkmenistan":"TKM", "Turks Caicos Islands":"TCA", "Tuvalu":"TUV", "U.S. 
Minor Outlying Islands":"UMI", 67 | "Virgin_Islands":"VIR", "Uganda":"UGA", 68 | "Ukraine":"UKR", "United_Arab_Emirates":"ARE", "United_Kingdom":"GBR", 69 | "UK":"GBR", "United_States":"USA", "USA":"USA", "America":"USA", 70 | "Uruguay":"URY", "Uzbekistan":"UZB", "Vanuatu":"VUT", "Vatican":"VAT", "Venezuela":"VEN", 71 | "Vietnam":"VNM", "Wallis Futuna":"WLF", 72 | "Western_Sahara":"ESH", "Yemen":"YEM", "Zambia":"ZMB", "Zimbabwe":"ZWE"} 73 | -------------------------------------------------------------------------------- /mordecai/geoparse.py: -------------------------------------------------------------------------------- 1 | from tensorflow import keras 2 | import pandas as pd 3 | import numpy as np 4 | from collections import Counter 5 | import editdistance 6 | import pkg_resources 7 | import spacy 8 | from . import utilities 9 | from multiprocessing.pool import ThreadPool 10 | from elasticsearch.exceptions import ConnectionTimeout, ConnectionError 11 | import multiprocessing 12 | from tqdm import tqdm 13 | import warnings 14 | import re 15 | 16 | import traceback 17 | 18 | try: 19 | from functools import lru_cache 20 | except ImportError: 21 | from backports.functools_lru_cache import lru_cache 22 | print("Mordecai requires Python 3 and seems to be running in Python 2.") 23 | 24 | 25 | class Geoparser: 26 | def __init__(self, nlp=None, es_hosts=None, es_port=None, es_ssl=False, es_auth=None, 27 | verbose=False, country_threshold=0.6, threads=True, 28 | progress=True, training=None, models_path=None, **kwargs): 29 | DATA_PATH = pkg_resources.resource_filename('mordecai', 'data/') 30 | if not models_path: 31 | models_path = pkg_resources.resource_filename('mordecai', 'models/') 32 | print("Models path:", models_path) 33 | if nlp: 34 | self.nlp = nlp 35 | else: 36 | try: 37 | self.nlp = spacy.load('en_core_web_lg', disable=['parser', 'tagger']) 38 | except OSError: 39 | print("""ERROR: No spaCy NLP model installed. 
Install with this command: 40 | `python -m spacy download en_core_web_lg`.""") 41 | self._cts = utilities.country_list_maker() 42 | self._just_cts = utilities.country_list_maker() 43 | self._inv_cts = utilities.make_inv_cts(self._cts) 44 | country_state_city = utilities.other_vectors() 45 | self._cts.update(country_state_city) 46 | self._ct_nlp = utilities.country_list_nlp(self._cts) 47 | self._prebuilt_vec = [w.vector for w in self._ct_nlp] 48 | self._both_codes = utilities.make_country_nationality_list(self._cts, DATA_PATH + "nat_df.csv") 49 | self._admin1_dict = utilities.read_in_admin1(DATA_PATH + "admin1CodesASCII.json") 50 | self.conn = utilities.setup_es(es_hosts, es_port, es_ssl, es_auth) 51 | if not training: 52 | # when retraining models, don't load old models 53 | self.country_model = keras.models.load_model(models_path + "country_model.h5") 54 | self.rank_model = keras.models.load_model(models_path + "rank_model.h5") 55 | elif training == "ranker": 56 | self.country_model = keras.models.load_model(models_path + "country_model.h5") 57 | self._skip_list = utilities.make_skip_list(self._cts) 58 | self.training_setting = False # make this true if you want training formatted 59 | # if the best country guess is below the country threshold, don't return anything at all 60 | self.country_threshold = country_threshold 61 | feature_codes = pd.read_csv(DATA_PATH + "feature_codes.txt", sep="\t", header=None) 62 | self._code_to_text = dict(zip(feature_codes[1], feature_codes[3])) # human readable geonames IDs 63 | self.verbose = verbose # return the full dictionary or just the good parts? 64 | self.progress = progress # display progress bars? 65 | self.threads = threads 66 | if 'n_threads' in kwargs.keys(): 67 | warnings.warn("n_threads is deprecated. 
def _feature_country_mentions(self, doc):
    """
    Count how often each country is mentioned by name or nationality.

    Every entity in `doc.ents` whose exact text is a key of
    `self._both_codes` contributes one mention of the mapped country code.
    These counts are features used in the country picking phase.

    Parameters
    ---------
    doc: a spaCy nlp'ed piece of text

    Returns
    -------
    countries: tuple (length 4)
        (top, top_count, two, two_count): the most and second most
        frequently mentioned country codes with their mention counts.
        Missing slots are filled with "" and 0.
    """
    mentioned = [self._both_codes[ent.text]
                 for ent in doc.ents
                 if ent.text in self._both_codes]
    # most_common() orders ties by first appearance in the document
    ranked = Counter(mentioned).most_common()[:2]
    # pad explicitly rather than relying on a bare `except:` around an
    # out-of-range index, which would also hide unrelated errors
    ranked += [("", 0)] * (2 - len(ranked))
    (top, top_count), (two, two_count) = ranked
    countries = (top, top_count, two, two_count)
    return countries
def _feature_most_common(self, results):
    """
    Pick the country that occurs most often among the ES/Geonames hits.

    Parameters
    ----------
    results: dict
        output of `query_geonames`

    Returns
    -------
    most_common: str
        ISO code of the most frequent country, or an empty string when
        there are no hits or `results` is not a subscriptable result dict
    """
    try:
        hits = results['hits']['hits']
        tally = Counter(hit['country_code3'] for hit in hits)
        ranked = tally.most_common()
        # an empty ranking raises IndexError, handled below
        return ranked[0][0]
    except (IndexError, TypeError):
        return ""
def _feature_word_embedding(self, text):
    """
    Guess the most likely country for a piece of text from its word vector.

    The vector of `text` is scored (dot product) against every vector in
    `self._prebuilt_vec`. The two best-scoring entries are looked up in
    `self._ct_nlp` and mapped to country codes through `self._cts`.

    Parameters
    ---------
    text: object exposing a `.vector` attribute
        the text to guess a country for (the caller in this file passes a
        cleaned spaCy entity span).

    Returns
    -------
    country_picking: dict
        Keys "country_1" and "country_2" (the top two country codes),
        "confid_a" (the best similarity score) and "confid_b" (the margin
        between the best and second-best scores). When no guess can be
        made, the codes are "" and both confidences are 0.
    """
    def no_guess():
        # fresh dict per call so callers can mutate the result safely
        return {"country_1" : "",
                "confid_a" : 0,
                "confid_b" : 0,
                "country_2" : ""}

    try:
        simils = np.dot(self._prebuilt_vec, text.vector)
    except Exception:
        # deliberate best-effort: any vector problem means "no guess"
        return no_guess()
    ranks = simils.argsort()[::-1]
    confid = simils.max()
    confid2 = simils[ranks[0]] - simils[ranks[1]]
    if confid == 0 or confid2 == 0:
        # Previously this returned "", which contradicted the documented
        # dict return type and made callers fail with a TypeError when
        # they indexed the result.
        return no_guess()
    country_code = self._cts[str(self._ct_nlp[ranks[0]])]
    country_picking = {"country_1" : country_code,
                       "confid_a" : confid,
                       "confid_b" : confid2,
                       "country_2" : self._cts[str(self._ct_nlp[ranks[1]])]}
    return country_picking
#@lru_cache(maxsize=250) # cache won't work with dictionary inputs
def query_geonames_country(self, placename, country, filter_params=None):
    """
    Search the geonames index for a place name within a single country.

    Works like query_geonames, but every search is restricted to `country`
    and, optionally, to one further term filter. The filter_params argument
    can be used to limit results to a particular adm1 (e.g. {"adm1" : "09"})
    or feature type {"feature_code" : "adm1"}.

    Parameters
    ---------
    placename: str, the place name to search for
    country: str, country to limit search to in ISO 3 char code
    filter_params: dict, a further filter to apply, e.g. {"feature_code":"ADM1"}

    Returns
    ------
    out: dict, the structured geonames results
    """
    def run_search(query):
        # optional extra filter first, then the country restriction,
        # then the query itself, capped at 50 hits
        search = self.conn
        if filter_params:
            search = search.filter("term", **filter_params)
        search = search.filter("term", country_code3=country)
        return search.query(query)[0:50].execute()

    # first attempt: exact phrase match, favouring the primary names
    phrase_query = {"multi_match": {"query": placename,
                                    "fields": ['name^5', 'asciiname^5', 'alternativenames'],
                                    "type": "phrase"}}
    res = run_search(phrase_query)

    # fallback: allow some fuzziness but still require all terms to be
    # present (fuzzy is not allowed in "phrase" searches)
    if res.hits.total == 0:
        fuzzy_query = {"multi_match": {"query": placename,
                                       "fields": ['name', 'asciiname', 'alternativenames'],
                                       "fuzziness": 2,
                                       "operator": "and"}}
        res = run_search(fuzzy_query)
    out = utilities.structure_results(res)
    return out


# The following three lookup functions are used for the threaded queries.
def _feature_location_type_mention(self, ent):
    """
    Infer a Geonames feature class/code from explicit type words.

    Looks at the last word of the entity and the word that follows it in
    the document for terms that indicate a geographic feature type
    (e.g. "village" = "P"). If both words are type terms, the later word
    wins. Entities whose text is a known country name are always reported
    as ("A", "PCLI").

    Parameters
    -----------
    ent : spacy entity span
        It has to be an entity to handle indexing in the document

    Returns
    --------
    tuple (length 2)
        (feature_class, feature_code) derived from explicit word usage;
        either element is "" when nothing could be inferred.
    """
    # The country check does not depend on the surrounding words, so it is
    # done once up front instead of once per inspected word.
    if ent.text in self._just_cts:
        return ("A", "PCLI")

    populated = ["city", "cities", "town", "towns", "villages", "village",
                 "settlement", "capital", "neighborhood", "neighborhoods",
                 "municipality"]
    adm1 = ["province", "governorate", "state", "department", "oblast",
            "changwat", "countryside"]
    adm2 = ["district", "rayon", "amphoe", "county"]
    terrain = ["mountain", "mountains", "stream", "river"]

    # type word -> (feature_class, feature_code); the word groups are disjoint
    # NOTE: the earlier version declared a "forest" list but never consulted
    # it; forest terms stay unmapped so the produced features do not change.
    type_terms = {}
    type_terms.update((term, ("P", "")) for term in populated)
    type_terms.update((term, ("A", "ADM1")) for term in adm1)
    type_terms.update((term, ("A", "ADM2")) for term in adm2)
    type_terms.update((term, ("T", "")) for term in terrain)
    type_terms["airport"] = ("S", "AIRP")
    type_terms["region"] = ("A", "")

    # TODO: incorporate positions, especially now that we don't split by
    # sentence
    feature_class = feature_code = ""
    # last word of the entity and the word immediately following it
    for word in ent.doc[ent.end - 1 : ent.end + 1]:
        found = type_terms.get(word.text.lower())
        if found is not None:
            feature_class, feature_code = found
    return (feature_class, feature_code)
493 | """ 494 | if not hasattr(doc, "ents"): 495 | doc = self.nlp(doc) 496 | # initialize the place to store finalized tasks 497 | task_list = [] 498 | 499 | # get document vector 500 | #doc_vec = self._feature_word_embedding(text)['country_1'] 501 | 502 | # get explicit counts of country names 503 | ct_mention, ctm_count1, ct_mention2, ctm_count2 = self._feature_country_mentions(doc) 504 | 505 | # pull out the place names, skipping empty ones, countries, and known 506 | # junk from the skip list (like "Atlanic Ocean" 507 | ents = [] 508 | for ent in doc.ents: 509 | if not ent.text.strip(): 510 | continue 511 | if ent.label_ not in ["GPE", "LOC", "FAC"]: 512 | continue 513 | # don't include country names (make a parameter) 514 | if ent.text.strip() in self._skip_list: 515 | continue 516 | ents.append(ent) 517 | if not ents: 518 | return [] 519 | # Look them up in geonames, either sequentially if no threading, or 520 | # in parallel if threads. 521 | if self.threads: 522 | pool = ThreadPool(len(ents)) 523 | ent_text = [i.text for i in ents] 524 | ent_results = pool.map(self.simple_lookup, ent_text) 525 | pool.close() 526 | pool.join() 527 | else: 528 | ent_results = [] 529 | for ent in ents: 530 | try: 531 | result = self.query_geonames(ent.text) 532 | except ConnectionTimeout: 533 | result = "" 534 | ent_results.append(result) 535 | 536 | for n, ent in enumerate(ents): 537 | result = ent_results[n] 538 | #skip_list.add(ent.text.strip()) 539 | ent_label = ent.label_ # destroyed by trimming 540 | ent = self.clean_entity(ent) 541 | 542 | # vector for just the solo word 543 | vp = self._feature_word_embedding(ent) 544 | try: 545 | word_vec = vp['country_1'] 546 | wv_confid = float(vp['confid_a']) 547 | except TypeError: 548 | # no idea why this comes up 549 | word_vec = "" 550 | wv_confid = "0" 551 | 552 | # look for explicit mentions of feature names 553 | class_mention, code_mention = self._feature_location_type_mention(ent) 554 | # build results-based features 555 | 
most_alt = self._feature_most_alternative(result) 556 | # TODO check if most_common feature really isn't that useful 557 | most_common = self._feature_most_common(result) 558 | most_pop = self._feature_most_population(result) 559 | first_back, second_back = self._feature_first_back(result) 560 | 561 | try: 562 | maj_vote = Counter([word_vec, most_alt, 563 | first_back, most_pop, 564 | ct_mention 565 | #doc_vec_sent, doc_vec 566 | ]).most_common()[0][0] 567 | except Exception as e: 568 | print("Problem taking majority vote: ", ent, e) 569 | maj_vote = "" 570 | 571 | if not maj_vote: 572 | maj_vote = "" 573 | # We only want all this junk for the labeling task. We just want to straight to features 574 | # and the model when in production. 575 | try: 576 | start = ent.start_char 577 | end = ent.end_char 578 | iso_label = maj_vote 579 | try: 580 | text_label = self._inv_cts[iso_label] 581 | except KeyError: 582 | text_label = "" 583 | task = {"text" : ent.text, 584 | "label" : text_label, # human-readable country name 585 | "word" : ent.text, 586 | "spans" : [{ 587 | "start" : start, 588 | "end" : end, 589 | } # make sure to rename for Prodigy 590 | ], 591 | "features" : { 592 | "maj_vote" : iso_label, 593 | "word_vec" : word_vec, 594 | "first_back" : first_back, 595 | #"doc_vec" : doc_vec, 596 | "most_alt" : most_alt, 597 | "most_pop" : most_pop, 598 | "ct_mention" : ct_mention, 599 | "ctm_count1" : ctm_count1, 600 | "ct_mention2" : ct_mention2, 601 | "ctm_count2" : ctm_count2, 602 | "wv_confid" : wv_confid, 603 | "class_mention" : class_mention, # inferred geonames class from mentions 604 | "code_mention" : code_mention, 605 | #"places_vec" : places_vec, 606 | #"doc_vec_sent" : doc_vec_sent 607 | } 608 | } 609 | task_list.append(task) 610 | except Exception as e: 611 | print(ent.text,) 612 | print(e) 613 | return task_list # rename this var 614 | # Two modules that call `make_country_features`: 615 | # 1. write out with majority vote for training 616 | # 2. 
def make_country_matrix(self, loc):
    """
    Create features for all possible country labels, return as matrix for keras.

    Each candidate country (any non-empty value among the country-mention,
    word-vector, first-result, most-alternative-names and most-population
    features) gets one row of 12 values:

    - 4 values: +1/-1, whether word_vec, first_back, most_alt and most_pop
      equal the candidate
    - 4 values: +1/-1, whether each of those four features is non-empty
    - 2 values: the mention counts of the top two mentioned countries
    - 2 values: +1/-1, whether each mentioned country equals the candidate,
      forced to 0 where the corresponding count is 0

    Parameters
    ----------
    loc: dict
        one entry from the list of locations and features that come out of make_country_features

    Returns
    --------
    keras_inputs: dict with three keys, "labels", "matrix" and "word"
        "labels" lists the candidates in order of first appearance and
        "matrix" holds one row per label, in the same order.
    """
    feats = loc['features']
    top = feats['ct_mention']
    top_count = feats['ctm_count1']
    two = feats['ct_mention2']
    two_count = feats['ctm_count2']
    word_vec = feats['word_vec']
    first_back = feats['first_back']
    most_alt = feats['most_alt']
    most_pop = feats['most_pop']

    # De-duplicate while keeping first-seen order. A plain set() made the
    # label (and therefore row) order depend on string hash randomization,
    # so it could differ from one interpreter run to the next.
    candidates = [top, two, word_vec, first_back, most_alt, most_pop]
    possible_labels = [i for i in dict.fromkeys(candidates) if i]

    # everything that does not depend on the candidate label is built once
    inputs = np.array([word_vec, first_back, most_alt, most_pop])
    mentioned = np.asarray([top, two])
    counts = np.asarray([top_count, two_count])
    # flag missing values
    exists = np.asarray(((inputs != "") * 2) - 1)  # convert to -1, 1

    X_mat = []
    for label in possible_labels:
        x = np.asarray(((inputs == label) * 2) - 1)  # convert to -1, 1

        right = (mentioned == label) * 2 - 1
        right[counts == 0] = 0  # no mentions: neither agree nor disagree

        features = np.concatenate([x, exists, counts, right])
        X_mat.append(np.asarray(features))

    keras_inputs = {"labels": possible_labels,
                    "matrix": np.asmatrix(X_mat),
                    "word": loc['word']}
    return keras_inputs
def ranker(self, X, meta):
    """
    Order candidate places from highest to lowest total feature score.

    The score of a candidate is simply the sum of its feature row.

    Parameters
    ----------
    X: matrix
        one row of ranking features per candidate
    meta: list
        per-candidate metadata, aligned with the rows of X

    Returns
    --------
    (sorted_X, sorted_meta): tuple
        X and meta re-ordered by descending row sum.
    """
    # row sums, flattened from a column matrix into a plain array
    row_totals = np.squeeze(np.asarray(X.sum(axis=1).transpose()))
    # argsort is ascending, so reverse it to put the best score first
    best_first = row_totals.argsort()[::-1]
    sorted_meta = [meta[idx] for idx in best_first]
    return (X[best_first], sorted_meta)
883 | """ 884 | 885 | all_tasks = [] 886 | 887 | sorted_X, sorted_meta = self.ranker(X, meta) 888 | sorted_meta = sorted_meta[:4] 889 | sorted_X = sorted_X[:4] 890 | for n, i in enumerate(sorted_meta): 891 | feature_code = i['feature_code'] 892 | try: 893 | fc = self._code_to_text[feature_code] 894 | except KeyError: 895 | fc = '' 896 | text = ''.join(['"', i['place_name'], '"', 897 | ", a ", fc, 898 | " in ", i['country_code3'], 899 | ", id: ", i['geonameid']]) 900 | d = {"id" : n + 1, "text" : text} 901 | all_tasks.append(d) 902 | 903 | if return_feature_subset: 904 | return (all_tasks, sorted_meta, sorted_X) 905 | else: 906 | return all_tasks 907 | 908 | 909 | def format_geonames(self, entry, searchterm=None): 910 | """ 911 | Pull out just the fields we want from a geonames entry 912 | 913 | To do: 914 | - switch to model picking 915 | 916 | Parameters 917 | ----------- 918 | res : dict 919 | ES/geonames result 920 | 921 | searchterm : str 922 | (not implemented). Needed for better results picking 923 | 924 | Returns 925 | -------- 926 | new_res : dict 927 | containing selected fields from selected geonames entry 928 | """ 929 | try: 930 | lat, lon = entry['coordinates'].split(",") 931 | new_res = {"admin1" : self.get_admin1(entry['country_code2'], entry['admin1_code']), 932 | "lat" : lat, 933 | "lon" : lon, 934 | "country_code3" : entry["country_code3"], 935 | "geonameid" : entry["geonameid"], 936 | "place_name" : entry["name"], 937 | "feature_class" : entry["feature_class"], 938 | "feature_code" : entry["feature_code"]} 939 | return new_res 940 | except (IndexError, TypeError): 941 | # two conditions for these errors: 942 | # 1. there are no results for some reason (Index) 943 | # 2. 
def _check_exact(self, placename, match_list):
    """Find the single Geonames entry whose name exactly matches `placename`.

    When multiple hits come back for a query, this looks to see if any of them
    have an exact place name match in either the `alternativenames` list or the
    canonical `name` field.

    The entries in `match_list` are only read, never modified. (Earlier
    versions appended `name` onto each entry's `alternativenames` list, which
    leaked duplicate names into the results handed back to callers.)

    Parameters
    ----------
    placename: str
        The placename being searched for
    match_list: list of dict
        Candidate entries, each with an `alternativenames` list and a `name`

    Returns
    -------
    dict or None
        The matching entry if exactly one entry matches, otherwise None
        (both when nothing matches and when the match is ambiguous).
    """
    exact_matches = [
        m for m in match_list
        if placename in m['alternativenames'] or placename == m['name']
    ]
    if len(exact_matches) == 1:
        return exact_matches[0]
    return None
def lookup_city(self, city, country, adm1=None):
    """
    Return the "best" Geonames entry for a city name.

    Queries the ES-Geonames gazetteer for the given city and country and
    applies a fixed cascade of rules to pick one result:

    1. city-type hits, then
    2. neighborhood-type hits; within each of those tiers a lone hit wins,
       otherwise an exact name match (``_check_exact``), otherwise an edit
       distance match (``_check_editdist``);
    3. failing that, a lone hit of any other type;
    4. otherwise no match is returned.

    If `adm1` is supplied it is first looked up as an ADM1 feature in the
    same country. Only when that lookup returns exactly one hit is the city
    search restricted to its ``admin1_code``; in every other case the search
    silently covers the whole country.

    This code was modified from Halterman's (2019) Syria casualties working paper.

    Parameters
    ----------
    city: str
        The name of the city to look up
    country: str
        The three character country code (iso3c)
    adm1: str
        (Optional) the name of the state/governorate/province

    Returns
    -------
    match: dict
        Always a dict with four keys: "geo" (the chosen Geonames entry, or
        None when nothing could be picked), "query" (the `city` argument),
        "reason" (which rule fired) and "info" (result counts / distances).
    """
    adm_limit = None
    if adm1:
        adm_hits = self.query_geonames_country(placename=adm1,
                                               country=country,
                                               filter_params={"feature_code": "ADM1"})['hits']['hits']
        if len(adm_hits) == 1:
            adm_limit = {"admin1_code": adm_hits[0]['admin1_code']}
    res = self.query_geonames_country(city, country, adm_limit)['hits']['hits']
    total = len(res)

    # (feature codes, reason for a lone hit, reason for an exact hit,
    #  info template for an exact hit); tiers are tried in this order.
    # NOTE(review): the original city list spelled 'PPLA3' twice; 'PPLA4' may
    # have been intended. Behaviour is kept as-is here -- confirm.
    tiers = (
        (('PPL', 'PPLA', 'PPLC', 'PPLA2', 'PPLA3'),
         "Single match for city in Elasticsearch with name, ADM1, country.",
         "Exact name match for city.",
         "{0} elasticsearch matches for cities out of {1} total results of all types"),
        (('PPLX', 'LCTY', 'PPLL', 'AREA'),
         "Single elasticsearch match for neighborhood.",
         "Exact place name match for neighborhood.",
         "{0} elasticsearch matches out of {1} total results of all types"),
    )
    for codes, lone_reason, exact_reason, exact_info in tiers:
        candidates = [hit for hit in res if hit['feature_code'] in codes]
        if not candidates:
            continue
        # a lone candidate of this type is taken without further checks
        if len(candidates) == 1:
            return {"geo": candidates[0],
                    "query": city,
                    "info": "{0} total results of all types".format(total),
                    "reason": lone_reason}
        # several candidates: prefer an unambiguous exact name match
        exact = self._check_exact(city, candidates)
        if exact:
            return {"geo": exact,
                    "query": city,
                    "info": exact_info.format(len(candidates), total),
                    "reason": exact_reason}
        # ...and fall back to the closest name by edit distance
        close, reason, info = self._check_editdist(city, candidates)
        if close:
            return {"geo": close,
                    "query": city,
                    "info": info,
                    "reason": reason}

    summary = "{0} total results of all types.".format(total)
    if total == 1:
        return {"geo": res[0],
                "query": city,
                "reason": "CAUTION: One fuzzy match, not a city-type location.",
                "info": summary}

    if total == 0:
        failure = "FAILURE: No fuzzy match for city or neighborhood."
    else:
        failure = "FAILURE: Too many matches for city or neighborhood, none exact."
    return {"geo": None,
            "query": city,
            "reason": failure,
            "info": summary}
def geoparse(self, doc, verbose=False):
    """Main geoparsing function. Text to extracted, resolved entities.

    Parameters
    ----------
    doc : str or spaCy
        The document to be geoparsed. Can be either raw text or already spacy processed.
        In some cases, it makes sense to bulk parse using spacy's .pipe() before sending
        through to Mordecai
    verbose : bool
        If True, keep the intermediate model inputs/outputs on each entity
        for this call. They are also kept when the geoparser's own ``verbose``
        attribute is true. Otherwise they are stripped with ``clean_proced``.
        (Previously this argument was accepted but ignored.)

    Returns
    -------
    proced : list of dicts
        Each entity gets an entry in the list, with the dictionary including geo info, spans,
        and optionally, the input features. An entity gets no "geo" key when
        there is no usable gazetteer result for it or when the ranking model
        does not select one of the real candidates.
    """
    if not hasattr(doc, "ents"):
        doc = self.nlp(doc)
    proced = self.infer_country(doc)
    if not proced:
        return []

    if self.threads:
        # one worker per entity; proc_lookup_country is defined elsewhere in
        # the class -- presumably the threaded twin of the serial branch below
        pool = ThreadPool(len(proced))
        try:
            results = pool.map(self.proc_lookup_country, proced)
        finally:
            # always release the worker threads, even if a lookup raised
            pool.close()
            pool.join()
    else:
        results = []
        for loc in proced:
            word = loc['word']
            if self.is_country(word):
                # if it's a country name, just query that
                res = self.query_geonames_country(word,
                                                  self._just_cts[word],
                                                  filter_params={"feature_code": "PCLI"})
            elif loc['country_conf'] > self.country_threshold:
                res = self.query_geonames_country(word, loc['country_predicted'])
            else:
                # if the confidence is too low, don't use the country info
                res = ""
            results.append(res)

    for loc, res in zip(proced, results):
        try:
            _ = res['hits']['hits']
        except (TypeError, KeyError):
            # no geonames result (e.g. the "" placeholder): leave loc without "geo"
            continue
        # Pick the best place
        X, meta = self.features_for_rank(loc, res)
        if X.shape[1] == 0:
            # This happens if there are no results...
            continue
        _tasks, sorted_meta, sorted_X = self.format_for_prodigy(X, meta, loc['word'],
                                                                return_feature_subset=True)
        # the rank model takes a fixed block of 5 candidate rows: zero-pad, then flatten
        fl_pad = np.pad(sorted_X, ((0, 5 - sorted_X.shape[0]), (0, 0)), 'constant')
        fl_unwrap = np.asmatrix(fl_pad.flatten())
        prediction = self.rank_model.predict(fl_unwrap)
        best = int(prediction.argmax())
        if best >= len(sorted_meta):
            # the model preferred a zero-padded slot, i.e. none of the real
            # candidates; treat it like "no result" instead of raising IndexError
            continue
        loc['geo'] = sorted_meta[best]
        loc['place_confidence'] = prediction.max()

    if not (verbose or self.verbose):
        proced = self.clean_proced(proced)
    return proced
Please set `threads=True` when initializing the geoparser.") 1248 | nlped_docs = list(self.nlp.pipe(text_list, as_tuples=False, n_threads=multiprocessing.cpu_count())) 1249 | processed = [] 1250 | for i in tqdm(nlped_docs, disable=not self.progress): 1251 | p = self.geoparse(i) 1252 | processed.append(p) 1253 | return processed 1254 | 1255 | -------------------------------------------------------------------------------- /mordecai/models/country_model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openeventdata/mordecai/9d37110f6cd1275852548fc53fd7a21bb77593f9/mordecai/models/country_model.h5 -------------------------------------------------------------------------------- /mordecai/models/country_model_multi.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openeventdata/mordecai/9d37110f6cd1275852548fc53fd7a21bb77593f9/mordecai/models/country_model_multi.h5 -------------------------------------------------------------------------------- /mordecai/models/rank_model.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openeventdata/mordecai/9d37110f6cd1275852548fc53fd7a21bb77593f9/mordecai/models/rank_model.h5 -------------------------------------------------------------------------------- /mordecai/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openeventdata/mordecai/9d37110f6cd1275852548fc53fd7a21bb77593f9/mordecai/tests/__init__.py -------------------------------------------------------------------------------- /mordecai/tests/conftest.py: -------------------------------------------------------------------------------- 1 | from ..geoparse import Geoparser 2 | import pytest 3 | 4 | import spacy 5 | nlp = spacy.load('en_core_web_lg', disable=['parser', 'tagger']) 6 | 7 | 
@pytest.fixture(scope='session', autouse=True) 8 | def geo(): 9 | return Geoparser(nlp=nlp, threads=False) 10 | 11 | @pytest.fixture(scope='session', autouse=True) 12 | def geo_thread(): 13 | return Geoparser(nlp=nlp, threads=True) 14 | -------------------------------------------------------------------------------- /mordecai/tests/test_mordecai.py: -------------------------------------------------------------------------------- 1 | from elasticsearch_dsl import Q 2 | import numpy as np 3 | from ..utilities import structure_results 4 | 5 | import spacy 6 | nlp = spacy.load('en_core_web_lg', disable=['parser', 'tagger']) 7 | 8 | def test_issue_40_2_thread(geo_thread): 9 | doc_list = ["Government forces attacked the cities in Aleppo Governorate, while rebel leaders met in Geneva.", 10 | "EULEX is based in Prishtina, Kosovo.", 11 | "Clientelism may depend on brokers."] 12 | locs = geo_thread.batch_geoparse(doc_list) 13 | assert len(locs) == 3 14 | assert locs[0][0]['geo']['geonameid'] == '170063' 15 | assert locs[0][1]['country_predicted'] == 'CHE' 16 | assert locs[1][0]['geo']['feature_code'] == 'PPLC' 17 | assert locs[1][1]['geo']['country_code3'] == 'XKX' 18 | assert locs[2] == [] 19 | 20 | def test_fm_methods_exist(geo): 21 | assert hasattr(geo, "_feature_most_alternative") 22 | assert hasattr(geo, "_feature_first_back") 23 | assert hasattr(geo, "_feature_word_embedding") 24 | assert hasattr(geo, "clean_entity") 25 | 26 | def test_fm_methods_exist_thread(geo_thread): 27 | assert hasattr(geo_thread, "_feature_most_alternative") 28 | assert hasattr(geo_thread, "_feature_first_back") 29 | assert hasattr(geo_thread, "_feature_word_embedding") 30 | assert hasattr(geo_thread, "clean_entity") 31 | 32 | def test_cts(geo): 33 | assert "Kosovo" in geo._cts.keys() 34 | assert "Kosovo" not in geo._cts.values() 35 | assert "AFG" in geo._cts.values() 36 | 37 | def test_cts_thread(geo_thread): 38 | assert "Kosovo" in geo_thread._cts.keys() 39 | assert "Kosovo" not in 
geo_thread._cts.values() 40 | assert "AFG" in geo_thread._cts.values() 41 | 42 | def test_country_mentions(geo): 43 | doc = nlp("Puerto Cabello is a port city in Venezuela") 44 | f = geo._feature_country_mentions(doc) 45 | assert f == ('VEN', 1, '', 0) 46 | 47 | def test_country_mentions_thread(geo_thread): 48 | doc = nlp("Puerto Cabello is a port city in Venezuela") 49 | f = geo_thread._feature_country_mentions(doc) 50 | assert f == ('VEN', 1, '', 0) 51 | 52 | def test_vector_picking(geo): 53 | entity = nlp("Mosul") 54 | vp = geo._feature_word_embedding(entity) 55 | assert vp['country_1'] == "IRQ" 56 | 57 | def test_vector_picking_thread(geo_thread): 58 | entity = nlp("Mosul") 59 | vp = geo_thread._feature_word_embedding(entity) 60 | assert vp['country_1'] == "IRQ" 61 | 62 | def test_cts2(geo): 63 | out = geo._inv_cts['DEU'] 64 | assert out == "Germany" 65 | 66 | def test_cts2_thread(geo_thread): 67 | out = geo_thread._inv_cts['DEU'] 68 | assert out == "Germany" 69 | 70 | def test_lookup_city(geo): 71 | out = geo.lookup_city("Norman", country="USA", adm1="Oklahoma") 72 | assert out['geo']['geonameid'] == '4543762' 73 | assert out['reason'] == 'Single match for city in Elasticsearch with name, ADM1, country.' 74 | 75 | def test_lookup_city2(geo): 76 | out = geo.lookup_city("Rukn al-Din", "SYR") 77 | assert out['geo']['geonameid'] == '7642446' 78 | assert out['reason'] == 'CAUTION: Single edit distance match.' 79 | 80 | def test_city_lookup3(geo): 81 | # two easy cases 82 | res = geo.lookup_city("Norman", adm1 = "OK", country = "USA") 83 | assert res['geo']['geonameid'] == '4543762' 84 | res = geo.lookup_city("College Park", adm1 = "MD", country = "USA") 85 | assert res['geo']['geonameid'] == '4351977' 86 | res = geo.lookup_city("College Park", adm1 = "OK", country = "USA") 87 | assert res['geo'] is None 88 | # for some reason, Cambridge neighborhoods are PPL, not PPLX. 
89 | res = geo.lookup_city("East Cambridge", adm1 = "MA", country = "USA") 90 | assert res['geo']['geonameid'] == '5152577' 91 | assert res['geo']['feature_code'] == 'PPL' 92 | # Non-US check 93 | res = geo.lookup_city("Aleppo", adm1 = "Aleppo", country = "SYR") 94 | assert res['geo']['feature_code'] == 'PPLA' 95 | assert res['geo']['geonameid'] == '170063' 96 | res = geo.lookup_city("Munich", country = "DEU") 97 | assert res['geo']['geonameid'] == '2867714' 98 | # Another US check 99 | res = geo.lookup_city("Aleppo", country = "USA") 100 | assert res['geo']['geonameid'] == '4556251' 101 | # test neighborhood 102 | res = geo.lookup_city("Bustan al-Qasr", adm1 = "Aleppo", country = "SYR") 103 | assert res['geo']['feature_code'] == 'PPLX' 104 | assert res['geo']['geonameid'] == '7753543' 105 | # check nonsense 106 | res = geo.lookup_city("qwertyqwerty", adm1 = "Aleppo", country = "SYR") 107 | assert res['geo'] is None 108 | 109 | def test_most_population(geo): 110 | res_a = geo.query_geonames("Berlin") 111 | res_b = geo.query_geonames("Oklahoma City") 112 | res_c = geo.query_geonames("Tripoli") 113 | a = geo._feature_most_population(res_a) 114 | b = geo._feature_most_population(res_b) 115 | c = geo._feature_most_population(res_c) 116 | assert a == "DEU" 117 | assert b == "USA" 118 | assert c == "LBY" 119 | 120 | def test_most_population_thread(geo_thread): 121 | res_a = geo_thread.query_geonames("Berlin") 122 | res_b = geo_thread.query_geonames("Oklahoma City") 123 | res_c = geo_thread.query_geonames("Tripoli") 124 | a = geo_thread._feature_most_population(res_a) 125 | b = geo_thread._feature_most_population(res_b) 126 | c = geo_thread._feature_most_population(res_c) 127 | assert a == "DEU" 128 | assert b == "USA" 129 | assert c == "LBY" 130 | 131 | def test_is_country(geo): 132 | a = geo.is_country("Senegal") 133 | assert a 134 | 135 | def test_make_country_features(geo): 136 | doc = nlp("EULEX is based in Prishtina, Kosovo.") 137 | f = 
geo.make_country_features(doc) 138 | assert f[0]['features']['most_alt'] == "XKX" 139 | assert f[1]['features']['most_alt'] == "XKX" 140 | assert f[0]['features']['word_vec'] == "XKX" 141 | assert f[1]['features']['word_vec'] == "XKX" 142 | assert f[0]['features']['wv_confid'] > 10 143 | assert f[1]['features']['wv_confid'] > 10 144 | assert len(f[0]['spans']) == 1 145 | assert len(f[1]['spans']) == 1 146 | 147 | def test_make_country_features_thread(geo_thread): 148 | doc = nlp("EULEX is based in Prishtina, Kosovo.") 149 | f = geo_thread.make_country_features(doc) 150 | assert f[0]['features']['most_alt'] == "XKX" 151 | assert f[1]['features']['most_alt'] == "XKX" 152 | assert f[0]['features']['word_vec'] == "XKX" 153 | assert f[1]['features']['word_vec'] == "XKX" 154 | assert f[0]['features']['wv_confid'] > 10 155 | assert f[1]['features']['wv_confid'] > 10 156 | assert len(f[0]['spans']) == 1 157 | assert len(f[1]['spans']) == 1 158 | 159 | 160 | def test_infer_country1(geo): 161 | doc = "There's fighting in Aleppo and Homs." 162 | loc = geo.infer_country(doc) 163 | assert loc[0]['country_predicted'] == "SYR" 164 | assert loc[1]['country_predicted'] == "SYR" 165 | 166 | def test_infer_country1_thread(geo_thread): 167 | doc = "There's fighting in Aleppo and Homs." 168 | loc = geo_thread.infer_country(doc) 169 | assert loc[0]['country_predicted'] == "SYR" 170 | assert loc[1]['country_predicted'] == "SYR" 171 | 172 | 173 | def test_infer_country2(geo): 174 | doc = "There's fighting in Berlin and Hamburg." 175 | loc = geo.infer_country(doc) 176 | assert loc[0]['country_predicted'] == "DEU" 177 | assert loc[1]['country_predicted'] == "DEU" 178 | 179 | def test_infer_country2_thread(geo_thread): 180 | doc = "There's fighting in Berlin and Hamburg." 
181 | loc = geo_thread.infer_country(doc) 182 | assert loc[0]['country_predicted'] == "DEU" 183 | assert loc[1]['country_predicted'] == "DEU" 184 | 185 | def test_two_countries(geo): 186 | doc = "There's fighting in Aleppo and talking in Geneva." 187 | loc = geo.geoparse(doc) 188 | assert loc[0]['country_predicted'] == "SYR" 189 | assert loc[1]['country_predicted'] == "CHE" 190 | 191 | def test_two_countries_thread(geo_thread): 192 | doc = "There's fighting in Aleppo and talking in Geneva." 193 | loc = geo_thread.geoparse(doc) 194 | assert loc[0]['country_predicted'] == "SYR" 195 | assert loc[1]['country_predicted'] == "CHE" 196 | 197 | def test_US_city(geo): 198 | doc = "There's fighting in Norman, Oklahoma." 199 | locs = geo.geoparse(doc) 200 | assert locs[0]['geo']['geonameid'] == '4543762' 201 | assert locs[1]['geo']['geonameid'] == '4544379' 202 | 203 | def test_US_city_thread(geo_thread): 204 | doc = "There's fighting in Norman, Oklahoma." 205 | locs = geo_thread.geoparse(doc) 206 | assert locs[0]['geo']['geonameid'] == '4543762' 207 | assert locs[1]['geo']['geonameid'] == '4544379' 208 | 209 | def test_admin1(geo): 210 | doc = "There's fighting in Norman, Oklahoma." 211 | locs = geo.geoparse(doc) 212 | assert locs[0]['geo']['admin1'] == 'Oklahoma' 213 | 214 | def test_admin1_thread(geo_thread): 215 | doc = "There's fighting in Norman, Oklahoma." 216 | locs = geo_thread.geoparse(doc) 217 | assert locs[0]['geo']['admin1'] == 'Oklahoma' 218 | 219 | def test_weird_loc(geo): 220 | doc = "There's fighting in Ajnsdgjb city." 221 | loc = geo.geoparse(doc) 222 | assert loc[0]['country_conf'] < 0.3 223 | 224 | def test_weird_loc_thread(geo_thread): 225 | doc = "There's fighting in GOUOsabgoajwh city." 226 | loc = geo_thread.geoparse(doc) 227 | assert loc[0]['country_conf'] < 0.3 228 | 229 | def test_no_loc(geo): 230 | doc = "The dog ran through the park." 
231 | loc = geo.geoparse(doc) 232 | assert len(loc) == 0 233 | 234 | def test_no_loc_thread(geo_thread): 235 | doc = "The dog ran through the park." 236 | loc = geo_thread.geoparse(doc) 237 | assert len(loc) == 0 238 | 239 | def test_query(geo): 240 | results = geo.query_geonames("Berlin") 241 | assert results['hits']['hits'][15]['country_code3'] 242 | 243 | def test_query_thread(geo_thread): 244 | results = geo_thread.query_geonames("Berlin") 245 | assert results['hits']['hits'][15]['country_code3'] 246 | 247 | def test_missing_feature_code(geo): 248 | doc = "Congress and in the legislatures of Alabama, California, Florida, and Michigan." 249 | locs = geo.geoparse(doc) 250 | assert locs 251 | 252 | def test_missing_feature_code_thread(geo_thread): 253 | doc = "Congress and in the legislatures of Alabama, California, Florida, and Michigan." 254 | locs = geo_thread.geoparse(doc) 255 | assert locs 256 | 257 | def test_aleppo_geneva(geo): 258 | locs = geo.geoparse("Government forces attacked the cities in Aleppo Governorate, while rebel leaders met in Geneva.") 259 | assert locs[0]['geo']['country_code3'] == 'SYR' 260 | assert locs[1]['geo']['country_code3'] == 'CHE' 261 | 262 | def test_aleppo_geneva_thread(geo_thread): 263 | locs = geo_thread.geoparse("Government forces attacked the cities in Aleppo Governorate, while rebel leaders met in Geneva.") 264 | assert locs[0]['geo']['country_code3'] == 'SYR' 265 | assert locs[1]['geo']['country_code3'] == 'CHE' 266 | 267 | def test_issue_40(geo): 268 | doc = "In early 1938, the Prime Minister cut grants-in-aid to the provinces, effectively killing the relief project scheme. Premier Thomas Dufferin Pattullo closed the projects in April, claiming that British Columbia could not shoulder the burden alone. Unemployed men again flocked to Vancouver to protest government insensitivity and intransigence to their plight. The RCPU organized demonstrations and tin-canning (organized begging) in the city. 
Under the guidance of twenty-six-year-old Steve Brodie, the leader of the Youth Division who had cut his activist teeth during the 1935 relief camp strike, protesters occupied Hotel Georgia, the Vancouver Art Gallery (then located at 1145 West Georgia Street), and the main post office (now the Sinclair Centre)." 269 | locs = geo.geoparse(doc) 270 | assert len(locs) > 2 271 | 272 | 273 | def test_issue_45(geo): 274 | text = """Santa Cruz is a first class municipality in 275 | the province of Davao del Sur, Philippines. It has a population of 81,093 276 | people as of 2010. The Municipality of Santa Cruz is part of Metropolitan 277 | Davao. Santa Cruz is politically subdivided into 18 barangays. Of the 18 278 | barangays, 7 are uplands, 9 are upland-lowland and coastal and 2 are 279 | lowland-coastal. Pista sa Kinaiyahan A yearly activity conducted every last 280 | week of April as a tribute to the Mother Nature through tree-growing, cleanup 281 | activities and Boulder Face challenge. Araw ng Santa Cruz It is celebrated 282 | every October 5 in commemoration of the legal creation of the municipality in 283 | 1884. Highlights include parades, field demonstrations, trade fairs, carnivals 284 | and traditional festivities. Sinabbadan Festival A festival of ethnic ritual 285 | and dances celebrated every September. Santa Cruz is accessible by land 286 | transportation vehicles plying the Davao-Digos City, Davao-Kidapawan City, 287 | Davao-Cotabato City, Davao-Koronadal City and Davao-Tacurong City routes 288 | passing through the town's single, 27 kilometres (17 mi) stretch of national 289 | highway that traverses its 11 barangays. 
From Davao City, the administrative 290 | center of Region XI, it is 38 kilometres (24 mi) away within a 45-minute ride, 291 | while it is 16 kilometres (9.9 mi) or about 15-minute ride from provincial 292 | capital city of Digos.""" 293 | locs = geo.geoparse(text) 294 | assert len(locs) > 0 295 | 296 | def test_issue_45_thread(geo_thread): 297 | text = """Santa Cruz is a first class municipality in 298 | the province of Davao del Sur, Philippines. It has a population of 81,093 299 | people as of 2010. The Municipality of Santa Cruz is part of Metropolitan 300 | Davao. Santa Cruz is politically subdivided into 18 barangays. Of the 18 301 | barangays, 7 are uplands, 9 are upland-lowland and coastal and 2 are 302 | lowland-coastal. Pista sa Kinaiyahan A yearly activity conducted every last 303 | week of April as a tribute to the Mother Nature through tree-growing, cleanup 304 | activities and Boulder Face challenge. Araw ng Santa Cruz It is celebrated 305 | every October 5 in commemoration of the legal creation of the municipality in 306 | 1884. Highlights include parades, field demonstrations, trade fairs, carnivals 307 | and traditional festivities. Sinabbadan Festival A festival of ethnic ritual 308 | and dances celebrated every September. Santa Cruz is accessible by land 309 | transportation vehicles plying the Davao-Digos City, Davao-Kidapawan City, 310 | Davao-Cotabato City, Davao-Koronadal City and Davao-Tacurong City routes 311 | passing through the town's single, 27 kilometres (17 mi) stretch of national 312 | highway that traverses its 11 barangays. 
From Davao City, the administrative 313 | center of Region XI, it is 38 kilometres (24 mi) away within a 45-minute ride, 314 | while it is 16 kilometres (9.9 mi) or about 15-minute ride from provincial 315 | capital city of Digos.""" 316 | locs = geo_thread.geoparse(text) 317 | assert len(locs) > 0 318 | 319 | def test_ohio(geo): 320 | # This was a problem in issue 41 321 | r = Q("match", geonameid='5165418') 322 | result = geo.conn.query(r).execute() 323 | output = structure_results(result) 324 | assert output['hits']['hits'][0]['asciiname'] == "Ohio" 325 | 326 | def test_readme_example(geo): 327 | output = geo.geoparse("I traveled from Oxford to Ottawa.") 328 | correct = [{'country_conf': np.float32(0.957188), 329 | 'country_predicted': 'GBR', 330 | 'geo': {'admin1': 'England', 331 | 'country_code3': 'GBR', 332 | 'feature_class': 'P', 333 | 'feature_code': 'PPLA2', 334 | 'geonameid': '2640729', 335 | 'lat': '51.75222', 336 | 'lon': '-1.25596', 337 | 'place_name': 'Oxford'}, 338 | 'spans': [{'end': 22, 'start': 16}], 339 | 'word': 'Oxford'}, 340 | {'country_conf': np.float32(0.8799221), 341 | 'country_predicted': 'CAN', 342 | 'geo': {'admin1': 'Ontario', 343 | 'country_code3': 'CAN', 344 | 'feature_class': 'P', 345 | 'feature_code': 'PPLC', 346 | 'geonameid': '6094817', 347 | 'lat': '45.41117', 348 | 'lon': '-75.69812', 349 | 'place_name': 'Ottawa'}, 350 | 'spans': [{'end': 32, 'start': 26}], 351 | 'word': 'Ottawa'}] 352 | assert output == correct 353 | 354 | def test_readme_example_thread(geo_thread): 355 | output = geo_thread.geoparse("I traveled from Oxford to Ottawa.") 356 | correct = [{'country_conf': np.float32(0.957188), 357 | 'country_predicted': 'GBR', 358 | 'geo': {'admin1': 'England', 359 | 'country_code3': 'GBR', 360 | 'feature_class': 'P', 361 | 'feature_code': 'PPLA2', 362 | 'geonameid': '2640729', 363 | 'lat': '51.75222', 364 | 'lon': '-1.25596', 365 | 'place_name': 'Oxford'}, 366 | 'spans': [{'end': 22, 'start': 16}], 367 | 'word': 'Oxford'}, 368 | 
{'country_conf': np.float32(0.8799221), 369 | 'country_predicted': 'CAN', 370 | 'geo': {'admin1': 'Ontario', 371 | 'country_code3': 'CAN', 372 | 'feature_class': 'P', 373 | 'feature_code': 'PPLC', 374 | 'geonameid': '6094817', 375 | 'lat': '45.41117', 376 | 'lon': '-75.69812', 377 | 'place_name': 'Ottawa'}, 378 | 'spans': [{'end': 32, 'start': 26}], 379 | 'word': 'Ottawa'}] 380 | assert output == correct 381 | 382 | def test_issue_53(geo): 383 | # the spans issue 384 | output = geo.geoparse("I traveled from Oxford to Ottawa.") 385 | assert output[0]['spans'][0]['start'] == 16 386 | assert output[0]['spans'][0]['end'] == 22 387 | assert output[1]['spans'][0]['start'] == 26 388 | assert output[1]['spans'][0]['end'] == 32 389 | 390 | def test_issue_68_verbose(geo): 391 | res = geo.geoparse("The ship entered Greenville from Tarboro", verbose=True) 392 | assert res 393 | 394 | def test_issue_77(geo): 395 | res = geo.geoparse("We traveled to the USA") 396 | assert res[0]['geo']['feature_code'] == "PCLI" 397 | res = geo.geoparse("We traveled to the United States.") 398 | assert res[0]['geo']['feature_code'] == "PCLI" 399 | res = geo.geoparse("We traveled to Germany.") 400 | assert res[0]['geo']['feature_code'] == "PCLI" 401 | res = geo.geoparse("We traveled to France.") 402 | assert res[0]['geo']['feature_code'] == "PCLI" 403 | 404 | def test_issue_82(geo): 405 | ents = geo.nlp(""" Wuppertal (remote-option) """).ents 406 | res = geo.geoparse( """ Wuppertal (remote-option) """ ) 407 | assert len(ents) == len(res) 408 | ents = geo.nlp(""" Wuppertal remote-option """).ents 409 | res = geo.geoparse( """ Wuppertal remote-option """ ) 410 | assert len(ents) == len(res) -------------------------------------------------------------------------------- /mordecai/utilities.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from __future__ import unicode_literals 3 | from __future__ import print_function 4 | import os 
def country_list_maker():
    """
    Helper function to return dictionary of countries in {"country" : "iso"} form.

    Keys are English country names, including common alternative names
    (e.g. "Britain", "UK", "Burma"); values are three-letter country codes.
    Several names may share one code.

    The insertion order matters: ``make_inv_cts`` keeps the first name it
    sees for each code, so the first name listed for a code is the one used
    when codes are turned back into labels.

    Returns
    -------
    cts: dictionary
        Example: "Germany" : "DEU"
    """
    cts = {
        "Afghanistan": "AFG", "Åland Islands": "ALA", "Albania": "ALB", "Algeria": "DZA",
        "American Samoa": "ASM", "Andorra": "AND", "Angola": "AGO", "Anguilla": "AIA",
        "Antarctica": "ATA", "Antigua Barbuda": "ATG", "Argentina": "ARG",
        # NOTE(review): "NA" is not an ISO alpha-3 code; kept as in the original data.
        "Armenia": "ARM", "Aruba": "ABW", "Ascension Island": "NA", "Australia": "AUS",
        "Austria": "AUT", "Azerbaijan": "AZE", "Bahamas": "BHS", "Bahrain": "BHR",
        "Bangladesh": "BGD", "Barbados": "BRB", "Belarus": "BLR", "Belgium": "BEL",
        "Belize": "BLZ", "Benin": "BEN", "Bermuda": "BMU", "Bhutan": "BTN",
        "Bolivia": "BOL", "Bosnia Herzegovina": "BIH",
        "Botswana": "BWA", "Bouvet Island": "BVT", "Brazil": "BRA",
        "Britain": "GBR", "Great Britain": "GBR",
        "British Virgin Islands": "VGB", "Brunei": "BRN", "Bulgaria": "BGR", "Burkina Faso": "BFA",
        "Burundi": "BDI", "Cambodia": "KHM", "Cameroon": "CMR",
        "Canada": "CAN", "Cape Verde": "CPV", "Cayman Islands": "CYM",
        "Central African Republic": "CAF", "Chad": "TCD", "Chile": "CHL", "China": "CHN",
        "Cocos Islands": "CCK", "Colombia": "COL",
        "Comoros": "COM", "Republic of Congo": "COG", "Cook Islands": "COK",
        "Costa Rica": "CRI", "Cote Ivoire": "CIV", "Ivory Coast": "CIV", "Croatia": "HRV", "Cuba": "CUB",
        "Curaçao": "CUW", "Cyprus": "CYP", "Czech Republic": "CZE", "Denmark": "DNK",
        "Djibouti": "DJI", "Dominica": "DMA", "Dominican Republic": "DOM",
        "Democratic Republic of Congo": "COD",
        "Ecuador": "ECU", "Egypt": "EGY", "El Salvador": "SLV", "England": "GBR",
        "Equatorial Guinea": "GNQ", "Eritrea": "ERI", "Estonia": "EST", "Ethiopia": "ETH",
        "Falkland Islands": "FLK", "Faroe Islands": "FRO",
        "Fiji": "FJI", "Finland": "FIN", "France": "FRA", "French Guiana": "GUF",
        "French Polynesia": "PYF", "Gabon": "GAB",
        "Gambia": "GMB", "Georgia": "GEO", "Germany": "DEU", "Ghana": "GHA",
        "Gibraltar": "GIB", "Greece": "GRC", "Greenland": "GRL", "Grenada": "GRD",
        "Guadeloupe": "GLP", "Guam": "GUM", "Guatemala": "GTM", "Guernsey": "GGY",
        "Guinea": "GIN", "Guinea Bissau": "GNB", "Guyana": "GUY", "Haiti": "HTI", "Honduras": "HND",
        "Hong Kong": "HKG", "Hungary": "HUN", "Iceland": "ISL",
        "India": "IND", "Indonesia": "IDN", "Iran": "IRN", "Iraq": "IRQ", "Ireland": "IRL",
        "Israel": "ISR", "Italy": "ITA", "Jamaica": "JAM", "Japan": "JPN",
        "Jordan": "JOR", "Kazakhstan": "KAZ", "Kenya": "KEN",
        "Kiribati": "KIR", "Kosovo": "XKX", "Kuwait": "KWT", "Kyrgyzstan": "KGZ", "Laos": "LAO",
        "Latvia": "LVA", "Lebanon": "LBN", "Lesotho": "LSO", "Liberia": "LBR",
        "Libya": "LBY", "Liechtenstein": "LIE", "Lithuania": "LTU", "Luxembourg": "LUX",
        "Macau": "MAC", "Macedonia": "MKD", "Madagascar": "MDG", "Malawi": "MWI",
        "Malaysia": "MYS", "Maldives": "MDV", "Mali": "MLI", "Malta": "MLT", "Marshall Islands": "MHL",
        "Martinique": "MTQ", "Mauritania": "MRT", "Mauritius": "MUS",
        "Mayotte": "MYT", "Mexico": "MEX", "Micronesia": "FSM", "Moldova": "MDA",
        "Monaco": "MCO", "Mongolia": "MNG", "Montenegro": "MNE", "Montserrat": "MSR",
        "Morocco": "MAR", "Mozambique": "MOZ", "Myanmar": "MMR", "Burma": "MMR", "Namibia": "NAM",
        "Nauru": "NRU", "Nepal": "NPL", "Netherlands": "NLD", "Netherlands Antilles": "ANT",
        "New Caledonia": "NCL", "New Zealand": "NZL", "Nicaragua": "NIC",
        "Niger": "NER", "Nigeria": "NGA", "Niue": "NIU", "North Korea": "PRK",
        # Northern Ireland is part of the United Kingdom, so it resolves to
        # GBR like England, Scotland and Wales (it was previously IRL).
        "Northern Ireland": "GBR", "Northern Mariana Islands": "MNP",
        "Norway": "NOR", "Oman": "OMN", "Pakistan": "PAK",
        "Palau": "PLW", "Palestine": "PSE", "Panama": "PAN", "Papua New Guinea": "PNG",
        "Paraguay": "PRY", "Peru": "PER", "Philippines": "PHL", "Pitcairn Islands": "PCN",
        "Poland": "POL", "Portugal": "PRT", "Puerto Rico": "PRI",
        "Qatar": "QAT", "Réunion": "REU", "Romania": "ROU", "Russia": "RUS",
        "Rwanda": "RWA", "Saint Barthélemy": "BLM", "Saint Helena": "SHN",
        "Saint Kitts Nevis": "KNA", "Saint Lucia": "LCA",
        "Saint Pierre Miquelon": "SPM", "Saint Vincent Grenadines": "VCT",
        "Samoa": "WSM", "San Marino": "SMR", "São Tomé Príncipe": "STP", "Saudi Arabia": "SAU",
        "Senegal": "SEN", "Serbia": "SRB",
        "Seychelles": "SYC", "Sierra Leone": "SLE", "Singapore": "SGP", "Sint Maarten": "SXM",
        "Slovakia": "SVK", "Slovenia": "SVN", "Solomon Islands": "SLB",
        "Somalia": "SOM", "South Africa": "ZAF",
        "South Korea": "KOR", "South Sudan": "SSD", "Spain": "ESP", "Sri Lanka": "LKA", "Sudan": "SDN",
        "Suriname": "SUR", "Svalbard Jan Mayen": "SJM",
        "Swaziland": "SWZ", "Sweden": "SWE", "Switzerland": "CHE", "Syria": "SYR",
        "Taiwan": "TWN", "Tajikistan": "TJK", "Tanzania": "TZA", "Thailand": "THA",
        "Timor Leste": "TLS", "East Timor": "TLS", "Togo": "TGO", "Tokelau": "TKL", "Tonga": "TON",
        "Trinidad Tobago": "TTO",
        "Tunisia": "TUN", "Turkey": "TUR",
        "Turkmenistan": "TKM", "Turks Caicos Islands": "TCA", "Tuvalu": "TUV",
        "U.S. Minor Outlying Islands": "UMI",
        "Virgin Islands": "VIR", "Uganda": "UGA",
        "Ukraine": "UKR", "United Arab Emirates": "ARE", "United Kingdom": "GBR",
        "United States": "USA", "Uruguay": "URY", "Uzbekistan": "UZB", "Vanuatu": "VUT", "Vatican": "VAT",
        "Venezuela": "VEN",
        "Vietnam": "VNM", "Wallis Futuna": "WLF",
        "Western Sahara": "ESH", "Yemen": "YEM", "Zambia": "ZMB", "Zimbabwe": "ZWE",
        # Alternative names. The repeated "United States" and "Britain"
        # entries that used to sit here were exact duplicates of keys above.
        "UK": "GBR", "USA": "USA", "America": "USA", "Palestinian Territories": "PSE",
        "Congo Brazzaville": "COG", "DRC": "COD", "Congo Kinshasa": "COD", "Wales": "GBR",
        "Scotland": "GBR",
    }

    return cts
def other_vectors():
    """
    Define more {placename : iso} mappings to improve performance of vector-based
    country picking. An easy hack to force a placename to resolve to a defined country
    would be to add it to this list.

    Returns
    -------
    other_vecs: dictionary
        Example: "Ohio" : "USA"
    """
    # We want the advantage of having more defined vector terms to help
    # matching, but we also want to make sure that when we invert the
    # dictionary for labeling, each ISO code gets resolved to a single country
    # name, as opposed to an alternative name, city, or state.
    other_vecs = {
        # US states ("Georgia" is deliberately left out: it is also a country)
        "Alabama": "USA", "Alaska": "USA", "Arizona": "USA", "Arkansas": "USA",
        "California": "USA", "Colorado": "USA", "Connecticut": "USA", "Delaware": "USA",
        "Florida": "USA",
        "Hawaii": "USA", "Idaho": "USA",
        "Illinois": "USA", "Indiana": "USA", "Iowa": "USA", "Kansas": "USA",
        "Kentucky": "USA", "Louisiana": "USA", "Maine": "USA",
        "Maryland": "USA", "Massachusetts": "USA", "Michigan": "USA",
        "Minnesota": "USA", "Mississippi": "USA", "Missouri": "USA",
        "Montana": "USA", "Nebraska": "USA", "Nevada": "USA", "New Hampshire": "USA",
        "New Jersey": "USA", "New Mexico": "USA", "New York": "USA",
        "North Carolina": "USA", "North Dakota": "USA", "Ohio": "USA",
        "Oklahoma": "USA", "Oregon": "USA", "Pennsylvania": "USA",
        "Rhode Island": "USA", "South Carolina": "USA", "South Dakota": "USA",
        "Tennessee": "USA", "Texas": "USA", "Utah": "USA",
        "Vermont": "USA", "Virginia": "USA", "Washington": "USA",
        "West Virginia": "USA", "Wisconsin": "USA", "Wyoming": "USA",
        # cities
        "Beijing": "CHN", "Chicago": "USA",
        "Tbilisi": "GEO", "Gaza": "PSE",
    }
    return other_vecs


def make_skip_list(cts):
    """
    Return hand-defined set of place names to skip and not attempt to geolocate. If users would like to exclude
    country names, this would be the function to do it with.

    Parameters
    ----------
    cts: dictionary
        {"country" : "iso"} mapping. Currently unused: country names are not
        skipped. It is kept in the signature so callers do not change.

    Returns
    -------
    skip_list: set
        Region / direction terms and known spaCy false positives.
    """
    # maybe make these non-country searches but don't discard, at least for
    # some (esp. bodies of water)
    special_terms = {
        "Europe", "West", "the West", "South Pacific", "Gulf of Mexico", "Atlantic",
        "the Black Sea", "Black Sea", "North America", "Mideast", "Middle East",
        "the Middle East", "Asia", "the Caucasus", "Africa",
        "Central Asia", "Balkans", "Eastern Europe", "Arctic", "Ottoman Empire",
        "Asia-Pacific", "East Asia", "Horn of Africa", "Americas",
        "North Africa", "the Strait of Hormuz", "Mediterranean", "East", "North",
        "South", "Latin America", "Southeast Asia", "Western Pacific", "South Asia",
        "Persian Gulf", "Central Europe", "Western Hemisphere", "Western Europe",
        "European Union (E.U.)", "EU", "European Union", "E.U.",
        "Caribbean", "US", "U.S.", "West Africa", "Western Countries",
    }

    # Some words are recurring spacy problems...
    spacy_problems = {"Kurd", "Qur'an"}

    return special_terms | spacy_problems


def country_list_nlp(cts):
    """
    NLP countries so we can use for vector comparisons.

    Runs the module-level spaCy pipeline ``nlp`` over every key of ``cts``
    and returns the results as a list, in the dictionary's iteration order.
    """
    return [nlp(country_name) for country_name in cts]


def make_country_nationality_list(cts, ct_file):
    """
    Combine list of countries and list of nationalities.

    Parameters
    ----------
    cts: dictionary
        {"country" : "iso"} mapping.
    ct_file: string
        path to a CSV with ``nationality`` and ``alpha_3_code`` columns.

    Returns
    -------
    both_codes: dictionary
        nationality -> code entries followed by the ``cts`` entries. When a
        key appears in both, the value from ``cts`` wins.
    """
    countries = pd.read_csv(ct_file)
    nationality = dict(zip(countries.nationality, countries.alpha_3_code))
    both_codes = {**nationality, **cts}
    return both_codes
def make_inv_cts(cts):
    """
    cts is e.g. {"Germany" : "DEU"}. inv_cts is the inverse: {"DEU" : "Germany"}

    When several names share a code, the first name in ``cts`` iteration
    order is kept and later ones are ignored.
    """
    inv_ct = {}
    for name, iso in cts.items():
        # setdefault only stores the name if the code has not been seen yet
        inv_ct.setdefault(iso, name)
    return inv_ct


def read_in_admin1(filepath):
    """
    Small helper function to read in a admin1 code <--> admin1 name document.

    Parameters
    ----------
    filepath: string
               path to the admin1 mapping JSON. This file is usually
               mordecai/data/admin1CodesASCII.json

    Returns
    -------
    admin1_dict: dictionary
                 keys are country + admin1codes, values are names
                 Example: "US.OK" : "Oklahoma"
                 Example: "SE.21": "Uppsala"
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default locale encoding.
    with open(filepath, encoding="utf-8") as admin1file:
        admin1_dict = json.load(admin1file)
    return admin1_dict


def structure_results(res):
    """
    Format Elasticsearch result as Python dictionary.

    Parameters
    ----------
    res: iterable
         hits, each supporting ``hit[key]`` lookup for every field listed in
         ``keys`` below. A hit missing one of those fields raises the
         lookup's own error.

    Returns
    -------
    out: dictionary
         {'hits': {'hits': [...]}} with one plain dict per hit, holding only
         the fields listed in ``keys``.
    """
    keys = ['admin1_code', 'admin2_code', 'admin3_code', 'admin4_code',
            'alternativenames', 'asciiname', 'coordinates',
            'country_code2', 'country_code3',
            'feature_class', 'feature_code', 'geonameid',
            'modification_date', 'name', 'population']
    hits = [{k: hit[k] for k in keys} for hit in res]
    return {'hits': {'hits': hits}}
250 | """ 251 | kwargs = dict( 252 | hosts=hosts or ['localhost'], 253 | port=port or 9200, 254 | use_ssl=use_ssl, 255 | ) 256 | if auth: 257 | kwargs.update(http_auth=auth) 258 | 259 | CLIENT = Elasticsearch(**kwargs) 260 | S = Search(using=CLIENT, index="geonames") 261 | return S 262 | 263 | def check_geonames_date(conn): 264 | r = Q("match", geonameid='4943351') 265 | result = conn.query(r).execute() 266 | output = structure_results(result) 267 | return output['hits']['hits'][0]['modification_date'] 268 | -------------------------------------------------------------------------------- /paper/mordecai_geoparsing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openeventdata/mordecai/9d37110f6cd1275852548fc53fd7a21bb77593f9/paper/mordecai_geoparsing.png -------------------------------------------------------------------------------- /paper/paper.bib: -------------------------------------------------------------------------------- 1 | @article{mikolov2013efficient, 2 | title={Efficient estimation of word representations in vector space}, 3 | author={Mikolov, Tomas and Chen, Kai and Corrado, Greg and Dean, Jeffrey}, 4 | journal={arXiv preprint arXiv:1301.3781}, 5 | year={2013} 6 | } 7 | 8 | 9 | @online{geonames, 10 | author = {Geonames}, 11 | title = {Geonames}, 12 | year = 2016, 13 | url = {http://geonames.org}, 14 | urldate = {2016-09-08} 15 | } 16 | -------------------------------------------------------------------------------- /paper/paper.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Mordecai: Full Text Geoparsing and Event Geocoding" 3 | tags: 4 | - geocoding 5 | - geoparsing 6 | - natural language processing 7 | - Python 8 | - word embeddings 9 | authors: 10 | - name: Andrew Halterman 11 | orcid: 0000-0001-9716-9555 12 | affiliation: 1 13 | affiliations: 14 | - name: MIT 15 | index: 1 16 | date: 8 December 2017 17 | bibliography: 
paper.bib 18 | --- 19 | 20 | # Summary 21 | 22 | Mordecai is a new full-text geoparsing system that extracts place names from 23 | text, resolves them to their correct entries in a gazetteer, and returns 24 | structured geographic information for the resolved place name. Geoparsing can 25 | be used in a number of tasks, including media monitoring, improved information 26 | extraction, document annotation for search, and geolocating text-derived event 27 | data, which is the task for which it was built. Mordecai was created to provide 28 | several features missing in existing geoparsers, including better 29 | handling of non-US place names, easy and portable setup and use through a Docker 30 | REST architecture, and easy customization with Python and swappable named 31 | entity recognition systems. Mordecai's key technical innovations are in a 32 | language-agnostic architecture that uses word2vec [@mikolov2013efficient] for 33 | inferring the correct country for a set of locations in a piece of text and 34 | easily changed named entity recognition models. As a gazetteer, it uses 35 | Geonames [@geonames] in a custom-built Elasticsearch database. 
36 | 37 | # References 38 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | editdistance>=0.5.3 2 | elasticsearch==5.4.0 3 | elasticsearch-dsl==5.3.0 4 | h5py>=2.10.0 5 | pandas>=0.24.2 6 | spacy>=2.3,<3.0 7 | tensorflow>=2.2.0 8 | tqdm>=4.28.1 9 | numpy>=1.12 10 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup(name='mordecai', 4 | version='2.1.0', 5 | description='Full text geoparsing and event geocoding', 6 | url='http://github.com/openeventdata/mordecai/', 7 | author='Andy Halterman', 8 | author_email='ahalterman0@gmail.com', 9 | license='MIT', 10 | packages=['mordecai'], 11 | keywords = ['geoparsing', 'nlp', 'geocoding', 'toponym resolution'], 12 | install_requires = ['editdistance>=0.5.3', 13 | 'elasticsearch==5.4.0', 14 | 'elasticsearch-dsl==5.3.0', 15 | 'h5py>=2.10.0', 16 | 'pandas>=0.24.2', 17 | 'spacy>=2.3,<3.0', 18 | 'tensorflow>=2.2.0', 19 | 'tqdm>=4.28.1', 20 | 'numpy>=1.12'], 21 | dependency_links=['https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.3.1/en_core_web_lg-2.3.1.tar.gz'], 22 | include_package_data=True, 23 | package_data = {'data': ['admin1CodesASCII.json', 24 | 'countries.json', 25 | 'nat_df.csv', 26 | 'stopword_country_names.json'], 27 | 'models' : ['country_model.h5', 28 | 'rank_model.h5']} 29 | ) 30 | -------------------------------------------------------------------------------- /train/train_country_model.py: 
def entry_to_matrix(prodigy_entry):
    """
    Take in a line from the labeled json and return a vector of labels and a matrix of features
    for training.

    Two ways to get 0s:
    - marked as false by user
    - generated automatically from other entries when guess is correct

    Uses the module-level ``nlp`` pipeline and ``geo`` geoparser.

    Parameters
    ----------
    prodigy_entry: dict
        One annotation record. The fields read are 'text', 'meta' -> 'word',
        'label' (a key of ``geo.cts``) and 'answer'.

    Returns
    -------
    (answer_x, answer_binary) or None
        For an "accept" answer: the full feature matrix, and an int vector
        that is 1 on the rows whose label equals the annotated country's ISO
        code and 0 elsewhere.
        For a "reject" answer: only the feature row(s) of the presented
        country, with the single label 0. Nothing is known about the other
        candidates, so they are dropped.
        None is returned when the entry cannot be used: the annotated word
        is not among the extracted entities, the answer is neither "accept"
        nor "reject", or features and labels end up with different row
        counts. The caller's tuple unpacking then fails and counts the entry
        as an error.
    """
    doc = nlp(prodigy_entry['text'])
    geo_proced = geo.process_text(doc, require_maj=False)

    # Find the first extracted entity whose text matches the Prodigy entry.
    # The original read an undefined name (`entry`) here, so every call
    # raised NameError. It also fell back to entity 0 on no match, which
    # would pair the annotation with the wrong entity.
    target_word = prodigy_entry['meta']['word']
    geo_proc = next((gp for gp in geo_proced if gp['word'] == target_word), None)
    if geo_proc is None:
        return None

    iso = geo.cts[prodigy_entry['label']]  # convert country text label to ISO
    feat = geo.features_to_matrix(geo_proc)
    answer_x = feat['matrix']
    label = np.asarray(feat['labels'])
    is_presented = label == iso  # mask of rows for the annotated country

    answer = prodigy_entry['answer']
    if answer == "accept":
        answer_binary = is_presented.astype('int')
    elif answer == "reject":
        # All we know is that the label that was presented is wrong:
        # keep just that row of the feature matrix and force the label to 0.
        answer_x = answer_x[is_presented, :]
        answer_binary = np.asarray([0])
    else:
        return None

    # e.g. a rejected country that is not among the candidate rows
    if answer_x.shape[0] != answer_binary.shape[0]:
        return None
    return (answer_x, answer_binary)
metrics=['accuracy']) 149 | 150 | model.fit(X_train, y_train, 151 | epochs=15, 152 | batch_size=128) 153 | 154 | score = model.evaluate(X_test, y_test, batch_size=12) 155 | print(score) 156 | 157 | y_predicted = model.predict(X_test) 158 | 159 | print(sklearn.metrics.classification_report(y_pred = y_predicted>0.5, y_true = y_test)) 160 | 161 | model.save("country_model_updated_script.h5") 162 | 163 | -------------------------------------------------------------------------------- /train/train_ranker.py: -------------------------------------------------------------------------------- 1 | import plac 2 | import mordecai 3 | import random 4 | import jsonlines 5 | from tqdm import tqdm 6 | import re 7 | import numpy as np 8 | import editdistance 9 | import pandas as pd 10 | import os 11 | import json 12 | import pickle 13 | 14 | import keras 15 | from keras.models import Sequential 16 | from keras.layers import Dense, Dropout, Activation 17 | from keras.optimizers import SGD 18 | from keras.callbacks import EarlyStopping, ModelCheckpoint 19 | import sklearn 20 | 21 | 22 | geo = mordecai.Geoparser() 23 | # Here's the format of the Prodigy labeled place picking data: 24 | # ``` 25 | # {"text":"On July 15, state security services in Idleb arrested Mahmoud Barish, an opposition activist, for his dealings with the Damascus Declaration.", 26 | # "spans":[{"start":39,"end":44}], 27 | # "options":[ 28 | # {"id":1,"text":"\"Idlib District\", a second-order administrative division in SYR, id: 169388"}, 29 | # {"id":2,"text":"\"Idlib\", a seat of a first-order administrative division in SYR, id: 169389, 30 | # {"id":4,"text":"None/Other/Incorrect"}], 31 | # "_input_hash":1212285619,"_task_hash":-1410881973, 32 | # "accept":[2], 33 | # "answer":"accept"} 34 | # ``` 35 | 36 | def ingest_prodigy_ranks(filename): 37 | """ 38 | Ingest Prodigy-labeled Mordecai data for place picking and produce training data 39 | for Keras. 
40 | 41 | For each labeled example, match it to the output of Mordecai, and make sure there's an accepted answer 42 | from Prodigy. 43 | 44 | Parameters 45 | ---------- 46 | filename: filepath, location of Prodigy data 47 | 48 | Returns 49 | ------- 50 | X: list of matrices, Mordecai features. 51 | Each element in the list is a matrix of features for ranking (so 5 rows) 52 | Y: list of arrays of length 5, indicating correct location. 53 | """ 54 | with jsonlines.open(filename) as reader: 55 | X = [] 56 | Y = [] 57 | i = 0 58 | accumulate = [] 59 | for obj in reader: 60 | i = i+1 61 | if i % 250 == 0: 62 | print(i) 63 | # run the text through mordecai 64 | proced = geo.geoparse(obj['text'], verbose = True,) 65 | for proc in proced: 66 | # for each result, see if the spans overlap the labeled spans 67 | if proc['spans'][0]['start'] != obj['spans'][0]['start']: 68 | # make sure we have the right entity 69 | continue 70 | ent_word = proc['word'] 71 | if not ent_word: 72 | continue 73 | # if it all works, take the results. 
74 | results = geo.query_geonames_country(ent_word, proc['country_predicted']) 75 | 76 | if obj['answer'] == 'accept': 77 | #start_char = obj['spans']['start'] 78 | # get the geonames ids of the options 79 | geoids = [re.findall("id: (.+)", i['text']) for i in obj['options']] 80 | geoids = [i[0] for i in geoids if i] 81 | # get the correct of if any 82 | try: 83 | correct = obj['accept'][0] 84 | correct_id = str(geoids[correct - 1]) 85 | except (KeyError, IndexError): 86 | continue 87 | 88 | elif obj['answer'] != 'accept': 89 | correct_id = 4 90 | 91 | try: 92 | fl, meta = geo.features_for_rank(proc, results) 93 | # just look at the top 4 results by deterministic rule 94 | # This matches what went into the annotation task 95 | choices, sorted_meta, fl_subset = geo.format_for_prodigy(fl, meta, ent_word, return_feature_subset=True) 96 | result_ids = np.asarray([m['geonameid'] for m in sorted_meta]) 97 | if obj['answer'] == 'accept': 98 | labels = result_ids == correct_id 99 | elif obj['answer'] == 'reject': 100 | # give rejects their own special category 101 | # reject means the country was right but none of the options were. 
def prep_data(X, Y, train_split):
    """
    Stack per-example features and labels and split them into train / test sets.

    Parameters
    ----------
    X: list of arrays, one feature vector per example
    Y: list of arrays, one label vector per example
    train_split: float
        Each example goes to the training set when a uniform draw in [0, 1)
        is below this value, so it is the expected training fraction.

    Returns
    -------
    X_train, X_test, y_train, y_test: numpy arrays
        Labels are cast to int.

    Notes
    -----
    Seeds numpy's global random generator with a fixed value, so the same
    input always gives the same split. The shapes are printed for inspection.
    """
    X_stack = np.vstack(X)
    Y_stack = np.vstack(Y).astype(int)

    print("Using a cutpoint of ", train_split)
    np.random.seed(73071)
    msk = np.random.rand(X_stack.shape[0]) < train_split
    # Index the stacked array directly. The old DataFrame round trip relied
    # on DataFrame.as_matrix(), which no longer exists in pandas >= 1.0.
    X_train = X_stack[msk]
    X_test = X_stack[~msk]
    y_train = Y_stack[msk]
    y_test = Y_stack[~msk]

    for part in [X_train, X_test, y_train, y_test]:
        print(part.shape)
    return X_train, X_test, y_train, y_test
def train_model(X_train, X_test, y_train, y_test, save_file):
    """
    Build and fit the ranking network.

    The network is three 128-unit ReLU layers, each followed by dropout of
    0.3, and a softmax output with one unit per label column.

    Parameters
    ----------
    X_train, y_train: numpy arrays used for fitting. 20% is held out by Keras
        as validation data (``validation_split=0.2``).
    X_test, y_test: accepted for symmetry with ``prep_data``; not used here.
    save_file: path where the checkpoint callback saves the full model
        whenever validation loss improves.

    Returns
    -------
    model: the fitted Keras model (the final-epoch weights, which may differ
        from the best checkpoint written to ``save_file``).
    """
    model = Sequential()
    model.add(Dense(128, activation='relu', input_shape=(X_train.shape[1],)))
    model.add(Dropout(0.3))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(y_train.shape[1], activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])

    callbacks = [
        EarlyStopping(monitor='val_loss', patience=50),
        ModelCheckpoint(save_file, monitor='val_loss',
                        verbose=0, save_best_only=True,
                        save_weights_only=False),
    ]

    model.fit(X_train, y_train,
              epochs=100,
              validation_split=0.2,
              callbacks=callbacks,
              batch_size=16)

    return model


@plac.annotations(
    input_file=("Location of Prodigy labeled output", "option", "i", str),
    train_split=("Fraction of data to use for training vs. validation", "option", "s", float),
    use_cache=("Use cached data?", "flag", "c"))
def main(input_file, train_split, use_cache):
    """
    Train the ranking model from Prodigy annotations and print its test scores.

    Features and labels are either recomputed from ``input_file`` and cached
    as pickles in the working directory, or read back from those pickles when
    ``use_cache`` is set.
    """
    save_file = "rank_model_new.h5"
    # One pair of names for both writing and reading the cache. The reader
    # used to look for "ranker_y.pkl" while the writer produced
    # "ranker_Y.pkl", so --use-cache failed on case-sensitive filesystems.
    x_cache = "ranker_X.pkl"
    y_cache = "ranker_Y.pkl"
    if use_cache:
        print("Using saved data...")
        # NOTE: pickle is only safe here because these files are written by
        # this same script; do not point it at untrusted files.
        with open(x_cache, "rb") as f:
            X = pickle.load(f)
        with open(y_cache, "rb") as f:
            Y = pickle.load(f)
    else:
        print("Recalculating data...")
        X, Y = ingest_prodigy_ranks(input_file)
        with open(x_cache, "wb") as f:
            pickle.dump(X, f)
        with open(y_cache, "wb") as f:
            pickle.dump(Y, f)
    X_train, X_test, y_train, y_test = prep_data(X, Y, train_split)
    model = train_model(X_train, X_test, y_train, y_test, save_file)
    score = model.evaluate(X_test, y_test)
    print(score)

    y_predicted = model.predict(X_test)
    print(sklearn.metrics.classification_report(y_pred=y_predicted > 0.5, y_true=y_test))


if __name__ == '__main__':
    plac.call(main)