├── example_preprocess.py ├── LICENSE.txt ├── example.py └── readme.md /example_preprocess.py: -------------------------------------------------------------------------------- 1 | import os, sys, logging, traceback, codecs, datetime, copy, time, ast, math, re, random, shutil, json 2 | import soton_corenlppy, geoparsepy 3 | 4 | LOG_FORMAT = ('%(message)s') 5 | logger = logging.getLogger( __name__ ) 6 | logging.basicConfig( level=logging.INFO, format=LOG_FORMAT ) 7 | logger.info('logging started') 8 | 9 | dictFocusAreaSpec = { 10 | 'southampton' : { 11 | 'focus_area_id' : 'southampton', 12 | 'admin': ['southampton','south east england', 'united kingdom'], 13 | 'admin_lookup_table' : 'global_cities_admin', 14 | } 15 | } 16 | 17 | dictGlobalSpec = None 18 | 19 | dictGeospatialConfig = geoparsepy.geo_parse_lib.get_geoparse_config( 20 | lang_codes = ['en'], 21 | logger = logger, 22 | whitespace = u'"\u201a\u201b\u201c\u201d()', 23 | sent_token_seps = ['\n','\r\n', '\f', u'\u2026'], 24 | punctuation = """,;\/:+-#~&*=!?""", 25 | ) 26 | 27 | dbHandlerPool = {} 28 | dbHandlerPool['admin'] = soton_corenlppy.PostgresqlHandler.PostgresqlHandler( 'postgres', 'postgres', 'localhost', 5432, 'openstreetmap' ) 29 | dbHandlerPool['point'] = soton_corenlppy.PostgresqlHandler.PostgresqlHandler( 'postgres', 'postgres', 'localhost', 5432, 'openstreetmap' ) 30 | dbHandlerPool['poly'] = soton_corenlppy.PostgresqlHandler.PostgresqlHandler( 'postgres', 'postgres', 'localhost', 5432, 'openstreetmap' ) 31 | dbHandlerPool['line'] = soton_corenlppy.PostgresqlHandler.PostgresqlHandler( 'postgres', 'postgres', 'localhost', 5432, 'openstreetmap' ) 32 | 33 | for strFocusArea in dictFocusAreaSpec.keys() : 34 | logger.info( 'starting focus area ' + strFocusArea ) 35 | jsonFocusArea = dictFocusAreaSpec[strFocusArea] 36 | geoparsepy.geo_preprocess_lib.create_preprocessing_tables( jsonFocusArea, dbHandlerPool['admin'], 'public', delete_contents = False, logger = logger ) 37 | dictNewLocations = 
geoparsepy.geo_preprocess_lib.execute_preprocessing_focus_area( jsonFocusArea, dbHandlerPool, 'public', logger = logger ) 38 | logger.info( 'finished focus area ' + strFocusArea ) 39 | logger.info( 'location id range : ' + repr(dictNewLocations) ) 40 | 41 | dbHandlerPool['admin'].close() 42 | dbHandlerPool['point'].close() 43 | dbHandlerPool['poly'].close() 44 | dbHandlerPool['line'].close() 45 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2020, University of Southampton 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | 3. All advertising and publication materials containing results from the use of this software 15 | must acknowledge the University of Southampton and cite 16 | the following paper which describes the software: 17 | 18 | Middleton, S.E. Kordopatis-Zilos, G. Papadopoulos, S. Kompatsiaris, Y. "Location Extraction from Social Media: Geoparsing, Location Disambiguation, and Geotagging", ACM Transactions on Information Systems (TOIS) 36, 4, Article 40 (June 2018), 27 pages. DOI: https://doi.org/10.1145/3202662 19 | 20 | 4. Neither the name of the University of Southampton nor the 21 | names of its contributors may be used to endorse or promote products 22 | derived from this software without specific prior written permission. 23 | 24 | 5. 
This software should only be used for research, education or evaluation purposes. A free 25 | commercial license is available on request to {sem03}@soton.ac.uk. 26 | 27 | THIS SOFTWARE IS PROVIDED BY University of Southampton ''AS IS'' AND ANY 28 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 29 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 30 | DISCLAIMED. IN NO EVENT SHALL University of Southampton BE LIABLE FOR ANY 31 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 32 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 33 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 34 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 35 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 36 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 | -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | import os, sys, logging, traceback, codecs, datetime, copy, time, ast, math, re, random, shutil, json 2 | import soton_corenlppy, geoparsepy 3 | 4 | LOG_FORMAT = ('%(message)s') 5 | logger = logging.getLogger( __name__ ) 6 | logging.basicConfig( level=logging.INFO, format=LOG_FORMAT ) 7 | logger.info('logging started') 8 | 9 | dictGeospatialConfig = geoparsepy.geo_parse_lib.get_geoparse_config( 10 | lang_codes = ['en'], 11 | logger = logger, 12 | whitespace = u'"\u201a\u201b\u201c\u201d()', 13 | sent_token_seps = ['\n','\r\n', '\f', u'\u2026'], 14 | punctuation = """,;\/:+-#~&*=!?""", 15 | ) 16 | 17 | databaseHandle = soton_corenlppy.PostgresqlHandler.PostgresqlHandler( 'postgres', 'postgres', 'localhost', 5432, 'openstreetmap', 600 ) 18 | 19 | dictLocationIDs = {} 20 | listFocusArea=[ 'global_cities', 'europe_places', 'north_america_places', 
'uk_places' ] 21 | for strFocusArea in listFocusArea : 22 | dictLocationIDs[strFocusArea + '_admin'] = [-1,-1] 23 | dictLocationIDs[strFocusArea + '_poly'] = [-1,-1] 24 | dictLocationIDs[strFocusArea + '_line'] = [-1,-1] 25 | dictLocationIDs[strFocusArea + '_point'] = [-1,-1] 26 | 27 | cached_locations = geoparsepy.geo_preprocess_lib.cache_preprocessed_locations( databaseHandle, dictLocationIDs, 'public', dictGeospatialConfig ) 28 | logger.info( 'number of cached locations = ' + str(len(cached_locations)) ) 29 | 30 | databaseHandle.close() 31 | 32 | indexed_locations = geoparsepy.geo_parse_lib.calc_inverted_index( cached_locations, dictGeospatialConfig ) 33 | logger.info( 'number of indexed phrases = ' + str(len(indexed_locations.keys())) ) 34 | 35 | indexed_geoms = geoparsepy.geo_parse_lib.calc_geom_index( cached_locations ) 36 | logger.info( 'number of indexed geoms = ' + str(len(indexed_geoms.keys())) ) 37 | 38 | osmid_lookup = geoparsepy.geo_parse_lib.calc_osmid_lookup( cached_locations ) 39 | 40 | dictGeomResultsCache = {} 41 | 42 | listText = [ 43 | u'hello New York, USA its Bill from Bassett calling', 44 | u'live on the BBC Victoria Derbyshire is visiting Derbyshire for an exclusive UK interview', 45 | ] 46 | 47 | listTokenSets = [] 48 | listGeotags = [] 49 | for nIndex in range(len(listText)) : 50 | strUTF8Text = listText[ nIndex ] 51 | listToken = soton_corenlppy.common_parse_lib.unigram_tokenize_text( text = strUTF8Text, dict_common_config = dictGeospatialConfig ) 52 | listTokenSets.append( listToken ) 53 | listGeotags.append( None ) 54 | 55 | listMatchSet = geoparsepy.geo_parse_lib.geoparse_token_set( listTokenSets, indexed_locations, dictGeospatialConfig ) 56 | 57 | strGeom = 'POINT(-1.4052268 50.9369033)' 58 | listGeotags[0] = strGeom 59 | 60 | listMatchGeotag = geoparsepy.geo_parse_lib.reverse_geocode_geom( [strGeom], indexed_geoms, dictGeospatialConfig ) 61 | if len( listMatchGeotag[0] ) > 0 : 62 | for tupleOSMIDs in listMatchGeotag[0] : 63 | 
setIndexLoc = osmid_lookup[ tupleOSMIDs ] 64 | for nIndexLoc in setIndexLoc : 65 | strName = cached_locations[nIndexLoc][1] 66 | logger.info( 'Reverse geocoded geotag location [index ' + str(nIndexLoc) + ' osmid ' + repr(tupleOSMIDs) + '] = ' + strName ) 67 | 68 | for nIndex in range(len(listMatchSet)) : 69 | logger.info( 'Text = ' + listText[nIndex] ) 70 | listMatch = listMatchSet[ nIndex ] 71 | strGeom = listGeotags[ nIndex ] 72 | setOSMID = set([]) 73 | for tupleMatch in listMatch : 74 | nTokenStart = tupleMatch[0] 75 | nTokenEnd = tupleMatch[1] 76 | tuplePhrase = tupleMatch[3] 77 | for tupleOSMIDs in tupleMatch[2] : 78 | setIndexLoc = osmid_lookup[ tupleOSMIDs ] 79 | for nIndexLoc in setIndexLoc : 80 | logger.info( 'Location [index ' + str(nIndexLoc) + ' osmid ' + repr(tupleOSMIDs) + ' @ ' + str(nTokenStart) + ' : ' + str(nTokenEnd) + '] = ' + ' '.join(tuplePhrase) ) 81 | break 82 | listLocMatches = geoparsepy.geo_parse_lib.create_matched_location_list( listMatch, cached_locations, osmid_lookup ) 83 | geoparsepy.geo_parse_lib.filter_matches_by_confidence( listLocMatches, dictGeospatialConfig, geom_context = strGeom, geom_cache = dictGeomResultsCache ) 84 | geoparsepy.geo_parse_lib.filter_matches_by_geom_area( listLocMatches, dictGeospatialConfig ) 85 | geoparsepy.geo_parse_lib.filter_matches_by_region_of_interest( listLocMatches, [-148838, -62149], dictGeospatialConfig ) 86 | setOSMID = set([]) 87 | for nMatchIndex in range(len(listLocMatches)) : 88 | nTokenStart = listLocMatches[nMatchIndex][1] 89 | nTokenEnd = listLocMatches[nMatchIndex][2] 90 | tuplePhrase = listLocMatches[nMatchIndex][3] 91 | strGeom = listLocMatches[nMatchIndex][4] 92 | tupleOSMID = listLocMatches[nMatchIndex][5] 93 | dictOSMTags = listLocMatches[nMatchIndex][6] 94 | if not tupleOSMID in setOSMID : 95 | setOSMID.add( tupleOSMID ) 96 | listNameMultilingual = geoparsepy.geo_parse_lib.calc_multilingual_osm_name_set( dictOSMTags, dictGeospatialConfig ) 97 | strNameList = ';'.join( 
listNameMultilingual ) 98 | strOSMURI = geoparsepy.geo_parse_lib.calc_OSM_uri( tupleOSMID, strGeom ) 99 | logger.info( 'Disambiguated Location [index ' + str(nMatchIndex) + ' osmid ' + repr(tupleOSMID) + ' @ ' + str(nTokenStart) + ' : ' + str(nTokenEnd) + '] = ' + strNameList + ' : ' + strOSMURI ) 100 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # geoparsepy project 2 | 3 | geoparsepy is a Python geoparsing library that will extract and disambiguate locations from text. It uses a local OpenStreetMap database which allows very high and unlimited geoparsing throughput, unlike approaches that use a third-party geocoding service (e.g. Google Geocoding API). 4 | 5 | [geoparsepy PyPI](https://pypi.org/project/geoparsepy/) 6 | 7 | Geoparsing is based on named entity matching against OpenStreetMap (OSM) locations. All locations with names that match tokens will be selected from a target text sentence. This will result in a set of OSM locations, all with a common name or name variant, for each token in the text. Geoparsing includes the following features: 8 | * **token expansion** using location name variants (i.e. OSM multi-lingual names, short names and acronyms) 9 | * **token expansion** using location type variants (e.g. street, st.) 10 | * **token filtering** of single token location names against WordNet (non-nouns), language specific stoplists and people's first names (nltk.corpus.names.words()) to reduce false positive matches 11 | * **prefix checking** when matching in case a first name prefixes a location token(s) to avoid matching people's full names as locations (e.g. Victoria Derbyshire != Derbyshire) 12 | 13 | Location disambiguation is the process of choosing which of a set of possible OSM locations, all with the same name, is the best match. 
Location disambiguation is based on an evidential approach, with evidential features detailed below in order of importance: 14 | * **token subsumption**, rejecting smaller phrases in favour of larger ones (e.g. 'New York' will prefer [New York, USA] to [York, UK]) 15 | * **nearby parent region**, preferring locations with a parent region also appearing within a semantic distance (e.g. 'New York in USA' will prefer [New York, USA] to [New York, BO, Sierra Leone]) 16 | * **nearby locations**, preferring locations with close by or overlapping locations within a semantic distance (e.g. 'London St and Commercial Road' will select from road name choices with the same name based on spatial proximity) 17 | * **nearby geotag**, preferring locations that are close by or overlapping a geotag 18 | * **general before specific**, rejecting locations with a higher admin level (or no admin level at all) compared to locations with a lower admin level (e.g. 'New York' will prefer [New York, USA] to [New York, BO, Sierra Leone]) 19 | 20 | Currently the following languages are supported: 21 | * English, French, German, Italian, Portuguese, Russian, Ukrainian 22 | * All other languages will work but there will be no language specific token expansion available 23 | 24 | geoparsepy works with Python 3.7 and has been tested on Windows 10 and Ubuntu 18.04 LTS. 25 | 26 | This geoparsing algorithm uses a large memory footprint (e.g. 12 Gbytes RAM for global cities), RAM size proportional to the number of cached locations, to maximize matching speed. It can be naively parallelized, with multiple geoparse processes loaded with different sets of locations and the geoparse results aggregated in a last process where location disambiguation is applied. This approach has been validated across an Apache Storm cluster. 27 | 28 | The software is copyright 2020 [University of Southampton](https://www.ecs.soton.ac.uk/people/sem03), UK. 
It was created over a multi-year period under EU FP7 projects TRIDEC (258723), REVEAL (610928), InnovateUK project LPLP (104875) and ESRC project FloraGuard (ES/R003254/1). This software can only be used for research, education or evaluation purposes. A free commercial license is available on request to {sem03}@soton.ac.uk. The University of Southampton is open to discussions regarding [collaboration](https://www.southampton.ac.uk/~sem03/engagement.html) in future research projects relating to this work. 29 | 30 | Feature suggestions and/or bug reports can be sent to {sem03}@soton.ac.uk. We do not however offer any software support beyond the examples and API documentation already provided. 31 | 32 | 33 | # Scientific publications 34 | Middleton, S.E. Middleton, L. Modafferi, S. [Real-time Crisis Mapping of Natural Disasters using Social Media](http://eprints.soton.ac.uk/370581/), Intelligent Systems, IEEE , vol.29, no.2, pp.9,17, Mar.-Apr. 2014 35 | 36 | Middleton, S.E. Krivcovs, V. [Geoparsing and Geosemantics for Social Media: Spatio-Temporal Grounding of Content Propagating Rumours to support Trust and Veracity Analysis during Breaking News](http://eprints.soton.ac.uk/390820/), ACM Transactions on Information Systems (TOIS), 34, 3, Article 16 (April 2016), 26 pages. DOI=10.1145/2842604 37 | 38 | Middleton, S.E. Kordopatis-Zilos, G. Papadopoulos, S. Kompatsiaris, Y. [Location Extraction from Social Media: Geoparsing, Location Disambiguation, and Geotagging](https://www.southampton.ac.uk/~sem03/middleton_tois_2018.pdf), ACM Transactions on Information Systems (TOIS) 36, 4, Article 40 (June 2018), 27 pages. DOI: https://doi.org/10.1145/3202662. Presented at SIGIR 2019 39 | 40 | A benchmark geoparse dataset is also available for free from the University of Southampton on request via email to {sem03}@soton.ac.uk. 
41 | 42 | 43 | # geoparsepy documentation resources 44 | 45 | geoparsepy [API](https://www.southampton.ac.uk/~sem03/geoparsepy/api/index.html) 46 | 47 | geoparsepy example code on [github](https://github.com/stuartemiddleton/geoparsepy) 48 | 49 | # Python libs needed (earlier versions may be suitable but are untested) 50 | 51 | Python libs: psycopg2 >= 2.8, nltk >= 3.4, numpy >= 1.18, shapely >= 1.6, setuptools >= 46, soton-corenlppy>=1.0 52 | 53 | Database: PostgreSQL >= 11.3, PostGIS >= 2.5 54 | 55 | For LINUX deployments the following is needed: 56 | 57 | ``` 58 | sudo apt-get install libgeos-dev libgeos-3.4.2 libpq-dev 59 | ``` 60 | 61 | You will need to download NLTK corpora before running geoparsepy: 62 | 63 | ```python 64 | python 65 | import nltk 66 | nltk.download() 67 | ==> install all or at least stopwords, names and wordnet 68 | ``` 69 | 70 | # Installation 71 | 72 | python3 -m pip install geoparsepy 73 | 74 | # Databases needed for geoparsing 75 | Download pre-processed UTF-8 encoded SQL table dumps from OSM image dated dec 2019. SQL dump is a 1.2 GB tar/zip file created using pg_dump and zipped using 7Zip tool. 
76 | 77 | ``` 78 | download zip file from Google drive https://drive.google.com/file/d/1xyCjQox6gCoN8e0upHHyeMLV-uLirthS/view?usp=sharing 79 | unzip geoparsepy_preprocessed_tables.tar.zip 80 | tar -xvf geoparsepy_preprocessed_tables.tar 81 | ``` 82 | 83 | Connect to PostgreSQL and create the database with the required PostGIS and hstore extensions 84 | 85 | ``` 86 | psql -U postgres 87 | CREATE DATABASE openstreetmap; 88 | CREATE EXTENSION IF NOT EXISTS postgis; 89 | CREATE EXTENSION IF NOT EXISTS fuzzystrmatch; 90 | CREATE EXTENSION IF NOT EXISTS postgis_tiger_geocoder; 91 | CREATE EXTENSION IF NOT EXISTS hstore; 92 | ``` 93 | 94 | Import the precomputed database tables for global cities and places 95 | 96 | ``` 97 | # Linux 98 | psql -U postgres -d openstreetmap -f global_cities.sql 99 | psql -U postgres -d openstreetmap -f uk_places.sql 100 | psql -U postgres -d openstreetmap -f north_america_places.sql 101 | psql -U postgres -d openstreetmap -f europe_places.sql 102 | 103 | # Windows 10 using powershell 104 | & 'C:\Program Files\PostgreSQL\11\bin\psql.exe' -U postgres -d openstreetmap -f global_cities.sql 105 | & 'C:\Program Files\PostgreSQL\11\bin\psql.exe' -U postgres -d openstreetmap -f uk_places.sql 106 | & 'C:\Program Files\PostgreSQL\11\bin\psql.exe' -U postgres -d openstreetmap -f north_america_places.sql 107 | & 'C:\Program Files\PostgreSQL\11\bin\psql.exe' -U postgres -d openstreetmap -f europe_places.sql 108 | 109 | # Linux: if the user 'sem' (the user named in the SQL dump) does not exist, rewrite the dump files to use another existing user name (e.g. sem03) before importing 110 | find * -name \*.sql -exec sed -i "s/TO sem;/TO sem03;/g" {} \; 111 | psql -d openstreetmap -f uk_places.sql 112 | psql -d openstreetmap -f global_cities.sql 113 | psql -d openstreetmap -f north_america_places.sql 114 | psql -d openstreetmap -f europe_places.sql 115 | 116 | ``` 117 | 118 | # Example code geoparse (start here) 119 | 120 | Geoparse some text using the default focus areas in the Postgres database. 
Fully documented example PY file can be found at geoparsepy.example_geoparse.py 121 | note: loading 300,000+ global locations into memory at startup is slow (10 minutes) but subsequently the geoparsing of text is very fast (real-time speeds) 122 | 123 | ```python 124 | import os, sys, logging, traceback, codecs, datetime, copy, time, ast, math, re, random, shutil, json 125 | import soton_corenlppy, geoparsepy 126 | 127 | LOG_FORMAT = ('%(message)s') 128 | logger = logging.getLogger( __name__ ) 129 | logging.basicConfig( level=logging.INFO, format=LOG_FORMAT ) 130 | logger.info('logging started') 131 | 132 | dictGeospatialConfig = geoparsepy.geo_parse_lib.get_geoparse_config( 133 | lang_codes = ['en'], 134 | logger = logger, 135 | whitespace = u'"\u201a\u201b\u201c\u201d()', 136 | sent_token_seps = ['\n','\r\n', '\f', u'\u2026'], 137 | punctuation = """,;\/:+-#~&*=!?""", 138 | ) 139 | 140 | databaseHandle = soton_corenlppy.PostgresqlHandler.PostgresqlHandler( 'postgres', 'postgres', 'localhost', 5432, 'openstreetmap', 600 ) 141 | 142 | dictLocationIDs = {} 143 | listFocusArea=[ 'global_cities', 'europe_places', 'north_america_places', 'uk_places' ] 144 | for strFocusArea in listFocusArea : 145 | dictLocationIDs[strFocusArea + '_admin'] = [-1,-1] 146 | dictLocationIDs[strFocusArea + '_poly'] = [-1,-1] 147 | dictLocationIDs[strFocusArea + '_line'] = [-1,-1] 148 | dictLocationIDs[strFocusArea + '_point'] = [-1,-1] 149 | 150 | cached_locations = geoparsepy.geo_preprocess_lib.cache_preprocessed_locations( databaseHandle, dictLocationIDs, 'public', dictGeospatialConfig ) 151 | logger.info( 'number of cached locations = ' + str(len(cached_locations)) ) 152 | 153 | databaseHandle.close() 154 | 155 | indexed_locations = geoparsepy.geo_parse_lib.calc_inverted_index( cached_locations, dictGeospatialConfig ) 156 | logger.info( 'number of indexed phrases = ' + str(len(indexed_locations.keys())) ) 157 | 158 | indexed_geoms = geoparsepy.geo_parse_lib.calc_geom_index( 
cached_locations ) 159 | logger.info( 'number of indexed geoms = ' + str(len(indexed_geoms.keys())) ) 160 | 161 | osmid_lookup = geoparsepy.geo_parse_lib.calc_osmid_lookup( cached_locations ) 162 | 163 | dictGeomResultsCache = {} 164 | 165 | listText = [ 166 | u'hello New York, USA its Bill from Bassett calling', 167 | u'live on the BBC Victoria Derbyshire is visiting Derbyshire for an exclusive UK interview', 168 | ] 169 | 170 | listTokenSets = [] 171 | listGeotags = [] 172 | for nIndex in range(len(listText)) : 173 | strUTF8Text = listText[ nIndex ] 174 | listToken = soton_corenlppy.common_parse_lib.unigram_tokenize_text( text = strUTF8Text, dict_common_config = dictGeospatialConfig ) 175 | listTokenSets.append( listToken ) 176 | listGeotags.append( None ) 177 | 178 | listMatchSet = geoparsepy.geo_parse_lib.geoparse_token_set( listTokenSets, indexed_locations, dictGeospatialConfig ) 179 | 180 | strGeom = 'POINT(-1.4052268 50.9369033)' 181 | listGeotags[0] = strGeom 182 | 183 | listMatchGeotag = geoparsepy.geo_parse_lib.reverse_geocode_geom( [strGeom], indexed_geoms, dictGeospatialConfig ) 184 | if len( listMatchGeotag[0] ) > 0 : 185 | for tupleOSMIDs in listMatchGeotag[0] : 186 | setIndexLoc = osmid_lookup[ tupleOSMIDs ] 187 | for nIndexLoc in setIndexLoc : 188 | strName = cached_locations[nIndexLoc][1] 189 | logger.info( 'Reverse geocoded geotag location [index ' + str(nIndexLoc) + ' osmid ' + repr(tupleOSMIDs) + '] = ' + strName ) 190 | 191 | for nIndex in range(len(listMatchSet)) : 192 | logger.info( 'Text = ' + listText[nIndex] ) 193 | listMatch = listMatchSet[ nIndex ] 194 | strGeom = listGeotags[ nIndex ] 195 | setOSMID = set([]) 196 | for tupleMatch in listMatch : 197 | nTokenStart = tupleMatch[0] 198 | nTokenEnd = tupleMatch[1] 199 | tuplePhrase = tupleMatch[3] 200 | for tupleOSMIDs in tupleMatch[2] : 201 | setIndexLoc = osmid_lookup[ tupleOSMIDs ] 202 | for nIndexLoc in setIndexLoc : 203 | logger.info( 'Location [index ' + str(nIndexLoc) + ' osmid ' + 
repr(tupleOSMIDs) + ' @ ' + str(nTokenStart) + ' : ' + str(nTokenEnd) + '] = ' + ' '.join(tuplePhrase) ) 204 | break 205 | listLocMatches = geoparsepy.geo_parse_lib.create_matched_location_list( listMatch, cached_locations, osmid_lookup ) 206 | geoparsepy.geo_parse_lib.filter_matches_by_confidence( listLocMatches, dictGeospatialConfig, geom_context = strGeom, geom_cache = dictGeomResultsCache ) 207 | geoparsepy.geo_parse_lib.filter_matches_by_geom_area( listLocMatches, dictGeospatialConfig ) 208 | geoparsepy.geo_parse_lib.filter_matches_by_region_of_interest( listLocMatches, [-148838, -62149], dictGeospatialConfig ) 209 | setOSMID = set([]) 210 | for nMatchIndex in range(len(listLocMatches)) : 211 | nTokenStart = listLocMatches[nMatchIndex][1] 212 | nTokenEnd = listLocMatches[nMatchIndex][2] 213 | tuplePhrase = listLocMatches[nMatchIndex][3] 214 | strGeom = listLocMatches[nMatchIndex][4] 215 | tupleOSMID = listLocMatches[nMatchIndex][5] 216 | dictOSMTags = listLocMatches[nMatchIndex][6] 217 | if not tupleOSMID in setOSMID : 218 | setOSMID.add( tupleOSMID ) 219 | listNameMultilingual = geoparsepy.geo_parse_lib.calc_multilingual_osm_name_set( dictOSMTags, dictGeospatialConfig ) 220 | strNameList = ';'.join( listNameMultilingual ) 221 | strOSMURI = geoparsepy.geo_parse_lib.calc_OSM_uri( tupleOSMID, strGeom ) 222 | logger.info( 'Disambiguated Location [index ' + str(nMatchIndex) + ' osmid ' + repr(tupleOSMID) + ' @ ' + str(nTokenStart) + ' : ' + str(nTokenEnd) + '] = ' + strNameList + ' : ' + strOSMURI ) 223 | ``` 224 | 225 | 226 | # Example geoparse output 227 | ``` 228 | logging started 229 | loading stoplist from C:\Program Files\Python3\lib\site-packages\geoparsepy\corpus-geo-stoplist-en.txt 230 | loading whitelist from C:\Program Files\Python3\lib\site-packages\geoparsepy\corpus-geo-whitelist.txt 231 | loading blacklist from C:\Program Files\Python3\lib\site-packages\geoparsepy\corpus-geo-blacklist.txt 232 | loading building types from C:\Program 
Files\Python3\lib\site-packages\geoparsepy\corpus-buildingtype-en.txt 233 | loading location type corpus C:\Program Files\Python3\lib\site-packages\geoparsepy\corpus-buildingtype-en.txt 234 | - 3 unique titles 235 | - 76 unique types 236 | loading street types from C:\Program Files\Python3\lib\site-packages\geoparsepy\corpus-streettype-en.txt 237 | loading location type corpus C:\Program Files\Python3\lib\site-packages\geoparsepy\corpus-streettype-en.txt 238 | - 15 unique titles 239 | - 32 unique types 240 | loading admin types from C:\Program Files\Python3\lib\site-packages\geoparsepy\corpus-admintype-en.txt 241 | loading location type corpus C:\Program Files\Python3\lib\site-packages\geoparsepy\corpus-admintype-en.txt 242 | - 14 unique titles 243 | - 0 unique types 244 | loading gazeteer from C:\Program Files\Python3\lib\site-packages\geoparsepy\gazeteer-en.txt 245 | caching locations : {'global_cities_admin': [-1, -1], 'global_cities_poly': [-1, -1], 'global_cities_line': [-1, -1], 'global_cities_point': [-1, -1], 'europe_places_admin': [-1, -1], 'europe_places_poly': [-1, -1], 'europe_places_line': [-1, -1], 'europe_places_point': [-1, -1], 'north_america_places_admin': [-1, -1], 'north_america_places_poly': [-1, -1], 'north_america_places_line': [-1, -1], 'north_america_places_point': [-1, -1], 'uk_places_admin': [-1, -1], 'uk_places_poly': [-1, -1], 'uk_places_line': [-1, -1], 'uk_places_point': [-1, -1]} 246 | number of cached locations = 800820 247 | number of indexed phrases = 645697 248 | number of indexed geoms = 657264 249 | Reverse geocoded geotag location [index 190787 osmid (253067120,)] = Bassett 250 | Reverse geocoded geotag location [index 779038 osmid (253067120,)] = Bassett 251 | Text = hello New York, USA its Bill from Bassett calling 252 | Location [index 792265 osmid (29457403,) @ 1 : 2] = new york 253 | Location [index 737029 osmid (151937435,) @ 1 : 2] = new york 254 | Location [index 737030 osmid (316976734,) @ 1 : 2] = new york 255 | 
Location [index 140096 osmid (-175905,) @ 1 : 2] = new york 256 | Location [index 737028 osmid (61785451,) @ 1 : 2] = new york 257 | Location [index 792266 osmid (2218262347,) @ 1 : 2] = new york 258 | Location [index 146732 osmid (-61320,) @ 1 : 2] = new york 259 | Location [index 126105 osmid (-134353,) @ 2 : 2] = york 260 | Location [index 758451 osmid (153595296,) @ 2 : 2] = york 261 | Location [index 758454 osmid (153968758,) @ 2 : 2] = york 262 | Location [index 114051 osmid (-1425436,) @ 2 : 2] = york 263 | Location [index 758455 osmid (158656063,) @ 2 : 2] = york 264 | Location [index 758452 osmid (153924230,) @ 2 : 2] = york 265 | Location [index 758450 osmid (153473841,) @ 2 : 2] = york 266 | Location [index 758449 osmid (151672942,) @ 2 : 2] = york 267 | Location [index 758458 osmid (316990182,) @ 2 : 2] = york 268 | Location [index 758448 osmid (151651405,) @ 2 : 2] = york 269 | Location [index 800785 osmid (20913294,) @ 2 : 2] = york 270 | Location [index 758447 osmid (151528825,) @ 2 : 2] = york 271 | Location [index 140948 osmid (-148838,) @ 4 : 4] = usa 272 | Location [index 190787 osmid (253067120,) @ 8 : 8] = bassett 273 | Location [index 705552 osmid (151840681,) @ 8 : 8] = bassett 274 | Location [index 705551 osmid (151463868,) @ 8 : 8] = bassett 275 | Disambiguated Location [index 0 osmid (-61320,) @ 1 : 2] = New York;NY;New York State : http://www.openstreetmap.org/relation/61320 276 | Disambiguated Location [index 3 osmid (-148838,) @ 4 : 4] = United States;US;USA;United States of America : http://www.openstreetmap.org/relation/148838 277 | Disambiguated Location [index 5 osmid (253067120,) @ 8 : 8] = : http://www.openstreetmap.org/node/253067120 278 | Text = live on the BBC Victoria Derbyshire is visiting Derbyshire for an exclusive UK interview 279 | Location [index 87080 osmid (-2316741,) @ 4 : 4] = victoria 280 | Location [index 177879 osmid (-10307525,) @ 4 : 4] = victoria 281 | Location [index 754399 osmid (154301948,) @ 4 : 4] = 
victoria 282 | Location [index 45074 osmid (-5606595,) @ 4 : 4] = victoria 283 | Location [index 595897 osmid (385402175,) @ 4 : 4] = victoria 284 | Location [index 595901 osmid (462241727,) @ 4 : 4] = victoria 285 | Location [index 754403 osmid (158651084,) @ 4 : 4] = victoria 286 | Location [index 754358 osmid (151336948,) @ 4 : 4] = victoria 287 | Location [index 128827 osmid (-407423,) @ 4 : 4] = victoria 288 | Location [index 595902 osmid (463188523,) @ 4 : 4] = victoria 289 | Location [index 595899 osmid (447925715,) @ 4 : 4] = victoria 290 | Location [index 595898 osmid (435240340,) @ 4 : 4] = victoria 291 | Location [index 597713 osmid (277608416,) @ 4 : 4] = victoria 292 | Location [index 45017 osmid (-5606596,) @ 4 : 4] = victoria 293 | Location [index 775444 osmid (30189922,) @ 4 : 4] = victoria 294 | Location [index 87296 osmid (-2256643,) @ 4 : 4] = victoria 295 | Location [index 754364 osmid (151395812,) @ 4 : 4] = victoria 296 | Location [index 157847 osmid (74701108,) @ 4 : 4] = victoria 297 | Location [index 754393 osmid (151521359,) @ 4 : 4] = victoria 298 | Location [index 161280 osmid (75538688,) @ 4 : 4] = victoria 299 | Location [index 595900 osmid (460070685,) @ 4 : 4] = victoria 300 | Location [index 754369 osmid (151476805,) @ 4 : 4] = victoria 301 | Location [index 99056 osmid (-1828436,) @ 4 : 4] = victoria 302 | Location [index 126056 osmid (-195384,) @ 8 : 8] = derbyshire 303 | Location [index 146796 osmid (-62149,) @ 12 : 12] = uk 304 | Disambiguated Location [index 0 osmid (-195384,) @ 8 : 8] = Derbyshire : http://www.openstreetmap.org/relation/195384 305 | Disambiguated Location [index 2 osmid (-62149,) @ 12 : 12] = United Kingdom;GB;GBR;UK : http://www.openstreetmap.org/relation/62149 306 | ``` 307 | 308 | # Databases needed for preprocessing focus areas (optional) 309 | To preprocess your own focus areas (e.g. a city with all its streets and buildings) you need a local deployment of the planet OpenStreetmapDatabase. 
Once a focus area is preprocessed a database table will be created for it. This can be used in the geoparse just like the 'global_cities' focus area is in the previous example. Instructions below are dated dec 2020, refer to links for more up-to-date information. 310 | 311 | [Osm2pgsql](http://wiki.openstreetmap.org/wiki/Osm2pgsql#From_the_package_manager) 312 | [Planet.osm](http://wiki.openstreetmap.org/wiki/Planet.osm) 313 | 314 | ``` 315 | # Download OpenStreetMap map data archive 316 | - http://wiki.openstreetmap.org/wiki/Planet.osm 317 | + pick a mirror and download planet-latest.osm.bz2 file 318 | + all maps are WGS84 coord system 319 | + this will give you a .bz2 compressed .pbf file with the OSM dataset for the country specified 320 | - see https://github.com/openstreetmap/osm2pgsql 321 | 322 | # remove postgres (old versions - might not be needed if clean install) 323 | sudo apt list --installed | grep post 324 | sudo apt-get remove --purge postgresql-10 325 | sudo apt-get remove --purge postgresql-10-postgis-2.4-scripts 326 | sudo apt-get remove --purge postgis 327 | 328 | # install using a version number (otherwise get problems later) 329 | sudo apt-get install python3-apt 330 | sudo apt-get install postgresql-10-postgis-2.4 331 | 332 | # print versions 333 | pg_config --version 334 | psql --version 335 | 336 | sudo /etc/init.d/postgresql stop 337 | sudo /etc/init.d/postgresql status 338 | 339 | sudo nano /etc/postgresql/10/main/pg_hba.conf 340 | host all all 127.0.0.1/32 md5 341 | host all all 127.0.0.1/32 trust 342 | 343 | sudo nano /etc/postgresql/10/main/postgresql.conf 344 | + listen_addresses = '*' 345 | + shared_buffers = 512MB 346 | + work_mem = 512MB 347 | + maintenance_work_mem = 2GB 348 | + max_worker_processes = 16 349 | + max_parallel_workers_per_gather = 8 350 | + max_parallel_workers = 16 351 | + constraint_exclusion = partition 352 | 353 | sudo /etc/init.d/postgresql start 354 | sudo /etc/init.d/postgresql status 355 | 356 | # check 
postgresql is running OK 357 | sudo netstat -nlp | grep 5432 358 | sudo cat /var/log/postgresql/postgresql-10-main.log 359 | 360 | # make postgis database (empty initially) 361 | sudo -u postgres createdb openstreetmap 362 | sudo -u postgres psql -d openstreetmap -c 'CREATE EXTENSION postgis; CREATE EXTENSION hstore;' 363 | sudo -u postgres psql -d openstreetmap -c "SELECT * FROM information_schema.tables WHERE table_schema = 'public'" 364 | 365 | # install osm 366 | sudo mkdir /var/lib/osm 367 | cd /var/lib/osm 368 | sudo wget http://ftp.snt.utwente.nl/pub/misc/openstreetmap/planet-latest.osm.bz2 369 | 370 | # make flat node file (as planet OSM is too large otherwise for RAM) 371 | sudo mkdir /var/lib/osm/flat-nodes 372 | sudo chown -R postgres /var/lib/osm/flat-nodes 373 | 374 | # run osm2pgsql (will take about 7 days to finish so redirect stderr and stdout to file and run as a daemon process) 375 | sudo apt-get install osm2pgsql 376 | sudo -u postgres osm2pgsql -c -d openstreetmap -P 5432 -E 4326 -S /usr/share/osm2pgsql/default.style -k -s -C 8192 --flat-nodes /var/lib/osm/flat-nodes/flat-node-index-file --number-processes 8 /var/lib/osm/planet-latest.osm.bz2 > /var/lib/osm/osm2pgsql-stdout.log 2>&1 & 377 | sudo -u postgres psql -d openstreetmap -c "SELECT * FROM information_schema.tables WHERE table_schema = 'public'" 378 | ``` 379 | 380 | # Example code preprocess focus area (optional) 381 | 382 | Preprocessing new focus area tables in the Postgres database.
A fully documented example Python file can be found at geoparsepy.example_preprocess_focus_area.py 383 | 384 | ```python 385 | import os, sys, logging, traceback, codecs, datetime, copy, time, ast, math, re, random, shutil, json 386 | import soton_corenlppy, geoparsepy 387 | 388 | LOG_FORMAT = ('%(message)s') 389 | logger = logging.getLogger( __name__ ) 390 | logging.basicConfig( level=logging.INFO, format=LOG_FORMAT ) 391 | logger.info('logging started') 392 | 393 | dictFocusAreaSpec = { 394 | 'southampton' : { 395 | 'focus_area_id' : 'southampton', 396 | 'admin': ['southampton','south east england', 'united kingdom'], 397 | 'admin_lookup_table' : 'global_cities_admin', 398 | } 399 | } 400 | 401 | dictGlobalSpec = None 402 | 403 | dictGeospatialConfig = geoparsepy.geo_parse_lib.get_geoparse_config( 404 | lang_codes = ['en'], 405 | logger = logger, 406 | whitespace = u'"\u201a\u201b\u201c\u201d()', 407 | sent_token_seps = ['\n','\r\n', '\f', u'\u2026'], 408 | punctuation = """,;\/:+-#~&*=!?""", 409 | ) 410 | 411 | dbHandlerPool = {} 412 | dbHandlerPool['admin'] = soton_corenlppy.PostgresqlHandler.PostgresqlHandler( 'postgres', 'postgres', 'localhost', 5432, 'openstreetmap' ) 413 | dbHandlerPool['point'] = soton_corenlppy.PostgresqlHandler.PostgresqlHandler( 'postgres', 'postgres', 'localhost', 5432, 'openstreetmap' ) 414 | dbHandlerPool['poly'] = soton_corenlppy.PostgresqlHandler.PostgresqlHandler( 'postgres', 'postgres', 'localhost', 5432, 'openstreetmap' ) 415 | dbHandlerPool['line'] = soton_corenlppy.PostgresqlHandler.PostgresqlHandler( 'postgres', 'postgres', 'localhost', 5432, 'openstreetmap' ) 416 | 417 | for strFocusArea in dictFocusAreaSpec.keys() : 418 | logger.info( 'starting focus area ' + strFocusArea ) 419 | jsonFocusArea = dictFocusAreaSpec[strFocusArea] 420 | geoparsepy.geo_preprocess_lib.create_preprocessing_tables( jsonFocusArea, dbHandlerPool['admin'], 'public', delete_contents = False, logger = logger ) 421 | dictNewLocations =
geoparsepy.geo_preprocess_lib.execute_preprocessing_focus_area( jsonFocusArea, dbHandlerPool, 'public', logger = logger ) 422 | logger.info( 'finished focus area ' + strFocusArea ) 423 | logger.info( 'location id range : ' + repr(dictNewLocations) ) 424 | 425 | dbHandlerPool['admin'].close() 426 | dbHandlerPool['point'].close() 427 | dbHandlerPool['poly'].close() 428 | dbHandlerPool['line'].close() 429 | ``` 430 | 431 | # Example code preprocess focus area output (optional) 432 | ``` 433 | logging started 434 | loading stoplist from /home/sem/.local/lib/python3.7/site-packages/geoparsepy/corpus-geo-stoplist-en.txt 435 | loading whitelist from /home/sem/.local/lib/python3.7/site-packages/geoparsepy/corpus-geo-whitelist.txt 436 | loading blacklist from /home/sem/.local/lib/python3.7/site-packages/geoparsepy/corpus-geo-blacklist.txt 437 | loading building types from /home/sem/.local/lib/python3.7/site-packages/geoparsepy/corpus-buildingtype-en.txt 438 | loading location type corpus /home/sem/.local/lib/python3.7/site-packages/geoparsepy/corpus-buildingtype-en.txt 439 | - 3 unique titles 440 | - 76 unique types 441 | loading street types from /home/sem/.local/lib/python3.7/site-packages/geoparsepy/corpus-streettype-en.txt 442 | loading location type corpus /home/sem/.local/lib/python3.7/site-packages/geoparsepy/corpus-streettype-en.txt 443 | - 15 unique titles 444 | - 32 unique types 445 | loading admin types from /home/sem/.local/lib/python3.7/site-packages/geoparsepy/corpus-admintype-en.txt 446 | loading location type corpus /home/sem/.local/lib/python3.7/site-packages/geoparsepy/corpus-admintype-en.txt 447 | - 14 unique titles 448 | - 0 unique types 449 | loading gazeteer from /home/sem/.local/lib/python3.7/site-packages/geoparsepy/gazeteer-en.txt 450 | starting focus area southampton 451 | starting preprocessing of new focus area : {'focus_area_id': 'southampton', 'admin': ['southampton', 'south east england', 'united kingdom'], 'admin_lookup_table': 
'global_cities_admin'} 452 | starting SQL threads 453 | start SQL (point x 2) . 454 | start SQL (line x 2) . 455 | start SQL (poly x 2) . 456 | start SQL (admin x 2) . 457 | waiting for joins 458 | . end SQL (admin x 2) . 459 | . end SQL (point x 2) . 460 | . end SQL (line x 2) . 461 | . end SQL (poly x 2) . 462 | join successful 463 | finished focus area southampton 464 | location id range : {'southampton_point': (1, 1327), 'southampton_line': (1, 2144), 'southampton_poly': (1, 2748), 'southampton_admin': (1, 7)} 465 | ``` 466 | 467 | --------------------------------------------------------------------------------