├── .clang-format ├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── Database.md ├── LICENSE ├── README.md ├── demo ├── geocoder-nlp.cpp ├── nearby-line.cpp └── nearby-point.cpp ├── geocoder-nlp.pri ├── importer ├── README.md ├── data │ └── priority.list ├── sql │ ├── duplicates.sql │ └── nominatim_cyclic_parents.sql └── src │ ├── config.h │ ├── hierarchy.cpp │ ├── hierarchy.h │ ├── hierarchyitem.cpp │ ├── hierarchyitem.h │ ├── main.cpp │ ├── normalization.cpp │ ├── normalization.h │ ├── utils.cpp │ └── utils.h ├── scripts ├── postal │ ├── README.md │ ├── build_all_country_db.py │ ├── build_country_db.sh │ ├── countries_languages.txt │ ├── make_country_list.py │ └── split_addresses.py └── tags │ ├── README.md │ └── generate_mapstyle.py └── src ├── geocoder.cpp ├── geocoder.h ├── postal.cpp ├── postal.h └── version.h /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | # Based on the GNU style, with the project-specific overrides listed below. 
3 | BasedOnStyle: GNU 4 | ColumnLimit: 100 5 | --- 6 | Language: Cpp 7 | AllowShortFunctionsOnASingleLine: Inline 8 | AlwaysBreakAfterDefinitionReturnType: None 9 | AlwaysBreakAfterReturnType: None 10 | SpaceBeforeParens: ControlStatements 11 | 12 | AlignConsecutiveAssignments: Consecutive 13 | AlignConsecutiveDeclarations: Consecutive 14 | AlignConsecutiveDeclarations: Consecutive -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files 2 | *.slo 3 | *.lo 4 | *.o 5 | 6 | # Compiled Dynamic libraries 7 | *.so 8 | *.dylib 9 | 10 | # Compiled Static libraries 11 | *.lai 12 | *.la 13 | *.a 14 | 15 | *~ 16 | 17 | /*.pro.user 18 | build/ 19 | .vscode 20 | *.code-workspace 21 | CMakeLists.txt.user 22 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "thirdparty/sqlite3pp"] 2 | path = thirdparty/sqlite3pp 3 | url = https://github.com/rinigus/sqlite3pp.git 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.6.0) 2 | 3 | project(geocoder-nlp 4 | DESCRIPTION "Geocoder NLP") 5 | 6 | set(CMAKE_INCLUDE_CURRENT_DIR ON) 7 | set(CMAKE_CXX_STANDARD 17) 8 | set(CMAKE_CXX_STANDARD_REQUIRED True) 9 | 10 | include(FindPkgConfig) 11 | include(FeatureSummary) 12 | include(GNUInstallDirs) 13 | 14 | find_package(PkgConfig REQUIRED) 15 | find_package(nlohmann_json 3.2.0 REQUIRED) 16 | find_package(Boost 1.30 COMPONENTS program_options REQUIRED) 17 | 18 | pkg_check_modules(MARISA marisa IMPORTED_TARGET) 19 | pkg_check_modules(KYOTOCABINET kyotocabinet IMPORTED_TARGET) 20 | pkg_check_modules(POSTAL libpostal IMPORTED_TARGET) 21 | 
pkg_check_modules(SQLITE3 sqlite3 IMPORTED_TARGET) 22 | pkg_check_modules(LIBPQXX libpqxx IMPORTED_TARGET) 23 | 24 | add_compile_options(-Wall -Wextra -Werror) 25 | 26 | set(SRC 27 | src/geocoder.cpp 28 | src/postal.cpp) 29 | 30 | set(HEAD 31 | src/geocoder.h 32 | src/postal.h 33 | src/version.h) 34 | 35 | # sqlite3pp include 36 | include_directories(thirdparty/sqlite3pp/headeronly_src) 37 | include_directories(src) 38 | 39 | # boost 40 | include_directories(${Boost_INCLUDE_DIR}) 41 | 42 | # importer 43 | set(IMPSRC 44 | importer/src/config.h 45 | importer/src/main.cpp 46 | importer/src/hierarchy.cpp 47 | importer/src/hierarchy.h 48 | importer/src/hierarchyitem.cpp 49 | importer/src/hierarchyitem.h 50 | importer/src/normalization.cpp 51 | importer/src/normalization.h 52 | importer/src/utils.cpp 53 | importer/src/utils.h 54 | ) 55 | add_executable(geocoder-importer ${SRC} ${HEAD} ${IMPSRC}) 56 | target_link_libraries(geocoder-importer 57 | PkgConfig::MARISA 58 | PkgConfig::KYOTOCABINET 59 | PkgConfig::POSTAL 60 | PkgConfig::SQLITE3 61 | PkgConfig::LIBPQXX 62 | nlohmann_json::nlohmann_json 63 | ${Boost_LIBRARIES}) 64 | 65 | # demo codes 66 | add_executable(geocoder-nlp 67 | demo/geocoder-nlp.cpp 68 | ${SRC} 69 | ${HEAD}) 70 | 71 | target_link_libraries(geocoder-nlp 72 | ${Boost_LIBRARIES}) 73 | 74 | target_link_libraries(geocoder-nlp 75 | PkgConfig::MARISA 76 | PkgConfig::KYOTOCABINET 77 | PkgConfig::POSTAL 78 | PkgConfig::SQLITE3) 79 | 80 | add_executable(nearby-line 81 | demo/nearby-line.cpp 82 | ${SRC} 83 | ${HEAD}) 84 | 85 | target_link_libraries(nearby-line 86 | PkgConfig::MARISA 87 | PkgConfig::KYOTOCABINET 88 | PkgConfig::POSTAL 89 | PkgConfig::SQLITE3) 90 | 91 | add_executable(nearby-point 92 | demo/nearby-point.cpp 93 | ${SRC} 94 | ${HEAD}) 95 | 96 | target_link_libraries(nearby-point 97 | PkgConfig::MARISA 98 | PkgConfig::KYOTOCABINET 99 | PkgConfig::POSTAL 100 | PkgConfig::SQLITE3) 101 | 102 | # install 103 | install(TARGETS geocoder-importer 104 | 
DESTINATION ${CMAKE_INSTALL_BINDIR}) 105 | 106 | # summary 107 | feature_summary(WHAT ALL FATAL_ON_MISSING_REQUIRED_PACKAGES) 108 | 109 | -------------------------------------------------------------------------------- /Database.md: -------------------------------------------------------------------------------- 1 | # Geocoder NLP database format 2 | 3 | The geocoder database consists of several files which are expected to be in the 4 | same directory. All locations are described using a single coordinate to keep the 5 | files as small as possible. 6 | 7 | The files composing a database are: 8 | 9 | 1. geonlp-primary.sqlite: SQLite database with location description and coordinate 10 | 2. geonlp-normalized.trie: MARISA database with normalized strings 11 | 3. geonlp-normalized-id.kch: Kyoto Cabinet database for linking MARISA and primary IDs 12 | 13 | ## geonlp-primary.sqlite 14 | 15 | The SQLite database contains location descriptions and their organization into a hierarchy 16 | of objects. 17 | 18 | Table `object_primary` keeps location description. In this table, objects are 19 | stored sequentially (in terms of their `id`) according to the positioning in the 20 | object hierarchy with the children stored after parents. Table `hierarchy` has a 21 | record for each item (`id` from `object_primary`) with the children consisting 22 | of parent ID (`prim_id`) and the ID of the last child (`last_subobject`). 23 | 24 | Object types are stored separately in `type` table with the type ID used in 25 | `object_primary`. 26 | 27 | Spatial queries are indexed using R-Tree with `box_id` used as a reference in 28 | `object_primary`. Namely, as all objects are stored as points, for storage 29 | efficiency, objects next to each other are set to have the same `box_id` and are 30 | found through `-rtree` tables. 31 | 32 | Table `meta` keeps database format version and is used to check version 33 | compatibility. 
34 | 35 | ## geonlp-normalized.trie 36 | 37 | All normalized strings are stored in MARISA database 38 | (https://github.com/s-yata/marisa-trie). Normalized strings are formed from 39 | `name` and other similar fields of `object_primary` table in 40 | `geonlp-primary.sqlite`. All strings are pushed into MARISA database that 41 | assigns its internal ID for each of the strings. 42 | 43 | ## geonlp-normalized-id.kch 44 | 45 | Kyoto Cabinet (https://dbmx.net/kyotocabinet/) database for linking MARISA and 46 | primary IDs. Hash database variant is used where `key` is an ID provided by 47 | MARISA for a search string and value is an array of bytes consisting of 48 | `object_primary` IDs stored as `uint32_t` one after another. The array is stored 49 | using `std::string`. -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 rinigus 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Geocoder NLP 2 | 3 | This is a geocoder C++ library that targets offline use by mobile 4 | applications. It is able to perform forward and reverse geocoding. 5 | For forward geocoding, it uses libpostal to parse the user 6 | request, normalize the parsed result, and search for the match in 7 | geocoder database. In addition to traditional reverse geocoding, it is 8 | able to find points of interest close to the reference point or line. 9 | 10 | The library includes a demo program showing how to use it. It's also used 11 | as one of the geocoders in OSM Scout Server 12 | (https://github.com/rinigus/osmscout-server). For preparation of the 13 | SQLite database used by the geocoder, an importer from libosmscout 14 | map database is provided. 15 | 16 | When using country-based libpostal address parser and limiting number 17 | of languages used to process the search request, it is possible to use 18 | this geocoder on mobile platforms. The development of the geocoder is 19 | mainly targeting Sailfish OS applications with the tests running on 20 | Linux as well. It's expected that the geocoder would run on platforms 21 | supported by libpostal without any major changes. 22 | 23 | To compile, adjust Makefile. The library can be used by incorporating 24 | source code files in `src` subdirectory and ensuring that 25 | `thirdparty/sqlite3pp/headeronly_src` is in the compiler's include 26 | path. 27 | 28 | In addition to libpostal, libsqlite3 is required for the geocoder to 29 | function. 
For importer, libosmscout is required in addition to the 30 | libraries mentioned above. 31 | 32 | ## Databases 33 | 34 | At present, the datasets required for the geocoder to function are distributed 35 | as a part of OSM Scout Server datasets. 36 | 37 | If you use the geocoder with the full libpostal installation, you don't need to 38 | get the libpostal datasets from that location, but can use the datasets 39 | provided by libpostal. This is a default when there is no path 40 | specified for corresponding libpostal datasets. 41 | 42 | To use country-specific datasets, you would have to get: 43 | * libpostal language parser: postal/global 44 | * libpostal country-specific database: postal/countries/SELECT THE NEEDED ONES 45 | 46 | In addition, the prepared geocoder databases are available at 47 | geocoder/SELECT THE NEEDED ONES. 48 | 49 | Database format is described in [separate document](Database.md). 50 | 51 | ## Acknowledgments 52 | 53 | libpostal: Used for input parsing; https://github.com/openvenues/libpostal 54 | 55 | Nominatim: Used for data import; https://nominatim.org/ 56 | -------------------------------------------------------------------------------- /demo/geocoder-nlp.cpp: -------------------------------------------------------------------------------- 1 | #include "geocoder.h" 2 | #include "postal.h" 3 | #include "version.h" 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | using namespace GeoNLP; 10 | namespace po = boost::program_options; 11 | 12 | int main(int argc, char *argv[]) 13 | { 14 | Postal postal; 15 | // postal.set_initialize_every_call(true); 16 | 17 | std::string query; 18 | std::string postal_data_global; 19 | std::string postal_data_country; 20 | std::string geocoder_data; 21 | int max_results = 10; 22 | double ref_latitude; 23 | double ref_longitude; 24 | int ref_zoom = 16; 25 | double ref_importance = 0.75; 26 | 27 | Geocoder::GeoReference reference; 28 | 29 | { 30 | po::options_description generic("Geocoder NLP demo options"); 
31 | generic.add_options()("help,h", "Help message"); 32 | generic.add_options()("version,v", "Version"); 33 | generic.add_options()("geocoder-data", po::value(&geocoder_data), 34 | "GeocoderNLP database directory path"); 35 | 36 | generic.add_options()( 37 | "postal-country", po::value(&postal_data_country), 38 | "libpostal country database. Keep empty to use global libpostal parser data."); 39 | generic.add_options()( 40 | "postal-global", po::value(&postal_data_global), 41 | "libpostal global database. Keep empty to use global libpostal parser data."); 42 | 43 | generic.add_options()("max-results", po::value(&max_results), "Maximal number of results"); 44 | 45 | generic.add_options()("ref-latitude", po::value(&ref_latitude), 46 | "Reference for location bias; latitude"); 47 | generic.add_options()("ref-longitude", po::value(&ref_longitude), 48 | "Reference for location bias; longitude"); 49 | generic.add_options()( 50 | "ref-zoom", po::value(&ref_zoom), 51 | "Reference for location bias; zoom level for calculating reference radius"); 52 | generic.add_options()("ref-importance", po::value(&ref_importance), 53 | "Reference for location bias; importance from 0 to 1 of location bias"); 54 | 55 | po::options_description hidden("Hidden options"); 56 | hidden.add_options()("query", po::value(&query), "Search query"); 57 | 58 | po::positional_options_description p; 59 | p.add("query", 1); 60 | 61 | po::options_description cmdline_options; 62 | cmdline_options.add(generic).add(hidden); 63 | 64 | po::variables_map vm; 65 | try 66 | { 67 | po::store(po::command_line_parser(argc, argv).options(cmdline_options).positional(p).run(), 68 | vm); 69 | po::notify(vm); 70 | } 71 | catch (std::exception &e) 72 | { 73 | std::cerr << "Error while parsing options: " << e.what() << "\n\n"; 74 | std::cerr << generic << "\n"; 75 | } 76 | 77 | if (vm.count("help")) 78 | { 79 | std::cout << "Geocoder NLP demo:\n\n" 80 | << "Call as\n\n " << argv[0] << " query\n" 81 | << "\nwhere query is a 
string.\n\n" 82 | << generic << "\n"; 83 | return 0; 84 | } 85 | 86 | if (vm.count("version")) 87 | { 88 | std::cout << "Geocoder NLP version: " << GEOCODERNLP_VERSION_STRING << "\n"; 89 | return 0; 90 | } 91 | 92 | if (!vm.count("geocoder-data")) 93 | { 94 | std::cerr << "GeocoderNLP database directory path is missing\n"; 95 | return -1; 96 | } 97 | 98 | if (vm.count("ref-latitude") && vm.count("ref-longitude")) 99 | reference.set(ref_latitude, ref_longitude, ref_zoom, ref_importance); 100 | } 101 | 102 | std::vector parsed_query; 103 | Postal::ParseResult nonorm; 104 | 105 | postal.set_postal_datadir(postal_data_global, postal_data_country); 106 | // postal.add_language("da"); 107 | // postal.add_language("et"); 108 | // postal.add_language("en"); 109 | postal.set_initialize_every_call(true); 110 | postal.set_use_primitive(false); 111 | 112 | postal.parse(query, parsed_query, nonorm); 113 | 114 | std::cout << "\nAddress parsing before full normalization:\n\n"; 115 | for (auto v : nonorm) 116 | { 117 | std::cout << v.first << " "; 118 | for (auto k : v.second) 119 | std::cout << k << " "; 120 | std::cout << "\n"; 121 | } 122 | std::cout << "\n"; 123 | 124 | std::cout << "Normalization:\n\n"; 125 | for (auto r : parsed_query) 126 | { 127 | for (auto v : r) 128 | { 129 | std::cout << v.first << " "; 130 | for (auto k : v.second) 131 | std::cout << k << " "; 132 | std::cout << "\n"; 133 | } 134 | std::cout << "\n"; 135 | } 136 | 137 | std::cout << std::endl; 138 | 139 | Geocoder geo(geocoder_data); 140 | geo.set_max_queries_per_hierarchy(30); 141 | geo.set_max_results(max_results); 142 | // geo.set_result_language("en"); 143 | 144 | std::vector result; 145 | 146 | std::cout << "Geocoder loaded" << std::endl; 147 | 148 | geo.search(parsed_query, result, 0, reference); 149 | 150 | std::cout << std::setprecision(8); 151 | std::cout << "Search results: \n\n"; 152 | size_t counter = 0; 153 | for (const Geocoder::GeoResult &r : result) 154 | { 155 | std::cout << r.title << 
"\n" 156 | << r.address << "\n" 157 | << "Postal code: " << r.postal_code << "\n" 158 | << "Phone: " << r.phone << " / URL: " << r.website << "\n" 159 | << r.latitude << ", " << r.longitude << " / distance=" << r.distance << "\n" 160 | << r.type << " / " << r.id << " / " << r.search_rank << " / " << r.levels_resolved 161 | << "\n\n"; 162 | counter++; 163 | } 164 | 165 | std::cout << "Number of results: " << counter << std::endl; 166 | 167 | return 0; 168 | } 169 | -------------------------------------------------------------------------------- /demo/nearby-line.cpp: -------------------------------------------------------------------------------- 1 | #include "geocoder.h" 2 | #include "postal.h" 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | using namespace GeoNLP; 10 | 11 | int main(int argc, char *argv[]) 12 | { 13 | Postal postal; 14 | 15 | if (argc < 8) 16 | { 17 | std::cout << "Use: " << argv[0] << " dbase postal_global postal_country line radius type name\n" 18 | << "where\n" 19 | << " dbase - path to Geocoder-NLP database folder\n" 20 | << " postal_global - path to libpostal database with language classifier\n" 21 | << " postal_country - path to libpostal database covering country\n" 22 | << " line - text file containing lat lon coordinates, one per line\n" 23 | << " radius - distance in meters\n" 24 | << " type - type of POI\n" 25 | << " name - name of POI\n"; 26 | return -1; 27 | } 28 | 29 | double radius = atof(argv[5]); 30 | std::cout << "Search radius: " << radius << " meters\n"; 31 | 32 | std::vector latitude, longitude; 33 | { 34 | std::ifstream fin(argv[4]); 35 | double lat, lon; 36 | while (fin >> lat >> lon) 37 | { 38 | latitude.push_back(lat); 39 | longitude.push_back(lon); 40 | } 41 | } 42 | std::cout << "Loaded " << latitude.size() << " coordinates from " << argv[4] << "\n"; 43 | 44 | char *name_query = argv[7]; 45 | std::vector parsed_name; 46 | 47 | if (strlen(name_query) > 0) 48 | { 49 | postal.set_postal_datadir(argv[2], argv[3]); 50 | 
postal.set_initialize_every_call(true); 51 | postal.expand_string(name_query, parsed_name); 52 | 53 | std::cout << "Normalization of name:\n\n"; 54 | for (auto r : parsed_name) 55 | std::cout << r << "\n"; 56 | } 57 | 58 | std::cout << std::endl; 59 | 60 | std::vector type_query; 61 | if (strlen(argv[6]) > 0) 62 | { 63 | type_query.push_back(argv[6]); 64 | std::cout << "Searching for type: " << argv[6] << "\n"; 65 | } 66 | 67 | Geocoder geo(argv[1]); 68 | geo.set_max_queries_per_hierarchy(30); 69 | geo.set_max_results(25); 70 | 71 | std::vector result; 72 | 73 | std::cout << "Geocoder loaded" << std::endl; 74 | 75 | if (!geo.search_nearby(parsed_name, type_query, latitude, longitude, radius, result, postal)) 76 | std::cout << "Failed to search\n"; 77 | else 78 | { 79 | std::cout << std::setprecision(8); 80 | std::cout << "Search results: \n\n"; 81 | size_t counter = 0; 82 | for (const Geocoder::GeoResult &r : result) 83 | { 84 | std::cout << r.title << "\n" 85 | << r.address << "\n" 86 | << r.latitude << ", " << r.longitude << ": distance - " << r.distance << "\n" 87 | << r.type << " / " << r.id << " / " << r.admin_levels << " / " 88 | << r.levels_resolved << "\n\n"; 89 | counter++; 90 | } 91 | 92 | std::cout << "Number of results: " << counter << std::endl; 93 | } 94 | 95 | return 0; 96 | } 97 | -------------------------------------------------------------------------------- /demo/nearby-point.cpp: -------------------------------------------------------------------------------- 1 | #include "geocoder.h" 2 | #include "postal.h" 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | using namespace GeoNLP; 10 | 11 | int main(int argc, char *argv[]) 12 | { 13 | Postal postal; 14 | 15 | if (argc < 8) 16 | { 17 | std::cout << "Use: " << argv[0] << " dbase postal_global postal_country point radius type name\n" 18 | << "where\n" 19 | << " dbase - path to Geocoder-NLP database folder\n" 20 | << " postal_global - path to libpostal database with language classifier\n" 21 | << " 
postal_country - path to libpostal database covering country\n" 22 | << " point - text file containing lat lon coordinate, the first pair is read only\n" 23 | << " radius - distance in meters\n" 24 | << " type - type of POI\n" 25 | << " name - name of POI\n"; 26 | return -1; 27 | } 28 | 29 | double radius = atof(argv[5]); 30 | std::cout << "Search radius: " << radius << " meters\n"; 31 | 32 | std::vector latitude, longitude; 33 | { 34 | std::ifstream fin(argv[4]); 35 | double lat, lon; 36 | while (fin >> lat >> lon) 37 | { 38 | latitude.push_back(lat); 39 | longitude.push_back(lon); 40 | break; 41 | } 42 | } 43 | std::cout << "Loaded " << latitude.size() << " coordinates from " << argv[4] << "\n"; 44 | if (latitude.empty()) { std::cout << "Failed to read a coordinate pair from " << argv[4] << "\n"; return -1; } // latitude[0] / longitude[0] are read below 45 | char *name_query = argv[7]; 46 | std::vector parsed_name; 47 | 48 | if (strlen(name_query) > 0) 49 | { 50 | postal.set_postal_datadir(argv[2], argv[3]); 51 | postal.set_initialize_every_call(true); 52 | postal.expand_string(name_query, parsed_name); 53 | 54 | std::cout << "Normalization of name:\n\n"; 55 | for (auto r : parsed_name) 56 | std::cout << r << "\n"; 57 | } 58 | 59 | std::cout << std::endl; 60 | 61 | std::vector type_query; 62 | if (strlen(argv[6]) > 0) 63 | { 64 | type_query.push_back(argv[6]); 65 | std::cout << "Searching for type: " << argv[6] << "\n"; 66 | } 67 | 68 | Geocoder geo(argv[1]); 69 | geo.set_max_queries_per_hierarchy(30); 70 | geo.set_max_results(25); 71 | 72 | std::vector result; 73 | 74 | std::cout << "Geocoder loaded" << std::endl; 75 | 76 | if (!geo.search_nearby(parsed_name, type_query, latitude[0], longitude[0], radius, result, 77 | postal)) 78 | std::cout << "Failed to search\n"; 79 | else 80 | { 81 | std::cout << std::setprecision(8); 82 | std::cout << "Search results: \n\n"; 83 | size_t counter = 0; 84 | for (const Geocoder::GeoResult &r : result) 85 | { 86 | std::cout << r.title << "\n" 87 | << r.address << "\n" 88 | << r.latitude << ", " << r.longitude << ": distance - " << r.distance << "\n" 89 | << r.type << " / " << r.id 
<< " / " << r.admin_levels << " / " 90 | << r.levels_resolved << "\n\n"; 91 | counter++; 92 | } 93 | 94 | std::cout << "Number of results: " << counter << std::endl; 95 | } 96 | 97 | return 0; 98 | } 99 | -------------------------------------------------------------------------------- /geocoder-nlp.pri: -------------------------------------------------------------------------------- 1 | #DEFINES += GEONLP_PRINT_DEBUG_QUERIES 2 | #DEFINES += GEONLP_PRINT_DEBUG 3 | 4 | INCLUDEPATH += $$PWD/src 5 | INCLUDEPATH += $$PWD/thirdparty/sqlite3pp/headeronly_src 6 | 7 | DEPENDPATH += $$PWD/src 8 | 9 | SOURCES += \ 10 | $$PWD/src/postal.cpp \ 11 | $$PWD/src/geocoder.cpp 12 | 13 | HEADERS += \ 14 | $$PWD/src/postal.h \ 15 | $$PWD/src/geocoder.h \ 16 | $$PWD/src/version.h 17 | 18 | LIBS += -lpostal 19 | -------------------------------------------------------------------------------- /importer/README.md: -------------------------------------------------------------------------------- 1 | # Import program and scripts. 2 | 3 | Import program is using Nominatim database and filters the data using boundary 4 | given in GeoJSON file. 5 | 6 | The boundary file can be generated from POLY files as provided by Geofabrik or 7 | given in hierarchy folder of OSM Scout Server (see 8 | https://github.com/rinigus/osmscout-server/tree/master/scripts/import/hierarchy). 9 | As a converter, [poly2geojson](https://github.com/frafra/poly2geojson/) can be 10 | used. 11 | 12 | Data can be filtered using data/priority.list and data/skip.list files. Those 13 | list OSM tags in the form where tag type and its value are merged using `_`. Out 14 | of the lists, the priority one gives locations that would be kept in the 15 | database even without names associated with them for reverse geocoding. The 16 | "skip" list allows to specify locations that would be dropped by location type. 
17 | 18 | To generate larger amount of tags for the priority list, the scripts 19 | under `scripts/tags` (from main directory of the project) can be used. 20 | 21 | ## Used 22 | 23 | - Nominatim Docker https://github.com/mediagis/nominatim-docker/ 24 | - For testing: Nominatim-UI https://github.com/osm-search/nominatim-ui 25 | - poly2geojson: https://github.com/frafra/poly2geojson -------------------------------------------------------------------------------- /importer/data/priority.list: -------------------------------------------------------------------------------- 1 | aerialway_cable_car 2 | amenity_atm 3 | amenity_bank 4 | amenity_bar 5 | amenity_bicycle_parking 6 | amenity_bicycle_rental 7 | amenity_biergarten 8 | amenity_bureau_de_change 9 | amenity_cafe 10 | amenity_car_rental 11 | amenity_car_wash 12 | amenity_charging_station 13 | amenity_cinema 14 | amenity_clinic 15 | amenity_college 16 | amenity_community_centre 17 | amenity_courthouse 18 | amenity_dentist 19 | amenity_doctors 20 | amenity_embassy 21 | amenity_fast_food 22 | amenity_ferry_terminal 23 | amenity_fire_station 24 | amenity_fuel 25 | amenity_hospital 26 | amenity_ice_cream 27 | amenity_motorcycle_parking 28 | amenity_nursing_home 29 | amenity_parking 30 | amenity_pharmacy 31 | amenity_place_of_worship 32 | amenity_police 33 | amenity_post_office 34 | amenity_pub 35 | amenity_restaurant 36 | amenity_sauna 37 | amenity_taxi 38 | amenity_theatre 39 | amenity_toilets 40 | amenity_townhall 41 | amenity_university 42 | emergency_defibrillator 43 | emergency_phone 44 | leisure_playground 45 | shop_alcohol 46 | shop_bakery 47 | shop_beauty 48 | shop_beverages 49 | shop_bicycle 50 | shop_books 51 | shop_boutique 52 | shop_butcher 53 | shop_car 54 | shop_car_parts 55 | shop_car_repair 56 | shop_chemist 57 | shop_clothes 58 | shop_computer 59 | shop_confectionery 60 | shop_convenience 61 | shop_copyshop 62 | shop_cosmetics 63 | shop_deli 64 | shop_department_store 65 | shop_doityourself 66 | 
shop_dry_cleaning 67 | shop_electronics 68 | shop_florist 69 | shop_funeral_directors 70 | shop_furniture 71 | shop_garden_centre 72 | shop_gift 73 | shop_greengrocer 74 | shop_hairdresser 75 | shop_hardware 76 | shop_jewelry 77 | shop_kiosk 78 | shop_laundry 79 | shop_mall 80 | shop_mobile_phone 81 | shop_motorcycle 82 | shop_newsagent 83 | shop_optician 84 | shop_pet 85 | shop_shoes 86 | shop_sports 87 | shop_stationery 88 | shop_supermarket 89 | shop_toys 90 | shop_travel_agency 91 | shop_tyres 92 | shop_vacant 93 | shop_variety_store 94 | shop_wine 95 | tourism_camp_site 96 | tourism_caravan_site 97 | tourism_guest_house 98 | tourism_hostel 99 | tourism_hotel 100 | tourism_information 101 | tourism_motel 102 | -------------------------------------------------------------------------------- /importer/sql/duplicates.sql: -------------------------------------------------------------------------------- 1 | select f.*, o.name from 2 | (select o.name, parent, postal_code, t.name as type_name, count(*) as cnt from object_primary o 3 | join "type" t on t.id = o.type_id 4 | group by o.name, parent, postal_code, t.name 5 | order by cnt desc 6 | limit 25) f 7 | join object_primary o on f.parent=o.id 8 | -------------------------------------------------------------------------------- /importer/sql/nominatim_cyclic_parents.sql: -------------------------------------------------------------------------------- 1 | select c.place_id,c.name,c.parent_place_id,c.linked_place_id,c.osm_id,c.osm_type,c.indexed_status 2 | from placex c 3 | join placex r on r.place_id=c.parent_place_id 4 | where r.parent_place_id=c.place_id; 5 | -------------------------------------------------------------------------------- /importer/src/config.h: -------------------------------------------------------------------------------- 1 | #ifndef GEOCODER_CONFIG_H 2 | #define GEOCODER_CONFIG_H 3 | 4 | #include 5 | 6 | #define TEMPORARY "TEMPORARY" // set to empty if need to debug import 7 | 8 | /// if there 
are more expansions than specified, this object will be dropped from normalization 9 | /// table 10 | #define MAX_NUMBER_OF_EXPANSIONS 85 11 | 12 | /// starting from this length, check whether the string is suspicious 13 | #define LENGTH_STARTING_SUSP_CHECK 200 14 | 15 | #define MAX_COMMAS 10 /// maximal number of commas allowed in a name 16 | 17 | #define GEOCODER_IMPORTER_POSTGRES "GEOCODER_IMPORTER_POSTGRES" 18 | 19 | typedef uint64_t hindex; 20 | typedef long long int sqlid; /// type used by IDs in SQLite 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /importer/src/hierarchy.cpp: -------------------------------------------------------------------------------- 1 | #include "hierarchy.h" 2 | 3 | #include 4 | #include 5 | 6 | Hierarchy::Hierarchy() {} 7 | 8 | Hierarchy::~Hierarchy() {} 9 | 10 | void Hierarchy::add_item(std::shared_ptr &item) 11 | { 12 | hindex id = item->id(); 13 | hindex parent_id = item->parent_id(); 14 | if (m_items.count(id)) 15 | throw std::runtime_error("Trying to insert item that has been inserted earlier"); 16 | 17 | m_items[id] = item; 18 | auto p = m_items.find(parent_id); 19 | if (p != m_items.end()) 20 | p->second->add_child(item); 21 | else 22 | { 23 | auto root_leaf = m_root.find(parent_id); 24 | if (root_leaf == m_root.end()) 25 | m_root[parent_id] = std::set >({ item }); 26 | else 27 | root_leaf->second.insert(item); 28 | } 29 | 30 | // check if added item was a parent for someone and adjust root accordingly 31 | auto root_leaf = m_root.find(id); 32 | if (root_leaf != m_root.end()) 33 | { 34 | for (auto root_iter : root_leaf->second) 35 | { 36 | if (root_iter->parent_id() != id) 37 | throw std::runtime_error("Mismatch between expected parent and root location"); 38 | item->add_child(root_iter); 39 | } 40 | m_root.erase(root_leaf); 41 | } 42 | } 43 | 44 | bool Hierarchy::add_linked_item(std::shared_ptr &item) 45 | { 46 | hindex linked = item->linked_id(); 47 | auto tolink = 
m_items.find(linked); 48 | if (tolink == m_items.end()) 49 | { 50 | std::cout << "Failed to find linked object " << linked << " required by " << item->id() 51 | << ". Skipping linkage.\n"; 52 | return false; 53 | } 54 | 55 | tolink->second->add_linked(item); 56 | return true; 57 | } 58 | 59 | void Hierarchy::cleanup() 60 | { 61 | for (auto root_iter = m_root.begin(); root_iter != m_root.end(); ++root_iter) 62 | { 63 | std::set > keep; 64 | for (auto item : root_iter->second) 65 | { 66 | item->cleanup_children(); 67 | if (item->keep()) 68 | keep.insert(item); 69 | else 70 | keep.insert(item->children().begin(), item->children().end()); 71 | } 72 | root_iter->second = keep; 73 | 74 | // ensure that the parent is set correctly 75 | for (auto item : root_iter->second) 76 | item->set_parent(root_iter->first, true); 77 | } 78 | } 79 | 80 | void Hierarchy::set_country(const std::string &country, hindex id) 81 | { 82 | if (!m_items.count(id)) 83 | { 84 | std::cout << "Missing country in the database: " << country << " / " << id << "\n"; 85 | for (auto item : root_items()) 86 | if (item->country() == country) 87 | item->print_branch(0); 88 | return; // no country item to re-parent under; falling through would call add_child() on the empty entry that m_items[id] creates 89 | } 90 | auto parent = m_items[id]; 91 | for (auto root_iter = m_root.begin(); root_iter != m_root.end(); ++root_iter) 92 | { 93 | std::set > remove; 94 | for (auto item : root_iter->second) 95 | if (item->country() == country && item->id() != id) 96 | { 97 | parent->add_child(item); 98 | remove.insert(item); 99 | } 100 | std::cout << "Relocated to country: " << country << " - " << remove.size() << "\n"; 101 | for (auto item : remove) 102 | root_iter->second.erase(item); 103 | } 104 | } 105 | 106 | void Hierarchy::finalize() 107 | { 108 | m_root_finalized = root_items(); 109 | sqlid index = 1; 110 | for (auto item : m_root_finalized) 111 | { 112 | index = item->index(index, 0); 113 | item->set_parent(0); 114 | } 115 | 116 | std::cout << "Hierarchy: active items: " << index - 1 117 | << " / cleared items: " << m_items.size() - (index 
- 1) << "\n"; 118 | } 119 | 120 | bool Hierarchy::check_indexing(bool verbose) 121 | { 122 | std::cout << "Check whether all items are indexed\n"; 123 | 124 | m_index_check_failed.clear(); 125 | bool isok = true; 126 | for (auto itemp : m_items) 127 | { 128 | auto item = itemp.second; 129 | if (item->keep(false) && !item->indexed()) 130 | { 131 | isok = false; 132 | if (verbose) 133 | { 134 | std::cout << "\nItem is not included into hierarchy while it should be\n"; 135 | item->print_item(0); 136 | 137 | std::cout << "\nItem part of hierarchy (child -> parent):\n"; 138 | } 139 | 140 | auto i = item; 141 | std::set ids; 142 | for (auto parent = item->parent_id(); parent != 0; parent = i->parent_id()) 143 | { 144 | if (m_items.find(parent) == m_items.end()) 145 | { 146 | // those are not expected to be many, keeping message 147 | std::cout << "\nCannot find parent with ID " << parent << "\n"; 148 | break; 149 | } 150 | 151 | i = m_items[parent]; 152 | 153 | if (verbose) 154 | i->print_item(0); 155 | 156 | if (ids.count(i->id()) > 0) 157 | { 158 | if (verbose) 159 | std::cout << "\nCyclic branch detected\n"; 160 | 161 | m_index_check_failed.insert(ids.begin(), ids.end()); 162 | break; 163 | } 164 | ids.insert(i->id()); 165 | } 166 | if (verbose) 167 | std::cout << "\n\n"; 168 | } 169 | } 170 | 171 | if (isok) 172 | std::cout << "Items indexing check passed\n"; 173 | else 174 | std::cout << "Items indexing check FAILED\n"; 175 | return isok; 176 | } 177 | 178 | void Hierarchy::write(sqlite3pp::database &db) const 179 | { 180 | for (auto item : m_root_finalized) 181 | item->write(db); 182 | } 183 | 184 | std::deque > Hierarchy::root_items() const 185 | { 186 | std::deque > q; 187 | for (auto root_iter = m_root.begin(); root_iter != m_root.end(); ++root_iter) 188 | for (auto item : root_iter->second) 189 | q.push_back(item); 190 | return q; 191 | } 192 | 193 | size_t Hierarchy::get_root_count() const 194 | { 195 | size_t count{ 0 }; 196 | for (auto i : m_root) 197 | 
count += i.second.size(); 198 | return count; 199 | } 200 | 201 | hindex Hierarchy::get_next_nonzero_root_parent() const 202 | { 203 | for (auto root_iter = m_root.begin(); root_iter != m_root.end(); ++root_iter) 204 | if (root_iter->first) 205 | return root_iter->first; 206 | return 0; 207 | } 208 | 209 | std::set Hierarchy::get_root_countries() const 210 | { 211 | std::set missing; 212 | for (auto item : root_items()) 213 | missing.insert(item->country()); 214 | return missing; 215 | } 216 | 217 | bool Hierarchy::has_item(hindex id) const 218 | { 219 | return m_items.count(id); 220 | } 221 | 222 | void Hierarchy::print(bool full) const 223 | { 224 | std::set root_ids; 225 | for (auto item : root_items()) 226 | { 227 | if (full) 228 | item->print_branch(0); 229 | else 230 | item->print_item(0); 231 | root_ids.insert(item->id()); 232 | } 233 | 234 | std::cout << (full ? "\n\n" : "") << "Root items:\n"; 235 | for (auto id : root_ids) 236 | std::cout << id << " "; 237 | std::cout << "\n"; 238 | std::cout << "Root items count: " << get_root_count() << "\n"; 239 | 240 | std::cout << "Countries: "; 241 | for (auto c : get_root_countries()) 242 | std::cout << c << " "; 243 | std::cout << "\n"; 244 | } 245 | 246 | void Hierarchy::print_root_with_parent_id(hindex parent_id) const 247 | { 248 | for (auto item : root_items()) 249 | { 250 | if (item->parent_id() == parent_id) 251 | item->print_item(0); 252 | } 253 | } 254 | -------------------------------------------------------------------------------- /importer/src/hierarchy.h: -------------------------------------------------------------------------------- 1 | #ifndef HIERARCHY_H 2 | #define HIERARCHY_H 3 | 4 | #pragma once 5 | 6 | #include "hierarchyitem.h" 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | class Hierarchy 16 | { 17 | public: 18 | Hierarchy(); 19 | ~Hierarchy(); 20 | 21 | void add_item(std::shared_ptr &item); 22 | bool add_linked_item(std::shared_ptr &item); 23 
// Characters that may appear in a type string; HierarchyItem::keep() rejects any
// item whose type contains a character outside this set. The value is only ever
// read, so it is declared const to rule out accidental modification.
static const std::string allowed_type_chars = "abcdefghijklmnopqrstuvwxyz_-";
m_data_name = parse_to_map(row["name"].as("")); 31 | m_data_extra = parse_to_map(row["extra"].as("")); 32 | 33 | set_names(); 34 | m_key = key(); 35 | } 36 | 37 | static std::set load_list(const std::string &fname) 38 | { 39 | std::set d; 40 | if (fname.empty()) 41 | return d; 42 | 43 | std::ifstream f(fname); 44 | std::string line; 45 | if (!f) 46 | { 47 | std::cerr << "Failed to open a file: " << fname << std::endl; 48 | throw std::runtime_error("File cannot be opened"); 49 | } 50 | 51 | while (std::getline(f, line)) 52 | { 53 | boost::algorithm::trim(line); 54 | if (!line.empty()) 55 | d.insert(line); 56 | } 57 | 58 | return d; 59 | } 60 | 61 | void HierarchyItem::load_priority_list(const std::string &fname) 62 | { 63 | s_priority_types = load_list(fname); 64 | } 65 | 66 | void HierarchyItem::load_skip_list(const std::string &fname) 67 | { 68 | s_skip_types = load_list(fname); 69 | } 70 | 71 | void HierarchyItem::drop() 72 | { 73 | m_dropped = true; 74 | } 75 | 76 | bool HierarchyItem::keep(bool verbose) const 77 | { 78 | if (m_dropped) 79 | return false; 80 | 81 | if (m_type.find_first_not_of(allowed_type_chars) != std::string::npos) 82 | { 83 | if (verbose) 84 | std::cout << "Dropping " << m_type << "\n"; 85 | return false; 86 | } 87 | 88 | if (s_skip_types.count(m_type) > 0) 89 | return false; 90 | 91 | return !m_name.empty() || !m_postcode.empty() || s_priority_types.count(m_type) > 0; 92 | } 93 | 94 | bool HierarchyItem::is_duplicate(std::shared_ptr item) const 95 | { 96 | if (s_priority_types.count(m_type) > 0) 97 | return false; 98 | 99 | if (m_name != item->m_name || m_postcode != item->m_postcode) 100 | return false; 101 | 102 | if (m_type == item->m_type || same_starts_with("building", m_type, item->m_type) 103 | || same_starts_with("highway", m_type, item->m_type)) 104 | return true; 105 | 106 | return false; 107 | } 108 | 109 | std::string HierarchyItem::key() const 110 | { 111 | std::stringstream ss; 112 | 113 | ss << m_name << "-" << m_name_extra 
<< "-" << m_postcode << "-"; 114 | 115 | if (m_type.rfind("building", 0) == 0) 116 | ss << "building"; 117 | else if (m_type.rfind("highway", 0) == 0) 118 | ss << "highway"; 119 | else 120 | ss << m_type; 121 | 122 | if (s_priority_types.count(m_type) > 0) 123 | ss << "-" << m_id; 124 | 125 | return ss.str(); 126 | } 127 | 128 | void HierarchyItem::add_child(std::shared_ptr child) 129 | { 130 | m_children.push_back(child); 131 | child->set_parent(m_id); 132 | } 133 | 134 | void HierarchyItem::add_linked(std::shared_ptr linked) 135 | { 136 | m_data_name.insert(linked->m_data_name.begin(), linked->m_data_name.end()); 137 | m_data_extra.insert(linked->m_data_extra.begin(), linked->m_data_extra.end()); 138 | set_names(); 139 | } 140 | 141 | void HierarchyItem::set_names() 142 | { 143 | m_name = get_with_def(m_data_name, "name"); 144 | m_name_extra.clear(); 145 | if (!m_housenumber.empty()) 146 | { 147 | m_name_extra = m_name; 148 | m_name = m_housenumber; 149 | } 150 | 151 | if (m_name_extra.empty()) 152 | m_name_extra = get_with_def(m_data_extra, "brand"); 153 | } 154 | 155 | void HierarchyItem::set_parent(hindex parent, bool force) 156 | { 157 | if (!force && m_parent_id != parent && m_parent_id != 0 && parent != 0) 158 | { 159 | std::cout << "New parent (" << parent << ") for " << m_id << " does not match old one (" 160 | << m_parent_id << ")\n"; 161 | throw std::runtime_error("Mismatch between new and old parent"); 162 | } 163 | m_parent_id = parent; 164 | // for (auto c : m_children) 165 | // c->set_parent(m_id, force); 166 | } 167 | 168 | void HierarchyItem::cleanup_children(bool duplicate_only) 169 | { 170 | // as a result of this run, children that are supposed to be kept are staying in children 171 | // property. 
all disposed ones are still pointed to via Hierarchy map, but should not be accessed 172 | // while moving along hierarchy for indexing or writing it 173 | if (!duplicate_only) 174 | { 175 | std::deque > children; 176 | for (auto item : m_children) 177 | { 178 | item->cleanup_children(); 179 | if (item->keep()) 180 | children.push_back(item); 181 | else 182 | children.insert(children.end(), item->m_children.begin(), item->m_children.end()); 183 | } 184 | m_children = children; 185 | } 186 | 187 | // print out items with huge amount of children 188 | if (m_children.size() > 10000) 189 | { 190 | print_item(0); 191 | m_children[0]->print_item(3); 192 | } 193 | 194 | // check for duplicates 195 | std::map > children; 196 | for (std::shared_ptr item : m_children) 197 | { 198 | std::string key = item->key(); 199 | auto main_pair = children.find(key); 200 | if (main_pair != children.end()) 201 | { 202 | std::shared_ptr main = main_pair->second; 203 | main->m_children.insert(main->m_children.end(), item->m_children.begin(), 204 | item->m_children.end()); 205 | for (auto &i_children : item->m_children) 206 | i_children->set_parent(main->m_id, true); 207 | item->drop(); 208 | main->cleanup_children(true); 209 | } 210 | else 211 | children[key] = item; 212 | } 213 | 214 | m_children.clear(); 215 | for (auto &iter : children) 216 | m_children.push_back(iter.second); 217 | 218 | // set parent, forced 219 | for (auto item : m_children) 220 | item->set_parent(m_id, true); 221 | } 222 | 223 | sqlid HierarchyItem::index(sqlid idx, sqlid parent) 224 | { 225 | if (!keep()) 226 | throw std::runtime_error("Trying to index a location that was not supposed to be kept"); 227 | m_my_index = idx; 228 | m_parent_index = parent; 229 | ++idx; 230 | for (auto item : m_children) 231 | idx = item->index(idx, m_my_index); 232 | m_last_child_index = idx - 1; 233 | return idx; 234 | } 235 | 236 | void HierarchyItem::write(sqlite3pp::database &db) const 237 | { 238 | if (!keep()) 239 | throw 
std::runtime_error("Trying to write a location that was not supposed to be kept"); 240 | 241 | // primary data 242 | std::string name_en = get_with_def(m_data_name, "name:en"); 243 | std::string phone = get_with_def(m_data_extra, "phone"); 244 | std::string website = get_with_def(m_data_extra, "website"); 245 | 246 | { 247 | sqlite3pp::command cmd(db, 248 | "INSERT INTO object_primary_tmp (id, postgres_id, name, name_extra, " 249 | "name_en, phone, postal_code, website, parent, longitude, " 250 | "latitude, search_rank) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)"); 251 | cmd.binder() << m_my_index << (int)m_id << m_name << m_name_extra << name_en << phone 252 | << m_postcode << website << m_parent_index << m_longitude << m_latitude 253 | << m_search_rank; 254 | if (cmd.execute() != SQLITE_OK) 255 | std::cerr << "WriteSQL : error inserting primary data for " << m_id << ", " << m_my_index 256 | << "\n"; 257 | } 258 | 259 | // type 260 | { 261 | std::string command 262 | //= "INSERT INTO object_type_tmp (prim_id, type) VALUES (?, \"" + type + "\")"; 263 | = "INSERT INTO object_type_tmp (prim_id, type) VALUES (?, ?)"; 264 | sqlite3pp::command cmd(db, command.c_str()); 265 | cmd.binder() << m_my_index << m_type; 266 | if (cmd.execute() != SQLITE_OK) 267 | std::cerr << "WriteSQL: error inserting type for " << m_id << ", " << m_my_index << "\n"; 268 | } 269 | 270 | // hierarchy 271 | if (m_last_child_index > m_my_index) 272 | { 273 | sqlite3pp::command cmd(db, "INSERT INTO hierarchy (prim_id, last_subobject) VALUES (?, ?)"); 274 | cmd.binder() << m_my_index << m_last_child_index; 275 | if (cmd.execute() != SQLITE_OK) 276 | std::cerr << "WriteSQL: error inserting hierarchy for " << m_id << ", " << m_my_index 277 | << " - " << m_last_child_index << "\n"; 278 | } 279 | 280 | // children 281 | for (const auto &c : m_children) 282 | c->write(db); 283 | } 284 | 285 | void HierarchyItem::print_item(unsigned int offset) const 286 | { 287 | std::cout << std::string(offset, ' ') 
<< "- " << m_id << " "; 288 | if (!m_housenumber.empty()) 289 | std::cout << "house " << m_housenumber << " "; 290 | std::cout << m_name << " "; 291 | std::cout << "(" << m_my_index << " " << m_last_child_index << ": " << m_children.size() << ": " 292 | << m_parent_id << ", " << m_country << ", osmid=" << m_osm_id << ", " << m_key << ")\n"; 293 | } 294 | 295 | void HierarchyItem::print_branch(unsigned int offset) const 296 | { 297 | print_item(offset); 298 | if (m_children.size()) 299 | std::cout << std::string(offset + 2, ' ') << "|\n"; 300 | for (auto c : m_children) 301 | c->print_branch(offset + 3); 302 | } 303 | -------------------------------------------------------------------------------- /importer/src/hierarchyitem.h: -------------------------------------------------------------------------------- 1 | #ifndef HIERARCHYITEM_H 2 | #define HIERARCHYITEM_H 3 | 4 | #include "config.h" 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | class HierarchyItem 14 | { 15 | public: 16 | HierarchyItem(const pqxx::row &row); 17 | ~HierarchyItem(){}; 18 | 19 | hindex id() const { return m_id; } 20 | hindex linked_id() const { return m_linked_id; } 21 | hindex parent_id() const { return m_parent_id; } 22 | const std::string &country() const { return m_country; } 23 | bool keep(bool verbose = true) const; 24 | bool indexed() const { return m_my_index > 0; } 25 | 26 | void drop(); 27 | bool dropped() const { return m_dropped; } 28 | 29 | const std::deque > &children() { return m_children; } 30 | 31 | void add_child(std::shared_ptr child); 32 | void add_linked(std::shared_ptr linked); 33 | void set_parent(hindex parent, bool force = false); 34 | void cleanup_children(bool duplicate_only = false); 35 | sqlid index(sqlid idx, sqlid parent); 36 | void write(sqlite3pp::database &db) const; 37 | 38 | void print_item(unsigned int offset) const; 39 | void print_branch(unsigned int offset) const; 40 | 41 | public: 42 | static void 
load_priority_list(const std::string &fname); 43 | static void load_skip_list(const std::string &fname); 44 | 45 | static std::set get_priority_list() { return s_priority_types; } 46 | 47 | protected: 48 | void set_names(); 49 | bool is_duplicate(std::shared_ptr item) const; 50 | std::string key() const; 51 | 52 | private: 53 | hindex m_id; 54 | hindex m_linked_id{ 0 }; 55 | hindex m_parent_id; 56 | sqlid m_my_index{ 0 }; 57 | sqlid m_parent_index{ 0 }; 58 | sqlid m_last_child_index{ 0 }; 59 | bool m_dropped{ false }; 60 | std::string m_key; 61 | 62 | std::string m_type; 63 | float m_latitude; 64 | float m_longitude; 65 | uint64_t m_osm_id; 66 | std::string m_country; 67 | std::string m_postcode; 68 | std::string m_housenumber; 69 | std::string m_name; 70 | std::string m_name_extra; 71 | int m_search_rank; 72 | 73 | std::map m_data_name; 74 | std::map m_data_extra; 75 | 76 | std::deque > m_children; 77 | 78 | static std::set s_priority_types; 79 | static std::set s_skip_types; 80 | }; 81 | 82 | #endif 83 | -------------------------------------------------------------------------------- /importer/src/main.cpp: -------------------------------------------------------------------------------- 1 | ///////////////////////////////////////////////////////////////////// 2 | /// WARNING: When adding new languages, increase the language count 3 | /// num_languages in geocoder.c 4 | ///////////////////////////////////////////////////////////////////// 5 | 6 | #include "config.h" 7 | #include "geocoder.h" 8 | #include "hierarchy.h" 9 | #include "normalization.h" 10 | #include "version.h" 11 | 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | 28 | using json = nlohmann::json; 29 | namespace po = boost::program_options; 30 | 31 | //////////////////////////////////////////////////////////////////////////// 32 | // MAIN 
33 | 34 | int main(int argc, char *argv[]) 35 | { 36 | const int printout_step = 100000; 37 | 38 | std::string polyjson; 39 | std::string database_path; 40 | std::string postal_country_parser; 41 | std::string postal_address_parser_dir; 42 | std::string type_priority_list; 43 | std::string type_skip_list; 44 | std::string log_errors_to_file; 45 | bool verbose_address_expansion = false; 46 | 47 | { 48 | po::options_description generic("Geocoder NLP importer options"); 49 | generic.add_options()("help,h", "Help message")("version,v", "Version"); 50 | generic.add_options()("version-data", "Version of the data file"); 51 | generic.add_options()("poly,p", po::value(&polyjson), 52 | "Boundary of the imported region in GeoJSON format"); 53 | generic.add_options()("postal-country", po::value(&postal_country_parser), 54 | "libpostal country preference for this database"); 55 | generic.add_options()( 56 | "postal-address", po::value(&postal_address_parser_dir), 57 | "libpostal address parser directory. If not specified, global libpostal parser directory " 58 | "preference is used."); 59 | generic.add_options()( 60 | "priority", po::value(&type_priority_list), 61 | "File with OSM tags that are kept even if there is no name associated with the location"); 62 | generic.add_options()( 63 | "skip", po::value(&type_skip_list), 64 | "File with OSM tags for locations that should be dropped even if there is a name " 65 | "associated with the location"); 66 | generic.add_options()( 67 | "log-errors-to-file", po::value(&log_errors_to_file), 68 | "Log errors to file and continue import. 
File name given as an argument of this option."); 69 | generic.add_options()("verbose", "Verbose address expansion"); 70 | 71 | po::options_description hidden("Hidden options"); 72 | hidden.add_options()("output-directory", po::value(&database_path), 73 | "Output directory for imported database"); 74 | 75 | po::positional_options_description p; 76 | p.add("output-directory", 1); 77 | 78 | po::options_description cmdline_options; 79 | cmdline_options.add(generic).add(hidden); 80 | 81 | po::variables_map vm; 82 | try 83 | { 84 | po::store(po::command_line_parser(argc, argv).options(cmdline_options).positional(p).run(), 85 | vm); 86 | po::notify(vm); 87 | } 88 | catch (std::exception &e) 89 | { 90 | std::cerr << "Error while parsing options: " << e.what() << "\n\n"; 91 | std::cerr << generic << "\n"; 92 | } 93 | 94 | if (vm.count("help")) 95 | { 96 | std::cout << "Geocoder NLP importer:\n\n" 97 | << "Call as\n\n " << argv[0] << " output-directory\n" 98 | << "\nwhere output-directory is a directory for imported database.\n\n" 99 | << generic << "\n"; 100 | return 0; 101 | } 102 | 103 | if (vm.count(("version"))) 104 | { 105 | std::cout << "Geocoder NLP version: " << GEOCODERNLP_VERSION_STRING << "\n"; 106 | std::cout << "Data format version: " << GeoNLP::Geocoder::version << "\n"; 107 | return 0; 108 | } 109 | 110 | if (vm.count(("version-data"))) 111 | { 112 | std::cout << GeoNLP::Geocoder::version << "\n"; 113 | return 0; 114 | } 115 | 116 | if (vm.count("verbose")) 117 | verbose_address_expansion = true; 118 | 119 | if (!vm.count("poly")) 120 | { 121 | std::cerr << "Boundary of the imported region in GeoJSON format is missing\n"; 122 | return -1; 123 | } 124 | } 125 | 126 | // load GeoJSON for surrounding (multi)polygon from poly.json 127 | std::string border; 128 | { 129 | std::ifstream fin(polyjson); 130 | if (!fin) 131 | { 132 | std::cerr << "Failed to open " << polyjson << "\n"; 133 | return -2; 134 | } 135 | 136 | std::istreambuf_iterator begin(fin), end; 137 | 
std::string b(begin, end); 138 | border = b; 139 | } 140 | 141 | if (border.size()) 142 | { 143 | json j = json::parse(border); 144 | border = j["geometry"].dump(); 145 | std::cout << "Loaded border GeoJSON. Geometry string length: " << border.size() << "\n"; 146 | } 147 | 148 | HierarchyItem::load_priority_list(type_priority_list); 149 | HierarchyItem::load_skip_list(type_skip_list); 150 | 151 | Hierarchy hierarchy; 152 | 153 | std::string postgres_dblink; 154 | const char *env = std::getenv(GEOCODER_IMPORTER_POSTGRES); 155 | if (env) 156 | postgres_dblink = env; 157 | else 158 | { 159 | std::cout << "Please specify PostgreSQL connection string using environment variable " 160 | << GEOCODER_IMPORTER_POSTGRES << "\n"; 161 | return -1; 162 | } 163 | 164 | std::cout << "Postgres connection: " << postgres_dblink << std::endl; 165 | 166 | pqxx::connection pgc{ postgres_dblink }; 167 | pqxx::work txn{ pgc }; 168 | 169 | // Here, 1000 is used to scale search_rank. Same factor is used in Geocoder::search 170 | const std::string base_query 171 | = R"SQL( 172 | select pl.place_id, linked_place_id, parent_place_resolved as parent_place_id, pl.country_code, class, type, 173 | hstore_to_json(name) as name, hstore_to_json(extratags) as extra, 174 | COALESCE(address->'housenumber',housenumber) AS housenumber, 175 | postcode, ST_X(pl.centroid) as longitude, ST_Y(pl.centroid) as latitude, 176 | CASE 177 | WHEN sname.importance IS NOT NULL THEN (1000*(1-sname.importance))::int 178 | ELSE 1000 + pl.rank_search 179 | END AS search_rank, 180 | osm_type, osm_id 181 | from placex pl 182 | left join search_name as sname on sname.place_id=pl.place_id 183 | left join lateral 184 | (select address_place_id from place_addressline a where a.place_id=pl.place_id 185 | order by cached_rank_address desc, fromarea desc, distance asc limit 1) as a on true 186 | left join lateral 187 | (with recursive prec as 188 | (select place_id, linked_place_id from placex where 
COALESCE(a.address_place_id,pl.parent_place_id)=placex.place_id 189 | union select p.place_id, p.linked_place_id from placex p join prec on p.place_id=prec.linked_place_id) 190 | select place_id as parent_place_resolved from prec where linked_place_id is null limit 1) as pres on true 191 | )SQL"; 192 | 193 | // load primary hierarchy 194 | { 195 | std::vector types_enclosed; 196 | for (const auto &i : HierarchyItem::get_priority_list()) 197 | types_enclosed.push_back("'" + i + "'"); 198 | 199 | std::string priority_types_condition 200 | = "class || '_' || type IN (" + boost::algorithm::join(types_enclosed, ",") + ")"; 201 | 202 | pqxx::result r = txn.exec_params( 203 | base_query 204 | + "where linked_place_id IS NULL and " 205 | "ST_Intersects(ST_GeomFromGeoJSON($1), geometry) and " 206 | "class ~ '^[a-z_-]+$' and (type~'^[a-z_-]+$' or ((type<>'') is not true)) and " 207 | "(name->'name'<>'' or housenumber<>'' or " 208 | + priority_types_condition + ")", 209 | border); 210 | size_t count = 0; 211 | for (const pqxx::row &row : r) 212 | { 213 | ++count; 214 | std::shared_ptr item = std::make_shared(row); 215 | if (!item->keep()) 216 | continue; 217 | hierarchy.add_item(item); 218 | if (count % printout_step == 0) 219 | std::cout << "Imported records: " << count 220 | << "; Root elements: " << hierarchy.get_root_count() 221 | << "; Missing parents: " << hierarchy.get_missing_count() << std::endl; 222 | } 223 | } 224 | 225 | // load all linked places and merge with the primary ones 226 | { 227 | pqxx::result r = txn.exec_params( 228 | base_query 229 | + "where linked_place_id IS NOT NULL and ST_Intersects(ST_GeomFromGeoJSON($1), " 230 | "geometry)", 231 | border); 232 | size_t count = 0; 233 | size_t failed = 0; 234 | for (const pqxx::row &row : r) 235 | { 236 | ++count; 237 | std::shared_ptr item = std::make_shared(row); 238 | if (!hierarchy.add_linked_item(item)) 239 | failed++; 240 | if (count % printout_step == 0) 241 | std::cout << "Imported linked records: " << 
count 242 | << "; Root elements: " << hierarchy.get_root_count() 243 | << "; Missing parents: " << hierarchy.get_missing_count() << std::endl; 244 | } 245 | std::cout << "Imported linked records: " << count << " / failed to import: " << failed 246 | << "; Root elements: " << hierarchy.get_root_count() 247 | << "; Missing parents: " << hierarchy.get_missing_count() << std::endl; 248 | } 249 | 250 | // load centroids for postal codes 251 | const std::string postal_codes_query 252 | = R"SQL( 253 | select pc.place_id, NULL as linked_place_id, parent_place_resolved as parent_place_id, country_code, 'postal' as class, 'code' as type, 254 | NULL as name, NULL as extra, NULL as housenumber, 255 | postcode, ST_X(geometry) as longitude, ST_Y(geometry) as latitude, 256 | 100 AS search_rank, 257 | NULL as osm_type, NULL as osm_id 258 | from location_postcode pc 259 | left join lateral 260 | (with recursive prec as 261 | (select place_id, linked_place_id from placex where pc.parent_place_id=placex.place_id 262 | union select p.place_id, p.linked_place_id from placex p join prec on p.place_id=prec.linked_place_id) 263 | select place_id as parent_place_resolved from prec where linked_place_id is null limit 1) as pres on true 264 | )SQL"; 265 | { 266 | pqxx::result r = txn.exec_params(postal_codes_query 267 | + "where " 268 | "ST_Intersects(ST_GeomFromGeoJSON($1), geometry)", 269 | border); 270 | 271 | size_t count = 0; 272 | for (const pqxx::row &row : r) 273 | { 274 | ++count; 275 | std::shared_ptr item = std::make_shared(row); 276 | if (!item->keep()) 277 | { 278 | std::cout << "Dropping postcode?" 
279 | << "\n"; 280 | item->print_item(0); 281 | continue; 282 | } 283 | hierarchy.add_item(item); 284 | if (count % printout_step == 0) 285 | std::cout << "Imported postal codes: " << count 286 | << "; Root elements: " << hierarchy.get_root_count() 287 | << "; Missing parents: " << hierarchy.get_missing_count() << std::endl; 288 | } 289 | } 290 | 291 | // find missing parents for root nodes 292 | std::cout << "Fill missing hierarchies. Root size: " << hierarchy.get_root_count() << "\n"; 293 | for (hindex parent = hierarchy.get_next_nonzero_root_parent(); parent; 294 | parent = hierarchy.get_next_nonzero_root_parent()) 295 | { 296 | pqxx::result r = txn.exec_params(base_query + "where pl.place_id=$1", parent); 297 | bool found = false; 298 | for (auto row : r) 299 | { 300 | std::shared_ptr item = std::make_shared(row); 301 | hierarchy.add_item(item); 302 | found = true; 303 | } 304 | 305 | if (!found) 306 | { 307 | std::cerr << "Missing parent with ID " << parent << " . Stopping import\n"; 308 | hierarchy.print_root_with_parent_id(parent); 309 | std::cerr << "\nSQL:\n" << base_query + "where pl.place_id=" << parent << "\n"; 310 | 311 | return -1; 312 | } 313 | } 314 | 315 | // remove all items from hierarchy that are not supposed to be there 316 | std::cout << "Cleanup hierarchy\n"; 317 | hierarchy.cleanup(); 318 | 319 | // find missing countries and move root nodes under them if possible 320 | std::cout << "Try to fill missing parents through countries. 
Root size: " 321 | << hierarchy.get_root_count() << "\n"; 322 | for (std::string country : hierarchy.get_root_countries()) 323 | { 324 | for (auto row : txn.exec_params( 325 | base_query + "where pl.rank_address = 4 and pl.country_code = $1 limit 1", country)) 326 | { 327 | hindex id = row["place_id"].as(0); 328 | if (!hierarchy.has_item(id)) 329 | { 330 | std::shared_ptr item = std::make_shared(row); 331 | hierarchy.add_item(item); 332 | } 333 | hierarchy.set_country(country, id); 334 | } 335 | } 336 | 337 | hierarchy.finalize(); 338 | if (!hierarchy.check_indexing()) 339 | { 340 | std::set problem = hierarchy.get_failed_indexes(); 341 | for (hindex index : problem) 342 | txn.exec_params0("update placex set indexed_status=2 where place_id=$1", index); 343 | 344 | txn.commit(); 345 | 346 | std::cout << "Requested to reindex Nominatim database (run nominatim index) for " 347 | << problem.size() << " records\n"; 348 | 349 | if (log_errors_to_file.empty()) 350 | return -3; 351 | else 352 | { 353 | std::ofstream flog(log_errors_to_file); 354 | flog << problem.size() << " records were not indexed while they should have been.\n"; 355 | } 356 | } 357 | 358 | txn.commit(); // finalize postgres transactions 359 | 360 | // hierarchy.print(false); 361 | 362 | // Saving data into SQLite 363 | sqlite3pp::database db(GeoNLP::Geocoder::name_primary(database_path).c_str()); 364 | 365 | db.execute("PRAGMA journal_mode = OFF"); 366 | db.execute("PRAGMA synchronous = OFF"); 367 | db.execute("PRAGMA cache_size = 2000000"); 368 | db.execute("PRAGMA temp_store = 2"); 369 | db.execute("BEGIN TRANSACTION"); 370 | db.execute("DROP TABLE IF EXISTS type"); 371 | db.execute("DROP TABLE IF EXISTS object_primary"); 372 | db.execute("DROP TABLE IF EXISTS object_primary_tmp"); 373 | db.execute("DROP TABLE IF EXISTS object_primary_tmp2"); 374 | db.execute("DROP TABLE IF EXISTS boxids"); 375 | db.execute("DROP TABLE IF EXISTS object_type"); 376 | db.execute("DROP TABLE IF EXISTS object_type_tmp"); 
377 | db.execute("DROP TABLE IF EXISTS hierarchy"); 378 | db.execute("DROP TABLE IF EXISTS object_primary_rtree"); 379 | 380 | db.execute("CREATE " TEMPORARY " TABLE object_primary_tmp (" 381 | "id INTEGER PRIMARY KEY AUTOINCREMENT, postgres_id INTEGER, name TEXT, name_extra " 382 | "TEXT, name_en TEXT, phone TEXT, postal_code TEXT, website TEXT, parent INTEGER, " 383 | "search_rank INTEGER, " 384 | "latitude REAL, longitude REAL)"); 385 | db.execute("CREATE " TEMPORARY " TABLE object_type_tmp (prim_id INTEGER, type TEXT NOT NULL, " 386 | "FOREIGN KEY (prim_id) REFERENCES objects_primary_tmp(id))"); 387 | db.execute("CREATE TABLE hierarchy (prim_id INTEGER PRIMARY KEY, last_subobject INTEGER, " 388 | "FOREIGN KEY (prim_id) REFERENCES objects_primary(id), FOREIGN KEY (last_subobject) " 389 | "REFERENCES objects_primary(id))"); 390 | 391 | std::cout << "Preliminary filling of the database" << std::endl; 392 | hierarchy.write(db); 393 | 394 | // cleanup from duplicated names 395 | db.execute("UPDATE object_primary_tmp SET name_extra='' WHERE name=name_extra"); 396 | db.execute("UPDATE object_primary_tmp SET name_en='' WHERE name=name_en"); 397 | 398 | std::cout << "Reorganizing database tables" << std::endl; 399 | 400 | db.execute("CREATE TABLE type (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT)"); 401 | db.execute("INSERT INTO type (name) SELECT DISTINCT type FROM object_type_tmp"); 402 | db.execute("CREATE " TEMPORARY 403 | " TABLE object_primary_tmp2 (id INTEGER PRIMARY KEY AUTOINCREMENT, " 404 | "name TEXT, name_extra TEXT, name_en TEXT, phone TEXT, postal_code TEXT, website " 405 | "TEXT, parent INTEGER, type_id INTEGER, latitude REAL, longitude REAL, " 406 | "search_rank INTEGER, boxstr TEXT, " 407 | "FOREIGN KEY (type_id) REFERENCES type(id))"); 408 | 409 | db.execute("INSERT INTO object_primary_tmp2 (id, name, name_extra, name_en, phone, postal_code, " 410 | "website, parent, type_id, latitude, longitude, search_rank, boxstr) " 411 | "SELECT p.id, p.name, 
p.name_extra, p.name_en, p.phone, p.postal_code, p.website, " 412 | "p.parent, type.id, p.latitude, p.longitude, p.search_rank, " 413 | // LINE BELOW DETERMINES ROUNDING USED FOR BOXES 414 | "CAST(CAST(p.latitude*100 AS INTEGER) AS TEXT) || ',' || CAST(CAST(p.longitude*100 AS " 415 | "INTEGER) AS TEXT) " 416 | "FROM object_primary_tmp p JOIN object_type_tmp tt ON p.id=tt.prim_id " 417 | "JOIN type ON tt.type=type.name"); 418 | 419 | db.execute("CREATE " TEMPORARY " TABLE boxids (id INTEGER PRIMARY KEY AUTOINCREMENT, boxstr " 420 | "TEXT, CONSTRAINT struni UNIQUE (boxstr))"); 421 | db.execute("INSERT INTO boxids (boxstr) SELECT DISTINCT boxstr FROM object_primary_tmp2"); 422 | 423 | db.execute("CREATE TABLE object_primary (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT, " 424 | "name_extra TEXT, name_en TEXT, phone TEXT, postal_code TEXT, website TEXT, " 425 | "parent INTEGER, type_id INTEGER, latitude REAL, longitude REAL, search_rank INTEGER, " 426 | "box_id INTEGER, " 427 | "FOREIGN KEY (type_id) REFERENCES type(id))"); 428 | db.execute( 429 | "INSERT INTO object_primary (id, name, name_extra, name_en, phone, postal_code, website, " 430 | "parent, type_id, latitude, longitude, search_rank, box_id) " 431 | "SELECT o.id, name, name_extra, name_en, phone, postal_code, website, parent, type_id, " 432 | "latitude, longitude, search_rank, b.id FROM object_primary_tmp2 o JOIN boxids b ON " 433 | "o.boxstr=b.boxstr"); 434 | 435 | db.execute("DROP INDEX IF EXISTS idx_object_primary_box"); 436 | db.execute("CREATE INDEX idx_object_primary_box ON object_primary (box_id)"); 437 | 438 | db.execute("DROP INDEX IF EXISTS idx_object_primary_postal_code"); 439 | db.execute("CREATE INDEX idx_object_primary_postal_code ON object_primary (postal_code)"); 440 | 441 | std::cout << "Normalize using libpostal" << std::endl; 442 | 443 | normalize_libpostal(db, postal_address_parser_dir, verbose_address_expansion); 444 | normalized_to_final(db, database_path); 445 | 446 | // Create 
R*Tree for nearest neighbor search 447 | std::cout << "Populating R*Tree" << std::endl; 448 | db.execute( 449 | "CREATE VIRTUAL TABLE object_primary_rtree USING rtree(id, minLat, maxLat, minLon, maxLon)"); 450 | db.execute("INSERT INTO object_primary_rtree (id, minLat, maxLat, minLon, maxLon) " 451 | "SELECT box_id, min(latitude), max(latitude), min(longitude), max(longitude) from " 452 | "object_primary group by box_id"); 453 | 454 | // Stats view 455 | db.execute("DROP VIEW IF EXISTS type_stats"); 456 | db.execute( 457 | "CREATE VIEW type_stats AS SELECT t.name as type_name, COUNT(*) AS cnt FROM object_primary o " 458 | "JOIN \"type\" t ON t.id = o.type_id GROUP BY t.name ORDER BY cnt desc"); 459 | { 460 | std::cout << "List of most popular imported types\n"; 461 | sqlite3pp::query qry(db, "SELECT type_name, cnt FROM type_stats ORDER BY cnt DESC LIMIT 25"); 462 | for (auto v : qry) 463 | { 464 | std::string name; 465 | int cnt; 466 | v.getter() >> name >> cnt; 467 | std::cout << " " << name << "\t" << cnt << "\n"; 468 | } 469 | } 470 | // Recording version 471 | db.execute("DROP TABLE IF EXISTS meta"); 472 | db.execute("CREATE TABLE meta (key TEXT, value TEXT)"); 473 | { 474 | sqlite3pp::command cmd(db, "INSERT INTO meta (key, value) VALUES (?, ?)"); 475 | std::ostringstream ss; 476 | ss << GeoNLP::Geocoder::version; 477 | cmd.binder() << "version" << ss.str().c_str(); 478 | if (cmd.execute() != SQLITE_OK) 479 | std::cerr << "WriteSQL: error inserting version information\n"; 480 | } 481 | 482 | if (!postal_country_parser.empty()) 483 | { 484 | std::cout << "Recording postal parser country preference: " << postal_country_parser << "\n"; 485 | std::string cmd = "INSERT INTO meta (key, value) VALUES (\"postal:country:parser\", \"" 486 | + postal_country_parser + "\")"; 487 | db.execute(cmd.c_str()); 488 | } 489 | 490 | // finalize 491 | db.execute("END TRANSACTION"); 492 | db.execute("VACUUM"); 493 | db.execute("ANALYZE"); 494 | 495 | std::cout << "Done\n"; 496 | 
497 | return 0; 498 | } 499 | -------------------------------------------------------------------------------- /importer/src/normalization.cpp: -------------------------------------------------------------------------------- 1 | #include "normalization.h" 2 | #include "config.h" 3 | #include "geocoder.h" 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | //////////////////////////////////////////////////////////////////////////// 10 | /// Libpostal normalization with search string expansion 11 | void normalize_libpostal(sqlite3pp::database &db, std::string address_expansion_dir, bool verbose) 12 | { 13 | struct tonorm 14 | { 15 | std::string name; 16 | sqlid id; 17 | }; 18 | 19 | std::deque data; 20 | sqlite3pp::query qry(db, "SELECT id, name, name_extra, name_en FROM object_primary_tmp"); 21 | for (auto v : qry) 22 | { 23 | tonorm d; 24 | sqlid id; 25 | char const *name, *name_extra, *name_en; 26 | v.getter() >> id >> name >> name_extra >> name_en; 27 | 28 | if (name == nullptr) 29 | continue; // no need to add empty name into search index 30 | 31 | d.id = id; 32 | 33 | d.name = name; 34 | data.push_back(d); 35 | if (name_extra) 36 | { 37 | d.name = name_extra; 38 | data.push_back(d); 39 | } 40 | if (name_en) 41 | { 42 | d.name = name_en; 43 | data.push_back(d); 44 | } 45 | } 46 | 47 | // make a new table for normalized names 48 | db.execute("DROP TABLE IF EXISTS normalized_name"); 49 | db.execute( 50 | "CREATE " TEMPORARY 51 | " TABLE normalized_name (prim_id INTEGER, name TEXT NOT NULL, PRIMARY KEY (name, prim_id))"); 52 | 53 | // load libpostal 54 | if (!libpostal_setup() || !libpostal_setup_language_classifier()) 55 | { 56 | std::cerr << "Failure to load libpostal" << std::endl; 57 | return; 58 | } 59 | 60 | std::vector aed(address_expansion_dir.begin(), address_expansion_dir.end()); 61 | aed.push_back(0); 62 | if ((address_expansion_dir.empty() && !libpostal_setup_parser()) 63 | || (!address_expansion_dir.empty() && 
!libpostal_setup_parser_datadir(aed.data()))) 64 | { 65 | std::cerr << "Failure to load libpostal parser" << std::endl; 66 | return; 67 | } 68 | 69 | // normalize all names 70 | size_t num_expansions; 71 | size_t num_doubles_dropped = 0; 72 | libpostal_normalize_options_t options = libpostal_get_default_options(); 73 | std::vector charbuff; 74 | for (tonorm &d : data) 75 | { 76 | charbuff.resize(d.name.length() + 1); 77 | std::copy(d.name.c_str(), d.name.c_str() + d.name.length() + 1, charbuff.begin()); 78 | 79 | if (verbose) 80 | std::cout << d.name << ": " << std::flush; 81 | 82 | // check for sanity before we proceed with expansion 83 | if (d.name.length() > LENGTH_STARTING_SUSP_CHECK) 84 | { 85 | size_t digits_space = 0; 86 | for (size_t i = 0; i < d.name.length(); ++i) 87 | if (std::isdigit(charbuff[i]) || std::isspace(charbuff[i])) 88 | digits_space++; 89 | 90 | if ((digits_space * 1.0) / d.name.length() > 0.5) 91 | { 92 | std::cout << "Warning: dropping suspicious name: " << d.name << "\n"; 93 | continue; 94 | } 95 | } 96 | 97 | // check if there are too many commas 98 | if (std::count(d.name.begin(), d.name.end(), ',') > MAX_COMMAS) 99 | { 100 | std::cout << "Warning: dropping suspicious name - too many commas: " << d.name << "\n"; 101 | continue; 102 | } 103 | 104 | // insert normalized, but not expanded string 105 | { 106 | char *normalized = libpostal_normalize_string(charbuff.data(), 107 | LIBPOSTAL_NORMALIZE_DEFAULT_STRING_OPTIONS); 108 | if (normalized != NULL) 109 | { 110 | sqlite3pp::command cmd(db, "INSERT INTO normalized_name (prim_id, name) VALUES (?,?)"); 111 | std::string s = normalized; 112 | cmd.binder() << d.id << s; 113 | if (cmd.execute() != SQLITE_OK) 114 | { 115 | // std::cerr << "Error inserting: " << d.id << " " << s << std::endl; 116 | num_doubles_dropped++; 117 | } 118 | 119 | free(normalized); 120 | } 121 | } 122 | 123 | char **expansions = libpostal_expand_address(charbuff.data(), options, &num_expansions); 124 | 125 | if 
(num_expansions > MAX_NUMBER_OF_EXPANSIONS) 126 | { 127 | std::cout << "Warning: large number [" << num_expansions 128 | << "] of normalization expansions of " << d.name 129 | << " - dropping it from the table [" << d.id << "]\n"; 130 | // for (size_t i=0; i < 10 && i < num_expansions; i++) 131 | // std::cout << " example expansion: " << expansions[i] << "\n"; 132 | // std::cout << "\n"; 133 | 134 | continue; // don't insert it, its probably wrong anyway 135 | } 136 | 137 | for (size_t i = 0; i < num_expansions; i++) 138 | { 139 | sqlite3pp::command cmd(db, "INSERT INTO normalized_name (prim_id, name) VALUES (?,?)"); 140 | std::string s = expansions[i]; 141 | cmd.binder() << d.id << s; 142 | if (cmd.execute() != SQLITE_OK) 143 | { 144 | // std::cerr << "Error inserting: " << d.id << " " << s << std::endl; 145 | num_doubles_dropped++; 146 | } 147 | 148 | // to cover the street names that have Dr. or the firstname 149 | // in the front of the mainly used name, add substrings into 150 | // the normalized table as well 151 | const size_t max_substrings = 2; 152 | size_t pos = 1; 153 | for (size_t sbs = 0; sbs < max_substrings && pos < s.length(); ++sbs) 154 | { 155 | bool spacefound = false; 156 | for (; pos < s.length(); ++pos) 157 | { 158 | char c = s[pos]; 159 | if (c == ' ') 160 | spacefound = true; 161 | if (spacefound && c != ' ') 162 | break; 163 | } 164 | 165 | if (pos < s.length()) 166 | { 167 | try 168 | { 169 | sqlite3pp::command cmd( 170 | db, "INSERT INTO normalized_name (prim_id, name) VALUES (?,?)"); 171 | std::string s = expansions[i]; 172 | cmd.binder() << d.id << s.substr(pos); 173 | if (cmd.execute() != SQLITE_OK) 174 | { 175 | // std::cerr << "Error inserting: " << d.id << " " << s << std::endl; 176 | num_doubles_dropped++; 177 | } 178 | } 179 | catch (sqlite3pp::database_error &e) 180 | { 181 | num_doubles_dropped++; 182 | } 183 | } 184 | } 185 | } 186 | 187 | // Free expansions 188 | libpostal_expansion_array_destroy(expansions, num_expansions); 
189 | 190 | if (verbose) 191 | std::cout << "done" << std::endl; 192 | } 193 | 194 | std::cout << "Redundant records skipped: " << num_doubles_dropped << "\n"; 195 | 196 | // Teardown libpostal 197 | libpostal_teardown_parser(); 198 | libpostal_teardown(); 199 | libpostal_teardown_language_classifier(); 200 | } 201 | 202 | //////////////////////////////////////////////////////////////////////////// 203 | /// Libpostal normalization with search string expansion 204 | void normalized_to_final(sqlite3pp::database &db, std::string path) 205 | { 206 | std::cout << "Inserting normalized data into MARISA trie" << std::endl; 207 | 208 | marisa::Keyset keyset; 209 | 210 | { 211 | sqlite3pp::query qry(db, "SELECT name FROM normalized_name"); 212 | for (auto v : qry) 213 | { 214 | std::string name; 215 | v.getter() >> name; 216 | keyset.push_back(name.c_str()); 217 | } 218 | } 219 | 220 | marisa::Trie trie; 221 | trie.build(keyset); 222 | trie.save(GeoNLP::Geocoder::name_normalized_trie(path).c_str()); 223 | 224 | struct norm 225 | { 226 | std::string name; 227 | sqlid prim_id; 228 | }; 229 | 230 | std::deque data; 231 | { 232 | sqlite3pp::query qry(db, "SELECT name, prim_id FROM normalized_name"); 233 | for (auto v : qry) 234 | { 235 | norm d; 236 | v.getter() >> d.name >> d.prim_id; 237 | data.push_back(d); 238 | } 239 | } 240 | 241 | std::map > bdata; 242 | for (auto d : data) 243 | { 244 | marisa::Agent agent; 245 | agent.set_query(d.name.c_str()); 246 | if (trie.lookup(agent)) 247 | { 248 | GeoNLP::Geocoder::index_id_key k = agent.key().id(); 249 | if (bdata.count(k) == 0) 250 | bdata[k] = std::vector(); 251 | bdata[k].push_back(d.prim_id); 252 | } 253 | else 254 | { 255 | std::cerr << "Error: cannot find in MARISA trie: " << d.name << std::endl; 256 | } 257 | } 258 | 259 | { 260 | // create the database object 261 | kyotocabinet::HashDB db; 262 | 263 | db.tune_options(kyotocabinet::HashDB::TSMALL | kyotocabinet::HashDB::TLINEAR); 264 | db.tune_alignment(0); 265 | 
db.tune_defrag(8); 266 | 267 | // open the database 268 | if (!db.open(GeoNLP::Geocoder::name_normalized_id(path).c_str(), 269 | kyotocabinet::HashDB::OWRITER | kyotocabinet::HashDB::OCREATE)) 270 | { 271 | std::cerr << "open error: " << db.error().name() << std::endl; 272 | return; 273 | } 274 | 275 | std::vector keys; 276 | for (auto a : bdata) 277 | keys.push_back(GeoNLP::Geocoder::make_id_key(a.first)); 278 | 279 | std::sort(keys.begin(), keys.end()); 280 | 281 | for (auto key : keys) 282 | { 283 | std::vector &d = bdata[GeoNLP::Geocoder::get_id_key(key)]; 284 | std::sort(d.begin(), d.end()); 285 | std::string value = GeoNLP::Geocoder::make_id_value(d); 286 | if (!db.set(key, value)) 287 | { 288 | std::cerr << "set error: " << db.error().name() << std::endl; 289 | return; 290 | } 291 | } 292 | 293 | std::cout << "Number of records in normalized id database: " << db.count() << "\n"; 294 | 295 | db.close(); 296 | } 297 | 298 | db.execute("DROP TABLE IF EXISTS normalized_name"); 299 | } 300 | -------------------------------------------------------------------------------- /importer/src/normalization.h: -------------------------------------------------------------------------------- 1 | #ifndef GEOCODER_NORMALIZATION_H 2 | #define GEOCODER_NORMALIZATION_H 3 | 4 | #include 5 | #include 6 | 7 | void normalize_libpostal(sqlite3pp::database &db, std::string address_expansion_dir, bool verbose); 8 | 9 | void normalized_to_final(sqlite3pp::database &db, std::string path); 10 | 11 | #endif 12 | -------------------------------------------------------------------------------- /importer/src/utils.cpp: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | #include 4 | 5 | using json = nlohmann::json; 6 | 7 | std::string get_with_def(const std::map &m, const std::string &key, 8 | const std::string &defval) 9 | { 10 | auto it = m.find(key); 11 | if (it == m.end()) 12 | return defval; 13 | return it->second; 14 | } 15 | 
16 | std::map parse_to_map(const std::string &js) 17 | { 18 | std::map m; 19 | if (js.size()) 20 | { 21 | json j = json::parse(js); 22 | for (auto v : j.items()) 23 | m[v.key()] = v.value(); 24 | } 25 | return m; 26 | } 27 | 28 | bool same_starts_with(const std::string &start, const std::string &s1, const std::string &s2) 29 | { 30 | return s1.rfind(start, 0) == 0 && s2.rfind(start, 0) == 0; 31 | } 32 | 33 | std::string geocoder_type(const std::string &t_class, const std::string &t_value) 34 | { 35 | if (t_value == "yes" || t_value.empty()) 36 | return t_class; 37 | return t_class + "_" + t_value; 38 | } -------------------------------------------------------------------------------- /importer/src/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H 2 | #define UTILS_H 3 | 4 | #include 5 | #include 6 | 7 | std::string get_with_def(const std::map &m, const std::string &key, 8 | const std::string &defval = std::string()); 9 | 10 | std::string geocoder_type(const std::string &t_class, const std::string &t_value); 11 | 12 | std::map parse_to_map(const std::string &js); 13 | 14 | bool same_starts_with(const std::string &start, const std::string &s1, const std::string &s2); 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /scripts/postal/README.md: -------------------------------------------------------------------------------- 1 | # Importing and processing libpostal for country specific datasets 2 | 3 | When run from the folder containing geocoder-nlp and training data in separate subfolders. 
#!/usr/bin/env python

# Driver that writes a Makefile ("Makefile" in the current directory) with one
# target per country whose libpostal address parser has not been built yet.
# Each target runs build_country_db.sh from the directory of this script.
#
# NOTE: this file uses Python 2 print statements and will not run unchanged
# under Python 3.

import argparse, os, collections, sys

parser = argparse.ArgumentParser(description='''
This script generates a country specific databases for use with
libpostal routines. Its a driver that runs build_country_db.sh
for every country found in the countries_languages.txt .

The driver will generate databases only for the countries that are
missing in the output directory''')

parser.add_argument('addrdata_dir', type=str,
                    help='path to the directory containing files required for address parser training (formatted_addresses_tagged.random.tsv formatted_places_tagged.random.tsv)')

parser.add_argument('output_root_dir', type=str,
                    help='path to the directory where country specific subdirectory will be created')

args = parser.parse_args()

# directory of this script; used to locate build_country_db.sh in the Makefile rules
script_dir = os.path.dirname(sys.argv[0])

# load list of countries
# (countries_languages.txt is opened relative to the current working
# directory, not relative to script_dir; the country code is the text in
# front of the first ':' on each line)
countries = []
for l in open("countries_languages.txt", "r"):
    countries.append( l.split(':')[0] )

# add combinations to cover special cases in PBF downloads

# add Ireland together with UK to cover Ireland + Northern Ireland case
countries.append("gb-ie")

countries.append("ht-do")
countries.append("sn-gm")

# gcc states
countries.append("bh-kw-om-qa-sa-ae")

countries.append("il-ps")
countries.append("my-sg-bn")

### countries list: done ###

print "Loaded list of countries [%d]: " % len(countries),
for c in countries:
    print c,
print

# A country counts as done when <output_root_dir>/<entry>/address_parser/
# address_parser_phrases.dat exists; the "compressed" entry is skipped.
# Directory names are lower-cased so that they compare with the country list.
countries_done = []
for l in os.listdir(args.output_root_dir):
    if l != "compressed" and os.path.exists(os.path.join(args.output_root_dir, l, "address_parser", "address_parser_phrases.dat")):
        countries_done.append( l.lower() )
countries_done.sort()

print "\nList of countries that are ready [%d]: " % len(countries_done),
for c in countries_done:
    print c,
print

# everything in the country list that is not done yet, in list order
countries_todo = []
for l in countries:
    if l not in countries_done:
        countries_todo.append(l)

print "\nCountries not covered yet [%d]: " % len(countries_todo),
for c in countries_todo:
    print c,
print

##########################################

# Write the Makefile: "all" depends on address_parser.dat of every pending
# country (upper-cased directory name). Note that the target file checked by
# make (address_parser.dat) differs from the file used above to detect
# finished countries (address_parser_phrases.dat).
script = "Makefile"
f = open(script, "w")
f.write("# Generated by build_all_country_db\n\nall: ")
for c in countries_todo:
    C = c.upper()
    f.write(os.path.join(args.output_root_dir, C, "address_parser", "address_parser.dat") + " ")

f.write("\n\techo All done\n\n")
# one rule per pending country; the recipe invokes build_country_db.sh with
# the training data directory, the output root and the upper-cased country code
for c in countries_todo:
    C = c.upper()
    f.write(os.path.join(args.output_root_dir, C, "address_parser", "address_parser.dat") + ":\n" +
            '\t%s/build_country_db.sh "%s" "%s" %s\n\n' % (script_dir, args.addrdata_dir,
                                                           args.output_root_dir,
                                                           C) )

print "\nExamine and run " + script + "\n"
#!/bin/bash

# Builds a country specific libpostal address parser.
#
# Usage: build_country_db.sh addrdata_dir output_root_dir country_code
#
# The training lines of the requested country (or of several countries when
# the code is a dash-separated combination such as "gb-ie") are collected from
# the training files in addrdata_dir into a temporary file, shuffled and fed
# to address_parser_train. The result is written to
# output_root_dir/<COUNTRY_CODE_UPPERCASE>/address_parser.

set -e

if [ $# -ne 3 ]; then
    echo "
This script generates a country specific databases for use with
libpostal routines.

Usage:

$0 addrdata_dir output_root_dir country_code

where

addrdata_dir: path to the directory containing files required for
address parser training (formatted_addresses_tagged.random.tsv
formatted_places_tagged.random.tsv)

output_root_dir: path to the directory where country specific
subdirectory will be created

country_code: ISO 3166-1 alpha-2 country code (2 letters,
https://en.wikipedia.org/wiki/ISO_3166-1)

The country code can be specified in any case. It will be converted to
uppercase when making subdirectory under output_root_dir. Note that a
temporary files will be created under output_root_dir while this
script is running. This temporary directory will be removed at the end
of the script.

The script uses build_geodb and address_parser_train from
libpostal. Either ensure that these executables are in the path or
point the variable POSTAL_SRC_DIR in the script to a directory
containing these executables.

"
    exit -1
fi

#################################################################
### PATH TO LIBPOSTAL SRC DIRECTORY WITH COMPILED EXECUTABLES ###

POSTAL_SRC_DIR=libpostal/src

#################################################################

ADDRDATA="$1"
OUTPUT="$2"
COUNTRY="$3"

TMPDATA="$OUTPUT/tmp-$COUNTRY-`cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w 8 | head -n 1`"

# set -e aborts the script on the first failing command; make sure the
# temporary directory is removed in that case as well
trap 'rm -rf "$TMPDATA"' EXIT

COUNTRY_LOWER="${COUNTRY,,}"
COUNTRY_UPPER="${COUNTRY^^}"

COUNTRY_DIR_BASE="$COUNTRY_UPPER"
COUNTRY_DIR="$OUTPUT/$COUNTRY_DIR_BASE"

PATH="$POSTAL_SRC_DIR:$PATH"

OUTPUT_ADDRESS="$TMPDATA/address.tsv"

rm -rf "$COUNTRY_DIR"

mkdir -p "$TMPDATA"
mkdir -p "$COUNTRY_DIR/address_parser"

# Addresses
cp /dev/null "$OUTPUT_ADDRESS"

# a combined code such as "gb-ie" is processed country by country
IFS='-' read -r -a country_parts <<< "$COUNTRY_LOWER"
for C in "${country_parts[@]}"; do
    for file in formatted_addresses_tagged.random.tsv formatted_places_tagged.random.tsv formatted_ways_tagged.random.tsv geoplanet_formatted_addresses_tagged.random.tsv openaddresses_formatted_addresses_tagged.random.tsv uk_openaddresses_formatted_addresses_tagged.random.tsv; do
        echo "Address data preparation: $file / $C"
        DSPLIT="$ADDRDATA/$file-split"
        if [ -d "$DSPLIT" ]; then
            echo "Using pre-split" $file
            # a missing per-country file is not an error: the country just has no data there
            cat "$DSPLIT/$file-$C" >> "$OUTPUT_ADDRESS" || true
        else
            # the country code is a separate tab-delimited field; no match is not an error
            grep $'\t'"$C"$'\t' "$ADDRDATA/$file" >> "$OUTPUT_ADDRESS" || true
        fi
    done
done

echo "Randomize addresses"
shuf -o "$OUTPUT_ADDRESS.shuf" "$OUTPUT_ADDRESS"
mv "$OUTPUT_ADDRESS.shuf" "$OUTPUT_ADDRESS"

########################################################################

echo "Address training"
tlines=$(wc -l < "$OUTPUT_ADDRESS")

# without training data the iteration formula below would divide by zero
if (( tlines == 0 )); then
    echo "No training lines found for $COUNTRY in $ADDRDATA" >&2
    exit 1
fi

# small data sets get more iterations (capped at refiters) and a lower
# minimal update count
reflines=15000000
refiters=35
if (( tlines*5 > reflines )); then
    iters=5
    minup=5
else
    iters=$((reflines / tlines + 1))
    minup=3
    if (( iters > refiters )); then
        iters=$refiters
        minup=1
    fi
fi

echo 'Number of training lines:' $tlines
echo 'Iterations:' $iters
echo 'Minimal updates:' $minup

time address_parser_train --iterations $iters --min-updates $minup "$OUTPUT_ADDRESS" "$COUNTRY_DIR/address_parser"

echo "Removing temporary directory"
rm -rf "$TMPDATA"

# echo "Compressing country database"
# mkdir -p "$OUTPUT/compressed"
# (cd "$OUTPUT" && tar jcvf "compressed/$COUNTRY_DIR_BASE.tar.bz2" "$COUNTRY_DIR_BASE")
it ja ko nl pl pt ru tr unk xxx zh 20 | bf: ar de en es fa fr it ja ko nl pl pt ru tr zh 21 | bg: ar bg de en es fa fr it ja ko nl pl pt ru tr zh 22 | bh: ar de en es fa fr it ja ko nl pl pt ru tr unk xxx zh 23 | bi: ar de en es fa fr it ja ko nl pl pt ru zh 24 | bj: ar de en es fa fr it ja ko nl pl pt ru tr zh 25 | bl: de en es fr nl ru zh 26 | bm: ar de en es fa fr it ja ko pl pt ru zh 27 | bn: ar de en es fa fr it ja ko ms nl pl pt ru tr zh 28 | bo: ar ay de en es fa fr it ja ko nl pl pt qu ru tr unk zh 29 | bq: en nl pap ru unk 30 | br: ar de en es fa fr it ja ko nl pl pt ru tr zh 31 | bs: ar de en es fa fr it ja ko nl pl pt ru tr zh 32 | bt: ar de dz en es fa fr it ja ko nl pl pt ru tr zh 33 | bw: ar de en es fa fr it ja ko nl pl pt ru tr zh 34 | by: ar be de en es fa fr it ja ko nl pl pt ru ru_BY ru_RU ru_by ru_ru tr unk xxx zh 35 | bz: ar de en es fa fr it ja ko nl pl pt ru tr zh 36 | ca: ar de en es fa fr ikt it iu ja ko nl pl pt ru tr unk xxx zh 37 | cc: ar de en es fa fr it ja ko nl pl pt ru tr zh 38 | cd: ar de en es fa fr it ja ko nl pl pt ru tr zh 39 | cf: ar de en es fa fr it ja ko nl pl pt ru tr zh 40 | cg: ar de en es fa fr it ja ko nl pl pt ru tr zh 41 | ch: ar de en es fa fr gsw it ja ko nl pl pt ru tr unk xxx zh 42 | ci: ar de en es fa fr it ja ko nl pl pt ru tr zh 43 | ck: ar de en es fa fr it ja ko nl pl pt ru tr zh 44 | cl: ar de en es fa fr it ja ko nl pl pt ru tr zh 45 | cm: ar de en es fa fr it ja ko nl pl pt ru tr zh 46 | cn: ar bo de en es fa fr it ja ko nl pl pt ru tr unk xxx zh zh_arab zh_hans zh_hant zh_jyutping zh_pinyin zh_py zh_tradition 47 | co: ar de en es fa fr it ja ko nl pl pt ru tr zh 48 | cr: ar de en es fa fr it ja ko nl pl pt ru tr zh 49 | cu: ar de en es fa fr it ja ko nl pl pt ru tr zh 50 | cv: ar de en es fa fr it ja ko nl pl pt ru tr zh 51 | cw: ar de en es fa fr it ja ko nl pap ru unk zh 52 | cx: ar de en es fa fr it ja ko nl pl pt ru tr zh 53 | cy: ar de el en es fa fr it ja ko nl pl pt ru tr unk zh 54 | cz: ar cs de 
en es fa fr it ja ko nl pl pt ru tr zh 55 | de: ar de de_CH en es fa fr it ja ko nl pl pt ru tr zh 56 | dj: ar de en es fa fr it ja ko nl pl pt ru tr zh 57 | dk: ar da de en es fa fr it ja ko nl pl pt ru tr zh 58 | dm: ar de en es fa fr it ja ko nl pl pt ru tr zh 59 | do: ar de en es fa fr it ja ko nl pl pt ru tr zh 60 | dz: ar de en es fa fr fr_1 it ja ko nl pl pt ru tr unk zh 61 | ec: ar de en es fa fr it ja ko nl pl pt qu ru tr unk zh 62 | ee: ar de en es et fa fr it ja ko nl pl pt ru tr zh 63 | eg: ar de en es fa fr it ja ko nl pl pt ru tr unk xxx zh 64 | er: ar de en es fa fr it ja ko nl pl pt ru tr zh 65 | es: ar ca ca_1 de en es eu fa fr gl it ja ko nl oc pl pt ru tr unk xxx zh 66 | et: am ar de en es fa fr it ja ko nl pl pt ru tr zh 67 | fi: ar de en es fa fi fi_1 fr it ja ko nl pl pt ru sv sv_1 tr unk zh 68 | fj: ar de en es fa fr it ja ko nl pl pt ru tr zh 69 | fk: ar de en es fa fr it ja ko nl pl pt ru tr zh 70 | fm: en fa ja ko ru zh 71 | fo: ar de en es fa fo fr it ja ko nl pl pt ru tr zh 72 | fr: ar br br_alt de en es fa fr it ja ko nl oc pl pt ru tr unk xxx zh 73 | ga: ar de en es fa fr it ja ko nl pl pt ru tr zh 74 | gb: ar cy de en en_GB en_IE es fa fr ga gd it ja ko nl pl pt ru tr unk xxx zh 75 | gd: ar de en es fa fr it ja ko nl pl pt ru zh 76 | ge: ar de en es fa fr it ja ka ko nl pl pt ru ru_1 tr unk xxx zh 77 | gf: ar de en es fa fr it ja ko nl pl pt ru tr zh 78 | gg: ar de en es fa fr it ja ko nl pl pt ru tr unk xxx zh 79 | gh: ar de en es fa fr it ja ko nl pl pt ru tr zh 80 | gl: ar de en es fa fr it ja kl ko nl pl pt ru tr zh 81 | gm: ar de en es fa fr it ja ko nl pl pt ru tr zh 82 | gn: ar de en es fa fr it ja ko nl pl pt ru tr zh 83 | gp: ar de en es fa fr it ja ko pl pt ru tr zh 84 | gq: ar de en es fa fr it ja ko nl pl pt ru tr zh 85 | gr: ar de el el_latin en es fa fr it ja ko nl pl pt ru tr unk xxx zh 86 | gt: ar de en es fa fr it ja ko nl pl pt ru tr zh 87 | gu: de en es fr it ja ru zh 88 | gw: ar de en es fa fr it ja ko nl pl pt ru 
tr zh 89 | gy: ar de en es fa fr it ja ko nl pl pt ru tr zh 90 | hk: de en es fr ja nl pl ru unk xxx zh zh_pinyin 91 | hn: ar de en es fa fr it ja ko nl pl pt ru tr zh 92 | hr: ar de en es fa fr hr it ja ko nl pl pt ru tr zh 93 | ht: ar de en es fa fr it ja ko nl pl pt ru tr zh 94 | hu: ar de en es fa fr hu it ja ko nl pl pt ru tr zh 95 | id: ar de en es fa fr id it ja ko nl pl pt ru tr unk zh 96 | ie: ar de en es fa fr ga it ja ko nl pl pt ru tr unk xxx zh 97 | il: ar de en es fa fr he it ja ko nl pl pt ru tr unk xxx zh 98 | im: ar de en es fa fr gv it ja ko nl pl pt ru tr unk zh 99 | in: ar de en es fa fr hi it ja ko nl pl pt ru tr unk xxx zh 100 | io: ar de en es fa fr it ja ko nl pl pt ru tr zh 101 | iq: ar ar_1 ar_Latn de en en_1 es fa fr it ja ko nl pl pt ru tr unk xxx zh 102 | ir: ar de en en_1 es fa fa_1 fr it ja ko nl pl pt ru tr unk xxx zh 103 | is: ar de en es fa fr is it ja ko nl pl pt ru tr zh 104 | it: ar de en es fa fr it ja ko nl oc pl pt ru sl unk xxx zh 105 | je: ar de en es fa fr it ja ko nl pl pt ru tr unk zh 106 | jm: ar de en es fa fr it ja ko nl pl pt ru tr zh 107 | jo: ar de en es fa fr it ja ko nl pl pt ru tr unk zh 108 | jp: ar de en es fa fr it ja ja_furigana ja_hira ja_kana ja_kana_1 ja_rm ko nl pl pt ru tr unk xxx zh 109 | ke: ar de en es fa fr it ja ko nl pl pt ru tr zh 110 | kg: ar de en es fa fr it ja ko ky nl pl pt ru tr unk xxx zh 111 | kh: ar de en es fa fr it ja km ko nl pl pt ru tr zh 112 | ki: ar de en es fa fr it ja ko nl pt ru tr zh 113 | km: ar de en es fa fr it ja ko nl pl pt ru tr zh 114 | kn: ar de en es fa fr it ja ko nl pl pt ru tr zh 115 | kp: ar de en es fa fr it ja ko ko_hanja ko_rm nl pl pt ru tr unk zh 116 | kr: ar de en es fa fr it ja ko ko_rm ko_rm_2 nl pl pt ru tr unk zh 117 | kw: ar de en es fa fr it ja ko nl pl pt ru tr unk zh 118 | ky: ar de en es fa fr it ja ko nl pl pt ru tr zh 119 | kz: ar de en es fa fr it ja kk ko nl pl pt ru tr unk xxx zh 120 | la: ar de en es fa fr it ja ko lo lo_rm nl pl pt ru tr unk 
xxx zh 121 | lb: ar de en es fa fr it ja ko nl pl pt ru tr unk xxx zh 122 | lc: ar de en es fa fr it ja ko nl pl pt ru zh 123 | li: ar de en fa fr ja ko nl ru zh 124 | lk: ar de en es fa fr it ja ko nl pl pt ru si tr unk zh 125 | lr: ar de en es fa fr it ja ko nl pl pt ru tr zh 126 | lt: ar de en es fa fr it ja ko lt nl pl pt ru tr zh 127 | lu: ar de en es fa fr it ja ko lb nl pl pt ru tr unk xxx zh 128 | lv: ar de en es fa fr it ja ko lv nl pl pt ru tr zh 129 | ly: ar de en es fa fr it ja ko nl pl pt ru tr unk xxx zh 130 | ma: ar ar_1 de en es fa fr it ja ko nl pl pt ru tr unk xxx zh 131 | mc: ar de en es fa fr it ja ko nl pl pt ru tr zh 132 | md: ar de en es fa fr it ja ko nl pl pt ro ru tr zh 133 | me: ar de en es fa fr it ja ko nl pl pt ru sr tr zh 134 | mf: de en es fr ja ko nl zh 135 | mg: ar de en es fa fr it ja ko mg nl pl pt ru tr unk zh 136 | mh: ar de en es fa fr it ja ko nl pl pt ru tr zh 137 | mk: ar de en es fa fr it ja ko mk nl pl pt ru tr zh 138 | ml: ar de en es fa fr it ja ko nl pl pt ru tr zh 139 | mm: ar de en es fa fr it ja ko my nl pl pt ru tr zh 140 | mn: ar de en es fa fr it ja ko mn mn_mong nl pl pt ru tr unk zh 141 | mo: de en es fr ja ko nl pl pt ru unk xxx zh zh_pinyin 142 | mp: ar en es fa ja ko ru zh 143 | mq: ar de en es fa fr it ja ko pl pt ru tr zh 144 | mr: ar de en es fa fr it ja ko nl pl pt ru tr zh 145 | ms: ar de en es fa fr it ja ko nl pl pt ru tr zh 146 | mt: ar de en es fa fr it ja ko mt pl pt ru unk zh 147 | mu: ar de en es fa fr it ja ko nl pl pt ru unk xxx zh 148 | mv: ar de dv en es fa fr it ja ko nl pl pt ru tr zh 149 | mw: ar de en es fa fr it ja ko nl pl pt ru tr zh 150 | mx: ar de en es fa fr it ja ko nl pl pt ru tr zh 151 | my: ar de en es fa fr it ja ko ms nl pl pt ru tr zh 152 | mz: ar de en es fa fr it ja ko nl pl pt ru tr zh 153 | na: af ar de en es fa fr it ja ko nl pl pt ru tr unk zh 154 | nc: ar de en es fa fr it ja ko nl pl pt ru tr zh 155 | ne: ar de en es fa fr it ja ko nl pl pt ru tr zh 156 | nf: ar de en 
es fa fr it ja ko nl pl pt ru tr zh 157 | ng: ar de en es fa fr it ja ko nl pl pt ru tr zh 158 | ni: ar de en es fa fr it ja ko nl pl pt ru tr zh 159 | nl: ar de en es fa fr fy it ja ko nl pl pt ru tr unk zh 160 | no: ar de en es fa fr it ja ko nb nl pl pt ru tr zh 161 | np: ar de en es fa fr it ja ko ne nl pl pt ru tr unk xxx zh 162 | nr: ar de en es fa fr it ja ko pl ru zh 163 | nu: ar de en es fa fr it ja ko pl ru zh 164 | nz: ar de en es fa fr it ja ko mi nl pl pt ru tr unk zh 165 | om: ar de en es fa fr it ja ko nl pl pt ru tr unk zh 166 | pa: ar de en es fa fr it ja ko nl pl pt ru tr zh 167 | pe: ar de en es fa fr it ja ko nl pl pt qu ru tr unk zh 168 | pf: ar de en es fa fr it ja ko nl pl pt ru zh 169 | pg: ar de en es fa fr it ja ko nl pl pt ru tr zh 170 | ph: ar de en es fa fil fr it ja ko nl pl pt ru tr unk xxx zh 171 | pk: ar de en es fa fr it ja ko nl pl pt ru tr unk ur zh 172 | pl: ar de en es fa fr it ja ko nl pl pt ru tr zh 173 | pm: ar de en es fa fr it ja ko nl pl pt ru tr zh 174 | pn: ar de en es fa fr it ja ko nl pl pt ru tr zh 175 | pr: ar en es fa ja ko ru unk xxx zh 176 | ps: ar de en es fa fr it ja ko nl pl pt ru tr unk xxx zh 177 | pt: ar de en es fa fr it ja ko nl pl pt ru tr zh 178 | pw: en fa ja ko pt ru zh 179 | py: ar de en es fa fr gn it ja ko nl pl pt ru tr unk zh 180 | qa: ar de en es fa fr it ja ko nl pl pt ru tr unk zh 181 | re: ar de en es fa fr it ja ko nl pl pt ru tr zh 182 | ro: ar de en es fa fr it ja ko nl pl pt ro ro_1 ro_2 ru tr unk zh 183 | rs: ar de en es fa fr it ja ko nl pl pt ru sr tr zh 184 | ru: ar de en es fa fr it ja ko nl pl pt ru ru_rm tr unk zh 185 | rw: ar de en es fa fr it ja ko nl pl pt ru tr zh 186 | sa: ar de en es fa fr it ja ko nl pl pt ru tr unk xxx zh 187 | sb: ar de en fa ja ko ru zh 188 | sc: ar de en es fa fr it ja ko nl pl pt ru tr unk xxx zh 189 | sd: ar de en es fa fr it ja ko nl pl pt ru tr unk xxx zh 190 | se: ar de en es fa fr it ja ko nl pl pt ru sv tr zh 191 | sg: ar de en es fa fr it ja ko 
ms nl pl pt ru tr unk zh zh_pinyin 192 | sh: ar de en es fa fr it ja ko nl pl pt ru zh 193 | si: ar de en es fa fr it ja ko nl pl pt ru sl tr zh 194 | sk: ar de en es fa fr it ja ko nl pl pt ru sk tr zh 195 | sl: ar de en es fa fr it ja ko nl pl pt ru tr zh 196 | sm: ar de en es fa fr it ja ko nl pl pt ru tr zh 197 | sn: ar de en es fa fr it ja ko nl pl pt ru tr zh 198 | so: ar de en es fa fr it ja ko nl pl pt ru so tr unk zh 199 | sr: ar de en es fa fr it ja ko nl pl pt ru tr zh 200 | ss: ar de en es fa fr it ja ko nl pl pt ru tr zh 201 | st: ar de en es fa fr it ja ko nl pl pt ru tr zh 202 | sv: ar de en es fa fr it ja ko nl pl pt ru tr zh 203 | sx: de en fr it ja nl ru unk xxx zh 204 | sy: ar de en en_1 es fa fr it ja ko nl pl pt ru tr unk zh 205 | sz: ar de en es fa fr it ja ko nl pl pt ru tr zh 206 | tc: ar de en es fa fr it ja ko nl pl pt ru tr zh 207 | td: ar de en es fa fr it ja ko nl pl pt ru tr unk zh 208 | tf: ar de en es fa fr it ja ko nl pl pt ru tr zh 209 | tg: ar de en es fa fr it ja ko nl pl pt ru tr zh 210 | th: ar de en es fa fr it ja ko nl pl pt ru th tr unk xxx zh 211 | tj: ar de en es fa fr it ja ko nl pl pt ru tg tr zh 212 | tk: ar de en es fa fr ja ko pl ru unk zh 213 | tl: ar de en es fa fr it ja ko nl pl pt ru tr zh 214 | tm: ar de en es fa fr it ja ko nl pl pt ru tk tr zh 215 | tn: ar de en es fa fr it ja ko nl pl pt ru tr unk xxx zh 216 | to: ar de en es fa fr it ja ko nl pl pt ru to tr unk zh 217 | tr: ar de en es fa fr it ja ko nl pl pt ru tr zh 218 | tt: ar de en es fa fr it ja ko nl pl pt ru tr zh 219 | tv: ar de en es fa fr ja ko pl pt ru tvl zh 220 | tw: ar de en es fa fr it ja ko nl pl pt ru tr unk zh zh_pinyin zh_py 221 | tz: ar de en es fa fr it ja ko nl pl pt ru tr zh 222 | ua: ar de en es fa fr it ja ko nl pl pt ru ru_rm tr uk unk xxx zh 223 | ug: ar de en es fa fr it ja ko nl pl pt ru tr zh 224 | um: de en es fr it pt ru zh 225 | us: ar de en es fa fr it ja ko nl pl pt ru tr unk xxx zh 226 | uy: ar de en es fa fr it ja ko nl 
pl pt ru tr zh 227 | uz: ar de en es fa fr it ja ko nl pl pt ru tr uz zh 228 | va: ar de en es fa fr it ja ko nl pl pt ru tr zh 229 | vc: ar de en es fa fr it ja ko nl pl pt ru tr zh 230 | ve: ar de en es fa fr it ja ko nl pl pt ru tr zh 231 | vg: ar de en es fa fr it ja ko nl pl pt ru tr zh 232 | vi: ar da de en es fa fr it ja ko nl pl pt ru tr unk zh 233 | vn: ar de en es fa fr it ja ko nl pl pt ru tr vi zh 234 | vu: ar de en es fa fr it ja ko pl pt ru unk zh 235 | wf: ar de en es fa fr it ja ko nl pl pt ru tr zh 236 | ws: ar de en es fa fr it ja ko nl pl ru zh 237 | ye: ar de en es fa fr it ja ko nl pl pt ru tr unk zh 238 | yt: ar en fa fr ja ko pl ru zh 239 | za: ar de en es fa fr it ja ko nl pl pt ru tr zh 240 | zm: ar de en es fa fr it ja ko nl pl pt ru tr zh 241 | zw: ar de en es fa fr it ja ko nl pl pt ru tr zh 242 | -------------------------------------------------------------------------------- /scripts/postal/make_country_list.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse, glob, collections 4 | 5 | parser = argparse.ArgumentParser(description='Generate list of countries covered among the addresses') 6 | parser.add_argument('addrdata_dir', type=str, 7 | help='path to the directory containing files required for address parser training (formatted_addresses_tagged.random.tsv formatted_places_tagged.random.tsv)') 8 | 9 | args = parser.parse_args() 10 | 11 | AddrDir = args.addrdata_dir 12 | 13 | cntlan = set() 14 | for fname in glob.glob(AddrDir + "/*.tsv"): 15 | print "Analyzing " + fname 16 | i = 0 17 | for l in open(fname, "r"): 18 | s = l.split() 19 | k = s[0] + " " + s[1] 20 | cntlan.add(k) 21 | 22 | Countries = collections.defaultdict(list) 23 | while True: 24 | if len(cntlan) < 1: 25 | break 26 | cl = cntlan.pop().split() 27 | Countries[ cl[1] ].append(cl[0]) 28 | 29 | keys = Countries.keys() 30 | keys.sort() 31 | for k in keys: 32 | print k + ": ", 33 | 
Countries[k].sort() 34 | for l in Countries[k]: print l, 35 | print 36 | 37 | f = open("countries_languages.txt", "w") 38 | for k in keys: 39 | f.write(k + ": ") 40 | for l in Countries[k]: f.write(l + " ") 41 | f.write("\n") 42 | -------------------------------------------------------------------------------- /scripts/postal/split_addresses.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse, glob, collections, shutil, os 4 | 5 | parser = argparse.ArgumentParser(description='Split addresses databases by countries') 6 | parser.add_argument('addrdata_dir', type=str, 7 | help='path to the directory containing files required for address parser training (formatted_addresses_tagged.random.tsv formatted_places_tagged.random.tsv)') 8 | 9 | args = parser.parse_args() 10 | 11 | AddrDir = args.addrdata_dir 12 | SplitDir = os.path.join(AddrDir, "split") 13 | 14 | def splitname(fname, c): 15 | b = os.path.basename(fname) 16 | d = fname + "-split" 17 | f = os.path.join(d, b + "-" + c) 18 | return d, f 19 | 20 | def write(fname_base, data, c): 21 | cnt = data[c] 22 | d, fname = splitname(fname_base, c) 23 | f = open(fname, "a") 24 | while True: 25 | if len(cnt) < 1: 26 | break 27 | l = cnt.pop() 28 | f.write(l) 29 | data[c] = collections.deque() 30 | 31 | for fname in glob.glob(AddrDir + "/*.tsv"): 32 | print "Splitting " + fname 33 | 34 | d, f = splitname(fname, "not-important") 35 | if os.path.exists(d): 36 | shutil.rmtree(d) 37 | os.makedirs(d) 38 | print "Directory containing split files:", d 39 | 40 | data = collections.defaultdict(collections.deque) 41 | for l in open(fname, "r"): 42 | c = l.split()[1] 43 | 44 | data[c].append(l) 45 | if len(data[c]) > 1000: 46 | write(fname, data, c) 47 | 48 | for k in data.keys(): 49 | write(fname, data, k) 50 | 51 | -------------------------------------------------------------------------------- /scripts/tags/README.md: 
-------------------------------------------------------------------------------- 1 | The script in this directory generates a list of the most popular tag values 2 | that can be inserted into the map style sheet (map.ost) and the 3 | whitelist. Adjust the SQL query in the script as appropriate and run the 4 | script. 5 | 6 | The input data is the taginfo database from 7 | https://taginfo.openstreetmap.org/download/taginfo-db.db.bz2. 8 | -------------------------------------------------------------------------------- /scripts/tags/generate_mapstyle.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import sqlite3 4 | 5 | db = sqlite3.connect('taginfo-db.db') 6 | c = db.cursor() 7 | 8 | prioritylist = "" 9 | 10 | keyvals = [] 11 | for r in c.execute("select key,value from tags where key='shop' order by count_all desc limit 50"): 12 | key, value = r 13 | 14 | # no need for these 15 | if value in ['yes', 'no']: 16 | continue 17 | 18 | keyvals.append([key, value]) 19 | 20 | keyvals.sort() 21 | for r in keyvals: 22 | key, value = r 23 | 24 | prioritylist += key + '_' + value + '\n' 25 | 26 | print(prioritylist) 27 | 28 | 29 | -------------------------------------------------------------------------------- /src/geocoder.cpp: -------------------------------------------------------------------------------- 1 | #include "geocoder.h" 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | using namespace GeoNLP; 11 | 12 | const int GeoNLP::Geocoder::version{ 6 }; 13 | const size_t GeoNLP::Geocoder::num_languages{ 2 }; // 1 (default) + 1 (english) 14 | 15 | typedef boost::geometry::model::point< 16 | double, 2, boost::geometry::cs::spherical_equatorial > 17 | point_t; 18 | const double earth_radius = 6378e3; // meters 19 | 20 | //////////////////// 21 | // helper functions 22 | 23 | static double distance_per_latitude() 24 | { 25 | return 111e3; 26 | } 27 | static double distance_per_longitude(double
latitude) 28 | { 29 | return std::max(1000.0, M_PI / 180.0 * 6378137.0 * cos(latitude * M_PI / 180.0)); 30 | } 31 | 32 | //////////////////// 33 | // GeoReference class 34 | 35 | Geocoder::GeoReference::GeoReference() : m_is_set(false) {} 36 | 37 | Geocoder::GeoReference::GeoReference(double lat, double lon, int zoom, double importance) 38 | : m_latitude(lat), m_longitude(lon), m_importance(importance), m_zoom(zoom), m_is_set(true) 39 | { 40 | } 41 | 42 | void Geocoder::GeoReference::set(double lat, double lon, int zoom, double importance) 43 | { 44 | m_latitude = lat; 45 | m_longitude = lon; 46 | m_zoom = zoom; 47 | m_importance = importance; 48 | m_is_set = true; 49 | } 50 | 51 | void Geocoder::GeoReference::reset() 52 | { 53 | m_is_set = false; 54 | } 55 | 56 | double Geocoder::GeoReference::distance(const Geocoder::GeoResult &r) const 57 | { 58 | point_t ref(m_longitude, m_latitude); 59 | point_t q(r.longitude, r.latitude); 60 | return boost::geometry::distance(ref, q) * earth_radius; 61 | } 62 | 63 | //////////////////// 64 | // Geocoder class 65 | 66 | Geocoder::Geocoder() 67 | { 68 | update_limits(); 69 | } 70 | 71 | std::string Geocoder::name_primary(const std::string &dname) 72 | { 73 | return dname + "/geonlp-primary.sqlite"; 74 | } 75 | 76 | std::string Geocoder::name_normalized_trie(const std::string &dname) 77 | { 78 | return dname + "/geonlp-normalized.trie"; 79 | } 80 | 81 | std::string Geocoder::name_normalized_id(const std::string &dname) 82 | { 83 | return dname + "/geonlp-normalized-id.kch"; 84 | } 85 | 86 | Geocoder::Geocoder(const std::string &dbname) 87 | { 88 | if (!load(dbname)) 89 | std::cerr << "Geocoder: error loading " << dbname << std::endl; 90 | } 91 | 92 | bool Geocoder::load(const std::string &dbname) 93 | { 94 | if (dbname == m_database_path && m_database_open) 95 | return true; 96 | 97 | // clean before loading anything 98 | drop(); 99 | 100 | bool error = false; 101 | try 102 | { 103 | m_database_path = dbname; 104 | if 
(m_db.connect(name_primary(m_database_path).c_str(), SQLITE_OPEN_READONLY) != SQLITE_OK) 105 | { 106 | error = true; 107 | std::cerr << "Error opening SQLite database\n"; 108 | } 109 | 110 | if (!error && !check_version()) 111 | { 112 | error = true; 113 | } 114 | 115 | // Limit Kyoto Cabinet caches 116 | m_database_norm_id.tune_map(32LL * 1024LL * 1024LL); // 64MB default 117 | // m_database_norm_id.tune_page_cache(32LL*1024LL*1024LL); // 64MB default 118 | 119 | if (!error 120 | && !m_database_norm_id.open(name_normalized_id(m_database_path).c_str(), 121 | kyotocabinet::HashDB::OREADER 122 | | kyotocabinet::HashDB::ONOLOCK)) 123 | { 124 | error = true; 125 | std::cerr << "Error opening IDs database\n"; 126 | } 127 | 128 | if (!error) 129 | m_trie_norm.load( 130 | name_normalized_trie(m_database_path).c_str()); // throws exception on error 131 | 132 | m_database_open = true; 133 | } 134 | catch (sqlite3pp::database_error &e) 135 | { 136 | error = true; 137 | std::cerr << "Geocoder SQLite exception: " << e.what() << std::endl; 138 | } 139 | catch (marisa::Exception &e) 140 | { 141 | error = true; 142 | std::cerr << "Geocoder MARISA exception: " << e.what() << std::endl; 143 | } 144 | 145 | if (error) 146 | drop(); 147 | return !error; 148 | } 149 | 150 | bool Geocoder::load() 151 | { 152 | return load(m_database_path); 153 | } 154 | 155 | void Geocoder::drop() 156 | { 157 | m_db.disconnect(); 158 | m_database_norm_id.close(); 159 | m_trie_norm.clear(); 160 | m_database_path = std::string(); 161 | m_database_open = false; 162 | } 163 | 164 | bool Geocoder::check_version() 165 | { 166 | std::ostringstream s; 167 | s << Geocoder::version; 168 | return check_version(s.str()); 169 | } 170 | 171 | bool Geocoder::check_version(const std::string &supported) 172 | { 173 | // this cannot through exceptions 174 | try 175 | { 176 | sqlite3pp::query qry(m_db, "SELECT value FROM meta WHERE key=\"version\""); 177 | 178 | for (auto v : qry) 179 | { 180 | std::string n; 181 | 
v.getter() >> n; 182 | if (n == supported) 183 | return true; 184 | else 185 | { 186 | std::cerr << "Geocoder: wrong version of the database. Supported: " << supported 187 | << " / database version: " << n << std::endl; 188 | return false; 189 | } 190 | } 191 | } 192 | catch (sqlite3pp::database_error &e) 193 | { 194 | std::cerr << "Geocoder exception while checking database version: " << e.what() << std::endl; 195 | return false; 196 | } 197 | 198 | return false; 199 | } 200 | 201 | void Geocoder::update_limits() 202 | { 203 | m_max_inter_results = m_max_results + m_max_inter_offset; 204 | } 205 | 206 | #ifdef GEONLP_PRINT_DEBUG 207 | static std::string v2s(const std::vector &v) 208 | { 209 | std::string s = "{"; 210 | for (auto i : v) 211 | { 212 | if (s.length() > 1) 213 | s += ", "; 214 | s += i; 215 | } 216 | s += "}"; 217 | return s; 218 | } 219 | #endif 220 | 221 | bool Geocoder::search(const std::vector &parsed_query, 222 | std::vector &result, size_t min_levels, 223 | const GeoReference &reference) 224 | { 225 | if (!m_database_open) 226 | return false; 227 | 228 | // parse query by libpostal 229 | std::vector parsed_result; 230 | std::string postal_code; 231 | 232 | Postal::result2hierarchy(parsed_query, parsed_result, postal_code); 233 | 234 | result.clear(); 235 | m_levels_resolved = min_levels; 236 | 237 | #ifdef GEONLP_PRINT_DEBUG 238 | std::cout << "Search hierarchies:\n"; 239 | std::cout << "Postal code: " << postal_code << "\n"; 240 | #endif 241 | #ifdef GEONLP_PRINT_DEBUG_QUERIES 242 | std::cout << "\n"; 243 | #endif 244 | 245 | try 246 | { // catch and process SQLite and other exceptions 247 | for (const auto &r : parsed_result) 248 | { 249 | #ifdef GEONLP_PRINT_DEBUG 250 | std::cout << "Levels: " << r.size() << " -> "; 251 | for (auto a : r) 252 | std::cout << v2s(a) << " / "; 253 | std::cout << "\n"; 254 | #endif 255 | // a deeper hierarchy is always searched; an equally deep one only while the intermediate result cap is not reached 256 | m_query_count = 0; 257 | if (r.size() > m_levels_resolved 258 | || (r.size() == m_levels_resolved 259 | && 
(m_max_results == 0 || result.size() < m_max_inter_results))) 260 | search(r, postal_code, result); 261 | #ifdef GEONLP_PRINT_DEBUG_QUERIES 262 | else 263 | std::cout << "Skipping hierarchy since search result already has more levels (" 264 | << m_levels_resolved << ") than provided\n"; 265 | #endif 266 | #ifdef GEONLP_PRINT_DEBUG_QUERIES 267 | std::cout << "\n"; 268 | #endif 269 | } 270 | 271 | #ifdef GEONLP_PRINT_DEBUG 272 | std::cout << "\n"; 273 | #endif 274 | 275 | // fill the data 276 | for (GeoResult &r : result) 277 | { 278 | get_name(r.id, r.title, r.address, r.admin_levels, m_levels_in_title); 279 | r.type = get_type(r.id); 280 | get_features(r); 281 | 282 | sqlite3pp::query qry( 283 | m_db, "SELECT latitude, longitude, search_rank FROM object_primary WHERE id=?"); 284 | qry.bind(1, r.id); 285 | for (auto v : qry) 286 | { 287 | // only one entry is expected 288 | v.getter() >> r.latitude >> r.longitude >> r.search_rank; 289 | break; 290 | } 291 | 292 | if (reference.is_set()) 293 | { 294 | r.distance = reference.distance(r); 295 | 296 | // Here, 1000 is used to scale search_rank. 
Same factor is used in Geocoder importer 297 | r.search_rank -= reference.importance() * 1000 298 | * search_rank_location_bias(r.distance, reference.zoom()); 299 | } 300 | } 301 | } 302 | catch (sqlite3pp::database_error &e) 303 | { 304 | std::cerr << "Geocoder exception: " << e.what() << std::endl; 305 | return false; 306 | } 307 | 308 | // sort and trim results 309 | std::sort(result.begin(), result.end()); 310 | if (m_max_results > 0 && result.size() >= m_max_results) 311 | result.resize(m_max_results); 312 | 313 | return true; 314 | } 315 | 316 | bool Geocoder::search(const Postal::Hierarchy &parsed, const std::string &postal_code, 317 | std::vector &result, size_t level, long long int range0, 318 | long long int range1) 319 | { 320 | /// Special case of search made by postal code only 321 | if (level == 0 && parsed.size() == 0 && !postal_code.empty()) 322 | { 323 | std::string p = postal_code; 324 | sqlite3pp::query qry( 325 | m_db, "SELECT id FROM object_primary WHERE postal_code=:pcode ORDER BY id ASC"); 326 | qry.bind(":pcode", postal_code.c_str(), sqlite3pp::nocopy); 327 | for (auto v : qry) 328 | { 329 | GeoResult r; 330 | v.getter() >> r.id; 331 | r.levels_resolved = 0; 332 | result.push_back(r); 333 | if (m_max_results > 0 && result.size() >= m_max_inter_results) 334 | break; 335 | } 336 | 337 | return true; 338 | } 339 | 340 | /// Number of allowed queries are scaled by the number of 341 | /// languages. Otherwise, with the languages added, one can start 342 | /// missing search results. 
343 | if (level >= parsed.size() 344 | || (m_max_queries_per_hierarchy > 0 345 | && m_query_count > m_max_queries_per_hierarchy * num_languages)) 346 | return false; 347 | 348 | m_query_count++; 349 | 350 | std::set 351 | ids_explored; /// keeps all ids which have been used to search further at this level 352 | 353 | // help structure keeping marisa-found search string with found ids 354 | struct IntermediateResult 355 | { 356 | std::string txt; 357 | index_id_value id; 358 | IntermediateResult(const std::string &t, index_id_value i) : txt(t), id(i) {} 359 | bool operator<(const IntermediateResult &A) const 360 | { 361 | return (txt.length() < A.txt.length() || (txt.length() == A.txt.length() && txt < A.txt) 362 | || (txt == A.txt && id < A.id)); 363 | } 364 | }; 365 | 366 | std::deque search_result; 367 | for (const std::string &s : parsed[level]) 368 | { 369 | marisa::Agent agent; 370 | agent.set_query(s.c_str()); 371 | while (m_trie_norm.predictive_search(agent)) 372 | { 373 | std::string val; 374 | if (m_database_norm_id.get(make_id_key(agent.key().id()), &val)) 375 | { 376 | index_id_value *idx, *idx1; 377 | if (get_id_range(val, (level == 0), range0, range1, &idx, &idx1)) 378 | { 379 | for (; idx < idx1; ++idx) 380 | { 381 | long long int id = *idx; 382 | IntermediateResult r(std::string(agent.key().ptr(), agent.key().length()), 383 | id); 384 | search_result.push_back(r); 385 | } 386 | } 387 | } 388 | else 389 | { 390 | std::cerr << "Internal inconsistency of the databases: TRIE " << agent.key().id() 391 | << "\n"; 392 | } 393 | } 394 | } 395 | 396 | std::sort(search_result.begin(), search_result.end()); 397 | 398 | bool last_level = (level + 1 >= parsed.size()); 399 | for (const IntermediateResult &branch : search_result) 400 | { 401 | long long int id = branch.id; 402 | long long int last_subobject = id; 403 | 404 | if (ids_explored.count(id) > 0) 405 | continue; // has been looked into it already 406 | 407 | if (parsed.size() < m_levels_resolved 408 | || 
(parsed.size() == m_levels_resolved && m_max_results > 0 409 | && result.size() >= m_max_inter_results)) 410 | break; // this search cannot add more results 411 | 412 | ids_explored.insert(id); 413 | 414 | // if postal code is assigned to this level and is correct, 415 | // all subobjects will have the same postal code. check if postal 416 | // code is resolved 417 | bool postal_is_ok = (postal_code.empty() || get_postal_code(id) == postal_code); 418 | 419 | // are we interested in this result even if it doesn't have subregions? 420 | if (!last_level || !postal_is_ok) 421 | { 422 | sqlite3pp::query qry(m_db, "SELECT last_subobject FROM hierarchy WHERE prim_id=?"); 423 | qry.bind(1, id); 424 | for (auto v : qry) 425 | { 426 | // only one entry is expected 427 | v.getter() >> last_subobject; 428 | break; 429 | } 430 | 431 | // check if we have results which are better than this one if it 432 | // does not have any subobjects 433 | if (m_levels_resolved > level + 1 && id >= last_subobject) 434 | continue; // take the next search_result 435 | } 436 | 437 | if (last_level || last_subobject <= id 438 | || !search(parsed, postal_is_ok ? 
"" : postal_code, result, level + 1, id + 1, 439 | last_subobject)) 440 | { 441 | size_t levels_resolved = level + 1; 442 | bool newlevel = false; 443 | if (m_levels_resolved < levels_resolved) 444 | { 445 | result.clear(); 446 | newlevel = true; 447 | } 448 | 449 | if ((m_levels_resolved == levels_resolved || newlevel) 450 | && (m_max_results == 0 || result.size() < m_max_inter_results)) 451 | { 452 | bool have_already = false; 453 | for (const auto &r : result) 454 | if (r.id == id) 455 | { 456 | have_already = true; 457 | break; 458 | } 459 | 460 | if (!have_already) 461 | { 462 | if (postal_is_ok) 463 | { 464 | GeoResult r; 465 | r.id = id; 466 | r.levels_resolved = levels_resolved; 467 | result.push_back(r); 468 | m_levels_resolved = levels_resolved; 469 | } 470 | else if (id < last_subobject) 471 | { 472 | // search subobjects for ones with the same postal code 473 | // there is a point to start searching only if there are 474 | // subobjects only 475 | sqlite3pp::query qry(m_db, "SELECT id FROM object_primary WHERE " 476 | "postal_code=:pcode AND id>:min AND id<=:max"); 477 | qry.bind(":pcode", postal_code.c_str(), sqlite3pp::nocopy); 478 | qry.bind(":min", id); 479 | qry.bind(":max", last_subobject); 480 | #ifdef GEONLP_PRINT_SQL 481 | std::cout << "SELECT id FROM object_primary WHERE postal_code='" 482 | << postal_code << "' AND id>" << id << " AND id<=" << last_subobject 483 | << "\n"; 484 | #endif 485 | for (auto v : qry) 486 | { 487 | GeoResult r; 488 | v.getter() >> r.id; 489 | r.levels_resolved = levels_resolved; 490 | result.push_back(r); 491 | m_levels_resolved = levels_resolved; 492 | if (m_max_results > 0 && result.size() >= m_max_inter_results) 493 | break; 494 | } 495 | } 496 | } 497 | } 498 | } 499 | } 500 | 501 | return !ids_explored.empty(); 502 | } 503 | 504 | void Geocoder::get_name(long long id, std::string &title, std::string &full, size_t &admin_levels, 505 | int levels_in_title) 506 | { 507 | long long int parent; 508 | std::string 
name; 509 | std::string name_extra; 510 | std::string name_en; 511 | std::string toadd; 512 | 513 | sqlite3pp::query qry(m_db, 514 | "SELECT name, name_extra, name_en, parent FROM object_primary WHERE id=?"); 515 | qry.bind(1, id); 516 | for (auto v : qry) 517 | { 518 | // only one entry is expected 519 | { 520 | // to allow NULL readouts from the database 521 | char const *n, *ne, *neng; 522 | v.getter() >> n >> ne >> neng >> parent; 523 | if (n) 524 | name = n; 525 | if (ne) 526 | name_extra = ne; 527 | if (neng) 528 | name_en = neng; 529 | } 530 | 531 | if (name.empty() && levels_in_title > 0) 532 | name = get_postal_code(id); 533 | if (name.empty()) 534 | name = " "; 535 | 536 | toadd = std::string(); 537 | if (m_preferred_result_language == "en" && !name_en.empty()) 538 | toadd = name_en; 539 | else if (!name_extra.empty() && name != name_extra) 540 | toadd = name_extra + ", " + name; 541 | 542 | if (toadd.empty()) 543 | toadd = name; 544 | 545 | if (!full.empty()) 546 | full += ", "; 547 | full += toadd; 548 | 549 | if (levels_in_title > 0) 550 | { 551 | if (!title.empty()) 552 | title += ", "; 553 | title += toadd; 554 | } 555 | 556 | get_name(parent, title, full, admin_levels, levels_in_title - 1); 557 | admin_levels++; 558 | return; 559 | } 560 | } 561 | 562 | std::string Geocoder::get_postal_code(long long id) 563 | { 564 | char const *postal_code = nullptr; 565 | 566 | sqlite3pp::query qry(m_db, "SELECT postal_code FROM object_primary WHERE id=?"); 567 | qry.bind(1, id); 568 | 569 | for (auto v : qry) 570 | { 571 | v.getter() >> postal_code; 572 | break; 573 | } 574 | 575 | return postal_code ? 
postal_code : std::string(); 576 | } 577 | 578 | std::string Geocoder::get_type(long long id) 579 | { 580 | std::string name; 581 | 582 | sqlite3pp::query qry( 583 | m_db, "SELECT t.name FROM object_primary o JOIN type t ON t.id=o.type_id WHERE o.id=?"); 584 | qry.bind(1, id); 585 | 586 | for (auto v : qry) 587 | { 588 | std::string n; 589 | v.getter() >> n; 590 | 591 | if (!name.empty()) 592 | name += ", "; 593 | name += n; 594 | } 595 | 596 | return name; 597 | } 598 | 599 | void Geocoder::get_features(GeoResult &r) 600 | { 601 | sqlite3pp::query qry(m_db, "SELECT phone, postal_code, website FROM object_primary WHERE id=?"); 602 | qry.bind(1, r.id); 603 | for (auto v : qry) 604 | { 605 | // only one entry is expected 606 | char const *phone, *postal, *web; 607 | v.getter() >> phone >> postal >> web; 608 | r.phone = (phone ? phone : ""); 609 | r.postal_code = (postal ? postal : ""); 610 | r.website = (web ? web : ""); 611 | return; 612 | } 613 | } 614 | 615 | bool Geocoder::get_id_range(std::string &v, bool full_range, index_id_value range0, 616 | index_id_value range1, index_id_value **idx0, index_id_value **idx1) 617 | { 618 | int sz = get_id_number_of_values(v); 619 | index_id_value *v0 = (index_id_value *)v.data(); 620 | if (sz == 0) 621 | return false; 622 | 623 | if (full_range) 624 | { 625 | *idx0 = v0; 626 | *idx1 = v0 + sz; 627 | return true; 628 | } 629 | 630 | *idx0 = std::lower_bound(v0, v0 + sz, range0); 631 | if (*idx0 - v0 >= sz) 632 | return false; 633 | 634 | *idx1 = std::upper_bound(v0, v0 + sz, range1); 635 | if (*idx1 - v0 >= sz && *(v0) > range1) 636 | return false; 637 | 638 | return true; 639 | } 640 | 641 | // search next to the reference point 642 | bool Geocoder::search_nearby(const std::vector &name_query, 643 | const std::vector &type_query, double latitude, 644 | double longitude, double radius, std::vector &result, 645 | Postal &postal) 646 | { 647 | if (radius < 0) 648 | return false; 649 | 650 | // rough estimates of distance 
(meters) per degree 651 | // 652 | const double dist_per_degree_lat = distance_per_latitude(); 653 | const double dist_per_degree_lon = distance_per_longitude(latitude); 654 | 655 | try 656 | { 657 | std::ostringstream qtxt; 658 | qtxt << "SELECT o.id, o.name, o.name_extra, o.name_en, t.name, o.latitude, o.longitude, " 659 | "o.search_rank " 660 | << "FROM object_primary o " 661 | << "JOIN type t ON o.type_id=t.id " 662 | << "JOIN object_primary_rtree ON (o.box_id = object_primary_rtree.id) " 663 | << "WHERE "; 664 | 665 | if (!type_query.empty()) 666 | { 667 | std::string tqfull; 668 | for (size_t ti = 0; ti < type_query.size(); ++ti) 669 | { 670 | if (!tqfull.empty()) 671 | tqfull += " OR "; 672 | else 673 | tqfull = "("; 674 | tqfull += " t.name = :type" + std::to_string(ti); // placeholder only; the value is bound after the statement is prepared 675 | } 676 | qtxt << tqfull << ") AND "; 677 | } 678 | 679 | qtxt << "maxLat>=:minLat AND minLat<=:maxLat AND maxLon >= :minLon AND minLon <= :maxLon"; 680 | 681 | #ifdef GEONLP_PRINT_SQL 682 | std::cout << qtxt.str() << "\n"; 683 | #endif 684 | sqlite3pp::query qry(m_db, qtxt.str().c_str()); 685 | for (size_t ti = 0; ti < type_query.size(); ++ti) qry.bind((":type" + std::to_string(ti)).c_str(), type_query[ti].c_str(), sqlite3pp::nocopy); // type names are passed as bound values, never as SQL text; type_query outlives the statement, so nocopy is safe 686 | qry.bind(":minLat", latitude - radius / dist_per_degree_lat); 687 | qry.bind(":maxLat", latitude + radius / dist_per_degree_lat); 688 | qry.bind(":minLon", longitude - radius / dist_per_degree_lon); 689 | qry.bind(":maxLon", longitude + radius / dist_per_degree_lon); 690 | 691 | for (auto v : qry) 692 | { 693 | long long id; 694 | char const *name, *name_extra, *name_en; 695 | std::string type; 696 | double lat, lon, distance; 697 | int search_rank; 698 | v.getter() >> id >> name >> name_extra >> name_en >> type >> lat >> lon >> search_rank; 699 | 700 | // check if distance is ok. 
note that the distance is expected 701 | // to be small (on the scale of the planet) 702 | { 703 | double dlat = dist_per_degree_lat * (latitude - lat); 704 | double dlon = dist_per_degree_lon * (longitude - lon); 705 | distance = sqrt(dlat * dlat + dlon * dlon); 706 | if (distance > radius) 707 | continue; // skip this result 708 | } 709 | 710 | // check name query 711 | if (!name_query.empty()) 712 | { 713 | bool found = false; 714 | std::set names; 715 | if (name) 716 | names.insert(name); 717 | if (name_extra) 718 | names.insert(name_extra); 719 | if (name_en) 720 | names.insert(name_en); 721 | for (auto n = names.begin(); n != names.end() && !found; ++n) 722 | { 723 | if (n->empty()) 724 | continue; 725 | std::vector expanded; 726 | postal.expand_string(*n, expanded); 727 | 728 | for (auto q = name_query.cbegin(); !found && q != name_query.cend(); ++q) 729 | for (auto e = expanded.begin(); !found && e != expanded.end(); ++e) 730 | // search is for whether the name starts with the query or has 731 | // the query after space (think of street Dr. 
Someone and query Someone) 732 | found = (e->compare(0, q->length(), *q) == 0 733 | || e->find(" " + *q) != std::string::npos); 734 | } 735 | 736 | if (!found) 737 | continue; // substring not found 738 | } 739 | 740 | GeoResult r; 741 | r.id = id; 742 | 743 | get_name(r.id, r.title, r.address, r.admin_levels, m_levels_in_title); 744 | r.type = get_type(r.id); 745 | get_features(r); 746 | 747 | r.latitude = lat; 748 | r.longitude = lon; 749 | r.distance = distance; 750 | r.search_rank = search_rank; 751 | r.levels_resolved = 1; // not used in this search 752 | 753 | result.push_back(r); 754 | } 755 | } 756 | catch (sqlite3pp::database_error &e) 757 | { 758 | std::cerr << "Geocoder exception: " << e.what() << std::endl; 759 | return false; 760 | } 761 | 762 | if (m_max_results > 0 && result.size() >= m_max_results) 763 | { 764 | Geocoder::sort_by_distance(result.begin(), result.end()); 765 | result.resize(m_max_results); 766 | } 767 | 768 | return true; 769 | } 770 | 771 | // search next to the reference linestring 772 | bool Geocoder::search_nearby(const std::vector &name_query, 773 | const std::vector &type_query, 774 | const std::vector &latitude, 775 | const std::vector &longitude, double radius, 776 | std::vector &result, Postal &postal, size_t skip_points) 777 | { 778 | if (radius < 0 || latitude.size() < 2 || latitude.size() != longitude.size()) 779 | return false; 780 | 781 | // const double earth_radius = 6378137; 782 | const double radius2 = radius * radius; 783 | 784 | // // fill linestring 785 | // boost::geometry::model::linestring ls; 786 | // for (size_t i=0; i < latitude.size(); ++i) 787 | // boost::geometry::append(ls, point_t(longitude[i],latitude[i])); 788 | 789 | try 790 | { 791 | 792 | std::set processed_boxes; 793 | double line_distance = 0; 794 | for (size_t LineI = skip_points; 795 | LineI < longitude.size() - 1 && (m_max_results == 0 || result.size() <= m_max_results); 796 | ++LineI) 797 | { 798 | 799 | // rough estimates of distance 
(meters) per degree 800 | const double dist_per_degree_lat = distance_per_latitude(); 801 | const double dist_per_degree_lon = distance_per_longitude(latitude[LineI]); 802 | { 803 | double x1 = latitude[LineI] * dist_per_degree_lat; 804 | double y1 = longitude[LineI] * dist_per_degree_lon; 805 | double x2 = latitude[LineI + 1] * dist_per_degree_lat; 806 | double y2 = longitude[LineI + 1] * dist_per_degree_lon; 807 | double dx = x2 - x1; 808 | double dy = y2 - y1; 809 | line_distance += sqrt(dx * dx + dy * dy); 810 | } 811 | 812 | // step 1: get boxes that are near the line segment 813 | std::deque newboxes; 814 | { 815 | sqlite3pp::query qry(m_db, "SELECT id, minLat, maxLat, minLon, maxLon " 816 | "FROM object_primary_rtree " 817 | "WHERE maxLat>=:minLat AND minLat<=:maxLat AND maxLon >= " 818 | ":minLon AND minLon <= :maxLon"); 819 | 820 | auto bb_lat = std::minmax(latitude[LineI], latitude[LineI + 1]); 821 | auto bb_lon = std::minmax(longitude[LineI], longitude[LineI + 1]); 822 | 823 | qry.bind(":minLat", bb_lat.first - radius / dist_per_degree_lat); 824 | qry.bind(":maxLat", bb_lat.second + radius / dist_per_degree_lat); 825 | qry.bind(":minLon", bb_lon.first - radius / dist_per_degree_lon); 826 | qry.bind(":maxLon", bb_lon.second + radius / dist_per_degree_lon); 827 | 828 | for (auto v : qry) 829 | { 830 | long long id; 831 | double minLat, maxLat, minLon, maxLon; 832 | v.getter() >> id >> minLat >> maxLat >> minLon >> maxLon; 833 | 834 | if (processed_boxes.count(id)) 835 | continue; 836 | processed_boxes.insert(id); 837 | newboxes.push_back(id); 838 | } 839 | } 840 | 841 | // check if anything is new 842 | if (newboxes.empty()) 843 | continue; 844 | 845 | // step 2: search for objects from new boxes 846 | { 847 | std::ostringstream qtxt; 848 | qtxt << "SELECT o.id, o.name, o.name_extra, o.name_en, t.name, " 849 | << "o.latitude, o.longitude, o.search_rank " 850 | << "FROM object_primary o " 851 | << "JOIN type t ON o.type_id=t.id " 852 | << "WHERE "; 853 | 
854 | if (!type_query.empty()) 855 | { 856 | std::string tqfull; 857 | for (auto tq : type_query) 858 | { 859 | if (!tqfull.empty()) 860 | tqfull += " OR "; 861 | else 862 | tqfull = "("; 863 | tqfull += " t.name = '" + tq + "'"; 864 | } 865 | qtxt << tqfull << ") AND "; 866 | } 867 | 868 | qtxt << " o.box_id IN ("; 869 | for (size_t i = 0; i < newboxes.size(); ++i) 870 | { 871 | if (i) 872 | qtxt << ", "; 873 | qtxt << newboxes[i]; 874 | } 875 | qtxt << ")"; 876 | #ifdef GEONLP_PRINT_SQL 877 | std::cout << qtxt.str() << "\n"; 878 | #endif 879 | sqlite3pp::query qry(m_db, qtxt.str().c_str()); 880 | 881 | for (auto v : qry) 882 | { 883 | long long id; 884 | char const *name, *name_extra, *name_en; 885 | std::string type; 886 | double lat, lon, distance; 887 | int search_rank; 888 | v.getter() >> id >> name >> name_extra >> name_en >> type >> lat >> lon 889 | >> search_rank; 890 | 891 | // check if distance is ok using earth as a plane approximation around the line 892 | { 893 | double distance2 = -1; 894 | const double xp = lat * dist_per_degree_lat; 895 | const double yp = lon * dist_per_degree_lon; 896 | double u = 0, nrm2 = 0; // cache the last result for distance 897 | for (size_t i = LineI; 898 | i < longitude.size() - 1 && (distance2 < 0 || distance2 > radius2); ++i) 899 | { 900 | // constants used for distance calculations, drop when 901 | // moving back to boost geometry 902 | const double x1 = latitude[i] * dist_per_degree_lat; 903 | const double y1 = longitude[i] * dist_per_degree_lon; 904 | const double x2 = latitude[i + 1] * dist_per_degree_lat; 905 | const double y2 = longitude[i + 1] * dist_per_degree_lon; 906 | const double px = x2 - x1; 907 | const double py = y2 - y1; 908 | nrm2 = px * px + py * py; 909 | 910 | u = ((xp - x1) * px + (yp - y1) * py) / nrm2; 911 | u = std::max(0.0, std::min(1.0, u)); 912 | 913 | double dx = (x1 + u * px) - xp; 914 | double dy = (y1 + u * py) - yp; 915 | double dd = dx * dx + dy * dy; 916 | 917 | if (distance2 < 0 || 
distance2 > dd) 918 | distance2 = dd; 919 | } 920 | 921 | if (distance2 > radius2) 922 | continue; 923 | 924 | distance = line_distance - (1 - u) * sqrt(nrm2); 925 | } 926 | // // distance check using boost 927 | // if ( boost::geometry::distance(point_t(lon, lat), ls)*earth_radius > radius ) 928 | // continue; // skip this result 929 | 930 | // check name query 931 | if (!name_query.empty()) 932 | { 933 | bool found = false; 934 | std::set names; 935 | if (name) 936 | names.insert(name); 937 | if (name_extra) 938 | names.insert(name_extra); 939 | if (name_en) 940 | names.insert(name_en); 941 | for (auto n = names.begin(); n != names.end() && !found; ++n) 942 | { 943 | if (n->empty()) 944 | continue; 945 | std::vector expanded; 946 | postal.expand_string(*n, expanded); 947 | 948 | for (auto q = name_query.cbegin(); !found && q != name_query.cend(); ++q) 949 | for (auto e = expanded.begin(); !found && e != expanded.end(); ++e) 950 | // search is for whether the name starts with the query or has 951 | // the query after space (think of street Dr. 
Someone and query Someone) 952 | found = (e->compare(0, q->length(), *q) == 0 953 | || e->find(" " + *q) != std::string::npos); 954 | } 955 | 956 | if (!found) 957 | continue; // substring not found 958 | } 959 | 960 | GeoResult r; 961 | r.id = id; 962 | 963 | get_name(r.id, r.title, r.address, r.admin_levels, m_levels_in_title); 964 | r.type = get_type(r.id); 965 | get_features(r); 966 | 967 | r.latitude = lat; 968 | r.longitude = lon; 969 | r.distance = distance; 970 | r.search_rank = search_rank; 971 | r.levels_resolved = 1; // not used in this search 972 | 973 | result.push_back(r); 974 | } 975 | } 976 | } 977 | } 978 | catch (sqlite3pp::database_error &e) 979 | { 980 | std::cerr << "Geocoder exception: " << e.what() << std::endl; 981 | return false; 982 | } 983 | 984 | return true; 985 | } 986 | 987 | int Geocoder::closest_segment(const std::vector &latitude, 988 | const std::vector &longitude, double reference_latitude, 989 | double reference_longitude) 990 | { 991 | // make all checks first 992 | if (latitude.size() < 2 || latitude.size() != longitude.size()) 993 | return -1; 994 | 995 | // rough estimates of distance (meters) per degree 996 | const double dist_per_degree_lat = distance_per_latitude(); 997 | const double dist_per_degree_lon = distance_per_longitude(reference_latitude); 998 | 999 | const double xp = reference_latitude * dist_per_degree_lat; 1000 | const double yp = reference_longitude * dist_per_degree_lon; 1001 | 1002 | size_t currI = 0; 1003 | double currD2 = -1; 1004 | for (size_t i = 0; i < longitude.size() - 1; ++i) 1005 | { 1006 | const double x1 = latitude[i] * dist_per_degree_lat; 1007 | const double y1 = longitude[i] * dist_per_degree_lon; 1008 | const double x2 = latitude[i + 1] * dist_per_degree_lat; 1009 | const double y2 = longitude[i + 1] * dist_per_degree_lon; 1010 | const double px = x2 - x1; 1011 | const double py = y2 - y1; 1012 | const double nrm2 = px * px + py * py; 1013 | 1014 | double u = ((xp - x1) * px + (yp - y1) * 
py) / nrm2; 1015 | u = std::max(0.0, std::min(1.0, u)); 1016 | double dx = (x1 + u * px) - xp; 1017 | double dy = (y1 + u * py) - yp; 1018 | double dd = dx * dx + dy * dy; 1019 | 1020 | if (currD2 < 0 || currD2 > dd) 1021 | { 1022 | currI = i; 1023 | currD2 = dd; 1024 | } 1025 | } 1026 | 1027 | return currI; 1028 | } 1029 | 1030 | double Geocoder::search_rank_location_bias(double distance, int zoom) 1031 | { 1032 | // based on Photon implementation of the bias 1033 | const double offset = 800; // meters 1034 | 1035 | distance -= offset; 1036 | if (distance <= 0) 1037 | return 1.0; 1038 | 1039 | zoom = std::max(zoom, 18); 1040 | double radius = (1 << (18 - zoom)) * 250; // meters 1041 | return exp(-distance / radius); 1042 | } 1043 | -------------------------------------------------------------------------------- /src/geocoder.h: -------------------------------------------------------------------------------- 1 | #ifndef GEOCODER_H 2 | #define GEOCODER_H 3 | 4 | #include "postal.h" 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | namespace GeoNLP 15 | { 16 | 17 | class Geocoder 18 | { 19 | 20 | public: 21 | struct GeoResult 22 | { 23 | long long int id; 24 | double latitude; 25 | double longitude; 26 | double distance = 0; 27 | std::string title; 28 | std::string address; 29 | std::string type; 30 | std::string phone; 31 | std::string postal_code; 32 | std::string website; 33 | size_t levels_resolved; 34 | size_t admin_levels = 0; 35 | double search_rank; 36 | 37 | bool operator<(const GeoResult &i) const 38 | { 39 | return (search_rank < i.search_rank 40 | || (search_rank == i.search_rank && address.length() < i.address.length()) 41 | || (search_rank == i.search_rank && address.length() == i.address.length() 42 | && address < i.address)); 43 | } 44 | }; 45 | 46 | class GeoReference 47 | { 48 | public: 49 | GeoReference(); 50 | GeoReference(double lat, double lon, int zoom = 16, double importance = 0.75); 51 | 52 | 
void set(double lat, double lon, int zoom = 16, double importance = 0.75); 53 | void reset(); 54 | 55 | bool is_set() const { return m_is_set; } 56 | int zoom() const { return m_zoom; } 57 | double importance() const { return m_importance; } 58 | double distance(const GeoResult &r) const; 59 | 60 | private: 61 | double m_latitude; 62 | double m_longitude; 63 | double m_importance; 64 | int m_zoom; 65 | bool m_is_set; 66 | }; 67 | 68 | typedef uint32_t index_id_key; 69 | typedef uint32_t index_id_value; 70 | 71 | static const int version; 72 | static const size_t num_languages; ///< Number of languages supported by this version 73 | 74 | public: 75 | Geocoder(); 76 | Geocoder(const std::string &dbpath); 77 | 78 | virtual ~Geocoder() {} 79 | 80 | /// \brief Search for any objects matching the normalized query 81 | /// 82 | bool search(const std::vector &parsed_query, std::vector &result, 83 | size_t min_levels = 0, const GeoReference &reference = GeoReference()); 84 | 85 | /// \brief Search for objects within given radius from specified point and matching the query 86 | /// 87 | /// Here, radius is given in meters and the reference point is 88 | /// given by latitude and longitude (WGS 84). Query is given by name 89 | /// and type. When the both are given, the both queries have to be fulfilled 90 | /// (think of cafe and its name). Within type and name queries, a single match 91 | /// is sufficient. 92 | bool search_nearby(const std::vector &name_query, 93 | const std::vector &type_query, double latitude, double longitude, 94 | double radius, std::vector &result, Postal &postal); 95 | 96 | /// \brief Search for objects within given radius from specified linestring and matching the query 97 | /// 98 | /// Here, radius is given in meters and the reference linestring 99 | /// is given by latitude and longitude vectors (WGS 84). Query is 100 | /// given by name and type. 
When the both are given, the both 101 | /// queries have to be fulfilled (think of cafe and its 102 | /// name). Within type and name queries, a single match is 103 | /// sufficient. 104 | /// 105 | /// Parameter skip_points can be used to skip the given number of 106 | /// points from the beginning of the line when searching for 107 | /// objects. This, for example, is used when looking for objects 108 | /// next to route upcoming from the current location 109 | bool search_nearby(const std::vector &name_query, 110 | const std::vector &type_query, 111 | const std::vector &latitude, const std::vector &longitude, 112 | double radius, std::vector &result, Postal &postal, 113 | size_t skip_points = 0); 114 | 115 | int get_levels_in_title() const { return m_levels_in_title; } 116 | void set_levels_in_title(int l) { m_levels_in_title = l; } 117 | 118 | size_t get_max_queries_per_hierarchy() const { return m_max_queries_per_hierarchy; } 119 | void set_max_queries_per_hierarchy(size_t mx) { m_max_queries_per_hierarchy = mx; } 120 | 121 | size_t get_max_results() const { return m_max_results; } 122 | void set_max_results(size_t mx) 123 | { 124 | m_max_results = mx; 125 | update_limits(); 126 | } 127 | 128 | size_t get_max_intermediate_offset() const { return m_max_inter_offset; } 129 | void set_max_intermediate_offset(size_t mx) 130 | { 131 | m_max_inter_offset = mx; 132 | update_limits(); 133 | } 134 | 135 | /// \brief Set preferred language for results 136 | /// 137 | /// Use two-letter coded language code as an argument. For 138 | /// example, for English, set it to "en". If the original name is 139 | /// requested, you could set it to any invalid letter combination 140 | /// or an empty string. 
141 | void set_result_language(const std::string &lang) { m_preferred_result_language = lang; } 142 | 143 | bool load(const std::string &dbpath); 144 | bool load(); 145 | void drop(); 146 | 147 | operator bool() const { return m_database_open; } 148 | 149 | public: 150 | static std::string name_primary(const std::string &dname); 151 | static std::string name_normalized_trie(const std::string &dname); 152 | static std::string name_normalized_id(const std::string &dname); 153 | 154 | // interaction with key/value database 155 | static std::string make_id_key(index_id_key key) 156 | { 157 | return std::string((char *)&key, sizeof(key)); 158 | } 159 | 160 | static std::string make_id_value(const std::vector v) 161 | { 162 | return std::string((char *)v.data(), sizeof(index_id_value) * v.size()); 163 | } 164 | 165 | static index_id_key get_id_key(const std::string &v) { return *((index_id_key *)v.data()); } 166 | 167 | static index_id_value get_id_value(const std::string &v, size_t index) 168 | { 169 | index_id_value *p = (index_id_value *)v.data(); 170 | return p[index]; 171 | } 172 | 173 | static size_t get_id_number_of_values(const std::string &v) 174 | { 175 | return v.size() / sizeof(index_id_value); 176 | } 177 | 178 | static bool get_id_range(std::string &v, bool full_range, index_id_value range0, 179 | index_id_value range1, index_id_value **idx0, index_id_value **idx1); 180 | 181 | // support for sorting by distance 182 | static bool distcomp(const Geocoder::GeoResult &i, const Geocoder::GeoResult &j) 183 | { 184 | return (i.distance < j.distance); 185 | } 186 | 187 | template static void sort_by_distance(T begin, T end) 188 | { 189 | std::sort(begin, end, distcomp); 190 | } 191 | 192 | // search for the segment on a line that is the closest to the 193 | // specified point. 
returns negative value on error 194 | static int closest_segment(const std::vector &latitude, 195 | const std::vector &longitude, double reference_latitude, 196 | double reference_longitude); 197 | 198 | protected: 199 | bool search(const Postal::Hierarchy &parsed, const std::string &postal_code, 200 | std::vector &result, size_t level = 0, long long int range0 = 0, 201 | long long int range1 = 0); 202 | 203 | void get_name(long long int id, std::string &title, std::string &full, size_t &admin_levels, 204 | int levels_in_title); 205 | 206 | std::string get_postal_code(long long int id); 207 | 208 | std::string get_type(long long int id); 209 | 210 | void get_features(GeoResult &r); 211 | 212 | virtual bool check_version(); 213 | 214 | bool check_version(const std::string &supported); 215 | 216 | void update_limits(); 217 | 218 | static double search_rank_location_bias(double distance, int zoom = 16); 219 | 220 | protected: 221 | sqlite3pp::database m_db; 222 | std::string m_database_path; 223 | bool m_database_open = false; 224 | 225 | kyotocabinet::HashDB m_database_norm_id; 226 | marisa::Trie m_trie_norm; 227 | 228 | int m_levels_in_title = 2; 229 | size_t m_max_queries_per_hierarchy = 0; 230 | size_t m_max_results = 25; 231 | size_t m_max_inter_offset = 100; 232 | size_t m_max_inter_results; 233 | 234 | size_t m_levels_resolved; 235 | size_t m_query_count; 236 | 237 | std::string m_preferred_result_language; 238 | }; 239 | 240 | } 241 | #endif // GEOCODER_H 242 | -------------------------------------------------------------------------------- /src/postal.cpp: -------------------------------------------------------------------------------- 1 | #include "postal.h" 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | #include 14 | 15 | using namespace GeoNLP; 16 | 17 | #define ADDRESS_PARSER_LABEL_HOUSE "house" 18 | #define ADDRESS_PARSER_LABEL_CATEGORY "category" 19 | #define 
//////////////////////////////////////////////////////////////////////
/// Helper string functions
///

/// Copy \a s together with its terminating NUL character into \a v.
/// Used to obtain writable, NUL-terminated buffers for C APIs that
/// take char* instead of const char*.
static void str2vecchar(const std::string &s, std::vector<char> &v)
{
  v.assign(s.c_str(), s.c_str() + s.length() + 1);
}

/// Remove whitespace from the start of \a s in place; returns \a s.
static inline std::string &ltrim(std::string &s)
{
  // the lambda takes unsigned char: std::isspace is undefined for negative values
  s.erase(s.begin(),
          std::find_if(s.begin(), s.end(), [](unsigned char ch) { return !std::isspace(ch); }));
  return s;
}

/// Remove whitespace from the end of \a s in place; returns \a s.
static inline std::string &rtrim(std::string &s)
{
  s.erase(
      std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) { return !std::isspace(ch); }).base(),
      s.end());
  return s;
}

/// Remove whitespace from both ends of \a s in place; returns \a s.
static inline std::string &trim(std::string &s)
{
  return ltrim(rtrim(s));
}

/// Split \a s at \a delim and append the trimmed, non-empty tokens to \a elems.
///
/// Emptiness is tested after trimming, so tokens that consist of
/// whitespace only are skipped in the same way as empty ones.
static void split_tokens(const std::string &s, char delim, std::vector<std::string> &elems)
{
  std::istringstream ss(s);
  std::string        item;
  while (std::getline(ss, item, delim))
    if (!trim(item).empty())
      elems.push_back(item);
}
82 | ss << "h-" << ind; 83 | return ss.str(); 84 | } 85 | 86 | /////////////////////////////////////////////////////////////////// 87 | /// Postal class 88 | 89 | Postal::Postal() {} 90 | 91 | Postal::~Postal() 92 | { 93 | drop(); 94 | } 95 | 96 | void Postal::set_postal_datadir(const std::string &global, const std::string &country) 97 | { 98 | if (global.length() < 1) 99 | m_postal_datadir_global.clear(); 100 | else 101 | str2vecchar(global, m_postal_datadir_global); 102 | 103 | if (country.length() < 1) 104 | m_postal_datadir_country.clear(); 105 | else 106 | str2vecchar(country, m_postal_datadir_country); 107 | 108 | drop(); // force reinitialization 109 | } 110 | 111 | void Postal::set_postal_datadir_country(const std::string &country) 112 | { 113 | std::vector nc; 114 | if (country.length() >= 1) 115 | str2vecchar(country, nc); 116 | 117 | if (nc == m_postal_datadir_country) 118 | return; // nothing to do 119 | 120 | m_postal_datadir_country = nc; 121 | if (m_initialized) 122 | { 123 | // have to drop current country-specific parser and load a new one 124 | libpostal_teardown_parser(); 125 | 126 | if ((m_postal_datadir_country.empty() && !libpostal_setup_parser()) 127 | || (!m_postal_datadir_country.empty() 128 | && !libpostal_setup_parser_datadir(m_postal_datadir_country.data()))) 129 | { 130 | std::cerr << "Postal: Error at set_postal_datadir_country " << country << std::endl; 131 | drop(); 132 | } 133 | } 134 | } 135 | 136 | void Postal::add_language(const std::string &lang) 137 | { 138 | std::vector l; 139 | str2vecchar(lang, l); 140 | m_postal_languages.push_back(l); 141 | drop(); 142 | } 143 | 144 | bool Postal::init() 145 | { 146 | if (m_initialized) 147 | return true; 148 | 149 | // used for transliteration and basic setup 150 | if ((m_postal_datadir_global.empty() && !libpostal_setup()) 151 | || (!m_postal_datadir_global.empty() 152 | && !libpostal_setup_datadir(m_postal_datadir_global.data()))) 153 | return false; 154 | 155 | if (m_use_postal) 156 | 
{ 157 | if (m_postal_languages.empty()) 158 | { 159 | if ((m_postal_datadir_global.empty() && !libpostal_setup_language_classifier()) 160 | || (!m_postal_datadir_global.empty() 161 | && !libpostal_setup_language_classifier_datadir(m_postal_datadir_global.data()))) 162 | return false; 163 | } 164 | 165 | if ((m_postal_datadir_country.empty() && !libpostal_setup_parser()) 166 | || (!m_postal_datadir_country.empty() 167 | && !libpostal_setup_parser_datadir(m_postal_datadir_country.data()))) 168 | return false; 169 | } 170 | 171 | m_initialized = true; 172 | return true; 173 | } 174 | 175 | void Postal::drop() 176 | { 177 | if (!m_initialized) 178 | return; 179 | libpostal_teardown_parser(); 180 | libpostal_teardown(); 181 | libpostal_teardown_language_classifier(); 182 | m_initialized = false; 183 | } 184 | 185 | /// Normalize first, expand next. Seems to miss few expansions in current implementation 186 | /// Have to wait for new libpostal version to switch to this approach 187 | 188 | // bool Postal::parse(const std::string &input, std::vector &result) 189 | //{ 190 | // if (!init()) return false; 191 | 192 | // // convert string into vector of chars (libpostal uses char* as an argument, not const char*) 193 | // std::vector charbuff; 194 | // charbuff.resize(input.length() + 1); 195 | // std::copy(input.c_str(), input.c_str() + input.length() + 1, charbuff.begin()); 196 | 197 | // size_t num_expansions; 198 | // normalize_options_t options_norm = get_libpostal_default_options(); 199 | // address_parser_options_t options_parse = get_libpostal_address_parser_default_options(); 200 | 201 | // char **expansions = expand_address(charbuff.data(), options_norm, &num_expansions); 202 | // if (m_initialize_for_every_call) 203 | // libpostal_teardown_language_classifier(); 204 | 205 | // for (size_t i = 0; i < num_expansions; i++) 206 | // { 207 | // address_parser_response_t *parsed = parse_address(expansions[i], options_parse); 208 | 209 | // ParseResult r; 210 | // for 
(size_t j = 0; j < parsed->num_components; j++) 211 | // r[ parsed->labels[j] ] = parsed->components[j]; 212 | 213 | // result.push_back(r); 214 | 215 | // address_parser_response_destroy(parsed); 216 | // } 217 | 218 | // expansion_array_destroy(expansions, num_expansions); 219 | 220 | // if (m_initialize_for_every_call) drop(); 221 | 222 | // return true; 223 | //} 224 | 225 | ////////////////////////////////////////////////////////////// 226 | /// expand first then normalize 227 | 228 | bool Postal::parse(const std::string &input, std::vector &result, 229 | Postal::ParseResult &nonormalization) 230 | { 231 | if (m_use_postal && !init()) 232 | return false; 233 | 234 | // libpostal parsing 235 | if (m_use_postal) 236 | { 237 | // convert string into vector of chars (libpostal uses char* as an argument, not const char*) 238 | std::vector charbuff; 239 | charbuff.resize(input.length() + 1); 240 | std::copy(input.c_str(), input.c_str() + input.length() + 1, charbuff.begin()); 241 | 242 | libpostal_address_parser_options_t options_parse 243 | = libpostal_get_address_parser_default_options(); 244 | 245 | // parse the address 246 | libpostal_address_parser_response_t *parsed 247 | = libpostal_parse_address(charbuff.data(), options_parse); 248 | nonormalization.clear(); 249 | for (size_t j = 0; j < parsed->num_components; j++) 250 | { 251 | std::vector pc; 252 | pc.push_back(parsed->components[j]); 253 | nonormalization[parsed->labels[j]] = pc; 254 | } 255 | libpostal_address_parser_response_destroy(parsed); 256 | 257 | expand(nonormalization, result); 258 | } 259 | 260 | // primitive parsing 261 | if (m_use_primitive) 262 | { 263 | std::vector hier; 264 | split_tokens(input, ',', hier); 265 | if (hier.empty()) 266 | hier.push_back(input); 267 | 268 | ParseResult prim; 269 | int shift = 0; 270 | size_t np = strlen(PRIMITIVE_ADDRESS_PARSER_POSTAL_CODE_INPUT); 271 | for (size_t j = 0; j < hier.size(); j++) 272 | { 273 | std::string v = hier[hier.size() - j - 1]; 274 | 
std::string key = primitive_key(j - shift); 275 | v = trim(v); 276 | if (v.compare(0, np, PRIMITIVE_ADDRESS_PARSER_POSTAL_CODE_INPUT) == 0) 277 | { 278 | v = v.substr(np); 279 | v = trim(v); 280 | key = PRIMITIVE_ADDRESS_PARSER_POSTAL_CODE_KEY; 281 | shift += 1; 282 | } 283 | std::vector pc; 284 | pc.push_back(v); 285 | prim[key] = pc; 286 | } 287 | 288 | expand(prim, result); 289 | } 290 | 291 | if (m_initialize_for_every_call) 292 | drop(); 293 | 294 | return true; 295 | } 296 | 297 | void Postal::expand(const Postal::ParseResult &input, std::vector &result) 298 | { 299 | if (!init()) 300 | { 301 | result.push_back(input); 302 | return; 303 | } 304 | 305 | size_t num_expansions; 306 | libpostal_normalize_options_t options_norm = libpostal_get_default_options(); 307 | 308 | std::vector lang; 309 | for (std::vector &l : m_postal_languages) 310 | lang.push_back(l.data()); 311 | 312 | options_norm.languages = lang.data(); 313 | options_norm.num_languages = lang.size(); 314 | 315 | std::vector charbuff; 316 | 317 | std::vector > address_expansions; 318 | std::vector address_keys; 319 | for (const auto &i : input) 320 | { 321 | // in practice, its only one element at ParseResult at this stage 322 | for (const std::string &tonorm : i.second) 323 | { 324 | std::set norm; 325 | // always add unexpanded result into address expansions 326 | // this will help with the partial entries as described in 327 | // issue #64 https://github.com/rinigus/geocoder-nlp/issues/64 328 | norm.insert(tonorm); 329 | // no need to keep postal code in normalized and expanded 330 | if (i.first != ADDRESS_PARSER_LABEL_POSTAL_CODE 331 | && i.first != PRIMITIVE_ADDRESS_PARSER_POSTAL_CODE_KEY) 332 | { 333 | charbuff.resize(tonorm.length() + 1); 334 | std::copy(tonorm.c_str(), tonorm.c_str() + tonorm.length() + 1, charbuff.begin()); 335 | char **expansions 336 | = libpostal_expand_address(charbuff.data(), options_norm, &num_expansions); 337 | for (size_t j = 0; j < num_expansions; j++) 338 | 
norm.insert(expansions[j]); 339 | 340 | libpostal_expansion_array_destroy(expansions, num_expansions); 341 | } 342 | address_expansions.push_back(std::vector(norm.begin(), norm.end())); 343 | address_keys.push_back(i.first); 344 | } 345 | } 346 | 347 | ParseResult r; 348 | for (size_t i = 0; i < address_keys.size(); ++i) 349 | r[address_keys[i]] = address_expansions[i]; 350 | result.push_back(r); 351 | } 352 | 353 | void Postal::expand_string(const std::string &input, std::vector &expansions) 354 | { 355 | if (!m_use_postal || !init()) 356 | { 357 | expansions.push_back(input); 358 | return; 359 | } 360 | 361 | size_t num_expansions; 362 | libpostal_normalize_options_t options_norm = libpostal_get_default_options(); 363 | 364 | std::vector lang; 365 | for (std::vector &l : m_postal_languages) 366 | lang.push_back(l.data()); 367 | 368 | options_norm.languages = lang.data(); 369 | options_norm.num_languages = lang.size(); 370 | 371 | std::vector charbuff; 372 | 373 | charbuff.resize(input.length() + 1); 374 | std::copy(input.c_str(), input.c_str() + input.length() + 1, charbuff.begin()); 375 | 376 | char **expansions_cstr = libpostal_expand_address(charbuff.data(), options_norm, &num_expansions); 377 | for (size_t j = 0; j < num_expansions; j++) 378 | expansions.push_back(expansions_cstr[j]); 379 | 380 | libpostal_expansion_array_destroy(expansions_cstr, num_expansions); 381 | } 382 | 383 | std::string Postal::normalize_postalcode(const std::string &postal_code) 384 | { 385 | // According to https://en.wikipedia.org/wiki/Postal_code 386 | // postal codes use 387 | // - The Arabic numerals "0" to "9" 388 | // - Letters of the ISO basic Latin alphabet 389 | // - Spaces, hyphens 390 | // 391 | // So, simple normalization can be used. Here, its uppercased and double 392 | // spaces are removed. 
393 | 394 | std::string normalized; 395 | for (auto i : postal_code) 396 | { 397 | char c = std::toupper(i); 398 | if (c != ' ' || (!normalized.empty() && normalized.back() != ' ')) 399 | normalized.push_back(c); 400 | } 401 | 402 | // trim a single possible space at the end 403 | if (!normalized.empty() && normalized.back() == ' ') 404 | normalized.pop_back(); 405 | 406 | return normalized; 407 | } 408 | 409 | void Postal::result2hierarchy(const std::vector &p, std::vector &h, 410 | std::string &postal_code) 411 | { 412 | h.clear(); 413 | for (const ParseResult &r : p) 414 | { 415 | Hierarchy h_result; 416 | 417 | #define ADDIFHAS(k) \ 418 | { \ 419 | ParseResult::const_iterator it = r.find(k); \ 420 | if (it != r.end()) \ 421 | h_result.push_back(it->second); \ 422 | } 423 | 424 | ADDIFHAS(ADDRESS_PARSER_LABEL_COUNTRY); 425 | ADDIFHAS(ADDRESS_PARSER_LABEL_COUNTRY_REGION); 426 | ADDIFHAS(ADDRESS_PARSER_LABEL_STATE); 427 | ADDIFHAS(ADDRESS_PARSER_LABEL_STATE_DISTRICT); 428 | ADDIFHAS(ADDRESS_PARSER_LABEL_ISLAND); 429 | ADDIFHAS(ADDRESS_PARSER_LABEL_CITY); 430 | ADDIFHAS(ADDRESS_PARSER_LABEL_CITY_DISTRICT); 431 | ADDIFHAS(ADDRESS_PARSER_LABEL_SUBURB); 432 | ADDIFHAS(ADDRESS_PARSER_LABEL_ROAD); 433 | ADDIFHAS(ADDRESS_PARSER_LABEL_HOUSE_NUMBER); 434 | ADDIFHAS(ADDRESS_PARSER_LABEL_CATEGORY); 435 | ADDIFHAS(ADDRESS_PARSER_LABEL_HOUSE); 436 | 437 | if (postal_code.empty()) 438 | { 439 | ParseResult::const_iterator it = r.find(ADDRESS_PARSER_LABEL_POSTAL_CODE); 440 | if (it != r.end() && it->second.size()) 441 | postal_code = normalize_postalcode(it->second[0]); 442 | } 443 | 444 | // test if its primitive expansion result 445 | if (h_result.empty()) 446 | { 447 | bool done = false; 448 | for (size_t i = 0; !done; ++i) 449 | { 450 | ParseResult::const_iterator it = r.find(primitive_key(i)); 451 | if (it == r.end()) 452 | done = true; 453 | else 454 | h_result.push_back(it->second); 455 | } 456 | } 457 | 458 | // overwrite with the primitive parser postal code if needed 
459 | ParseResult::const_iterator it = r.find(PRIMITIVE_ADDRESS_PARSER_POSTAL_CODE_KEY); 460 | if (it != r.end() && it->second.size()) 461 | postal_code = normalize_postalcode(it->second[0]); 462 | 463 | h.push_back(h_result); 464 | } 465 | } 466 | -------------------------------------------------------------------------------- /src/postal.h: -------------------------------------------------------------------------------- 1 | #ifndef POSTAL_H 2 | #define POSTAL_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | namespace GeoNLP 9 | { 10 | 11 | class Postal 12 | { 13 | public: 14 | Postal(); 15 | ~Postal(); 16 | 17 | typedef std::map > ParseResult; 18 | typedef std::vector > Hierarchy; 19 | 20 | static std::string normalize_postalcode(const std::string &postal_code); 21 | static void result2hierarchy(const std::vector &p, std::vector &h, 22 | std::string &postal_code); 23 | 24 | /// \brief Parse and normalize input string 25 | /// 26 | bool parse(const std::string &input, std::vector &parsed, 27 | ParseResult &nonormalization); 28 | 29 | /// \brief Normalize input string and return its expansions 30 | /// 31 | void expand_string(const std::string &input, std::vector &expansions); 32 | 33 | bool get_initialize_every_call() const { return m_initialize_for_every_call; } 34 | void set_initialize_every_call(bool v) { m_initialize_for_every_call = v; } 35 | 36 | bool get_use_postal() const { return m_use_postal; } 37 | void set_use_postal(bool v) 38 | { 39 | m_use_postal = v; 40 | if (!v) 41 | drop(); 42 | } 43 | 44 | bool get_use_primitive() const { return m_use_primitive; } 45 | void set_use_primitive(bool v) { m_use_primitive = v; } 46 | 47 | void set_postal_datadir(const std::string &global, const std::string &country); 48 | void set_postal_datadir_country(const std::string &country); 49 | 50 | void clear_languages() 51 | { 52 | m_postal_languages.clear(); 53 | drop(); 54 | } 55 | void add_language(const std::string &lang); 56 | 57 | protected: 58 | bool init(); 59 
#ifndef GEOCODER_VERSION_H
#define GEOCODER_VERSION_H

// Components of the library version
#define GEOCODERNLP_MAJOR_VERSION 1
#define GEOCODERNLP_MINOR_VERSION 0
#define GEOCODERNLP_PATCH_VERSION 0

// Version as a single integer that can be compared in preprocessor
// conditions. The expansion is parenthesized as a whole so that it keeps
// its value inside a larger expression (e.g. GEOCODERNLP_VERSION * 2).
#define GEOCODERNLP_VERSION                                                                        \
  ((GEOCODERNLP_MAJOR_VERSION * 100000) + (GEOCODERNLP_MINOR_VERSION * 100)                        \
   + (GEOCODERNLP_PATCH_VERSION))

// Two-step stringification: the argument is macro-expanded before it is
// converted into a string literal
#define _STRINGIFY(n) _STRINGIFY_HELPER(n)
#define _STRINGIFY_HELPER(n) #n

// Version as a string literal in the form "major.minor.patch"
#define GEOCODERNLP_VERSION_STRING                                                                 \
  _STRINGIFY(GEOCODERNLP_MAJOR_VERSION) "."                                                        \
  _STRINGIFY(GEOCODERNLP_MINOR_VERSION) "."                                                        \
  _STRINGIFY(GEOCODERNLP_PATCH_VERSION)

#endif // GEOCODER_VERSION_H