├── .clang-format
├── .gitignore
├── .gitmodules
├── CMakeLists.txt
├── Database.md
├── LICENSE
├── README.md
├── demo
    ├── geocoder-nlp.cpp
    ├── nearby-line.cpp
    └── nearby-point.cpp
├── geocoder-nlp.pri
├── importer
    ├── README.md
    ├── data
    │   └── priority.list
    ├── sql
    │   ├── duplicates.sql
    │   └── nominatim_cyclic_parents.sql
    └── src
    │   ├── config.h
    │   ├── hierarchy.cpp
    │   ├── hierarchy.h
    │   ├── hierarchyitem.cpp
    │   ├── hierarchyitem.h
    │   ├── main.cpp
    │   ├── normalization.cpp
    │   ├── normalization.h
    │   ├── utils.cpp
    │   └── utils.h
├── scripts
    ├── postal
    │   ├── README.md
    │   ├── build_all_country_db.py
    │   ├── build_country_db.sh
    │   ├── countries_languages.txt
    │   ├── make_country_list.py
    │   └── split_addresses.py
    └── tags
    │   ├── README.md
    │   └── generate_mapstyle.py
└── src
    ├── geocoder.cpp
    ├── geocoder.h
    ├── postal.cpp
    ├── postal.h
    └── version.h


/.clang-format:
--------------------------------------------------------------------------------
 1 | ---
 2 | # Defaults come from the GNU style, with the column limit raised to 100.
 3 | BasedOnStyle: GNU
 4 | ColumnLimit: 100
 5 | ---
 6 | Language: Cpp
 7 | AllowShortFunctionsOnASingleLine: Inline
 8 | AlwaysBreakAfterDefinitionReturnType: None
 9 | AlwaysBreakAfterReturnType: None
10 | SpaceBeforeParens: ControlStatements
11 | 
12 | AlignConsecutiveAssignments: Consecutive
13 | AlignConsecutiveDeclarations: Consecutive


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Compiled Object files
 2 | *.slo
 3 | *.lo
 4 | *.o
 5 | 
 6 | # Compiled Dynamic libraries
 7 | *.so
 8 | *.dylib
 9 | 
10 | # Compiled Static libraries
11 | *.lai
12 | *.la
13 | *.a
14 | 
15 | *~
16 | 
17 | /*.pro.user
18 | build/
19 | .vscode
20 | *.code-workspace
21 | CMakeLists.txt.user
22 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "thirdparty/sqlite3pp"]
2 | 	path = thirdparty/sqlite3pp
3 | 	url = https://github.com/rinigus/sqlite3pp.git
4 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | # Build script for the Geocoder NLP importer and the demo programs.
  2 | # project(... DESCRIPTION ...) requires CMake 3.9, so 3.6 was too low a minimum.
  3 | cmake_minimum_required(VERSION 3.9)
  4 | 
  5 | project(geocoder-nlp
  6 |   DESCRIPTION "Geocoder NLP")
  7 | 
  8 | set(CMAKE_INCLUDE_CURRENT_DIR ON)
  9 | set(CMAKE_CXX_STANDARD 17)
 10 | set(CMAKE_CXX_STANDARD_REQUIRED True)
 11 | 
 12 | include(FeatureSummary)
 13 | include(GNUInstallDirs)
 14 | 
 15 | # find_package(PkgConfig) already loads the FindPkgConfig module.
 16 | find_package(PkgConfig REQUIRED)
 17 | find_package(nlohmann_json 3.2.0 REQUIRED)
 18 | find_package(Boost 1.30 COMPONENTS program_options REQUIRED)
 19 | 
 20 | # REQUIRED makes a missing library fail here with a clear message instead of
 21 | # later, when the PkgConfig::<name> imported target turns out not to exist.
 22 | pkg_check_modules(MARISA REQUIRED IMPORTED_TARGET marisa)
 23 | pkg_check_modules(KYOTOCABINET REQUIRED IMPORTED_TARGET kyotocabinet)
 24 | pkg_check_modules(POSTAL REQUIRED IMPORTED_TARGET libpostal)
 25 | pkg_check_modules(SQLITE3 REQUIRED IMPORTED_TARGET sqlite3)
 26 | pkg_check_modules(LIBPQXX REQUIRED IMPORTED_TARGET libpqxx)
 27 | 
 28 | # Warnings are errors by default; pass -DGEOCODER_NLP_WERROR=OFF to relax this.
 29 | option(GEOCODER_NLP_WERROR "Treat compiler warnings as errors" ON)
 30 | add_compile_options(-Wall -Wextra)
 31 | if(GEOCODER_NLP_WERROR)
 32 |   add_compile_options(-Werror)
 33 | endif()
 34 | 
 35 | # library sources shared by the importer and all demo programs
 36 | set(SRC
 37 |   src/geocoder.cpp
 38 |   src/postal.cpp)
 39 | 
 40 | set(HEAD
 41 |   src/geocoder.h
 42 |   src/postal.h
 43 |   src/version.h)
 44 | 
 45 | # sqlite3pp include
 46 | include_directories(thirdparty/sqlite3pp/headeronly_src)
 47 | include_directories(src)
 48 | 
 49 | # boost
 50 | include_directories(${Boost_INCLUDE_DIR})
 51 | 
 52 | # importer
 53 | set(IMPSRC
 54 |   importer/src/config.h
 55 |   importer/src/main.cpp
 56 |   importer/src/hierarchy.cpp
 57 |   importer/src/hierarchy.h
 58 |   importer/src/hierarchyitem.cpp
 59 |   importer/src/hierarchyitem.h
 60 |   importer/src/normalization.cpp
 61 |   importer/src/normalization.h
 62 |   importer/src/utils.cpp
 63 |   importer/src/utils.h
 64 | )
 65 | add_executable(geocoder-importer ${SRC} ${HEAD} ${IMPSRC})
 66 | target_link_libraries(geocoder-importer
 67 |   PRIVATE
 68 |     PkgConfig::MARISA
 69 |     PkgConfig::KYOTOCABINET
 70 |     PkgConfig::POSTAL
 71 |     PkgConfig::SQLITE3
 72 |     PkgConfig::LIBPQXX
 73 |     nlohmann_json::nlohmann_json
 74 |     ${Boost_LIBRARIES})
 75 | 
 76 | # demo codes
 77 | add_executable(geocoder-nlp
 78 |   demo/geocoder-nlp.cpp
 79 |   ${SRC}
 80 |   ${HEAD})
 81 | 
 82 | target_link_libraries(geocoder-nlp
 83 |   PRIVATE
 84 |     ${Boost_LIBRARIES}
 85 |     PkgConfig::MARISA
 86 |     PkgConfig::KYOTOCABINET
 87 |     PkgConfig::POSTAL
 88 |     PkgConfig::SQLITE3)
 89 | 
 90 | add_executable(nearby-line
 91 |   demo/nearby-line.cpp
 92 |   ${SRC}
 93 |   ${HEAD})
 94 | 
 95 | target_link_libraries(nearby-line
 96 |   PRIVATE
 97 |     PkgConfig::MARISA
 98 |     PkgConfig::KYOTOCABINET
 99 |     PkgConfig::POSTAL
100 |     PkgConfig::SQLITE3)
101 | 
102 | add_executable(nearby-point
103 |   demo/nearby-point.cpp
104 |   ${SRC}
105 |   ${HEAD})
106 | 
107 | target_link_libraries(nearby-point
108 |   PRIVATE
109 |     PkgConfig::MARISA
110 |     PkgConfig::KYOTOCABINET
111 |     PkgConfig::POSTAL
112 |     PkgConfig::SQLITE3)
113 | 
114 | # install (only the importer is installed; the demos stay in the build tree)
115 | install(TARGETS geocoder-importer
116 |     DESTINATION ${CMAKE_INSTALL_BINDIR})
117 | 
118 | # summary
119 | feature_summary(WHAT ALL FATAL_ON_MISSING_REQUIRED_PACKAGES)
120 | 


--------------------------------------------------------------------------------
/Database.md:
--------------------------------------------------------------------------------
 1 | # Geocoder NLP database format
 2 | 
 3 | The geocoder database consists of several files which are expected to be in the
 4 | same directory. All locations are described using a single coordinate to keep the
 5 | files as small as possible.
 6 | 
 7 | The files composing a database are:
 8 | 
 9 | 1. geonlp-primary.sqlite: SQLite database with location description and coordinate
10 | 2. geonlp-normalized.trie: MARISA database with normalized strings
11 | 3. geonlp-normalized-id.kch: Kyoto Cabinet database for linking MARISA and primary IDs
12 | 
13 | ## geonlp-primary.sqlite
14 | 
15 | The SQLite database contains location descriptions and their organization into
16 | a hierarchy of objects.
17 | 
18 | Table `object_primary` keeps location description. In this table, objects are
19 | stored sequentially (in terms of their `id`) according to the positioning in the
20 | object hierarchy with the children stored after parents. Table `hierarchy` has a
21 | record for each item (`id` from `object_primary`) that has children; the record
22 | consists of the parent ID (`prim_id`) and the ID of its last child (`last_subobject`).
23 | 
24 | Object types are stored separately in `type` table with the type ID used in
25 | `object_primary`.
26 | 
27 | Spatial queries are indexed using R-Tree with `box_id` used as a reference in
28 | `object_primary`. Namely, as all objects are stored as points, for storage
29 | efficiency, objects next to each other are set to have the same `box_id` and are
30 | found through `-rtree` tables.
31 | 
32 | Table `meta` keeps database format version and is used to check version
33 | compatibility.
34 | 
35 | ## geonlp-normalized.trie
36 | 
37 | All normalized strings are stored in MARISA database
38 | (https://github.com/s-yata/marisa-trie). Normalized strings are formed from
39 | `name` and other similar fields of `object_primary` table in
40 | `geonlp-primary.sqlite`. All strings are pushed into MARISA database that
41 | assigns its internal ID for each of the strings.
42 | 
43 | ## geonlp-normalized-id.kch
44 | 
45 | Kyoto Cabinet (https://dbmx.net/kyotocabinet/) database for linking MARISA and
46 | primary IDs. Hash database variant is used where `key` is an ID provided by
47 | MARISA for a search string and value is an array of bytes consisting of
48 | `object_primary` IDs stored as `uint32_t` one after another. The array is stored
49 | using `std::string`.


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 rinigus
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Geocoder NLP
 2 | 
 3 | This is a geocoder C++ library that targets offline use by mobile 
 4 | applications. It is able to perform forward and reverse geocoding. 
 5 | For forward geocoding, it uses libpostal to parse the user
 6 | request, normalize the parsed result, and search for the match in
 7 | geocoder database. In addition to traditional reverse geocoding, it is 
 8 | able to find points of interest close to the reference point or line.
 9 | 
10 | The library includes demo programs showing how to use it. It's also used
11 | as one of the geocoders in OSM Scout Server
12 | (https://github.com/rinigus/osmscout-server). For preparation of the
13 | SQLite database used by the geocoder, an importer from libosmscout
14 | map database is provided.
15 | 
16 | When using country-based libpostal address parser and limiting number
17 | of languages used to process the search request, it is possible to use
18 | this geocoder on mobile platforms. The development of the geocoder is
19 | mainly targeting Sailfish OS applications with the tests running on
20 | Linux as well. It's expected that the geocoder would run on platforms
21 | supported by libpostal without any major changes.
22 | 
23 | To compile, adjust Makefile. The library can be used by incorporating
24 | source code files in `src` subdirectory and ensuring that
25 | `thirdparty/sqlite3pp/headeronly_src` is in the compiler's include
26 | path.
27 | 
28 | In addition to libpostal, libsqlite3 is required for the geocoder to
29 | function. For importer, libosmscout is required in addition to the
30 | libraries mentioned above.
31 | 
32 | ## Databases
33 | 
34 | At present, the datasets required for the geocoder to function are distributed 
35 | as a part of OSM Scout Server datasets.
36 | 
37 | If you use the geocoder with the full libpostal installation, you don't need to 
38 | get the libpostal datasets from that location, but can use the datasets 
39 | provided by libpostal. This is a default when there is no path
40 | specified for corresponding libpostal datasets. 
41 | 
42 | To use country-specific datasets, you would have to get:
43 | * libpostal language parser: postal/global
44 | * libpostal country-specific database: postal/countries/SELECT THE NEEDED ONES
45 | 
46 | In addition, the prepared geocoder databases are available at 
47 | geocoder/SELECT THE NEEDED ONES.
48 | 
49 | Database format is described in [separate document](Database.md).
50 | 
51 | ## Acknowledgments
52 | 
53 | libpostal: Used for input parsing; https://github.com/openvenues/libpostal
54 | 
55 | Nominatim: Used for data import; https://nominatim.org/
56 | 


--------------------------------------------------------------------------------
/demo/geocoder-nlp.cpp:
--------------------------------------------------------------------------------
  1 | #include "geocoder.h"
  2 | #include "postal.h"
  3 | #include "version.h"
  4 | 
  5 | #include <boost/program_options.hpp>
  6 | #include <iomanip>
  7 | #include <iostream>
  8 | 
  9 | using namespace GeoNLP;
 10 | namespace po = boost::program_options;
 11 | 
 12 | /// Forward geocoding demo: parses the command line, normalizes the query with
 13 | /// Postal and prints the matches found by Geocoder.
 14 | /// Returns 0 on success (including --help and --version) and -1 on invalid options.
 15 | int main(int argc, char *argv[])
 16 | {
 17 |   Postal postal;
 18 | 
 19 |   std::string query;
 20 |   std::string postal_data_global;
 21 |   std::string postal_data_country;
 22 |   std::string geocoder_data;
 23 |   int         max_results    = 10;
 24 |   double      ref_latitude   = 0; // only used when both reference options are given
 25 |   double      ref_longitude  = 0;
 26 |   int         ref_zoom       = 16;
 27 |   double      ref_importance = 0.75;
 28 | 
 29 |   Geocoder::GeoReference reference;
 30 | 
 31 |   {
 32 |     po::options_description generic("Geocoder NLP demo options");
 33 |     generic.add_options()("help,h", "Help message");
 34 |     generic.add_options()("version,v", "Version");
 35 |     generic.add_options()("geocoder-data", po::value<std::string>(&geocoder_data),
 36 |                           "GeocoderNLP database directory path");
 37 | 
 38 |     generic.add_options()(
 39 |         "postal-country", po::value<std::string>(&postal_data_country),
 40 |         "libpostal country database. Keep empty to use global libpostal parser data.");
 41 |     generic.add_options()(
 42 |         "postal-global", po::value<std::string>(&postal_data_global),
 43 |         "libpostal global database. Keep empty to use global libpostal parser data.");
 44 | 
 45 |     generic.add_options()("max-results", po::value<int>(&max_results), "Maximal number of results");
 46 | 
 47 |     generic.add_options()("ref-latitude", po::value<double>(&ref_latitude),
 48 |                           "Reference for location bias; latitude");
 49 |     generic.add_options()("ref-longitude", po::value<double>(&ref_longitude),
 50 |                           "Reference for location bias; longitude");
 51 |     generic.add_options()(
 52 |         "ref-zoom", po::value<int>(&ref_zoom),
 53 |         "Reference for location bias; zoom level for calculating reference radius");
 54 |     generic.add_options()("ref-importance", po::value<double>(&ref_importance),
 55 |                           "Reference for location bias; importance from 0 to 1 of location bias");
 56 | 
 57 |     // the query is positional, so it is kept out of the printed option listing
 58 |     po::options_description hidden("Hidden options");
 59 |     hidden.add_options()("query", po::value<std::string>(&query), "Search query");
 60 | 
 61 |     po::positional_options_description p;
 62 |     p.add("query", 1);
 63 | 
 64 |     po::options_description cmdline_options;
 65 |     cmdline_options.add(generic).add(hidden);
 66 | 
 67 |     po::variables_map vm;
 68 |     try
 69 |       {
 70 |         po::store(po::command_line_parser(argc, argv).options(cmdline_options).positional(p).run(),
 71 |                   vm);
 72 |         po::notify(vm);
 73 |       }
 74 |     catch (const std::exception &e)
 75 |       {
 76 |         // the options are unusable after a parse failure: report and stop
 77 |         // instead of searching with partially filled settings
 78 |         std::cerr << "Error while parsing options: " << e.what() << "\n\n";
 79 |         std::cerr << generic << "\n";
 80 |         return -1;
 81 |       }
 82 | 
 83 |     if (vm.count("help"))
 84 |       {
 85 |         std::cout << "Geocoder NLP demo:\n\n"
 86 |                   << "Call as\n\n " << argv[0] << " <options> query\n"
 87 |                   << "\nwhere query is a string.\n\n"
 88 |                   << generic << "\n";
 89 |         return 0;
 90 |       }
 91 | 
 92 |     if (vm.count("version"))
 93 |       {
 94 |         std::cout << "Geocoder NLP version: " << GEOCODERNLP_VERSION_STRING << "\n";
 95 |         return 0;
 96 |       }
 97 | 
 98 |     if (!vm.count("geocoder-data"))
 99 |       {
100 |         std::cerr << "GeocoderNLP database directory path is missing\n";
101 |         return -1;
102 |       }
103 | 
104 |     // location bias is applied only when both coordinates were supplied
105 |     if (vm.count("ref-latitude") && vm.count("ref-longitude"))
106 |       reference.set(ref_latitude, ref_longitude, ref_zoom, ref_importance);
107 |   }
108 | 
109 |   std::vector<Postal::ParseResult> parsed_query;
110 |   Postal::ParseResult              nonorm;
111 | 
112 |   postal.set_postal_datadir(postal_data_global, postal_data_country);
113 |   // postal.add_language("da");
114 |   // postal.add_language("et");
115 |   // postal.add_language("en");
116 |   postal.set_initialize_every_call(true);
117 |   postal.set_use_primitive(false);
118 | 
119 |   postal.parse(query, parsed_query, nonorm);
120 | 
121 |   std::cout << "\nAddress parsing before full normalization:\n\n";
122 |   for (const auto &v : nonorm)
123 |     {
124 |       std::cout << v.first << " ";
125 |       for (const auto &k : v.second)
126 |         std::cout << k << " ";
127 |       std::cout << "\n";
128 |     }
129 |   std::cout << "\n";
130 | 
131 |   std::cout << "Normalization:\n\n";
132 |   for (const auto &r : parsed_query)
133 |     {
134 |       for (const auto &v : r)
135 |         {
136 |           std::cout << v.first << " ";
137 |           for (const auto &k : v.second)
138 |             std::cout << k << " ";
139 |           std::cout << "\n";
140 |         }
141 |       std::cout << "\n";
142 |     }
143 | 
144 |   std::cout << std::endl;
145 | 
146 |   Geocoder geo(geocoder_data);
147 |   geo.set_max_queries_per_hierarchy(30);
148 |   geo.set_max_results(max_results);
149 |   // geo.set_result_language("en");
150 | 
151 |   std::vector<Geocoder::GeoResult> result;
152 | 
153 |   std::cout << "Geocoder loaded" << std::endl;
154 | 
155 |   geo.search(parsed_query, result, 0, reference);
156 | 
157 |   std::cout << std::setprecision(8);
158 |   std::cout << "Search results: \n\n";
159 |   size_t counter = 0;
160 |   for (const Geocoder::GeoResult &r : result)
161 |     {
162 |       std::cout << r.title << "\n"
163 |                 << r.address << "\n"
164 |                 << "Postal code: " << r.postal_code << "\n"
165 |                 << "Phone: " << r.phone << " / URL: " << r.website << "\n"
166 |                 << r.latitude << ", " << r.longitude << " / distance=" << r.distance << "\n"
167 |                 << r.type << " / " << r.id << " / " << r.search_rank << " / " << r.levels_resolved
168 |                 << "\n\n";
169 |       counter++;
170 |     }
171 | 
172 |   std::cout << "Number of results: " << counter << std::endl;
173 | 
174 |   return 0;
175 | }
176 | 


--------------------------------------------------------------------------------
/demo/nearby-line.cpp:
--------------------------------------------------------------------------------
 1 | #include "geocoder.h"
 2 | #include "postal.h"
 3 | 
 4 | #include <fstream>
 5 | #include <iomanip>
 6 | #include <iostream>
 7 | #include <stdlib.h>
 8 | 
 9 | using namespace GeoNLP;
10 | 
11 | int main(int argc, char *argv[])
12 | {
13 |   Postal postal;
14 | 
15 |   if (argc < 8)
16 |     {
17 |       std::cout << "Use: " << argv[0] << " dbase postal_global postal_country address\n"
18 |                 << "where\n"
19 |                 << " dbase - path to Geocoder-NLP database folder\n"
20 |                 << " postal_global - path to libpostal database with language classifier\n"
21 |                 << " postal_country - path to libpostal  database covering country\n"
22 |                 << " line - text file containing lat lon coordinates, one per line\n"
23 |                 << " radius - distance in meters\n"
24 |                 << " type - type of POI\n"
25 |                 << " name - name of POI\n";
26 |       return -1;
27 |     }
28 | 
29 |   double radius = atof(argv[5]);
30 |   std::cout << "Search radius: " << radius << " meters\n";
31 | 
32 |   std::vector<double> latitude, longitude;
33 |   {
34 |     std::ifstream fin(argv[4]);
35 |     double        lat, lon;
36 |     while (fin >> lat >> lon)
37 |       {
38 |         latitude.push_back(lat);
39 |         longitude.push_back(lon);
40 |       }
41 |   }
42 |   std::cout << "Loaded " << latitude.size() << " coordinates from " << argv[4] << "\n";
43 | 
44 |   char                    *name_query = argv[7];
45 |   std::vector<std::string> parsed_name;
46 | 
47 |   if (strlen(name_query) > 0)
48 |     {
49 |       postal.set_postal_datadir(argv[2], argv[3]);
50 |       postal.set_initialize_every_call(true);
51 |       postal.expand_string(name_query, parsed_name);
52 | 
53 |       std::cout << "Normalization of name:\n\n";
54 |       for (auto r : parsed_name)
55 |         std::cout << r << "\n";
56 |     }
57 | 
58 |   std::cout << std::endl;
59 | 
60 |   std::vector<std::string> type_query;
61 |   if (strlen(argv[6]) > 0)
62 |     {
63 |       type_query.push_back(argv[6]);
64 |       std::cout << "Searching for type: " << argv[6] << "\n";
65 |     }
66 | 
67 |   Geocoder geo(argv[1]);
68 |   geo.set_max_queries_per_hierarchy(30);
69 |   geo.set_max_results(25);
70 | 
71 |   std::vector<Geocoder::GeoResult> result;
72 | 
73 |   std::cout << "Geocoder loaded" << std::endl;
74 | 
75 |   if (!geo.search_nearby(parsed_name, type_query, latitude, longitude, radius, result, postal))
76 |     std::cout << "Failed to search\n";
77 |   else
78 |     {
79 |       std::cout << std::setprecision(8);
80 |       std::cout << "Search results: \n\n";
81 |       size_t counter = 0;
82 |       for (const Geocoder::GeoResult &r : result)
83 |         {
84 |           std::cout << r.title << "\n"
85 |                     << r.address << "\n"
86 |                     << r.latitude << ", " << r.longitude << ": distance - " << r.distance << "\n"
87 |                     << r.type << " / " << r.id << " / " << r.admin_levels << " / "
88 |                     << r.levels_resolved << "\n\n";
89 |           counter++;
90 |         }
91 | 
92 |       std::cout << "Number of results: " << counter << std::endl;
93 |     }
94 | 
95 |   return 0;
96 | }
97 | 


--------------------------------------------------------------------------------
/demo/nearby-point.cpp:
--------------------------------------------------------------------------------
 1 | #include "geocoder.h"
 2 | #include "postal.h"
 3 | 
 4 | #include <fstream>
 5 | #include <iomanip>
 6 | #include <iostream>
 7 | #include <stdlib.h>
 8 | 
 9 | using namespace GeoNLP;
10 | 
11 | int main(int argc, char *argv[])
12 | {
13 |   Postal postal;
14 | 
15 |   if (argc < 8)
16 |     {
17 |       std::cout << "Use: " << argv[0] << " dbase postal_global postal_country address\n"
18 |                 << "where\n"
19 |                 << " dbase - path to Geocoder-NLP database folder\n"
20 |                 << " postal_global - path to libpostal database with language classifier\n"
21 |                 << " postal_country - path to libpostal  database covering country\n"
22 |                 << " point - text file containing lat lon coordinate, the first pair is read only\n"
23 |                 << " radius - distance in meters\n"
24 |                 << " type - type of POI\n"
25 |                 << " name - name of POI\n";
26 |       return -1;
27 |     }
28 | 
29 |   double radius = atof(argv[5]);
30 |   std::cout << "Search radius: " << radius << " meters\n";
31 | 
32 |   std::vector<double> latitude, longitude;
33 |   {
34 |     std::ifstream fin(argv[4]);
35 |     double        lat, lon;
36 |     while (fin >> lat >> lon)
37 |       {
38 |         latitude.push_back(lat);
39 |         longitude.push_back(lon);
40 |         break;
41 |       }
42 |   }
43 |   std::cout << "Loaded " << latitude.size() << " coordinates from " << argv[4] << "\n";
44 | 
45 |   char                    *name_query = argv[7];
46 |   std::vector<std::string> parsed_name;
47 | 
48 |   if (strlen(name_query) > 0)
49 |     {
50 |       postal.set_postal_datadir(argv[2], argv[3]);
51 |       postal.set_initialize_every_call(true);
52 |       postal.expand_string(name_query, parsed_name);
53 | 
54 |       std::cout << "Normalization of name:\n\n";
55 |       for (auto r : parsed_name)
56 |         std::cout << r << "\n";
57 |     }
58 | 
59 |   std::cout << std::endl;
60 | 
61 |   std::vector<std::string> type_query;
62 |   if (strlen(argv[6]) > 0)
63 |     {
64 |       type_query.push_back(argv[6]);
65 |       std::cout << "Searching for type: " << argv[6] << "\n";
66 |     }
67 | 
68 |   Geocoder geo(argv[1]);
69 |   geo.set_max_queries_per_hierarchy(30);
70 |   geo.set_max_results(25);
71 | 
72 |   std::vector<Geocoder::GeoResult> result;
73 | 
74 |   std::cout << "Geocoder loaded" << std::endl;
75 | 
76 |   if (!geo.search_nearby(parsed_name, type_query, latitude[0], longitude[0], radius, result,
77 |                          postal))
78 |     std::cout << "Failed to search\n";
79 |   else
80 |     {
81 |       std::cout << std::setprecision(8);
82 |       std::cout << "Search results: \n\n";
83 |       size_t counter = 0;
84 |       for (const Geocoder::GeoResult &r : result)
85 |         {
86 |           std::cout << r.title << "\n"
87 |                     << r.address << "\n"
88 |                     << r.latitude << ", " << r.longitude << ": distance - " << r.distance << "\n"
89 |                     << r.type << " / " << r.id << " / " << r.admin_levels << " / "
90 |                     << r.levels_resolved << "\n\n";
91 |           counter++;
92 |         }
93 | 
94 |       std::cout << "Number of results: " << counter << std::endl;
95 |     }
96 | 
97 |   return 0;
98 | }
99 | 


--------------------------------------------------------------------------------
/geocoder-nlp.pri:
--------------------------------------------------------------------------------
 1 | #DEFINES += GEONLP_PRINT_DEBUG_QUERIES
 2 | #DEFINES += GEONLP_PRINT_DEBUG
 3 | # NOTE(review): the two defines above presumably enable debug printing in the library - confirm in src
 4 | INCLUDEPATH += $$PWD/src
 5 | INCLUDEPATH += $$PWD/thirdparty/sqlite3pp/headeronly_src
 6 | # header search paths: library sources and the bundled header-only sqlite3pp
 7 | DEPENDPATH += $$PWD/src
 8 | # library sources compiled into the including project
 9 | SOURCES += \
10 |     $$PWD/src/postal.cpp \
11 |     $$PWD/src/geocoder.cpp
12 | 
13 | HEADERS += \
14 |     $$PWD/src/postal.h \
15 |     $$PWD/src/geocoder.h \
16 |     $$PWD/src/version.h
17 | # only libpostal is linked from here
18 | LIBS += -lpostal 
19 | 


--------------------------------------------------------------------------------
/importer/README.md:
--------------------------------------------------------------------------------
 1 | # Import program and scripts.
 2 | 
 3 | The import program uses a Nominatim database and filters the data using a boundary
 4 | given in a GeoJSON file.
 5 | 
 6 | The boundary file can be generated from POLY files as provided by Geofabrik or
 7 | given in hierarchy folder of OSM Scout Server (see
 8 | https://github.com/rinigus/osmscout-server/tree/master/scripts/import/hierarchy).
 9 | As a converter, [poly2geojson](https://github.com/frafra/poly2geojson/) can be
10 | used.
11 | 
12 | Data can be filtered using data/priority.list and data/skip.list files. Those
13 | list OSM tags in the form where tag type and its value are merged using `_`. Out
14 | of the lists, the priority one gives locations that would be kept in the
15 | database even without names associated with them for reverse geocoding. The
16 | "skip" list allows you to specify locations that would be dropped by location type.
17 | 
18 | To generate larger amount of tags for the priority list, the scripts
19 | under `scripts/tags` (from main directory of the project) can be used.
20 | 
21 | ## Used
22 | 
23 | - Nominatim Docker https://github.com/mediagis/nominatim-docker/
24 | - For testing: Nominatim-UI https://github.com/osm-search/nominatim-ui
25 | - poly2geojson: https://github.com/frafra/poly2geojson


--------------------------------------------------------------------------------
/importer/data/priority.list:
--------------------------------------------------------------------------------
  1 | aerialway_cable_car
  2 | amenity_atm
  3 | amenity_bank
  4 | amenity_bar
  5 | amenity_bicycle_parking
  6 | amenity_bicycle_rental
  7 | amenity_biergarten
  8 | amenity_bureau_de_change
  9 | amenity_cafe
 10 | amenity_car_rental
 11 | amenity_car_wash
 12 | amenity_charging_station
 13 | amenity_cinema
 14 | amenity_clinic
 15 | amenity_college
 16 | amenity_community_centre
 17 | amenity_courthouse
 18 | amenity_dentist
 19 | amenity_doctors
 20 | amenity_embassy
 21 | amenity_fast_food
 22 | amenity_ferry_terminal
 23 | amenity_fire_station
 24 | amenity_fuel
 25 | amenity_hospital
 26 | amenity_ice_cream
 27 | amenity_motorcycle_parking
 28 | amenity_nursing_home
 29 | amenity_parking
 30 | amenity_pharmacy
 31 | amenity_place_of_worship
 32 | amenity_police
 33 | amenity_post_office
 34 | amenity_pub
 35 | amenity_restaurant
 36 | amenity_sauna
 37 | amenity_taxi
 38 | amenity_theatre
 39 | amenity_toilets
 40 | amenity_townhall
 41 | amenity_university
 42 | emergency_defibrillator
 43 | emergency_phone
 44 | leisure_playground
 45 | shop_alcohol
 46 | shop_bakery
 47 | shop_beauty
 48 | shop_beverages
 49 | shop_bicycle
 50 | shop_books
 51 | shop_boutique
 52 | shop_butcher
 53 | shop_car
 54 | shop_car_parts
 55 | shop_car_repair
 56 | shop_chemist
 57 | shop_clothes
 58 | shop_computer
 59 | shop_confectionery
 60 | shop_convenience
 61 | shop_copyshop
 62 | shop_cosmetics
 63 | shop_deli
 64 | shop_department_store
 65 | shop_doityourself
 66 | shop_dry_cleaning
 67 | shop_electronics
 68 | shop_florist
 69 | shop_funeral_directors
 70 | shop_furniture
 71 | shop_garden_centre
 72 | shop_gift
 73 | shop_greengrocer
 74 | shop_hairdresser
 75 | shop_hardware
 76 | shop_jewelry
 77 | shop_kiosk
 78 | shop_laundry
 79 | shop_mall
 80 | shop_mobile_phone
 81 | shop_motorcycle
 82 | shop_newsagent
 83 | shop_optician
 84 | shop_pet
 85 | shop_shoes
 86 | shop_sports
 87 | shop_stationery
 88 | shop_supermarket
 89 | shop_toys
 90 | shop_travel_agency
 91 | shop_tyres
 92 | shop_vacant
 93 | shop_variety_store
 94 | shop_wine
 95 | tourism_camp_site
 96 | tourism_caravan_site
 97 | tourism_guest_house
 98 | tourism_hostel
 99 | tourism_hotel
100 | tourism_information
101 | tourism_motel
102 | 


--------------------------------------------------------------------------------
/importer/sql/duplicates.sql:
--------------------------------------------------------------------------------
1 | -- Lists the 25 most frequent (name, parent, postal_code, type) combinations in
2 | -- object_primary together with the name of the parent object.
3 | select f.*, o.name as parent_name from
4 | (select o.name, parent, postal_code, t.name as type_name, count(*) as cnt from object_primary o
5 | join "type" t on t.id = o.type_id
6 | group by o.name, parent, postal_code, t.name
7 | order by cnt desc limit 25) f
8 | join object_primary o on f.parent=o.id
9 | order by f.cnt desc;


--------------------------------------------------------------------------------
/importer/sql/nominatim_cyclic_parents.sql:
--------------------------------------------------------------------------------
1 | -- Finds placex rows whose parent row points back at them as its own parent.
2 | SELECT child.place_id, child.name, child.parent_place_id, child.linked_place_id,
3 |        child.osm_id, child.osm_type, child.indexed_status
4 | FROM placex AS child
5 | INNER JOIN placex AS parent ON parent.place_id = child.parent_place_id
6 | WHERE parent.parent_place_id = child.place_id;
7 | 


--------------------------------------------------------------------------------
/importer/src/config.h:
--------------------------------------------------------------------------------
 1 | #ifndef GEOCODER_CONFIG_H
 2 | #define GEOCODER_CONFIG_H
 3 | 
 4 | #include <cstdint>
 5 | 
 6 | #define TEMPORARY "TEMPORARY" // set to empty when the import needs to be debugged
 7 | 
 8 | /// if there are more expansions than specified, this object will be dropped from normalization
 9 | /// table
10 | #define MAX_NUMBER_OF_EXPANSIONS 85
11 | 
12 | /// starting from this length, check whether the string is suspicious
13 | #define LENGTH_STARTING_SUSP_CHECK 200
14 | 
15 | #define MAX_COMMAS 10 /// maximal number of commas allowed in a name
16 | /// NOTE(review): presumably the name of a setting holding the PostgreSQL connection - confirm
17 | #define GEOCODER_IMPORTER_POSTGRES "GEOCODER_IMPORTER_POSTGRES"
18 | 
19 | typedef uint64_t      hindex; /// type used for item and parent IDs in the import hierarchy
20 | typedef long long int sqlid; /// type used by IDs in SQLite
21 | 
22 | #endif
23 | 


--------------------------------------------------------------------------------
/importer/src/hierarchy.cpp:
--------------------------------------------------------------------------------
  1 | #include "hierarchy.h"
  2 | 
  3 | #include <iostream>
  4 | #include <stdexcept>
  5 | 
  6 | Hierarchy::Hierarchy() = default; // the containers start out empty
  7 | 
  8 | Hierarchy::~Hierarchy() = default; // items are held by shared pointers, nothing to free here
  9 | 
 10 | // Registers an item, attaches it to its parent (or parks it in the root map while the parent
 11 | // is unknown) and adopts parked items that were waiting for this item's ID.
 12 | // Throws std::runtime_error when the ID is already registered.
 13 | void Hierarchy::add_item(std::shared_ptr<HierarchyItem> &item)
 14 | {
 15 |   const hindex item_id   = item->id();
 16 |   const hindex parent_id = item->parent_id();
 17 | 
 18 |   if (m_items.find(item_id) != m_items.end())
 19 |     throw std::runtime_error("Trying to insert item that has been inserted earlier");
 20 |   m_items[item_id] = item;
 21 | 
 22 |   // attach to the parent if it is already known
 23 |   auto known_parent = m_items.find(parent_id);
 24 |   if (known_parent == m_items.end())
 25 |     m_root[parent_id].insert(item); // creates the bucket on first use
 26 |   else
 27 |     known_parent->second->add_child(item);
 28 | 
 29 |   // check if added item was a parent for someone and adjust root accordingly
 30 |   auto waiting = m_root.find(item_id);
 31 |   if (waiting == m_root.end())
 32 |     return;
 33 | 
 34 |   for (const auto &orphan : waiting->second)
 35 |     {
 36 |       if (orphan->parent_id() != item_id)
 37 |         throw std::runtime_error("Mismatch between expected parent and root location");
 38 |       item->add_child(orphan);
 39 |     }
 40 | 
 41 |   m_root.erase(waiting);
 42 | }
 43 | 
 44 | // Merges a linked item into its link target; false (after logging) if the target is unknown.
 45 | bool Hierarchy::add_linked_item(std::shared_ptr<HierarchyItem> &item)
 46 | {
 47 |   const hindex target_id = item->linked_id();
 48 |   auto         target    = m_items.find(target_id);
 49 |   if (target != m_items.end())
 50 |     {
 51 |       target->second->add_linked(item);
 52 |       return true;
 53 |     }
 54 |   std::cout << "Failed to find linked object " << target_id << " required by " << item->id()
 55 |             << ". Skipping linkage.\n";
 56 |   return false;
 57 | }
 58 | 
 59 | // Cleans up every root branch; root items that are not kept are replaced by their children.
 60 | void Hierarchy::cleanup()
 61 | {
 62 |   for (auto &bucket : m_root)
 63 |     {
 64 |       std::set<std::shared_ptr<HierarchyItem> > survivors;
 65 |       for (const auto &candidate : bucket.second)
 66 |         {
 67 |           candidate->cleanup_children();
 68 |           if (!candidate->keep())
 69 |             survivors.insert(candidate->children().begin(), candidate->children().end());
 70 |           else
 71 |             survivors.insert(candidate);
 72 |         }
 73 |       bucket.second = survivors;
 74 |       // whatever stays in this bucket must point to the bucket's parent ID
 75 |       for (const auto &kept : bucket.second)
 76 |         kept->set_parent(bucket.first, true);
 77 |     }
 78 | }
 79 | 
 80 | // Moves the root items that belong to `country` under the item with ID `id`.
 81 | // If no item with that ID is registered, the affected root branches are printed and nothing
 82 | // is moved.
 83 | void Hierarchy::set_country(const std::string &country, hindex id)
 84 | {
 85 |   auto parent = m_items.find(id);
 86 |   if (parent == m_items.end())
 87 |     {
 88 |       std::cout << "Missing country in the database: " << country << " / " << id << "\n";
 89 |       for (auto item : root_items())
 90 |         if (item->country() == country)
 91 |           item->print_branch(0);
 92 |       // m_items[id] would insert a null item here that is dereferenced below and by later passes
 93 |       return;
 94 |     }
 95 | 
 96 |   for (auto root_iter = m_root.begin(); root_iter != m_root.end(); ++root_iter)
 97 |     {
 98 |       // collect first, erase afterwards: the bucket must not change while it is iterated
 99 |       std::set<std::shared_ptr<HierarchyItem> > remove;
100 |       for (auto item : root_iter->second)
101 |         if (item->country() == country && item->id() != id)
102 |           {
103 |             parent->second->add_child(item);
104 |             remove.insert(item);
105 |           }
106 |       std::cout << "Relocated to country: " << country << " - " << remove.size() << "\n";
107 |       for (auto item : remove)
108 |         root_iter->second.erase(item);
109 |     }
110 | }
105 | 
106 | // Freezes the list of root items and indexes all items reachable from them, starting at 1.
107 | void Hierarchy::finalize()
108 | {
109 |   m_root_finalized = root_items();
110 |   sqlid next_index = 1;
111 |   for (const auto &root : m_root_finalized)
112 |     {
113 |       next_index = root->index(next_index, 0);
114 |       root->set_parent(0);
115 |     }
116 |   std::cout << "Hierarchy: active items: " << next_index - 1
117 |             << " / cleared items: " << m_items.size() - (next_index - 1) << "\n";
118 | }
119 | 
120 | // Verifies that every item that should be kept has received an index. For each offender the
121 | // chain of parents is followed; IDs seen on a cyclic chain are stored in m_index_check_failed.
122 | // Returns true when no offender was found.
123 | bool Hierarchy::check_indexing(bool verbose)
124 | {
125 |   std::cout << "Check whether all items are indexed\n";
126 | 
127 |   m_index_check_failed.clear();
128 |   bool all_indexed = true;
129 |   for (const auto &entry : m_items)
130 |     {
131 |       const std::shared_ptr<HierarchyItem> &item = entry.second;
132 |       if (!item->keep(false) || item->indexed())
133 |         continue;
134 | 
135 |       all_indexed = false;
136 |       if (verbose)
137 |         {
138 |           std::cout << "\nItem is not included into hierarchy while it should be\n";
139 |           item->print_item(0);
140 |           std::cout << "\nItem part of hierarchy (child -> parent):\n";
141 |         }
142 | 
143 |       std::set<hindex> visited;
144 |       hindex           parent = item->parent_id();
145 |       while (parent != 0)
146 |         {
147 |           auto found = m_items.find(parent);
148 |           if (found == m_items.end())
149 |             {
150 |               // those are not expected to be many, keeping message
151 |               std::cout << "\nCannot find parent with ID " << parent << "\n";
152 |               break;
153 |             }
154 | 
155 |           const std::shared_ptr<HierarchyItem> &ancestor = found->second;
156 |           if (verbose)
157 |             ancestor->print_item(0);
158 | 
159 |           if (visited.count(ancestor->id()) > 0)
160 |             {
161 |               if (verbose)
162 |                 std::cout << "\nCyclic branch detected\n";
163 |               m_index_check_failed.insert(visited.begin(), visited.end());
164 |               break;
165 |             }
166 |           visited.insert(ancestor->id());
167 |           parent = ancestor->parent_id();
168 |         }
169 | 
170 |       if (verbose)
171 |         std::cout << "\n\n";
172 |     }
173 | 
174 |   std::cout << (all_indexed ? "Items indexing check passed\n" : "Items indexing check FAILED\n");
175 |   return all_indexed;
176 | }
177 | 
178 | void Hierarchy::write(sqlite3pp::database &db) const // exports every finalized root branch
179 | {
180 |   for (const std::shared_ptr<HierarchyItem> &root : m_root_finalized)
181 |     root->write(db);
182 | }
183 | 
184 | // Flattens the root buckets into one queue, bucket after bucket in the order of m_root.
185 | std::deque<std::shared_ptr<HierarchyItem> > Hierarchy::root_items() const
186 | {
187 |   std::deque<std::shared_ptr<HierarchyItem> > flat;
188 |   for (const auto &bucket : m_root)
189 |     flat.insert(flat.end(), bucket.second.begin(), bucket.second.end());
190 |   return flat;
191 | }
192 | 
193 | size_t Hierarchy::get_root_count() const // number of items summed over all root buckets
194 | {
195 |   size_t total = 0;
196 |   for (auto bucket = m_root.begin(); bucket != m_root.end(); ++bucket)
197 |     total += bucket->second.size();
198 |   return total;
199 | }
200 | 
201 | hindex Hierarchy::get_next_nonzero_root_parent() const // 0 when there is no such bucket
202 | {
203 |   for (const auto &bucket : m_root)
204 |     if (bucket.first != 0)
205 |       return bucket.first;
206 |   return 0;
207 | }
208 | 
209 | std::set<std::string> Hierarchy::get_root_countries() const // countries of all root items
210 | {
211 |   std::set<std::string> countries;
212 |   for (const auto &root : root_items())
213 |     countries.insert(root->country());
214 |   return countries;
215 | }
216 | 
217 | bool Hierarchy::has_item(hindex id) const // true if an item with this ID has been added
218 | {
219 |   return m_items.find(id) != m_items.end();
220 | }
221 | 
222 | // Prints the root items (whole branches if `full`), their IDs, their count and countries.
223 | void Hierarchy::print(bool full) const
224 | {
225 |   std::set<hindex> sorted_ids;
226 |   for (const auto &root : root_items())
227 |     {
228 |       if (!full)
229 |         root->print_item(0);
230 |       else
231 |         root->print_branch(0);
232 |       sorted_ids.insert(root->id());
233 |     }
234 |   std::cout << (full ? "\n\n" : "") << "Root items:\n";
235 |   for (const hindex root_id : sorted_ids)
236 |     std::cout << root_id << " ";
237 |   std::cout << "\n";
238 |   std::cout << "Root items count: " << get_root_count() << "\n";
239 | 
240 |   std::cout << "Countries: ";
241 |   for (const std::string &code : get_root_countries())
242 |     std::cout << code << " ";
243 |   std::cout << "\n";
244 | }
245 | 
246 | // Prints every root item whose parent ID equals parent_id,
247 | // one line per item.
248 | void Hierarchy::print_root_with_parent_id(hindex parent_id) const
249 | {
250 |   for (const auto &root : root_items())
251 |     if (root->parent_id() == parent_id)
252 |       root->print_item(0);
253 | }
254 | 


--------------------------------------------------------------------------------
/importer/src/hierarchy.h:
--------------------------------------------------------------------------------
 1 | #ifndef HIERARCHY_H
 2 | #define HIERARCHY_H
 3 | 
 4 | #pragma once
 5 | 
 6 | #include "hierarchyitem.h"
 7 | 
 8 | #include <deque>
 9 | #include <map>
10 | #include <memory>
11 | #include <set>
12 | #include <sqlite3pp.h>
13 | #include <string>
14 | 
15 | class Hierarchy // links HierarchyItem objects into parent/child branches
16 | {
17 | public:
18 |   Hierarchy();
19 |   ~Hierarchy();
20 |   // building, post-processing and export of the hierarchy
21 |   void add_item(std::shared_ptr<HierarchyItem> &item);
22 |   bool add_linked_item(std::shared_ptr<HierarchyItem> &item);
23 |   void set_country(const std::string &country, hindex id);
24 |   void cleanup();
25 |   void finalize();
26 |   bool check_indexing(bool verbose = false);
27 |   void write(sqlite3pp::database &db) const;
28 |   // get_missing_count: number of root buckets, i.e. distinct parent IDs root items refer to
29 |   size_t get_missing_count() const { return m_root.size(); }
30 |   size_t get_root_count() const;
31 |   bool   has_item(hindex id) const;
32 | 
33 |   hindex                get_next_nonzero_root_parent() const;
34 |   std::set<std::string> get_root_countries() const;
35 |   // IDs collected on cyclic parent chains by the last check_indexing() call (returned by copy)
36 |   std::set<hindex> get_failed_indexes() const { return m_index_check_failed; }
37 | 
38 |   void print(bool full = true) const;
39 |   void print_root_with_parent_id(hindex parent_id) const;
40 | 
41 | private:
42 |   std::deque<std::shared_ptr<HierarchyItem> > root_items() const;
43 |   // m_items: all added items by ID; m_root: parent ID -> items whose parent is not in m_items
44 | private:
45 |   std::map<hindex, std::shared_ptr<HierarchyItem> >            m_items;
46 |   std::map<hindex, std::set<std::shared_ptr<HierarchyItem> > > m_root;
47 |   std::deque<std::shared_ptr<HierarchyItem> >                  m_root_finalized; // set by finalize()
48 |   std::set<hindex>                                             m_index_check_failed;
49 | };
50 | 
51 | #endif
52 | 


--------------------------------------------------------------------------------
/importer/src/hierarchyitem.cpp:
--------------------------------------------------------------------------------
  1 | #include "hierarchyitem.h"
  2 | #include "postal.h"
  3 | #include "utils.h"
  4 | 
  5 | #include <boost/algorithm/string/trim.hpp>
  6 | #include <fstream>
  7 | #include <iostream>
  8 | #include <sstream>
  9 | #include <stdexcept>
 10 | 
 11 | std::set<std::string> HierarchyItem::s_priority_types; // filled by load_priority_list()
 12 | std::set<std::string> HierarchyItem::s_skip_types; // filled by load_skip_list()
 13 | // keep() rejects every item whose type contains a character outside of this set
 14 | static std::string allowed_type_chars = "abcdefghijklmnopqrstuvwxyz_-";
 15 | 
 16 | HierarchyItem::HierarchyItem(const pqxx::row &row) // fills the item from one query result row
 17 | {
 18 |   m_id          = row["place_id"].as<hindex>(0);
 19 |   m_linked_id   = row["linked_place_id"].as<hindex>(0);
 20 |   m_parent_id   = row["parent_place_id"].as<hindex>(0);
 21 |   m_country     = row["country_code"].as<std::string>("");
 22 |   m_type        = geocoder_type(row["class"].as<std::string>(""), row["type"].as<std::string>(""));
 23 |   m_housenumber = row["housenumber"].as<std::string>("");
 24 |   m_postcode    = GeoNLP::Postal::normalize_postalcode(row["postcode"].as<std::string>(""));
 25 |   m_latitude    = row["latitude"].as<float>(0);
 26 |   m_longitude   = row["longitude"].as<float>(0);
 27 |   m_osm_id      = row["osm_id"].as<uint64_t>(0);
 28 |   m_search_rank = row["search_rank"].as<int>(0);
 29 |   // name and extra columns are turned into key/value maps (presumably JSON text - TODO confirm)
 30 |   m_data_name  = parse_to_map(row["name"].as<std::string>(""));
 31 |   m_data_extra = parse_to_map(row["extra"].as<std::string>(""));
 32 |   // the key depends on m_name / m_name_extra, so the names have to be derived first
 33 |   set_names();
 34 |   m_key = key();
 35 | }
 36 | 
 37 | // Reads a file line by line and returns the set of its non-empty, trimmed lines.
 38 | // An empty file name yields an empty set; an unreadable file throws std::runtime_error.
 39 | static std::set<std::string> load_list(const std::string &fname)
 40 | {
 41 |   std::set<std::string> entries;
 42 |   if (!fname.empty())
 43 |     {
 44 |       std::ifstream input(fname);
 45 |       if (!input)
 46 |         {
 47 |           std::cerr << "Failed to open a file: " << fname << std::endl;
 48 |           throw std::runtime_error("File cannot be opened");
 49 |         }
 50 | 
 51 |       for (std::string row; std::getline(input, row);)
 52 |         {
 53 |           boost::algorithm::trim(row);
 54 |           if (!row.empty())
 55 |             entries.insert(row);
 56 |         }
 57 |     }
 58 |   return entries;
 59 | }
 60 | 
 61 | void HierarchyItem::load_priority_list(const std::string &fname) // replaces s_priority_types
 62 | {
 63 |   s_priority_types = load_list(fname); // an empty fname results in an empty list
 64 | }
 65 | 
 66 | void HierarchyItem::load_skip_list(const std::string &fname) // replaces s_skip_types
 67 | {
 68 |   s_skip_types = load_list(fname); // an empty fname results in an empty list
 69 | }
 70 | 
 71 | void HierarchyItem::drop() // marks the item as dropped
 72 | {
 73 |   m_dropped = true; // keep() returns false for a dropped item
 74 | }
 75 | 
 76 | // An item is kept unless it was dropped, has an unexpected character in its type or a skipped
 77 | // type; beyond that it needs a name, a postal code or a priority type.
 78 | bool HierarchyItem::keep(bool verbose) const
 79 | {
 80 |   if (m_dropped)
 81 |     return false;
 82 |   if (m_type.find_first_not_of(allowed_type_chars) != std::string::npos)
 83 |     {
 84 |       if (verbose)
 85 |         std::cout << "Dropping " << m_type << "\n";
 86 |       return false;
 87 |     }
 88 |   if (s_skip_types.find(m_type) != s_skip_types.end())
 89 |     return false;
 90 |   const bool is_priority = s_priority_types.find(m_type) != s_priority_types.end();
 91 |   return is_priority || !(m_name.empty() && m_postcode.empty());
 92 | }
 93 | 
 94 | // True when both items share name and postal code and their types are equal or matched by
 95 | // same_starts_with() for "building" / "highway". Priority types are never duplicates.
 96 | bool HierarchyItem::is_duplicate(std::shared_ptr<HierarchyItem> item) const
 97 | {
 98 |   const bool is_priority = s_priority_types.count(m_type) > 0;
 99 |   const bool same_place  = m_name == item->m_name && m_postcode == item->m_postcode;
100 |   if (is_priority || !same_place)
101 |     return false;
102 | 
103 |   if (m_type == item->m_type)
104 |     return true;
105 |   return same_starts_with("building", m_type, item->m_type)
106 |          || same_starts_with("highway", m_type, item->m_type);
107 | }
108 | 
109 | // Builds the key used to detect duplicates among siblings: name, extra name, postal code and
110 | // type, where types starting with "building" or "highway" are folded into one group each.
111 | // Priority types get the item ID appended.
112 | std::string HierarchyItem::key() const
113 | {
114 |   std::string type_group = m_type;
115 |   if (m_type.compare(0, 8, "building") == 0)
116 |     type_group = "building";
117 |   else if (m_type.compare(0, 7, "highway") == 0)
118 |     type_group = "highway";
119 | 
120 |   std::stringstream out;
121 |   out << m_name << "-" << m_name_extra << "-" << m_postcode << "-" << type_group;
122 | 
123 |   if (s_priority_types.count(m_type) > 0)
124 |     out << "-" << m_id;
125 |   return out.str();
126 | }
127 | 
128 | void HierarchyItem::add_child(std::shared_ptr<HierarchyItem> child) // appends a child item
129 | {
130 |   m_children.push_back(child);
131 |   child->set_parent(m_id); // not forced: throws if the child has a different non-zero parent
132 | }
133 | 
134 | void HierarchyItem::add_linked(std::shared_ptr<HierarchyItem> linked) // merges name/extra data
135 | {
136 |   m_data_name.insert(linked->m_data_name.begin(), linked->m_data_name.end()); // own keys win
137 |   m_data_extra.insert(linked->m_data_extra.begin(), linked->m_data_extra.end()); // own keys win
138 |   set_names(); // the merged data may provide a name or brand that was missing before
139 | }
140 | 
141 | // Derives m_name and m_name_extra: a house number takes over the name slot and moves the plain
142 | // name to the extra slot; an empty extra slot falls back to the brand.
143 | void HierarchyItem::set_names()
144 | {
145 |   const std::string plain_name = get_with_def(m_data_name, "name");
146 |   const bool        is_house   = !m_housenumber.empty();
147 | 
148 |   m_name       = is_house ? m_housenumber : plain_name;
149 |   m_name_extra = is_house ? plain_name : std::string();
150 | 
151 |   if (m_name_extra.empty())
152 |     m_name_extra = get_with_def(m_data_extra, "brand");
153 | }
154 | 
155 | // Records the new parent; unless forced, a change between two non-zero parents throws.
156 | void HierarchyItem::set_parent(hindex parent, bool force)
157 | {
158 |   const bool conflicting = m_parent_id != 0 && parent != 0 && m_parent_id != parent;
159 |   if (conflicting && !force)
160 |     {
161 |       std::cout << "New parent (" << parent << ") for " << m_id << " does not match old one ("
162 |                 << m_parent_id << ")\n";
163 |       throw std::runtime_error("Mismatch between new and old parent");
164 |     }
165 |   m_parent_id = parent;
166 | }
167 | 
168 | void HierarchyItem::cleanup_children(bool duplicate_only)
169 | {
170 |   // Cleans up the direct children (and, unless duplicate_only is set, first the whole subtree):
171 |   // children that are not kept are replaced by their own children, then children with equal
172 |   // key() are merged. Disposed items stay in the Hierarchy map but must not be visited later.
173 |   if (!duplicate_only)
174 |     {
175 |       std::deque<std::shared_ptr<HierarchyItem> > children;
176 |       for (auto item : m_children)
177 |         {
178 |           item->cleanup_children();
179 |           if (item->keep())
180 |             children.push_back(item);
181 |           else
182 |             children.insert(children.end(), item->m_children.begin(), item->m_children.end());
183 |         }
184 |       m_children = children;
185 |     }
186 | 
187 |   // log items with more than 10000 children together with their first child
188 |   if (m_children.size() > 10000)
189 |     {
190 |       print_item(0);
191 |       m_children[0]->print_item(3);
192 |     }
193 | 
194 |   // merge duplicates: the first child with a given key absorbs the children of later ones
195 |   std::map<std::string, std::shared_ptr<HierarchyItem> > children;
196 |   for (std::shared_ptr<HierarchyItem> item : m_children)
197 |     {
198 |       std::string key       = item->key();
199 |       auto        main_pair = children.find(key);
200 |       if (main_pair != children.end())
201 |         {
202 |           std::shared_ptr<HierarchyItem> main = main_pair->second;
203 |           main->m_children.insert(main->m_children.end(), item->m_children.begin(),
204 |                                   item->m_children.end());
205 |           for (auto &i_children : item->m_children)
206 |             i_children->set_parent(main->m_id, true);
207 |           item->drop(); // the duplicate itself is no longer kept
208 |           main->cleanup_children(true); // adopted children may duplicate existing ones
209 |         }
210 |       else
211 |         children[key] = item;
212 |     }
213 |   // rebuild the child list from the map; this also orders the children by key
214 |   m_children.clear();
215 |   for (auto &iter : children)
216 |     m_children.push_back(iter.second);
217 | 
218 |   // set parent, forced
219 |   for (auto item : m_children)
220 |     item->set_parent(m_id, true);
221 | }
222 | 
223 | // Numbers this item with idx and its descendants depth-first with the following values;
224 | // returns the first unused index. Throws if the item is not supposed to be kept.
225 | sqlid HierarchyItem::index(sqlid idx, sqlid parent)
226 | {
227 |   if (!keep())
228 |     throw std::runtime_error("Trying to index a location that was not supposed to be kept");
229 |   m_my_index     = idx;
230 |   m_parent_index = parent;
231 |   sqlid next     = idx + 1;
232 |   for (const auto &child : m_children)
233 |     next = child->index(next, m_my_index);
234 |   m_last_child_index = next - 1;
235 |   return next;
236 | }
235 | 
236 | // Writes this item and, recursively, its children into the SQLite database. Failed inserts
237 | // are reported on stderr and do not stop the export; throws std::runtime_error if the item
238 | // is not supposed to be kept.
239 | void HierarchyItem::write(sqlite3pp::database &db) const
240 | {
241 |   if (!keep())
242 |     throw std::runtime_error("Trying to write a location that was not supposed to be kept");
243 | 
244 |   // primary data
245 |   std::string name_en = get_with_def(m_data_name, "name:en");
246 |   std::string phone   = get_with_def(m_data_extra, "phone");
247 |   std::string website = get_with_def(m_data_extra, "website");
248 | 
249 |   {
250 |     sqlite3pp::command cmd(db,
251 |                            "INSERT INTO object_primary_tmp (id, postgres_id, name, name_extra, "
252 |                            "name_en, phone, postal_code, website, parent, longitude, "
253 |                            "latitude, search_rank) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)");
254 |     // bind the 64-bit place ID as sqlid; a cast to int would truncate IDs above INT_MAX
255 |     cmd.binder() << m_my_index << static_cast<sqlid>(m_id) << m_name << m_name_extra << name_en
256 |                  << phone << m_postcode << website << m_parent_index << m_longitude << m_latitude
257 |                  << m_search_rank;
258 |     if (cmd.execute() != SQLITE_OK)
259 |       std::cerr << "WriteSQL : error inserting primary data for " << m_id << ", " << m_my_index
260 |                 << "\n";
261 |   }
262 | 
263 |   // type
264 |   {
265 |     sqlite3pp::command cmd(db, "INSERT INTO object_type_tmp (prim_id, type) VALUES (?, ?)");
266 |     cmd.binder() << m_my_index << m_type;
267 |     if (cmd.execute() != SQLITE_OK)
268 |       std::cerr << "WriteSQL: error inserting type for " << m_id << ", " << m_my_index << "\n";
269 |   }
270 | 
271 |   // hierarchy: only items that have indexed descendants get a record
272 |   if (m_last_child_index > m_my_index)
273 |     {
274 |       sqlite3pp::command cmd(db, "INSERT INTO hierarchy (prim_id, last_subobject) VALUES (?, ?)");
275 |       cmd.binder() << m_my_index << m_last_child_index;
276 |       if (cmd.execute() != SQLITE_OK)
277 |         std::cerr << "WriteSQL: error inserting hierarchy for " << m_id << ", " << m_my_index
278 |                   << " - " << m_last_child_index << "\n";
279 |     }
280 | 
281 |   // children
282 |   for (const auto &c : m_children)
283 |     c->write(db);
284 | }
284 | 
285 | // Prints a one-line summary of the item, indented by offset spaces.
286 | void HierarchyItem::print_item(unsigned int offset) const
287 | {
288 |   const std::string indent(offset, ' ');
289 |   std::cout << indent << "- " << m_id << " ";
290 |   if (!m_housenumber.empty())
291 |     std::cout << "house " << m_housenumber << " ";
292 |   std::cout << m_name << " "
293 |             << "(" << m_my_index << " " << m_last_child_index << ": " << m_children.size() << ": "
294 |             << m_parent_id << ", " << m_country << ", osmid=" << m_osm_id << ", " << m_key << ")\n";
295 | }
294 | 
295 | // Prints the item followed by all its descendants, each level indented by 3 more spaces.
296 | void HierarchyItem::print_branch(unsigned int offset) const
297 | {
298 |   print_item(offset);
299 |   if (!m_children.empty())
300 |     std::cout << std::string(offset + 2, ' ') << "|\n";
301 |   for (const auto &child : m_children)
302 |     child->print_branch(offset + 3);
303 | }
303 | 


--------------------------------------------------------------------------------
/importer/src/hierarchyitem.h:
--------------------------------------------------------------------------------
 1 | #ifndef HIERARCHYITEM_H
 2 | #define HIERARCHYITEM_H
 3 | 
 4 | #include "config.h"
 5 | 
 6 | #include <deque>
 7 | #include <memory>
 8 | #include <pqxx/pqxx>
 9 | #include <set>
10 | #include <sqlite3pp.h>
11 | #include <string>
12 | 
13 | class HierarchyItem // one place of the hierarchy together with its children
14 | {
15 | public:
16 |   HierarchyItem(const pqxx::row &row);
17 |   ~HierarchyItem(){};
18 |   // plain accessors; keep() decides whether the item takes part in the final hierarchy
19 |   hindex             id() const { return m_id; }
20 |   hindex             linked_id() const { return m_linked_id; }
21 |   hindex             parent_id() const { return m_parent_id; }
22 |   const std::string &country() const { return m_country; }
23 |   bool               keep(bool verbose = true) const;
24 |   bool               indexed() const { return m_my_index > 0; }
25 |   // a dropped item is never kept, see keep()
26 |   void drop();
27 |   bool dropped() const { return m_dropped; }
28 | 
29 |   const std::deque<std::shared_ptr<HierarchyItem> > &children() { return m_children; }
30 |   // hierarchy manipulation, indexing and export
31 |   void  add_child(std::shared_ptr<HierarchyItem> child);
32 |   void  add_linked(std::shared_ptr<HierarchyItem> linked);
33 |   void  set_parent(hindex parent, bool force = false);
34 |   void  cleanup_children(bool duplicate_only = false);
35 |   sqlid index(sqlid idx, sqlid parent);
36 |   void  write(sqlite3pp::database &db) const;
37 |   // debug printouts, indented by offset spaces
38 |   void print_item(unsigned int offset) const;
39 |   void print_branch(unsigned int offset) const;
40 | 
41 | public:
42 |   static void load_priority_list(const std::string &fname);
43 |   static void load_skip_list(const std::string &fname);
44 | 
45 |   static std::set<std::string> get_priority_list() { return s_priority_types; }
46 | 
47 | protected:
48 |   void        set_names();
49 |   bool        is_duplicate(std::shared_ptr<HierarchyItem> item) const;
50 |   std::string key() const;
51 | 
52 | private:
53 |   hindex      m_id;
54 |   hindex      m_linked_id{ 0 };
55 |   hindex      m_parent_id;
56 |   sqlid       m_my_index{ 0 }; // set by index(); 0 means "not indexed"
57 |   sqlid       m_parent_index{ 0 };
58 |   sqlid       m_last_child_index{ 0 };
59 |   bool        m_dropped{ false };
60 |   std::string m_key; // key() as computed in the constructor, used by print_item()
61 |   // values taken or derived from the database row in the constructor
62 |   std::string m_type;
63 |   float       m_latitude;
64 |   float       m_longitude;
65 |   uint64_t    m_osm_id;
66 |   std::string m_country;
67 |   std::string m_postcode;
68 |   std::string m_housenumber;
69 |   std::string m_name;
70 |   std::string m_name_extra;
71 |   int         m_search_rank;
72 |   // key/value data parsed from the "name" and "extra" columns
73 |   std::map<std::string, std::string> m_data_name;
74 |   std::map<std::string, std::string> m_data_extra;
75 | 
76 |   std::deque<std::shared_ptr<HierarchyItem> > m_children;
77 |   // type lists shared by all items, see load_priority_list() / load_skip_list()
78 |   static std::set<std::string> s_priority_types;
79 |   static std::set<std::string> s_skip_types;
80 | };
81 | 
82 | #endif
83 | 


--------------------------------------------------------------------------------
/importer/src/main.cpp:
--------------------------------------------------------------------------------
  1 | /////////////////////////////////////////////////////////////////////
  2 | /// WARNING: When adding new languages, increase the language count
  3 | /// num_languages in geocoder.c
  4 | /////////////////////////////////////////////////////////////////////
  5 | 
  6 | #include "config.h"
  7 | #include "geocoder.h"
  8 | #include "hierarchy.h"
  9 | #include "normalization.h"
 10 | #include "version.h"
 11 | 
 12 | #include <algorithm>
 13 | #include <boost/algorithm/string/join.hpp>
 14 | #include <boost/program_options.hpp>
 15 | #include <cctype>
 16 | #include <cstdlib>
 17 | #include <deque>
 18 | #include <fstream>
 19 | #include <iterator>
 20 | #include <libpostal/libpostal.h>
 21 | #include <locale>
 22 | #include <map>
 23 | #include <nlohmann/json.hpp>
 24 | #include <pqxx/pqxx>
 25 | #include <set>
 26 | #include <sqlite3pp.h>
 27 | 
 28 | using json   = nlohmann::json;
 29 | namespace po = boost::program_options;
 30 | 
 31 | ////////////////////////////////////////////////////////////////////////////
 32 | // MAIN
 33 | 
 34 | int main(int argc, char *argv[])
 35 | {
 36 |   const int printout_step = 100000;
 37 | 
 38 |   std::string polyjson;
 39 |   std::string database_path;
 40 |   std::string postal_country_parser;
 41 |   std::string postal_address_parser_dir;
 42 |   std::string type_priority_list;
 43 |   std::string type_skip_list;
 44 |   std::string log_errors_to_file;
 45 |   bool        verbose_address_expansion = false;
 46 | 
 47 |   {
 48 |     po::options_description generic("Geocoder NLP importer options");
 49 |     generic.add_options()("help,h", "Help message")("version,v", "Version");
 50 |     generic.add_options()("version-data", "Version of the data file");
 51 |     generic.add_options()("poly,p", po::value<std::string>(&polyjson),
 52 |                           "Boundary of the imported region in GeoJSON format");
 53 |     generic.add_options()("postal-country", po::value<std::string>(&postal_country_parser),
 54 |                           "libpostal country preference for this database");
 55 |     generic.add_options()(
 56 |         "postal-address", po::value<std::string>(&postal_address_parser_dir),
 57 |         "libpostal address parser directory. If not specified, global libpostal parser directory "
 58 |         "preference is used.");
 59 |     generic.add_options()(
 60 |         "priority", po::value<std::string>(&type_priority_list),
 61 |         "File with OSM tags that are kept even if there is no name associated with the location");
 62 |     generic.add_options()(
 63 |         "skip", po::value<std::string>(&type_skip_list),
 64 |         "File with OSM tags for locations that should be dropped even if there is a name "
 65 |         "associated with the location");
 66 |     generic.add_options()(
 67 |         "log-errors-to-file", po::value<std::string>(&log_errors_to_file),
 68 |         "Log errors to file and continue import. File name given as an argument of this option.");
 69 |     generic.add_options()("verbose", "Verbose address expansion");
 70 | 
 71 |     po::options_description hidden("Hidden options");
 72 |     hidden.add_options()("output-directory", po::value<std::string>(&database_path),
 73 |                          "Output directory for imported database");
 74 | 
 75 |     po::positional_options_description p;
 76 |     p.add("output-directory", 1);
 77 | 
 78 |     po::options_description cmdline_options;
 79 |     cmdline_options.add(generic).add(hidden);
 80 | 
 81 |     po::variables_map vm;
 82 |     try
 83 |       {
 84 |         po::store(po::command_line_parser(argc, argv).options(cmdline_options).positional(p).run(),
 85 |                   vm);
 86 |         po::notify(vm);
 87 |       }
 88 |     catch (std::exception &e)
 89 |       {
 90 |         std::cerr << "Error while parsing options: " << e.what() << "\n\n";
 91 |         std::cerr << generic << "\n";
 92 |       }
 93 | 
 94 |     if (vm.count("help"))
 95 |       {
 96 |         std::cout << "Geocoder NLP importer:\n\n"
 97 |                   << "Call as\n\n " << argv[0] << " <options> output-directory\n"
 98 |                   << "\nwhere output-directory is a directory for imported database.\n\n"
 99 |                   << generic << "\n";
100 |         return 0;
101 |       }
102 | 
103 |     if (vm.count(("version")))
104 |       {
105 |         std::cout << "Geocoder NLP version: " << GEOCODERNLP_VERSION_STRING << "\n";
106 |         std::cout << "Data format version: " << GeoNLP::Geocoder::version << "\n";
107 |         return 0;
108 |       }
109 | 
110 |     if (vm.count(("version-data")))
111 |       {
112 |         std::cout << GeoNLP::Geocoder::version << "\n";
113 |         return 0;
114 |       }
115 | 
116 |     if (vm.count("verbose"))
117 |       verbose_address_expansion = true;
118 | 
119 |     if (!vm.count("poly"))
120 |       {
121 |         std::cerr << "Boundary of the imported region in GeoJSON format is missing\n";
122 |         return -1;
123 |       }
124 |   }
125 | 
126 |   // load GeoJSON for surrounding (multi)polygon from poly.json
127 |   std::string border;
128 |   {
129 |     std::ifstream fin(polyjson);
130 |     if (!fin)
131 |       {
132 |         std::cerr << "Failed to open " << polyjson << "\n";
133 |         return -2;
134 |       }
135 | 
136 |     std::istreambuf_iterator<char> begin(fin), end;
137 |     std::string                    b(begin, end);
138 |     border = b;
139 |   }
140 | 
141 |   if (border.size())
142 |     {
143 |       json j = json::parse(border);
144 |       border = j["geometry"].dump();
145 |       std::cout << "Loaded border GeoJSON. Geometry string length: " << border.size() << "\n";
146 |     }
147 | 
148 |   HierarchyItem::load_priority_list(type_priority_list);
149 |   HierarchyItem::load_skip_list(type_skip_list);
150 | 
151 |   Hierarchy hierarchy;
152 | 
153 |   std::string postgres_dblink;
154 |   const char *env = std::getenv(GEOCODER_IMPORTER_POSTGRES);
155 |   if (env)
156 |     postgres_dblink = env;
157 |   else
158 |     {
159 |       std::cout << "Please specify PostgreSQL connection string using environment variable "
160 |                 << GEOCODER_IMPORTER_POSTGRES << "\n";
161 |       return -1;
162 |     }
163 | 
164 |   std::cout << "Postgres connection: " << postgres_dblink << std::endl;
165 | 
166 |   pqxx::connection pgc{ postgres_dblink };
167 |   pqxx::work       txn{ pgc };
168 | 
169 |   // Here, 1000 is used to scale search_rank. Same factor is used in Geocoder::search
170 |   const std::string base_query
171 |       = R"SQL(
172 | select pl.place_id, linked_place_id, parent_place_resolved as parent_place_id, pl.country_code, class, type,
173 | hstore_to_json(name) as name, hstore_to_json(extratags) as extra,
174 | COALESCE(address->'housenumber',housenumber) AS housenumber,
175 | postcode, ST_X(pl.centroid) as longitude, ST_Y(pl.centroid) as latitude,
176 | CASE
177 | WHEN sname.importance IS NOT NULL THEN (1000*(1-sname.importance))::int
178 | ELSE 1000 + pl.rank_search
179 | END AS search_rank,
180 | osm_type, osm_id
181 | from placex pl
182 | left join search_name as sname on sname.place_id=pl.place_id
183 | left join lateral
184 | (select address_place_id from place_addressline a where a.place_id=pl.place_id
185 | order by cached_rank_address desc, fromarea desc, distance asc limit 1) as a on true
186 | left join lateral
187 | (with recursive prec as
188 | (select place_id, linked_place_id from placex where COALESCE(a.address_place_id,pl.parent_place_id)=placex.place_id
189 | union select p.place_id, p.linked_place_id from placex p join prec on p.place_id=prec.linked_place_id)
190 | select place_id as parent_place_resolved from prec where linked_place_id is null limit 1) as pres on true
191 |       )SQL";
192 | 
193 |   // load primary hierarchy
194 |   {
195 |     std::vector<std::string> types_enclosed;
196 |     for (const auto &i : HierarchyItem::get_priority_list())
197 |       types_enclosed.push_back("'" + i + "'");
198 | 
199 |     std::string priority_types_condition
200 |         = "class || '_' || type IN (" + boost::algorithm::join(types_enclosed, ",") + ")";
201 | 
202 |     pqxx::result r = txn.exec_params(
203 |         base_query
204 |             + "where linked_place_id IS NULL and "
205 |               "ST_Intersects(ST_GeomFromGeoJSON($1), geometry) and "
206 |               "class ~ '^[a-z_-]+$' and (type~'^[a-z_-]+$' or ((type<>'') is not true)) and "
207 |               "(name->'name'<>'' or housenumber<>'' or "
208 |             + priority_types_condition + ")",
209 |         border);
210 |     size_t count = 0;
211 |     for (const pqxx::row &row : r)
212 |       {
213 |         ++count;
214 |         std::shared_ptr<HierarchyItem> item = std::make_shared<HierarchyItem>(row);
215 |         if (!item->keep())
216 |           continue;
217 |         hierarchy.add_item(item);
218 |         if (count % printout_step == 0)
219 |           std::cout << "Imported records: " << count
220 |                     << "; Root elements: " << hierarchy.get_root_count()
221 |                     << "; Missing parents: " << hierarchy.get_missing_count() << std::endl;
222 |       }
223 |   }
224 | 
225 |   // load all linked places and merge with the primary ones
226 |   {
227 |     pqxx::result r = txn.exec_params(
228 |         base_query
229 |             + "where linked_place_id IS NOT NULL and ST_Intersects(ST_GeomFromGeoJSON($1), "
230 |               "geometry)",
231 |         border);
232 |     size_t count  = 0;
233 |     size_t failed = 0;
234 |     for (const pqxx::row &row : r)
235 |       {
236 |         ++count;
237 |         std::shared_ptr<HierarchyItem> item = std::make_shared<HierarchyItem>(row);
238 |         if (!hierarchy.add_linked_item(item))
239 |           failed++;
240 |         if (count % printout_step == 0)
241 |           std::cout << "Imported linked records: " << count
242 |                     << "; Root elements: " << hierarchy.get_root_count()
243 |                     << "; Missing parents: " << hierarchy.get_missing_count() << std::endl;
244 |       }
245 |     std::cout << "Imported linked records: " << count << " / failed to import: " << failed
246 |               << "; Root elements: " << hierarchy.get_root_count()
247 |               << "; Missing parents: " << hierarchy.get_missing_count() << std::endl;
248 |   }
249 | 
250 |   // load centroids for postal codes
251 |   const std::string postal_codes_query
252 |       = R"SQL(
253 | select pc.place_id, NULL as linked_place_id, parent_place_resolved as parent_place_id, country_code, 'postal' as class, 'code' as type,
254 | NULL as name, NULL as extra, NULL as housenumber,
255 | postcode, ST_X(geometry) as longitude, ST_Y(geometry) as latitude,
256 | 100 AS search_rank,
257 | NULL as osm_type, NULL as osm_id
258 | from location_postcode pc
259 | left join lateral
260 | (with recursive prec as
261 | (select place_id, linked_place_id from placex where pc.parent_place_id=placex.place_id
262 | union select p.place_id, p.linked_place_id from placex p join prec on p.place_id=prec.linked_place_id)
263 | select place_id as parent_place_resolved from prec where linked_place_id is null limit 1) as pres on true
264 |       )SQL";
265 |   {
266 |     pqxx::result r = txn.exec_params(postal_codes_query
267 |                                          + "where "
268 |                                            "ST_Intersects(ST_GeomFromGeoJSON($1), geometry)",
269 |                                      border);
270 | 
271 |     size_t count = 0;
272 |     for (const pqxx::row &row : r)
273 |       {
274 |         ++count;
275 |         std::shared_ptr<HierarchyItem> item = std::make_shared<HierarchyItem>(row);
276 |         if (!item->keep())
277 |           {
278 |             std::cout << "Dropping postcode?"
279 |                       << "\n";
280 |             item->print_item(0);
281 |             continue;
282 |           }
283 |         hierarchy.add_item(item);
284 |         if (count % printout_step == 0)
285 |           std::cout << "Imported postal codes: " << count
286 |                     << "; Root elements: " << hierarchy.get_root_count()
287 |                     << "; Missing parents: " << hierarchy.get_missing_count() << std::endl;
288 |       }
289 |   }
290 | 
291 |   // find missing parents for root nodes
292 |   std::cout << "Fill missing hierarchies. Root size: " << hierarchy.get_root_count() << "\n";
293 |   for (hindex parent = hierarchy.get_next_nonzero_root_parent(); parent;
294 |        parent        = hierarchy.get_next_nonzero_root_parent())
295 |     {
296 |              pqxx::result r     = txn.exec_params(base_query + "where pl.place_id=$1", parent);
297 |              bool         found = false;
298 |              for (auto row : r)
299 |         {
300 |                  std::shared_ptr<HierarchyItem> item = std::make_shared<HierarchyItem>(row);
301 |                  hierarchy.add_item(item);
302 |                  found = true;
303 |         }
304 | 
305 |       if (!found)
306 |         {
307 |                  std::cerr << "Missing parent with ID " << parent << " . Stopping import\n";
308 |                  hierarchy.print_root_with_parent_id(parent);
309 |                  std::cerr << "\nSQL:\n" << base_query + "where pl.place_id=" << parent << "\n";
310 | 
311 |                  return -1;
312 |         }
313 |     }
314 | 
315 |   // remove all items from hierarchy that are not supposed to be there
316 |   std::cout << "Cleanup hierarchy\n";
317 |   hierarchy.cleanup();
318 | 
319 |   // find missing countries and move root nodes under them if possible
320 |   std::cout << "Try to fill missing parents through countries. Root size: "
321 |             << hierarchy.get_root_count() << "\n";
322 |   for (std::string country : hierarchy.get_root_countries())
323 |     {
324 |       for (auto row : txn.exec_params(
325 |                base_query + "where pl.rank_address = 4 and pl.country_code = $1 limit 1", country))
326 |         {
327 |           hindex id = row["place_id"].as<hindex>(0);
328 |           if (!hierarchy.has_item(id))
329 |             {
330 |               std::shared_ptr<HierarchyItem> item = std::make_shared<HierarchyItem>(row);
331 |               hierarchy.add_item(item);
332 |             }
333 |           hierarchy.set_country(country, id);
334 |         }
335 |     }
336 | 
337 |   hierarchy.finalize();
338 |   if (!hierarchy.check_indexing())
339 |     {
340 |       std::set<hindex> problem = hierarchy.get_failed_indexes();
341 |       for (hindex index : problem)
342 |         txn.exec_params0("update placex set indexed_status=2 where place_id=$1", index);
343 | 
344 |       txn.commit();
345 | 
346 |       std::cout << "Requested to reindex Nominatim database (run nominatim index) for "
347 |                 << problem.size() << " records\n";
348 | 
349 |       if (log_errors_to_file.empty())
350 |         return -3;
351 |       else
352 |         {
353 |           std::ofstream flog(log_errors_to_file);
354 |           flog << problem.size() << " records were not indexed while they should have been.\n";
355 |         }
356 |     }
357 | 
358 |   txn.commit(); // finalize postgres transactions
359 | 
360 |   // hierarchy.print(false);
361 | 
362 |   // Saving data into SQLite
363 |   sqlite3pp::database db(GeoNLP::Geocoder::name_primary(database_path).c_str());
364 | 
365 |   db.execute("PRAGMA journal_mode = OFF");
366 |   db.execute("PRAGMA synchronous = OFF");
367 |   db.execute("PRAGMA cache_size = 2000000");
368 |   db.execute("PRAGMA temp_store = 2");
369 |   db.execute("BEGIN TRANSACTION");
370 |   db.execute("DROP TABLE IF EXISTS type");
371 |   db.execute("DROP TABLE IF EXISTS object_primary");
372 |   db.execute("DROP TABLE IF EXISTS object_primary_tmp");
373 |   db.execute("DROP TABLE IF EXISTS object_primary_tmp2");
374 |   db.execute("DROP TABLE IF EXISTS boxids");
375 |   db.execute("DROP TABLE IF EXISTS object_type");
376 |   db.execute("DROP TABLE IF EXISTS object_type_tmp");
377 |   db.execute("DROP TABLE IF EXISTS hierarchy");
378 |   db.execute("DROP TABLE IF EXISTS object_primary_rtree");
379 | 
380 |   db.execute("CREATE " TEMPORARY " TABLE object_primary_tmp ("
381 |              "id INTEGER PRIMARY KEY AUTOINCREMENT, postgres_id INTEGER, name TEXT, name_extra "
382 |              "TEXT, name_en TEXT, phone TEXT, postal_code TEXT, website TEXT, parent INTEGER, "
383 |              "search_rank INTEGER, "
384 |              "latitude REAL, longitude REAL)");
385 |   db.execute("CREATE " TEMPORARY " TABLE object_type_tmp (prim_id INTEGER, type TEXT NOT NULL, "
386 |              "FOREIGN KEY (prim_id) REFERENCES objects_primary_tmp(id))");
387 |   db.execute("CREATE TABLE hierarchy (prim_id INTEGER PRIMARY KEY, last_subobject INTEGER, "
388 |              "FOREIGN KEY (prim_id) REFERENCES objects_primary(id), FOREIGN KEY (last_subobject) "
389 |              "REFERENCES objects_primary(id))");
390 | 
391 |   std::cout << "Preliminary filling of the database" << std::endl;
392 |   hierarchy.write(db);
393 | 
394 |   // cleanup from duplicated names
395 |   db.execute("UPDATE object_primary_tmp SET name_extra='' WHERE name=name_extra");
396 |   db.execute("UPDATE object_primary_tmp SET name_en='' WHERE name=name_en");
397 | 
398 |   std::cout << "Reorganizing database tables" << std::endl;
399 | 
400 |   db.execute("CREATE TABLE type (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT)");
401 |   db.execute("INSERT INTO type (name) SELECT DISTINCT type FROM object_type_tmp");
402 |   db.execute("CREATE " TEMPORARY
403 |              " TABLE object_primary_tmp2 (id INTEGER PRIMARY KEY AUTOINCREMENT, "
404 |              "name TEXT, name_extra TEXT, name_en TEXT, phone TEXT, postal_code TEXT, website "
405 |              "TEXT, parent INTEGER, type_id INTEGER, latitude REAL, longitude REAL, "
406 |              "search_rank INTEGER, boxstr TEXT, "
407 |              "FOREIGN KEY (type_id) REFERENCES type(id))");
408 | 
409 |   db.execute("INSERT INTO object_primary_tmp2 (id, name, name_extra, name_en, phone, postal_code, "
410 |              "website, parent, type_id, latitude, longitude, search_rank, boxstr) "
411 |              "SELECT p.id, p.name, p.name_extra, p.name_en, p.phone, p.postal_code, p.website, "
412 |              "p.parent, type.id, p.latitude, p.longitude, p.search_rank, "
413 |              // LINE BELOW DETERMINES ROUNDING USED FOR BOXES
414 |              "CAST(CAST(p.latitude*100 AS INTEGER) AS TEXT) || ',' || CAST(CAST(p.longitude*100 AS "
415 |              "INTEGER) AS TEXT) "
416 |              "FROM object_primary_tmp p JOIN object_type_tmp tt ON p.id=tt.prim_id "
417 |              "JOIN type ON tt.type=type.name");
418 | 
419 |   db.execute("CREATE " TEMPORARY " TABLE boxids (id INTEGER PRIMARY KEY AUTOINCREMENT, boxstr "
420 |              "TEXT, CONSTRAINT struni UNIQUE (boxstr))");
421 |   db.execute("INSERT INTO boxids (boxstr) SELECT DISTINCT boxstr FROM object_primary_tmp2");
422 | 
423 |   db.execute("CREATE TABLE object_primary (id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT, "
424 |              "name_extra TEXT, name_en TEXT, phone TEXT, postal_code TEXT, website TEXT, "
425 |              "parent INTEGER, type_id INTEGER, latitude REAL, longitude REAL, search_rank INTEGER, "
426 |              "box_id INTEGER, "
427 |              "FOREIGN KEY (type_id) REFERENCES type(id))");
428 |   db.execute(
429 |       "INSERT INTO object_primary (id, name, name_extra, name_en, phone, postal_code, website, "
430 |       "parent, type_id, latitude, longitude, search_rank, box_id) "
431 |       "SELECT o.id, name, name_extra, name_en, phone, postal_code, website, parent, type_id, "
432 |       "latitude, longitude, search_rank, b.id FROM object_primary_tmp2 o JOIN boxids b ON "
433 |       "o.boxstr=b.boxstr");
434 | 
435 |   db.execute("DROP INDEX IF EXISTS idx_object_primary_box");
436 |   db.execute("CREATE INDEX idx_object_primary_box ON object_primary (box_id)");
437 | 
438 |   db.execute("DROP INDEX IF EXISTS idx_object_primary_postal_code");
439 |   db.execute("CREATE INDEX idx_object_primary_postal_code ON object_primary (postal_code)");
440 | 
441 |   std::cout << "Normalize using libpostal" << std::endl;
442 | 
443 |   normalize_libpostal(db, postal_address_parser_dir, verbose_address_expansion);
444 |   normalized_to_final(db, database_path);
445 | 
446 |   // Create R*Tree for nearest neighbor search
447 |   std::cout << "Populating R*Tree" << std::endl;
448 |   db.execute(
449 |       "CREATE VIRTUAL TABLE object_primary_rtree USING rtree(id, minLat, maxLat, minLon, maxLon)");
450 |   db.execute("INSERT INTO object_primary_rtree (id, minLat, maxLat, minLon, maxLon) "
451 |              "SELECT box_id, min(latitude), max(latitude), min(longitude), max(longitude) from "
452 |              "object_primary group by box_id");
453 | 
454 |   // Stats view
455 |   db.execute("DROP VIEW IF EXISTS type_stats");
456 |   db.execute(
457 |       "CREATE VIEW type_stats AS SELECT t.name as type_name, COUNT(*) AS cnt FROM object_primary o "
458 |       "JOIN \"type\" t ON t.id = o.type_id GROUP BY t.name ORDER BY cnt desc");
459 |   {
460 |     std::cout << "List of most popular imported types\n";
461 |     sqlite3pp::query qry(db, "SELECT type_name, cnt FROM type_stats ORDER BY cnt DESC LIMIT 25");
462 |     for (auto v : qry)
463 |       {
464 |         std::string name;
465 |         int         cnt;
466 |         v.getter() >> name >> cnt;
467 |         std::cout << " " << name << "\t" << cnt << "\n";
468 |       }
469 |   }
470 |   // Recording version
471 |   db.execute("DROP TABLE IF EXISTS meta");
472 |   db.execute("CREATE TABLE meta (key TEXT, value TEXT)");
473 |   {
474 |     sqlite3pp::command cmd(db, "INSERT INTO meta (key, value) VALUES (?, ?)");
475 |     std::ostringstream ss;
476 |     ss << GeoNLP::Geocoder::version;
477 |     cmd.binder() << "version" << ss.str().c_str();
478 |     if (cmd.execute() != SQLITE_OK)
479 |       std::cerr << "WriteSQL: error inserting version information\n";
480 |   }
481 | 
482 |   if (!postal_country_parser.empty())
483 |     {
484 |       std::cout << "Recording postal parser country preference: " << postal_country_parser << "\n";
485 |       std::string cmd = "INSERT INTO meta (key, value) VALUES (\"postal:country:parser\", \""
486 |                         + postal_country_parser + "\")";
487 |       db.execute(cmd.c_str());
488 |     }
489 | 
490 |   // finalize
491 |   db.execute("END TRANSACTION");
492 |   db.execute("VACUUM");
493 |   db.execute("ANALYZE");
494 | 
495 |   std::cout << "Done\n";
496 | 
497 |   return 0;
498 | }
499 | 


--------------------------------------------------------------------------------
/importer/src/normalization.cpp:
--------------------------------------------------------------------------------
  1 | #include "normalization.h"
  2 | #include "config.h"
  3 | #include "geocoder.h"
  4 | 
  5 | #include <kchashdb.h>
  6 | #include <libpostal/libpostal.h>
  7 | #include <marisa.h>
  8 | 
  9 | ////////////////////////////////////////////////////////////////////////////
 10 | /// Libpostal normalization with search string expansion
 11 | void normalize_libpostal(sqlite3pp::database &db, std::string address_expansion_dir, bool verbose)
 12 | {
 13 |   struct tonorm
 14 |   {
 15 |     std::string name;
 16 |     sqlid       id;
 17 |   };
 18 | 
 19 |   std::deque<tonorm> data;
 20 |   sqlite3pp::query   qry(db, "SELECT id, name, name_extra, name_en FROM object_primary_tmp");
 21 |   for (auto v : qry)
 22 |     {
 23 |       tonorm      d;
 24 |       sqlid       id;
 25 |       char const *name, *name_extra, *name_en;
 26 |       v.getter() >> id >> name >> name_extra >> name_en;
 27 | 
 28 |       if (name == nullptr)
 29 |         continue; // no need to add empty name into search index
 30 | 
 31 |       d.id = id;
 32 | 
 33 |       d.name = name;
 34 |       data.push_back(d);
 35 |       if (name_extra)
 36 |         {
 37 |           d.name = name_extra;
 38 |           data.push_back(d);
 39 |         }
 40 |       if (name_en)
 41 |         {
 42 |           d.name = name_en;
 43 |           data.push_back(d);
 44 |         }
 45 |     }
 46 | 
 47 |   // make a new table for normalized names
 48 |   db.execute("DROP TABLE IF EXISTS normalized_name");
 49 |   db.execute(
 50 |       "CREATE " TEMPORARY
 51 |       " TABLE normalized_name (prim_id INTEGER, name TEXT NOT NULL, PRIMARY KEY (name, prim_id))");
 52 | 
 53 |   // load libpostal
 54 |   if (!libpostal_setup() || !libpostal_setup_language_classifier())
 55 |     {
 56 |       std::cerr << "Failure to load libpostal" << std::endl;
 57 |       return;
 58 |     }
 59 | 
 60 |   std::vector<char> aed(address_expansion_dir.begin(), address_expansion_dir.end());
 61 |   aed.push_back(0);
 62 |   if ((address_expansion_dir.empty() && !libpostal_setup_parser())
 63 |       || (!address_expansion_dir.empty() && !libpostal_setup_parser_datadir(aed.data())))
 64 |     {
 65 |       std::cerr << "Failure to load libpostal parser" << std::endl;
 66 |       return;
 67 |     }
 68 | 
 69 |   // normalize all names
 70 |   size_t                        num_expansions;
 71 |   size_t                        num_doubles_dropped = 0;
 72 |   libpostal_normalize_options_t options             = libpostal_get_default_options();
 73 |   std::vector<char>             charbuff;
 74 |   for (tonorm &d : data)
 75 |     {
 76 |       charbuff.resize(d.name.length() + 1);
 77 |       std::copy(d.name.c_str(), d.name.c_str() + d.name.length() + 1, charbuff.begin());
 78 | 
 79 |       if (verbose)
 80 |         std::cout << d.name << ": " << std::flush;
 81 | 
 82 |       // check for sanity before we proceed with expansion
 83 |       if (d.name.length() > LENGTH_STARTING_SUSP_CHECK)
 84 |         {
 85 |           size_t digits_space = 0;
 86 |           for (size_t i = 0; i < d.name.length(); ++i)
 87 |             if (std::isdigit(charbuff[i]) || std::isspace(charbuff[i]))
 88 |               digits_space++;
 89 | 
 90 |           if ((digits_space * 1.0) / d.name.length() > 0.5)
 91 |             {
 92 |               std::cout << "Warning: dropping suspicious name: " << d.name << "\n";
 93 |               continue;
 94 |             }
 95 |         }
 96 | 
 97 |       // check if there are too many commas
 98 |       if (std::count(d.name.begin(), d.name.end(), ',') > MAX_COMMAS)
 99 |         {
100 |           std::cout << "Warning: dropping suspicious name - too many commas: " << d.name << "\n";
101 |           continue;
102 |         }
103 | 
104 |       // insert normalized, but not expanded string
105 |       {
106 |         char *normalized = libpostal_normalize_string(charbuff.data(),
107 |                                                       LIBPOSTAL_NORMALIZE_DEFAULT_STRING_OPTIONS);
108 |         if (normalized != NULL)
109 |           {
110 |             sqlite3pp::command cmd(db, "INSERT INTO normalized_name (prim_id, name) VALUES (?,?)");
111 |             std::string        s = normalized;
112 |             cmd.binder() << d.id << s;
113 |             if (cmd.execute() != SQLITE_OK)
114 |               {
115 |                 // std::cerr << "Error inserting: " << d.id << " " << s << std::endl;
116 |                 num_doubles_dropped++;
117 |               }
118 | 
119 |             free(normalized);
120 |           }
121 |       }
122 | 
123 |       char **expansions = libpostal_expand_address(charbuff.data(), options, &num_expansions);
124 | 
125 |       if (num_expansions > MAX_NUMBER_OF_EXPANSIONS)
126 |         {
127 |           std::cout << "Warning: large number [" << num_expansions
128 |                     << "] of normalization expansions of " << d.name
129 |                     << " - dropping it from the table [" << d.id << "]\n";
130 |           // for (size_t i=0; i < 10 && i < num_expansions; i++)
131 |           //   std::cout << "   example expansion: " << expansions[i] << "\n";
132 |           // std::cout << "\n";
133 | 
134 |           continue; // don't insert it, its probably wrong anyway
135 |         }
136 | 
137 |       for (size_t i = 0; i < num_expansions; i++)
138 |         {
139 |           sqlite3pp::command cmd(db, "INSERT INTO normalized_name (prim_id, name) VALUES (?,?)");
140 |           std::string        s = expansions[i];
141 |           cmd.binder() << d.id << s;
142 |           if (cmd.execute() != SQLITE_OK)
143 |             {
144 |               // std::cerr << "Error inserting: " << d.id << " " << s << std::endl;
145 |               num_doubles_dropped++;
146 |             }
147 | 
148 |           // to cover the street names that have Dr. or the firstname
149 |           // in the front of the mainly used name, add substrings into
150 |           // the normalized table as well
151 |           const size_t max_substrings = 2;
152 |           size_t       pos            = 1;
153 |           for (size_t sbs = 0; sbs < max_substrings && pos < s.length(); ++sbs)
154 |             {
155 |               bool spacefound = false;
156 |               for (; pos < s.length(); ++pos)
157 |                 {
158 |                   char c = s[pos];
159 |                   if (c == ' ')
160 |                     spacefound = true;
161 |                   if (spacefound && c != ' ')
162 |                     break;
163 |                 }
164 | 
165 |               if (pos < s.length())
166 |                 {
167 |                   try
168 |                     {
169 |                       sqlite3pp::command cmd(
170 |                           db, "INSERT INTO normalized_name (prim_id, name) VALUES (?,?)");
171 |                       std::string s = expansions[i];
172 |                       cmd.binder() << d.id << s.substr(pos);
173 |                       if (cmd.execute() != SQLITE_OK)
174 |                         {
175 |                           // std::cerr << "Error inserting: " << d.id << " " << s << std::endl;
176 |                           num_doubles_dropped++;
177 |                         }
178 |                     }
179 |                   catch (sqlite3pp::database_error &e)
180 |                     {
181 |                       num_doubles_dropped++;
182 |                     }
183 |                 }
184 |             }
185 |         }
186 | 
187 |       // Free expansions
188 |       libpostal_expansion_array_destroy(expansions, num_expansions);
189 | 
190 |       if (verbose)
191 |         std::cout << "done" << std::endl;
192 |     }
193 | 
194 |   std::cout << "Redundant records skipped: " << num_doubles_dropped << "\n";
195 | 
196 |   // Teardown libpostal
197 |   libpostal_teardown_parser();
198 |   libpostal_teardown();
199 |   libpostal_teardown_language_classifier();
200 | }
201 | 
202 | ////////////////////////////////////////////////////////////////////////////
203 | /// Libpostal normalization with search string expansion
204 | void normalized_to_final(sqlite3pp::database &db, std::string path)
205 | {
206 |   std::cout << "Inserting normalized data into MARISA trie" << std::endl;
207 | 
208 |   marisa::Keyset keyset;
209 | 
210 |   {
211 |     sqlite3pp::query qry(db, "SELECT name FROM normalized_name");
212 |     for (auto v : qry)
213 |       {
214 |         std::string name;
215 |         v.getter() >> name;
216 |         keyset.push_back(name.c_str());
217 |       }
218 |   }
219 | 
220 |   marisa::Trie trie;
221 |   trie.build(keyset);
222 |   trie.save(GeoNLP::Geocoder::name_normalized_trie(path).c_str());
223 | 
224 |   struct norm
225 |   {
226 |     std::string name;
227 |     sqlid       prim_id;
228 |   };
229 | 
230 |   std::deque<norm> data;
231 |   {
232 |     sqlite3pp::query qry(db, "SELECT name, prim_id FROM normalized_name");
233 |     for (auto v : qry)
234 |       {
235 |         norm d;
236 |         v.getter() >> d.name >> d.prim_id;
237 |         data.push_back(d);
238 |       }
239 |   }
240 | 
241 |   std::map<GeoNLP::Geocoder::index_id_key, std::vector<GeoNLP::Geocoder::index_id_value> > bdata;
242 |   for (auto d : data)
243 |     {
244 |       marisa::Agent agent;
245 |       agent.set_query(d.name.c_str());
246 |       if (trie.lookup(agent))
247 |         {
248 |           GeoNLP::Geocoder::index_id_key k = agent.key().id();
249 |           if (bdata.count(k) == 0)
250 |             bdata[k] = std::vector<GeoNLP::Geocoder::index_id_value>();
251 |           bdata[k].push_back(d.prim_id);
252 |         }
253 |       else
254 |         {
255 |           std::cerr << "Error: cannot find in MARISA trie: " << d.name << std::endl;
256 |         }
257 |     }
258 | 
259 |   {
260 |     // create the database object
261 |     kyotocabinet::HashDB db;
262 | 
263 |     db.tune_options(kyotocabinet::HashDB::TSMALL | kyotocabinet::HashDB::TLINEAR);
264 |     db.tune_alignment(0);
265 |     db.tune_defrag(8);
266 | 
267 |     // open the database
268 |     if (!db.open(GeoNLP::Geocoder::name_normalized_id(path).c_str(),
269 |                  kyotocabinet::HashDB::OWRITER | kyotocabinet::HashDB::OCREATE))
270 |       {
271 |         std::cerr << "open error: " << db.error().name() << std::endl;
272 |         return;
273 |       }
274 | 
275 |     std::vector<std::string> keys;
276 |     for (auto a : bdata)
277 |       keys.push_back(GeoNLP::Geocoder::make_id_key(a.first));
278 | 
279 |     std::sort(keys.begin(), keys.end());
280 | 
281 |     for (auto key : keys)
282 |       {
283 |         std::vector<GeoNLP::Geocoder::index_id_value> &d = bdata[GeoNLP::Geocoder::get_id_key(key)];
284 |         std::sort(d.begin(), d.end());
285 |         std::string value = GeoNLP::Geocoder::make_id_value(d);
286 |         if (!db.set(key, value))
287 |           {
288 |             std::cerr << "set error: " << db.error().name() << std::endl;
289 |             return;
290 |           }
291 |       }
292 | 
293 |     std::cout << "Number of records in normalized id database: " << db.count() << "\n";
294 | 
295 |     db.close();
296 |   }
297 | 
298 |   db.execute("DROP TABLE IF EXISTS normalized_name");
299 | }
300 | 


--------------------------------------------------------------------------------
/importer/src/normalization.h:
--------------------------------------------------------------------------------
#ifndef GEOCODER_NORMALIZATION_H
#define GEOCODER_NORMALIZATION_H

#include <sqlite3pp.h>
#include <string>

/// Normalize and expand the names stored in table `object_primary_tmp` of `db` using libpostal
/// and store the results in table `normalized_name`. `address_expansion_dir` is the libpostal
/// parser data directory (the default parser setup is used when it is empty); `verbose` makes
/// the function print every processed name.
void normalize_libpostal(sqlite3pp::database &db, std::string address_expansion_dir, bool verbose);

/// Convert table `normalized_name` of `db` into the MARISA trie and the Kyoto Cabinet id
/// database whose file names are derived from `path`, then drop `normalized_name`.
void normalized_to_final(sqlite3pp::database &db, std::string path);

#endif
12 | 


--------------------------------------------------------------------------------
/importer/src/utils.cpp:
--------------------------------------------------------------------------------
 1 | #include "utils.h"
 2 | 
 3 | #include <nlohmann/json.hpp>
 4 | 
 5 | using json = nlohmann::json;
 6 | 
 7 | std::string get_with_def(const std::map<std::string, std::string> &m, const std::string &key,
 8 |                          const std::string &defval)
 9 | {
10 |   auto it = m.find(key);
11 |   if (it == m.end())
12 |     return defval;
13 |   return it->second;
14 | }
15 | 
16 | std::map<std::string, std::string> parse_to_map(const std::string &js)
17 | {
18 |   std::map<std::string, std::string> m;
19 |   if (js.size())
20 |     {
21 |       json j = json::parse(js);
22 |       for (auto v : j.items())
23 |         m[v.key()] = v.value();
24 |     }
25 |   return m;
26 | }
27 | 
/// Return true when both `s1` and `s2` begin with the prefix `start`.
bool same_starts_with(const std::string &start, const std::string &s1, const std::string &s2)
{
  const auto has_prefix
      = [&start](const std::string &text) { return text.compare(0, start.size(), start) == 0; };
  return has_prefix(s1) && has_prefix(s2);
}
32 | 
/// Build the geocoder type name from a class/value pair: the class alone when the value is
/// empty or "yes", otherwise "<class>_<value>".
std::string geocoder_type(const std::string &t_class, const std::string &t_value)
{
  const bool generic_value = t_value.empty() || t_value == "yes";
  if (generic_value)
    return t_class;

  std::string combined(t_class);
  combined += "_";
  combined += t_value;
  return combined;
}


--------------------------------------------------------------------------------
/importer/src/utils.h:
--------------------------------------------------------------------------------
#ifndef UTILS_H
#define UTILS_H

#include <map>
#include <string>

/// Return the value stored under `key` in `m`, or `defval` when `m` has no such key.
std::string get_with_def(const std::map<std::string, std::string> &m, const std::string &key,
                         const std::string &defval = std::string());

/// Compose the geocoder type name: `t_class` alone when `t_value` is empty or "yes",
/// otherwise `t_class` and `t_value` joined by an underscore.
std::string geocoder_type(const std::string &t_class, const std::string &t_value);

/// Parse the JSON text `js` into a key -> value map; an empty string gives an empty map.
std::map<std::string, std::string> parse_to_map(const std::string &js);

/// Return true when both `s1` and `s2` begin with `start`.
bool same_starts_with(const std::string &start, const std::string &s1, const std::string &s2);

#endif
17 | 


--------------------------------------------------------------------------------
/scripts/postal/README.md:
--------------------------------------------------------------------------------
 1 | # Importing and processing libpostal for country specific datasets
 2 | 
Run the commands below from the folder that contains geocoder-nlp and the training data in separate subfolders.
 4 | 
 5 | * Split addresses by
 6 | ```
 7 | geocoder-nlp/scripts/postal/split_addresses.py training
 8 | ```
 9 | 
10 | 
11 | * To test training on a single country, run
12 | ```
13 | geocoder-nlp/scripts/postal/build_country_db.sh training countries EE
14 | ```
15 | 
16 | 
17 | * Make countries and languages list using
18 | ```
19 | geocoder-nlp/scripts/postal/make_country_list.py training
20 | ```
21 | 
22 | * Generate Makefile to run address parser generators
23 | ```
24 | geocoder-nlp/scripts/postal/build_all_country_db.py training countries
25 | ```
26 | 
27 | * Run address parser generator via `make -j 16`
28 | 
Note that if the address parser generation is interrupted, the Makefile
has to be generated again so that the countries that are already done
are left out. The generated Makefile itself does not support stopping and restarting.
32 | 


--------------------------------------------------------------------------------
/scripts/postal/build_all_country_db.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import argparse, os, collections, sys
 4 | 
 5 | parser = argparse.ArgumentParser(description='''
 6 | This script generates a country specific databases for use with
 7 | libpostal routines. Its a driver that runs build_country_db.sh
 8 | for every country found in the countries_languages.txt .
 9 | 
10 | The driver will generate databases only for the countries that are
11 | missing in the output directory''')
12 | 
13 | parser.add_argument('addrdata_dir', type=str,
14 |                     help='path to the directory containing files required for address parser training (formatted_addresses_tagged.random.tsv formatted_places_tagged.random.tsv)')
15 | 
16 | parser.add_argument('output_root_dir', type=str,
17 |                     help='path to the directory where country specific subdirectory will be created')
18 | 
19 | args = parser.parse_args()
20 | 
21 | script_dir = os.path.dirname(sys.argv[0])
22 | 
23 | # load list of countries
24 | countries = []
25 | for l in open("countries_languages.txt", "r"):
26 |     countries.append( l.split(':')[0] )
27 | 
28 | # add combinations to cover special cases in PBF downloads
29 | 
30 | # add Ireland together with UK to cover Ireland + Northern Ireland case
31 | countries.append("gb-ie")
32 | 
33 | countries.append("ht-do")
34 | countries.append("sn-gm")
35 | 
36 | # gcc states
37 | countries.append("bh-kw-om-qa-sa-ae")
38 | 
39 | countries.append("il-ps")
40 | countries.append("my-sg-bn")
41 | 
42 | ### countries list: done ###
43 | 
44 | print "Loaded list of countries [%d]: " % len(countries),
45 | for c in countries:
46 |     print c,
47 | print 
48 | 
49 | countries_done = []
50 | for l in os.listdir(args.output_root_dir):
51 |     if l != "compressed" and os.path.exists(os.path.join(args.output_root_dir, l, "address_parser", "address_parser_phrases.dat")):
52 |         countries_done.append( l.lower() )
53 | countries_done.sort()
54 | 
55 | print "\nList of countries that are ready [%d]: " % len(countries_done),
56 | for c in countries_done:
57 |     print c,
58 | print
59 | 
60 | countries_todo = []
61 | for l in countries:
62 |     if l not in countries_done:
63 |         countries_todo.append(l)
64 | 
65 | print "\nCountries not covered yet [%d]: " % len(countries_todo),
66 | for c in countries_todo:
67 |     print c,
68 | print
69 | 
70 | ##########################################
71 | 
72 | script = "Makefile"
73 | f = open(script, "w")
74 | f.write("# Generated by build_all_country_db\n\nall: ")
75 | for c in countries_todo:
76 |     C = c.upper()
77 |     f.write(os.path.join(args.output_root_dir, C, "address_parser", "address_parser.dat") + " ")
78 | 
79 | f.write("\n\techo All done\n\n")
80 | for c in countries_todo:
81 |     C = c.upper()
82 |     f.write(os.path.join(args.output_root_dir, C, "address_parser", "address_parser.dat") + ":\n" +
83 |             '\t%s/build_country_db.sh "%s" "%s" %s\n\n' % (script_dir, args.addrdata_dir,
84 |                                                            args.output_root_dir,
85 |                                                            C) )
86 | 
87 | print "\nExamine and run " + script + "\n"
88 | 


--------------------------------------------------------------------------------
/scripts/postal/build_country_db.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | # Trains a libpostal address parser for one country, or for a dash-separated
  4 | # group of countries such as "gb-ie".
  5 | #
  6 | # The address lines tagged with the requested country code(s) are collected
  7 | # from the training files in addrdata_dir, shuffled and passed to
  8 | # address_parser_train; the result goes to output_root_dir/<COUNTRY>/address_parser.
  9 | 
 10 | set -e
 11 | 
 12 | if [ $# -ne 3 ]; then
 13 |     echo "
 14 | This script generates a country specific databases for use with
 15 | libpostal routines.
 16 | 
 17 | Usage:
 18 | 
 19 | $0 addrdata_dir output_root_dir country_code
 20 | 
 21 | where 
 22 | 
 23 | addrdata_dir: path to the directory containing files required for
 24 | address parser training (formatted_addresses_tagged.random.tsv
 25 | formatted_places_tagged.random.tsv)
 26 | 
 27 | output_root_dir: path to the directory where country specific
 28 | subdirectory will be created
 29 | 
 30 | country_code: ISO 3166-1 alpha-2 country code (2 letters,
 31 | https://en.wikipedia.org/wiki/ISO_3166-1)
 32 | 
 33 | The country code can be specified in any case. It will be converted to
 34 | uppercase when making subdirectory under output_root_dir. Note that a
 35 | temporary files will be created under output_root_dir while this
 36 | script is running. This temporary directory will be removed at the end
 37 | of the script.
 38 | 
 39 | The script uses build_geodb and address_parser_train from
 40 | libpostal. Either ensure that these executables are in the path or
 41 | point the variable POSTAL_SRC_DIR in the script to a directory
 42 | containing these executables.
 43 | 
 44 | "
 45 |     exit -1
 46 | fi
 47 | 
 48 | #################################################################
 49 | ### PATH TO LIBPOSTAL SRC DIRECTORY WITH COMPILED EXECUTABLES ###
 50 | 
 51 | POSTAL_SRC_DIR=libpostal/src
 52 | 
 53 | #################################################################
 54 | 
 55 | ADDRDATA=$1
 56 | OUTPUT=$2
 57 | COUNTRY=$3
 58 | 
 59 | TMPDATA="$OUTPUT/tmp-$COUNTRY-`cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w 8 | head -n 1`"
 60 | 
 61 | # Remove the temporary directory on every exit path, including the failures
 62 | # that "set -e" turns into an immediate exit.
 63 | trap 'rm -rf "$TMPDATA"' EXIT
 64 | 
 65 | COUNTRY_LOWER="${COUNTRY,,}"
 66 | COUNTRY_UPPER="${COUNTRY^^}"
 67 | 
 68 | COUNTRY_DIR_BASE=$COUNTRY_UPPER
 69 | COUNTRY_DIR="$OUTPUT/$COUNTRY_DIR_BASE"
 70 | 
 71 | PATH="$POSTAL_SRC_DIR:$PATH"
 72 | 
 73 | OUTPUT_ADDRESS="$TMPDATA/address.tsv"
 74 | 
 75 | rm -rf "$COUNTRY_DIR"
 76 | 
 77 | mkdir -p "$TMPDATA"
 78 | mkdir -p "$COUNTRY_DIR/address_parser"
 79 | 
 80 | # Addresses: gather the lines of every country code in the (possibly
 81 | # dash-separated) group from all known training files.
 82 | cp /dev/null "$OUTPUT_ADDRESS"
 83 | clow=$(echo $COUNTRY_LOWER | tr "-" "\n")
 84 | for C in $clow; do
 85 |     for file in formatted_addresses_tagged.random.tsv formatted_places_tagged.random.tsv formatted_ways_tagged.random.tsv geoplanet_formatted_addresses_tagged.random.tsv openaddresses_formatted_addresses_tagged.random.tsv uk_openaddresses_formatted_addresses_tagged.random.tsv; do
 86 |         echo "Address data preparation: $file / $C"
 87 |         DSPLIT="$ADDRDATA/$file-split"
 88 |         if [ -d "$DSPLIT" ]; then
 89 |             echo "Using pre-split" $file
 90 |             # a country may be absent from a pre-split set; that is not an error
 91 |             cat "$DSPLIT/$file-$C" >> "$OUTPUT_ADDRESS" || true
 92 |         else
 93 |             # match the country code as a whole tab-delimited column;
 94 |             # grep finding nothing (or a missing file) is not an error
 95 |             grep $'\t'"$C"$'\t' "$ADDRDATA/$file" >> "$OUTPUT_ADDRESS" || true
 96 |         fi
 97 |     done
 98 | done
 99 | 
100 | echo "Randomize addresses"
101 | shuf -o "$OUTPUT_ADDRESS.shuf" "$OUTPUT_ADDRESS"
102 | mv "$OUTPUT_ADDRESS.shuf" "$OUTPUT_ADDRESS"
103 | 
104 | ########################################################################
105 | 
106 | echo "Address training"
107 | tlines=$(wc -l < "$OUTPUT_ADDRESS")
108 | 
109 | # Without any training line the iteration formula below divides by zero.
110 | if (( tlines == 0 )); then
111 |     echo "No training addresses found for country $COUNTRY in $ADDRDATA" >&2
112 |     exit 1
113 | fi
114 | 
115 | # Small data sets get more iterations (capped at refiters) and a lower
116 | # minimal update count; large ones use 5 iterations.
117 | reflines=15000000
118 | refiters=35
119 | if (( tlines*5 > reflines )); then
120 |     iters=5
121 |     minup=5
122 | else
123 |     iters=$((reflines / tlines + 1))
124 |     minup=3
125 |     if (( iters > refiters )); then
126 |         iters=$refiters
127 |         minup=1
128 |     fi
129 | fi
130 | 
131 | echo 'Number of training lines:' $tlines
132 | echo 'Iterations:' $iters
133 | echo 'Minimal updates:' $minup
134 | 
135 | time address_parser_train --iterations $iters --min-updates $minup "$OUTPUT_ADDRESS" "$COUNTRY_DIR/address_parser"
136 | 
137 | echo "Removing temporary directory"
138 | rm -rf "$TMPDATA"
139 | 
140 | # echo "Compressing country database"
141 | # mkdir -p "$OUTPUT/compressed"
142 | # (cd "$OUTPUT" && tar jcvf "compressed/$COUNTRY_DIR_BASE.tar.bz2" "$COUNTRY_DIR_BASE")
143 | 


--------------------------------------------------------------------------------
/scripts/postal/countries_languages.txt:
--------------------------------------------------------------------------------
  1 | ad: ar ca de en es fa fr it ja ko nl pl pt ru tr zh 
  2 | ae: ar de en es fa fr it ja ko nl pl pt ru tr unk zh 
  3 | af: ar de en es fa fr it ja ko nl pl ps pt ru tr unk xxx zh 
  4 | ag: ar de en es fa fr it ja ko nl pl pt ru tr zh 
  5 | ai: ar de en es fa fr it ja ko nl pl pt ru tr zh 
  6 | al: ar de en es fa fr it ja ko nl pl pt ru sq tr zh 
  7 | am: ar de en es fa fr hy it ja ko nl pl pt ru tr zh 
  8 | ao: ar de en es fa fr it ja ko nl pl pt ru tr zh 
  9 | ar: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 10 | as: de en es fa fr it ja nl pl ru zh 
 11 | at: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 12 | au: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 13 | aw: ar de en es fr it ja ko nl pap ru unk zh 
 14 | ax: ar de en es fa fi fr it ja ko nl pl pt ru sv tr unk zh 
 15 | az: ar az de en es fa fr it ja ko nl pl pt ru tr zh 
 16 | ba: ar bs de en es fa fr it ja ko nl pl pt ru tr zh 
 17 | bb: ar de en es fa fr it ja ko nl pl pt ru zh 
 18 | bd: ar bn de en es fa fr it ja ko nl pl pt ru tr unk xxx zh 
 19 | be: ar de en es fa fr it ja ko nl pl pt ru tr unk xxx zh 
 20 | bf: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 21 | bg: ar bg de en es fa fr it ja ko nl pl pt ru tr zh 
 22 | bh: ar de en es fa fr it ja ko nl pl pt ru tr unk xxx zh 
 23 | bi: ar de en es fa fr it ja ko nl pl pt ru zh 
 24 | bj: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 25 | bl: de en es fr nl ru zh 
 26 | bm: ar de en es fa fr it ja ko pl pt ru zh 
 27 | bn: ar de en es fa fr it ja ko ms nl pl pt ru tr zh 
 28 | bo: ar ay de en es fa fr it ja ko nl pl pt qu ru tr unk zh 
 29 | bq: en nl pap ru unk 
 30 | br: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 31 | bs: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 32 | bt: ar de dz en es fa fr it ja ko nl pl pt ru tr zh 
 33 | bw: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 34 | by: ar be de en es fa fr it ja ko nl pl pt ru ru_BY ru_RU ru_by ru_ru tr unk xxx zh 
 35 | bz: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 36 | ca: ar de en es fa fr ikt it iu ja ko nl pl pt ru tr unk xxx zh 
 37 | cc: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 38 | cd: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 39 | cf: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 40 | cg: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 41 | ch: ar de en es fa fr gsw it ja ko nl pl pt ru tr unk xxx zh 
 42 | ci: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 43 | ck: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 44 | cl: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 45 | cm: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 46 | cn: ar bo de en es fa fr it ja ko nl pl pt ru tr unk xxx zh zh_arab zh_hans zh_hant zh_jyutping zh_pinyin zh_py zh_tradition 
 47 | co: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 48 | cr: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 49 | cu: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 50 | cv: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 51 | cw: ar de en es fa fr it ja ko nl pap ru unk zh 
 52 | cx: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 53 | cy: ar de el en es fa fr it ja ko nl pl pt ru tr unk zh 
 54 | cz: ar cs de en es fa fr it ja ko nl pl pt ru tr zh 
 55 | de: ar de de_CH en es fa fr it ja ko nl pl pt ru tr zh 
 56 | dj: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 57 | dk: ar da de en es fa fr it ja ko nl pl pt ru tr zh 
 58 | dm: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 59 | do: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 60 | dz: ar de en es fa fr fr_1 it ja ko nl pl pt ru tr unk zh 
 61 | ec: ar de en es fa fr it ja ko nl pl pt qu ru tr unk zh 
 62 | ee: ar de en es et fa fr it ja ko nl pl pt ru tr zh 
 63 | eg: ar de en es fa fr it ja ko nl pl pt ru tr unk xxx zh 
 64 | er: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 65 | es: ar ca ca_1 de en es eu fa fr gl it ja ko nl oc pl pt ru tr unk xxx zh 
 66 | et: am ar de en es fa fr it ja ko nl pl pt ru tr zh 
 67 | fi: ar de en es fa fi fi_1 fr it ja ko nl pl pt ru sv sv_1 tr unk zh 
 68 | fj: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 69 | fk: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 70 | fm: en fa ja ko ru zh 
 71 | fo: ar de en es fa fo fr it ja ko nl pl pt ru tr zh 
 72 | fr: ar br br_alt de en es fa fr it ja ko nl oc pl pt ru tr unk xxx zh 
 73 | ga: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 74 | gb: ar cy de en en_GB en_IE es fa fr ga gd it ja ko nl pl pt ru tr unk xxx zh 
 75 | gd: ar de en es fa fr it ja ko nl pl pt ru zh 
 76 | ge: ar de en es fa fr it ja ka ko nl pl pt ru ru_1 tr unk xxx zh 
 77 | gf: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 78 | gg: ar de en es fa fr it ja ko nl pl pt ru tr unk xxx zh 
 79 | gh: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 80 | gl: ar de en es fa fr it ja kl ko nl pl pt ru tr zh 
 81 | gm: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 82 | gn: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 83 | gp: ar de en es fa fr it ja ko pl pt ru tr zh 
 84 | gq: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 85 | gr: ar de el el_latin en es fa fr it ja ko nl pl pt ru tr unk xxx zh 
 86 | gt: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 87 | gu: de en es fr it ja ru zh 
 88 | gw: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 89 | gy: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 90 | hk: de en es fr ja nl pl ru unk xxx zh zh_pinyin 
 91 | hn: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 92 | hr: ar de en es fa fr hr it ja ko nl pl pt ru tr zh 
 93 | ht: ar de en es fa fr it ja ko nl pl pt ru tr zh 
 94 | hu: ar de en es fa fr hu it ja ko nl pl pt ru tr zh 
 95 | id: ar de en es fa fr id it ja ko nl pl pt ru tr unk zh 
 96 | ie: ar de en es fa fr ga it ja ko nl pl pt ru tr unk xxx zh 
 97 | il: ar de en es fa fr he it ja ko nl pl pt ru tr unk xxx zh 
 98 | im: ar de en es fa fr gv it ja ko nl pl pt ru tr unk zh 
 99 | in: ar de en es fa fr hi it ja ko nl pl pt ru tr unk xxx zh 
100 | io: ar de en es fa fr it ja ko nl pl pt ru tr zh 
101 | iq: ar ar_1 ar_Latn de en en_1 es fa fr it ja ko nl pl pt ru tr unk xxx zh 
102 | ir: ar de en en_1 es fa fa_1 fr it ja ko nl pl pt ru tr unk xxx zh 
103 | is: ar de en es fa fr is it ja ko nl pl pt ru tr zh 
104 | it: ar de en es fa fr it ja ko nl oc pl pt ru sl unk xxx zh 
105 | je: ar de en es fa fr it ja ko nl pl pt ru tr unk zh 
106 | jm: ar de en es fa fr it ja ko nl pl pt ru tr zh 
107 | jo: ar de en es fa fr it ja ko nl pl pt ru tr unk zh 
108 | jp: ar de en es fa fr it ja ja_furigana ja_hira ja_kana ja_kana_1 ja_rm ko nl pl pt ru tr unk xxx zh 
109 | ke: ar de en es fa fr it ja ko nl pl pt ru tr zh 
110 | kg: ar de en es fa fr it ja ko ky nl pl pt ru tr unk xxx zh 
111 | kh: ar de en es fa fr it ja km ko nl pl pt ru tr zh 
112 | ki: ar de en es fa fr it ja ko nl pt ru tr zh 
113 | km: ar de en es fa fr it ja ko nl pl pt ru tr zh 
114 | kn: ar de en es fa fr it ja ko nl pl pt ru tr zh 
115 | kp: ar de en es fa fr it ja ko ko_hanja ko_rm nl pl pt ru tr unk zh 
116 | kr: ar de en es fa fr it ja ko ko_rm ko_rm_2 nl pl pt ru tr unk zh 
117 | kw: ar de en es fa fr it ja ko nl pl pt ru tr unk zh 
118 | ky: ar de en es fa fr it ja ko nl pl pt ru tr zh 
119 | kz: ar de en es fa fr it ja kk ko nl pl pt ru tr unk xxx zh 
120 | la: ar de en es fa fr it ja ko lo lo_rm nl pl pt ru tr unk xxx zh 
121 | lb: ar de en es fa fr it ja ko nl pl pt ru tr unk xxx zh 
122 | lc: ar de en es fa fr it ja ko nl pl pt ru zh 
123 | li: ar de en fa fr ja ko nl ru zh 
124 | lk: ar de en es fa fr it ja ko nl pl pt ru si tr unk zh 
125 | lr: ar de en es fa fr it ja ko nl pl pt ru tr zh 
126 | lt: ar de en es fa fr it ja ko lt nl pl pt ru tr zh 
127 | lu: ar de en es fa fr it ja ko lb nl pl pt ru tr unk xxx zh 
128 | lv: ar de en es fa fr it ja ko lv nl pl pt ru tr zh 
129 | ly: ar de en es fa fr it ja ko nl pl pt ru tr unk xxx zh 
130 | ma: ar ar_1 de en es fa fr it ja ko nl pl pt ru tr unk xxx zh 
131 | mc: ar de en es fa fr it ja ko nl pl pt ru tr zh 
132 | md: ar de en es fa fr it ja ko nl pl pt ro ru tr zh 
133 | me: ar de en es fa fr it ja ko nl pl pt ru sr tr zh 
134 | mf: de en es fr ja ko nl zh 
135 | mg: ar de en es fa fr it ja ko mg nl pl pt ru tr unk zh 
136 | mh: ar de en es fa fr it ja ko nl pl pt ru tr zh 
137 | mk: ar de en es fa fr it ja ko mk nl pl pt ru tr zh 
138 | ml: ar de en es fa fr it ja ko nl pl pt ru tr zh 
139 | mm: ar de en es fa fr it ja ko my nl pl pt ru tr zh 
140 | mn: ar de en es fa fr it ja ko mn mn_mong nl pl pt ru tr unk zh 
141 | mo: de en es fr ja ko nl pl pt ru unk xxx zh zh_pinyin 
142 | mp: ar en es fa ja ko ru zh 
143 | mq: ar de en es fa fr it ja ko pl pt ru tr zh 
144 | mr: ar de en es fa fr it ja ko nl pl pt ru tr zh 
145 | ms: ar de en es fa fr it ja ko nl pl pt ru tr zh 
146 | mt: ar de en es fa fr it ja ko mt pl pt ru unk zh 
147 | mu: ar de en es fa fr it ja ko nl pl pt ru unk xxx zh 
148 | mv: ar de dv en es fa fr it ja ko nl pl pt ru tr zh 
149 | mw: ar de en es fa fr it ja ko nl pl pt ru tr zh 
150 | mx: ar de en es fa fr it ja ko nl pl pt ru tr zh 
151 | my: ar de en es fa fr it ja ko ms nl pl pt ru tr zh 
152 | mz: ar de en es fa fr it ja ko nl pl pt ru tr zh 
153 | na: af ar de en es fa fr it ja ko nl pl pt ru tr unk zh 
154 | nc: ar de en es fa fr it ja ko nl pl pt ru tr zh 
155 | ne: ar de en es fa fr it ja ko nl pl pt ru tr zh 
156 | nf: ar de en es fa fr it ja ko nl pl pt ru tr zh 
157 | ng: ar de en es fa fr it ja ko nl pl pt ru tr zh 
158 | ni: ar de en es fa fr it ja ko nl pl pt ru tr zh 
159 | nl: ar de en es fa fr fy it ja ko nl pl pt ru tr unk zh 
160 | no: ar de en es fa fr it ja ko nb nl pl pt ru tr zh 
161 | np: ar de en es fa fr it ja ko ne nl pl pt ru tr unk xxx zh 
162 | nr: ar de en es fa fr it ja ko pl ru zh 
163 | nu: ar de en es fa fr it ja ko pl ru zh 
164 | nz: ar de en es fa fr it ja ko mi nl pl pt ru tr unk zh 
165 | om: ar de en es fa fr it ja ko nl pl pt ru tr unk zh 
166 | pa: ar de en es fa fr it ja ko nl pl pt ru tr zh 
167 | pe: ar de en es fa fr it ja ko nl pl pt qu ru tr unk zh 
168 | pf: ar de en es fa fr it ja ko nl pl pt ru zh 
169 | pg: ar de en es fa fr it ja ko nl pl pt ru tr zh 
170 | ph: ar de en es fa fil fr it ja ko nl pl pt ru tr unk xxx zh 
171 | pk: ar de en es fa fr it ja ko nl pl pt ru tr unk ur zh 
172 | pl: ar de en es fa fr it ja ko nl pl pt ru tr zh 
173 | pm: ar de en es fa fr it ja ko nl pl pt ru tr zh 
174 | pn: ar de en es fa fr it ja ko nl pl pt ru tr zh 
175 | pr: ar en es fa ja ko ru unk xxx zh 
176 | ps: ar de en es fa fr it ja ko nl pl pt ru tr unk xxx zh 
177 | pt: ar de en es fa fr it ja ko nl pl pt ru tr zh 
178 | pw: en fa ja ko pt ru zh 
179 | py: ar de en es fa fr gn it ja ko nl pl pt ru tr unk zh 
180 | qa: ar de en es fa fr it ja ko nl pl pt ru tr unk zh 
181 | re: ar de en es fa fr it ja ko nl pl pt ru tr zh 
182 | ro: ar de en es fa fr it ja ko nl pl pt ro ro_1 ro_2 ru tr unk zh 
183 | rs: ar de en es fa fr it ja ko nl pl pt ru sr tr zh 
184 | ru: ar de en es fa fr it ja ko nl pl pt ru ru_rm tr unk zh 
185 | rw: ar de en es fa fr it ja ko nl pl pt ru tr zh 
186 | sa: ar de en es fa fr it ja ko nl pl pt ru tr unk xxx zh 
187 | sb: ar de en fa ja ko ru zh 
188 | sc: ar de en es fa fr it ja ko nl pl pt ru tr unk xxx zh 
189 | sd: ar de en es fa fr it ja ko nl pl pt ru tr unk xxx zh 
190 | se: ar de en es fa fr it ja ko nl pl pt ru sv tr zh 
191 | sg: ar de en es fa fr it ja ko ms nl pl pt ru tr unk zh zh_pinyin 
192 | sh: ar de en es fa fr it ja ko nl pl pt ru zh 
193 | si: ar de en es fa fr it ja ko nl pl pt ru sl tr zh 
194 | sk: ar de en es fa fr it ja ko nl pl pt ru sk tr zh 
195 | sl: ar de en es fa fr it ja ko nl pl pt ru tr zh 
196 | sm: ar de en es fa fr it ja ko nl pl pt ru tr zh 
197 | sn: ar de en es fa fr it ja ko nl pl pt ru tr zh 
198 | so: ar de en es fa fr it ja ko nl pl pt ru so tr unk zh 
199 | sr: ar de en es fa fr it ja ko nl pl pt ru tr zh 
200 | ss: ar de en es fa fr it ja ko nl pl pt ru tr zh 
201 | st: ar de en es fa fr it ja ko nl pl pt ru tr zh 
202 | sv: ar de en es fa fr it ja ko nl pl pt ru tr zh 
203 | sx: de en fr it ja nl ru unk xxx zh 
204 | sy: ar de en en_1 es fa fr it ja ko nl pl pt ru tr unk zh 
205 | sz: ar de en es fa fr it ja ko nl pl pt ru tr zh 
206 | tc: ar de en es fa fr it ja ko nl pl pt ru tr zh 
207 | td: ar de en es fa fr it ja ko nl pl pt ru tr unk zh 
208 | tf: ar de en es fa fr it ja ko nl pl pt ru tr zh 
209 | tg: ar de en es fa fr it ja ko nl pl pt ru tr zh 
210 | th: ar de en es fa fr it ja ko nl pl pt ru th tr unk xxx zh 
211 | tj: ar de en es fa fr it ja ko nl pl pt ru tg tr zh 
212 | tk: ar de en es fa fr ja ko pl ru unk zh 
213 | tl: ar de en es fa fr it ja ko nl pl pt ru tr zh 
214 | tm: ar de en es fa fr it ja ko nl pl pt ru tk tr zh 
215 | tn: ar de en es fa fr it ja ko nl pl pt ru tr unk xxx zh 
216 | to: ar de en es fa fr it ja ko nl pl pt ru to tr unk zh 
217 | tr: ar de en es fa fr it ja ko nl pl pt ru tr zh 
218 | tt: ar de en es fa fr it ja ko nl pl pt ru tr zh 
219 | tv: ar de en es fa fr ja ko pl pt ru tvl zh 
220 | tw: ar de en es fa fr it ja ko nl pl pt ru tr unk zh zh_pinyin zh_py 
221 | tz: ar de en es fa fr it ja ko nl pl pt ru tr zh 
222 | ua: ar de en es fa fr it ja ko nl pl pt ru ru_rm tr uk unk xxx zh 
223 | ug: ar de en es fa fr it ja ko nl pl pt ru tr zh 
224 | um: de en es fr it pt ru zh 
225 | us: ar de en es fa fr it ja ko nl pl pt ru tr unk xxx zh 
226 | uy: ar de en es fa fr it ja ko nl pl pt ru tr zh 
227 | uz: ar de en es fa fr it ja ko nl pl pt ru tr uz zh 
228 | va: ar de en es fa fr it ja ko nl pl pt ru tr zh 
229 | vc: ar de en es fa fr it ja ko nl pl pt ru tr zh 
230 | ve: ar de en es fa fr it ja ko nl pl pt ru tr zh 
231 | vg: ar de en es fa fr it ja ko nl pl pt ru tr zh 
232 | vi: ar da de en es fa fr it ja ko nl pl pt ru tr unk zh 
233 | vn: ar de en es fa fr it ja ko nl pl pt ru tr vi zh 
234 | vu: ar de en es fa fr it ja ko pl pt ru unk zh 
235 | wf: ar de en es fa fr it ja ko nl pl pt ru tr zh 
236 | ws: ar de en es fa fr it ja ko nl pl ru zh 
237 | ye: ar de en es fa fr it ja ko nl pl pt ru tr unk zh 
238 | yt: ar en fa fr ja ko pl ru zh 
239 | za: ar de en es fa fr it ja ko nl pl pt ru tr zh 
240 | zm: ar de en es fa fr it ja ko nl pl pt ru tr zh 
241 | zw: ar de en es fa fr it ja ko nl pl pt ru tr zh 
242 | 


--------------------------------------------------------------------------------
/scripts/postal/make_country_list.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import argparse, glob, collections
 4 | 
 5 | parser = argparse.ArgumentParser(description='Generate list of countries covered among the addresses')
 6 | parser.add_argument('addrdata_dir', type=str,
 7 |                     help='path to the directory containing files required for address parser training (formatted_addresses_tagged.random.tsv formatted_places_tagged.random.tsv)')
 8 | 
 9 | args = parser.parse_args()
10 | 
11 | AddrDir = args.addrdata_dir
12 | 
13 | cntlan = set()
14 | for fname in glob.glob(AddrDir + "/*.tsv"):
15 |     print "Analyzing " + fname
16 |     i = 0
17 |     for l in open(fname, "r"):
18 |         s = l.split()
19 |         k = s[0] + " " + s[1]
20 |         cntlan.add(k)
21 | 
22 | Countries = collections.defaultdict(list)
23 | while True:
24 |     if len(cntlan) < 1:
25 |         break
26 |     cl = cntlan.pop().split()
27 |     Countries[ cl[1] ].append(cl[0])
28 | 
29 | keys = Countries.keys()
30 | keys.sort()
31 | for k in keys:
32 |     print k + ": ",
33 |     Countries[k].sort()
34 |     for l in Countries[k]: print l,
35 |     print
36 |     
37 | f = open("countries_languages.txt", "w")
38 | for k in keys:
39 |     f.write(k + ": ")
40 |     for l in Countries[k]: f.write(l + " ")
41 |     f.write("\n")
42 | 


--------------------------------------------------------------------------------
/scripts/postal/split_addresses.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import argparse, glob, collections, shutil, os
 4 | 
 5 | parser = argparse.ArgumentParser(description='Split addresses databases by countries')
 6 | parser.add_argument('addrdata_dir', type=str,
 7 |                     help='path to the directory containing files required for address parser training (formatted_addresses_tagged.random.tsv formatted_places_tagged.random.tsv)')
 8 | 
 9 | args = parser.parse_args()
10 | 
11 | AddrDir = args.addrdata_dir
12 | SplitDir = os.path.join(AddrDir, "split")
13 | 
14 | def splitname(fname, c):
15 |     b = os.path.basename(fname)
16 |     d = fname + "-split"
17 |     f = os.path.join(d, b + "-" + c)
18 |     return d, f
19 | 
20 | def write(fname_base, data, c):
21 |     cnt = data[c]
22 |     d, fname = splitname(fname_base, c)
23 |     f = open(fname, "a")
24 |     while True:
25 |         if len(cnt) < 1:
26 |             break
27 |         l = cnt.pop()
28 |         f.write(l)
29 |     data[c] = collections.deque()
30 | 
31 | for fname in glob.glob(AddrDir + "/*.tsv"):
32 |     print "Splitting " + fname
33 |             
34 |     d, f = splitname(fname, "not-important")
35 |     if os.path.exists(d):
36 |         shutil.rmtree(d)
37 |     os.makedirs(d)
38 |     print "Directory containing split files:", d
39 |     
40 |     data = collections.defaultdict(collections.deque)
41 |     for l in open(fname, "r"):
42 |         c = l.split()[1]
43 | 
44 |         data[c].append(l)
45 |         if len(data[c]) > 1000:
46 |             write(fname, data, c)
47 | 
48 |     for k in data.keys():
49 |         write(fname, data, k)
50 |         
51 | 


--------------------------------------------------------------------------------
/scripts/tags/README.md:
--------------------------------------------------------------------------------
1 | The script in this directory generates the list of the most popular tag
2 | values that can be inserted into the map style sheet (map.ost) and the
3 | whitelist. Adjust the SQL query in the script as appropriate and run the
4 | script.
5 | 
6 | As input data, the taginfo database from
7 | https://taginfo.openstreetmap.org/download/taginfo-db.db.bz2 is used.
8 | Decompress it to `taginfo-db.db` in the directory the script is run from.
8 | 


--------------------------------------------------------------------------------
/scripts/tags/generate_mapstyle.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import sqlite3
 4 | 
 5 | db = sqlite3.connect('taginfo-db.db')
 6 | c = db.cursor()
 7 | 
 8 | prioritylist = ""
 9 | 
10 | keyvals = []
11 | for r in c.execute("select key,value from tags where key='shop' order by count_all desc limit 50"):
12 |     key, value = r
13 | 
14 |     # no need for these
15 |     if value in ['yes', 'no']:
16 |         continue
17 |     
18 |     keyvals.append([key, value])
19 | 
20 | keyvals.sort()
21 | for r in keyvals:
22 |     key, value = r
23 | 
24 |     prioritylist += key + '_' + value + '\n'
25 | 
26 | print(prioritylist)
27 | 
28 |     
29 | 


--------------------------------------------------------------------------------
/src/geocoder.cpp:
--------------------------------------------------------------------------------
   1 | #include "geocoder.h"
   2 | 
   3 | #include <algorithm>
   4 | #include <boost/geometry.hpp>
   5 | #include <deque>
   6 | #include <iostream>
   7 | #include <set>
   8 | #include <sstream>
   9 | 
  10 | using namespace GeoNLP;
  11 | 
  12 | const int    GeoNLP::Geocoder::version{ 6 }; // database version expected by check_version()
  13 | const size_t GeoNLP::Geocoder::num_languages{ 2 }; // 1 (default) + 1 (english)
  14 | 
  15 | typedef boost::geometry::model::point<
  16 |     double, 2, boost::geometry::cs::spherical_equatorial<boost::geometry::degree> >
  17 |              point_t; // 2D point in spherical equatorial coordinates, in degrees
  18 | const double earth_radius = 6378e3; // meters; multiplier used in GeoReference::distance()
  19 | 
  20 | ////////////////////
  21 | // helper functions
  22 | 
  23 | // Distance covered by one degree of latitude, as a fixed approximation
  24 | // (presumably meters, like earth_radius - TODO confirm).
  25 | static double distance_per_latitude()
  26 | {
  27 |   const double per_degree = 111e3;
  28 |   return per_degree;
  29 | }
  27 | // Distance covered by one degree of longitude at the given latitude
  28 | // (degrees). The value shrinks with cos(latitude) and is never reported
  29 | // below 1000.0, which keeps it strictly positive near the poles.
  30 | static double distance_per_longitude(double latitude)
  31 | {
  32 |   const double latitude_rad = latitude * M_PI / 180.0;
  33 |   const double per_degree   = M_PI / 180.0 * 6378137.0 * cos(latitude_rad);
  34 |   return std::max(1000.0, per_degree);
  35 | }
  31 | 
  32 | ////////////////////
  33 | // GeoReference class
  34 | 
  35 | // Creates a reference that is not set; only the "is set" flag is initialized.
  36 | Geocoder::GeoReference::GeoReference()
  37 |     : m_is_set(false)
  38 | {
  39 | }
  36 | 
  37 | // Creates a reference that is already set to the given position, zoom and
  38 | // importance.
  39 | Geocoder::GeoReference::GeoReference(double lat, double lon, int zoom, double importance)
  40 |     : m_latitude(lat),
  41 |       m_longitude(lon),
  42 |       m_importance(importance),
  43 |       m_zoom(zoom),
  44 |       m_is_set(true)
  45 | {
  46 | }
  41 | 
  42 | // Stores a new reference position together with its zoom and importance and
  43 | // marks the reference as set.
  44 | void Geocoder::GeoReference::set(double lat, double lon, int zoom, double importance)
  45 | {
  46 |   // position
  47 |   m_longitude = lon;
  48 |   m_latitude  = lat;
  49 | 
  50 |   // weighting parameters
  51 |   m_importance = importance;
  52 |   m_zoom       = zoom;
  53 | 
  54 |   m_is_set = true;
  55 | }
  50 | 
  51 | void Geocoder::GeoReference::reset()
  52 | {
  53 |   m_is_set = false; // only clears the flag; the stored coordinates are left as they are
  54 | }
  55 | 
  56 | // Returns the distance between this reference point and the result r: the
  57 | // boost::geometry distance of the two (longitude, latitude) points scaled by
  58 | // earth_radius (meters).
  59 | double Geocoder::GeoReference::distance(const Geocoder::GeoResult &r) const
  60 | {
  61 |   const point_t origin(m_longitude, m_latitude);
  62 |   const point_t target(r.longitude, r.latitude);
  63 |   const double  raw_distance = boost::geometry::distance(origin, target);
  64 |   return raw_distance * earth_radius;
  65 | }
  62 | 
  63 | ////////////////////
  64 | // Geocoder class
  65 | 
  66 | Geocoder::Geocoder()
  67 | {
  68 |   update_limits(); // derive m_max_inter_results; no database is loaded by this constructor
  69 | }
  70 | 
  71 | // Path of the primary SQLite database inside the database directory dname.
  72 | std::string Geocoder::name_primary(const std::string &dname)
  73 | {
  74 |   std::string path(dname);
  75 |   path.append("/geonlp-primary.sqlite");
  76 |   return path;
  77 | }
  75 | 
  76 | // Path of the normalized-names trie inside the database directory dname.
  77 | std::string Geocoder::name_normalized_trie(const std::string &dname)
  78 | {
  79 |   std::string path(dname);
  80 |   path.append("/geonlp-normalized.trie");
  81 |   return path;
  82 | }
  80 | 
  81 | // Path of the normalized-name IDs database inside the database directory dname.
  82 | std::string Geocoder::name_normalized_id(const std::string &dname)
  83 | {
  84 |   std::string path(dname);
  85 |   path.append("/geonlp-normalized-id.kch");
  86 |   return path;
  87 | }
  85 | 
  86 | // Creates a geocoder and tries to load the database found in directory
  87 | // dbname. A failed load is only reported on std::cerr.
  88 | Geocoder::Geocoder(const std::string &dbname)
  89 | {
  90 |   // Same limit initialization as in the default constructor, so that
  91 |   // m_max_inter_results is derived from the result limits here as well.
  92 |   update_limits();
  93 | 
  94 |   if (!load(dbname))
  95 |     std::cerr << "Geocoder: error loading " << dbname << std::endl;
  96 | }
  91 | 
  92 | // Opens the database stored in directory dbname: the primary SQLite file
  93 | // (read-only, with a version check), the normalized-name IDs database and
  94 | // the normalized-names trie. Returns true on success; if the same path is
  95 | // already open nothing is reloaded. On any failure everything is dropped
  96 | // again and false is returned.
  97 | bool Geocoder::load(const std::string &dbname)
  98 | {
  99 |   if (dbname == m_database_path && m_database_open)
 100 |     return true;
 101 | 
 102 |   // dbname may refer to m_database_path itself (load() passes the member),
 103 |   // and drop() below resets that member: keep a private copy of the path.
 104 |   const std::string requested_path(dbname);
 105 | 
 106 |   // clean before loading anything
 107 |   drop();
 108 | 
 109 |   bool error = false;
 110 |   try
 111 |     {
 112 |       m_database_path = requested_path;
 113 |       if (m_db.connect(name_primary(m_database_path).c_str(), SQLITE_OPEN_READONLY) != SQLITE_OK)
 114 |         {
 115 |           error = true;
 116 |           std::cerr << "Error opening SQLite database\n";
 117 |         }
 118 | 
 119 |       if (!error && !check_version())
 120 |         {
 121 |           error = true;
 122 |         }
 123 | 
 124 |       // Limit Kyoto Cabinet caches
 125 |       m_database_norm_id.tune_map(32LL * 1024LL * 1024LL); // 64MB default
 126 |       // m_database_norm_id.tune_page_cache(32LL*1024LL*1024LL); // 64MB default
 127 | 
 128 |       if (!error
 129 |           && !m_database_norm_id.open(name_normalized_id(m_database_path).c_str(),
 130 |                                       kyotocabinet::HashDB::OREADER
 131 |                                           | kyotocabinet::HashDB::ONOLOCK))
 132 |         {
 133 |           error = true;
 134 |           std::cerr << "Error opening IDs database\n";
 135 |         }
 136 | 
 137 |       if (!error)
 138 |         {
 139 |           m_trie_norm.load(
 140 |               name_normalized_trie(m_database_path).c_str()); // throws exception on error
 141 | 
 142 |           // mark as open only after every part has been loaded
 143 |           m_database_open = true;
 144 |         }
 145 |     }
 146 |   catch (sqlite3pp::database_error &e)
 147 |     {
 148 |       error = true;
 149 |       std::cerr << "Geocoder SQLite exception: " << e.what() << std::endl;
 150 |     }
 151 |   catch (marisa::Exception &e)
 152 |     {
 153 |       error = true;
 154 |       std::cerr << "Geocoder MARISA exception: " << e.what() << std::endl;
 155 |     }
 156 | 
 157 |   // leave no half-opened state behind
 158 |   if (error)
 159 |     drop();
 160 |   return !error;
 161 | }
 149 | 
 150 | bool Geocoder::load()
 151 | {
 152 |   return load(m_database_path); // (re)load using the path currently stored in the object
 153 | }
 154 | 
 155 | // Closes all database parts and forgets the database path, leaving the
 156 | // geocoder in the "no database open" state.
 157 | void Geocoder::drop()
 158 | {
 159 |   // release the three storage backends
 160 |   m_db.disconnect();
 161 |   m_database_norm_id.close();
 162 |   m_trie_norm.clear();
 163 | 
 164 |   // reset the bookkeeping
 165 |   m_database_open = false;
 166 |   m_database_path.clear();
 167 | }
 163 | 
 164 | // Checks the opened database against the version supported by this code
 165 | // (Geocoder::version, passed on in its decimal text form).
 166 | bool Geocoder::check_version()
 167 | {
 168 |   const std::string supported = std::to_string(Geocoder::version);
 169 |   return check_version(supported);
 170 | }
 170 | 
 171 | // Compares the "version" entry of the meta table with supported. Returns
 172 | // true only if the first returned row equals supported; a different value,
 173 | // a missing entry or a database error all give false (the first and the
 174 | // last are also reported on std::cerr).
 175 | bool Geocoder::check_version(const std::string &supported)
 176 | {
 177 |   // this cannot throw exceptions
 178 |   try
 179 |     {
 180 |       // 'version' is a string literal and therefore single-quoted; SQLite
 181 |       // accepts double quotes for literals only as a legacy fallback.
 182 |       sqlite3pp::query qry(m_db, "SELECT value FROM meta WHERE key='version'");
 183 | 
 184 |       // only the first row decides: both branches leave the loop
 185 |       for (auto v : qry)
 186 |         {
 187 |           std::string n;
 188 |           v.getter() >> n;
 189 |           if (n == supported)
 190 |             return true;
 191 | 
 192 |           std::cerr << "Geocoder: wrong version of the database. Supported: " << supported
 193 |                     << " / database version: " << n << std::endl;
 194 |           return false;
 195 |         }
 196 |     }
 197 |   catch (sqlite3pp::database_error &e)
 198 |     {
 199 |       std::cerr << "Geocoder exception while checking database version: " << e.what() << std::endl;
 200 |       return false;
 201 |     }
 202 | 
 203 |   // no version entry found
 204 |   return false;
 205 | }
 200 | 
 201 | void Geocoder::update_limits()
 202 | {
 203 |   m_max_inter_results = m_max_results + m_max_inter_offset; // intermediate cap = results + offset
 204 | }
 205 | 
 206 | #ifdef GEONLP_PRINT_DEBUG
 207 | // Debug helper: formats a list of strings as "{a, b, c}". The separator is
 208 | // added whenever something has already been written after the opening brace.
 209 | static std::string v2s(const std::vector<std::string> &v)
 210 | {
 211 |   std::string joined("{");
 212 |   for (std::vector<std::string>::const_iterator it = v.begin(); it != v.end(); ++it)
 213 |     {
 214 |       const bool has_content = joined.length() > 1;
 215 |       if (has_content)
 216 |         joined.append(", ");
 217 |       joined.append(*it);
 218 |     }
 219 |   joined.push_back('}');
 220 |   return joined;
 221 | }
 219 | #endif
 220 | 
 221 | bool Geocoder::search(const std::vector<Postal::ParseResult> &parsed_query,
 222 |                       std::vector<Geocoder::GeoResult> &result, size_t min_levels,
 223 |                       const GeoReference &reference)
 224 | {
 225 |   if (!m_database_open)
 226 |     return false;
 227 | 
 228 |   // parse query by libpostal
 229 |   std::vector<Postal::Hierarchy> parsed_result;
 230 |   std::string                    postal_code;
 231 | 
 232 |   Postal::result2hierarchy(parsed_query, parsed_result, postal_code);
 233 | 
 234 |   result.clear();
 235 |   m_levels_resolved = min_levels;
 236 | 
 237 | #ifdef GEONLP_PRINT_DEBUG
 238 |   std::cout << "Search hierarchies:\n";
 239 |   std::cout << "Postal code: " << postal_code << "\n";
 240 | #endif
 241 | #ifdef GEONLP_PRINT_DEBUG_QUERIES
 242 |   std::cout << "\n";
 243 | #endif
 244 | 
 245 |   try
 246 |     { // catch and process SQLite and other exceptions
 247 |       for (const auto &r : parsed_result)
 248 |         {
 249 | #ifdef GEONLP_PRINT_DEBUG
 250 |           std::cout << "Levels: " << r.size() << " -> ";
 251 |           for (auto a : r)
 252 |             std::cout << v2s(a) << " / ";
 253 |           std::cout << "\n";
 254 | #endif
 255 | 
 256 |           m_query_count = 0;
 257 |           if (r.size() >= m_levels_resolved
 258 |               || (r.size() == m_levels_resolved
 259 |                   && (m_max_results == 0 || result.size() < m_max_inter_results)))
 260 |             search(r, postal_code, result);
 261 | #ifdef GEONLP_PRINT_DEBUG_QUERIES
 262 |           else
 263 |             std::cout << "Skipping hierarchy since search result already has more levels ("
 264 |                       << m_levels_resolved << ") than provided\n";
 265 | #endif
 266 | #ifdef GEONLP_PRINT_DEBUG_QUERIES
 267 |           std::cout << "\n";
 268 | #endif
 269 |         }
 270 | 
 271 | #ifdef GEONLP_PRINT_DEBUG
 272 |       std::cout << "\n";
 273 | #endif
 274 | 
 275 |       // fill the data
 276 |       for (GeoResult &r : result)
 277 |         {
 278 |           get_name(r.id, r.title, r.address, r.admin_levels, m_levels_in_title);
 279 |           r.type = get_type(r.id);
 280 |           get_features(r);
 281 | 
 282 |           sqlite3pp::query qry(
 283 |               m_db, "SELECT latitude, longitude, search_rank FROM object_primary WHERE id=?");
 284 |           qry.bind(1, r.id);
 285 |           for (auto v : qry)
 286 |             {
 287 |               // only one entry is expected
 288 |               v.getter() >> r.latitude >> r.longitude >> r.search_rank;
 289 |               break;
 290 |             }
 291 | 
 292 |           if (reference.is_set())
 293 |             {
 294 |               r.distance = reference.distance(r);
 295 | 
 296 |               // Here, 1000 is used to scale search_rank. Same factor is used in Geocoder importer
 297 |               r.search_rank -= reference.importance() * 1000
 298 |                                * search_rank_location_bias(r.distance, reference.zoom());
 299 |             }
 300 |         }
 301 |     }
 302 |   catch (sqlite3pp::database_error &e)
 303 |     {
 304 |       std::cerr << "Geocoder exception: " << e.what() << std::endl;
 305 |       return false;
 306 |     }
 307 | 
 308 |   // sort and trim results
 309 |   std::sort(result.begin(), result.end());
 310 |   if (m_max_results > 0 && result.size() >= m_max_results)
 311 |     result.resize(m_max_results);
 312 | 
 313 |   return true;
 314 | }
 315 | 
 316 | bool Geocoder::search(const Postal::Hierarchy &parsed, const std::string &postal_code,
 317 |                       std::vector<Geocoder::GeoResult> &result, size_t level, long long int range0,
 318 |                       long long int range1)
 319 | {
 320 |   /// Special case of search made by postal code only
 321 |   if (level == 0 && parsed.size() == 0 && !postal_code.empty())
 322 |     {
 323 |       std::string      p = postal_code;
 324 |       sqlite3pp::query qry(
 325 |           m_db, "SELECT id FROM object_primary WHERE postal_code=:pcode ORDER BY id ASC");
 326 |       qry.bind(":pcode", postal_code.c_str(), sqlite3pp::nocopy);
 327 |       for (auto v : qry)
 328 |         {
 329 |           GeoResult r;
 330 |           v.getter() >> r.id;
 331 |           r.levels_resolved = 0;
 332 |           result.push_back(r);
 333 |           if (m_max_results > 0 && result.size() >= m_max_inter_results)
 334 |             break;
 335 |         }
 336 | 
 337 |       return true;
 338 |     }
 339 | 
 340 |   /// Number of allowed queries are scaled by the number of
 341 |   /// languages. Otherwise, with the languages added, one can start
 342 |   /// missing search results.
 343 |   if (level >= parsed.size()
 344 |       || (m_max_queries_per_hierarchy > 0
 345 |           && m_query_count > m_max_queries_per_hierarchy * num_languages))
 346 |     return false;
 347 | 
 348 |   m_query_count++;
 349 | 
 350 |   std::set<long long int>
 351 |       ids_explored; /// keeps all ids which have been used to search further at this level
 352 | 
 353 |   // help structure keeping marisa-found search string with found ids
 354 |   struct IntermediateResult
 355 |   {
 356 |     std::string    txt;
 357 |     index_id_value id;
 358 |     IntermediateResult(const std::string &t, index_id_value i) : txt(t), id(i) {}
 359 |     bool operator<(const IntermediateResult &A) const
 360 |     {
 361 |       return (txt.length() < A.txt.length() || (txt.length() == A.txt.length() && txt < A.txt)
 362 |               || (txt == A.txt && id < A.id));
 363 |     }
 364 |   };
 365 | 
 366 |   std::deque<IntermediateResult> search_result;
 367 |   for (const std::string &s : parsed[level])
 368 |     {
 369 |       marisa::Agent agent;
 370 |       agent.set_query(s.c_str());
 371 |       while (m_trie_norm.predictive_search(agent))
 372 |         {
 373 |           std::string val;
 374 |           if (m_database_norm_id.get(make_id_key(agent.key().id()), &val))
 375 |             {
 376 |               index_id_value *idx, *idx1;
 377 |               if (get_id_range(val, (level == 0), range0, range1, &idx, &idx1))
 378 |                 {
 379 |                   for (; idx < idx1; ++idx)
 380 |                     {
 381 |                       long long int      id = *idx;
 382 |                       IntermediateResult r(std::string(agent.key().ptr(), agent.key().length()),
 383 |                                            id);
 384 |                       search_result.push_back(r);
 385 |                     }
 386 |                 }
 387 |             }
 388 |           else
 389 |             {
 390 |               std::cerr << "Internal inconsistency of the databases: TRIE " << agent.key().id()
 391 |                         << "\n";
 392 |             }
 393 |         }
 394 |     }
 395 | 
 396 |   std::sort(search_result.begin(), search_result.end());
 397 | 
 398 |   bool last_level = (level + 1 >= parsed.size());
 399 |   for (const IntermediateResult &branch : search_result)
 400 |     {
 401 |       long long int id             = branch.id;
 402 |       long long int last_subobject = id;
 403 | 
 404 |       if (ids_explored.count(id) > 0)
 405 |         continue; // has been looked into it already
 406 | 
 407 |       if (parsed.size() < m_levels_resolved
 408 |           || (parsed.size() == m_levels_resolved && m_max_results > 0
 409 |               && result.size() >= m_max_inter_results))
 410 |         break; // this search cannot add more results
 411 | 
 412 |       ids_explored.insert(id);
 413 | 
 414 |       // if postal code is assigned to this level and is correct,
 415 |       // all subobjects will have the same postal code. check if postal
 416 |       // code is resolved
 417 |       bool postal_is_ok = (postal_code.empty() || get_postal_code(id) == postal_code);
 418 | 
 419 |       // are we interested in this result even if it doesn't have subregions?
 420 |       if (!last_level || !postal_is_ok)
 421 |         {
 422 |           sqlite3pp::query qry(m_db, "SELECT last_subobject FROM hierarchy WHERE prim_id=?");
 423 |           qry.bind(1, id);
 424 |           for (auto v : qry)
 425 |             {
 426 |               // only one entry is expected
 427 |               v.getter() >> last_subobject;
 428 |               break;
 429 |             }
 430 | 
 431 |           // check if we have results which are better than this one if it
 432 |           // does not have any subobjects
 433 |           if (m_levels_resolved > level + 1 && id >= last_subobject)
 434 |             continue; // take the next search_result
 435 |         }
 436 | 
 437 |       if (last_level || last_subobject <= id
 438 |           || !search(parsed, postal_is_ok ? "" : postal_code, result, level + 1, id + 1,
 439 |                      last_subobject))
 440 |         {
 441 |           size_t levels_resolved = level + 1;
 442 |           bool   newlevel        = false;
 443 |           if (m_levels_resolved < levels_resolved)
 444 |             {
 445 |               result.clear();
 446 |               newlevel = true;
 447 |             }
 448 | 
 449 |           if ((m_levels_resolved == levels_resolved || newlevel)
 450 |               && (m_max_results == 0 || result.size() < m_max_inter_results))
 451 |             {
 452 |               bool have_already = false;
 453 |               for (const auto &r : result)
 454 |                 if (r.id == id)
 455 |                   {
 456 |                     have_already = true;
 457 |                     break;
 458 |                   }
 459 | 
 460 |               if (!have_already)
 461 |                 {
 462 |                   if (postal_is_ok)
 463 |                     {
 464 |                       GeoResult r;
 465 |                       r.id              = id;
 466 |                       r.levels_resolved = levels_resolved;
 467 |                       result.push_back(r);
 468 |                       m_levels_resolved = levels_resolved;
 469 |                     }
 470 |                   else if (id < last_subobject)
 471 |                     {
 472 |                       // search subobjects for ones with the same postal code
 473 |                       // there is a point to start searching only if there are
 474 |                       // subobjects only
 475 |                       sqlite3pp::query qry(m_db, "SELECT id FROM object_primary WHERE "
 476 |                                                  "postal_code=:pcode AND id>:min AND id<=:max");
 477 |                       qry.bind(":pcode", postal_code.c_str(), sqlite3pp::nocopy);
 478 |                       qry.bind(":min", id);
 479 |                       qry.bind(":max", last_subobject);
 480 | #ifdef GEONLP_PRINT_SQL
 481 |                       std::cout << "SELECT id FROM object_primary WHERE postal_code='"
 482 |                                 << postal_code << "' AND id>" << id << " AND id<=" << last_subobject
 483 |                                 << "\n";
 484 | #endif
 485 |                       for (auto v : qry)
 486 |                         {
 487 |                           GeoResult r;
 488 |                           v.getter() >> r.id;
 489 |                           r.levels_resolved = levels_resolved;
 490 |                           result.push_back(r);
 491 |                           m_levels_resolved = levels_resolved;
 492 |                           if (m_max_results > 0 && result.size() >= m_max_inter_results)
 493 |                             break;
 494 |                         }
 495 |                     }
 496 |                 }
 497 |             }
 498 |         }
 499 |     }
 500 | 
 501 |   return !ids_explored.empty();
 502 | }
 503 | 
 504 | void Geocoder::get_name(long long id, std::string &title, std::string &full, size_t &admin_levels,
 505 |                         int levels_in_title)
 506 | {
 507 |   long long int parent;
 508 |   std::string   name;
 509 |   std::string   name_extra;
 510 |   std::string   name_en;
 511 |   std::string   toadd;
 512 | 
 513 |   sqlite3pp::query qry(m_db,
 514 |                        "SELECT name, name_extra, name_en, parent FROM object_primary WHERE id=?");
 515 |   qry.bind(1, id);
 516 |   for (auto v : qry)
 517 |     {
 518 |       // only one entry is expected
 519 |       {
 520 |         // to allow NULL readouts from the database
 521 |         char const *n, *ne, *neng;
 522 |         v.getter() >> n >> ne >> neng >> parent;
 523 |         if (n)
 524 |           name = n;
 525 |         if (ne)
 526 |           name_extra = ne;
 527 |         if (neng)
 528 |           name_en = neng;
 529 |       }
 530 | 
 531 |       if (name.empty() && levels_in_title > 0)
 532 |         name = get_postal_code(id);
 533 |       if (name.empty())
 534 |         name = " ";
 535 | 
 536 |       toadd = std::string();
 537 |       if (m_preferred_result_language == "en" && !name_en.empty())
 538 |         toadd = name_en;
 539 |       else if (!name_extra.empty() && name != name_extra)
 540 |         toadd = name_extra + ", " + name;
 541 | 
 542 |       if (toadd.empty())
 543 |         toadd = name;
 544 | 
 545 |       if (!full.empty())
 546 |         full += ", ";
 547 |       full += toadd;
 548 | 
 549 |       if (levels_in_title > 0)
 550 |         {
 551 |           if (!title.empty())
 552 |             title += ", ";
 553 |           title += toadd;
 554 |         }
 555 | 
 556 |       get_name(parent, title, full, admin_levels, levels_in_title - 1);
 557 |       admin_levels++;
 558 |       return;
 559 |     }
 560 | }
 561 | 
 562 | std::string Geocoder::get_postal_code(long long id)
 563 | {
 564 |   char const *postal_code = nullptr;
 565 | 
 566 |   sqlite3pp::query qry(m_db, "SELECT postal_code FROM object_primary WHERE id=?");
 567 |   qry.bind(1, id);
 568 | 
 569 |   for (auto v : qry)
 570 |     {
 571 |       v.getter() >> postal_code;
 572 |       break;
 573 |     }
 574 | 
 575 |   return postal_code ? postal_code : std::string();
 576 | }
 577 | 
 578 | std::string Geocoder::get_type(long long id)
 579 | {
 580 |   std::string name;
 581 | 
 582 |   sqlite3pp::query qry(
 583 |       m_db, "SELECT t.name FROM object_primary o JOIN type t ON t.id=o.type_id WHERE o.id=?");
 584 |   qry.bind(1, id);
 585 | 
 586 |   for (auto v : qry)
 587 |     {
 588 |       std::string n;
 589 |       v.getter() >> n;
 590 | 
 591 |       if (!name.empty())
 592 |         name += ", ";
 593 |       name += n;
 594 |     }
 595 | 
 596 |   return name;
 597 | }
 598 | 
 599 | void Geocoder::get_features(GeoResult &r)
 600 | {
 601 |   sqlite3pp::query qry(m_db, "SELECT phone, postal_code, website FROM object_primary WHERE id=?");
 602 |   qry.bind(1, r.id);
 603 |   for (auto v : qry)
 604 |     {
 605 |       // only one entry is expected
 606 |       char const *phone, *postal, *web;
 607 |       v.getter() >> phone >> postal >> web;
 608 |       r.phone       = (phone ? phone : "");
 609 |       r.postal_code = (postal ? postal : "");
 610 |       r.website     = (web ? web : "");
 611 |       return;
 612 |     }
 613 | }
 614 | 
 615 | bool Geocoder::get_id_range(std::string &v, bool full_range, index_id_value range0,
 616 |                             index_id_value range1, index_id_value **idx0, index_id_value **idx1)
 617 | {
 618 |   int             sz = get_id_number_of_values(v);
 619 |   index_id_value *v0 = (index_id_value *)v.data();
 620 |   if (sz == 0)
 621 |     return false;
 622 | 
 623 |   if (full_range)
 624 |     {
 625 |       *idx0 = v0;
 626 |       *idx1 = v0 + sz;
 627 |       return true;
 628 |     }
 629 | 
 630 |   *idx0 = std::lower_bound(v0, v0 + sz, range0);
 631 |   if (*idx0 - v0 >= sz)
 632 |     return false;
 633 | 
 634 |   *idx1 = std::upper_bound(v0, v0 + sz, range1);
 635 |   if (*idx1 - v0 >= sz && *(v0) > range1)
 636 |     return false;
 637 | 
 638 |   return true;
 639 | }
 640 | 
 641 | // search next to the reference point
 642 | bool Geocoder::search_nearby(const std::vector<std::string> &name_query,
 643 |                              const std::vector<std::string> &type_query, double latitude,
 644 |                              double longitude, double radius, std::vector<GeoResult> &result,
 645 |                              Postal &postal)
 646 | {
 647 |   if (radius < 0)
 648 |     return false;
 649 | 
 650 |   // rough estimates of distance (meters) per degree
 651 |   //
 652 |   const double dist_per_degree_lat = distance_per_latitude();
 653 |   const double dist_per_degree_lon = distance_per_longitude(latitude);
 654 | 
 655 |   try
 656 |     {
 657 |       std::ostringstream qtxt;
 658 |       qtxt << "SELECT o.id, o.name, o.name_extra, o.name_en, t.name, o.latitude, o.longitude, "
 659 |               "o.search_rank "
 660 |            << "FROM object_primary o "
 661 |            << "JOIN type t ON o.type_id=t.id "
 662 |            << "JOIN object_primary_rtree ON (o.box_id = object_primary_rtree.id) "
 663 |            << "WHERE ";
 664 | 
 665 |       if (!type_query.empty())
 666 |         {
 667 |           std::string tqfull;
 668 |           for (auto tq : type_query)
 669 |             {
 670 |               if (!tqfull.empty())
 671 |                 tqfull += " OR ";
 672 |               else
 673 |                 tqfull = "(";
 674 |               tqfull += " t.name = '" + tq + "'";
 675 |             }
 676 |           qtxt << tqfull << ") AND ";
 677 |         }
 678 | 
 679 |       qtxt << "maxLat>=:minLat AND minLat<=:maxLat AND maxLon >= :minLon AND minLon <= :maxLon";
 680 | 
 681 | #ifdef GEONLP_PRINT_SQL
 682 |       std::cout << qtxt.str() << "\n";
 683 | #endif
 684 |       sqlite3pp::query qry(m_db, qtxt.str().c_str());
 685 | 
 686 |       qry.bind(":minLat", latitude - radius / dist_per_degree_lat);
 687 |       qry.bind(":maxLat", latitude + radius / dist_per_degree_lat);
 688 |       qry.bind(":minLon", longitude - radius / dist_per_degree_lon);
 689 |       qry.bind(":maxLon", longitude + radius / dist_per_degree_lon);
 690 | 
 691 |       for (auto v : qry)
 692 |         {
 693 |           long long   id;
 694 |           char const *name, *name_extra, *name_en;
 695 |           std::string type;
 696 |           double      lat, lon, distance;
 697 |           int         search_rank;
 698 |           v.getter() >> id >> name >> name_extra >> name_en >> type >> lat >> lon >> search_rank;
 699 | 
 700 |           // check if distance is ok. note that the distance is expected
 701 |           // to be small (on the scale of the planet)
 702 |           {
 703 |             double dlat = dist_per_degree_lat * (latitude - lat);
 704 |             double dlon = dist_per_degree_lon * (longitude - lon);
 705 |             distance    = sqrt(dlat * dlat + dlon * dlon);
 706 |             if (distance > radius)
 707 |               continue; // skip this result
 708 |           }
 709 | 
 710 |           // check name query
 711 |           if (!name_query.empty())
 712 |             {
 713 |               bool                  found = false;
 714 |               std::set<std::string> names;
 715 |               if (name)
 716 |                 names.insert(name);
 717 |               if (name_extra)
 718 |                 names.insert(name_extra);
 719 |               if (name_en)
 720 |                 names.insert(name_en);
 721 |               for (auto n = names.begin(); n != names.end() && !found; ++n)
 722 |                 {
 723 |                   if (n->empty())
 724 |                     continue;
 725 |                   std::vector<std::string> expanded;
 726 |                   postal.expand_string(*n, expanded);
 727 | 
 728 |                   for (auto q = name_query.cbegin(); !found && q != name_query.cend(); ++q)
 729 |                     for (auto e = expanded.begin(); !found && e != expanded.end(); ++e)
 730 |                       // search is for whether the name starts with the query or has
 731 |                       // the query after space (think of street Dr. Someone and query Someone)
 732 |                       found = (e->compare(0, q->length(), *q) == 0
 733 |                                || e->find(" " + *q) != std::string::npos);
 734 |                 }
 735 | 
 736 |               if (!found)
 737 |                 continue; // substring not found
 738 |             }
 739 | 
 740 |           GeoResult r;
 741 |           r.id = id;
 742 | 
 743 |           get_name(r.id, r.title, r.address, r.admin_levels, m_levels_in_title);
 744 |           r.type = get_type(r.id);
 745 |           get_features(r);
 746 | 
 747 |           r.latitude        = lat;
 748 |           r.longitude       = lon;
 749 |           r.distance        = distance;
 750 |           r.search_rank     = search_rank;
 751 |           r.levels_resolved = 1; // not used in this search
 752 | 
 753 |           result.push_back(r);
 754 |         }
 755 |     }
 756 |   catch (sqlite3pp::database_error &e)
 757 |     {
 758 |       std::cerr << "Geocoder exception: " << e.what() << std::endl;
 759 |       return false;
 760 |     }
 761 | 
 762 |   if (m_max_results > 0 && result.size() >= m_max_results)
 763 |     {
 764 |       Geocoder::sort_by_distance(result.begin(), result.end());
 765 |       result.resize(m_max_results);
 766 |     }
 767 | 
 768 |   return true;
 769 | }
 770 | 
 771 | // search next to the reference linestring
 772 | bool Geocoder::search_nearby(const std::vector<std::string> &name_query,
 773 |                              const std::vector<std::string> &type_query,
 774 |                              const std::vector<double>      &latitude,
 775 |                              const std::vector<double> &longitude, double radius,
 776 |                              std::vector<GeoResult> &result, Postal &postal, size_t skip_points)
 777 | {
 778 |   if (radius < 0 || latitude.size() < 2 || latitude.size() != longitude.size())
 779 |     return false;
 780 | 
 781 |   // const double earth_radius = 6378137;
 782 |   const double radius2 = radius * radius;
 783 | 
 784 |   // // fill linestring
 785 |   // boost::geometry::model::linestring<point_t> ls;
 786 |   // for (size_t i=0; i < latitude.size(); ++i)
 787 |   //   boost::geometry::append(ls, point_t(longitude[i],latitude[i]));
 788 | 
 789 |   try
 790 |     {
 791 | 
 792 |       std::set<long long> processed_boxes;
 793 |       double              line_distance = 0;
 794 |       for (size_t LineI = skip_points;
 795 |            LineI < longitude.size() - 1 && (m_max_results == 0 || result.size() <= m_max_results);
 796 |            ++LineI)
 797 |         {
 798 | 
 799 |           // rough estimates of distance (meters) per degree
 800 |           const double dist_per_degree_lat = distance_per_latitude();
 801 |           const double dist_per_degree_lon = distance_per_longitude(latitude[LineI]);
 802 |           {
 803 |             double x1 = latitude[LineI] * dist_per_degree_lat;
 804 |             double y1 = longitude[LineI] * dist_per_degree_lon;
 805 |             double x2 = latitude[LineI + 1] * dist_per_degree_lat;
 806 |             double y2 = longitude[LineI + 1] * dist_per_degree_lon;
 807 |             double dx = x2 - x1;
 808 |             double dy = y2 - y1;
 809 |             line_distance += sqrt(dx * dx + dy * dy);
 810 |           }
 811 | 
 812 |           // step 1: get boxes that are near the line segment
 813 |           std::deque<long long> newboxes;
 814 |           {
 815 |             sqlite3pp::query qry(m_db, "SELECT id, minLat, maxLat, minLon, maxLon "
 816 |                                        "FROM object_primary_rtree "
 817 |                                        "WHERE maxLat>=:minLat AND minLat<=:maxLat AND maxLon >= "
 818 |                                        ":minLon AND minLon <= :maxLon");
 819 | 
 820 |             auto bb_lat = std::minmax(latitude[LineI], latitude[LineI + 1]);
 821 |             auto bb_lon = std::minmax(longitude[LineI], longitude[LineI + 1]);
 822 | 
 823 |             qry.bind(":minLat", bb_lat.first - radius / dist_per_degree_lat);
 824 |             qry.bind(":maxLat", bb_lat.second + radius / dist_per_degree_lat);
 825 |             qry.bind(":minLon", bb_lon.first - radius / dist_per_degree_lon);
 826 |             qry.bind(":maxLon", bb_lon.second + radius / dist_per_degree_lon);
 827 | 
 828 |             for (auto v : qry)
 829 |               {
 830 |                 long long id;
 831 |                 double    minLat, maxLat, minLon, maxLon;
 832 |                 v.getter() >> id >> minLat >> maxLat >> minLon >> maxLon;
 833 | 
 834 |                 if (processed_boxes.count(id))
 835 |                   continue;
 836 |                 processed_boxes.insert(id);
 837 |                 newboxes.push_back(id);
 838 |               }
 839 |           }
 840 | 
 841 |           // check if anything is new
 842 |           if (newboxes.empty())
 843 |             continue;
 844 | 
 845 |           // step 2: search for objects from new boxes
 846 |           {
 847 |             std::ostringstream qtxt;
 848 |             qtxt << "SELECT o.id, o.name, o.name_extra, o.name_en, t.name, "
 849 |                  << "o.latitude, o.longitude, o.search_rank "
 850 |                  << "FROM object_primary o "
 851 |                  << "JOIN type t ON o.type_id=t.id "
 852 |                  << "WHERE ";
 853 | 
 854 |             if (!type_query.empty())
 855 |               {
 856 |                 std::string tqfull;
 857 |                 for (auto tq : type_query)
 858 |                   {
 859 |                     if (!tqfull.empty())
 860 |                       tqfull += " OR ";
 861 |                     else
 862 |                       tqfull = "(";
 863 |                     tqfull += " t.name = '" + tq + "'";
 864 |                   }
 865 |                 qtxt << tqfull << ") AND ";
 866 |               }
 867 | 
 868 |             qtxt << " o.box_id IN (";
 869 |             for (size_t i = 0; i < newboxes.size(); ++i)
 870 |               {
 871 |                 if (i)
 872 |                   qtxt << ", ";
 873 |                 qtxt << newboxes[i];
 874 |               }
 875 |             qtxt << ")";
 876 | #ifdef GEONLP_PRINT_SQL
 877 |             std::cout << qtxt.str() << "\n";
 878 | #endif
 879 |             sqlite3pp::query qry(m_db, qtxt.str().c_str());
 880 | 
 881 |             for (auto v : qry)
 882 |               {
 883 |                 long long   id;
 884 |                 char const *name, *name_extra, *name_en;
 885 |                 std::string type;
 886 |                 double      lat, lon, distance;
 887 |                 int         search_rank;
 888 |                 v.getter() >> id >> name >> name_extra >> name_en >> type >> lat >> lon
 889 |                     >> search_rank;
 890 | 
 891 |                 // check if distance is ok using earth as a plane approximation around the line
 892 |                 {
 893 |                   double       distance2 = -1;
 894 |                   const double xp        = lat * dist_per_degree_lat;
 895 |                   const double yp        = lon * dist_per_degree_lon;
 896 |                   double       u = 0, nrm2 = 0; // cache the last result for distance
 897 |                   for (size_t i = LineI;
 898 |                        i < longitude.size() - 1 && (distance2 < 0 || distance2 > radius2); ++i)
 899 |                     {
 900 |                       // constants used for distance calculations, drop when
 901 |                       // moving back to boost geometry
 902 |                       const double x1 = latitude[i] * dist_per_degree_lat;
 903 |                       const double y1 = longitude[i] * dist_per_degree_lon;
 904 |                       const double x2 = latitude[i + 1] * dist_per_degree_lat;
 905 |                       const double y2 = longitude[i + 1] * dist_per_degree_lon;
 906 |                       const double px = x2 - x1;
 907 |                       const double py = y2 - y1;
 908 |                       nrm2            = px * px + py * py;
 909 | 
 910 |                       u = ((xp - x1) * px + (yp - y1) * py) / nrm2;
 911 |                       u = std::max(0.0, std::min(1.0, u));
 912 | 
 913 |                       double dx = (x1 + u * px) - xp;
 914 |                       double dy = (y1 + u * py) - yp;
 915 |                       double dd = dx * dx + dy * dy;
 916 | 
 917 |                       if (distance2 < 0 || distance2 > dd)
 918 |                         distance2 = dd;
 919 |                     }
 920 | 
 921 |                   if (distance2 > radius2)
 922 |                     continue;
 923 | 
 924 |                   distance = line_distance - (1 - u) * sqrt(nrm2);
 925 |                 }
 926 |                 // // distance check using boost
 927 |                 // if ( boost::geometry::distance(point_t(lon, lat), ls)*earth_radius > radius )
 928 |                 //   continue; // skip this result
 929 | 
 930 |                 // check name query
 931 |                 if (!name_query.empty())
 932 |                   {
 933 |                     bool                  found = false;
 934 |                     std::set<std::string> names;
 935 |                     if (name)
 936 |                       names.insert(name);
 937 |                     if (name_extra)
 938 |                       names.insert(name_extra);
 939 |                     if (name_en)
 940 |                       names.insert(name_en);
 941 |                     for (auto n = names.begin(); n != names.end() && !found; ++n)
 942 |                       {
 943 |                         if (n->empty())
 944 |                           continue;
 945 |                         std::vector<std::string> expanded;
 946 |                         postal.expand_string(*n, expanded);
 947 | 
 948 |                         for (auto q = name_query.cbegin(); !found && q != name_query.cend(); ++q)
 949 |                           for (auto e = expanded.begin(); !found && e != expanded.end(); ++e)
 950 |                             // search is for whether the name starts with the query or has
 951 |                             // the query after space (think of street Dr. Someone and query Someone)
 952 |                             found = (e->compare(0, q->length(), *q) == 0
 953 |                                      || e->find(" " + *q) != std::string::npos);
 954 |                       }
 955 | 
 956 |                     if (!found)
 957 |                       continue; // substring not found
 958 |                   }
 959 | 
 960 |                 GeoResult r;
 961 |                 r.id = id;
 962 | 
 963 |                 get_name(r.id, r.title, r.address, r.admin_levels, m_levels_in_title);
 964 |                 r.type = get_type(r.id);
 965 |                 get_features(r);
 966 | 
 967 |                 r.latitude        = lat;
 968 |                 r.longitude       = lon;
 969 |                 r.distance        = distance;
 970 |                 r.search_rank     = search_rank;
 971 |                 r.levels_resolved = 1; // not used in this search
 972 | 
 973 |                 result.push_back(r);
 974 |               }
 975 |           }
 976 |         }
 977 |     }
 978 |   catch (sqlite3pp::database_error &e)
 979 |     {
 980 |       std::cerr << "Geocoder exception: " << e.what() << std::endl;
 981 |       return false;
 982 |     }
 983 | 
 984 |   return true;
 985 | }
 986 | 
 987 | int Geocoder::closest_segment(const std::vector<double> &latitude,
 988 |                               const std::vector<double> &longitude, double reference_latitude,
 989 |                               double reference_longitude)
 990 | {
 991 |   // make all checks first
 992 |   if (latitude.size() < 2 || latitude.size() != longitude.size())
 993 |     return -1;
 994 | 
 995 |   // rough estimates of distance (meters) per degree
 996 |   const double dist_per_degree_lat = distance_per_latitude();
 997 |   const double dist_per_degree_lon = distance_per_longitude(reference_latitude);
 998 | 
 999 |   const double xp = reference_latitude * dist_per_degree_lat;
1000 |   const double yp = reference_longitude * dist_per_degree_lon;
1001 | 
1002 |   size_t currI  = 0;
1003 |   double currD2 = -1;
1004 |   for (size_t i = 0; i < longitude.size() - 1; ++i)
1005 |     {
1006 |       const double x1   = latitude[i] * dist_per_degree_lat;
1007 |       const double y1   = longitude[i] * dist_per_degree_lon;
1008 |       const double x2   = latitude[i + 1] * dist_per_degree_lat;
1009 |       const double y2   = longitude[i + 1] * dist_per_degree_lon;
1010 |       const double px   = x2 - x1;
1011 |       const double py   = y2 - y1;
1012 |       const double nrm2 = px * px + py * py;
1013 | 
1014 |       double u  = ((xp - x1) * px + (yp - y1) * py) / nrm2;
1015 |       u         = std::max(0.0, std::min(1.0, u));
1016 |       double dx = (x1 + u * px) - xp;
1017 |       double dy = (y1 + u * py) - yp;
1018 |       double dd = dx * dx + dy * dy;
1019 | 
1020 |       if (currD2 < 0 || currD2 > dd)
1021 |         {
1022 |           currI  = i;
1023 |           currD2 = dd;
1024 |         }
1025 |     }
1026 | 
1027 |   return currI;
1028 | }
1029 | 
1030 | double Geocoder::search_rank_location_bias(double distance, int zoom)
1031 | {
1032 |   // based on Photon implementation of the bias
1033 |   const double offset = 800; // meters
1034 | 
1035 |   distance -= offset;
1036 |   if (distance <= 0)
1037 |     return 1.0;
1038 | 
1039 |   zoom          = std::max(zoom, 18);
1040 |   double radius = (1 << (18 - zoom)) * 250; // meters
1041 |   return exp(-distance / radius);
1042 | }
1043 | 


--------------------------------------------------------------------------------
/src/geocoder.h:
--------------------------------------------------------------------------------
  1 | #ifndef GEOCODER_H
  2 | #define GEOCODER_H
  3 | 
#include "postal.h"

#include <kchashdb.h>
#include <marisa.h>
#include <sqlite3pp.h>

#include <algorithm>
#include <cctype>
#include <cstdint>
#include <cstring>
#include <string>
#include <vector>
 13 | 
 14 | namespace GeoNLP
 15 | {
 16 | 
 17 | class Geocoder
 18 | {
 19 | 
 20 | public:
 21 |   struct GeoResult
 22 |   {
 23 |     long long int id;
 24 |     double        latitude;
 25 |     double        longitude;
 26 |     double        distance = 0;
 27 |     std::string   title;
 28 |     std::string   address;
 29 |     std::string   type;
 30 |     std::string   phone;
 31 |     std::string   postal_code;
 32 |     std::string   website;
 33 |     size_t        levels_resolved;
 34 |     size_t        admin_levels = 0;
 35 |     double        search_rank;
 36 | 
 37 |     bool operator<(const GeoResult &i) const
 38 |     {
 39 |       return (search_rank < i.search_rank
 40 |               || (search_rank == i.search_rank && address.length() < i.address.length())
 41 |               || (search_rank == i.search_rank && address.length() == i.address.length()
 42 |                   && address < i.address));
 43 |     }
 44 |   };
 45 | 
 46 |   class GeoReference
 47 |   {
 48 |   public:
 49 |     GeoReference();
 50 |     GeoReference(double lat, double lon, int zoom = 16, double importance = 0.75);
 51 | 
 52 |     void set(double lat, double lon, int zoom = 16, double importance = 0.75);
 53 |     void reset();
 54 | 
 55 |     bool   is_set() const { return m_is_set; }
 56 |     int    zoom() const { return m_zoom; }
 57 |     double importance() const { return m_importance; }
 58 |     double distance(const GeoResult &r) const;
 59 | 
 60 |   private:
 61 |     double m_latitude;
 62 |     double m_longitude;
 63 |     double m_importance;
 64 |     int    m_zoom;
 65 |     bool   m_is_set;
 66 |   };
 67 | 
 68 |   typedef uint32_t index_id_key;
 69 |   typedef uint32_t index_id_value;
 70 | 
 71 |   static const int    version;
 72 |   static const size_t num_languages; ///< Number of languages supported by this version
 73 | 
 74 | public:
 75 |   Geocoder();
 76 |   Geocoder(const std::string &dbpath);
 77 | 
 78 |   virtual ~Geocoder() {}
 79 | 
 80 |   /// \brief Search for any objects matching the normalized query
 81 |   ///
 82 |   bool search(const std::vector<Postal::ParseResult> &parsed_query, std::vector<GeoResult> &result,
 83 |               size_t min_levels = 0, const GeoReference &reference = GeoReference());
 84 | 
 85 |   /// \brief Search for objects within given radius from specified point and matching the query
 86 |   ///
 87 |   /// Here, radius is given in meters and the reference point is
 88 |   /// given by latitude and longitude (WGS 84). Query is given by name
 89 |   /// and type. When the both are given, the both queries have to be fulfilled
 90 |   /// (think of cafe and its name). Within type and name queries, a single match
 91 |   /// is sufficient.
 92 |   bool search_nearby(const std::vector<std::string> &name_query,
 93 |                      const std::vector<std::string> &type_query, double latitude, double longitude,
 94 |                      double radius, std::vector<GeoResult> &result, Postal &postal);
 95 | 
 96 |   /// \brief Search for objects within given radius from specified linestring and matching the query
 97 |   ///
 98 |   /// Here, radius is given in meters and the reference linestring
 99 |   /// is given by latitude and longitude vectors (WGS 84). Query is
100 |   /// given by name and type. When the both are given, the both
101 |   /// queries have to be fulfilled (think of cafe and its
102 |   /// name). Within type and name queries, a single match is
103 |   /// sufficient.
104 |   ///
105 |   /// Parameter skip_points can be used to skip the given number of
106 |   /// points from the beginning of the line when searching for
107 |   /// objects. This, for example, is used when looking for objects
108 |   /// next to route upcoming from the current location
109 |   bool search_nearby(const std::vector<std::string> &name_query,
110 |                      const std::vector<std::string> &type_query,
111 |                      const std::vector<double> &latitude, const std::vector<double> &longitude,
112 |                      double radius, std::vector<GeoResult> &result, Postal &postal,
113 |                      size_t skip_points = 0);
114 | 
115 |   int  get_levels_in_title() const { return m_levels_in_title; }
116 |   void set_levels_in_title(int l) { m_levels_in_title = l; }
117 | 
118 |   size_t get_max_queries_per_hierarchy() const { return m_max_queries_per_hierarchy; }
119 |   void   set_max_queries_per_hierarchy(size_t mx) { m_max_queries_per_hierarchy = mx; }
120 | 
121 |   size_t get_max_results() const { return m_max_results; }
122 |   void   set_max_results(size_t mx)
123 |   {
124 |     m_max_results = mx;
125 |     update_limits();
126 |   }
127 | 
128 |   size_t get_max_intermediate_offset() const { return m_max_inter_offset; }
129 |   void   set_max_intermediate_offset(size_t mx)
130 |   {
131 |     m_max_inter_offset = mx;
132 |     update_limits();
133 |   }
134 | 
135 |   /// \brief Set preferred language for results
136 |   ///
137 |   /// Use two-letter coded language code as an argument. For
138 |   /// example, for English, set it to "en". If the original name is
139 |   /// requested, you could set it to any invalid letter combination
140 |   /// or an empty string.
141 |   void set_result_language(const std::string &lang) { m_preferred_result_language = lang; }
142 | 
143 |   bool load(const std::string &dbpath);
144 |   bool load();
145 |   void drop();
146 | 
147 |   operator bool() const { return m_database_open; }
148 | 
149 | public:
150 |   static std::string name_primary(const std::string &dname);
151 |   static std::string name_normalized_trie(const std::string &dname);
152 |   static std::string name_normalized_id(const std::string &dname);
153 | 
154 |   // interaction with key/value database
155 |   static std::string make_id_key(index_id_key key)
156 |   {
157 |     return std::string((char *)&key, sizeof(key));
158 |   }
159 | 
160 |   static std::string make_id_value(const std::vector<index_id_value> v)
161 |   {
162 |     return std::string((char *)v.data(), sizeof(index_id_value) * v.size());
163 |   }
164 | 
165 |   static index_id_key get_id_key(const std::string &v) { return *((index_id_key *)v.data()); }
166 | 
167 |   static index_id_value get_id_value(const std::string &v, size_t index)
168 |   {
169 |     index_id_value *p = (index_id_value *)v.data();
170 |     return p[index];
171 |   }
172 | 
173 |   static size_t get_id_number_of_values(const std::string &v)
174 |   {
175 |     return v.size() / sizeof(index_id_value);
176 |   }
177 | 
178 |   static bool get_id_range(std::string &v, bool full_range, index_id_value range0,
179 |                            index_id_value range1, index_id_value **idx0, index_id_value **idx1);
180 | 
181 |   // support for sorting by distance
182 |   static bool distcomp(const Geocoder::GeoResult &i, const Geocoder::GeoResult &j)
183 |   {
184 |     return (i.distance < j.distance);
185 |   }
186 | 
187 |   template <typename T> static void sort_by_distance(T begin, T end)
188 |   {
189 |     std::sort(begin, end, distcomp);
190 |   }
191 | 
192 |   // search for the segment on a line that is the closest to the
193 |   // specified point. returns negative value on error
194 |   static int closest_segment(const std::vector<double> &latitude,
195 |                              const std::vector<double> &longitude, double reference_latitude,
196 |                              double reference_longitude);
197 | 
198 | protected:
199 |   bool search(const Postal::Hierarchy &parsed, const std::string &postal_code,
200 |               std::vector<GeoResult> &result, size_t level = 0, long long int range0 = 0,
201 |               long long int range1 = 0);
202 | 
203 |   void get_name(long long int id, std::string &title, std::string &full, size_t &admin_levels,
204 |                 int levels_in_title);
205 | 
206 |   std::string get_postal_code(long long int id);
207 | 
208 |   std::string get_type(long long int id);
209 | 
210 |   void get_features(GeoResult &r);
211 | 
212 |   virtual bool check_version();
213 | 
214 |   bool check_version(const std::string &supported);
215 | 
216 |   void update_limits();
217 | 
218 |   static double search_rank_location_bias(double distance, int zoom = 16);
219 | 
220 | protected:
221 |   sqlite3pp::database m_db;
222 |   std::string         m_database_path;
223 |   bool                m_database_open = false;
224 | 
225 |   kyotocabinet::HashDB m_database_norm_id;
226 |   marisa::Trie         m_trie_norm;
227 | 
228 |   int    m_levels_in_title           = 2;
229 |   size_t m_max_queries_per_hierarchy = 0;
230 |   size_t m_max_results               = 25;
231 |   size_t m_max_inter_offset          = 100;
232 |   size_t m_max_inter_results;
233 | 
234 |   size_t m_levels_resolved;
235 |   size_t m_query_count;
236 | 
237 |   std::string m_preferred_result_language;
238 | };
239 | 
240 | }
241 | #endif // GEOCODER_H
242 | 


--------------------------------------------------------------------------------
/src/postal.cpp:
--------------------------------------------------------------------------------
  1 | #include "postal.h"
  2 | 
  3 | #include <libpostal/libpostal.h>
  4 | 
  5 | #include <algorithm>
  6 | #include <cctype>
  7 | #include <functional>
  8 | #include <iostream>
  9 | #include <locale>
 10 | #include <set>
 11 | #include <sstream>
 12 | 
 13 | #include <string.h>
 14 | 
 15 | using namespace GeoNLP;
 16 | 
 17 | #define ADDRESS_PARSER_LABEL_HOUSE "house"
 18 | #define ADDRESS_PARSER_LABEL_CATEGORY "category"
 19 | #define ADDRESS_PARSER_LABEL_HOUSE_NUMBER "house_number"
 20 | #define ADDRESS_PARSER_LABEL_ROAD "road"
 21 | #define ADDRESS_PARSER_LABEL_SUBURB "suburb"
 22 | #define ADDRESS_PARSER_LABEL_CITY_DISTRICT "city_district"
 23 | #define ADDRESS_PARSER_LABEL_CITY "city"
 24 | #define ADDRESS_PARSER_LABEL_ISLAND "island"
 25 | #define ADDRESS_PARSER_LABEL_STATE_DISTRICT "state_district"
 26 | #define ADDRESS_PARSER_LABEL_STATE "state"
 27 | #define ADDRESS_PARSER_LABEL_POSTAL_CODE "postcode"
 28 | #define ADDRESS_PARSER_LABEL_COUNTRY_REGION "country_region"
 29 | #define ADDRESS_PARSER_LABEL_COUNTRY "country"
 30 | 
 31 | #define PRIMITIVE_ADDRESS_PARSER_POSTAL_CODE_INPUT "post:"
 32 | #define PRIMITIVE_ADDRESS_PARSER_POSTAL_CODE_KEY "h-postcode"
 33 | 
 34 | //////////////////////////////////////////////////////////////////////
 35 | /// Helper string functions
 36 | ///
 37 | 
 38 | static void str2vecchar(const std::string &s, std::vector<char> &v)
 39 | {
 40 |   v.resize(s.length() + 1);
 41 |   std::copy(s.c_str(), s.c_str() + s.length() + 1, v.begin());
 42 | }
 43 | 
 44 | // trim from start
 45 | static inline std::string &ltrim(std::string &s)
 46 | {
 47 |   s.erase(s.begin(),
 48 | 	  std::find_if(s.begin(), s.end(), [](unsigned char ch) {
 49 | 	    return !std::isspace(ch);
 50 | 	  }));
 51 |   return s;
 52 | }
 53 | 
 54 | // trim from end
 55 | static inline std::string &rtrim(std::string &s)
 56 | {
 57 |   s.erase(std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) {
 58 |     return !std::isspace(ch);
 59 |   }).base(), s.end());
 60 |   return s;
 61 | }
 62 | 
 63 | // trim from both ends
 64 | static inline std::string &trim(std::string &s)
 65 | {
 66 |   return ltrim(rtrim(s));
 67 | }
 68 | 
 69 | static void split_tokens(const std::string &s, char delim, std::vector<std::string> &elems)
 70 | {
 71 |   std::stringstream ss;
 72 |   ss.str(s);
 73 |   std::string item;
 74 |   while (std::getline(ss, item, delim))
 75 |     if (!item.empty())
 76 |       elems.push_back(trim(item));
 77 | }
 78 | 
 79 | static std::string primitive_key(size_t ind)
 80 | {
 81 |   std::ostringstream ss;
 82 |   ss << "h-" << ind;
 83 |   return ss.str();
 84 | }
 85 | 
 86 | ///////////////////////////////////////////////////////////////////
 87 | /// Postal class
 88 | 
 89 | Postal::Postal() {}
 90 | 
 91 | Postal::~Postal()
 92 | {
 93 |   drop();
 94 | }
 95 | 
 96 | void Postal::set_postal_datadir(const std::string &global, const std::string &country)
 97 | {
 98 |   if (global.length() < 1)
 99 |     m_postal_datadir_global.clear();
100 |   else
101 |     str2vecchar(global, m_postal_datadir_global);
102 | 
103 |   if (country.length() < 1)
104 |     m_postal_datadir_country.clear();
105 |   else
106 |     str2vecchar(country, m_postal_datadir_country);
107 | 
108 |   drop(); // force reinitialization
109 | }
110 | 
111 | void Postal::set_postal_datadir_country(const std::string &country)
112 | {
113 |   std::vector<char> nc;
114 |   if (country.length() >= 1)
115 |     str2vecchar(country, nc);
116 | 
117 |   if (nc == m_postal_datadir_country)
118 |     return; // nothing to do
119 | 
120 |   m_postal_datadir_country = nc;
121 |   if (m_initialized)
122 |     {
123 |       // have to drop current country-specific parser and load a new one
124 |       libpostal_teardown_parser();
125 | 
126 |       if ((m_postal_datadir_country.empty() && !libpostal_setup_parser())
127 |           || (!m_postal_datadir_country.empty()
128 |               && !libpostal_setup_parser_datadir(m_postal_datadir_country.data())))
129 |         {
130 |           std::cerr << "Postal: Error at set_postal_datadir_country " << country << std::endl;
131 |           drop();
132 |         }
133 |     }
134 | }
135 | 
136 | void Postal::add_language(const std::string &lang)
137 | {
138 |   std::vector<char> l;
139 |   str2vecchar(lang, l);
140 |   m_postal_languages.push_back(l);
141 |   drop();
142 | }
143 | 
144 | bool Postal::init()
145 | {
146 |   if (m_initialized)
147 |     return true;
148 | 
149 |   // used for transliteration and basic setup
150 |   if ((m_postal_datadir_global.empty() && !libpostal_setup())
151 |       || (!m_postal_datadir_global.empty()
152 |           && !libpostal_setup_datadir(m_postal_datadir_global.data())))
153 |     return false;
154 | 
155 |   if (m_use_postal)
156 |     {
157 |       if (m_postal_languages.empty())
158 |         {
159 |           if ((m_postal_datadir_global.empty() && !libpostal_setup_language_classifier())
160 |               || (!m_postal_datadir_global.empty()
161 |                   && !libpostal_setup_language_classifier_datadir(m_postal_datadir_global.data())))
162 |             return false;
163 |         }
164 | 
165 |       if ((m_postal_datadir_country.empty() && !libpostal_setup_parser())
166 |           || (!m_postal_datadir_country.empty()
167 |               && !libpostal_setup_parser_datadir(m_postal_datadir_country.data())))
168 |         return false;
169 |     }
170 | 
171 |   m_initialized = true;
172 |   return true;
173 | }
174 | 
175 | void Postal::drop()
176 | {
177 |   if (!m_initialized)
178 |     return;
179 |   libpostal_teardown_parser();
180 |   libpostal_teardown();
181 |   libpostal_teardown_language_classifier();
182 |   m_initialized = false;
183 | }
184 | 
185 | /// Normalize first, expand next. Seems to miss few expansions in current implementation
186 | /// Have to wait for new libpostal version to switch to this approach
187 | 
188 | // bool Postal::parse(const std::string &input, std::vector<Postal::ParseResult> &result)
189 | //{
190 | //     if (!init()) return false;
191 | 
192 | //    // convert string into vector of chars (libpostal uses char* as an argument, not const char*)
193 | //    std::vector<char> charbuff;
194 | //    charbuff.resize(input.length() + 1);
195 | //    std::copy(input.c_str(), input.c_str() + input.length() + 1, charbuff.begin());
196 | 
197 | //    size_t num_expansions;
198 | //    normalize_options_t options_norm = get_libpostal_default_options();
199 | //    address_parser_options_t options_parse = get_libpostal_address_parser_default_options();
200 | 
201 | //    char **expansions = expand_address(charbuff.data(), options_norm, &num_expansions);
202 | //    if (m_initialize_for_every_call)
203 | //        libpostal_teardown_language_classifier();
204 | 
205 | //    for (size_t i = 0; i < num_expansions; i++)
206 | //    {
207 | //        address_parser_response_t *parsed = parse_address(expansions[i], options_parse);
208 | 
209 | //        ParseResult r;
210 | //        for (size_t j = 0; j < parsed->num_components; j++)
211 | //            r[ parsed->labels[j] ] = parsed->components[j];
212 | 
213 | //        result.push_back(r);
214 | 
215 | //        address_parser_response_destroy(parsed);
216 | //    }
217 | 
218 | //    expansion_array_destroy(expansions, num_expansions);
219 | 
220 | //    if (m_initialize_for_every_call) drop();
221 | 
222 | //    return true;
223 | //}
224 | 
225 | //////////////////////////////////////////////////////////////
226 | /// expand first then normalize
227 | 
228 | bool Postal::parse(const std::string &input, std::vector<Postal::ParseResult> &result,
229 |                    Postal::ParseResult &nonormalization)
230 | {
231 |   if (m_use_postal && !init())
232 |     return false;
233 | 
234 |   // libpostal parsing
235 |   if (m_use_postal)
236 |     {
237 |       // convert string into vector of chars (libpostal uses char* as an argument, not const char*)
238 |       std::vector<char> charbuff;
239 |       charbuff.resize(input.length() + 1);
240 |       std::copy(input.c_str(), input.c_str() + input.length() + 1, charbuff.begin());
241 | 
242 |       libpostal_address_parser_options_t options_parse
243 |           = libpostal_get_address_parser_default_options();
244 | 
245 |       // parse the address
246 |       libpostal_address_parser_response_t *parsed
247 |           = libpostal_parse_address(charbuff.data(), options_parse);
248 |       nonormalization.clear();
249 |       for (size_t j = 0; j < parsed->num_components; j++)
250 |         {
251 |           std::vector<std::string> pc;
252 |           pc.push_back(parsed->components[j]);
253 |           nonormalization[parsed->labels[j]] = pc;
254 |         }
255 |       libpostal_address_parser_response_destroy(parsed);
256 | 
257 |       expand(nonormalization, result);
258 |     }
259 | 
260 |   // primitive parsing
261 |   if (m_use_primitive)
262 |     {
263 |       std::vector<std::string> hier;
264 |       split_tokens(input, ',', hier);
265 |       if (hier.empty())
266 |         hier.push_back(input);
267 | 
268 |       ParseResult prim;
269 |       int         shift = 0;
270 |       size_t      np    = strlen(PRIMITIVE_ADDRESS_PARSER_POSTAL_CODE_INPUT);
271 |       for (size_t j = 0; j < hier.size(); j++)
272 |         {
273 |           std::string v   = hier[hier.size() - j - 1];
274 |           std::string key = primitive_key(j - shift);
275 |           v               = trim(v);
276 |           if (v.compare(0, np, PRIMITIVE_ADDRESS_PARSER_POSTAL_CODE_INPUT) == 0)
277 |             {
278 |               v   = v.substr(np);
279 |               v   = trim(v);
280 |               key = PRIMITIVE_ADDRESS_PARSER_POSTAL_CODE_KEY;
281 |               shift += 1;
282 |             }
283 |           std::vector<std::string> pc;
284 |           pc.push_back(v);
285 |           prim[key] = pc;
286 |         }
287 | 
288 |       expand(prim, result);
289 |     }
290 | 
291 |   if (m_initialize_for_every_call)
292 |     drop();
293 | 
294 |   return true;
295 | }
296 | 
297 | void Postal::expand(const Postal::ParseResult &input, std::vector<Postal::ParseResult> &result)
298 | {
299 |   if (!init())
300 |     {
301 |       result.push_back(input);
302 |       return;
303 |     }
304 | 
305 |   size_t                        num_expansions;
306 |   libpostal_normalize_options_t options_norm = libpostal_get_default_options();
307 | 
308 |   std::vector<char *> lang;
309 |   for (std::vector<char> &l : m_postal_languages)
310 |     lang.push_back(l.data());
311 | 
312 |   options_norm.languages     = lang.data();
313 |   options_norm.num_languages = lang.size();
314 | 
315 |   std::vector<char> charbuff;
316 | 
317 |   std::vector<std::vector<std::string> > address_expansions;
318 |   std::vector<std::string>               address_keys;
319 |   for (const auto &i : input)
320 |     {
321 |       // in practice, its only one element at ParseResult at this stage
322 |       for (const std::string &tonorm : i.second)
323 |         {
324 |           std::set<std::string> norm;
325 |           // always add unexpanded result into address expansions
326 |           // this will help with the partial entries as described in
327 |           // issue #64 https://github.com/rinigus/geocoder-nlp/issues/64
328 |           norm.insert(tonorm);
329 |           // no need to keep postal code in normalized and expanded
330 |           if (i.first != ADDRESS_PARSER_LABEL_POSTAL_CODE
331 |               && i.first != PRIMITIVE_ADDRESS_PARSER_POSTAL_CODE_KEY)
332 |             {
333 |               charbuff.resize(tonorm.length() + 1);
334 |               std::copy(tonorm.c_str(), tonorm.c_str() + tonorm.length() + 1, charbuff.begin());
335 |               char **expansions
336 |                   = libpostal_expand_address(charbuff.data(), options_norm, &num_expansions);
337 |               for (size_t j = 0; j < num_expansions; j++)
338 |                 norm.insert(expansions[j]);
339 | 
340 |               libpostal_expansion_array_destroy(expansions, num_expansions);
341 |             }
342 |           address_expansions.push_back(std::vector<std::string>(norm.begin(), norm.end()));
343 |           address_keys.push_back(i.first);
344 |         }
345 |     }
346 | 
347 |   ParseResult r;
348 |   for (size_t i = 0; i < address_keys.size(); ++i)
349 |     r[address_keys[i]] = address_expansions[i];
350 |   result.push_back(r);
351 | }
352 | 
353 | void Postal::expand_string(const std::string &input, std::vector<std::string> &expansions)
354 | {
355 |   if (!m_use_postal || !init())
356 |     {
357 |       expansions.push_back(input);
358 |       return;
359 |     }
360 | 
361 |   size_t                        num_expansions;
362 |   libpostal_normalize_options_t options_norm = libpostal_get_default_options();
363 | 
364 |   std::vector<char *> lang;
365 |   for (std::vector<char> &l : m_postal_languages)
366 |     lang.push_back(l.data());
367 | 
368 |   options_norm.languages     = lang.data();
369 |   options_norm.num_languages = lang.size();
370 | 
371 |   std::vector<char> charbuff;
372 | 
373 |   charbuff.resize(input.length() + 1);
374 |   std::copy(input.c_str(), input.c_str() + input.length() + 1, charbuff.begin());
375 | 
376 |   char **expansions_cstr = libpostal_expand_address(charbuff.data(), options_norm, &num_expansions);
377 |   for (size_t j = 0; j < num_expansions; j++)
378 |     expansions.push_back(expansions_cstr[j]);
379 | 
380 |   libpostal_expansion_array_destroy(expansions_cstr, num_expansions);
381 | }
382 | 
383 | std::string Postal::normalize_postalcode(const std::string &postal_code)
384 | {
385 |   // According to https://en.wikipedia.org/wiki/Postal_code
386 |   // postal codes use
387 |   //  - The Arabic numerals "0" to "9"
388 |   //  - Letters of the ISO basic Latin alphabet
389 |   //  - Spaces, hyphens
390 |   //
391 |   // So, simple normalization can be used. Here, its uppercased and double
392 |   // spaces are removed.
393 | 
394 |   std::string normalized;
395 |   for (auto i : postal_code)
396 |     {
397 |       char c = std::toupper(i);
398 |       if (c != ' ' || (!normalized.empty() && normalized.back() != ' '))
399 |         normalized.push_back(c);
400 |     }
401 | 
402 |   // trim a single possible space at the end
403 |   if (!normalized.empty() && normalized.back() == ' ')
404 |     normalized.pop_back();
405 | 
406 |   return normalized;
407 | }
408 | 
409 | void Postal::result2hierarchy(const std::vector<ParseResult> &p, std::vector<Hierarchy> &h,
410 |                               std::string &postal_code)
411 | {
412 |   h.clear();
413 |   for (const ParseResult &r : p)
414 |     {
415 |       Hierarchy h_result;
416 | 
417 | #define ADDIFHAS(k)                                                                                \
418 |   {                                                                                                \
419 |     ParseResult::const_iterator it = r.find(k);                                                    \
420 |     if (it != r.end())                                                                             \
421 |       h_result.push_back(it->second);                                                              \
422 |   }
423 | 
424 |       ADDIFHAS(ADDRESS_PARSER_LABEL_COUNTRY);
425 |       ADDIFHAS(ADDRESS_PARSER_LABEL_COUNTRY_REGION);
426 |       ADDIFHAS(ADDRESS_PARSER_LABEL_STATE);
427 |       ADDIFHAS(ADDRESS_PARSER_LABEL_STATE_DISTRICT);
428 |       ADDIFHAS(ADDRESS_PARSER_LABEL_ISLAND);
429 |       ADDIFHAS(ADDRESS_PARSER_LABEL_CITY);
430 |       ADDIFHAS(ADDRESS_PARSER_LABEL_CITY_DISTRICT);
431 |       ADDIFHAS(ADDRESS_PARSER_LABEL_SUBURB);
432 |       ADDIFHAS(ADDRESS_PARSER_LABEL_ROAD);
433 |       ADDIFHAS(ADDRESS_PARSER_LABEL_HOUSE_NUMBER);
434 |       ADDIFHAS(ADDRESS_PARSER_LABEL_CATEGORY);
435 |       ADDIFHAS(ADDRESS_PARSER_LABEL_HOUSE);
436 | 
437 |       if (postal_code.empty())
438 |         {
439 |           ParseResult::const_iterator it = r.find(ADDRESS_PARSER_LABEL_POSTAL_CODE);
440 |           if (it != r.end() && it->second.size())
441 |             postal_code = normalize_postalcode(it->second[0]);
442 |         }
443 | 
444 |       // test if its primitive expansion result
445 |       if (h_result.empty())
446 |         {
447 |           bool done = false;
448 |           for (size_t i = 0; !done; ++i)
449 |             {
450 |               ParseResult::const_iterator it = r.find(primitive_key(i));
451 |               if (it == r.end())
452 |                 done = true;
453 |               else
454 |                 h_result.push_back(it->second);
455 |             }
456 |         }
457 | 
458 |       // overwrite with the primitive parser postal code if needed
459 |       ParseResult::const_iterator it = r.find(PRIMITIVE_ADDRESS_PARSER_POSTAL_CODE_KEY);
460 |       if (it != r.end() && it->second.size())
461 |         postal_code = normalize_postalcode(it->second[0]);
462 | 
463 |       h.push_back(h_result);
464 |     }
465 | }
466 | 


--------------------------------------------------------------------------------
/src/postal.h:
--------------------------------------------------------------------------------
 1 | #ifndef POSTAL_H
 2 | #define POSTAL_H
 3 | 
 4 | #include <map>
 5 | #include <string>
 6 | #include <vector>
 7 | 
 8 | namespace GeoNLP
 9 | {
10 | 
/// \brief Address parsing and normalization on top of libpostal and a primitive parser
class Postal
{
public:
  Postal();
  ~Postal();

  /// Parsed address: label or key mapped to the list of its values
  using ParseResult = std::map<std::string, std::vector<std::string>>;
  /// Address levels, each given as a list of alternative spellings
  using Hierarchy = std::vector<std::vector<std::string>>;

  static std::string normalize_postalcode(const std::string &postal_code);
  static void        result2hierarchy(const std::vector<ParseResult> &p, std::vector<Hierarchy> &h,
                                      std::string &postal_code);

  /// \brief Parse and normalize input string
  ///
  bool parse(const std::string &input, std::vector<Postal::ParseResult> &parsed,
             ParseResult &nonormalization);

  /// \brief Normalize input string and return its expansions
  ///
  void expand_string(const std::string &input, std::vector<std::string> &expansions);

  // whether libpostal is dropped again at the end of every parse() call
  bool get_initialize_every_call() const { return m_initialize_for_every_call; }
  void set_initialize_every_call(bool v) { m_initialize_for_every_call = v; }

  // whether libpostal is used; switching it off drops the initialization
  bool get_use_postal() const { return m_use_postal; }
  void set_use_postal(bool v)
  {
    m_use_postal = v;
    if (v)
      return;
    drop();
  }

  // whether the primitive parser is used by parse()
  bool get_use_primitive() const { return m_use_primitive; }
  void set_use_primitive(bool v) { m_use_primitive = v; }

  void set_postal_datadir(const std::string &global, const std::string &country);
  void set_postal_datadir_country(const std::string &country);

  // forget all languages added so far and drop the initialization
  void clear_languages()
  {
    m_postal_languages.clear();
    drop();
  }
  void add_language(const std::string &lang);

protected:
  bool init();
  void drop();

  void expand(const Postal::ParseResult &input, std::vector<Postal::ParseResult> &result);

protected:
  bool m_initialized               = false;
  bool m_initialize_for_every_call = false;
  bool m_use_postal                = true;
  bool m_use_primitive             = true;

  // NUL-terminated character buffers
  std::vector<char>              m_postal_datadir_global;
  std::vector<char>              m_postal_datadir_country;
  std::vector<std::vector<char>> m_postal_languages;
};
73 | 
74 | }
75 | 
76 | #endif // POSTAL_H
77 | 


--------------------------------------------------------------------------------
/src/version.h:
--------------------------------------------------------------------------------
 1 | #ifndef GEOCODER_VERSION_H
 2 | #define GEOCODER_VERSION_H
 3 | 
 4 | #define GEOCODERNLP_MAJOR_VERSION 1
 5 | #define GEOCODERNLP_MINOR_VERSION 0
 6 | #define GEOCODERNLP_PATCH_VERSION 0
 7 | 
 8 | #define GEOCODERNLP_VERSION (GEOCODERNLP_MAJOR_VERSION*100000) + (GEOCODERNLP_MINOR_VERSION*100) + (GEOCODERNLP_PATCH_VERSION)
 9 | 
10 | #define _STRINGIFY(n) _STRINGIFY_HELPER(n)
11 | #define _STRINGIFY_HELPER(n) #n
12 | 
13 | #define GEOCODERNLP_VERSION_STRING  \
14 |                 _STRINGIFY(GEOCODERNLP_MAJOR_VERSION) "." \
15 |                 _STRINGIFY(GEOCODERNLP_MINOR_VERSION) "." \
16 |                 _STRINGIFY(GEOCODERNLP_PATCH_VERSION)
17 | 
18 | 
19 | #endif // GEOCODER_VERSION_H
20 | 


--------------------------------------------------------------------------------