├── .npmrc ├── .jshintignore ├── cmd ├── server.sh ├── loadtest.js ├── units ├── integration ├── placetype.filter ├── build.sh ├── extract.sh ├── cli.js ├── jq.filter ├── load.js ├── ci.sh ├── s3_upload.sh ├── repl.js ├── generate_tests.js └── wof_extract_sqlite.js ├── .gitignore ├── query ├── count_tokens.sql ├── has_subject_autocomplete.sql ├── match_subject_distinct_subject_ids.sql ├── match_subject_autocomplete_distinct_subject_ids.sql ├── index.js ├── match_subject_object_autocomplete.sql ├── match_subject_object.sql ├── build_rtree.sql ├── match_subject_object_geom_intersects.sql └── match_subject_object_geom_intersects_autocomplete.sql ├── .dockerignore ├── .github └── workflows │ ├── pull_request.yml │ ├── _test.yml │ └── push.yml ├── lib ├── jsonParseStream.js ├── permutations.js ├── sorted.js ├── Database.js ├── TokenIndex.js ├── Result.js ├── analysis.js ├── unicode.js ├── DocStore.js └── Queries.js ├── config └── language │ ├── alternatives.js │ ├── blacklist.js │ └── whitelist.js ├── Dockerfile ├── .jshintrc ├── Placeholder.js ├── test ├── integration.js ├── lib │ ├── permutations.js │ ├── jsonParseStream.js │ ├── sorted.js │ ├── Database.js │ ├── DocStore.js │ ├── analysis.js │ ├── TokenIndex.js │ ├── Result.js │ └── Queries.js ├── server │ └── routes │ │ ├── _util.js │ │ └── findbyid.js ├── units.js ├── prototype │ ├── query_integration.js │ ├── tokenize_integration.js │ ├── io.js │ ├── tokenize.js │ └── query.js ├── functional_autocomplete.js ├── case.js ├── functional.js └── cases │ └── capitalCities.txt ├── server ├── routes │ ├── _util.js │ ├── tokenize.js │ ├── query.js │ ├── findbyid.js │ └── search.js ├── http.js └── demo │ └── index.html ├── prototype ├── io.js ├── tokenize.js ├── query.js └── wof.js ├── LICENSE ├── package.json └── README.md /.npmrc: -------------------------------------------------------------------------------- 1 | package-lock=false 2 | -------------------------------------------------------------------------------- /.jshintignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | coverage 3 | reports -------------------------------------------------------------------------------- /cmd/server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | exec node server/http.js 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | data/* 3 | test/cases/generated.txt 4 | -------------------------------------------------------------------------------- /query/count_tokens.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) AS cnt 2 | FROM fulltext AS ft 3 | WHERE ft.fulltext MATCH $token_quoted 4 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | node_modules 3 | .dockerignore 4 | .gitignore 5 | .gitattributes 6 | Dockerfile 7 | README.md 8 | data/* 9 | test/cases/generated.txt 10 | -------------------------------------------------------------------------------- /cmd/loadtest.js: -------------------------------------------------------------------------------- 1 | 2 | var Placeholder = require('../Placeholder'), 3 | ph = new Placeholder(); 4 | 5 | ph.load(); 6 | console.log( 'loaded!' 
); 7 | // setInterval( function(){}, 1000 ); 8 | -------------------------------------------------------------------------------- /cmd/units: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # run tests with pipefail to avoid false passes 4 | # see https://github.com/pelias/pelias/issues/744 5 | set -euo pipefail 6 | 7 | node test/units.js | npx tap-spec 8 | -------------------------------------------------------------------------------- /cmd/integration: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # run tests with pipefail to avoid false passes 4 | # see https://github.com/pelias/pelias/issues/744 5 | set -euo pipefail 6 | 7 | node test/integration.js | npx tap-spec 8 | -------------------------------------------------------------------------------- /query/has_subject_autocomplete.sql: -------------------------------------------------------------------------------- 1 | SELECT id 2 | FROM tokens as t1 3 | JOIN fulltext AS f1 ON f1.rowid = t1.rowid 4 | WHERE f1.fulltext MATCH $subject 5 | -- AND t1.tag NOT IN ( 'colloquial' ) 6 | LIMIT 1 7 | -------------------------------------------------------------------------------- /cmd/placetype.filter: -------------------------------------------------------------------------------- 1 | "wof:placetype":\s*"\(ocean\|continent\|marinearea\|empire\|country\|dependency\|disputed\|macroregion\|region\|macrocounty\|county\|localadmin\|locality\|borough\|macrohood\|neighbourhood\)" 2 | -------------------------------------------------------------------------------- /query/match_subject_distinct_subject_ids.sql: -------------------------------------------------------------------------------- 1 | SELECT DISTINCT( t1.id ) AS subjectId 2 | FROM tokens AS t1 3 | WHERE t1.token = $subject 4 | -- AND t1.tag NOT IN ( 'colloquial' ) 5 | ORDER BY t1.id ASC 6 | LIMIT $limit 7 | -------------------------------------------------------------------------------- /.github/workflows/pull_request.yml: -------------------------------------------------------------------------------- 1 | name: Continuous Integration 2 | on: pull_request 3 | jobs: 4 | unit-tests: 5 | # only run this job for forks 6 | if: github.event.pull_request.head.repo.full_name != github.repository 7 | uses: ./.github/workflows/_test.yml 8 | -------------------------------------------------------------------------------- /query/match_subject_autocomplete_distinct_subject_ids.sql: -------------------------------------------------------------------------------- 1 | SELECT DISTINCT( t1.id ) AS subjectId 2 | FROM tokens AS t1 3 | JOIN fulltext AS f1 ON f1.rowid = t1.rowid 4 | WHERE f1.fulltext MATCH $subject 5 | -- AND t1.tag NOT IN ( 'colloquial' ) 6 | ORDER BY t1.id ASC 7 | LIMIT $limit 8 | -------------------------------------------------------------------------------- /cmd/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ); 4 | 5 | PLACEHOLDER_DATA=${PLACEHOLDER_DATA:-"./data"}; 6 | 7 | rm -f ${PLACEHOLDER_DATA}/store.sqlite3; 8 | 9 | cat ${PLACEHOLDER_DATA}/wof.extract | node ${DIR}/load.js 10 | 11 | echo 'Done!' 
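# usage sketch (assumption: a WOF extract has already been created, e.g. via
# ./cmd/extract.sh / `npm run extract`, since this script consumes it):
#   PLACEHOLDER_DATA=./data npm run extract   # writes ./data/wof.extract
#   PLACEHOLDER_DATA=./data npm run build     # rebuilds ./data/store.sqlite3 from the extract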
12 | -------------------------------------------------------------------------------- /query/index.js: -------------------------------------------------------------------------------- 1 | 2 | const fs = require('fs'); 3 | const path = require('path'); 4 | 5 | // load queries from filesystem 6 | module.exports = fs.readdirSync(__dirname).reduce((memo, filename) => { 7 | var sql = fs.readFileSync( path.join( __dirname, filename ), 'utf8' ).trim(); 8 | memo[ filename.replace('.sql', '' ) ] = sql; 9 | return memo; 10 | }, {}); 11 | -------------------------------------------------------------------------------- /lib/jsonParseStream.js: -------------------------------------------------------------------------------- 1 | var through = require('through2'); 2 | 3 | function streamFactory(){ 4 | return through.obj(function( row, _, next ){ 5 | 6 | try { 7 | this.push( JSON.parse( row ) ); 8 | } catch( e ){ 9 | console.error( 'invalid json', e ); 10 | } 11 | 12 | next(); 13 | }); 14 | } 15 | 16 | module.exports = streamFactory; 17 | -------------------------------------------------------------------------------- /cmd/extract.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ); 4 | 5 | PLACEHOLDER_DATA=${PLACEHOLDER_DATA:-"./data"}; 6 | 7 | mkdir -p ${PLACEHOLDER_DATA}; 8 | 9 | echo "Creating extract at ${PLACEHOLDER_DATA}/wof.extract" 10 | 11 | node --max_old_space_size=8000 ${DIR}/wof_extract_sqlite.js > ${PLACEHOLDER_DATA}/wof.extract; 12 | 13 | echo 'Done!' 14 | -------------------------------------------------------------------------------- /config/language/alternatives.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | 'tib': 'bod', 3 | 'cze': 'ces', 4 | 'wel': 'cym', 5 | 'ger': 'deu', 6 | 'gre': 'ell', 7 | 'baq': 'eus', 8 | 'per': 'fas', 9 | 'fre': 'fra', 10 | 'arm': 'hye', 11 | 'ice': 'isl', 12 | 'geo': 'kat', 13 | 'mac': 'mkd', 14 | 'mao': 'mri', 15 | 'may': 'msa', 16 | 'bur': 'mya', 17 | 'dut': 'nld', 18 | 'rum': 'ron', 19 | 'slo': 'slk', 20 | 'alb': 'sqi', 21 | 'chi': 'zho' 22 | }; -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # base image 2 | FROM pelias/baseimage 3 | 4 | # change working dir 5 | ENV WORKDIR /code/pelias/placeholder 6 | WORKDIR ${WORKDIR} 7 | 8 | # copy package.json first to prevent npm install being rerun when only code changes 9 | COPY ./package.json ${WORKDIR} 10 | RUN npm install 11 | 12 | # copy code from local checkout 13 | ADD . 
${WORKDIR} 14 | 15 | ENV PLACEHOLDER_DATA '/data/placeholder' 16 | 17 | USER pelias 18 | 19 | CMD [ "./cmd/server.sh" ] 20 | -------------------------------------------------------------------------------- /.jshintrc: -------------------------------------------------------------------------------- 1 | { 2 | "node": true, 3 | "curly": true, 4 | "eqeqeq": true, 5 | "esversion": 6, 6 | "freeze": true, 7 | "immed": true, 8 | "indent": 2, 9 | "latedef": false, 10 | "newcap": true, 11 | "noarg": true, 12 | "noempty": true, 13 | "nonbsp": true, 14 | "nonew": true, 15 | "plusplus": false, 16 | "quotmark": "single", 17 | "undef": true, 18 | "unused": false, 19 | "maxparams": 5, 20 | "maxdepth": 4, 21 | "maxlen": 140 22 | } 23 | -------------------------------------------------------------------------------- /query/match_subject_object_autocomplete.sql: -------------------------------------------------------------------------------- 1 | SELECT t1.id AS subjectId, t2.id as objectId 2 | FROM lineage AS l1 3 | JOIN tokens AS t1 ON t1.id = l1.id 4 | JOIN tokens AS t2 ON t2.id = l1.pid 5 | WHERE t1.token = $subject 6 | AND t2.token LIKE $object 7 | AND ( 8 | t1.lang = t2.lang OR 9 | t1.lang IN ( 'eng', 'und' ) OR 10 | t2.lang IN ( 'eng', 'und' ) 11 | ) 12 | -- AND t1.tag NOT IN ( 'colloquial' ) 13 | -- AND t2.tag NOT IN ( 'colloquial' ) 14 | GROUP BY t1.id, t2.id 15 | ORDER BY t1.id ASC, t2.id ASC 16 | LIMIT $limit 17 | -------------------------------------------------------------------------------- /Placeholder.js: -------------------------------------------------------------------------------- 1 | 2 | var _ = require('lodash'), 3 | DocStore = require('./lib/DocStore'), 4 | TokenIndex = require('./lib/TokenIndex'); 5 | 6 | // constructor 7 | function Placeholder( options ){ 8 | this.store = new DocStore( options ); 9 | this.index = new TokenIndex( options ); 10 | } 11 | 12 | // load prototype methods from modules 13 | Placeholder.prototype = _.extend( Placeholder.prototype, 14 | require('./prototype/io.js'), 15 | require('./prototype/query.js'), 16 | require('./prototype/tokenize.js'), 17 | require('./prototype/wof.js') 18 | ); 19 | 20 | module.exports = Placeholder; 21 | -------------------------------------------------------------------------------- /cmd/cli.js: -------------------------------------------------------------------------------- 1 | 2 | var Placeholder = require('../Placeholder'), 3 | ph = new Placeholder(); 4 | 5 | // init placeholder 6 | ph.load(); 7 | 8 | // -- user input -- 9 | var input = ( process.argv.slice(2) || [] ).join(' ') || ''; 10 | console.log( input + '\n' ); 11 | 12 | // -- search -- 13 | console.time('took'); 14 | ph.query( input, ( err, res ) => { 15 | console.timeEnd('took'); 16 | 17 | // print results 18 | ph.store.getMany( res.getIdsAsArray(), (err, docs) => { 19 | docs.forEach( doc => { 20 | console.log( ' -', [ doc.id, doc.placetype + ' ', doc.name ].join('\t') ); 21 | }); 22 | }); 23 | }); 24 | -------------------------------------------------------------------------------- /.github/workflows/_test.yml: -------------------------------------------------------------------------------- 1 | name: Unit Tests 2 | on: workflow_call 3 | jobs: 4 | unit-tests: 5 | runs-on: '${{ matrix.os }}' 6 | strategy: 7 | matrix: 8 | os: 9 | - ubuntu-22.04 10 | node-version: [ 20.x, 22.x, 24.x ] 11 | steps: 12 | - uses: actions/checkout@v4 13 | - name: 'Install node.js ${{ matrix.node-version }}' 14 | uses: actions/setup-node@v4 15 | with: 16 | node-version: '${{ matrix.node-version }}' 
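# note: the matrix above fans this job out to one run per node-version
# (20.x, 22.x and 24.x), each on ubuntu-22.04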
17 | - name: Run unit tests 18 | run: | 19 | [[ -f ./bin/ci-setup ]] && ./bin/ci-setup 20 | npm install 21 | npm run ci 22 | -------------------------------------------------------------------------------- /test/integration.js: -------------------------------------------------------------------------------- 1 | var tape = require('tape'); 2 | var path = require('path'); 3 | 4 | var tests = [ 5 | './lib/Queries', 6 | './prototype/tokenize_integration', 7 | './prototype/query_integration', 8 | './functional', 9 | './functional_autocomplete', 10 | ]; 11 | 12 | // test runner 13 | tests.map( function( testpath ){ 14 | 15 | var file = require( testpath ); 16 | 17 | var test = function( name, func ) { 18 | return tape( path.normalize( testpath ) + ': ' + name , func ); 19 | }; 20 | 21 | for( var testCase in file ){ 22 | if( 'function' === typeof file[testCase] ){ 23 | file[testCase]( test ); 24 | } 25 | } 26 | }); 27 | -------------------------------------------------------------------------------- /test/lib/permutations.js: -------------------------------------------------------------------------------- 1 | 2 | var permutations = require('../../lib/permutations'); 3 | 4 | module.exports.permutations = function(test, common) { 5 | test('permutations', function(t) { 6 | 7 | var input = [ 'soho', 'new', 'york', 'usa' ]; 8 | var expected = [ 9 | [ 'soho', 'new', 'york', 'usa' ], 10 | [ 'soho', 'new', 'york' ], 11 | [ 'soho', 'new' ], 12 | [ 'soho' ], 13 | [ 'new', 'york', 'usa' ], 14 | [ 'new', 'york' ], 15 | [ 'new' ], 16 | [ 'york', 'usa' ], 17 | [ 'york' ], 18 | [ 'usa' ] 19 | ]; 20 | 21 | t.deepEqual( permutations.expand( input ), expected ); 22 | t.end(); 23 | }); 24 | }; 25 | -------------------------------------------------------------------------------- /cmd/jq.filter: -------------------------------------------------------------------------------- 1 | .properties | with_entries( 2 | select(.key | test( 3 | "^(wof:(id|name|placetype|hierarchy|parent_id|country_alpha3|abbreviation|shortcode|superseded_by|label|population|megacity)$|" + 4 | "lbl:(bbox|latitude|longitude)$|" + 5 | "geom:(area|bbox|latitude|longitude)$|" + 6 | "iso:(country)$|" + 7 | "ne:(iso_a2|iso_a3|pop_est)$|" + 8 | "edtf:(deprecated)$|" + 9 | "mz:(is_current|population)$|" + 10 | "gn:(population|pop)$|" + 11 | "zs:(pop10)$|" + 12 | "qs:(pop|gn_pop|photo_sum)$|" + 13 | "wk:(population)$|" + 14 | "meso:(pop)$|" + 15 | "statoids:(population)$|" + 16 | "name:|" + 17 | "abrv:)" 18 | )) 19 | ) 20 | -------------------------------------------------------------------------------- /server/routes/_util.js: -------------------------------------------------------------------------------- 1 | 2 | // in express, if you pass query params like this `?param[]=value` 3 | // then the type of the param is Array and the code may be expecting a string. 4 | // this convenience function allows either form to be used. 5 | function arrayParam( param ){ 6 | var res = []; 7 | 8 | // accept param as array. eg: param[]=value 9 | if( Array.isArray( param ) ){ res = param; } 10 | 11 | // accept param as string. 
eg: param=value 12 | if( 'string' === typeof param ){ res = param.split(','); } 13 | 14 | // trim strings and remove empty elements 15 | return res.map(a => a.trim()).filter(a => a.length); 16 | } 17 | 18 | module.exports.arrayParam = arrayParam; 19 | -------------------------------------------------------------------------------- /test/server/routes/_util.js: -------------------------------------------------------------------------------- 1 | 2 | var util = require('../../../server/routes/_util'); 3 | 4 | module.exports.arrayParam = function(test, common) { 5 | test('arrayParam', function(t) { 6 | t.deepEqual( util.arrayParam(undefined), [], 'undefined' ); 7 | t.deepEqual( util.arrayParam(null), [], 'null' ); 8 | t.deepEqual( util.arrayParam(''), [], 'empty' ); 9 | t.deepEqual( util.arrayParam([]), [], 'empty array' ); 10 | t.deepEqual( util.arrayParam(['a ', ' b']), ['a','b'], 'array' ); 11 | t.deepEqual( util.arrayParam(' test '), ['test'], 'simple string' ); 12 | t.deepEqual( util.arrayParam(' test, foo '), ['test','foo'], 'delimited string' ); 13 | t.end(); 14 | }); 15 | }; 16 | -------------------------------------------------------------------------------- /server/routes/tokenize.js: -------------------------------------------------------------------------------- 1 | 2 | const PARTIAL_TOKEN_SUFFIX = require('../../lib/analysis').PARTIAL_TOKEN_SUFFIX; 3 | 4 | module.exports = function( req, res ){ 5 | 6 | // placeholder 7 | var ph = req.app.locals.ph; 8 | 9 | // input text 10 | var text = req.query.text || ''; 11 | 12 | // live mode (autocomplete-style search) 13 | // we append a byte indicating the last word is potentially incomplete. 14 | // except where the last token is a space, then we simply trim the space. 15 | if( req.query.mode === 'live' ){ 16 | if( ' ' === text.slice(-1) ){ 17 | text = text.trim(); 18 | } else { 19 | text += PARTIAL_TOKEN_SUFFIX; 20 | } 21 | } 22 | 23 | ph.tokenize( text, ( err, groups ) => { 24 | res.status(200).json( groups ); 25 | }); 26 | }; 27 | -------------------------------------------------------------------------------- /test/lib/jsonParseStream.js: -------------------------------------------------------------------------------- 1 | 2 | const through = require('through2'); 3 | const parser = require('../../lib/jsonParseStream'); 4 | 5 | module.exports.parse = function(test, common) { 6 | test('parse', function(t) { 7 | 8 | var chunks = []; 9 | 10 | const xform = (chunk, _, next) => { 11 | chunks.push( chunk ); 12 | next(); 13 | }; 14 | 15 | const flush = (next) => { 16 | t.deepEqual(chunks, [ 17 | { hello: 'world' }, 18 | { test: 'message' } 19 | ]); 20 | t.end(); 21 | next(); 22 | }; 23 | 24 | const stream = parser(); 25 | stream.pipe( through.obj( xform, flush ) ); 26 | stream.write('{ "hello": "world" }'); 27 | stream.write('{ "test": "message" }'); 28 | stream.end(); 29 | }); 30 | }; 31 | -------------------------------------------------------------------------------- /config/language/blacklist.js: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | This blacklist lists all the language codes we exclude from import into placeholder. 4 | 5 | The whosonfirst dataset contains many disused and rarely-used languages which 6 | can cause issues when the source data has been machine-transliterated. 7 | 8 | The list is non-exhaustive and was originally sourced from Wikipedia and various 9 | online sources; I aimed to include only the least commonly spoken languages. 
10 | 11 | If you feel a language code is wrong or missing, please feel free to edit this file. 12 | **/ 13 | 14 | // Enawene Nawe language 15 | // https://en.wikipedia.org/wiki/Enawene_Nawe_language 16 | module.exports.unk = ''; 17 | 18 | // Volapük 19 | // https://en.wikipedia.org/wiki/Volap%C3%BCk 20 | module.exports.vol = ''; 21 | -------------------------------------------------------------------------------- /query/match_subject_object.sql: -------------------------------------------------------------------------------- 1 | WITH l AS ( 2 | SELECT * 3 | FROM lineage 4 | WHERE id IN ( 5 | SELECT id 6 | FROM tokens 7 | WHERE token = $subject 8 | ) 9 | AND pid IN ( 10 | SELECT id 11 | FROM tokens 12 | WHERE token = $object 13 | ) 14 | ) 15 | SELECT 16 | l.id AS subjectId, 17 | l.pid AS objectId 18 | FROM l 19 | JOIN tokens AS t1 20 | INDEXED BY tokens_cover_idx 21 | USING (id) 22 | JOIN tokens AS t2 23 | INDEXED BY tokens_cover_idx 24 | ON t2.id = l.pid 25 | WHERE 26 | t1.token = $subject 27 | AND 28 | t2.token = $object 29 | AND ( 30 | t1.lang = t2.lang OR 31 | t1.lang IN ( 'eng', 'und' ) OR 32 | t2.lang IN ( 'eng', 'und' ) 33 | ) 34 | -- AND t1.tag NOT IN ( 'colloquial' ) 35 | -- AND t2.tag NOT IN ( 'colloquial' ) 36 | GROUP BY l.id, l.pid 37 | ORDER BY l.id ASC, l.pid ASC 38 | LIMIT $limit 39 | -------------------------------------------------------------------------------- /cmd/load.js: -------------------------------------------------------------------------------- 1 | const split = require('split2'); 2 | const through = require('through2'); 3 | const parser = require('../lib/jsonParseStream'); 4 | const Placeholder = require('../Placeholder'); 5 | const ph = new Placeholder(); 6 | 7 | // run import pipeline 8 | console.error('import...'); 9 | ph.load({ reset: true }); 10 | 11 | // run import 12 | process.stdin.pipe( split() ) 13 | .pipe( parser() ) 14 | .pipe( through.obj( function insert( row, _, next ){ 15 | ph.insertWofRecord( row, next ); 16 | }, function flush( done ){ 17 | console.error('populate fts...'); 18 | ph.populate(); 19 | console.error('optimize...'); 20 | ph.optimize(); 21 | console.error('close...'); 22 | ph.close(); 23 | done(); 24 | })); 25 | -------------------------------------------------------------------------------- /test/units.js: -------------------------------------------------------------------------------- 1 | var tape = require('tape'); 2 | var path = require('path'); 3 | 4 | var tests = [ 5 | './lib/jsonParseStream', 6 | './lib/analysis', 7 | './lib/permutations', 8 | './lib/sorted', 9 | './lib/Database', 10 | './lib/DocStore', 11 | './lib/TokenIndex', 12 | './lib/Result', 13 | './prototype/wof', 14 | './prototype/io', 15 | './prototype/tokenize', 16 | './prototype/query', 17 | './server/routes/_util.js', 18 | './server/routes/findbyid.js', 19 | ]; 20 | 21 | // test runner 22 | tests.map( function( testpath ){ 23 | 24 | var file = require( testpath ); 25 | 26 | var test = function( name, func ) { 27 | return tape( path.normalize( testpath ) + ': ' + name , func ); 28 | }; 29 | 30 | for( var testCase in file ){ 31 | if( 'function' === typeof file[testCase] ){ 32 | file[testCase]( test ); 33 | } 34 | } 35 | }); 36 | -------------------------------------------------------------------------------- /server/routes/query.js: -------------------------------------------------------------------------------- 1 | 2 | const PARTIAL_TOKEN_SUFFIX = require('../../lib/analysis').PARTIAL_TOKEN_SUFFIX; 3 | 4 | module.exports = function( req, res ){ 5 | 6 | // 
placeholder 7 | var ph = req.app.locals.ph; 8 | 9 | // input text 10 | var text = req.query.text || ''; 11 | 12 | // live mode (autocomplete-style search) 13 | // we append a byte indicating the last word is potentially incomplete. 14 | // except where the last token is a space, then we simply trim the space. 15 | if( req.query.mode === 'live' ){ 16 | if( ' ' === text.slice(-1) ){ 17 | text = text.trim(); 18 | } else { 19 | text += PARTIAL_TOKEN_SUFFIX; 20 | } 21 | } 22 | 23 | // perform query 24 | console.time('took'); 25 | ph.query( text, ( err, result ) => { 26 | console.timeEnd('took'); 27 | res.status(200).json( result.getIdsAsArray() ); 28 | }); 29 | }; 30 | -------------------------------------------------------------------------------- /test/prototype/query_integration.js: -------------------------------------------------------------------------------- 1 | 2 | var Placeholder = require('../../Placeholder'); 3 | 4 | module.exports.query = function(test, util) { 5 | 6 | // load data 7 | var ph = new Placeholder(); 8 | ph.load(); 9 | 10 | var assert = runner.bind(null, test, ph); 11 | 12 | assert([['kelburn', 'wellington', 'new zealand']], [1729339019]); 13 | assert([['north sydney']], [85784821, 101931469, 102048877, 404225393, 1310698409]); 14 | assert([['sydney', 'new south wales', 'australia']], [101932003, 102049151, 404226357, 1376953385, 1377004395]); 15 | assert([['ケープタウン', '南アフリカ']], [101928027]); 16 | }; 17 | 18 | // convenience function for writing quick 'n easy test cases 19 | function runner( test, ph, actual, expected ){ 20 | test( actual, function(t) { 21 | ph.query( actual[0].join(' '), ( err, res ) => { 22 | t.deepEqual( res.getIdsAsArray(), expected ); 23 | t.end(); 24 | }); 25 | }); 26 | } 27 | -------------------------------------------------------------------------------- /query/build_rtree.sql: -------------------------------------------------------------------------------- 1 | 2 | -- create virtual table 3 | CREATE VIRTUAL TABLE IF NOT EXISTS rtree USING rtree( 4 | id, -- Integer primary key 5 | minX, maxX, -- Minimum and maximum X coordinate 6 | minY, maxY, -- Minimum and maximum Y coordinate 7 | minZ, maxZ -- Minimum and maximum 'rank' 8 | ); 9 | 10 | -- delete existing values 11 | DELETE FROM rtree; 12 | 13 | -- fill rtree 14 | INSERT INTO rtree 15 | SELECT 16 | id, 17 | json_extract( json( '[' || json_extract( json, '$.geom.bbox' ) || ']' ), '$[0]' ) AS minX, 18 | json_extract( json( '[' || json_extract( json, '$.geom.bbox' ) || ']' ), '$[2]' ) AS maxX, 19 | json_extract( json( '[' || json_extract( json, '$.geom.bbox' ) || ']' ), '$[1]' ) AS minY, 20 | json_extract( json( '[' || json_extract( json, '$.geom.bbox' ) || ']' ), '$[3]' ) AS maxY, 21 | json_extract( json, '$.rank.min' ) AS minZ, 22 | json_extract( json, '$.rank.max' ) AS maxZ 23 | FROM docs; -------------------------------------------------------------------------------- /cmd/ci.sh: -------------------------------------------------------------------------------- 1 | # Download Placeholder data for tests 2 | BUCKET=https://data.geocode.earth/placeholder 3 | 4 | export AGENT="github/${GITHUB_ACTOR}" 5 | export REFERER="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}" 6 | 7 | if [ ! -e data/store.sqlite3 ]; then 8 | # ensure data directory exists 9 | mkdir -p data 10 | 11 | # attempt to download today's data first, fall back to latest if not found 12 | echo "Downloading placeholder data..." 
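# e.g. this first tries ${BUCKET}/2024-05-01/store.sqlite3.gz (date shown is
# illustrative), then falls back to the undated ${BUCKET}/store.sqlite3.gz below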
13 | curl -A "${AGENT}" -e "${REFERER}" -sfo data/store.sqlite3.gz ${BUCKET}/$(date +%Y-%m-%d)/store.sqlite3.gz || true 14 | [ -e data/store.sqlite3.gz ] || curl -A "${AGENT}" -e "${REFERER}" -so data/store.sqlite3.gz ${BUCKET}/store.sqlite3.gz 15 | 16 | # decompress the sqlite database 17 | echo "Decompressing placeholder data..." 18 | gunzip -f data/store.sqlite3.gz 19 | fi 20 | 21 | # check sqlite3 version 22 | sqlite3 --version 23 | 24 | # install npm dependencies 25 | npm install 26 | 27 | # run all tests 28 | npm run all 29 | -------------------------------------------------------------------------------- /prototype/io.js: -------------------------------------------------------------------------------- 1 | 2 | // plugin to handle I/O 3 | const path = require('path'); 4 | 5 | // load data from disk 6 | module.exports.load = function( opts ){ 7 | const dataDir = process.env.PLACEHOLDER_DATA || path.join( __dirname, '../data/'); 8 | const dbPath = path.join( dataDir, 'store.sqlite3' ); 9 | 10 | this.store.open( dbPath, opts ); // document store 11 | this.index.open( dbPath, opts ); // token index 12 | }; 13 | 14 | // populate databases 15 | module.exports.populate = function(){ 16 | this.store.populate(); 17 | this.index.populate(); 18 | }; 19 | 20 | // optimize databases 21 | module.exports.optimize = function(){ 22 | this.index.optimize(); 23 | }; 24 | 25 | // check schema of databases match 26 | // the schema expected by the codebase 27 | module.exports.checkSchema = function(){ 28 | this.store.checkSchema(); 29 | this.index.checkSchema(); 30 | }; 31 | 32 | // gracefully close connections 33 | module.exports.close = function(){ 34 | this.store.close(); 35 | this.index.close(); 36 | }; 37 | -------------------------------------------------------------------------------- /query/match_subject_object_geom_intersects.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | t1.id AS subjectId, 3 | t2.id as objectId 4 | FROM fulltext f1 5 | JOIN tokens t1 ON ( 6 | f1.rowid = t1.rowid 7 | AND f1.fulltext MATCH $subject_quoted 8 | AND LIKELY(t1.token = $subject) 9 | ) 10 | JOIN rtree AS r1 ON t1.id = r1.id 11 | JOIN rtree AS r2 ON ( 12 | r1.maxZ < r2.minZ AND 13 | (r1.minX - $threshold) < r2.maxX AND 14 | (r1.maxX + $threshold) > r2.minX AND 15 | (r1.minY - $threshold) < r2.maxY AND 16 | (r1.maxY + $threshold) > r2.minY 17 | ) 18 | JOIN fulltext AS f2 ON f2.fulltext MATCH $object_quoted 19 | JOIN tokens t2 ON ( 20 | f2.rowid = t2.rowid 21 | AND r2.id = t2.id 22 | AND LIKELY(t2.token = $object) 23 | AND ( 24 | t1.lang = t2.lang OR 25 | t1.lang IN ('eng', 'und') OR 26 | t2.lang IN ('eng', 'und') 27 | ) 28 | ) 29 | GROUP BY t1.id, t2.id 30 | ORDER BY t1.id ASC, t2.id ASC 31 | LIMIT $limit 32 | -------------------------------------------------------------------------------- /test/functional_autocomplete.js: -------------------------------------------------------------------------------- 1 | 2 | var Placeholder = require('../Placeholder'); 3 | 4 | module.exports.functional = function(test, util) { 5 | 6 | // load data 7 | var ph = new Placeholder(); 8 | ph.load(); 9 | 10 | var assert = runner.bind(null, test, ph); 11 | 12 | assert('Kelbur\x26', [1326645067, 1729339019]); 13 | assert('Kelburn\x26', [1326645067, 1729339019]); 14 | assert('Kelburn W\x26', [1729339019]); 15 | assert('Kelburn Well\x26', [1729339019]); 16 | assert('Kelburn Wellington\x26', [1729339019]); 17 | assert('Kelburn Wellington New\x26', [1729339019]); 18 | assert('Kelburn Wellington 
New Z\x26', [1729339019]); 19 | assert('Kelburn Wellington New Zeal\x26', [1729339019]); 20 | assert('Kelburn Wellington New Zealand\x26', [1729339019]); 21 | }; 22 | 23 | // convenience function for writing quick 'n easy test cases 24 | function runner( test, ph, actual, expected ){ 25 | test( actual, function(t) { 26 | ph.query( actual, ( err, res ) => { 27 | t.deepEqual( res.getIdsAsArray(), expected ); 28 | t.end(); 29 | }); 30 | }); 31 | } 32 | -------------------------------------------------------------------------------- /query/match_subject_object_geom_intersects_autocomplete.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | t1.id AS subjectId, 3 | t2.id as objectId 4 | FROM fulltext f1 5 | JOIN tokens t1 ON ( 6 | f1.rowid = t1.rowid 7 | AND f1.fulltext MATCH $subject_quoted 8 | AND LIKELY(t1.token = $subject) 9 | ) 10 | JOIN rtree AS r1 ON t1.id = r1.id 11 | JOIN rtree AS r2 ON ( 12 | r1.maxZ < r2.minZ AND 13 | (r1.minX - $threshold) < r2.maxX AND 14 | (r1.maxX + $threshold) > r2.minX AND 15 | (r1.minY - $threshold) < r2.maxY AND 16 | (r1.maxY + $threshold) > r2.minY 17 | ) 18 | JOIN fulltext AS f2 ON f2.fulltext MATCH $object_quoted OR $object_quoted* 19 | JOIN tokens t2 ON ( 20 | f2.rowid = t2.rowid 21 | AND r2.id = t2.id 22 | AND LIKELY(t2.token = $object OR t2.token LIKE ($object || '%')) 23 | AND ( 24 | t1.lang = t2.lang OR 25 | t1.lang IN ('eng', 'und') OR 26 | t2.lang IN ('eng', 'und') 27 | ) 28 | ) 29 | GROUP BY t1.id, t2.id 30 | ORDER BY t1.id ASC, t2.id ASC 31 | LIMIT $limit 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 pelias 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /lib/permutations.js: -------------------------------------------------------------------------------- 1 | 2 | var GROUP_MIN = 1; 3 | var GROUP_MAX = 6; 4 | 5 | // produce all the possible token groups from adjacent input tokens (without reordering tokens) 6 | 7 | module.exports.expand = function( tokens ){ 8 | 9 | var groups = []; 10 | 11 | // favour smaller tokens over larger ones 12 | // for( var i=0; i<tokens.length; i++ ){ 13 | //   for( var j=i+GROUP_MIN; j<=i+GROUP_MAX; j++ ){ 14 | //     if( j > tokens.length ){ break; } 15 | //     groups.push( tokens.slice( i, j ) ); 16 | //   } 17 | // } 18 | 19 | // favour larger tokens over shorter ones 20 | for( var i=0; i<tokens.length; i++ ){ 21 | for( var j=i+GROUP_MAX; j>=i+GROUP_MIN; j-- ){ 22 | if( j <= tokens.length ){ 23 | groups.push( tokens.slice( i, j ) ); 24 | } 25 | } 26 | } 27 | 28 | return groups; 29 | }; 30 | 31 | /** 32 | example: 33 | 34 | input: [ 'soho', 'new', 'york', 'usa' ] 35 | 36 | output: [ 37 | [ 'soho', 'new', 'york', 'usa' ], 38 | [ 'soho', 'new', 'york' ], 39 | [ 'soho', 'new' ], 40 | [ 'soho' ], 41 | [ 'new', 'york', 'usa' ], 42 | [ 'new', 'york' ], 43 | [ 'new' ], 44 | [ 'york', 'usa' ], 45 | [ 'york' ], 46 | [ 'usa' ] 47 | ] 48 | **/ 49 | -------------------------------------------------------------------------------- /server/routes/findbyid.js: -------------------------------------------------------------------------------- 1 | 2 | module.exports = function( req, res ){ 3 | 4 | // placeholder 5 | var ph = req.app.locals.ph; 6 | 7 | var ids = ( req.query.ids || '' ).split(',').map( function( id ){ 8 | return parseInt( id.trim(), 10 ); 9 | }).filter( function( id ){ 10 | return !isNaN( id ); 11 | }); 12 | 13 | var lang; 14 | if( 'string' === typeof req.query.lang && req.query.lang.length === 3 ){ 15 | lang = req.query.lang.toLowerCase(); 16 | } 17 | 18 | // load docs 19 | ph.store.getMany( ids, function( err, documents ){ 20 | if( err ){ return res.status(500).send({}); } 21 | if( !documents || !documents.length ){ return res.status(404).send({}); } 22 | 23 | var docs = {}; 24 | for( var i=0; i 23 | if [[ "${{ github.repository_owner }}" == "pelias" ]]; then 24 | curl "https://raw.githubusercontent.com/pelias/ci-tools/master/semantic-release.sh" | bash - 25 | fi 26 | build-docker-images: 27 | # run this job if the unit tests passed and the npm-publish job was a success or was skipped 28 | # note: github actions won't run a job if you don't call one of the status check functions, so `always()` is called since it evaluates to `true` 29 | if: ${{ always() && needs.unit-tests.result == 'success' && (needs.npm-publish.result == 'success' || needs.npm-publish.result == 'skipped') }} 30 | needs: [unit-tests, npm-publish] 31 | runs-on: ubuntu-24.04 32 | steps: 33 | - uses: actions/checkout@v4 34 | - name: Build Docker images 35 | env: 36 | DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }} 37 | DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} 38 | run: | 39 | curl "https://raw.githubusercontent.com/pelias/ci-tools/master/build-docker-images.sh" | bash - 40 | -------------------------------------------------------------------------------- /test/lib/sorted.js: -------------------------------------------------------------------------------- 1 | 2 | var sorted = require('../../lib/sorted'); 3 | 4 | // sort 5 | module.exports.sort = function(test, common) { 6 | 7 | // test runner 8 | var assert = function( actual, expected ){ 9 | test( 'sort', function(t) { 10 | t.deepEqual( sorted.sort( actual ), expected ); 11 | t.end(); 12 | }); 13 | }; 14 | 15 | assert([0, 10, 4, -1, 5, 5, 3], [ -1, 0, 3, 4, 5, 
5, 10 ]); 16 | assert([0, 9, 4, -10, 5, 5, 2], [ -10, 0, 2, 4, 5, 5, 9 ]); 17 | }; 18 | 19 | // sorted merge 20 | module.exports.merge = function(test, common) { 21 | 22 | // test runner 23 | var assert = function( a, b, expected ){ 24 | test( 'merge', function(t) { 25 | t.deepEqual( sorted.merge( a, b ), expected ); 26 | t.end(); 27 | }); 28 | }; 29 | 30 | assert( 31 | [ -1, 0, 3, 4, 5, 5, 10 ], 32 | [ -10, 0, 2, 4, 5, 5, 9 ], 33 | [ -10, -1, 0, 2, 3, 4, 5, 9, 10 ] 34 | ); 35 | }; 36 | 37 | // sorted intersect 38 | module.exports.intersect = function(test, common) { 39 | 40 | // test runner 41 | var assert = function( a, b, expected ){ 42 | test( 'intersect', function(t) { 43 | t.deepEqual( sorted.intersect([ a, b ]), expected ); 44 | t.end(); 45 | }); 46 | }; 47 | 48 | assert( 49 | [ -1, 0, 3, 4, 5, 5, 10 ], 50 | [ -10, 0, 2, 4, 5, 5, 9 ], 51 | [ 0, 4, 5, 5 ] 52 | ); 53 | }; 54 | 55 | // sorted unique 56 | module.exports.unique = function(test, common) { 57 | 58 | // test runner 59 | var assert = function( a, expected ){ 60 | test( 'unique', function(t) { 61 | t.deepEqual( sorted.unique( a ), expected ); 62 | t.end(); 63 | }); 64 | }; 65 | 66 | assert( 67 | [ -1, 0, 0, 3, 4, 5, 5, 10 ], 68 | [ -1, 0, 3, 4, 5, 10 ] 69 | ); 70 | }; 71 | -------------------------------------------------------------------------------- /cmd/s3_upload.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | # directory of this file 5 | DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) 6 | DATA_DIR=${PLACEHOLDER_DATA:-"${DIR}/../data"} 7 | BUCKET='s3://pelias-data.nextzen.org/placeholder' 8 | TODAY=`date +%Y-%m-%d` 9 | 10 | echo '--- gzipping data files ---' 11 | if type pigz >/dev/null 12 | then 13 | pigz -k -c --best "${DATA_DIR}/store.sqlite3" > "${DATA_DIR}/store.sqlite3.gz" 14 | pigz -k -c --best "${DATA_DIR}/wof.extract" > "${DATA_DIR}/wof.extract.gz" 15 | else 16 | gzip -c --best "${DATA_DIR}/store.sqlite3" > "${DATA_DIR}/store.sqlite3.gz" 17 | gzip -c --best "${DATA_DIR}/wof.extract" > "${DATA_DIR}/wof.extract.gz" 18 | fi 19 | 20 | echo '--- uploading archive ---' 21 | aws s3 cp "${DATA_DIR}/store.sqlite3.gz" "${BUCKET}/archive/${TODAY}/store.sqlite3.gz" --region us-east-1 --acl public-read 22 | aws s3 cp "${DATA_DIR}/wof.extract.gz" "${BUCKET}/archive/${TODAY}/wof.extract.gz" --region us-east-1 --acl public-read 23 | 24 | echo '--- list remote archive ---' 25 | aws s3 ls --human-readable "${BUCKET}/archive/${TODAY}/" 26 | 27 | echo -e "\n> would you like to promote this build to production (yes/no)?" 
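# note: the answer is read from stdin, so a non-interactive promotion could be
# scripted as `echo yes | ./cmd/s3_upload.sh` (hypothetical invocation)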
28 | read answer 29 | 30 | if [ "$answer" == "yes" ] || [ "$answer" == "y" ]; then 31 | echo '--- promoting build to production ---' 32 | aws s3 cp "${BUCKET}/archive/${TODAY}/store.sqlite3.gz" "${BUCKET}/store.sqlite3.gz" --region us-east-1 --acl public-read 33 | aws s3 cp "${BUCKET}/archive/${TODAY}/wof.extract.gz" "${BUCKET}/wof.extract.gz" --region us-east-1 --acl public-read 34 | 35 | echo '--- list remote production files ---' 36 | aws s3 ls --human-readable "${BUCKET}/" 37 | else 38 | echo 'you did not answer yes, the build was not promoted to production' 39 | fi 40 | -------------------------------------------------------------------------------- /cmd/repl.js: -------------------------------------------------------------------------------- 1 | 2 | var repl = require('repl'), 3 | Placeholder = require('../Placeholder'), 4 | ph = new Placeholder(); 5 | 6 | // init placeholder 7 | ph.load(); 8 | 9 | // commands 10 | var commands = { 11 | search: function( input, cb ){ 12 | console.time('took'); 13 | ph.query( input, ( err, res ) => { 14 | ph.store.getMany( res.getIdsAsArray(), ( err, docs ) => { 15 | if( err ){ return console.error( err ); } 16 | docs.forEach( doc => { 17 | console.log( ' -', [ doc.id, doc.placetype + ' ', doc.name ].join('\t') ); 18 | }); 19 | console.timeEnd('took'); 20 | cb(); 21 | }); 22 | }); 23 | }, 24 | tokenize: function( input, cb ){ 25 | console.time('took'); 26 | ph.tokenize( input, ( err, groups ) => { 27 | console.timeEnd('took'); 28 | console.log( groups ); 29 | cb(); 30 | }); 31 | }, 32 | token: function( body, cb ){ 33 | console.log( 'token', '"' + body + '"' ); 34 | console.time('took'); 35 | ph.index.matchSubjectDistinctSubjectIds( body, ( err, rows ) => { 36 | const subjectIds = rows.map( row => row.subjectId ); 37 | console.timeEnd('took'); 38 | console.log( subjectIds ); 39 | cb(); 40 | }); 41 | }, 42 | id: function( id, cb ){ 43 | console.time('took'); 44 | ph.store.get( id, ( err, doc ) => { 45 | if( err ){ return console.error( err ); } 46 | // console.log( ' -', [ doc.id, doc.placetype + ' ', doc.name ].join('\t') ); 47 | console.log( doc ); 48 | console.timeEnd('took'); 49 | cb(); 50 | }); 51 | } 52 | }; 53 | 54 | function myEval(cmd, context, filename, cb) { 55 | var split = cmd.trim().split(/\s+/g); 56 | if( commands.hasOwnProperty( split[0] ) ){ 57 | return commands[ split[0] ].call( null, split.splice(1).join(' '), cb ); 58 | } 59 | commands.search( split.join(' '), cb ); 60 | } 61 | 62 | // open the repl session 63 | var prompt = repl.start({ prompt: 'placeholder > ', eval: myEval }); 64 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pelias-placeholder", 3 | "version": "0.0.0-development", 4 | "engines": { 5 | "node": ">=10.0.0" 6 | }, 7 | "author": "mapzen", 8 | "license": "MIT", 9 | "main": "server.js", 10 | "scripts": { 11 | "test": "npm run units", 12 | "units": "./cmd/units", 13 | "integration": "./cmd/integration", 14 | "funcs": "for case in test/cases/*.txt; do node test/case.js $case; done", 15 | "all": "npm run units && npm run integration && npm run funcs", 16 | "start": "./cmd/server.sh", 17 | "extract": "bash ./cmd/extract.sh", 18 | "build": "bash ./cmd/build.sh", 19 | "gentests": "cat data/wof.extract | node cmd/generate_tests.js > test/cases/generated.txt", 20 | "repl": "node cmd/repl.js", 21 | "cli": "node cmd/cli.js", 22 | "lint": "jshint .", 23 | "validate": "npm ls", 24 | 
"ci": "./cmd/ci.sh" 25 | }, 26 | "repository": { 27 | "type": "git", 28 | "url": "https://github.com/pelias/placeholder.git" 29 | }, 30 | "bugs": { 31 | "url": "https://github.com/pelias/placeholder/issues" 32 | }, 33 | "homepage": "https://github.com/pelias/placeholder#readme", 34 | "dependencies": { 35 | "async": "^3.0.1", 36 | "better-sqlite3": "^12.2.0", 37 | "express": "^4.15.2", 38 | "lodash": "^4.17.21", 39 | "lower-case": "^2.0.0", 40 | "morgan": "^1.9.0", 41 | "pelias-blacklist-stream": "^1.1.0", 42 | "pelias-config": "^4.5.0", 43 | "pelias-logger": "^1.2.1", 44 | "pelias-whosonfirst": "^8.1.0", 45 | "regenerate": "^1.4.2", 46 | "remove-accents-diacritics": "^1.0.2", 47 | "require-dir": "^1.0.0", 48 | "sorted-intersect": "^0.1.4", 49 | "split2": "^3.0.0", 50 | "through2": "^3.0.0" 51 | }, 52 | "devDependencies": { 53 | "jshint": "^2.5.6", 54 | "precommit-hook": "^3.0.0", 55 | "tap-spec": "^5.0.0", 56 | "tape": "^5.0.0" 57 | }, 58 | "pre-commit": [ 59 | "lint", 60 | "validate", 61 | "test" 62 | ], 63 | "release": { 64 | "branch": "master", 65 | "success": [] 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /test/case.js: -------------------------------------------------------------------------------- 1 | 2 | var fs = require('fs'), 3 | path = require('path'), 4 | assert = require('assert'), 5 | split = require('split2'), 6 | through = require('through2'), 7 | Placeholder = require('../Placeholder'); 8 | 9 | /** 10 | this test reads the 'test/cases/*.txt' files (if present) and uses it's lines 11 | to generate test cases. 12 | 13 | see: README.md for more info on how to generate test cases. 14 | **/ 15 | 16 | // ensure the file is available in the filesystem 17 | var testcasePath = process.argv[2]; 18 | try { fs.statSync( testcasePath ); } 19 | catch( e ) { 20 | return console.error('%s not found, skipping test', testcasePath); 21 | } 22 | 23 | console.error( '----------- ' + testcasePath + ' -----------' ); 24 | 25 | // -------------- 26 | 27 | // load placeholder data 28 | var ph = new Placeholder(); 29 | ph.load(); 30 | 31 | // stream the test cases, run them one-by-one 32 | var stream = fs.createReadStream( testcasePath, 'utf8' ); 33 | stream.pipe( split() ) 34 | .pipe( through( function( line, _, next ){ 35 | if( !line.length ){ return; } // skip empty lines 36 | var split = line.toString('utf8').split(' '); 37 | var id = parseInt( split[0], 10 ); 38 | runner( ph, split.slice(1).join(' '), id, next ); 39 | }, function( done ){ 40 | console.log(); 41 | done(); 42 | })); 43 | 44 | // -------------- 45 | 46 | // convenience function for writing quick 'n easy test cases 47 | function runner( ph, actual, expected, next ){ 48 | ph.query( actual, ( err, res ) => { 49 | 50 | const ids = res.getIdsAsArray(); 51 | process.stderr.write('.'); 52 | 53 | try { 54 | assert.ok( -1 !== ids.indexOf( expected ), 'id found in results' ); 55 | } 56 | 57 | catch( e ){ 58 | console.log('\n'); 59 | console.log('input: ', actual); 60 | console.log('expected: ', expected); 61 | console.log('actual: ', ids.join(', ')); 62 | console.log(); 63 | } 64 | 65 | finally { 66 | next(); 67 | } 68 | 69 | }); 70 | } 71 | -------------------------------------------------------------------------------- /lib/Database.js: -------------------------------------------------------------------------------- 1 | 2 | var Sqlite3 = require('better-sqlite3'); 3 | 4 | // generic sqlite database 5 | function Database(){} 6 | 7 | Database.prototype.open = function( path, options ){ 
8 | 9 | // set up a safe environment for running tests. 10 | // note: usually in-memory databases using the same 11 | // path would share the same database reference. 12 | if( options && true === options.test ){ 13 | path = ':memory:'; 14 | } 15 | 16 | // open connection 17 | this.db = new Sqlite3( path, options ); 18 | 19 | // configure database tables 20 | this.configure(); 21 | 22 | // reset data (clear all previous data and recreate schemas) 23 | if( options && true === options.reset ){ 24 | this.reset(); 25 | this.optimize(); 26 | } 27 | }; 28 | 29 | Database.prototype.close = function(){ 30 | this.db.close(); 31 | }; 32 | 33 | Database.prototype.prepare = function( sql ){ 34 | if( !this.hasOwnProperty('stmt') ){ this.stmt = {}; } 35 | if( !this.stmt.hasOwnProperty( sql ) ){ 36 | this.stmt[ sql ] = this.db.prepare( sql ); 37 | } 38 | return this.stmt[ sql ]; 39 | }; 40 | 41 | Database.prototype.configure = function(){ 42 | this.db.pragma('foreign_keys=OFF'); // we don't enforce foreign key constraints 43 | this.db.pragma('page_size=4096'); // (default: 1024) 44 | this.db.pragma('cache_size=-2000'); // (default: -2000, i.e. ~2MB) 45 | this.db.pragma('synchronous=OFF'); 46 | this.db.pragma('journal_mode=MEMORY'); 47 | this.db.pragma('temp_store=MEMORY'); 48 | }; 49 | 50 | Database.prototype.reset = function(){ /* no-op */ }; 51 | Database.prototype.populate = function(){ /* no-op */ }; 52 | Database.prototype.checkSchema = function(){ /* no-op */ }; 53 | Database.prototype.optimize = function(){ 54 | this.db.exec('VACUUM'); 55 | }; 56 | 57 | // convenience function to validate a table schema against 58 | // an expected schema, throwing an error if they do not match. 59 | Database.assertSchema = function( db, tableName, expected ){ 60 | const actual = db.prepare('PRAGMA table_info(' + tableName + ')').all(); 61 | if( JSON.stringify(actual) !== JSON.stringify(expected) ){ 62 | throw new Error( 'schema invalid: table ' + tableName ); 63 | } 64 | }; 65 | 66 | module.exports = Database; 67 | -------------------------------------------------------------------------------- /test/functional.js: -------------------------------------------------------------------------------- 1 | 2 | var Placeholder = require('../Placeholder'); 3 | 4 | module.exports.functional = function(test, util) { 5 | 6 | // load data 7 | var ph = new Placeholder(); 8 | ph.load(); 9 | 10 | var assert = runner.bind(null, test, ph); 11 | 12 | assert('Kelburn Wellington New Zealand', [1729339019]); 13 | assert('North Sydney', [85784821, 101931469, 102048877, 404225393, 1310698409]); 14 | assert('Sydney New South Wales Australia', [101932003, 102049151, 404226357, 1376953385, 1377004395]); 15 | assert('ケープタウン 南アフリカ', [101928027]); 16 | 17 | // possible duplicates 18 | // see: https://github.com/whosonfirst-data/whosonfirst-data/issues/1841 19 | assert('경기도 광명시', [102026551, 890472589]); 20 | assert('부산광역시 부산진구', [890475779, 890476045]); 21 | 22 | assert('서울 마포구', [890473201]); 23 | assert('전라북도 전주시 완산구', [102026471]); 24 | 25 | assert('london on', [ 101735809 ]); 26 | assert('paris, tx', [ 101725293 ]); 27 | 28 | assert('123 apple bay ave neutral bay north sydney new south wales au', 29 | [ 101931387, 404225267 ] 30 | ); 31 | 32 | assert('30 w 26th st ny nyc 10117 ny usa', [ 85977539 ]); 33 | 34 | // should not include county: 102081377, or localadmin: 404482867 35 | assert('lancaster lancaster pa', [ 101718643, 404487183, 404487185, 1729458067, 1729466275 ]); 36 | 37 | // assertions from pelias acceptance-test suite 38 | assert('灣仔, 
香港', [85671779, 1243098523]); 39 | assert('new york city, usa', [85977539]); 40 | assert('sendai, japan', [102031919, 1108739995, 1125901991, 1243269829]); 41 | assert('Észak-Alföld', [404227483]); 42 | assert('Comunidad Foral De Navarra, ES', [404227391]); 43 | assert('Île-De-France, France', [404227465]); 44 | assert('Dél-Dunántúl, HU', [404227491]); 45 | assert('Sardegna, Italy', [404227535]); 46 | assert('Közép-Magyarország, Hungary', [404227489]); 47 | 48 | // All tokens should be in the same language 49 | // Parijs = Paris (nl); Francia = France (it) 50 | // see: https://github.com/pelias/placeholder/pull/195 51 | assert('Parijs Francia', [1225878855]); 52 | }; 53 | 54 | // convenience function for writing quick 'n easy test cases 55 | function runner( test, ph, actual, expected ){ 56 | test( actual, function(t) { 57 | ph.query( actual, ( err, res ) => { 58 | t.deepEqual( res.getIdsAsArray(), expected ); 59 | t.end(); 60 | }); 61 | }); 62 | } 63 | -------------------------------------------------------------------------------- /test/prototype/tokenize_integration.js: -------------------------------------------------------------------------------- 1 | 2 | var Placeholder = require('../../Placeholder'); 3 | 4 | module.exports.tokenize = function(test, util) { 5 | 6 | // load data 7 | var ph = new Placeholder(); 8 | ph.load(); 9 | 10 | var assert = runner.bind(null, test, ph); 11 | 12 | assert('Kelburn Wellington New Zealand', [['kelburn', 'wellington', 'new zealand']]); 13 | assert('Sydney New South Wales Australia', [['sydney', 'new south wales', 'australia']]); 14 | assert('ケープタウン 南アフリカ', [['ケーフタウン', '南アフリカ']]); 15 | 16 | // duplicates 17 | assert('lancaster lancaster pa', [['lancaster', 'lancaster', 'pa']]); 18 | 19 | // korean place names 20 | assert('세종특별자치시', [['세종특별자치시']]); 21 | 22 | // synonymous groupings 23 | // see: https://github.com/pelias/placeholder/issues/28 24 | // note: the 'Le Cros-d’Utelle, France' example (as at 20-09-17) no longer dedupes 25 | // to a single grouping due to the introduction of the token 'le' from 85685547 26 | assert('Le Cros-d’Utelle, France', [['le crosdutelle', 'france' ], [ 'le cros d utelle', 'france']]); 27 | assert('luxemburg luxemburg', [['luxemburg', 'luxemburg']]); // does not remove duplicate tokens 28 | 29 | // ambiguous parses 30 | // @note: these are the glorious future: 31 | 32 | // assert('Adams North Brunswick', [ 33 | // [ 'adams north', 'brunswick' ], 34 | // [ 'adams', 'north brunswick' ] 35 | // ]); 36 | // 37 | // assert('Heritage East San Jose', [ 38 | // [ 'heritage east', 'san jose' ], 39 | // [ 'heritage', 'east san jose' ] 40 | // ]); 41 | // 42 | // assert('bay ave neutral bay north sydney', [ 43 | // [ 'bay', 'neutral bay', 'north sydney' ], 44 | // [ 'bay', 'neutral bay', 'north', 'sydney' ] 45 | // ]); 46 | // 47 | // assert('mitte mitte berlin de', [ 48 | // [ 'mitte berlin', 'de' ], 49 | // [ 'mitte', 'mitte berlin', 'de' ], 50 | // [ 'mitte', 'mitte', 'berlin', 'de' ] 51 | // ]); 52 | // 53 | // assert('North Sydney', [ 54 | // [ 'north sydney' ], 55 | // [ 'north', 'sydney' ] 56 | // ]); 57 | // 58 | // assert('neutral bay north sydney', [ 59 | // [ 'neutral bay', 'north sydney' ], 60 | // [ 'neutral bay', 'north', 'sydney' ] 61 | // ]); 62 | }; 63 | 64 | // convenience function for writing quick 'n easy test cases 65 | function runner( test, ph, actual, expected ){ 66 | test( actual, function(t) { 67 | ph.tokenize( actual, ( err, queries ) => { 68 | t.deepEqual( queries, expected ); 69 | t.end(); 70 | }); 71 | 
}); 72 | } 73 | -------------------------------------------------------------------------------- /cmd/generate_tests.js: -------------------------------------------------------------------------------- 1 | 2 | var split = require('split2'), 3 | through = require('through2'), 4 | parser = require('../lib/jsonParseStream'), 5 | Placeholder = require('../Placeholder'), 6 | ph = new Placeholder(); 7 | 8 | ph.load(); // load data from disk 9 | 10 | var order = [ 11 | 'venue', 12 | 'address', 13 | 'building', 14 | 'campus', 15 | 'microhood', 16 | 'neighbourhood', 17 | 'macrohood', 18 | 'burough', 19 | 'postalcode', 20 | 'locality', 21 | 'metro area', 22 | 'localadmin', 23 | 'county', 24 | 'macrocounty', 25 | 'region', 26 | 'macroregion', 27 | 'country', 28 | 'empire', 29 | 'continent', 30 | 'ocean', 31 | 'planet' 32 | ]; 33 | 34 | // run test generation pipeline 35 | process.stdin.pipe( split() ) 36 | .pipe( parser() ) 37 | .pipe( through.obj( function insert( wof, _, next ){ 38 | 39 | var id = wof['wof:id']; 40 | if( 'string' === typeof id ){ id = parseInt( id, 10 ); } 41 | 42 | // sanity check; because WOF 43 | if( !ph.isValidWofRecord( id, wof ) ) { return next(); } 44 | 45 | // console.error( wof ); 46 | 47 | for( var h in wof['wof:hierarchy'] ){ 48 | 49 | // collect all parent ids for this hierarchy 50 | var parentIds = []; 51 | for( var o=0; o 0 ){ 55 | if( 'string' === typeof pid ){ pid = parseInt( pid, 10 ); } 56 | parentIds.push( pid ); 57 | } 58 | } 59 | 60 | print( ph, [ id, wof['wof:name'] ], parentIds ); 61 | } 62 | 63 | next(); 64 | })); 65 | 66 | function print( ph, line, parentIds ){ 67 | ph.store.getMany( parentIds, function( err, parents ){ 68 | 69 | if( err || !Array.isArray( parents ) || !parents.length ){ 70 | console.error( 'an error occurred', err, parents ); 71 | return; 72 | } 73 | 74 | var parentMap = {}; 75 | parents.forEach( function( parent ){ 76 | parentMap[ parent.id ] = parent; 77 | }); 78 | 79 | parentIds.forEach( function( pid ){ 80 | if( !parentMap.hasOwnProperty( pid ) ){ 81 | console.error( 'parent record of %s not found: %s', line[0], pid ); 82 | return; 83 | } 84 | line.push( parentMap[pid].name ); 85 | }); 86 | 87 | console.log( line.join(' ') ); 88 | }); 89 | } 90 | -------------------------------------------------------------------------------- /cmd/wof_extract_sqlite.js: -------------------------------------------------------------------------------- 1 | const path = require('path'); 2 | const fs = require('fs'); 3 | const whosonfirst = require('pelias-whosonfirst'); 4 | const config = require('pelias-config').generate().imports.whosonfirst; 5 | const SQLiteStream = whosonfirst.SQLiteStream; 6 | const through = require('through2'); 7 | const Placeholder = require('../Placeholder'); 8 | const combinedStream = require('combined-stream'); 9 | 10 | const SQLITE_REGEX = /whosonfirst-data-[a-z0-9-]+\.db$/; 11 | 12 | // Use WOF_DIR env variable when available, otherwise use the location specified in pelias.json 13 | const WOF_DIR = process.env.WOF_DIR || path.join(config.datapath, 'sqlite'); 14 | 15 | const layers = fs.readFileSync(path.join(__dirname, 'placetype.filter'), 'utf-8') 16 | .replace(/^.*\(/, '') // Removes all characters before the first parenthesis 17 | .match(/[a-z]+/g); // Get the layer list 18 | 19 | const jq_filter = new RegExp( 20 | fs.readFileSync(path.join(__dirname, 'jq.filter'), 'utf-8') 21 | .replace(/\n\s*/g, '') // Normalize multi-line 22 | .match(/test\(\s*"([^"]+(?:"\s*\+\s*"[^"]+)*)"\s*\)/)[1] // Extract pattern 23 | 
.replace(/"\s*\+\s*"/g, '') // Remove string concatenation 24 | ); 25 | 26 | const output = () => { 27 | if (process.argv.length > 2 && process.argv[2] === 'build') { 28 | const ph = new Placeholder(); 29 | ph.load({ reset: true }); 30 | return through.obj((row, _, next) => { 31 | ph.insertWofRecord(row, next); 32 | }, done => { 33 | console.error('populate fts...'); 34 | ph.populate(); 35 | console.error('optimize...'); 36 | ph.optimize(); 37 | console.error('close...'); 38 | ph.close(); 39 | done(); 40 | }); 41 | } else { 42 | return through.obj((row, _, next) => { 43 | console.log(JSON.stringify(row)); 44 | next(); 45 | }); 46 | } 47 | }; 48 | 49 | const sqliteStream = combinedStream.create(); 50 | fs.readdirSync(WOF_DIR) 51 | .filter(file => SQLITE_REGEX.test(file)) 52 | .map(file => path.join(WOF_DIR, file)) 53 | .forEach(dbPath => { 54 | sqliteStream.append(next => { 55 | next(new SQLiteStream( 56 | dbPath, 57 | config.importPlace ? 58 | SQLiteStream.findGeoJSONByPlacetypeAndWOFId(layers, config.importPlace) : 59 | SQLiteStream.findGeoJSONByPlacetype(layers) 60 | )); 61 | }); 62 | }); 63 | 64 | sqliteStream 65 | .pipe(whosonfirst.toJSONStream()) 66 | .pipe(through.obj((row, _, next) => { 67 | Object.keys(row.properties) 68 | .filter(key => !jq_filter.test(key)) 69 | .forEach(key => delete row.properties[key]); 70 | next(null, row.properties); 71 | })) 72 | .pipe(output()); 73 | -------------------------------------------------------------------------------- /test/server/routes/findbyid.js: -------------------------------------------------------------------------------- 1 | const findbyid = require('../../../server/routes/findbyid'); 2 | const _ = require('lodash'); 3 | const identity = () => {}; 4 | 5 | const makeRequest = opts => { 6 | const req = {}; 7 | _.set(req, 'app.locals.ph.store.getMany', opts.getMany); 8 | _.set(req, 'query.ids', opts.ids); 9 | _.set(req, 'query.lang', opts.lang); 10 | return req; 11 | }; 12 | 13 | const makeResponse = opts => { 14 | return { 15 | status: status => { 16 | if (opts.status) { opts.status(status); } 17 | return { send: opts.send || identity, json: opts.json || identity }; 18 | } 19 | }; 20 | }; 21 | 22 | module.exports.all = (test, common) => { 23 | test('parse ids - correct numbers list with spaces', t => { 24 | const req = makeRequest({ 25 | ids: '85682555, 85633111,102064231 , 85682523 , 102063845 ,', 26 | getMany: function(ids) { 27 | t.deepEqual(ids, [85682555, 85633111, 102064231, 85682523, 102063845]); 28 | t.end(); 29 | } 30 | }); 31 | findbyid(req, null); 32 | }); 33 | 34 | test('parse ids - incorrect numbers', t => { 35 | const req = makeRequest({ 36 | ids: 'not a number, 85633111a,1d02064231', 37 | getMany: function(ids) { 38 | t.deepEqual(ids, [85633111, 1]); 39 | t.end(); 40 | } 41 | }); 42 | findbyid(req, null); 43 | }); 44 | 45 | test('status code - 500', t => { 46 | const req = makeRequest({ 47 | getMany: (ids, cb) => { cb('Error'); } 48 | }); 49 | const res = makeResponse({ 50 | status: status => { 51 | t.deepEqual(status, 500); 52 | t.end(); 53 | } 54 | }); 55 | findbyid(req, res); 56 | }); 57 | 58 | test('status code - 404', t => { 59 | const req = makeRequest({ 60 | getMany: (ids, cb) => { cb(null, []); } 61 | }); 62 | const res = makeResponse({ 63 | status: status => { 64 | t.deepEqual(status, 404); 65 | t.end(); 66 | } 67 | }); 68 | findbyid(req, res); 69 | }); 70 | 71 | test('find by ids - without lang', t => { 72 | const req = makeRequest({ 73 | getMany: (ids, cb) => { 74 | cb(null, [{ 75 | id: 101751119, 76 | names: { 
fra: ['Paris'], eng: ['Paris'], ita: ['Parigi'] } 77 | }]); 78 | } 79 | }); 80 | const res = makeResponse({ 81 | status: status => { t.deepEqual(status, 200); }, 82 | json: docs => { 83 | t.deepEqual(docs, { 84 | 101751119: { 85 | id: 101751119, 86 | names: { fra: ['Paris'], eng: ['Paris'], ita: ['Parigi'] } 87 | } 88 | }); 89 | t.end(); 90 | } 91 | }); 92 | findbyid(req, res); 93 | }); 94 | 95 | test('find by ids - with lang', t => { 96 | const req = makeRequest({ 97 | lang: 'fra', 98 | getMany: (ids, cb) => { 99 | cb(null, [{ 100 | id: 101751119, 101 | names: { fra: ['Paris'], eng: ['Paris'], ita: ['Parigi'] } 102 | }]); 103 | } 104 | }); 105 | const res = makeResponse({ 106 | status: status => { t.deepEqual(status, 200); }, 107 | json: docs => { 108 | t.deepEqual(docs, { 109 | 101751119: { 110 | id: 101751119, 111 | names: { fra: ['Paris'] } 112 | } 113 | }); 114 | t.end(); 115 | } 116 | }); 117 | findbyid(req, res); 118 | }); 119 | }; -------------------------------------------------------------------------------- /test/prototype/io.js: -------------------------------------------------------------------------------- 1 | 2 | const path = require('path'); 3 | const io = require('../../prototype/io'); 4 | 5 | // Mock out placeholder 6 | const MockPlaceholder = function(){ 7 | this.store = {}; 8 | this.index = {}; 9 | }; 10 | MockPlaceholder.prototype = io; 11 | 12 | module.exports.exports = function(test, common) { 13 | test('exports', function(t) { 14 | t.equal( typeof io.load, 'function' ); 15 | t.equal( typeof io.populate, 'function' ); 16 | t.equal( typeof io.optimize, 'function' ); 17 | t.equal( typeof io.checkSchema, 'function' ); 18 | t.equal( typeof io.close, 'function' ); 19 | t.end(); 20 | }); 21 | }; 22 | 23 | module.exports.load = function(test, common) { 24 | test('load', function(t) { 25 | 26 | const ph = new MockPlaceholder(); 27 | const options = { foo: 'bar' }; 28 | 29 | t.plan(4); 30 | 31 | const expectedFilename = path.join(__dirname, '../../data/store.sqlite3'); 32 | 33 | // open store db 34 | ph.store.open = function( dbPath, opts ){ 35 | t.equals(dbPath, expectedFilename); 36 | t.deepEqual(opts, options); 37 | }; 38 | 39 | // open index db 40 | ph.index.open = function( dbPath, opts ){ 41 | t.equals(dbPath, expectedFilename); 42 | t.deepEqual(opts, options); 43 | }; 44 | 45 | ph.load(options); 46 | }); 47 | test('load - using env var', function(t) { 48 | 49 | const ph = new MockPlaceholder(); 50 | const options = { foo: 'bar' }; 51 | 52 | t.plan(4); 53 | 54 | process.env.PLACEHOLDER_DATA = '/my_data_dir/'; 55 | const expectedFilename = path.join(process.env.PLACEHOLDER_DATA, 'store.sqlite3'); 56 | 57 | // open store db 58 | ph.store.open = function( dbPath, opts ){ 59 | t.equals(dbPath, expectedFilename); 60 | t.deepEqual(opts, options); 61 | }; 62 | 63 | // open index db 64 | ph.index.open = function( dbPath, opts ){ 65 | t.equals(dbPath, expectedFilename); 66 | t.deepEqual(opts, options); 67 | }; 68 | 69 | ph.load(options); 70 | 71 | delete process.env.PLACEHOLDER_DATA; 72 | }); 73 | }; 74 | 75 | module.exports.populate = function(test, common) { 76 | test('populate', function(t) { 77 | 78 | const ph = new MockPlaceholder(); 79 | 80 | t.plan(2); 81 | 82 | // run 'populate' on both dbs 83 | ph.store.populate = t.false; 84 | ph.index.populate = t.false; 85 | 86 | ph.populate(); 87 | }); 88 | }; 89 | 90 | module.exports.optimize = function(test, common) { 91 | test('optimize', function(t) { 92 | 93 | const ph = new MockPlaceholder(); 94 | 95 | t.plan(1); 96 | 97 | // 
only run 'optimize' on one db 98 | ph.store.optimize = t.true; 99 | ph.index.optimize = t.false; 100 | 101 | ph.optimize(); 102 | }); 103 | }; 104 | 105 | module.exports.checkSchema = function(test, common) { 106 | test('checkSchema', function(t) { 107 | 108 | const ph = new MockPlaceholder(); 109 | 110 | t.plan(2); 111 | 112 | // run 'checkSchema' on both dbs 113 | ph.store.checkSchema = t.false; 114 | ph.index.checkSchema = t.false; 115 | 116 | ph.checkSchema(); 117 | }); 118 | }; 119 | 120 | module.exports.close = function(test, common) { 121 | test('close', function(t) { 122 | 123 | const ph = new MockPlaceholder(); 124 | 125 | t.plan(2); 126 | 127 | // run 'close' on both dbs 128 | ph.store.close = t.false; 129 | ph.index.close = t.false; 130 | 131 | ph.close(); 132 | }); 133 | }; 134 | -------------------------------------------------------------------------------- /test/lib/Database.js: -------------------------------------------------------------------------------- 1 | const _ = require('lodash'); 2 | const Database = require('../../lib/Database'); 3 | 4 | module.exports.constructor = function(test, common) { 5 | test('constructor', function(t) { 6 | var db = new Database(); 7 | t.equal( typeof db.open, 'function' ); 8 | t.equal( typeof db.close, 'function' ); 9 | t.equal( typeof db.prepare, 'function' ); 10 | t.equal( typeof db.configure, 'function' ); 11 | t.equal( typeof db.reset, 'function' ); 12 | t.equal( typeof db.populate, 'function' ); 13 | t.equal( typeof db.optimize, 'function' ); 14 | t.equal( typeof Database.assertSchema, 'function' ); 15 | t.end(); 16 | }); 17 | }; 18 | 19 | module.exports.open = function(test, common) { 20 | test('open', function(t) { 21 | var db = new Database(); 22 | t.false( db.db ); 23 | 24 | // ensure 'reset' is not run 25 | db.reset = t.end; 26 | 27 | // ensure 'optimize' is not run 28 | db.optimize = t.end; 29 | 30 | // open connection 31 | db.open('/tmp/db', { test: true }); 32 | t.equal( db.db.constructor.name, 'Database' ); 33 | t.deepLooseEqual( db.db, { 34 | inTransaction: false, 35 | open: true, 36 | memory: true, 37 | readonly: false, 38 | name: db.db.name 39 | }); 40 | 41 | t.end(); 42 | }); 43 | 44 | test('open - runs configure', function(t) { 45 | var db = new Database(); 46 | 47 | // ensure 'configure' is run 48 | db.configure = t.end; 49 | 50 | // open connection 51 | db.open('/tmp/db', { test: true }); 52 | }); 53 | 54 | test('open - runs reset', function(t) { 55 | var db = new Database(); 56 | 57 | // ensure 'reset' is run 58 | db.reset = t.end; 59 | 60 | // open connection 61 | db.open('/tmp/db', { test: true, reset: true }); 62 | }); 63 | 64 | test('open - runs optimize', function(t) { 65 | var db = new Database(); 66 | 67 | // ensure 'optimize' is run 68 | db.optimize = t.end; 69 | 70 | // open connection 71 | db.open('/tmp/db', { test: true, reset: true }); 72 | }); 73 | }; 74 | 75 | module.exports.close = function(test, common) { 76 | test('close', function(t) { 77 | var db = new Database(); 78 | db.open('/tmp/db', { test: true }); 79 | t.true( db.db.open ); 80 | db.close(); 81 | t.false( db.db.open ); 82 | t.end(); 83 | }); 84 | }; 85 | 86 | module.exports.prepare = function(test, common) { 87 | test('prepare', function(t) { 88 | var db = new Database(); 89 | db.open('/tmp/db', { test: true }); 90 | 91 | t.equal(typeof db.stmt, 'undefined'); 92 | 93 | const sql = 'SELECT * FROM sqlite_master'; 94 | db.prepare(sql); 95 | 96 | t.equal(typeof db.stmt, 'object'); 97 | t.true(db.stmt.hasOwnProperty(sql)); 98 | 
t.true(db.stmt[sql].reader); 99 | t.equal(db.stmt[sql].source, sql, 'sql query should be as expected'); 100 | 101 | t.end(); 102 | }); 103 | }; 104 | 105 | module.exports.configure = function(test, common) { 106 | test('configure', function(t) { 107 | var db = new Database(); 108 | db.open('/tmp/db', { test: true }); 109 | 110 | // configure 111 | const pragma_checks = { 112 | foreign_keys: 0, 113 | page_size: 4096, 114 | cache_size: -2000, 115 | synchronous: 0, 116 | // journal_mode: 'memory', 117 | temp_store: 2 118 | }; 119 | 120 | t.plan(_.size(pragma_checks)); 121 | _.forEach(pragma_checks, (value, key) => { 122 | const stmt = db.db.prepare(`PRAGMA ${key};`); 123 | t.deepEqual(stmt.get(), { [key]: value }); 124 | }); 125 | }); 126 | }; 127 | -------------------------------------------------------------------------------- /lib/TokenIndex.js: -------------------------------------------------------------------------------- 1 | 2 | var util = require('util'); 3 | var Database = require('./Database'); 4 | var Queries = require('./Queries'); 5 | 6 | // document store database 7 | function TokenIndex(){} 8 | util.inherits( TokenIndex, Database ); 9 | 10 | // @todo: more elegant polymorphism 11 | for( var method in Queries ){ 12 | TokenIndex.prototype[method] = Queries[method]; 13 | } 14 | 15 | TokenIndex.prototype.reset = function(){ 16 | this.db.exec('DROP TABLE IF EXISTS lineage;'); 17 | this.db.exec('CREATE TABLE lineage( id INTEGER, pid INTEGER );'); 18 | this.db.exec('CREATE INDEX IF NOT EXISTS lineage_cover_idx ON lineage(id, pid);'); 19 | 20 | this.db.exec('DROP TABLE IF EXISTS tokens;'); 21 | this.db.exec('CREATE TABLE tokens( id INTEGER, lang STRING, tag STRING, token STRING );'); 22 | this.db.exec('CREATE INDEX IF NOT EXISTS tokens_cover_idx ON tokens(id, lang, tag);'); 23 | this.db.exec('CREATE INDEX IF NOT EXISTS tokens_token_idx ON tokens(token);'); 24 | 25 | // FTS table options 26 | // see: https://sqlite.org/fts5.html 27 | var options = [ 28 | `tokenize="unicode61 remove_diacritics 0 tokenchars '_'"`, 29 | `prefix='1 2 3 4 5 6 7 8 9 10 11 12'`, 30 | 'columnsize=0' 31 | ].join(', '); 32 | this.db.exec('DROP TABLE IF EXISTS fulltext;'); 33 | this.db.exec('CREATE VIRTUAL TABLE fulltext USING fts5( token, ' + options + ');'); 34 | }; 35 | 36 | // ensure that the database schema matches what is expected by the codebase 37 | TokenIndex.prototype.checkSchema = function(){ 38 | Database.assertSchema(this.db, 'lineage', [ 39 | { cid: 0, name: 'id', type: 'INTEGER', notnull: 0, dflt_value: null, pk: 0 }, 40 | { cid: 1, name: 'pid', type: 'INTEGER', notnull: 0, dflt_value: null, pk: 0 } 41 | ]); 42 | Database.assertSchema(this.db, 'tokens', [ 43 | { cid: 0, name: 'id', type: 'INTEGER', notnull: 0, dflt_value: null, pk: 0 }, 44 | { cid: 1, name: 'lang', type: 'STRING', notnull: 0, dflt_value: null, pk: 0 }, 45 | { cid: 2, name: 'tag', type: 'STRING', notnull: 0, dflt_value: null, pk: 0 }, 46 | { cid: 3, name: 'token', type: 'STRING', notnull: 0, dflt_value: null, pk: 0 } 47 | ]); 48 | Database.assertSchema(this.db, 'fulltext', [ 49 | { cid: 0, name: 'token', type: '', notnull: 0, dflt_value: null, pk: 0 } 50 | ]); 51 | }; 52 | 53 | TokenIndex.prototype.populate = function(){ 54 | this.db.exec(`INSERT INTO fulltext(rowid, token) SELECT rowid, REPLACE(token,' ','_') FROM tokens;`); 55 | this.db.exec(`INSERT INTO fulltext(fulltext) VALUES('optimize');`); 56 | }; 57 | 58 | TokenIndex.prototype.setLineage = function( id, pids, cb ){ 59 | if( !Array.isArray( pids ) || !pids.length ){ return 
cb(); } 60 | 61 | // create prepared statement 62 | var stmt = this.prepare('INSERT INTO lineage ( id, pid ) VALUES ( $id, $pid )'); 63 | 64 | try { 65 | pids.forEach( pid => stmt.run({ id: id, pid: pid }) ); 66 | return cb( null ); 67 | } catch ( err ){ 68 | console.error( err ); 69 | console.error( stmt.source ); 70 | return cb( err ); 71 | } 72 | }; 73 | 74 | TokenIndex.prototype.setTokens = function( id, tokens, cb ){ 75 | if( !Array.isArray( tokens ) || !tokens.length ){ return cb(); } 76 | 77 | // create prepared statement 78 | var stmt = this.prepare( 79 | 'INSERT INTO tokens ( id, lang, tag, token ) VALUES ( $id, $lang, $tag, $token )' 80 | ); 81 | 82 | try { 83 | tokens.forEach( token => stmt.run({ 84 | id: id, 85 | lang: token.lang, 86 | tag: token.tag, 87 | token: token.body 88 | })); 89 | return cb( null ); 90 | } catch ( err ){ 91 | console.error( err ); 92 | console.error( stmt.source ); 93 | return cb( err ); 94 | } 95 | }; 96 | 97 | module.exports = TokenIndex; 98 | -------------------------------------------------------------------------------- /lib/Result.js: -------------------------------------------------------------------------------- 1 | 2 | const util = require('util'); 3 | const DEBUG = false; 4 | 5 | // convenience function for debugging 6 | function _debugRows( rows ){ 7 | rows = rows || []; 8 | console.log('found (' + rows.length + '):'); 9 | console.log( rows.map( row => { 10 | return ' - ' + util.format( 11 | '"%s" (%d) >>> "%s" (%d)', 12 | row.subject, 13 | row.subjectId, 14 | row.object, 15 | row.objectId 16 | ); 17 | }).join('\n')); 18 | } 19 | 20 | // 'reset' indicates that we failed to find any matches for the 21 | // object with any of the subjects; 22 | // in this case we will use the previous object value 23 | // as a 'seed' for the id pool 24 | 25 | function Result( group, done ){ 26 | this.group = Array.isArray( group ) ? group : []; 27 | this.ids = {}; 28 | this.mask = new Array( this.group.length ).fill( false ); 29 | this.pos = { 30 | subject: this.group.length -2, 31 | object: this.group.length -1 32 | }; 33 | this.reset = false; 34 | this.done = ('function' === typeof done) ? done : function(){}; 35 | } 36 | 37 | Result.prototype.getSubject = function(){ 38 | return this.group[ this.pos.subject ]; 39 | }; 40 | 41 | Result.prototype.getObject = function(){ 42 | return this.group[ this.pos.object ]; 43 | }; 44 | 45 | Result.prototype.getPreviousObject = function(){ 46 | return this.group[ this.pos.prev_object ]; 47 | }; 48 | 49 | Result.prototype.getIdsAsArray = function(){ 50 | return Object.keys( this.ids ).map( k => parseInt( k, 10 ) ); 51 | }; 52 | 53 | // return all the 'subjectId' values from rows returned from the db 54 | // optionally: use a function to filter which rows are included. 55 | Result.subjectIdsFromRows = function( rows, filter ){ 56 | return rows.reduce(( memo, row ) => { 57 | if( 'function' === typeof filter ){ 58 | if( !filter( row ) ){ return memo; } 59 | } 60 | if( row.hasOwnProperty('subjectId') ){ 61 | memo[ row.subjectId ] = true; 62 | } 63 | return memo; 64 | }, {}); 65 | }; 66 | 67 | // convenience function to set mask values 68 | Result.prototype.setMask = function( entity, bool ){ 69 | if( this.pos.hasOwnProperty(entity) && -1 < this.pos[entity] ){ 70 | this.mask[ this.pos[entity] ] = !!bool; 71 | } 72 | }; 73 | 74 | // intersect the current resultset with new matching rows from 75 | // the database.
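// on the first match the rows' subjectIds seed the pool of candidate ids;
// thereafter each round keeps only the subjectIds of rows whose objectId is
// already in the pool, or, when nothing matches, moves the subject cursor
// one token to the left and tries again.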
76 | Result.prototype.intersect = function( err, rows ){ 77 | 78 | // debugging 79 | if( DEBUG ){ _debugRows( rows ); } 80 | 81 | // no results were found 82 | if( err || !rows || !rows.length ){ 83 | 84 | // decrement iterator 85 | this.pos.subject--; 86 | return; 87 | } 88 | 89 | // first time we have found matching rows for the query 90 | if( !Object.keys( this.ids ).length ){ 91 | this.ids = Result.subjectIdsFromRows( rows ); 92 | this.setMask('object', true); 93 | this.setMask('subject', true); 94 | this.pos.object = this.pos.subject; 95 | this.pos.subject = this.pos.object-1; 96 | return; 97 | } 98 | 99 | // compute the intersection of the new rows and the past 100 | // matched ids. 101 | 102 | // find the results which are children of existing ids 103 | const children = Result.subjectIdsFromRows( 104 | rows, 105 | row => this.ids.hasOwnProperty( row.objectId ) 106 | ); 107 | 108 | // we found at least one valid child 109 | if( !!Object.keys( children ).length ){ 110 | this.ids = children; 111 | this.setMask('subject', true); 112 | this.pos.object = this.pos.subject; 113 | this.pos.subject = this.pos.object-1; 114 | return; 115 | } 116 | 117 | // we failed to find any valid children of existing ids 118 | if( DEBUG ){ console.error( 'failed!' ); } 119 | 120 | // decrement iterator 121 | this.pos.subject--; 122 | }; 123 | 124 | module.exports = Result; 125 | -------------------------------------------------------------------------------- /test/cases/capitalCities.txt: -------------------------------------------------------------------------------- 1 | 101877135 Andorra la Vella, Andorra 2 | 421168799 Kabul, Afghanistan 3 | 890445621 St. John's, Antigua and Barbuda 4 | 890441875 The Valley, Anguilla 5 | 421182367 Yerevan, Armenia 6 | 890432155 Luanda, Angola 7 | 101734459 Pago Pago, American Samoa 8 | 890432017 Oranjestad, Aruba 9 | 85667871 Mariehamn, Aland Islands 10 | 890518775 Sarajevo, Bosnia and Herzegovina 11 | 890452811 Bridgetown, Barbados 12 | 421190647 Manama, Bahrain 13 | 421204487 Bujumbura, Burundi 14 | 421168997 Porto-Novo, Benin 15 | 890442097 Hamilton, Bermuda 16 | 421188863 Bandar Seri Begawan, Brunei 17 | 101964877 Brasilia, Brazil 18 | 85669631 Gaborone, Botswana 19 | 890442105 Belmopan, Belize 20 | 101735873 Ottawa, Canada 21 | 101938929 West Island, Cocos Islands 22 | 421181445 Bangui, Central African Republic 23 | 85670067 Brazzaville, Republic of the Congo 24 | 101748453 Bern, Switzerland 25 | 421168957 Yamoussoukro, Ivory Coast 26 | 102016915 Santiago, Chile 27 | 85670331 Praia, Cape Verde 28 | 421187435 Willemstad, Curacao 29 | 101909779 Berlin, Germany 30 | 101749159 Copenhagen, Denmark 31 | 890442101 Roseau, Dominica 32 | 101748153 Tallinn, Estonia 33 | 421199769 Asmara, Eritrea 34 | 101748283 Madrid, Spain 35 | 101748417 Helsinki, Finland 36 | 101750367 London, United Kingdom 37 | 890451719 St. 
George's, Grenada 38 | 890442055 Cayenne, French Guiana 39 | 1125821075 St Peter Port, Guernsey 40 | 421168965 Accra, Ghana 41 | 101753853 Gibraltar, Gibraltar 42 | 101870623 Nuuk, Greenland 43 | 421167921 Banjul, Gambia 44 | 421189675 Conakry, Guinea 45 | 890420199 Basse-Terre, Guadeloupe 46 | 421178347 Malabo, Equatorial Guinea 47 | 421197943 Bissau, Guinea-Bissau 48 | 890437279 Hong Kong, Hong Kong 49 | 101751659 Zagreb, Croatia 50 | 101751703 Budapest, Hungary 51 | 101751737 Dublin, Ireland 52 | 1125918569 Diego Garcia, British Indian Ocean Territory 53 | 101751753 Reykjavik, Iceland 54 | 1125783915 Saint Helier, Jersey 55 | 421186515 Kingston, Jamaica 56 | 85672817 Tokyo, Japan 57 | 890440079 Basseterre, Saint Kitts and Nevis 58 | 102026327 Seoul, South Korea 59 | 890434949 George Town, Cayman Islands 60 | 421191125 Astana, Kazakhstan 61 | 85673679 Castries, Saint Lucia 62 | 101828603 Vaduz, Liechtenstein 63 | 421195189 Maseru, Lesotho 64 | 101753031 Vilnius, Lithuania 65 | 101751765 Luxembourg, Luxembourg 66 | 890444507 Rabat, Morocco 67 | 101831917 Monaco, Monaco 68 | 421181453 Antananarivo, Madagascar 69 | 890451463 Majuro, Marshall Islands 70 | 890491957 Skopje, Macedonia 71 | 85681291 Macao, Macao 72 | 1108960813 Plymouth, Montserrat 73 | 101752423 Valletta, Malta 74 | 85674093 Male, Maldives 75 | 421168781 Lilongwe, Malawi 76 | 102023407 Kuala Lumpur, Malaysia 77 | 1141909361 Windhoek, Namibia 78 | 890413117 Noumea, New Caledonia 79 | 890440179 Kingston, Norfolk Island 80 | 101751893 Amsterdam, Netherlands 81 | 1495123997 Oslo, Norway 82 | 85675677 Yaren, Nauru 83 | 1141909453 Alofi, Niue 84 | 890445081 Panama City, Panama 85 | 890435983 Saint-Pierre, Saint Pierre and Miquelon 86 | 85676471 Melekeok, Palau 87 | 421190363 Doha, Qatar 88 | 102003033 Moscow, Russia 89 | 890444217 Honiara, Solomon Islands 90 | 421202159 Victoria, Seychelles 91 | 101752307 Stockholm, Sweden 92 | 102032341 Singapore, Singapore 93 | 101752073 Ljubljana, Slovenia 94 | 1108800123 Bratislava, Slovakia 95 | 890452049 Freetown, Sierra Leone 96 | 85677205 San Marino, San Marino 97 | 890449737 Mogadishu, Somalia 98 | 85677301 Sao Tome, Sao Tome and Principe 99 | 890434823 Philipsburg, Sint Maarten 100 | 102025263 Bangkok, Thailand 101 | 421196557 Dili, East Timor 102 | 421167889 Ashgabat, Turkmenistan 103 | 85679123 Tunis, Tunisia 104 | 85679409 Ankara, Turkey 105 | 85679705 Dodoma, Tanzania 106 | 421168855 Tashkent, Uzbekistan 107 | 890434937 Road Town, British Virgin Islands 108 | 421177479 Hanoi, Vietnam 109 | 890416453 Port Vila, Vanuatu 110 | 890452045 Mata Utu, Wallis and Futuna 111 | 890416609 Apia, Samoa 112 | 421178937 Lusaka, Zambia 113 | 421201479 Harare, Zimbabwe 114 | -------------------------------------------------------------------------------- /test/lib/DocStore.js: -------------------------------------------------------------------------------- 1 | 2 | var DocStore = require('../../lib/DocStore'); 3 | 4 | module.exports.constructor = function(test, common) { 5 | test('constructor', function(t) { 6 | var db = new DocStore(); 7 | t.equal( db.constructor.super_.name, 'Database' ); 8 | t.equal( typeof db.reset, 'function' ); 9 | t.equal( typeof db.set, 'function' ); 10 | t.equal( typeof db.get, 'function' ); 11 | t.equal( typeof db.getMany, 'function' ); 12 | t.end(); 13 | }); 14 | }; 15 | 16 | module.exports.reset = function(test, common) { 17 | test('reset', function(t) { 18 | var db = new DocStore(); 19 | db.open('/tmp/db', { test: true, reset: true }); 20 | 21 | // ensure table has been 
created 22 | const sql = 'PRAGMA table_info(docs)'; 23 | t.deepEqual( db.prepare(sql).all(), [ 24 | { cid: 0, name: 'id', type: 'INTEGER', notnull: 0, dflt_value: null, pk: 1 }, 25 | { cid: 1, name: 'json', type: 'TEXT', notnull: 0, dflt_value: null, pk: 0 } 26 | ]); 27 | 28 | t.end(); 29 | }); 30 | }; 31 | 32 | module.exports.checkSchema = function(test, common) { 33 | test('checkSchema - empty', function(t) { 34 | var db = new DocStore(); 35 | db.open('/tmp/db', { test: true }); 36 | t.throws(() => { db.checkSchema(); }, /schema invalid: table docs/); 37 | t.end(); 38 | }); 39 | test('checkSchema - valid', function(t) { 40 | var db = new DocStore(); 41 | db.open('/tmp/db', { test: true, reset: true }); 42 | t.doesNotThrow(() => { db.checkSchema(); }); 43 | t.end(); 44 | }); 45 | test('checkSchema - invalid', function(t) { 46 | var db = new DocStore(); 47 | db.open('/tmp/db', { test: true }); 48 | db.db.exec('DROP TABLE IF EXISTS docs'); 49 | db.db.exec('CREATE TABLE docs( id INTEGER PRIMARY KEY, foo TEXT )'); 50 | t.throws(() => { db.checkSchema(); }); 51 | t.end(); 52 | }); 53 | }; 54 | 55 | module.exports.set = function(test, common) { 56 | test('set', function(t) { 57 | var db = new DocStore(); 58 | db.open('/tmp/db', { test: true, reset: true }); 59 | 60 | t.plan(1); 61 | 62 | const id = 100; 63 | const data = { test: { foo: 'bar' } }; 64 | 65 | db.set( id, data, (err) => { 66 | 67 | // ensure row has been created 68 | const sql = 'SELECT * FROM docs WHERE id = ? LIMIT 1'; 69 | t.deepEqual( db.prepare(sql).all(id), [ 70 | { id: id, json: DocStore.codec.encode( data ) } 71 | ]); 72 | 73 | }); 74 | }); 75 | }; 76 | 77 | module.exports.get = function(test, common) { 78 | test('get', function(t) { 79 | var db = new DocStore(); 80 | db.open('/tmp/db', { test: true, reset: true }); 81 | 82 | t.plan(1); 83 | 84 | const id = 100; 85 | const data = { test: { foo: 'bar' } }; 86 | 87 | // insert a row in the database 88 | db.prepare('INSERT INTO docs (id, json) VALUES ($id, $json)') 89 | .run({ id: id, json: DocStore.codec.encode( data ) }); 90 | 91 | // retrieve row 92 | db.get( id, (err, res) => { 93 | t.deepEqual( res, data ); 94 | }); 95 | }); 96 | }; 97 | 98 | module.exports.getMany = function(test, common) { 99 | test('getMany', function(t) { 100 | var db = new DocStore(); 101 | db.open('/tmp/db', { test: true, reset: true }); 102 | 103 | t.plan(1); 104 | 105 | // insert a row in the database 106 | var stmt = db.prepare('INSERT INTO docs (id, json) VALUES ($id, $json)'); 107 | stmt.run({ id: 100, json: DocStore.codec.encode({ test: 100 }) }); 108 | stmt.run({ id: 200, json: DocStore.codec.encode({ test: 200 }) }); 109 | stmt.run({ id: 300, json: DocStore.codec.encode({ test: 300 }) }); 110 | 111 | // retrieve rows 112 | db.getMany( [100, 300], (err, res) => { 113 | t.deepEqual( res, [ 114 | { test: 100 }, 115 | { test: 300 } 116 | ]); 117 | }); 118 | }); 119 | test('getMany - empty ids array', function(t) { 120 | var db = new DocStore(); 121 | db.open('/tmp/db', { test: true, reset: true }); 122 | 123 | t.plan(1); 124 | 125 | // retrieve rows 126 | db.getMany( [], (err, res) => { 127 | t.deepEqual( res, [] ); 128 | }); 129 | }); 130 | }; 131 | -------------------------------------------------------------------------------- /prototype/tokenize.js: -------------------------------------------------------------------------------- 1 | 2 | // plugin for tokenize 3 | const _ = require('lodash'); 4 | const async = require('async'); 5 | const analysis = require('../lib/analysis'); 6 | const 
permutations = require('../lib/permutations'); 7 | 8 | function tokenize(input, cb){ 9 | 10 | // tokenize input 11 | const synonyms = analysis.tokenize(input); 12 | 13 | // test each synonym against the database and select the best synonyms 14 | async.map( synonyms, _eachSynonym.bind(this), (err, queries) => { 15 | return cb( null, _queryFilter( queries ) ); 16 | }); 17 | } 18 | 19 | // test if a phrase exists in the index 20 | function _indexContainsPhrase(phrase, cb){ 21 | this.index.hasSubject( phrase, function( bool ){ 22 | return cb( null, bool ); 23 | }); 24 | } 25 | 26 | // expand each synonym into its permutations and check them against the database. 27 | function _eachSynonym(synonym, cb){ 28 | 29 | // expand token permutations 30 | const phrases = _permutations(synonym); 31 | 32 | // filter out permutations which do not match phrases in the index 33 | async.filterSeries( phrases, _indexContainsPhrase.bind(this), (err, matchedPhrases) => { 34 | return cb( null, _groups(synonym, matchedPhrases) ); 35 | }); 36 | } 37 | 38 | // expand token permutations 39 | function _permutations(tokens){ 40 | return _.uniq(permutations.expand(tokens).map(perm => perm.join(' '))); 41 | } 42 | 43 | // remove unwanted queries 44 | function _queryFilter(queries){ 45 | 46 | // remove empty arrays 47 | queries = queries.filter( function( query ){ 48 | return 0 !== query.length; 49 | }); 50 | 51 | // remove synonymous groupings 52 | queries = queries.filter( function( query, i ){ 53 | for( var j=0; j<i; j++ ){ 54 | if( _.isEqual( queries[j], query ) ){ return false; } 55 | } 56 | return true; 57 | }); 58 | 59 | return queries; 60 | } 61 | 62 | // select a single grouping of phrases which covers the input tokens, 63 | // preferring the longest phrase available at each position 64 | function _groups(tokens, phrases){ 65 | 66 | // sort the matched phrases longest-first 67 | phrases.sort((a, b) => b.length - a.length); 68 | 69 | // generate a map of matched phrases where the 70 | // key is a single word token (the first word in 71 | // the phrase) and the value is an array of 72 | // phrases which contain that word. 73 | const index = Object.create(null); 74 | phrases.forEach( phrase => { 75 | const words = phrase.split(/\s+/); 76 | const firstWord = words[0]; 77 | if( !index[ firstWord ] ){ 78 | index[ firstWord ] = []; 79 | } 80 | index[ firstWord ].push( words ); 81 | }); 82 | 83 | // an array of the chosen phrases 84 | const groups = []; 85 | 86 | // iterate over the input tokens 87 | for( var t=0; t<tokens.length; t++ ){ 88 | 89 | // find all matched phrases which begin with this token 90 | const candidates = index[ tokens[t] ] || []; 91 | 92 | // greedily select the longest phrase which exactly matches the upcoming tokens 93 | for( var c=0; c<candidates.length; c++ ){ 94 | const words = candidates[c]; 95 | if( _.isEqual( words, tokens.slice( t, t + words.length ) ) ){ 96 | groups.push( words.join(' ') ); 97 | t += words.length - 1; // skip the tokens consumed by this phrase 98 | break; 99 | } 100 | } 101 | } 102 | 103 | return groups; 104 | } 105 | 106 | module.exports.tokenize = tokenize; 107 | -------------------------------------------------------------------------------- /lib/analysis.js: -------------------------------------------------------------------------------- 86 | // generate additional synonyms with the official designation removed, 87 | // eg: 'county of durham' -> 'durham', 'city of london' -> 'london' 88 | if( synonyms.length ){ 89 | synonyms = synonyms.concat( 90 | synonyms.map( synonym => { 91 | return synonym 92 | .replace(/^county\s(of\s)?(.*)$/gi, '$2') 93 | .replace(/^(.*)\scounty$/gi, '$1') 94 | .replace(/^city\sof(?!\s?the)\s?(.*)$/gi, '$1') 95 | .replace(/^(.*\s)charter\s(township)$/gi, '$1$2'); 96 | }) 97 | ); 98 | } 99 | 100 | // replace multiple spaces with a single space and trim tokens 101 | return synonyms.map( function( synonym ){ 102 | return synonym.replace(/\s{2,}/g, ' ').trim(); 103 | }) 104 | // normalization 105 | .map( function( synonym ){ 106 | return lowercase( unicode.fold( synonym ) ); 107 | }) 108 | // remove empty synonyms 109 | .filter( function( synonym ){ 110 | return synonym && synonym.length; 111 | }) 112 | // remove duplicate synonyms 113 | .filter( function( synonym, pos, self ){ 114 | return self.indexOf(synonym) === pos; 115 | }); 116 | } 117 | 118 | // try to detect languages which write their addresses in the opposite order-of-presentation to how it's 119 | // done in the west.
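// eg: Korean '경기도 광명시' (Gyeonggi-do, Gwangmyeong-si) names the province
// before the city; tokens for such inputs are reversed below so they can be
// matched in the same minor-to-major order as western-style input.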
120 | // http://www.columbia.edu/~fdc/postal/#general 121 | const REGEX_MAJOR_TO_MINOR = /[\u0591-\u07FF\u1100-\u11FF\u3130-\u318F\uA960-\uA97F\uAC00-\uD7AF\uD7B0-\uD7FF\u0400-\u04FF]/; 122 | 123 | function tokenize( input ){ 124 | return normalize(input).map( function( synonym ){ 125 | // reverse tokens for major-to-minor address schemes 126 | if( REGEX_MAJOR_TO_MINOR.test( synonym ) ){ 127 | return synonym.split(/\s+/g).reverse(); 128 | } 129 | return synonym.split(/\s+/g); 130 | }); 131 | } 132 | 133 | module.exports.normalize = normalize; 134 | module.exports.tokenize = tokenize; 135 | module.exports.PARTIAL_TOKEN_SUFFIX = PARTIAL_TOKEN_SUFFIX; 136 | module.exports.REGEX_MAJOR_TO_MINOR = REGEX_MAJOR_TO_MINOR; 137 | -------------------------------------------------------------------------------- /lib/unicode.js: -------------------------------------------------------------------------------- 1 | const _ = require('lodash'); 2 | const regenerate = require('regenerate'); 3 | const accentsDiacritics = require('remove-accents-diacritics'); 4 | 5 | // non-printable control characters 6 | // ref: https://en.wikipedia.org/wiki/List_of_Unicode_characters 7 | const CONTROL_CODES = regenerate() 8 | .addRange(0x0000, 0x001F) // C0 (0000-001F) 9 | .add(0x007F) // Delete 10 | .addRange(0x0080, 0x009F) // C1 (0080-009F) 11 | .toRegExp('g'); 12 | 13 | // non-standard spaces 14 | // ref: http://jkorpela.fi/chars/spaces.html 15 | const ALTERNATE_SPACES = regenerate() 16 | .add(0x00A0) // NO-BREAK SPACE 17 | .add(0x1680) // OGHAM SPACE MARK 18 | .add(0x180E) // MONGOLIAN VOWEL SEPARATOR 19 | .addRange(0x2000, 0x200B) // EN QUAD - ZERO WIDTH SPACE 20 | .add(0x202F) // NARROW NO-BREAK SPACE 21 | .add(0x205F) // MEDIUM MATHEMATICAL SPACE 22 | .add(0x3000) // IDEOGRAPHIC SPACE 23 | .add(0xFEFF) // ZERO WIDTH NO-BREAK SPACE 24 | .toRegExp('g'); 25 | 26 | // pattern to match consecutive spaces 27 | // const CONSECUTIVE_SPACES = /\s{2,}/g; 28 | 29 | // unicode combining marks 30 | // see: https://github.com/pelias/pelias/issues/829#issuecomment-542614645 31 | // ref: https://en.wikipedia.org/wiki/Combining_character 32 | const COMBINING_MARKS = regenerate() 33 | .add(0x200D) // ZERO WIDTH JOINER (U+200D) 34 | .addRange(0x0300, 0x036F) // Combining Diacritical Marks (0300–036F) 35 | .addRange(0x1AB0, 0x1AFF) // Combining Diacritical Marks Extended (1AB0–1AFF) 36 | .addRange(0x1DC0, 0x1DFF) // Combining Diacritical Marks Supplement (1DC0–1DFF) 37 | .addRange(0x20D0, 0x20FF) // Combining Diacritical Marks for Symbols (20D0–20FF) 38 | .addRange(0xFE00, 0xFE0F) // Variation Selectors (FE00-FE0F) 39 | .addRange(0xFE20, 0xFE2F) // Combining Half Marks (FE20–FE2F) 40 | .add(0x3099) // combining dakuten (U+3099) 41 | .add(0x309A) // combining handakuten (U+309A) 42 | .toRegExp('g'); 43 | 44 | // miscellaneous symbols with no relevance to geocoding 45 | const MISC_UNSUPPORTED_SYMBOLS = regenerate() 46 | // Superscripts and Subscripts (2070-209F) 47 | // Currency Symbols (20A0-20CF) 48 | // Letterlike Symbols (2100-214F) 49 | // Number Forms (2150-218F) 50 | // Arrows (2190-21FF) 51 | // Mathematical Operators (2200-22FF) 52 | // Miscellaneous Technical (2300-23FF) 53 | // Control Pictures (2400-243F) 54 | // Optical Character Recognition (2440-245F) 55 | // Enclosed Alphanumerics (2460-24FF) 56 | // Box Drawing (2500-257F) 57 | // Block Elements (2580-259F) 58 | // Geometric Shapes (25A0-25FF) 59 | // Miscellaneous Symbols (2600-26FF) 60 | // Dingbats (2700-27BF) 61 | // Miscellaneous Mathematical Symbols-A (27C0-27EF) 
62 | // Supplemental Arrows-A (27F0-27FF) 63 | // Braille Patterns (2800-28FF) 64 | // Supplemental Arrows-B (2900-297F) 65 | // Miscellaneous Mathematical Symbols-B (2980-29FF) 66 | // Supplemental Mathematical Operators (2A00-2AFF) 67 | // Miscellaneous Symbols and Arrows (2B00-2BFF) 68 | .addRange(0x2070, 0x2BFF) // A Range Covering Consecutive Blocks Listed Above 69 | 70 | // symbols 71 | .addRange(0x02B0, 0x02FF) // Spacing Modifier Letters (02B0-02FF) 72 | .addRange(0x1400, 0x167F) // Unified Canadian Aboriginal Syllabics (1400-167F) 73 | .addRange(0x1D100, 0x1D1FF) // Musical Symbols (1D100-1D1FF) 74 | .addRange(0x1D400, 0x1D7FF) // Mathematical Alphanumeric Symbols (1D400-1D7FF) 75 | 76 | // emojis 77 | .addRange(0x1F300, 0x1F5FF) // Miscellaneous Symbols and Pictographs (1F300-1F5FF) 78 | .addRange(0x1F3FB, 0x1F3FF) // Emoji Modifier Fitzpatrick (skin tones) (1F3FB–1F3FF) 79 | .addRange(0x1F600, 0x1F64F) // Emoticons (1F600–1F64F) 80 | .addRange(0x1F680, 0x1F6FF) // Transport and Map Symbols (1F680-1F6FF) 81 | .addRange(0x1F900, 0x1F9FF) // Supplemental Symbols and Pictographs (1F900-1F9FF) 82 | .toRegExp('g'); 83 | 84 | function normalize(str) { 85 | 86 | // sanity checking 87 | if(!_.isString(str)){ return str; } 88 | 89 | return str 90 | .normalize('NFKC') 91 | .replace(CONTROL_CODES, '') 92 | .replace(ALTERNATE_SPACES, ' ') 93 | .replace(MISC_UNSUPPORTED_SYMBOLS, '') 94 | .replace(COMBINING_MARKS, ''); 95 | } 96 | 97 | /** 98 | * Converts alphabetic, numeric, and symbolic characters that are not 99 | * in the Basic Latin Unicode block (first 127 ASCII characters) to their 100 | * ASCII equivalent, if one exists. For example, the filter changes à to a. 101 | */ 102 | function fold(str) { 103 | 104 | // sanity checking 105 | if (!_.isString(str)) { return str; } 106 | 107 | return accentsDiacritics.remove(str) 108 | .normalize('NFD') 109 | .replace(COMBINING_MARKS, '') 110 | .normalize('NFKC'); 111 | } 112 | 113 | module.exports.normalize = normalize; 114 | module.exports.fold = fold; 115 | -------------------------------------------------------------------------------- /lib/DocStore.js: -------------------------------------------------------------------------------- 1 | 2 | var util = require('util'); 3 | var Database = require('./Database'); 4 | 5 | // document store database 6 | function DocStore(){} 7 | util.inherits( DocStore, Database ); 8 | 9 | DocStore.prototype.reset = function(){ 10 | this.db.exec('DROP TABLE IF EXISTS docs'); 11 | this.db.exec('DROP TABLE IF EXISTS rtree'); 12 | this.db.exec('CREATE TABLE docs( id INTEGER PRIMARY KEY, json TEXT )'); 13 | 14 | // create rtree table 15 | this.db.exec('CREATE VIRTUAL TABLE IF NOT EXISTS rtree USING rtree( id, minX, maxX, minY, maxY, minZ, maxZ )'); 16 | 17 | // triggers to keep the rtree index up-to-date 18 | var triggers = { 19 | insert: `INSERT INTO rtree ( id, minX, maxX, minY, maxY, minZ, maxZ ) VALUES ( 20 | new.id, 21 | json_extract( json( '[' || json_extract( new.json, '$.geom.bbox' ) || ']' ), '$[0]' ), 22 | json_extract( json( '[' || json_extract( new.json, '$.geom.bbox' ) || ']' ), '$[2]' ), 23 | json_extract( json( '[' || json_extract( new.json, '$.geom.bbox' ) || ']' ), '$[1]' ), 24 | json_extract( json( '[' || json_extract( new.json, '$.geom.bbox' ) || ']' ), '$[3]' ), 25 | json_extract( new.json, '$.rank.min' ), 26 | json_extract( new.json, '$.rank.max' ) 27 | )`, 28 | delete: 'DELETE FROM rtree WHERE id = old.id' 29 | }; 30 | 31 | this.db.exec(`CREATE TRIGGER IF NOT EXISTS rtree_insert_trigger 32 | AFTER 
INSERT ON docs 33 | BEGIN ${triggers.insert}; END`); 34 | 35 | this.db.exec(`CREATE TRIGGER IF NOT EXISTS rtree_delete_trigger 36 | AFTER DELETE ON docs 37 | BEGIN ${triggers.delete}; END`); 38 | 39 | this.db.exec(`CREATE TRIGGER IF NOT EXISTS rtree_update_trigger 40 | AFTER UPDATE ON docs 41 | BEGIN ${triggers.delete}; ${triggers.insert}; END`); 42 | }; 43 | 44 | // ensure that the database schema matches what is expected by the codebase 45 | DocStore.prototype.checkSchema = function(){ 46 | Database.assertSchema(this.db, 'docs', [ 47 | { cid: 0, name: 'id', type: 'INTEGER', notnull: 0, dflt_value: null, pk: 1 }, 48 | { cid: 1, name: 'json', type: 'TEXT', notnull: 0, dflt_value: null, pk: 0 } 49 | ]); 50 | Database.assertSchema(this.db, 'rtree', [ 51 | { cid: 0, name: 'id', type: 'INT', notnull: 0, dflt_value: null, pk: 0 }, 52 | { cid: 1, name: 'minX', type: 'REAL', notnull: 0, dflt_value: null, pk: 0 }, 53 | { cid: 2, name: 'maxX', type: 'REAL', notnull: 0, dflt_value: null, pk: 0 }, 54 | { cid: 3, name: 'minY', type: 'REAL', notnull: 0, dflt_value: null, pk: 0 }, 55 | { cid: 4, name: 'maxY', type: 'REAL', notnull: 0, dflt_value: null, pk: 0 }, 56 | { cid: 5, name: 'minZ', type: 'REAL', notnull: 0, dflt_value: null, pk: 0 }, 57 | { cid: 6, name: 'maxZ', type: 'REAL', notnull: 0, dflt_value: null, pk: 0 } 58 | ]); 59 | }; 60 | 61 | DocStore.prototype.set = function( id, doc, cb ){ 62 | 63 | // create prepared statement 64 | var stmt = this.prepare('INSERT INTO docs (id, json) VALUES ($id, $json)'); 65 | 66 | try { 67 | stmt.run({ id: id, json: DocStore.codec.encode( doc ) }); 68 | return cb( null ); 69 | } catch ( err ){ 70 | console.error( err ); 71 | console.error( stmt.source ); 72 | console.error( id, doc ); 73 | return cb( err ); 74 | } 75 | }; 76 | 77 | DocStore.prototype.get = function( id, cb ){ 78 | 79 | // create prepared statement 80 | var stmt = this.prepare('SELECT json FROM docs WHERE id = ? LIMIT 1'); 81 | 82 | try { 83 | var doc = stmt.get( id ); 84 | if( !doc ){ return cb( 'not found' ); } 85 | return cb( null, DocStore.codec.decode( doc ) ); 86 | } catch ( err ){ 87 | console.error( err ); 88 | console.error( stmt.source ); 89 | return cb( err ); 90 | } 91 | }; 92 | 93 | DocStore.prototype.getMany = function( ids, cb ){ 94 | 95 | if( !Array.isArray( ids ) || !ids.length ){ 96 | return cb( null, [] ); 97 | } 98 | 99 | // create prepared statement 100 | var stmt = this.prepare('SELECT json FROM docs WHERE id IN ' + 101 | '(' + Array(ids.length).fill('?').join(',') + ')' 102 | ); 103 | 104 | // var stmt = this.prepare('SELECT json FROM docs WHERE id IN ( ? 
)'); 105 | 106 | try { 107 | var docs = stmt.all( ids ); 108 | if( !docs ){ return cb( 'not found' ); } 109 | return cb( null, docs.map( DocStore.codec.decode )); 110 | } catch ( err ){ 111 | console.error( err ); 112 | console.error( stmt.source ); 113 | console.error( ids ); 114 | return cb( err ); 115 | } 116 | }; 117 | 118 | // encode/decode json strings 119 | DocStore.codec = { 120 | encode: ( decoded ) => { 121 | return JSON.stringify( decoded ); 122 | }, 123 | decode: ( encoded ) => { 124 | return JSON.parse( encoded.json ); 125 | } 126 | }; 127 | 128 | module.exports = DocStore; 129 | -------------------------------------------------------------------------------- /prototype/query.js: -------------------------------------------------------------------------------- 1 | 2 | var async = require('async'); 3 | var util = require('util'); 4 | var Result = require('../lib/Result'); 5 | var debug = false; 6 | 7 | function reduce( index, res ){ 8 | 9 | // we are on the last subject for this iteration 10 | if( -1 === res.pos.subject ){ 11 | 12 | // we still have more object tokens to try 13 | // so we reset the iterators. 14 | if( res.pos.object > 1 ){ 15 | 16 | // reset (move on to the next object) 17 | res.reset = true; 18 | 19 | // we have more values to try, update the positions 20 | // move on to the next object and start checking subjects to its left 21 | res.pos.prev_object = res.pos.object; 22 | res.pos.object--; 23 | res.pos.subject = res.pos.object-1; 24 | } 25 | 26 | // we have run out of tokens (all object tokens used up) 27 | else { 28 | 29 | // we didn't match anything, so simply return the ids for 30 | // the rightmost token. 31 | if( !Object.keys(res.ids).length ){ 32 | const lastToken = res.group[ res.group.length -1 ]; 33 | return index.matchSubjectDistinctSubjectIds( lastToken, ( err, rows ) => { 34 | res.intersect( err, rows ); 35 | return res.done( null, res ); 36 | }); 37 | } 38 | 39 | // we are done, return the result 40 | return res.done( null, res ); 41 | } 42 | } 43 | 44 | if( debug ){ 45 | if( res.reset ){ console.error( 'RESET!!' 
); } 46 | console.log( '---------------------------------------------------' ); 47 | console.log( util.format( '"%s" >>> "%s"', res.getSubject(), res.getObject() ) ); 48 | } 49 | 50 | // reset 51 | if( res.reset ){ 52 | res.reset = false; // return to default value 53 | index.matchSubjectDistinctSubjectIds( res.getPreviousObject(), (err, rows) => { 54 | res.intersect( err, rows ); 55 | reduce( index, res ); 56 | }); 57 | } 58 | 59 | // regular query 60 | else { 61 | index.matchSubjectObject( res.getSubject(), res.getObject(), (err, rows) => { 62 | 63 | // perform a query for nearby features and include them in the results 64 | if( !rows || rows.length === 0 ){ 65 | index.matchSubjectObjectGeomIntersects( res.getSubject(), res.getObject(), (err2, rows2) => { 66 | res.intersect( err2, rows2 ); 67 | reduce( index, res ); 68 | }); 69 | } 70 | 71 | // do not perform a nearby search 72 | else { 73 | res.intersect( err, rows ); 74 | reduce( index, res ); 75 | } 76 | }); 77 | } 78 | } 79 | 80 | // query a single group 81 | function _queryGroup( index, group, done ){ 82 | 83 | // handle empty group 84 | if( !group || !group.length ){ 85 | return done( null, new Result() ); 86 | } 87 | 88 | reduce( index, new Result( group, done ) ); 89 | } 90 | 91 | // query many groups & merge the result 92 | function _queryManyGroups( index, groups, done ){ 93 | 94 | // handle empty groups 95 | if( !groups || !groups.length ){ 96 | return done( null, new Result() ); 97 | } 98 | 99 | // query each group in parallel 100 | // note: parallel likely doesn't have much of a perf gain when 101 | // using the 'better-sqlite3' npm module. 102 | async.parallel( groups.map( group => cb => { 103 | _queryGroup( index, group, ( err, res ) => { 104 | cb( null, { err: err, res: res }); 105 | }); 106 | }), function mergeQueryGroupResults( err, batch ) { 107 | 108 | var merged = new Result(); 109 | merged.group = batch[0].res.group; 110 | merged.mask = batch[0].res.mask; 111 | 112 | // merge results 113 | batch.forEach( b => { 114 | if( b.err ){ return; } 115 | 116 | // merge ids 117 | for( var attr in b.res.ids ){ 118 | merged.ids[ attr ] = b.res.ids[ attr ]; 119 | } 120 | 121 | // merge mask 122 | b.res.mask.forEach(( bool, pos ) => { 123 | if( true === bool ){ merged.mask[ pos ] = bool; } 124 | }); 125 | }); 126 | 127 | // @todo find a way of returning all masks/groups 128 | // instead of only the first element 129 | return done( err, merged ); 130 | }); 131 | } 132 | 133 | function query( text, done ){ 134 | this.tokenize( text, function( err, groups ){ 135 | 136 | switch( groups.length ){ 137 | 138 | // in a failure case we didn't find any groups; abort now 139 | case 0: return done( null, new Result() ); 140 | 141 | // in most cases there is only one group to query 142 | case 1: return _queryGroup( this.index, groups[0], done ); 143 | 144 | // for queries with multiple groups, we query each 145 | // group and then merge the results together.
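// the merge is a union: the ids of every group which matched are pooled
// together, and a mask position is set wherever any group has set it.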
146 | default: return _queryManyGroups( this.index, groups, done ); 147 | } 148 | 149 | }.bind(this)); 150 | } 151 | 152 | module.exports.query = query; 153 | module.exports._queryGroup = _queryGroup; 154 | module.exports._queryManyGroups = _queryManyGroups; 155 | -------------------------------------------------------------------------------- /server/routes/search.js: -------------------------------------------------------------------------------- 1 | 2 | const _ = require('lodash'); 3 | const util = require('./_util'); 4 | const PARTIAL_TOKEN_SUFFIX = require('../../lib/analysis').PARTIAL_TOKEN_SUFFIX; 5 | 6 | module.exports = function( req, res ){ 7 | 8 | // placeholder 9 | var ph = req.app.locals.ph; 10 | 11 | // input text 12 | var text = req.query.text || ''; 13 | 14 | // placetype filter 15 | var filter = { placetype: util.arrayParam( req.query.placetype ) }; 16 | 17 | // live mode (autocomplete-style search) 18 | // we append a byte indicating the last word is potentially incomplete. 19 | // except where the last token is a space, then we simply trim the space. 20 | if( req.query.mode === 'live' ){ 21 | if( ' ' === text.slice(-1) ){ 22 | text = text.trim(); 23 | } else { 24 | text += PARTIAL_TOKEN_SUFFIX; 25 | } 26 | } 27 | 28 | // perform query 29 | console.time('took'); 30 | ph.query( text, ( err, result ) => { 31 | console.timeEnd('took'); 32 | 33 | // language property 34 | var lang; 35 | if( 'string' === typeof req.query.lang && req.query.lang.length === 3 ){ 36 | lang = req.query.lang.toLowerCase(); 37 | } 38 | 39 | // fetch all result docs by id 40 | ph.store.getMany( result.getIdsAsArray(), function( err, documents ){ 41 | if( err ){ return res.status(500).send(err); } 42 | if( !documents || !documents.length ){ return res.status(200).send([]); } 43 | 44 | // placetype filter 45 | if( Array.isArray( filter.placetype ) && filter.placetype.length ){ 46 | documents = documents.filter(res => _.includes( filter.placetype, res.placetype )); 47 | } 48 | 49 | // get a list of parent ids 50 | const parentIds = getParentIds( documents ); 51 | 52 | // load all the parents 53 | ph.store.getMany( parentIds, ( err, parentResults ) => { 54 | 55 | // a database error occurred 56 | if( err ){ console.error( 'error fetching parent ids', err ); } 57 | 58 | // handle case where the database was unable to return any rows 59 | parentResults = parentResults || []; 60 | 61 | // create a map of parents 62 | const parents = rowsToIdMap( parentResults ); 63 | 64 | // map documents to dict using id as key 65 | const docs = documents.map( function( result ){ 66 | return mapResult( ph, result, parents, lang ); 67 | }); 68 | 69 | // sort documents according to sorting rules 70 | docs.sort( sortingAlgorithm ); 71 | 72 | // send json 73 | res.status(200).json( docs ); 74 | }); 75 | }); 76 | }); 77 | }; 78 | 79 | /** 80 | sort highest 'population' first, using 'geom.area' as a second 81 | sorting condition where population data is not available. 
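eg: where two records both lack population data, the one covering the larger area sorts first.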
82 | **/ 83 | function sortingAlgorithm( a, b ){ 84 | 85 | // condition 1 - population 86 | const a1 = a.population || 0; 87 | const b1 = b.population || 0; 88 | 89 | // condition 2 - geom.area 90 | const a2 = a.geom && a.geom.area || 0; 91 | const b2 = b.geom && b.geom.area || 0; 92 | 93 | if( a1 < b1 ){ return +1; } 94 | if( a1 > b1 ){ return -1; } 95 | if( a2 < b2 ){ return +1; } 96 | if( a2 > b2 ){ return -1; } 97 | return 0; 98 | } 99 | 100 | function mapResult( ph, result, parents, lang ){ 101 | 102 | // swap languages 103 | if( Array.isArray( result.names[lang] ) && result.names[lang].length ){ 104 | result.name = result.names[lang][0]; 105 | result.languageDefaulted = false; 106 | } else { 107 | result.languageDefaulted = true; 108 | } 109 | 110 | // delete language properties 111 | delete result.names; 112 | 113 | // delete rank properties 114 | delete result.rank; 115 | 116 | result.lineage = result.lineage.map( function( lineage ){ 117 | return mapLineage( ph, lineage, parents, lang ); 118 | }); 119 | return result; 120 | } 121 | 122 | function mapLineage( ph, lineage, parents, lang ){ 123 | const res = {}; 124 | 125 | for( var attr in lineage ){ 126 | var parent = parents[ lineage[ attr ] ]; 127 | 128 | if( !parent ){ 129 | console.error( 'parent not found!', attr, lineage[ attr ] ); 130 | continue; 131 | } 132 | 133 | var name = parent.name; 134 | var languageDefaulted = true; 135 | 136 | // swap languages 137 | if( Array.isArray( parent.names[lang] ) && parent.names[lang].length ){ 138 | languageDefaulted = false; 139 | name = parent.names[lang][0]; 140 | } 141 | 142 | res[ parent.placetype ] = { 143 | id: parent.id, 144 | name: name, 145 | abbr: parent.abbr, 146 | languageDefaulted: languageDefaulted 147 | }; 148 | } 149 | 150 | return res; 151 | } 152 | 153 | // convert array of results to map using id as key 154 | function rowsToIdMap( rows ){ 155 | const map = {}; 156 | rows.forEach( function( row ){ 157 | map[ row.id ] = row; 158 | }); 159 | return map; 160 | } 161 | 162 | // get a unique array of parent ids 163 | function getParentIds( results ){ 164 | const parentIds = {}; 165 | results.forEach( function( row ){ 166 | row.lineage.forEach( function( lineage ){ 167 | for( var attr in lineage ){ 168 | parentIds[ lineage[attr] ] = true; 169 | } 170 | }); 171 | }); 172 | return Object.keys( parentIds ); 173 | } 174 | -------------------------------------------------------------------------------- /server/http.js: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | The http server improves performance on multicore machines by using the 4 | node core 'cluster' module to fork worker processes. 5 | 6 | The default setting is to use all available CPUs; this will spawn 32 child 7 | processes on a 32 core machine. 8 | 9 | If you would like to disable this feature (maybe because you are running 10 | inside a container) then you can do so by setting the env var CPUS=1 11 | 12 | You may also specify exactly how many child processes you would like to 13 | spawn by setting the env var to a numeric value >1, eg CPUS=4 14 | 15 | If the CPUS env var is set less than 1 or greater than os.cpus().length 16 | then the default setting will be used (using all available cores).
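eg: CPUS=1 node server/http.js (single process)
eg: CPUS=4 node server/http.js (master process plus 4 workers)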
17 | **/ 18 | 19 | const os = require('os'); 20 | const morgan = require('morgan'); 21 | const express = require('express'); 22 | const cluster = require('cluster'); 23 | const through = require('through2'); 24 | const _ = require('lodash'); 25 | 26 | const Placeholder = require('../Placeholder.js'); 27 | const logger = require('pelias-logger').get('placeholder'); 28 | 29 | // select the number of cpus we will use 30 | const envCpus = parseInt( process.env.CPUS, 10 ); 31 | const cpus = Math.min( Math.max( envCpus || Infinity, 1 ), os.cpus().length ); 32 | 33 | // optionally override port/host using env var 34 | var PORT = process.env.PORT || 3000; 35 | var HOST = process.env.HOST || undefined; 36 | var app = express(); 37 | 38 | // store the express http server so it can be terminated gracefully later 39 | let server; 40 | 41 | // record whether the service is terminating to control what events are worth logging 42 | let terminating = false; 43 | 44 | function log() { 45 | morgan.token('url', (req, res) => { 46 | // if there's a DNT header, just return '/' as the URL 47 | if (['DNT', 'dnt', 'do_not_track'].some(header => _.has(req.headers, header))) { 48 | return _.get(req, 'route.path'); 49 | } else { 50 | return req.originalUrl; 51 | } 52 | }); 53 | 54 | // 'short' format includes response time but leaves out date 55 | return morgan('short', { 56 | stream: through( function write( ln, _, next ){ 57 | logger.info( ln.toString().trim() ); 58 | next(); 59 | }) 60 | }); 61 | } 62 | 63 | // make sure that logging is the first thing that happens for all endpoints 64 | app.use(log()); 65 | 66 | // init placeholder 67 | var ph = new Placeholder({ readonly: true }); 68 | ph.load(); 69 | 70 | // ensure the database schemas match what is expected by the codebase.
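// (the schemas of both the document store and the token index are verified,
// see the checkSchema implementations in lib/DocStore.js and lib/TokenIndex.js)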
71 | try { ph.checkSchema(); } 72 | catch( e ){ 73 | console.info('------------------------------------------------------'); 74 | console.error('Database schema is out-of-date!'); 75 | console.info('Your database files do not match the expected schema.'); 76 | console.info('Please follow instructions in the README to obtain new database files.'); 77 | console.info('This is the expected behaviour for breaking schema updates.'); 78 | console.info('more info: https://github.com/pelias/placeholder'); 79 | console.info('------------------------------------------------------'); 80 | process.exit(1); 81 | } 82 | 83 | // store $ph on app 84 | app.locals.ph = ph; 85 | 86 | // generic http headers 87 | app.use((req, res, next) => { 88 | res.header('Charset','utf8'); 89 | res.header('Cache-Control','public, max-age=120'); 90 | next(); 91 | }); 92 | 93 | // routes 94 | app.get( '/parser/search', require( './routes/search' ) ); 95 | app.get( '/parser/findbyid', require( './routes/findbyid' ) ); 96 | app.get( '/parser/query', require( './routes/query' ) ); 97 | app.get( '/parser/tokenize', require( './routes/tokenize' ) ); 98 | 99 | // demo page 100 | app.use('/demo', express.static( __dirname + '/demo' )); 101 | app.use('/', (req, res) => { res.redirect('/demo#eng'); }); 102 | 103 | // handle SIGINT and SIGTERM (required for fast docker restarts) 104 | function handler() { 105 | ph.close(); 106 | 107 | terminating = true; 108 | if (cluster.isMaster) { 109 | logger.info('Placeholder service shutting down'); 110 | for (const id in cluster.workers) { 111 | cluster.workers[id].kill('SIGINT'); 112 | cluster.workers[id].disconnect(); 113 | } 114 | } 115 | 116 | if (server) { 117 | server.close(); 118 | } 119 | } 120 | 121 | process.on('SIGINT', handler); 122 | process.on('SIGTERM', handler); 123 | 124 | // start multi-threaded server 125 | if( cpus > 1 ){ 126 | if( cluster.isMaster ){ 127 | logger.info('[master] using %d cpus', cpus); 128 | 129 | // worker exit event 130 | cluster.on('exit', (worker, code, signal) => { 131 | if (!terminating) { 132 | logger.error('[master] worker died', worker.process.pid); 133 | } 134 | }); 135 | 136 | // worker fork event 137 | cluster.on('fork', (worker, code, signal) => { 138 | logger.info('[master] worker forked', worker.process.pid); 139 | }); 140 | 141 | // fork workers 142 | for( var c=0; c<cpus; c++ ){ cluster.fork(); } 143 | } 144 | 145 | // start worker processes 146 | else { 147 | server = app.listen( PORT, HOST, () => { 148 | logger.info('[worker %d] listening on %s:%s', process.pid, HOST||'0.0.0.0', PORT); 149 | }); 150 | } 151 | } 152 | 153 | // start single-threaded server 154 | else { 155 | logger.info('[master] using %d cpus', cpus); 156 | 157 | server = app.listen( PORT, HOST, () => { 158 | logger.info('[master] listening on %s:%s', HOST||'0.0.0.0', PORT); 159 | }); 160 | } 161 | -------------------------------------------------------------------------------- /test/lib/analysis.js: -------------------------------------------------------------------------------- 1 | 2 | var analysis = require('../../lib/analysis'); 3 | 4 | module.exports.normalize = function(test, common) { 5 | var assert = runner.bind(null, test, 'normalize'); 6 | 7 | // Germanic substitutions 8 | assert( 'Schöneberg', [ 'schoneberg', 'schoeneberg' ] ); 9 | 10 | // apostrophe s 11 | assert( 'St. George\'s', [ 'st georges', 'st george' ] ); 12 | assert( 'St. George\‘s', [ 'st georges', 'st george' ] ); 13 | assert( 'St. 
George\’s', [ 'st georges', 'st george' ] ); 14 | 15 | // Punctuation substitutions 16 | assert( 'Straße', [ 'strasse' ] ); 17 | assert( 'Jǿ œ̆', [ 'jo oe' ] ); 18 | assert( 'orilẹ́ede manamari', [ 'orileede manamari' ] ); 19 | assert( 'z︠h︡ovkva', [ 'zhovkva' ] ); 20 | assert( 'Žovkva', [ 'zovkva' ] ); 21 | assert( 'Żółkiew', [ 'zolkiew' ] ); 22 | assert( 'Trinidad & Tobago', [ 'trinidad and tobago' ] ); 23 | 24 | // Tests to confirm the order of function execution 25 | // see: https://github.com/pelias/placeholder/pull/12#issuecomment-302437570 26 | test('order of execution', function(t) { 27 | t.deepEqual( analysis.normalize( 'İnceyol' ), [ 'inceyol' ] ); 28 | t.equal( analysis.normalize( 'İnceyol' )[0].length, 7 ); 29 | t.equal( analysis.normalize( 'İ' )[0].length, 1 ); 30 | t.end(); 31 | }); 32 | 33 | // Synonym contractions 34 | assert( 'SainT token sAiNt value saInt', [ 'st token st value st' ] ); 35 | assert( 'SaintE token sAinTe value saINte', [ 'ste token ste value ste' ] ); 36 | assert( 'FoRt token fORt value fOrT', [ 'ft token ft value ft' ] ); 37 | assert( 'MoUNt token mOUNt value mouNT', [ 'mt token mt value mt' ] ); 38 | 39 | // Synonym contractions - hyphens 40 | assert( 'Foo-Sainte-Bar', [ 'foostebar', 'foo ste bar' ] ); 41 | assert( 'Foo-Saint-Bar', [ 'foostbar', 'foo st bar' ] ); 42 | assert( 'Foo-Mount-Bar', [ 'foomtbar', 'foo mt bar' ] ); 43 | assert( 'Foo-Fort-Bar', [ 'fooftbar', 'foo ft bar' ] ); 44 | 45 | // Synonym - with/without official designation 46 | assert( 'County', [ 'county' ] ); 47 | assert( 'County Durham', [ 'county durham', 'durham' ] ); 48 | assert( 'County of Durham', [ 'county of durham', 'durham' ] ); 49 | assert( 'Durham County', [ 'durham county', 'durham' ] ); 50 | assert( 'County Two Words', [ 'county two words', 'two words' ] ); 51 | assert( 'County of Two Words', [ 'county of two words', 'two words' ] ); 52 | assert( 'Two Words County', [ 'two words county', 'two words' ] ); 53 | 54 | assert( 'City', [ 'city' ] ); 55 | assert( 'City London', [ 'city london' ] ); 56 | assert( 'City of London', [ 'city of london', 'london' ] ); 57 | assert( 'London City', [ 'london city' ] ); 58 | assert( 'City Salt Lake', [ 'city salt lake' ] ); 59 | assert( 'City of Salt Lake', [ 'city of salt lake', 'salt lake' ] ); 60 | assert( 'New York City', [ 'new york city' ] ); 61 | assert( 'City New York', [ 'city new york' ] ); 62 | assert( 'City of New York', [ 'city of new york', 'new york' ] ); 63 | assert( 'New York City', [ 'new york city' ] ); 64 | 65 | assert( 'City of the Sun', [ 'city of the sun' ] ); 66 | assert( 'City of Sun', [ 'city of sun', 'sun' ] ); 67 | 68 | // https://en.wikipedia.org/wiki/Charter_township 69 | assert( 'Word Charter Township', [ 'word charter township', 'word township' ] ); 70 | assert( 'Two Words Charter Township', [ 'two words charter township', 'two words township' ] ); 71 | 72 | // remove 'disambiguation' tokens from name suffix 73 | // see: https://github.com/whosonfirst-data/whosonfirst-data/issues/885 74 | assert( 'St Kilda (Vic.)', [ 'st kilda' ] ); 75 | assert( 'Spring Mountain (Qld)', [ 'spring mountain' ] ); 76 | assert( 'Mónaco - Monaco', [ 'monaco' ] ); 77 | assert( 'Monako (peyi)', [ 'monako' ] ); 78 | assert( 'Monako [peyi]', [ 'monako' ] ); 79 | assert( 'Port Phillip (C)', [ 'port phillip' ] ); 80 | assert( 'Portland (Oregon)', [ 'portland' ] ); 81 | assert( 'Sutherland Shire (A)', [ 'sutherland shire' ] ); 82 | assert( 'Cocos- [Keeling] eilande', [ 'cocos' ] ); 83 | 84 | // remove tokens that *only* contain numbers 
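// (a token consisting solely of digits, such as a bare house number, is not
// useful as a coarse place name)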
85 | assert( '1', [] );
86 | assert( '22', [] );
87 | assert( '333', [] );
88 | assert( '22nd', ['22nd'] );
89 | assert( 'a12', ['a12'] );
90 | assert( '-1', [] ); // special case: handle '-1' values
91 | assert( '1 -1', [] );
92 | assert( '1 --1', [] );
93 | assert( '1 (foo)', [] );
94 | assert( '1 [foo]', [] );
95 | };
96 |
97 | module.exports.tokenize = function(test, common) {
98 | var assert = runner.bind(null, test, 'tokenize');
99 |
100 | // invalid type
101 | assert( [], [] );
102 | assert( {}, [] );
103 |
104 | // delimiters
105 | assert( 'Foo Bar', [[ 'foo', 'bar' ]] );
106 | assert( 'Foo,,Bar', [[ 'foo', 'bar' ]] );
107 | assert( 'Foo\'\'Bar', [[ 'foobar' ], [ 'foo', 'bar' ]] );
108 | assert( 'Foo‘‘Bar', [[ 'foobar' ], [ 'foo', 'bar' ]] );
109 | assert( 'Foo’’Bar', [[ 'foobar' ], [ 'foo', 'bar' ]] );
110 | assert( 'Foo\'’’Bar', [[ 'foobar' ], [ 'foo', 'bar' ]] );
111 | assert( 'Foo""Bar', [[ 'foo', 'bar' ]] );
112 |
113 | // not a delimiter
114 | assert( 'Foo..Bar', [[ 'foobar' ]] );
115 | assert( 'West L.A.', [[ 'west', 'la' ]] );
116 |
117 | // synonymous punctuation
118 | assert( 'Foo-Bar', [[ 'foobar' ], [ 'foo', 'bar' ]] );
119 | assert( 'Tol\'yatti', [[ 'tolyatti' ], [ 'tol', 'yatti' ]] );
120 | assert( 'Sendai-shi', [[ 'sendaishi' ], [ 'sendai', 'shi' ]] );
121 | };
122 |
123 | module.exports.minor_to_major = function(test, common) {
124 |
125 | var isMajorToMinor = function( str ){
126 | return analysis.REGEX_MAJOR_TO_MINOR.test( str );
127 | };
128 |
129 | test( 'minor-to-major', function(t) {
130 | t.false( isMajorToMinor('London, UK'), 'English' );
131 | t.false( isMajorToMinor('Köln Deutschland'), 'German' );
132 | t.false( isMajorToMinor('Orléans Nîmes Besançon'), 'French' );
133 | t.end();
134 | });
135 |
136 | test( 'major-to-minor', function(t) {
137 | t.true( isMajorToMinor('г.Москва'), 'Russian' );
138 | t.true( isMajorToMinor('경기도 광명시'), 'Korean' );
139 | t.true( isMajorToMinor('ישראל'), 'Hebrew' );
140 | t.true( isMajorToMinor('دبي'), 'Arabic' );
141 | t.end();
142 | });
143 | };
144 |
145 | // convenience function for writing quick 'n easy test cases
146 | function runner( test, method, actual, expected ){
147 | test( actual, function(t) {
148 | t.deepEqual( analysis[method]( actual ), expected );
149 | t.end();
150 | });
151 | }
152 |
--------------------------------------------------------------------------------
/lib/Queries.js:
--------------------------------------------------------------------------------
1 |
2 | // load SQL queries from filesystem
3 | const query = require('../query/index');
4 | const PARTIAL_TOKEN_SUFFIX = require('./analysis').PARTIAL_TOKEN_SUFFIX;
5 | const REMOVE_PARTIAL_TOKEN_REGEX = new RegExp(PARTIAL_TOKEN_SUFFIX, 'g');
6 | const MAX_RESULTS = 100;
7 | const DEBUG = false;
8 |
9 | // set threshold bounds between 0.0-1.0 (degrees), defaults to 0.2
10 | const RTREE_ENV = parseFloat( process.env.RTREE_THRESHOLD );
11 | const RTREE_THRESHOLD = !isNaN( RTREE_ENV ) ?
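// clamp the parsed value into [0,1]; e.g. RTREE_THRESHOLD=5 is treated as 1.0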
Math.max( 0, Math.min( 1, RTREE_ENV ) ) : 0.2; 12 | 13 | function debug( stmt, args, cb ){ 14 | if( !DEBUG ){ return cb; } 15 | var query = renderQuery( stmt, args ); 16 | var start = new Date().getTime(); 17 | return function() { 18 | var took = new Date().getTime() - start; 19 | console.error('\x1b[1m' + query + '\x1b[0m'); 20 | console.error('\x1b[1;93mtook', took + 'ms\x1b[0m'); 21 | console.error('---------------------------------------------------------'); 22 | cb.apply( null, Array.prototype.slice.call( arguments ) ); 23 | }; 24 | } 25 | 26 | // debug statement and args 27 | function renderQuery( stmt, args ){ 28 | var output = stmt.source; 29 | Object.keys( args ).forEach( key => { 30 | output = output.replace('$' + key, '\'' + args[ key ] + '\''); 31 | }); 32 | return output; 33 | } 34 | 35 | // generic boolean query 36 | module.exports._queryBool = function( stmt, args, cb ){ 37 | cb = debug( stmt, args, cb ); 38 | try { 39 | var row = stmt.get( args ); 40 | return cb( undefined !== row ); 41 | } catch ( err ){ 42 | console.error( err ); 43 | return cb( false ); 44 | } 45 | }; 46 | 47 | // generic all query 48 | module.exports._queryAll = function( stmt, args, cb ){ 49 | cb = debug( stmt, args, cb ); 50 | try { 51 | var rows = stmt.all( args ); 52 | if( !Array.isArray( rows ) ){ return cb( null, [] ); } 53 | return cb( null, rows ); 54 | } catch ( err ){ 55 | console.error( err ); 56 | return cb( err ); 57 | } 58 | }; 59 | 60 | // cb( bool ) whether a 'subject' value exists in the db 61 | module.exports.hasSubject = function( subject, cb ){ 62 | var isPartialToken = subject.slice(-1) === PARTIAL_TOKEN_SUFFIX; 63 | subject = subject.replace(/ /g, '_').replace(REMOVE_PARTIAL_TOKEN_REGEX, ''); 64 | 65 | // no-op for empty string 66 | if( '' === subject.trim() ){ return cb( null, [] ); } 67 | 68 | if( isPartialToken ){ 69 | this._queryBool( 70 | this.prepare( query.has_subject_autocomplete ), 71 | { subject: `"${subject}" OR "${subject}"*` }, 72 | cb 73 | ); 74 | } else { 75 | this._queryBool( 76 | this.prepare( query.has_subject_autocomplete ), 77 | { subject: `"${subject}"` }, 78 | cb 79 | ); 80 | } 81 | }; 82 | 83 | module.exports.matchSubjectDistinctSubjectIds = function( subject, cb ){ 84 | var isPartialToken = subject.slice(-1) === PARTIAL_TOKEN_SUFFIX; 85 | 86 | // no-op for empty string 87 | if( '' === subject.trim() ){ return cb( null, [] ); } 88 | 89 | if( isPartialToken ){ 90 | subject = subject.replace(/ /g, '_').replace(REMOVE_PARTIAL_TOKEN_REGEX, ''); 91 | if( '' === subject.trim() ){ return cb( null, [] ); } 92 | 93 | this._queryAll( 94 | this.prepare( query.match_subject_autocomplete_distinct_subject_ids ), 95 | { subject: `"${subject}" OR "${subject}"*`, limit: MAX_RESULTS }, 96 | cb 97 | ); 98 | } else { 99 | this._queryAll( 100 | this.prepare( query.match_subject_distinct_subject_ids ), 101 | { subject: subject, limit: MAX_RESULTS }, 102 | cb 103 | ); 104 | } 105 | }; 106 | 107 | module.exports.matchSubjectObject = function( subject, object, cb ){ 108 | var isPartialToken = object.slice(-1) === PARTIAL_TOKEN_SUFFIX; 109 | 110 | // no-op for empty string 111 | if( '' === subject.trim() ){ return cb( null, [] ); } 112 | if( '' === object.trim() ){ return cb( null, [] ); } 113 | 114 | if( isPartialToken ){ 115 | object = object.replace(/ /g, '_').replace(REMOVE_PARTIAL_TOKEN_REGEX, ''); 116 | if( '' === object.trim() ){ return cb( null, [] ); } 117 | 118 | if (this._hasTooManyCombinations(subject, object)) { return cb( null, [] ); } 119 | 120 | this._queryAll( 121 | 
this.prepare( query.match_subject_object_autocomplete ),
122 | {
123 | subject: subject,
124 | object: `${object}%`,
125 | limit: MAX_RESULTS
126 | },
127 | cb
128 | );
129 | } else {
130 | if (this._hasTooManyCombinations(subject, object)) { return cb( null, [] ); }
131 |
132 | this._queryAll(
133 | this.prepare( query.match_subject_object ),
134 | {
135 | subject: subject,
136 | object: object,
137 | limit: MAX_RESULTS
138 | },
139 | cb
140 | );
141 | }
142 | };
143 |
144 | module.exports._hasTooManyCombinations = function(subject, object) {
145 | const terms = [ subject, object ];
146 | const stmt = this.prepare(query.count_tokens);
147 |
148 | const counts = terms.map(token => stmt.get({ token_quoted: `"${token}"` }).cnt);
149 | const combinations = counts.reduce((a, b) => a * b, 1);
150 |
151 | return combinations >= 1e6;
152 | };
153 |
154 | module.exports.matchSubjectObjectGeomIntersects = function( subject, object, cb ){
155 | var isPartialToken = object.slice(-1) === PARTIAL_TOKEN_SUFFIX;
156 |
157 | // no-op for empty string
158 | if( '' === subject.trim() ){ return cb( null, [] ); }
159 | if( '' === object.trim() ){ return cb( null, [] ); }
160 |
161 | // no-op when threshold is less than 0
162 | if( 0 > RTREE_THRESHOLD ){ return cb( null, [] ); }
163 |
164 | if( isPartialToken ){
165 | object = object.replace(/ /g, '_').replace(REMOVE_PARTIAL_TOKEN_REGEX, '');
166 | if( '' === object.trim() ){ return cb( null, [] ); }
167 |
168 | if (this._hasTooManyCombinations(subject, object)) { return cb( null, [] ); }
169 |
170 | this._queryAll(
171 | this.prepare( query.match_subject_object_geom_intersects_autocomplete ),
172 | {
173 | subject,
174 | object,
175 | subject_quoted: `"${subject}"`,
176 | object_quoted: `"${object}"`,
177 | threshold: RTREE_THRESHOLD,
178 | limit: MAX_RESULTS
179 | },
180 | cb
181 | );
182 | } else {
183 | if (this._hasTooManyCombinations(subject, object)) { return cb( null, [] ); }
184 |
185 | this._queryAll(
186 | this.prepare( query.match_subject_object_geom_intersects ),
187 | {
188 | subject,
189 | object,
190 | subject_quoted: `"${subject}"`,
191 | object_quoted: `"${object}"`,
192 | threshold: RTREE_THRESHOLD,
193 | limit: MAX_RESULTS
194 | },
195 | cb
196 | );
197 | }
198 | };
199 |
--------------------------------------------------------------------------------
/test/lib/TokenIndex.js:
--------------------------------------------------------------------------------
1 |
2 | var TokenIndex = require('../../lib/TokenIndex');
3 |
4 | module.exports.constructor = function(test, common) {
5 | test('constructor', function(t) {
6 | var db = new TokenIndex();
7 | t.equal( db.constructor.super_.name, 'Database' );
8 | t.equal( typeof db.reset, 'function' );
9 | t.equal( typeof db.populate, 'function' );
10 |
11 | t.equal( typeof db.setLineage, 'function' );
12 | t.equal( typeof db.setTokens, 'function' );
13 | t.end();
14 | });
15 | };
16 |
17 | module.exports.reset = function(test, common) {
18 | test('reset', function(t) {
19 | var db = new TokenIndex();
20 | db.open('/tmp/db', { test: true, reset: true });
21 |
22 | // ensure table has been created
23 | var sql = 'PRAGMA table_info(lineage)';
24 | t.deepEqual( db.prepare(sql).all(), [
25 | { cid: 0, name: 'id', type: 'INTEGER', notnull: 0, dflt_value: null, pk: 0 },
26 | { cid: 1, name: 'pid', type: 'INTEGER', notnull: 0, dflt_value: null, pk: 0 }
27 | ]);
28 |
29 | // ensure table has been created
30 | sql = 'PRAGMA table_info(tokens)';
31 | t.deepEqual( db.prepare(sql).all(), [
32 | { cid: 0, name: 'id',
type: 'INTEGER', notnull: 0, dflt_value: null, pk: 0 },
33 | { cid: 1, name: 'lang', type: 'STRING', notnull: 0, dflt_value: null, pk: 0 },
34 | { cid: 2, name: 'tag', type: 'STRING', notnull: 0, dflt_value: null, pk: 0 },
35 | { cid: 3, name: 'token', type: 'STRING', notnull: 0, dflt_value: null, pk: 0 }
36 | ]);
37 |
38 | // ensure table has been created
39 | sql = 'PRAGMA table_info(fulltext)';
40 | t.deepEqual( db.prepare(sql).all(), [
41 | { cid: 0, name: 'token', type: '', notnull: 0, dflt_value: null, pk: 0 }
42 | ]);
43 |
44 | // ensure fts table has been created with the correct options
45 | sql = `SELECT * FROM sqlite_master WHERE type='table' AND name='fulltext'`;
46 | const expected =
47 | 'CREATE VIRTUAL TABLE fulltext USING fts5( token, ' + [
48 | `tokenize="unicode61 remove_diacritics 0 tokenchars '_'"`,
49 | `prefix='1 2 3 4 5 6 7 8 9 10 11 12'`,
50 | 'columnsize=0'
51 | ].join(', ') + ')';
52 |
53 | t.deepEqual( db.prepare(sql).get().sql, expected );
54 | t.end();
55 | });
56 | };
57 |
58 | module.exports.checkSchema = function(test, common) {
59 | test('checkSchema - empty', function(t) {
60 | var db = new TokenIndex();
61 | db.open('/tmp/db', { test: true });
62 | t.throws(() => { db.checkSchema(); }, /schema invalid: table lineage/);
63 | t.end();
64 | });
65 | test('checkSchema - valid', function(t) {
66 | var db = new TokenIndex();
67 | db.open('/tmp/db', { test: true, reset: true });
68 | t.doesNotThrow(() => { db.checkSchema(); });
69 | t.end();
70 | });
71 | test('checkSchema - invalid lineage', function(t) {
72 | var db = new TokenIndex();
73 | db.open('/tmp/db', { test: true, reset: true });
74 | db.db.exec('DROP TABLE IF EXISTS lineage');
75 | t.throws(() => { db.checkSchema(); }, /schema invalid: table lineage/);
76 | t.end();
77 | });
78 | test('checkSchema - invalid tokens', function(t) {
79 | var db = new TokenIndex();
80 | db.open('/tmp/db', { test: true, reset: true });
81 | db.db.exec('DROP TABLE IF EXISTS tokens');
82 | t.throws(() => { db.checkSchema(); }, /schema invalid: table tokens/);
83 | t.end();
84 | });
85 | test('checkSchema - invalid fulltext', function(t) {
86 | var db = new TokenIndex();
87 | db.open('/tmp/db', { test: true, reset: true });
88 | db.db.exec('DROP TABLE IF EXISTS fulltext');
89 | t.throws(() => { db.checkSchema(); }, /schema invalid: table fulltext/);
90 | t.end();
91 | });
92 | };
93 |
94 | module.exports.populate = function(test, common) {
95 | test('populate', function(t) {
96 | var db = new TokenIndex();
97 | db.open('/tmp/db', { test: true, reset: true });
98 |
99 | // prepare some sql statements
100 | const fulltext = {
101 | query: db.prepare('SELECT * FROM fulltext')
102 | };
103 | const tokens = {
104 | insert: db.prepare('INSERT INTO tokens ( id, lang, tag, token ) VALUES ( $id, $lang, $tag, $token )')
105 | };
106 |
107 | // add some rows to the tokens table
108 | tokens.insert.run({ id: 1, lang: 'en', tag: 'test', token: 'hello world' });
109 | tokens.insert.run({ id: 2, lang: 'fr', tag: 'test', token: 'a b c' });
110 |
111 | // no rows in fulltext table
112 | t.deepEqual( fulltext.query.all(), [] );
113 |
114 | // run populate
115 | db.populate();
116 |
117 | // rows now exist in the fulltext table
118 | t.deepEqual( fulltext.query.all(), [
119 | { token: 'hello_world' },
120 | { token: 'a_b_c' }
121 | ]);
122 |
123 | t.end();
124 | });
125 | };
126 |
127 | module.exports.setLineage = function(test, common) {
128 | test('setLineage', function(t) {
129 | var db = new TokenIndex();
130 | db.open('/tmp/db', { test: true, reset:
true });
131 |
132 | t.plan(1);
133 |
134 | const id = 100;
135 | const pids = [ 200, 300 ];
136 |
137 | db.setLineage( id, pids, (err) => {
138 |
139 | // ensure rows have been created
140 | const sql = 'SELECT * FROM lineage';
141 | t.deepEqual( db.prepare(sql).all(), [
142 | { id: 100, pid: 200 },
143 | { id: 100, pid: 300 }
144 | ]);
145 |
146 | });
147 | });
148 | test('setLineage - empty pids array', function(t) {
149 | var db = new TokenIndex();
150 | db.open('/tmp/db', { test: true, reset: true });
151 |
152 | t.plan(1);
153 |
154 | db.setLineage( 1, [], (err, res) => {
155 | t.deepEqual( db.prepare('SELECT * FROM lineage').all(), []);
156 | });
157 | });
158 | };
159 |
160 | module.exports.setTokens = function(test, common) {
161 | test('setTokens', function(t) {
162 | var db = new TokenIndex();
163 | db.open('/tmp/db', { test: true, reset: true });
164 |
165 | t.plan(1);
166 |
167 | const id = 100;
168 | const tokens = [
169 | { lang: 'en', tag: 'abbr', body: 'test1' },
170 | { lang: 'fr', tag: 'variant', body: 'test2' }
171 | ];
172 |
173 | db.setTokens( id, tokens, (err) => {
174 |
175 | // ensure rows have been created
176 | const sql = 'SELECT * FROM tokens';
177 | t.deepEqual( db.prepare(sql).all(), [
178 | { id: 100, lang: 'en', tag: 'abbr', token: 'test1' },
179 | { id: 100, lang: 'fr', tag: 'variant', token: 'test2' }
180 | ]);
181 |
182 | });
183 | });
184 | test('setTokens - empty tokens array', function(t) {
185 | var db = new TokenIndex();
186 | db.open('/tmp/db', { test: true, reset: true });
187 |
188 | t.plan(1);
189 |
190 | db.setTokens( 1, [], (err, res) => {
191 | t.deepEqual( db.prepare('SELECT * FROM tokens').all(), []);
192 | });
193 | });
194 | };
195 |
--------------------------------------------------------------------------------
/config/language/whitelist.js:
--------------------------------------------------------------------------------
1 | /**
2 | This whitelist lists all the language codes we accept for import into placeholder.
3 |
4 | The whosonfirst dataset contains many disused and rarely-used languages which
5 | can cause issues when the source data has been machine-transliterated.
6 |
7 | The list is non-exhaustive and was originally sourced from Wikipedia and various
8 | online sources; I aimed to include the most commonly spoken languages worldwide.
9 |
10 | If you feel a language code is wrong or missing, please feel free to edit this file.
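
Each entry below whitelists a single ISO 639-3 code via its object key, for
example `module.exports.eng = '';` enables English; the empty-string values
appear to be unused, with membership presumably checked against the keys.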
11 | **/ 12 | 13 | // Chinese - 汉语/漢語 Hànyǔ or 中文 Zhōngwén 14 | module.exports.chi = ''; 15 | module.exports.zho = ''; 16 | module.exports.cdo = ''; 17 | module.exports.cjy = ''; 18 | module.exports.cmn = ''; 19 | module.exports.cpx = ''; 20 | module.exports.czh = ''; 21 | module.exports.czo = ''; 22 | module.exports.gan = ''; 23 | module.exports.hak = ''; 24 | module.exports.hsn = ''; 25 | module.exports.mnp = ''; 26 | module.exports.nan = ''; 27 | module.exports.wuu = ''; 28 | module.exports.yue = ''; 29 | module.exports.och = ''; 30 | module.exports.ltc = ''; 31 | module.exports.lzh = ''; 32 | 33 | // Spanish - español ("Spanish") and castellano ("Castilian") 34 | module.exports.esp = ''; 35 | module.exports.spa = ''; 36 | 37 | // English 38 | module.exports.eng = ''; 39 | 40 | // Arabic - العربية 41 | module.exports.ara = ''; 42 | module.exports.arq = ''; 43 | module.exports.aao = ''; 44 | module.exports.bbz = ''; 45 | module.exports.abv = ''; 46 | module.exports.shu = ''; 47 | module.exports.acy = ''; 48 | module.exports.adf = ''; 49 | module.exports.avl = ''; 50 | module.exports.arz = ''; 51 | module.exports.afb = ''; 52 | module.exports.ayh = ''; 53 | module.exports.acw = ''; 54 | module.exports.ayl = ''; 55 | module.exports.acm = ''; 56 | module.exports.ary = ''; 57 | module.exports.ars = ''; 58 | module.exports.apc = ''; 59 | module.exports.ayp = ''; 60 | module.exports.acx = ''; 61 | module.exports.aec = ''; 62 | module.exports.ayn = ''; 63 | module.exports.ssh = ''; 64 | module.exports.ajp = ''; 65 | module.exports.arb = ''; 66 | module.exports.apb = ''; 67 | module.exports.pga = ''; 68 | module.exports.acq = ''; 69 | module.exports.abh = ''; 70 | module.exports.aeb = ''; 71 | module.exports.auz = ''; 72 | 73 | // Hindi - मानक हिन्दी - Mānak Hindī 74 | module.exports.hin = ''; 75 | 76 | // Bengali - বাংলা - Bangla 77 | module.exports.ben = ''; 78 | 79 | // Portuguese - português 80 | module.exports.por = ''; 81 | 82 | // Russian - ру́сский язы́к - russkij jazyk 83 | module.exports.rus = ''; 84 | 85 | // Japanese - 日本語 - Nihongo 86 | module.exports.jpn = ''; 87 | 88 | // Punjabi 89 | module.exports.pan = ''; 90 | module.exports.pnb = ''; 91 | 92 | // German - Deutsch 93 | module.exports.ger = ''; 94 | module.exports.deu = ''; 95 | module.exports.gmh = ''; 96 | module.exports.goh = ''; 97 | module.exports.gct = ''; 98 | module.exports.bar = ''; 99 | module.exports.cim = ''; 100 | module.exports.geh = ''; 101 | module.exports.ksh = ''; 102 | module.exports.nds = ''; 103 | module.exports.sli = ''; 104 | module.exports.ltz = ''; 105 | module.exports.vmf = ''; 106 | module.exports.mhn = ''; 107 | module.exports.pfl = ''; 108 | module.exports.pdc = ''; 109 | module.exports.pdt = ''; 110 | module.exports.swg = ''; 111 | module.exports.gsw = ''; 112 | module.exports.uln = ''; 113 | module.exports.sxu = ''; 114 | module.exports.wae = ''; 115 | module.exports.wep = ''; 116 | module.exports.hrx = ''; 117 | module.exports.yec = ''; 118 | 119 | // Javanese 120 | module.exports.jav = ''; 121 | module.exports.jvn = ''; 122 | module.exports.jas = ''; 123 | module.exports.osi = ''; 124 | module.exports.tes = ''; 125 | module.exports.kaw = ''; 126 | 127 | // Malay 128 | module.exports.msa = ''; 129 | module.exports.kxd = ''; 130 | module.exports.ind = ''; 131 | module.exports.zsm = ''; 132 | module.exports.jax = ''; 133 | module.exports.meo = ''; 134 | module.exports.kvr = ''; 135 | module.exports.xmm = ''; 136 | module.exports.min = ''; 137 | module.exports.mui = ''; 138 | module.exports.zmi = 
''; 139 | module.exports.max = ''; 140 | module.exports.mfa = ''; 141 | 142 | // Lahnda 143 | module.exports.lah = ''; 144 | module.exports.hnd = ''; 145 | module.exports.hno = ''; 146 | module.exports.jat = ''; 147 | module.exports.phr = ''; 148 | module.exports.skr = ''; 149 | module.exports.xhe = ''; 150 | 151 | // Telugu 152 | module.exports.tel = ''; 153 | 154 | // Vietnamese 155 | module.exports.vie = ''; 156 | 157 | // Marathi 158 | module.exports.mar = ''; 159 | module.exports.omr = ''; 160 | 161 | // French - le français 162 | module.exports.fra = ''; 163 | module.exports.fre = ''; 164 | 165 | // Korean - 한국어 - Hangugeo 166 | module.exports.kor = ''; 167 | module.exports.jje = ''; 168 | module.exports.okm = ''; 169 | module.exports.oko = ''; 170 | 171 | // Tamil 172 | module.exports.tam = ''; 173 | module.exports.oty = ''; 174 | module.exports.ptq = ''; 175 | 176 | // Italian 177 | module.exports.ita = ''; 178 | 179 | // Urdu 180 | module.exports.urd = ''; 181 | 182 | // Tai-Kadai - ภาษาไต - p̣hās̛̄ā tay 183 | module.exports.tai = ''; 184 | 185 | // Thai 186 | module.exports.tha = ''; 187 | 188 | // Tagalog 189 | module.exports.tgl = ''; 190 | module.exports.fil = ''; 191 | 192 | // Swedish 193 | module.exports.swe = ''; 194 | 195 | // Turkish 196 | module.exports.tur = ''; 197 | 198 | // Gujarati 199 | module.exports.guj = ''; 200 | 201 | // Persian 202 | module.exports.fas = ''; 203 | module.exports.pes = ''; 204 | module.exports.prs = ''; 205 | module.exports.tgk = ''; 206 | module.exports.aiq = ''; 207 | module.exports.bhh = ''; 208 | module.exports.haz = ''; 209 | module.exports.jpr = ''; 210 | module.exports.phv = ''; 211 | module.exports.deh = ''; 212 | module.exports.jdt = ''; 213 | module.exports.ttt = ''; 214 | 215 | // Polish 216 | module.exports.pol = ''; 217 | module.exports.szl = ''; 218 | 219 | // Pashto 220 | module.exports.pus = ''; 221 | module.exports.pst = ''; 222 | module.exports.pbu = ''; 223 | module.exports.pbt = ''; 224 | module.exports.wne = ''; 225 | 226 | // Kannada 227 | module.exports.kan = ''; 228 | 229 | // Malayalam 230 | module.exports.mal = ''; 231 | 232 | // Sundanese 233 | module.exports.sun = ''; 234 | 235 | // Hausa 236 | module.exports.hau = ''; 237 | 238 | // Odia 239 | module.exports.ori = ''; 240 | module.exports.ory = ''; 241 | module.exports.spv = ''; 242 | module.exports.bpv = ''; 243 | module.exports.ort = ''; 244 | module.exports.dso = ''; 245 | 246 | // Romanian 247 | module.exports.rum = ''; 248 | module.exports.ron = ''; 249 | 250 | // Dutch 251 | module.exports.dut = ''; 252 | module.exports.nld = ''; 253 | module.exports.vls = ''; 254 | module.exports.zea = ''; 255 | 256 | // Hungarian 257 | module.exports.hun = ''; 258 | module.exports.ohu = ''; 259 | 260 | // Greek 261 | module.exports.gre = ''; 262 | module.exports.ell = ''; 263 | module.exports.grc = ''; 264 | module.exports.cpg = ''; 265 | module.exports.gmy = ''; 266 | module.exports.pnt = ''; 267 | module.exports.tsd = ''; 268 | module.exports.yej = ''; 269 | 270 | // Czech 271 | module.exports.cze = ''; 272 | module.exports.ces = ''; 273 | -------------------------------------------------------------------------------- /test/lib/Result.js: -------------------------------------------------------------------------------- 1 | const Result = require('../../lib/Result'); 2 | 3 | module.exports.constructor = function(test, common) { 4 | test('constructor', function(t) { 5 | const res = new Result(); 6 | 7 | t.equal( typeof res.getSubject, 'function' ); 8 | t.equal( typeof 
res.getObject, 'function' ); 9 | t.equal( typeof res.getPreviousObject, 'function' ); 10 | t.equal( typeof res.getIdsAsArray, 'function' ); 11 | t.equal( typeof res.setMask, 'function' ); 12 | t.equal( typeof res.intersect, 'function' ); 13 | 14 | t.deepEqual( res.group, [] ); 15 | t.deepEqual( res.ids, {} ); 16 | t.deepEqual( res.mask, [] ); 17 | t.deepEqual( res.pos, { subject: -2, object: -1 }); 18 | t.deepEqual( res.reset, false ); 19 | t.equal( typeof res.done, 'function' ); 20 | 21 | t.end(); 22 | }); 23 | 24 | test('constructor - set group', function(t) { 25 | const res = new Result(['a','b','c']); 26 | t.deepEqual( res.group, ['a','b','c'] ); 27 | t.deepEqual( res.mask, [false, false, false] ); 28 | t.end(); 29 | }); 30 | 31 | test('constructor - set group - invalid', function(t) { 32 | const res = new Result({ 0: 'a' }); 33 | t.deepEqual( res.group, [] ); 34 | t.end(); 35 | }); 36 | 37 | test('constructor - set done', function(t) { 38 | const done = function(){ console.error('test'); }; 39 | const res = new Result(undefined, done); 40 | t.equal( res.done, done ); 41 | t.end(); 42 | }); 43 | 44 | test('constructor - set done - invalid', function(t) { 45 | const res = new Result(undefined, {}); 46 | t.equal( typeof res.done, 'function' ); 47 | t.end(); 48 | }); 49 | }; 50 | 51 | module.exports.getSubject = function(test, common) { 52 | test('getSubject', function(t) { 53 | const res = new Result(); 54 | t.equal(res.getSubject(), undefined); 55 | 56 | const res2 = new Result(['a','b','c']); 57 | t.equal(res2.getSubject(), 'b'); 58 | 59 | const res3 = new Result(['a','b','c']); 60 | res3.pos.subject = 0; 61 | t.equal(res3.getSubject(), 'a'); 62 | 63 | t.end(); 64 | }); 65 | }; 66 | 67 | module.exports.getObject = function(test, common) { 68 | test('getObject', function(t) { 69 | const res = new Result(); 70 | t.equal(res.getObject(), undefined); 71 | 72 | const res2 = new Result(['a','b','c']); 73 | t.equal(res2.getObject(), 'c'); 74 | 75 | const res3 = new Result(['a','b','c']); 76 | res3.pos.object = 1; 77 | t.equal(res3.getObject(), 'b'); 78 | 79 | t.end(); 80 | }); 81 | }; 82 | 83 | module.exports.getPreviousObject = function(test, common) { 84 | test('getPreviousObject', function(t) { 85 | const res = new Result(); 86 | t.equal(res.getPreviousObject(), undefined); 87 | 88 | const res2 = new Result(['a','b','c']); 89 | t.equal(res2.getPreviousObject(), undefined); 90 | 91 | const res3 = new Result(['a','b','c']); 92 | res3.pos.prev_object = 1; 93 | t.equal(res3.getPreviousObject(), 'b'); 94 | 95 | t.end(); 96 | }); 97 | }; 98 | 99 | module.exports.getIdsAsArray = function(test, common) { 100 | test('getIdsAsArray', function(t) { 101 | const res = new Result(); 102 | t.deepEqual(res.getIdsAsArray(), []); 103 | 104 | const res2 = new Result(); 105 | res2.ids = { '200': true, '201': true, '202': true }; 106 | t.deepEqual(res2.getIdsAsArray(), [200, 201, 202]); 107 | 108 | t.end(); 109 | }); 110 | }; 111 | 112 | module.exports.setMask = function(test, common) { 113 | test('default mask', function(t) { 114 | const res = new Result(['a','b','c']); 115 | t.deepEqual(res.mask, [false, false, false]); 116 | t.end(); 117 | }); 118 | test('setMask - invalid property', function(t) { 119 | const res = new Result(['a','b','c']); 120 | t.deepEqual(res.mask, [false, false, false]); 121 | res.setMask('invalidproperty', true); 122 | t.deepEqual(res.mask, [false, false, false]); 123 | t.end(); 124 | }); 125 | test('setMask - subject - true', function(t) { 126 | const res = new Result(['a','b','c']); 
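// pos.subject defaults to the second-to-last index, so masking 'subject' here flips index 1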
127 | res.setMask('subject', true); 128 | t.deepEqual(res.mask, [false, true, false]); 129 | t.end(); 130 | }); 131 | test('setMask - subject - truthy', function(t) { 132 | const res = new Result(['a','b','c']); 133 | res.setMask('subject', 'non null string'); 134 | t.deepEqual(res.mask, [false, true, false]); 135 | t.end(); 136 | }); 137 | test('setMask - subject - false', function(t) { 138 | const res = new Result(['a','b','c']); 139 | res.mask = [true, true, true]; 140 | res.setMask('subject', false); 141 | t.deepEqual(res.mask, [true, false, true]); 142 | t.end(); 143 | }); 144 | test('setMask - subject - falsy', function(t) { 145 | const res = new Result(['a','b','c']); 146 | res.mask = [true, true, true]; 147 | res.setMask('subject', null); 148 | t.deepEqual(res.mask, [true, false, true]); 149 | t.end(); 150 | }); 151 | test('setMask - object - true', function(t) { 152 | const res = new Result(['a','b','c']); 153 | res.setMask('object', true); 154 | t.deepEqual(res.mask, [false, false, true]); 155 | t.end(); 156 | }); 157 | test('setMask - object - truthy', function(t) { 158 | const res = new Result(['a','b','c']); 159 | res.setMask('object', 'non null string'); 160 | t.deepEqual(res.mask, [false, false, true]); 161 | t.end(); 162 | }); 163 | test('setMask - object - false', function(t) { 164 | const res = new Result(['a','b','c']); 165 | res.mask = [true, true, true]; 166 | res.setMask('object', false); 167 | t.deepEqual(res.mask, [true, true, false]); 168 | t.end(); 169 | }); 170 | test('setMask - object - falsy', function(t) { 171 | const res = new Result(['a','b','c']); 172 | res.mask = [true, true, true]; 173 | res.setMask('object', null); 174 | t.deepEqual(res.mask, [true, true, false]); 175 | t.end(); 176 | }); 177 | }; 178 | 179 | module.exports.intersect = function(test, common) { 180 | test('intersect - error', function(t) { 181 | const res = new Result(['a','b','c','d','e']); 182 | t.deepEqual( res.pos, { subject: 3, object: 4 }); 183 | res.intersect( 'an error' ); 184 | t.deepEqual( res.pos, { subject: 2, object: 4 }); 185 | t.end(); 186 | }); 187 | test('intersect - no results', function(t) { 188 | const res = new Result(['a','b','c','d','e']); 189 | t.deepEqual( res.pos, { subject: 3, object: 4 }); 190 | res.intersect( null, [] ); 191 | t.deepEqual( res.pos, { subject: 2, object: 4 }); 192 | t.end(); 193 | }); 194 | test('intersect - match', function(t) { 195 | const res = new Result(['a','b','c','d','e']); 196 | t.deepEqual( res.pos, { subject: 3, object: 4 }); 197 | res.intersect( null, [ 198 | { subjectId: 102, objectId: 202 }, 199 | { subjectId: 105, objectId: 205 }, 200 | { subjectId: 100, objectId: 200 } 201 | ]); 202 | t.deepEqual( res.pos, { subject: 2, object: 3 }); 203 | t.deepEqual( res.ids, { 100: true, 102: true, 105: true }); 204 | t.end(); 205 | }); 206 | test('intersect - match parent', function(t) { 207 | const res = new Result(['a','b','c','d','e']); 208 | res.ids = { 200: true, 201: true, 202: true }; 209 | t.deepEqual( res.pos, { subject: 3, object: 4 }); 210 | res.intersect( null, [ 211 | { subjectId: 102, objectId: 202 }, 212 | { subjectId: 100, objectId: 200 }, 213 | { subjectId: 105, objectId: 205 } 214 | ]); 215 | t.deepEqual( res.pos, { subject: 2, object: 3 }); 216 | t.deepEqual( res.ids, { 100: true, 102: true }); 217 | t.end(); 218 | }); 219 | }; 220 | -------------------------------------------------------------------------------- /test/prototype/tokenize.js: -------------------------------------------------------------------------------- 1 
|
2 | const tokenize = require('../../prototype/tokenize');
3 | const PARTIAL_TOKEN_SUFFIX = require('../../lib/analysis').PARTIAL_TOKEN_SUFFIX;
4 |
5 | module.exports.exports = function(test, common) {
6 | test('exports', function(t) {
7 | t.equal( typeof tokenize.tokenize, 'function' );
8 | t.equal( typeof tokenize._indexContainsPhrase, 'function' );
9 | t.equal( typeof tokenize._eachSynonym, 'function' );
10 | t.equal( typeof tokenize._permutations, 'function' );
11 | t.equal( typeof tokenize._queryFilter, 'function' );
12 | t.equal( typeof tokenize._isArrayRangeIsEqual, 'function' );
13 | t.equal( typeof tokenize._groups, 'function' );
14 | t.end();
15 | });
16 | };
17 |
18 | // test if a phrase exists in the index
19 | module.exports._indexContainsPhrase = function(test, common) {
20 | test('_indexContainsPhrase - true', function(t) {
21 | t.plan(3);
22 | var mock = tokenize._indexContainsPhrase.bind({
23 | index: { hasSubject: ( phrase, cb ) => {
24 | t.equals(phrase, 'hello world');
25 | return cb( true );
26 | }}
27 | });
28 |
29 | mock('hello world', (err, bool) => {
30 | t.false(err);
31 | t.true(bool);
32 | });
33 | });
34 | test('_indexContainsPhrase - false', function(t) {
35 | t.plan(3);
36 | var mock = tokenize._indexContainsPhrase.bind({
37 | index: { hasSubject: ( phrase, cb ) => {
38 | t.equals(phrase, 'hello world');
39 | return cb( false );
40 | }}
41 | });
42 |
43 | mock('hello world', (err, bool) => {
44 | t.false(err);
45 | t.false(bool);
46 | });
47 | });
48 | test('_indexContainsPhrase - partial token - true', function(t) {
49 | t.plan(3);
50 | var mock = tokenize._indexContainsPhrase.bind({
51 | index: { hasSubject: ( phrase, cb ) => {
52 | t.equals(phrase, 'hello world' + PARTIAL_TOKEN_SUFFIX);
53 | return cb( true );
54 | }}
55 | });
56 |
57 | mock('hello world' + PARTIAL_TOKEN_SUFFIX, (err, bool) => {
58 | t.false(err);
59 | t.true(bool);
60 | });
61 | });
62 | test('_indexContainsPhrase - partial token - false', function(t) {
63 | t.plan(3);
64 | var mock = tokenize._indexContainsPhrase.bind({
65 | index: { hasSubject: ( phrase, cb ) => {
66 | t.equals(phrase, 'hello world' + PARTIAL_TOKEN_SUFFIX);
67 | return cb( false );
68 | }}
69 | });
70 |
71 | mock('hello world' + PARTIAL_TOKEN_SUFFIX, (err, bool) => {
72 | t.false(err);
73 | t.false(bool);
74 | });
75 | });
76 | };
77 |
78 | // expand each synonym into its permutations and check them against the database.
79 | module.exports._eachSynonym = function(test, common) {
80 | test('_eachSynonym', function(t) {
81 |
82 | const synonym = ['hello', 'big', 'bright', 'new', 'world'];
83 | const expected = [ 'hello big', 'bright', 'new world' ];
84 |
85 | var mock = tokenize._eachSynonym.bind({
86 | index: { hasSubject: ( phrase, cb ) => {
87 | switch( phrase ){
88 | case 'hello big':
89 | case 'hello new':
90 | case 'new world':
91 | case 'bright':
92 | case 'world':
93 | return cb( true );
94 | default:
95 | return cb( false );
96 | }
97 | }}
98 | });
99 |
100 | mock(synonym, (err, phrases) => {
101 | t.false(err);
102 | t.deepEqual(phrases, expected);
103 | t.end();
104 | });
105 | });
106 | };
107 |
108 | // _permutations takes an array of input tokens and produces
109 | // an output array consisting of all the potential adjacent
110 | // groupings of the input tokens up to the defined threshold.
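// An illustrative sketch of the expected output (the test case below is the
// authoritative reference):
//
//   _permutations(['a', 'b', 'c']);
//   // => [ 'a b c', 'a b', 'a', 'b c', 'b', 'c' ]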
111 | module.exports._permutations = function(test, common) {
112 | test('_permutations', function(t) {
113 |
114 | const tokens = ['new', 'south', 'wales'];
115 | const expected = [
116 | 'new south wales',
117 | 'new south',
118 | 'new',
119 | 'south wales',
120 | 'south',
121 | 'wales'
122 | ];
123 |
124 | t.deepEqual(tokenize._permutations(tokens), expected);
125 | t.end();
126 | });
127 | };
128 |
129 | // _queryFilter removes unwanted queries from the array before
130 | // they are returned to the caller.
131 | module.exports._queryFilter = function(test, common) {
132 | test('_queryFilter - remove empty arrays', function(t) {
133 |
134 | const queries = [[], ['a'], [], ['b','c'], [], ['d'], []];
135 | const expected = [['a'], ['b','c'], ['d']];
136 |
137 | t.deepEqual(tokenize._queryFilter(queries), expected);
138 | t.end();
139 | });
140 |
141 | // synonymous groupings
142 | // this removes queries such as `[ B, C ]` where another group such as
143 | // `[ A, B, C ]` exists.
144 | // see: https://github.com/pelias/placeholder/issues/28
145 | test('_queryFilter - synonymous groupings', function(t) {
146 |
147 | const queries = [
148 | ['A','B','C','D'], ['B','C','D'], ['C','D'], ['D'],
149 | ['A','B','C'], ['B','C'], ['C'],
150 | ['A','B']
151 | ];
152 | const expected = [
153 | ['A','B','C','D'],
154 | ['A','B','C'],
155 | ['A','B']
156 | ];
157 |
158 | t.deepEqual(tokenize._queryFilter(queries), expected);
159 | t.end();
160 | });
161 | };
162 |
163 | // _groups takes an array of input tokens, the tokens are first run through
164 | // the _permutations function above, each permutation is looked up in the db.
165 | // this function aims to select the best permutations to use for the query.
166 | // note: it strongly favours the longer token groupings
167 | module.exports._groups = function(test, common) {
168 | test('_groups', function(t) {
169 |
170 | const tokens = ['north', 'sydney', 'new', 'south', 'wales', 'au'];
171 | const phrases = [
172 | 'south wales','new south wales', 'wales', 'north', 'sydney',
173 | 'north sydney', 'south', 'au'
174 | ];
175 | const expected = ['north sydney', 'new south wales', 'au'];
176 |
177 | t.deepEqual(tokenize._groups(tokens, phrases), expected);
178 | t.end();
179 | });
180 |
181 | // https://github.com/pelias/placeholder/issues/231
182 | test('_groups "constructor"', function(t) {
183 |
184 | const tokens = ['constructor'];
185 | const phrases = [];
186 | const expected = [];
187 |
188 | t.deepEqual(tokenize._groups(tokens, phrases), expected);
189 | t.end();
190 | });
191 | };
192 |
193 | // _isArrayRangeIsEqual checks whether the second array matches a contiguous slice of the first array, starting at the given offset (default 0)
194 | module.exports._isArrayRangeIsEqual = function(test, common) {
195 | test('_isArrayRangeIsEqual', function(t) {
196 |
197 | const A = [1, 2, 3, 1, 2, 3];
198 | const B = [1, 2];
199 | const C = [3];
200 |
201 | t.true(tokenize._isArrayRangeIsEqual(A, B));
202 | t.true(tokenize._isArrayRangeIsEqual(A, B, 0));
203 | t.true(tokenize._isArrayRangeIsEqual(A, B, 3));
204 | t.false(tokenize._isArrayRangeIsEqual(A, B, 1));
205 | t.false(tokenize._isArrayRangeIsEqual(A, B, 2));
206 | t.false(tokenize._isArrayRangeIsEqual(A, B, 4));
207 | t.false(tokenize._isArrayRangeIsEqual(A, B, 5));
208 | t.false(tokenize._isArrayRangeIsEqual(A, B, 6));
209 | t.false(tokenize._isArrayRangeIsEqual(A, B, -1));
210 | t.false(tokenize._isArrayRangeIsEqual(A, B, Infinity));
211 |
212 | t.true(tokenize._isArrayRangeIsEqual(A, C, 2));
213 | t.true(tokenize._isArrayRangeIsEqual(A, C, 5));
214 | t.false(tokenize._isArrayRangeIsEqual(A, C));
215 |
t.false(tokenize._isArrayRangeIsEqual(A, C, 0));
216 | t.false(tokenize._isArrayRangeIsEqual(A, C, 1));
217 | t.false(tokenize._isArrayRangeIsEqual(A, C, 3));
218 | t.false(tokenize._isArrayRangeIsEqual(A, C, 4));
219 | t.false(tokenize._isArrayRangeIsEqual(A, C, 6));
220 | t.false(tokenize._isArrayRangeIsEqual(A, C, -1));
221 | t.false(tokenize._isArrayRangeIsEqual(A, C, Infinity));
222 |
223 | t.end();
224 | });
225 | };
226 |
227 |
--------------------------------------------------------------------------------
/test/lib/Queries.js:
--------------------------------------------------------------------------------
1 |
2 | var TokenIndex = require('../../lib/TokenIndex');
3 |
4 | module.exports.constructor = function(test, common) {
5 | test('constructor', function(t) {
6 | var db = new TokenIndex();
7 | t.equal( typeof db._queryBool, 'function' );
8 | t.equal( typeof db._queryAll, 'function' );
9 | t.equal( typeof db.hasSubject, 'function' );
10 | t.equal( typeof db.matchSubjectDistinctSubjectIds, 'function' );
11 | t.equal( typeof db.matchSubjectObject, 'function' );
12 | t.end();
13 | });
14 | };
15 |
16 | module.exports.hasSubject = function(test, common) {
17 | test('hasSubject', function(t) {
18 | var db = new TokenIndex();
19 | db.open('/tmp/db', { test: true, reset: true });
20 |
21 | // prepare some sql statements
22 | const tokens = {
23 | insert: db.prepare('INSERT INTO tokens ( id, lang, tag, token ) VALUES ( $id, $lang, $tag, $token )')
24 | };
25 |
26 | // add some rows to the tokens table
27 | tokens.insert.run({ id: 1, lang: 'en', tag: 'test', token: 'hello world' });
28 | tokens.insert.run({ id: 2, lang: 'fr', tag: 'test', token: 'a b c' });
29 |
30 | // run populate
31 | db.populate();
32 |
33 | t.plan(7);
34 | db.hasSubject('hel', t.false );
35 | db.hasSubject('hello', t.false );
36 | db.hasSubject('hello wor', t.false );
37 | db.hasSubject('hello world', t.true );
38 | db.hasSubject('a', t.false );
39 | db.hasSubject('a b', t.false );
40 | db.hasSubject('a b c', t.true );
41 | });
42 | };
43 |
44 | module.exports.hasSubjectAutocomplete = function(test, common) {
45 | test('hasSubject - autocomplete', function(t) {
46 | var db = new TokenIndex();
47 | db.open('/tmp/db', { test: true, reset: true });
48 |
49 | // prepare some sql statements
50 | const tokens = {
51 | insert: db.prepare('INSERT INTO tokens ( id, lang, tag, token ) VALUES ( $id, $lang, $tag, $token )')
52 | };
53 |
54 | // add some rows to the tokens table
55 | tokens.insert.run({ id: 1, lang: '', tag: '', token: 'hello world' });
56 | tokens.insert.run({ id: 2, lang: '', tag: '', token: 'a b c' });
57 |
58 | // run populate
59 | db.populate();
60 |
61 | t.plan(7);
62 | db.hasSubject('hel\x26', t.true );
63 | db.hasSubject('hello\x26', t.true );
64 | db.hasSubject('hello wor\x26', t.true );
65 | db.hasSubject('hello world\x26', t.true );
66 | db.hasSubject('a\x26', t.true );
67 | db.hasSubject('a b\x26', t.true );
68 | db.hasSubject('a b c\x26', t.true );
69 | });
70 | };
71 |
72 | module.exports.matchSubjectDistinctSubjectIds = function(test, common) {
73 | test('matchSubjectDistinctSubjectIds', function(t) {
74 | var db = new TokenIndex();
75 | db.open('/tmp/db', { test: true, reset: true });
76 |
77 | // prepare some sql statements
78 | const tokens = {
79 | insert: db.prepare('INSERT INTO tokens ( id, lang, tag, token ) VALUES ( $id, $lang, $tag, $token )')
80 | };
81 |
82 | // add some rows to the tokens table
83 | tokens.insert.run({ id: 1, lang: '', tag: '', token: 'hello world' });
84 |
tokens.insert.run({ id: 2, lang: '', tag: '', token: 'a b c' });
85 | tokens.insert.run({ id: 3, lang: '', tag: '', token: 'hello world' });
86 |
87 | // run populate
88 | db.populate();
89 |
90 | // generic failure test
91 | const fail = (err, ids) => {
92 | t.false(err);
93 | t.deepEquals(ids, []);
94 | };
95 |
96 | t.plan(14);
97 | db.matchSubjectDistinctSubjectIds('hel', fail);
98 | db.matchSubjectDistinctSubjectIds('hello', fail);
99 | db.matchSubjectDistinctSubjectIds('hello wor', fail);
100 | db.matchSubjectDistinctSubjectIds('hello world', (err, ids) => {
101 | t.false(err);
102 | t.deepEquals(ids, [
103 | { subjectId: 1 },
104 | { subjectId: 3 }
105 | ]);
106 | });
107 | db.matchSubjectDistinctSubjectIds('a', fail);
108 | db.matchSubjectDistinctSubjectIds('a b', fail);
109 | db.matchSubjectDistinctSubjectIds('a b c', (err, ids) => {
110 | t.false(err);
111 | t.deepEquals(ids, [
112 | { subjectId: 2 }
113 | ]);
114 | });
115 | });
116 | };
117 |
118 | module.exports.matchSubjectAutocompleteDistinctSubjectIds = function(test, common) {
119 | test('matchSubjectDistinctSubjectIds - autocomplete', function(t) {
120 | var db = new TokenIndex();
121 | db.open('/tmp/db', { test: true, reset: true });
122 |
123 | // prepare some sql statements
124 | const tokens = {
125 | insert: db.prepare('INSERT INTO tokens ( id, lang, tag, token ) VALUES ( $id, $lang, $tag, $token )')
126 | };
127 |
128 | // add some rows to the tokens table
129 | tokens.insert.run({ id: 1, lang: '', tag: '', token: 'hello world' });
130 | tokens.insert.run({ id: 2, lang: '', tag: '', token: 'a b c' });
131 | tokens.insert.run({ id: 3, lang: '', tag: '', token: 'hello world' });
132 |
133 | // run populate
134 | db.populate();
135 |
136 | // generic failure test
137 | const fail = (err, ids) => {
138 | t.false(err);
139 | t.deepEquals(ids, []);
140 | };
141 |
142 | const passOne = (err, ids) => {
143 | t.false(err);
144 | t.deepEquals(ids, [
145 | { subjectId: 1 },
146 | { subjectId: 3 }
147 | ]);
148 | };
149 |
150 | const passTwo = (err, ids) => {
151 | t.false(err);
152 | t.deepEquals(ids, [
153 | { subjectId: 2 }
154 | ]);
155 | };
156 |
157 | t.plan(14);
158 | db.matchSubjectDistinctSubjectIds('hel\x26', passOne);
159 | db.matchSubjectDistinctSubjectIds('hello\x26', passOne);
160 | db.matchSubjectDistinctSubjectIds('hello wor\x26', passOne);
161 | db.matchSubjectDistinctSubjectIds('hello world\x26', passOne);
162 | db.matchSubjectDistinctSubjectIds('a\x26', passTwo);
163 | db.matchSubjectDistinctSubjectIds('a b\x26', passTwo);
164 | db.matchSubjectDistinctSubjectIds('a b c\x26', passTwo);
165 | });
166 | };
167 |
168 | module.exports.matchSubjectObject = function(test, common) {
169 | test('matchSubjectObject', function(t) {
170 | var db = new TokenIndex();
171 | db.open('/tmp/db', { test: true, reset: true });
172 |
173 | // prepare some sql statements
174 | const tokens = {
175 | insert: db.prepare('INSERT INTO tokens ( id, lang, tag, token ) VALUES ( $id, $lang, $tag, $token )')
176 | };
177 | const lineage = {
178 | insert: db.prepare('INSERT INTO lineage ( id, pid ) VALUES ( $id, $pid )')
179 | };
180 |
181 | // add some rows to the tokens table
182 | tokens.insert.run({ id: 1, lang: '', tag: '', token: 'paris' });
183 | tokens.insert.run({ id: 2, lang: '', tag: '', token: 'paris' });
184 | tokens.insert.run({ id: 3, lang: '', tag: '', token: 'france' });
185 | tokens.insert.run({ id: 4, lang: '', tag: '', token: 'texas' });
186 |
187 | // add some rows to the lineage table
188 | lineage.insert.run({ id: 1,
pid: 3 });
189 | lineage.insert.run({ id: 2, pid: 4 });
190 |
191 | // run populate
192 | db.populate();
193 |
194 | // generic failure test
195 | const fail = (err, ids) => {
196 | t.false(err);
197 | t.deepEquals(ids, []);
198 | };
199 |
200 | t.plan(10);
201 | db.matchSubjectObject('paris', 'paris', fail);
202 | db.matchSubjectObject('france', 'france', fail);
203 | db.matchSubjectObject('texas', 'texas', fail);
204 |
205 | db.matchSubjectObject('paris', 'france', (err, ids) => {
206 | t.false(err);
207 | t.deepEquals(ids, [
208 | { subjectId: 1, objectId: 3 }
209 | ]);
210 | });
211 |
212 | db.matchSubjectObject('paris', 'texas', (err, ids) => {
213 | t.false(err);
214 | t.deepEquals(ids, [
215 | { subjectId: 2, objectId: 4 }
216 | ]);
217 | });
218 | });
219 | };
220 |
221 | module.exports.matchSubjectObjectAutocomplete = function(test, common) {
222 | test('matchSubjectObject - autocomplete', function(t) {
223 | var db = new TokenIndex();
224 | db.open('/tmp/db', { test: true, reset: true });
225 |
226 | // prepare some sql statements
227 | const tokens = {
228 | insert: db.prepare('INSERT INTO tokens ( id, lang, tag, token ) VALUES ( $id, $lang, $tag, $token )')
229 | };
230 | const lineage = {
231 | insert: db.prepare('INSERT INTO lineage ( id, pid ) VALUES ( $id, $pid )')
232 | };
233 |
234 | // add some rows to the tokens table
235 | tokens.insert.run({ id: 1, lang: '', tag: '', token: 'paris' });
236 | tokens.insert.run({ id: 2, lang: '', tag: '', token: 'paris' });
237 | tokens.insert.run({ id: 3, lang: '', tag: '', token: 'france' });
238 | tokens.insert.run({ id: 4, lang: '', tag: '', token: 'texas' });
239 |
240 | // add some rows to the lineage table
241 | lineage.insert.run({ id: 1, pid: 3 });
242 | lineage.insert.run({ id: 2, pid: 4 });
243 |
244 | // run populate
245 | db.populate();
246 |
247 | // generic failure test
248 | const fail = (err, ids) => {
249 | t.false(err);
250 | t.deepEquals(ids, []);
251 | };
252 |
253 | t.plan(10);
254 | db.matchSubjectObject('paris', 'par\x26', fail);
255 | db.matchSubjectObject('france', 'franc\x26', fail);
256 | db.matchSubjectObject('texas', 'tex\x26', fail);
257 |
258 | db.matchSubjectObject('paris', 'fr\x26', (err, ids) => {
259 | t.false(err);
260 | t.deepEquals(ids, [
261 | { subjectId: 1, objectId: 3 }
262 | ]);
263 | });
264 |
265 | db.matchSubjectObject('paris', 't\x26', (err, ids) => {
266 | t.false(err);
267 | t.deepEquals(ids, [
268 | { subjectId: 2, objectId: 4 }
269 | ]);
270 | });
271 | });
272 | };
273 |
--------------------------------------------------------------------------------
/test/prototype/query.js:
--------------------------------------------------------------------------------
1 | const Result = require('../../lib/Result');
2 | const query = require('../../prototype/query');
3 |
4 | module.exports.exports = function(test, common) {
5 | test('exports', function(t) {
6 | t.equal( typeof query.query, 'function' );
7 | t.equal( typeof query._queryGroup, 'function' );
8 | t.equal( typeof query._queryManyGroups, 'function' );
9 | t.end();
10 | });
11 | };
12 |
13 | module.exports._queryGroup = function(test, common) {
14 | test('_queryGroup - empty group', function(t) {
15 |
16 | const group = [];
17 |
18 | const done = (err, res) => {
19 | t.deepEqual(err, null);
20 | t.deepEqual(res.constructor.name, 'Result');
21 | t.deepEqual(res.getIdsAsArray(), []);
22 | t.deepEqual(res.mask, []);
23 | t.deepEqual(res.group, group);
24 | t.end();
25 | };
26 |
27 | query._queryGroup(null, group, done);
28 | });
29 |
test('_queryGroup - single token - no matches', function(t) { 30 | 31 | const group = ['hello world']; 32 | t.plan(6); 33 | 34 | const index = { 35 | matchSubjectDistinctSubjectIds: ( phrase, cb ) => { 36 | t.equal(phrase, 'hello world'); 37 | return cb( null, new Result() ); 38 | } 39 | }; 40 | 41 | const done = (err, res) => { 42 | t.deepEqual(err, null); 43 | t.deepEqual(res.constructor.name, 'Result'); 44 | t.deepEqual(res.getIdsAsArray(), []); 45 | t.deepEqual(res.mask, [ false ]); 46 | t.deepEqual(res.group, group); 47 | }; 48 | 49 | query._queryGroup(index, group, done); 50 | }); 51 | test('_queryGroup - single token - with matches', function(t) { 52 | 53 | const group = ['hello world']; 54 | t.plan(6); 55 | 56 | const index = { 57 | matchSubjectDistinctSubjectIds: ( phrase, cb ) => { 58 | t.equal(phrase, 'hello world'); 59 | return cb( null, [ 60 | { subjectId: 100 }, 61 | { subjectId: 200 }, 62 | { subjectId: 300 }, 63 | ]); 64 | } 65 | }; 66 | 67 | const done = (err, res) => { 68 | t.deepEqual(err, null); 69 | t.deepEqual(res.constructor.name, 'Result'); 70 | t.deepEqual(res.getIdsAsArray(), [ 100, 200, 300 ]); 71 | t.deepEqual(res.mask, [ true ]); 72 | t.deepEqual(res.group, group); 73 | }; 74 | 75 | query._queryGroup(index, group, done); 76 | }); 77 | test('_queryGroup - multiple tokens - no matches', function(t) { 78 | 79 | const group = ['hello world', 'test', 'foo bar']; 80 | t.plan(10); 81 | 82 | const index = { 83 | matchSubjectObject: ( subject, object, cb ) => { 84 | t.ok(true); 85 | return cb( null, [] ); 86 | }, 87 | matchSubjectDistinctSubjectIds: ( subject, cb ) => { 88 | t.equal(subject, 'foo bar'); 89 | return cb( null, [ 90 | { subjectId: 100 }, 91 | { subjectId: 200 }, 92 | { subjectId: 300 }, 93 | ]); 94 | }, 95 | matchSubjectObjectGeomIntersects: ( subject, object, cb ) => { 96 | t.ok(true); 97 | return cb( null, [] ); 98 | } 99 | }; 100 | 101 | const done = (err, res) => { 102 | t.deepEqual(err, null); 103 | t.deepEqual(res.constructor.name, 'Result'); 104 | t.deepEqual(res.getIdsAsArray(), [ 100, 200, 300 ]); 105 | t.deepEqual(res.mask, [ true, true, false ]); 106 | t.deepEqual(res.group, group); 107 | }; 108 | 109 | query._queryGroup(index, group, done); 110 | }); 111 | test('_queryGroup - multiple tokens - matches', function(t) { 112 | 113 | const group = ['hello world', 'test', 'foo bar']; 114 | t.plan(7); 115 | 116 | const index = { 117 | matchSubjectObject: ( subject, object, cb ) => { 118 | t.ok(true); 119 | switch( subject ){ 120 | case 'hello world': 121 | return cb( null, [ 122 | { subjectId: 100, objectId: 300 }, 123 | { subjectId: 200, objectId: 410 }, 124 | ]); 125 | case 'test': 126 | return cb( null, [ 127 | { subjectId: 300, objectId: 800 }, 128 | { subjectId: 400, objectId: 900 }, 129 | ]); 130 | default: 131 | return cb( null, [ 132 | { subjectId: 800, objectId: 880 }, 133 | { subjectId: 900, objectId: 990 }, 134 | ]); 135 | } 136 | }, 137 | matchSubjectObjectGeomIntersects: ( subject, object, cb ) => { 138 | t.ok(true); 139 | return cb( null, [] ); 140 | } 141 | }; 142 | 143 | const done = (err, res) => { 144 | t.deepEqual(err, null); 145 | t.deepEqual(res.constructor.name, 'Result'); 146 | t.deepEqual(res.getIdsAsArray(), [ 100 ]); 147 | t.deepEqual(res.mask, [ true, true, true ]); 148 | t.deepEqual(res.group, group); 149 | }; 150 | 151 | query._queryGroup(index, group, done); 152 | }); 153 | }; 154 | 155 | module.exports._queryManyGroups = function(test, common) { 156 | test('_queryManyGroups - empty groups', function(t) { 157 | 158 | 
const groups = []; 159 | 160 | const done = (err, res) => { 161 | t.deepEqual(err, null); 162 | t.deepEqual(res.constructor.name, 'Result'); 163 | t.deepEqual(res.getIdsAsArray(), []); 164 | t.deepEqual(res.mask, []); 165 | t.deepEqual(res.group, []); 166 | t.end(); 167 | }; 168 | 169 | query._queryManyGroups(null, groups, done); 170 | }); 171 | test('_queryManyGroups - single group', function(t) { 172 | 173 | t.plan(6); 174 | const groups = [ 175 | ['hello world'], 176 | ]; 177 | 178 | const index = { 179 | matchSubjectDistinctSubjectIds: ( phrase, cb ) => { 180 | t.equal(phrase, 'hello world'); 181 | return cb( null, [ 182 | { subjectId: 100 }, 183 | { subjectId: 200 }, 184 | { subjectId: 300 }, 185 | ]); 186 | } 187 | }; 188 | 189 | const done = (err, res) => { 190 | t.deepEqual(err, null); 191 | t.deepEqual(res.constructor.name, 'Result'); 192 | t.deepEqual(res.getIdsAsArray(), [ 100, 200, 300 ]); 193 | t.deepEqual(res.mask, [ true ]); 194 | t.deepEqual(res.group, groups[0]); 195 | }; 196 | 197 | query._queryManyGroups(index, groups, done); 198 | }); 199 | test('_queryManyGroups - multiple groups', function(t) { 200 | 201 | t.plan(7); 202 | const groups = [ 203 | ['hello world'], 204 | ['hallo welt'], 205 | ]; 206 | 207 | const index = { 208 | matchSubjectDistinctSubjectIds: ( phrase, cb ) => { 209 | t.ok(true); 210 | switch( phrase ){ 211 | case 'hello world': 212 | return cb( null, [ 213 | { subjectId: 100, objectId: 300 }, 214 | { subjectId: 200, objectId: 410 }, 215 | ]); 216 | case 'hallo welt': 217 | return cb( null, [ 218 | { subjectId: 300, objectId: 800 }, 219 | { subjectId: 400, objectId: 900 }, 220 | ]); 221 | default: 222 | return cb( null, [ 223 | { subjectId: 800, objectId: 880 }, 224 | { subjectId: 900, objectId: 990 }, 225 | ]); 226 | } 227 | } 228 | }; 229 | 230 | const done = (err, res) => { 231 | t.deepEqual(err, null); 232 | t.deepEqual(res.constructor.name, 'Result'); 233 | t.deepEqual(res.getIdsAsArray(), [ 100, 200, 300, 400 ]); 234 | t.deepEqual(res.mask, [ true ]); 235 | t.deepEqual(res.group, groups[0]); 236 | }; 237 | 238 | query._queryManyGroups(index, groups, done); 239 | }); 240 | }; 241 | 242 | module.exports.query = function(test, common) { 243 | test('query - empty text', function(t) { 244 | 245 | const text = ''; 246 | const mock = { 247 | tokenize: ( t, cb ) => { 248 | cb( null, [] ); 249 | } 250 | }; 251 | 252 | const done = (err, res) => { 253 | t.deepEqual(err, null); 254 | t.deepEqual(res.constructor.name, 'Result'); 255 | t.deepEqual(res.getIdsAsArray(), []); 256 | t.deepEqual(res.mask, []); 257 | t.deepEqual(res.group, []); 258 | t.end(); 259 | }; 260 | 261 | query.query.call(mock, text, done); 262 | }); 263 | test('query - single group', function(t) { 264 | 265 | t.plan(6); 266 | const text = 'hello world'; 267 | const mock = { 268 | tokenize: ( t, cb ) => { 269 | cb( null, [['hello world']] ); 270 | }, 271 | index: { 272 | matchSubjectDistinctSubjectIds: ( phrase, cb ) => { 273 | t.equal(phrase, 'hello world'); 274 | return cb( null, [ 275 | { subjectId: 100 }, 276 | { subjectId: 200 }, 277 | { subjectId: 300 }, 278 | ]); 279 | } 280 | } 281 | }; 282 | 283 | const done = (err, res) => { 284 | t.deepEqual(err, null); 285 | t.deepEqual(res.constructor.name, 'Result'); 286 | t.deepEqual(res.getIdsAsArray(), [ 100, 200, 300 ]); 287 | t.deepEqual(res.mask, [ true ]); 288 | t.deepEqual(res.group, [ 'hello world' ]); 289 | }; 290 | 291 | query.query.call(mock, text, done); 292 | }); 293 | test('query - multiple groups', function(t) { 294 | 295 | 
t.plan(7);
296 | const text = 'hello world';
297 | const mock = {
298 | tokenize: ( t, cb ) => {
299 | cb( null, [['hello world'], ['hallo welt']] );
300 | },
301 | index: {
302 | matchSubjectDistinctSubjectIds: ( phrase, cb ) => {
303 | t.ok(true);
304 | return cb( null, [
305 | { subjectId: 100 },
306 | { subjectId: 200 },
307 | { subjectId: 300 },
308 | ]);
309 | }
310 | }
311 | };
312 |
313 | const done = (err, res) => {
314 | t.deepEqual(err, null);
315 | t.deepEqual(res.constructor.name, 'Result');
316 | t.deepEqual(res.getIdsAsArray(), [ 100, 200, 300 ]);
317 | t.deepEqual(res.mask, [ true ]);
318 | t.deepEqual(res.group, [ 'hello world' ]);
319 | };
320 |
321 | query.query.call(mock, text, done);
322 | });
323 | };
324 |
--------------------------------------------------------------------------------
/server/demo/index.html:
--------------------------------------------------------------------------------
(markup omitted from this extract)
--------------------------------------------------------------------------------
/prototype/wof.js:
--------------------------------------------------------------------------------
1 |
2 | // plugin for whosonfirst
3 | const _ = require('lodash');
4 | const dir = require('require-dir');
5 | const util = require('util');
6 | const blacklist = require('pelias-blacklist-stream/loader')();
7 | const analysis = require('../lib/analysis');
8 | const language = dir('../config/language');
9 | const LOW_POPULATION_THRESHOLD = 2000;
10 |
11 | // list of languages / tags we favour in cases of deduplication
12 | const LANG_PREFS = ['eng','und'];
13 | const TAG_PREFS = ['preferred','abbr','label','variant','colloquial'];
14 |
15 | // insert a wof record into the index
16 | function insertWofRecord( wof, next ){
17 |
18 | var id = wof['wof:id'];
19 | if( 'string' === typeof id ){ id = parseInt( id, 10 ); }
20 |
21 | // sanity check; because WOF
22 | if( !isValidWofRecord( id, wof ) ) { return next(); }
23 |
24 | // enforce pelias/blacklist-stream exclusions
25 | let peliasGID = util.format('whosonfirst:%s:%d', wof['wof:placetype'], id);
26 | if( blacklist && blacklist.hasOwnProperty( peliasGID ) ) { return next(); }
27 |
28 | // --- document which will be saved in the doc store ---
29 |
30 | const doc = {
31 | id: id,
32 | name: wof['wof:label'] || wof['wof:name'],
33 | abbr: getAbbreviation( wof ),
34 | placetype: wof['wof:placetype'],
35 | rank: getRank( wof['wof:placetype'] ),
36 | population: getPopulation( wof ),
37 | popularity: wof['qs:photo_sum'],
38 | lineage: wof['wof:hierarchy'],
39 | geom: {
40 | area: wof['geom:area'],
41 | bbox: validBoundingBox(wof['lbl:bbox']) || validBoundingBox(wof['geom:bbox']),
42 | lat: wof['lbl:latitude'] || wof['geom:latitude'],
43 | lon: wof['lbl:longitude'] || wof['geom:longitude']
44 | },
45 | names: {}
46 | };
47 |
48 | var tokens = [];
49 | var parentIds = [];
50 |
51 | // --- cast strings to numeric types ---
52 | // note: sometimes numeric properties in WOF can be encoded as strings.
53 |
54 | doc.population = _.toInteger( doc.population ) || undefined;
55 | doc.popularity = _.toInteger( doc.popularity ) || undefined;
56 | doc.geom.area = _.toFinite( doc.geom.area ) || undefined;
57 | doc.geom.lat = _.toFinite( doc.geom.lat );
58 | doc.geom.lon = _.toFinite( doc.geom.lon );
59 |
60 | // --- tokens ---
61 |
62 | // disable adding tokens to the index for the 'empire' placetype.
63 | // this ensures empire records are not retrieved via search.
64 | if( 'empire' !== doc.placetype ){
65 |
66 | // add 'wof:label'
67 | tokens.push({ lang: 'und', tag: 'label', body: wof['wof:label'] });
68 |
69 | // add 'wof:name'
70 | tokens.push({ lang: 'und', tag: 'label', body: wof['wof:name'] });
71 |
72 | // add 'wof:shortcode'
73 | // @todo: wof:abbreviation is deprecated, remove references to it
74 | tokens.push({ lang: 'und', tag: 'abbr', body: wof['wof:shortcode'] || wof['wof:abbreviation'] });
75 |
76 | // add 'ne:abbrev'
77 | // tokens.push({ lang: 'und', body: wof['ne:abbrev'] });
78 |
79 | // fields specific to countries & dependencies
80 | if( 'country' === doc.placetype || 'dependency' === doc.placetype ) {
81 | if( wof['iso:country'] && wof['iso:country'] !== 'XX' ){
82 |
83 | // add 'ne:iso_a2'
84 | tokens.push({ lang: 'und', tag: 'abbr', body: wof['ne:iso_a2'] });
85 |
86 | // add 'ne:iso_a3'
87 | tokens.push({ lang: 'und', tag: 'abbr', body: wof['ne:iso_a3'] });
88 |
89 | // add 'wof:country'
90 | // warning: eg.
        // tokens.push({ lang: 'und', tag: 'abbr', body: wof['wof:country'] });

        // add 'iso:country'
        tokens.push({ lang: 'und', tag: 'abbr', body: wof['iso:country'] });

        // add 'wof:country_alpha3'
        tokens.push({ lang: 'und', tag: 'abbr', body: wof['wof:country_alpha3'] });
      }
    }

    // note: skip all `name:*` fields when we suspect that they were sourced from
    // machine transliteration via WikiData.
    // see: https://github.com/whosonfirst-data/whosonfirst-data/issues/799
    const hasDeadOrObscureLanguages = _.has(wof, 'name:vol_x_preferred');
    const isLowOrUnknownPopulation = _.get(doc, 'population', 0) < LOW_POPULATION_THRESHOLD;
    const isMegaCity = _.get(wof, 'wof:megacity', 0) === 1;
    const isCapitalCity = !_.isEmpty(_.get(wof, 'wof:capital_of'));
    const isLikelyTransliterated = (
      hasDeadOrObscureLanguages && isLowOrUnknownPopulation && !isMegaCity && !isCapitalCity
    );
    if (!isLikelyTransliterated) {

      // add 'name:*' fields
      for( var attr in wof ){
        // https://github.com/whosonfirst/whosonfirst-names
        // names: preferred|colloquial|variant|unknown
        const match = attr.match(/^name:([a-z]{3})_x_(preferred|colloquial|variant)$/);
        if (!match) { continue; }

        // Fix for https://github.com/pelias/placeholder/pull/126
        // Transform iso codes 639-2/B to 639-2/T
        const lang = language.alternatives[match[1]] || match[1];

        // skip languages in the blacklist, see config file for more info
        if( language.blacklist.hasOwnProperty( match[1] ) ){ continue; }

        // skip if both iso codes 639-2/B and 639-2/T are present and the current iso is 639-2/B
        if ( lang !== match[1] && wof[ 'name:' + lang + '_x_' + match[2] ]) { continue; }

        // index each alternative name
        for( var n in wof[ attr ] ){
          tokens.push({
            lang: lang,
            tag: match[2],
            body: wof[ attr ][ n ]
          });
        }

        // doc - only store 'preferred' strings
        if( match[2] === 'preferred' ){
          doc.names[ lang ] = wof[ attr ];
        }
      }

    }
  }

  // In the USA we would like to favor the 'wof:label' property over the 'name:eng_x_preferred' property.
  if( 'US' === wof['iso:country'] && wof['wof:label'] ){
    doc.names.eng = [ wof['wof:label'] ];
  }

  // --- graph ---

  // parent_id property (some records have this property set but no hierarchy)
  var parentId;
  if( wof.hasOwnProperty('wof:parent_id') ){
    parentId = wof['wof:parent_id'];
    if( 'string' === typeof parentId ){ parentId = parseInt( parentId, 10 ); }
    if( !isNaN( parentId ) && parentId !== id && parentId > 0 ){
      parentIds.push( parentId ); // is child of
    }
  }

  // hierarchy properties
  for( var h in wof['wof:hierarchy'] ){
    for( var i in wof['wof:hierarchy'][h] ){
      var pid = wof['wof:hierarchy'][h][i];
      if( 'string' === typeof pid ){ pid = parseInt( pid, 10 ); }
      if( pid === id || pid <= 0 || pid === parentId ){ continue; }
      // parentIds.push( id, pid, 'p' ); // has parent
      parentIds.push( pid ); // is child of
    }
  }

  // ---- consume aggregates

  // normalize tokens
  tokens = tokens.reduce(( res, token ) => {
    analysis.normalize( token.body ).forEach( norm => {
      res.push({ lang: token.lang, tag: token.tag, body: norm });
    });
    return res;
  }, []);

  // sort tokens (for optimal deduplication)
  tokens.sort((i1, i2) => {

    // sort by language
    const l1 = LANG_PREFS.indexOf(i1.lang);
    const l2 = LANG_PREFS.indexOf(i2.lang);

    if (l1 === -1){ return +1; }
    if (l2 === -1){ return -1; }
    if (l1 > l2){ return +1; }
    if (l1 < l2){ return -1; }

    // sort by tag
    const t1 = TAG_PREFS.indexOf(i1.tag);
    const t2 = TAG_PREFS.indexOf(i2.tag);

    if (t1 === -1){ return +1; }
    if (t2 === -1){ return -1; }
    if (t1 > t2){ return +1; }
    if (t1 < t2){ return -1; }

    return 0;
  });

  // deduplicate tokens
  var seen = {};
  tokens = tokens.filter( token => {
    if( seen.hasOwnProperty( 'eng:' + token.body ) ){ return false; }
    if( seen.hasOwnProperty( 'und:' + token.body ) ){ return false; }
    const key = token.lang + ':' + token.body;
    return seen.hasOwnProperty( key ) ? false : ( seen[ key ] = true );
  });
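  // note: the sort above places 'eng' and 'und' tokens first (see LANG_PREFS),
  // so they are registered in `seen` first and win over duplicates in other languages.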

  // deduplicate parent ids
  parentIds = parentIds.filter(( pid, pos ) => {
    return parentIds.indexOf( pid ) === pos;
  });

  // save all data to the databases
  this.store.set( id, doc, ( err ) => {
    if( err ){ console.error( err ); }
    this.index.setTokens( id, tokens, ( err ) => {
      if( err ){ console.error( err ); }
      this.index.setLineage( id, parentIds, ( err ) => {
        if( err ){ console.error( err ); }
        next();
      });
    });
  });
}

// check if value is a valid number
function isFiniteNumber( value ){
  return !_.isEmpty(_.trim( value )) && _.isFinite(_.toNumber( value ));
}

function isValidWofRecord( id, wof ){

  // sanity check inputs
  if( !id || !wof ) { return false; }

  // sanity check; because WOF
  if( id <= 0 ) { return false; }

  // skip deprecated records
  const deprecated = _.trim( wof['edtf:deprecated'] );
  if( !_.isEmpty( deprecated ) && deprecated !== 'uuuu' ){
    return false;
  }

  // skip superseded records
  const superseded = wof['wof:superseded_by'];
  if( Array.isArray( superseded ) && superseded.length > 0 ){
    return false;
  }

  /**
    skip non-current records

    0 signifies a non-current record
    1 signifies a current record
    -1 signifies an indeterminate state, someone needs to look at this record and decide

    note: we are considering -1 values as current (for now)
  **/
  const isCurrent = wof['mz:is_current'];
  if( isCurrent === '0' || isCurrent === 0 ){
    return false;
  }

  // invalid latitude
  if( !isFiniteNumber(wof['lbl:latitude']) && !isFiniteNumber(wof['geom:latitude']) ){
    return false;
  }

  // invalid longitude
  if( !isFiniteNumber(wof['lbl:longitude']) && !isFiniteNumber(wof['geom:longitude']) ){
    return false;
  }

  return true;
}

// this function favors mz:population when available, falling back to other properties.
// see: https://github.com/whosonfirst-data/whosonfirst-data/issues/240#issuecomment-294907374
function getPopulation( wof ) {
  if( wof['mz:population'] ){ return wof['mz:population']; }
  else if( wof['wof:population'] ){ return wof['wof:population']; }
  else if( wof['wk:population'] ){ return wof['wk:population']; }
  else if( wof['gn:population'] ){ return wof['gn:population']; }
  else if( wof['gn:pop'] ){ return wof['gn:pop']; }
  else if( wof['qs:pop'] ){ return wof['qs:pop']; }
  else if( wof['qs:gn_pop'] ){ return wof['qs:gn_pop']; }
  else if( wof['zs:pop10'] ){ return wof['zs:pop10']; }
  else if( wof['meso:pop'] ){ return wof['meso:pop']; }
  else if( wof['statoids:population'] ){ return wof['statoids:population']; }
  else if( wof['ne:pop_est'] ){ return wof['ne:pop_est']; }
}

// abbreviations and ISO codes
// logic copied from: pelias/whosonfirst src/components/extractFields.js (since modified)
// @todo: wof:abbreviation is deprecated, remove references to it
function getAbbreviation( wof ) {
  if( 'country' === wof['wof:placetype'] || 'dependency' === wof['wof:placetype'] ) {
    return wof['wof:country_alpha3'] || wof['ne:iso_a3'];
  } else if( wof['wof:shortcode'] || wof['wof:abbreviation'] ) {
    return wof['wof:shortcode'] || wof['wof:abbreviation'];
  }
}

const PLACETYPE_RANK = [
  'venue', 'address', 'building', 'campus', 'microhood', 'neighbourhood', 'macrohood', 'borough', 'postalcode',
  'locality', 'metro area', 'localadmin', 'county', 'macrocounty', 'region', 'macroregion', 'marinearea',
  'disputed', 'dependency', 'country', 'empire', 'continent', 'ocean', 'planet'
];

// this function returns the numeric rank range of a placetype, based on its position in PLACETYPE_RANK
function getRank( placetype ){
  var rank = PLACETYPE_RANK.indexOf((placetype || '').toLowerCase().trim());
  return {
    min: rank,
    max: rank + 1
  };
}

// this function validates and returns the bbox property verbatim, else undefined
// see: https://github.com/pelias/placeholder/issues/183
// format: minx, miny, maxx, maxy
function validBoundingBox(bbox) {
  if (!_.isString(bbox)) { return; }
  const coords = bbox.split(',');
  if (coords.length !== 4) { return; }
  const floats = coords.map(c => parseFloat(c));
  if (floats.some(isNaN)) { return; }
  if (floats[0] > floats[2]) { return; }
  if (floats[1] > floats[3]) { return; }
  return bbox;
}

module.exports.insertWofRecord = insertWofRecord;
module.exports.isValidWofRecord = isValidWofRecord;
module.exports.getPopulation = getPopulation;
module.exports.getAbbreviation = getAbbreviation;
module.exports.validBoundingBox = validBoundingBox;
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
A modular, open-source search engine for our world.

Pelias is a geocoder powered completely by open data, available freely to everyone.

Local Installation · Cloud Webservice · Documentation · Community Chat

What is Pelias?

Pelias is a search engine for places worldwide, powered by open data. It turns addresses and place names into geographic coordinates, and turns geographic coordinates into places and addresses. With Pelias, you’re able to turn your users’ place searches into actionable geodata and transform your geodata into real places.

We think open data, open source, and open strategy win over proprietary solutions at any part of the stack and we want to ensure the services we offer are in line with that vision. We believe that an open geocoder improves over the long-term only if the community can incorporate truly representative local knowledge.

# Pelias coarse geocoder

This repository provides all the code & geographic data you'll need to run your own coarse geocoder.

Read our [An (almost) one line coarse geocoder with Docker](https://geocode.earth/blog/2019/almost-one-line-coarse-geocoding) blog post for a quick start guide and [check out our demo](https://placeholder.demo.geocode.earth).

This service is intended to be run as part of the [Pelias Geocoder](https://github.com/pelias/pelias) but can just as easily be run independently, as it has no external dependencies.

## Natural language parser for geographic text

The engine takes unstructured input text, such as 'Neutral Bay North Sydney New South Wales', and attempts to deduce the geographic area the user is referring to.

Human beings (familiar with Australian geography) are able to quickly scan the text and establish that there are 3 distinct token groups: 'Neutral Bay', 'North Sydney' & 'New South Wales'.

The engine uses a similar technique to our brains, scanning across the text, cycling through a dictionary of learned terms and trying to establish logical token groups.

Once token groups have been established, a reductive algorithm is used to ensure that the token groups are logical in a geographic context. We don't want to return New York City for a term such as 'nyc france', so we need to only return things called 'nyc' *inside* places called 'france'.

The engine starts from the rightmost group and works to the left, ensuring token groups represent geographic entities contained *within* those which came before. This process is repeated until it either runs out of groups or would return 0 results.

The best estimation is then returned, either as a set of integers representing the ids of those regions, or as a JSON structure which also contains additional information such as population counts etc.

The data is sourced from the [whosonfirst](https://github.com/whosonfirst-data/whosonfirst-data) project, which also includes translations of place names into different languages.

Placeholder supports searching on and retrieving tokens in different languages, and also offers support for synonyms and abbreviations.

The engine includes a rudimentary language detection algorithm which attempts to detect right-to-left languages and languages which write their addresses in major-to-minor format. It will then reverse the tokens to re-order them into minor-to-major ordering.
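you can also call the tokenizer directly from code. the following is a minimal sketch, assuming a built `data/store.sqlite3` and the `Placeholder` API used by `cmd/loadtest.js` and the repl examples further down:

```javascript
// sketch: group unstructured input text into logical token groups
const Placeholder = require('./Placeholder');

const ph = new Placeholder();
ph.load();

ph.tokenize('neutral bay north sydney new south wales', (err, groups) => {
  // each inner array is one candidate grouping of the input,
  // e.g. [ [ 'neutral bay', 'north sydney', 'new south wales' ] ]
  console.log(groups);
});
```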

---

## Requirements

Placeholder requires Node.js and SQLite.

See [Pelias software requirements](https://github.com/pelias/documentation/blob/master/requirements.md) for required and recommended versions.

## Install

```bash
$ git clone git@github.com:pelias/placeholder.git && cd placeholder
$ npm install
```

### Download the required database files

Data hosting is provided by [Geocode Earth](https://geocode.earth). Other Pelias related downloads are available at https://geocode.earth/data.

```bash
$ mkdir data
$ curl -s https://data.geocode.earth/placeholder/store.sqlite3.gz | gunzip > data/store.sqlite3;
```

### Confirm the build was successful

```bash
$ npm test
```

```bash
$ npm run cli -- san fran

> pelias-placeholder@1.0.0 cli
> node cmd/cli.js "san" "fran"

san fran

took: 3ms
 - 85922583 locality San Francisco
```

---

## Run server

```bash
$ PORT=6100 npm start;
```

#### Configuration via Environment Variables

The service supports additional environment variables that affect its operation:

| Environment Variable | Default | Description |
| -------------------- | ------- | ----------- |
| `HOST` | `undefined` | The network address that the placeholder service will bind to. Defaults to whatever the current Node.js default is, which is currently to listen on `0.0.0.0` (all interfaces). See the [Node.js Net documentation](https://nodejs.org/api/net.html#net_server_listen_port_host_backlog_callback) for more information. |
| `PORT` | `3000` | The TCP port that the placeholder service will use for incoming network connections |
| `PLACEHOLDER_DATA` | `../data/` | Path to the directory where the placeholder service will find the `store.sqlite3` database file. |

### Open browser

the server should now be running and you should be able to access the http API:

```bash
http://localhost:6100/
```

try the following paths:

```javascript
/demo
/parser/search?text=london
/parser/findbyid?ids=101748479
/parser/query?text=london
/parser/tokenize?text=sydney new south wales
```

### Changing languages

the `/parser/search` endpoint accepts a `?lang=xxx` parameter which can be used to vary the language of the data returned.

for example, the following urls will return strings in Japanese / Russian where available:

```javascript
/parser/search?text=germany&lang=jpn
/parser/search?text=germany&lang=rus
```

documents returned by `/parser/search` contain a boolean property named `languageDefaulted` which indicates whether the service was able to find a translation in the language you requested (false) or whether it returned the default language (true).

The `/parser/findbyid` endpoint also accepts a `?lang=xxx` parameter which will return the selected lang if the translation exists, and all translations otherwise.

for example, the following urls will return strings in French / Korean where available:

```javascript
/parser/findbyid?ids=85633147,102191581,85862899&lang=fra
/parser/findbyid?ids=85633147,102191581,85862899&lang=kor
```

the demo is also able to serve responses in different languages by providing the language code in the URL anchor:

```bash
/demo#jpn
/demo#chi
/demo#eng
/demo#fra
... etc.
```
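to check the language fallback programmatically, a sketch along these lines should work (assumes Node 18+ for the global `fetch`, a server running locally as above, and that the response is a JSON array of documents carrying the `languageDefaulted` property described here):

```javascript
// sketch: request German placenames in Japanese and inspect fallbacks
fetch('http://localhost:6100/parser/search?text=germany&lang=jpn')
  .then(res => res.json())
  .then(docs => {
    docs.forEach(doc => {
      // languageDefaulted === true means no 'jpn' translation was found,
      // so the default language was returned instead
      console.log(doc.name, doc.languageDefaulted);
    });
  });
```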

### Filtering by placetype

the `/parser/search` endpoint accepts a `?placetype=xxx` parameter which can be used to control the placetype of records which are returned.

the filter does not provide any performance benefit; it is simply a convenience API which matches results against a whitelist of placetypes.

you may specify multiple placetypes using a comma to separate them, such as `?placetype=xxx,yyy`; these are matched as OR conditions, eg: (xxx OR yyy)

for example:

the query `search?text=luxemburg` will return results for the `country`, `region`, `locality` etc.

you can use the placetype filter to control which records are returned:

```
# all matching results
search?text=luxemburg

# only return matching country records
search?text=luxemburg&placetype=country

# return matching country or region records
search?text=luxemburg&placetype=country,region
```

### Live mode (BETA)

the `/parser/search` endpoint accepts a `?mode=live` parameter which can be used to enable an autocomplete-style API.

in this mode the final token of each input text is considered 'incomplete', meaning that the user has potentially only typed part of a token.

this mode is currently in BETA; the interface and behaviour may change over time.

### Configuring the rtree threshold

the default matching strategy uses the `lineage` table to ensure that token pairs represent a valid child->parent relationship. this ensures that queries like 'London France' do not match, because there is no entry in the lineage table linking those two places together.

in some cases it's preferable to fall back to a matching strategy which considers geographically nearby places with a matching name, even if that relationship does not explicitly exist in the lineage table.

for example, 'Basel France' will return 'Basel Switzerland'. this is useful for handling user input errors, as well as errors and omissions in the lineage table.

in the example above, 'Basel France' only matches because the bounding box of 'Basel' overlaps the bounding box of 'France' and no other valid entry for 'Basel France' exists.

the definition of what is 'nearby' is configurable: the bbox for the minor term (left token) is expanded by a threshold (the threshold is added to or subtracted from each of the bbox vertices).

by default the threshold is set to `0.2` (degrees); any float value between 0 and 1 may be specified via the environment variable `RTREE_THRESHOLD`.

a setting of less than 0 will disable the rtree functionality completely. disabling the rtree will result in nearby queries such as 'Basel France' returning 'France' instead of 'Basel Switzerland'.
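for illustration, the expansion amounts to roughly the following (a sketch only; the bbox format `minx, miny, maxx, maxy` matches `validBoundingBox` in `prototype/wof.js`):

```javascript
// sketch: expand a bbox [minx, miny, maxx, maxy] by a threshold in degrees
const THRESHOLD = parseFloat(process.env.RTREE_THRESHOLD || '0.2');

function expandBBox([minx, miny, maxx, maxy], threshold = THRESHOLD) {
  // grow the box outwards on every side; 'nearby' candidates are those
  // whose own bbox overlaps this expanded box
  return [minx - threshold, miny - threshold, maxx + threshold, maxy + threshold];
}

console.log(expandBBox([7.55, 47.51, 7.63, 47.58])); // a bbox roughly around Basel
```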

---

## Run the interactive shell

```bash
$ npm run repl

> pelias-placeholder@1.0.0 repl
> node cmd/repl.js

placeholder >
```

try the following commands:

```javascript
placeholder > london on
 - 101735809 locality London

placeholder > search london on
 - 101735809 locality London

placeholder > tokenize sydney new south wales
[ [ 'sydney', 'new south wales' ] ]

placeholder > token kelburn
[ 1729339019 ]

placeholder > id 1729339019
{ name: 'Kelburn',
  placetype: 'neighbourhood',
  lineage:
   { continent_id: 102191583,
     country_id: 85633345,
     county_id: 102079339,
     locality_id: 101915529,
     neighbourhood_id: 1729339019,
     region_id: 85687233 },
  names: { eng: [ 'Kelburn' ] } }
```

---

## Configuration for pelias API

While Placeholder can be used as a stand-alone application or included with other geographic software / search engines, it is designed for the [Pelias geocoder](https://github.com/pelias/pelias).

To connect the Placeholder service to the Pelias API, [configure the pelias config file](https://github.com/pelias/api#configuration-via-pelias-config) with the port that placeholder is running on.
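for reference, a minimal sketch of that configuration might look like the following (this assumes the standard pelias config layout and the port used in the examples above; consult the pelias/api documentation linked above for the authoritative format):

```json
{
  "api": {
    "services": {
      "placeholder": {
        "url": "http://localhost:6100"
      }
    }
  }
}
```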

---

## Tests

### run the test suite

```bash
$ npm test
```

### Run the functional cases

there are more exhaustive test cases included in `test/cases/`.

to run all the test cases:

```bash
$ npm run funcs
```

### Generate a ~500,000 line test file

this command requires the `data/wof.extract` file mentioned below in the 'Building the database' section.

```bash
$ npm run gentests
```

once complete you can find the generated test cases in `test/cases/generated.txt`.

---

## Docker

### Build the service image

```bash
$ docker-compose build
```

### Run the service in the background

```bash
$ docker-compose up -d
```

---

## Building the database

### Prerequisites
- jq 1.5+ must be installed
  - on ubuntu: `sudo apt-get install jq`
  - on mac: `brew install jq`
- Who's on First data download
  - use the download script in [pelias/whosonfirst](https://github.com/pelias/whosonfirst#downloading-the-data)

### Steps
the database is created from geographic data sourced from the [whosonfirst](https://whosonfirst.org/) project.

the whosonfirst project is distributed as geojson files, so in order to speed up development we first extract the relevant data into a file: `data/wof.extract`.

the following command will iterate over all the `geojson` files downloaded by the Pelias whosonfirst importer, extracting the relevant properties into the file `data/wof.extract`.

this process can take 30-60 minutes to run and consumes ~350MB of disk space; you will only need to run this command once, or again whenever your local `whosonfirst-data` files are updated.

```bash
$ npm run extract

# alternative if you do not have a `pelias.json` file specifying where WOF data should be
$ WOF_DIR=/path/to/your/whosonfirst/data npm run extract
```

now you can rebuild the `data/store.sqlite3` file with the following command:

this should take 2-3 minutes to run:

```bash
$ npm run build
```

---

## Using the Docker image

### Rebuild the image

you can rebuild the image on any system with the following command:

```bash
$ docker build -t pelias/placeholder .
```

### Download pre-built image

Up to date Docker images are built and automatically pushed to Docker Hub from our continuous integration pipeline.

You can pull the latest stable image with:

```bash
$ docker pull pelias/placeholder
```

### Download custom image tags

We publish each commit and the latest of each branch to separate tags.

A list of all available tags to download can be found at https://hub.docker.com/r/pelias/placeholder/tags/
--------------------------------------------------------------------------------