├── .npmrc ├── .jshintignore ├── cmd ├── server.sh ├── loadtest.js ├── units ├── integration ├── placetype.filter ├── build.sh ├── extract.sh ├── cli.js ├── jq.filter ├── load.js ├── ci.sh ├── s3_upload.sh ├── repl.js ├── generate_tests.js └── wof_extract_sqlite.js ├── .gitignore ├── query ├── count_tokens.sql ├── has_subject_autocomplete.sql ├── match_subject_distinct_subject_ids.sql ├── match_subject_autocomplete_distinct_subject_ids.sql ├── index.js ├── match_subject_object_autocomplete.sql ├── match_subject_object.sql ├── build_rtree.sql ├── match_subject_object_geom_intersects.sql └── match_subject_object_geom_intersects_autocomplete.sql ├── .dockerignore ├── .github └── workflows │ ├── pull_request.yml │ ├── _test.yml │ └── push.yml ├── lib ├── jsonParseStream.js ├── permutations.js ├── sorted.js ├── Database.js ├── TokenIndex.js ├── Result.js ├── analysis.js ├── unicode.js ├── DocStore.js └── Queries.js ├── config └── language │ ├── alternatives.js │ ├── blacklist.js │ └── whitelist.js ├── Dockerfile ├── .jshintrc ├── Placeholder.js ├── test ├── integration.js ├── lib │ ├── permutations.js │ ├── jsonParseStream.js │ ├── sorted.js │ ├── Database.js │ ├── DocStore.js │ ├── analysis.js │ ├── TokenIndex.js │ ├── Result.js │ └── Queries.js ├── server │ └── routes │ │ ├── _util.js │ │ └── findbyid.js ├── units.js ├── prototype │ ├── query_integration.js │ ├── tokenize_integration.js │ ├── io.js │ ├── tokenize.js │ └── query.js ├── functional_autocomplete.js ├── case.js ├── functional.js └── cases │ └── capitalCities.txt ├── server ├── routes │ ├── _util.js │ ├── tokenize.js │ ├── query.js │ ├── findbyid.js │ └── search.js ├── http.js └── demo │ └── index.html ├── prototype ├── io.js ├── tokenize.js ├── query.js └── wof.js ├── LICENSE ├── package.json └── README.md /.npmrc: -------------------------------------------------------------------------------- 1 | package-lock=false 2 | -------------------------------------------------------------------------------- /.jshintignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | coverage 3 | reports -------------------------------------------------------------------------------- /cmd/server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | exec node server/http.js 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | data/* 3 | test/cases/generated.txt 4 | -------------------------------------------------------------------------------- /query/count_tokens.sql: -------------------------------------------------------------------------------- 1 | SELECT COUNT(*) AS cnt 2 | FROM fulltext AS ft 3 | WHERE ft.fulltext MATCH $token_quoted 4 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | node_modules 3 | .dockerignore 4 | .gitignore 5 | .gitattributes 6 | Dockerfile 7 | README.md 8 | data/* 9 | test/cases/generated.txt 10 | -------------------------------------------------------------------------------- /cmd/loadtest.js: -------------------------------------------------------------------------------- 1 | 2 | var Placeholder = require('../Placeholder'), 3 | ph = new Placeholder(); 4 | 5 | ph.load(); 6 | console.log( 'loaded!' 
); 7 | // setInterval( function(){}, 1000 ); 8 | -------------------------------------------------------------------------------- /cmd/units: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # run tests with pipefail to avoid false passes 4 | # see https://github.com/pelias/pelias/issues/744 5 | set -euo pipefail 6 | 7 | node test/units.js | npx tap-spec 8 | -------------------------------------------------------------------------------- /cmd/integration: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # run tests with pipefail to avoid false passes 4 | # see https://github.com/pelias/pelias/issues/744 5 | set -euo pipefail 6 | 7 | node test/integration.js | npx tap-spec 8 | -------------------------------------------------------------------------------- /query/has_subject_autocomplete.sql: -------------------------------------------------------------------------------- 1 | SELECT id 2 | FROM tokens as t1 3 | JOIN fulltext AS f1 ON f1.rowid = t1.rowid 4 | WHERE f1.fulltext MATCH $subject 5 | -- AND t1.tag NOT IN ( 'colloquial' ) 6 | LIMIT 1 7 | -------------------------------------------------------------------------------- /cmd/placetype.filter: -------------------------------------------------------------------------------- 1 | "wof:placetype":\s*"\(ocean\|continent\|marinearea\|empire\|country\|dependency\|disputed\|macroregion\|region\|macrocounty\|county\|localadmin\|locality\|borough\|macrohood\|neighbourhood\)" 2 | -------------------------------------------------------------------------------- /query/match_subject_distinct_subject_ids.sql: -------------------------------------------------------------------------------- 1 | SELECT DISTINCT( t1.id ) AS subjectId 2 | FROM tokens AS t1 3 | WHERE t1.token = $subject 4 | -- AND t1.tag NOT IN ( 'colloquial' ) 5 | ORDER BY t1.id ASC 6 | LIMIT $limit 7 | -------------------------------------------------------------------------------- /.github/workflows/pull_request.yml: -------------------------------------------------------------------------------- 1 | name: Continuous Integration 2 | on: pull_request 3 | jobs: 4 | unit-tests: 5 | # only run this job for forks 6 | if: github.event.pull_request.head.repo.full_name != github.repository 7 | uses: ./.github/workflows/_test.yml 8 | -------------------------------------------------------------------------------- /query/match_subject_autocomplete_distinct_subject_ids.sql: -------------------------------------------------------------------------------- 1 | SELECT DISTINCT( t1.id ) AS subjectId 2 | FROM tokens AS t1 3 | JOIN fulltext AS f1 ON f1.rowid = t1.rowid 4 | WHERE f1.fulltext MATCH $subject 5 | -- AND t1.tag NOT IN ( 'colloquial' ) 6 | ORDER BY t1.id ASC 7 | LIMIT $limit 8 | -------------------------------------------------------------------------------- /cmd/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ); 4 | 5 | PLACEHOLDER_DATA=${PLACEHOLDER_DATA:-"./data"}; 6 | 7 | rm -f ${PLACEHOLDER_DATA}/store.sqlite3; 8 | 9 | cat ${PLACEHOLDER_DATA}/wof.extract | node ${DIR}/load.js 10 | 11 | echo 'Done!' 
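# usage sketch (assumption: a WOF extract has already been created, e.g. via
# ./cmd/extract.sh / `npm run extract`, since this script consumes it):
#   PLACEHOLDER_DATA=./data npm run extract   # writes ./data/wof.extract
#   PLACEHOLDER_DATA=./data npm run build     # rebuilds ./data/store.sqlite3 from the extract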
12 | -------------------------------------------------------------------------------- /query/index.js: -------------------------------------------------------------------------------- 1 | 2 | const fs = require('fs'); 3 | const path = require('path'); 4 | 5 | // load queries from filesystem 6 | module.exports = fs.readdirSync(__dirname).reduce((memo, filename) => { 7 | var sql = fs.readFileSync( path.join( __dirname, filename ), 'utf8' ).trim(); 8 | memo[ filename.replace('.sql', '' ) ] = sql; 9 | return memo; 10 | }, {}); 11 | -------------------------------------------------------------------------------- /lib/jsonParseStream.js: -------------------------------------------------------------------------------- 1 | var through = require('through2'); 2 | 3 | function streamFactory(){ 4 | return through.obj(function( row, _, next ){ 5 | 6 | try { 7 | this.push( JSON.parse( row ) ); 8 | } catch( e ){ 9 | console.error( 'invalid json', e ); 10 | } 11 | 12 | next(); 13 | }); 14 | } 15 | 16 | module.exports = streamFactory; 17 | -------------------------------------------------------------------------------- /cmd/extract.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ); 4 | 5 | PLACEHOLDER_DATA=${PLACEHOLDER_DATA:-"./data"}; 6 | 7 | mkdir -p ${PLACEHOLDER_DATA}; 8 | 9 | echo "Creating extract at ${PLACEHOLDER_DATA}/wof.extract" 10 | 11 | node --max_old_space_size=8000 ${DIR}/wof_extract_sqlite.js > ${PLACEHOLDER_DATA}/wof.extract; 12 | 13 | echo 'Done!' 14 | -------------------------------------------------------------------------------- /config/language/alternatives.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | 'tib': 'bod', 3 | 'cze': 'ces', 4 | 'wel': 'cym', 5 | 'ger': 'deu', 6 | 'gre': 'ell', 7 | 'baq': 'eus', 8 | 'per': 'fas', 9 | 'fre': 'fra', 10 | 'arm': 'hye', 11 | 'ice': 'isl', 12 | 'geo': 'kat', 13 | 'mac': 'mkd', 14 | 'mao': 'mri', 15 | 'may': 'msa', 16 | 'bur': 'mya', 17 | 'dut': 'nld', 18 | 'rum': 'ron', 19 | 'slo': 'slk', 20 | 'alb': 'sqi', 21 | 'chi': 'zho' 22 | }; -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # base image 2 | FROM pelias/baseimage 3 | 4 | # change working dir 5 | ENV WORKDIR /code/pelias/placeholder 6 | WORKDIR ${WORKDIR} 7 | 8 | # copy package.json first to prevent npm install being rerun when only code changes 9 | COPY ./package.json ${WORKDIR} 10 | RUN npm install 11 | 12 | # copy code from local checkout 13 | ADD . 
${WORKDIR} 14 | 15 | ENV PLACEHOLDER_DATA '/data/placeholder' 16 | 17 | USER pelias 18 | 19 | CMD [ "./cmd/server.sh" ] 20 | -------------------------------------------------------------------------------- /.jshintrc: -------------------------------------------------------------------------------- 1 | { 2 | "node": true, 3 | "curly": true, 4 | "eqeqeq": true, 5 | "esversion": 6, 6 | "freeze": true, 7 | "immed": true, 8 | "indent": 2, 9 | "latedef": false, 10 | "newcap": true, 11 | "noarg": true, 12 | "noempty": true, 13 | "nonbsp": true, 14 | "nonew": true, 15 | "plusplus": false, 16 | "quotmark": "single", 17 | "undef": true, 18 | "unused": false, 19 | "maxparams": 5, 20 | "maxdepth": 4, 21 | "maxlen": 140 22 | } 23 | -------------------------------------------------------------------------------- /query/match_subject_object_autocomplete.sql: -------------------------------------------------------------------------------- 1 | SELECT t1.id AS subjectId, t2.id as objectId 2 | FROM lineage AS l1 3 | JOIN tokens AS t1 ON t1.id = l1.id 4 | JOIN tokens AS t2 ON t2.id = l1.pid 5 | WHERE t1.token = $subject 6 | AND t2.token LIKE $object 7 | AND ( 8 | t1.lang = t2.lang OR 9 | t1.lang IN ( 'eng', 'und' ) OR 10 | t2.lang IN ( 'eng', 'und' ) 11 | ) 12 | -- AND t1.tag NOT IN ( 'colloquial' ) 13 | -- AND t2.tag NOT IN ( 'colloquial' ) 14 | GROUP BY t1.id, t2.id 15 | ORDER BY t1.id ASC, t2.id ASC 16 | LIMIT $limit 17 | -------------------------------------------------------------------------------- /Placeholder.js: -------------------------------------------------------------------------------- 1 | 2 | var _ = require('lodash'), 3 | DocStore = require('./lib/DocStore'), 4 | TokenIndex = require('./lib/TokenIndex'); 5 | 6 | // constructor 7 | function Placeholder( options ){ 8 | this.store = new DocStore( options ); 9 | this.index = new TokenIndex( options ); 10 | } 11 | 12 | // load prototype methods from modules 13 | Placeholder.prototype = _.extend( Placeholder.prototype, 14 | require('./prototype/io.js'), 15 | require('./prototype/query.js'), 16 | require('./prototype/tokenize.js'), 17 | require('./prototype/wof.js') 18 | ); 19 | 20 | module.exports = Placeholder; 21 | -------------------------------------------------------------------------------- /cmd/cli.js: -------------------------------------------------------------------------------- 1 | 2 | var Placeholder = require('../Placeholder'), 3 | ph = new Placeholder(); 4 | 5 | // init placeholder 6 | ph.load(); 7 | 8 | // -- user input -- 9 | var input = ( process.argv.slice(2) || [] ).join(' ') || ''; 10 | console.log( input + '\n' ); 11 | 12 | // -- search -- 13 | console.time('took'); 14 | ph.query( input, ( err, res ) => { 15 | console.timeEnd('took'); 16 | 17 | // print results 18 | ph.store.getMany( res.getIdsAsArray(), (err, docs) => { 19 | docs.forEach( doc => { 20 | console.log( ' -', [ doc.id, doc.placetype + ' ', doc.name ].join('\t') ); 21 | }); 22 | }); 23 | }); 24 | -------------------------------------------------------------------------------- /.github/workflows/_test.yml: -------------------------------------------------------------------------------- 1 | name: Unit Tests 2 | on: workflow_call 3 | jobs: 4 | unit-tests: 5 | runs-on: '${{ matrix.os }}' 6 | strategy: 7 | matrix: 8 | os: 9 | - ubuntu-22.04 10 | node-version: [ 20.x, 22.x, 24.x ] 11 | steps: 12 | - uses: actions/checkout@v4 13 | - name: 'Install node.js ${{ matrix.node-version }}' 14 | uses: actions/setup-node@v4 15 | with: 16 | node-version: '${{ matrix.node-version }}' 
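# note: the matrix above fans this job out to one run per node-version
# (20.x, 22.x and 24.x), each on ubuntu-22.04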
17 | - name: Run unit tests 18 | run: | 19 | [[ -f ./bin/ci-setup ]] && ./bin/ci-setup 20 | npm install 21 | npm run ci 22 | -------------------------------------------------------------------------------- /test/integration.js: -------------------------------------------------------------------------------- 1 | var tape = require('tape'); 2 | var path = require('path'); 3 | 4 | var tests = [ 5 | './lib/Queries', 6 | './prototype/tokenize_integration', 7 | './prototype/query_integration', 8 | './functional', 9 | './functional_autocomplete', 10 | ]; 11 | 12 | // test runner 13 | tests.map( function( testpath ){ 14 | 15 | var file = require( testpath ); 16 | 17 | var test = function( name, func ) { 18 | return tape( path.normalize( testpath ) + ': ' + name , func ); 19 | }; 20 | 21 | for( var testCase in file ){ 22 | if( 'function' === typeof file[testCase] ){ 23 | file[testCase]( test ); 24 | } 25 | } 26 | }); 27 | -------------------------------------------------------------------------------- /test/lib/permutations.js: -------------------------------------------------------------------------------- 1 | 2 | var permutations = require('../../lib/permutations'); 3 | 4 | module.exports.permutations = function(test, common) { 5 | test('permutations', function(t) { 6 | 7 | var input = [ 'soho', 'new', 'york', 'usa' ]; 8 | var expected = [ 9 | [ 'soho', 'new', 'york', 'usa' ], 10 | [ 'soho', 'new', 'york' ], 11 | [ 'soho', 'new' ], 12 | [ 'soho' ], 13 | [ 'new', 'york', 'usa' ], 14 | [ 'new', 'york' ], 15 | [ 'new' ], 16 | [ 'york', 'usa' ], 17 | [ 'york' ], 18 | [ 'usa' ] 19 | ]; 20 | 21 | t.deepEqual( permutations.expand( input ), expected ); 22 | t.end(); 23 | }); 24 | }; 25 | -------------------------------------------------------------------------------- /cmd/jq.filter: -------------------------------------------------------------------------------- 1 | .properties | with_entries( 2 | select(.key | test( 3 | "^(wof:(id|name|placetype|hierarchy|parent_id|country_alpha3|abbreviation|shortcode|superseded_by|label|population|megacity)$|" + 4 | "lbl:(bbox|latitude|longitude)$|" + 5 | "geom:(area|bbox|latitude|longitude)$|" + 6 | "iso:(country)$|" + 7 | "ne:(iso_a2|iso_a3|pop_est)$|" + 8 | "edtf:(deprecated)$|" + 9 | "mz:(is_current|population)$|" + 10 | "gn:(population|pop)$|" + 11 | "zs:(pop10)$|" + 12 | "qs:(pop|gn_pop|photo_sum)$|" + 13 | "wk:(population)$|" + 14 | "meso:(pop)$|" + 15 | "statoids:(population)$|" + 16 | "name:|" + 17 | "abrv:)" 18 | )) 19 | ) 20 | -------------------------------------------------------------------------------- /server/routes/_util.js: -------------------------------------------------------------------------------- 1 | 2 | // in express, if you pass query params like this `?param[]=value` 3 | // then the type of the param is Array and the code may be expecting a string. 4 | // this convenience function allows either form to be used. 5 | function arrayParam( param ){ 6 | var res = []; 7 | 8 | // accept param as array. eg: param[]=value 9 | if( Array.isArray( param ) ){ res = param; } 10 | 11 | // accept param as string. 
eg: param=value 12 | if( 'string' === typeof param ){ res = param.split(','); } 13 | 14 | // trim strings and remove empty elements 15 | return res.map(a => a.trim()).filter(a => a.length); 16 | } 17 | 18 | module.exports.arrayParam = arrayParam; 19 | -------------------------------------------------------------------------------- /test/server/routes/_util.js: -------------------------------------------------------------------------------- 1 | 2 | var util = require('../../../server/routes/_util'); 3 | 4 | module.exports.arrayParam = function(test, common) { 5 | test('arrayParam', function(t) { 6 | t.deepEqual( util.arrayParam(undefined), [], 'undefined' ); 7 | t.deepEqual( util.arrayParam(null), [], 'null' ); 8 | t.deepEqual( util.arrayParam(''), [], 'empty' ); 9 | t.deepEqual( util.arrayParam([]), [], 'empty array' ); 10 | t.deepEqual( util.arrayParam(['a ', ' b']), ['a','b'], 'array' ); 11 | t.deepEqual( util.arrayParam(' test '), ['test'], 'simple string' ); 12 | t.deepEqual( util.arrayParam(' test, foo '), ['test','foo'], 'delimited string' ); 13 | t.end(); 14 | }); 15 | }; 16 | -------------------------------------------------------------------------------- /server/routes/tokenize.js: -------------------------------------------------------------------------------- 1 | 2 | const PARTIAL_TOKEN_SUFFIX = require('../../lib/analysis').PARTIAL_TOKEN_SUFFIX; 3 | 4 | module.exports = function( req, res ){ 5 | 6 | // placeholder 7 | var ph = req.app.locals.ph; 8 | 9 | // input text 10 | var text = req.query.text || ''; 11 | 12 | // live mode (autocomplete-style search) 13 | // we append a byte indicating the last word is potentially incomplete. 14 | // except where the last token is a space, then we simply trim the space. 15 | if( req.query.mode === 'live' ){ 16 | if( ' ' === text.slice(-1) ){ 17 | text = text.trim(); 18 | } else { 19 | text += PARTIAL_TOKEN_SUFFIX; 20 | } 21 | } 22 | 23 | ph.tokenize( text, ( err, groups ) => { 24 | res.status(200).json( groups ); 25 | }); 26 | }; 27 | -------------------------------------------------------------------------------- /test/lib/jsonParseStream.js: -------------------------------------------------------------------------------- 1 | 2 | const through = require('through2'); 3 | const parser = require('../../lib/jsonParseStream'); 4 | 5 | module.exports.parse = function(test, common) { 6 | test('parse', function(t) { 7 | 8 | var chunks = []; 9 | 10 | const xform = (chunk, _, next) => { 11 | chunks.push( chunk ); 12 | next(); 13 | }; 14 | 15 | const flush = (next) => { 16 | t.deepEqual(chunks, [ 17 | { hello: 'world' }, 18 | { test: 'message' } 19 | ]); 20 | t.end(); 21 | next(); 22 | }; 23 | 24 | const stream = parser(); 25 | stream.pipe( through.obj( xform, flush ) ); 26 | stream.write('{ "hello": "world" }'); 27 | stream.write('{ "test": "message" }'); 28 | stream.end(); 29 | }); 30 | }; 31 | -------------------------------------------------------------------------------- /config/language/blacklist.js: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | This blacklist lists all the language codes we exclude from import into placeholder. 4 | 5 | The whosonfirst dataset contains many disused and rarely-used languages which 6 | can cause issues when the source data has been machine-transliterated. 7 | 8 | The list is non-exhaustive and was originally sourced from Wikipedia and various 9 | online sources; I aimed to include only the least commonly spoken languages. 
10 | 11 | If you feel a language code is wrong or missing, please feel free to edit this file. 12 | **/ 13 | 14 | // Enawene Nawe language 15 | // https://en.wikipedia.org/wiki/Enawene_Nawe_language 16 | module.exports.unk = ''; 17 | 18 | // Volapük 19 | // https://en.wikipedia.org/wiki/Volap%C3%BCk 20 | module.exports.vol = ''; 21 | -------------------------------------------------------------------------------- /query/match_subject_object.sql: -------------------------------------------------------------------------------- 1 | WITH l AS ( 2 | SELECT * 3 | FROM lineage 4 | WHERE id IN ( 5 | SELECT id 6 | FROM tokens 7 | WHERE token = $subject 8 | ) 9 | AND pid IN ( 10 | SELECT id 11 | FROM tokens 12 | WHERE token = $object 13 | ) 14 | ) 15 | SELECT 16 | l.id AS subjectId, 17 | l.pid AS objectId 18 | FROM l 19 | JOIN tokens AS t1 20 | INDEXED BY tokens_cover_idx 21 | USING (id) 22 | JOIN tokens AS t2 23 | INDEXED BY tokens_cover_idx 24 | ON t2.id = l.pid 25 | WHERE 26 | t1.token = $subject 27 | AND 28 | t2.token = $object 29 | AND ( 30 | t1.lang = t2.lang OR 31 | t1.lang IN ( 'eng', 'und' ) OR 32 | t2.lang IN ( 'eng', 'und' ) 33 | ) 34 | -- AND t1.tag NOT IN ( 'colloquial' ) 35 | -- AND t2.tag NOT IN ( 'colloquial' ) 36 | GROUP BY l.id, l.pid 37 | ORDER BY l.id ASC, l.pid ASC 38 | LIMIT $limit 39 | -------------------------------------------------------------------------------- /cmd/load.js: -------------------------------------------------------------------------------- 1 | const split = require('split2'); 2 | const through = require('through2'); 3 | const parser = require('../lib/jsonParseStream'); 4 | const Placeholder = require('../Placeholder'); 5 | const ph = new Placeholder(); 6 | 7 | // run import pipeline 8 | console.error('import...'); 9 | ph.load({ reset: true }); 10 | 11 | // run import 12 | process.stdin.pipe( split() ) 13 | .pipe( parser() ) 14 | .pipe( through.obj( function insert( row, _, next ){ 15 | ph.insertWofRecord( row, next ); 16 | }, function flush( done ){ 17 | console.error('populate fts...'); 18 | ph.populate(); 19 | console.error('optimize...'); 20 | ph.optimize(); 21 | console.error('close...'); 22 | ph.close(); 23 | done(); 24 | })); 25 | -------------------------------------------------------------------------------- /test/units.js: -------------------------------------------------------------------------------- 1 | var tape = require('tape'); 2 | var path = require('path'); 3 | 4 | var tests = [ 5 | './lib/jsonParseStream', 6 | './lib/analysis', 7 | './lib/permutations', 8 | './lib/sorted', 9 | './lib/Database', 10 | './lib/DocStore', 11 | './lib/TokenIndex', 12 | './lib/Result', 13 | './prototype/wof', 14 | './prototype/io', 15 | './prototype/tokenize', 16 | './prototype/query', 17 | './server/routes/_util.js', 18 | './server/routes/findbyid.js', 19 | ]; 20 | 21 | // test runner 22 | tests.map( function( testpath ){ 23 | 24 | var file = require( testpath ); 25 | 26 | var test = function( name, func ) { 27 | return tape( path.normalize( testpath ) + ': ' + name , func ); 28 | }; 29 | 30 | for( var testCase in file ){ 31 | if( 'function' === typeof file[testCase] ){ 32 | file[testCase]( test ); 33 | } 34 | } 35 | }); 36 | -------------------------------------------------------------------------------- /server/routes/query.js: -------------------------------------------------------------------------------- 1 | 2 | const PARTIAL_TOKEN_SUFFIX = require('../../lib/analysis').PARTIAL_TOKEN_SUFFIX; 3 | 4 | module.exports = function( req, res ){ 5 | 6 | // 
placeholder 7 | var ph = req.app.locals.ph; 8 | 9 | // input text 10 | var text = req.query.text || ''; 11 | 12 | // live mode (autocomplete-style search) 13 | // we append a byte indicating the last word is potentially incomplete. 14 | // except where the last token is a space, then we simply trim the space. 15 | if( req.query.mode === 'live' ){ 16 | if( ' ' === text.slice(-1) ){ 17 | text = text.trim(); 18 | } else { 19 | text += PARTIAL_TOKEN_SUFFIX; 20 | } 21 | } 22 | 23 | // perform query 24 | console.time('took'); 25 | ph.query( text, ( err, result ) => { 26 | console.timeEnd('took'); 27 | res.status(200).json( result.getIdsAsArray() ); 28 | }); 29 | }; 30 | -------------------------------------------------------------------------------- /test/prototype/query_integration.js: -------------------------------------------------------------------------------- 1 | 2 | var Placeholder = require('../../Placeholder'); 3 | 4 | module.exports.query = function(test, util) { 5 | 6 | // load data 7 | var ph = new Placeholder(); 8 | ph.load(); 9 | 10 | var assert = runner.bind(null, test, ph); 11 | 12 | assert([['kelburn', 'wellington', 'new zealand']], [1729339019]); 13 | assert([['north sydney']], [85784821, 101931469, 102048877, 404225393, 1310698409]); 14 | assert([['sydney', 'new south wales', 'australia']], [101932003, 102049151, 404226357, 1376953385, 1377004395]); 15 | assert([['ケープタウン', '南アフリカ']], [101928027]); 16 | }; 17 | 18 | // convenience function for writing quick 'n easy test cases 19 | function runner( test, ph, actual, expected ){ 20 | test( actual, function(t) { 21 | ph.query( actual[0].join(' '), ( err, res ) => { 22 | t.deepEqual( res.getIdsAsArray(), expected ); 23 | t.end(); 24 | }); 25 | }); 26 | } 27 | -------------------------------------------------------------------------------- /query/build_rtree.sql: -------------------------------------------------------------------------------- 1 | 2 | -- create virtual table 3 | CREATE VIRTUAL TABLE IF NOT EXISTS rtree USING rtree( 4 | id, -- Integer primary key 5 | minX, maxX, -- Minimum and maximum X coordinate 6 | minY, maxY, -- Minimum and maximum Y coordinate 7 | minZ, maxZ -- Minimum and maximum 'rank' 8 | ); 9 | 10 | -- delete existing values 11 | DELETE FROM rtree; 12 | 13 | -- fill rtree 14 | INSERT INTO rtree 15 | SELECT 16 | id, 17 | json_extract( json( '[' || json_extract( json, '$.geom.bbox' ) || ']' ), '$[0]' ) AS minX, 18 | json_extract( json( '[' || json_extract( json, '$.geom.bbox' ) || ']' ), '$[2]' ) AS maxX, 19 | json_extract( json( '[' || json_extract( json, '$.geom.bbox' ) || ']' ), '$[1]' ) AS minY, 20 | json_extract( json( '[' || json_extract( json, '$.geom.bbox' ) || ']' ), '$[3]' ) AS maxY, 21 | json_extract( json, '$.rank.min' ) AS minZ, 22 | json_extract( json, '$.rank.max' ) AS maxZ 23 | FROM docs; -------------------------------------------------------------------------------- /cmd/ci.sh: -------------------------------------------------------------------------------- 1 | # Download Placeholder data for tests 2 | BUCKET=https://data.geocode.earth/placeholder 3 | 4 | export AGENT="github/${GITHUB_ACTOR}" 5 | export REFERER="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}" 6 | 7 | if [ ! -e data/store.sqlite3 ]; then 8 | # ensure data directory exists 9 | mkdir -p data 10 | 11 | # attempt to download today's data first, fall back to latest if not found 12 | echo "Downloading placeholder data..." 
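# e.g. this first tries ${BUCKET}/2024-05-01/store.sqlite3.gz (date shown is
# illustrative), then falls back to the undated ${BUCKET}/store.sqlite3.gz below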
13 | curl -A "${AGENT}" -e "${REFERER}" -sfo data/store.sqlite3.gz ${BUCKET}/$(date +%Y-%m-%d)/store.sqlite3.gz || true 14 | [ -e data/store.sqlite3.gz ] || curl -A "${AGENT}" -e "${REFERER}" -so data/store.sqlite3.gz ${BUCKET}/store.sqlite3.gz 15 | 16 | # decompress the sqlite database 17 | echo "Decompressing placeholder data..." 18 | gunzip -f data/store.sqlite3.gz 19 | fi 20 | 21 | # check sqlite3 version 22 | sqlite3 --version 23 | 24 | # install npm dependencies 25 | npm install 26 | 27 | # run all tests 28 | npm run all 29 | -------------------------------------------------------------------------------- /prototype/io.js: -------------------------------------------------------------------------------- 1 | 2 | // plugin to handle I/O 3 | const path = require('path'); 4 | 5 | // load data from disk 6 | module.exports.load = function( opts ){ 7 | const dataDir = process.env.PLACEHOLDER_DATA || path.join( __dirname, '../data/'); 8 | const dbPath = path.join( dataDir, 'store.sqlite3' ); 9 | 10 | this.store.open( dbPath, opts ); // document store 11 | this.index.open( dbPath, opts ); // token index 12 | }; 13 | 14 | // populate databases 15 | module.exports.populate = function(){ 16 | this.store.populate(); 17 | this.index.populate(); 18 | }; 19 | 20 | // optimize databases 21 | module.exports.optimize = function(){ 22 | this.index.optimize(); 23 | }; 24 | 25 | // check schema of databases match 26 | // the schema expected by the codebase 27 | module.exports.checkSchema = function(){ 28 | this.store.checkSchema(); 29 | this.index.checkSchema(); 30 | }; 31 | 32 | // gracefully close connections 33 | module.exports.close = function(){ 34 | this.store.close(); 35 | this.index.close(); 36 | }; 37 | -------------------------------------------------------------------------------- /query/match_subject_object_geom_intersects.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | t1.id AS subjectId, 3 | t2.id as objectId 4 | FROM fulltext f1 5 | JOIN tokens t1 ON ( 6 | f1.rowid = t1.rowid 7 | AND f1.fulltext MATCH $subject_quoted 8 | AND LIKELY(t1.token = $subject) 9 | ) 10 | JOIN rtree AS r1 ON t1.id = r1.id 11 | JOIN rtree AS r2 ON ( 12 | r1.maxZ < r2.minZ AND 13 | (r1.minX - $threshold) < r2.maxX AND 14 | (r1.maxX + $threshold) > r2.minX AND 15 | (r1.minY - $threshold) < r2.maxY AND 16 | (r1.maxY + $threshold) > r2.minY 17 | ) 18 | JOIN fulltext AS f2 ON f2.fulltext MATCH $object_quoted 19 | JOIN tokens t2 ON ( 20 | f2.rowid = t2.rowid 21 | AND r2.id = t2.id 22 | AND LIKELY(t2.token = $object) 23 | AND ( 24 | t1.lang = t2.lang OR 25 | t1.lang IN ('eng', 'und') OR 26 | t2.lang IN ('eng', 'und') 27 | ) 28 | ) 29 | GROUP BY t1.id, t2.id 30 | ORDER BY t1.id ASC, t2.id ASC 31 | LIMIT $limit 32 | -------------------------------------------------------------------------------- /test/functional_autocomplete.js: -------------------------------------------------------------------------------- 1 | 2 | var Placeholder = require('../Placeholder'); 3 | 4 | module.exports.functional = function(test, util) { 5 | 6 | // load data 7 | var ph = new Placeholder(); 8 | ph.load(); 9 | 10 | var assert = runner.bind(null, test, ph); 11 | 12 | assert('Kelbur\x26', [1326645067, 1729339019]); 13 | assert('Kelburn\x26', [1326645067, 1729339019]); 14 | assert('Kelburn W\x26', [1729339019]); 15 | assert('Kelburn Well\x26', [1729339019]); 16 | assert('Kelburn Wellington\x26', [1729339019]); 17 | assert('Kelburn Wellington New\x26', [1729339019]); 18 | assert('Kelburn Wellington 
New Z\x26', [1729339019]); 19 | assert('Kelburn Wellington New Zeal\x26', [1729339019]); 20 | assert('Kelburn Wellington New Zealand\x26', [1729339019]); 21 | }; 22 | 23 | // convenience function for writing quick 'n easy test cases 24 | function runner( test, ph, actual, expected ){ 25 | test( actual, function(t) { 26 | ph.query( actual, ( err, res ) => { 27 | t.deepEqual( res.getIdsAsArray(), expected ); 28 | t.end(); 29 | }); 30 | }); 31 | } 32 | -------------------------------------------------------------------------------- /query/match_subject_object_geom_intersects_autocomplete.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | t1.id AS subjectId, 3 | t2.id as objectId 4 | FROM fulltext f1 5 | JOIN tokens t1 ON ( 6 | f1.rowid = t1.rowid 7 | AND f1.fulltext MATCH $subject_quoted 8 | AND LIKELY(t1.token = $subject) 9 | ) 10 | JOIN rtree AS r1 ON t1.id = r1.id 11 | JOIN rtree AS r2 ON ( 12 | r1.maxZ < r2.minZ AND 13 | (r1.minX - $threshold) < r2.maxX AND 14 | (r1.maxX + $threshold) > r2.minX AND 15 | (r1.minY - $threshold) < r2.maxY AND 16 | (r1.maxY + $threshold) > r2.minY 17 | ) 18 | JOIN fulltext AS f2 ON f2.fulltext MATCH $object_quoted OR $object_quoted* 19 | JOIN tokens t2 ON ( 20 | f2.rowid = t2.rowid 21 | AND r2.id = t2.id 22 | AND LIKELY(t2.token = $object OR t2.token LIKE ($object || '%')) 23 | AND ( 24 | t1.lang = t2.lang OR 25 | t1.lang IN ('eng', 'und') OR 26 | t2.lang IN ('eng', 'und') 27 | ) 28 | ) 29 | GROUP BY t1.id, t2.id 30 | ORDER BY t1.id ASC, t2.id ASC 31 | LIMIT $limit 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 pelias 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /lib/permutations.js: -------------------------------------------------------------------------------- 1 | 2 | var GROUP_MIN = 1; 3 | var GROUP_MAX = 6; 4 | 5 | // produce all the possible token groups from adjacent input tokens (without reordering tokens) 6 | 7 | module.exports.expand = function( tokens ){ 8 | 9 | var groups = []; 10 | 11 | // favour smaller tokens over larger ones 12 | // for( var i=0; i<tokens.length; i++ ){ 13 | //   for( var j=i+GROUP_MIN; j<=i+GROUP_MAX; j++ ){ 14 | //     if( j > tokens.length ){ break; } 15 | //     groups.push( tokens.slice( i, j ) ); 16 | //   } 17 | // } 18 | 19 | // favour larger tokens over shorter ones 20 | for( var i=0; i<tokens.length; i++ ){ 21 | for( var j=i+GROUP_MAX; j>=i+GROUP_MIN; j-- ){ 22 | if( j <= tokens.length ){ 23 | groups.push( tokens.slice( i, j ) ); 24 | } 25 | } 26 | } 27 | 28 | return groups; 29 | }; 30 | 31 | /** 32 | example: 33 | 34 | input: [ 'soho', 'new', 'york', 'usa' ] 35 | 36 | output: [ 37 | [ 'soho', 'new', 'york', 'usa' ], 38 | [ 'soho', 'new', 'york' ], 39 | [ 'soho', 'new' ], 40 | [ 'soho' ], 41 | [ 'new', 'york', 'usa' ], 42 | [ 'new', 'york' ], 43 | [ 'new' ], 44 | [ 'york', 'usa' ], 45 | [ 'york' ], 46 | [ 'usa' ] 47 | ] 48 | **/ 49 | -------------------------------------------------------------------------------- /server/routes/findbyid.js: -------------------------------------------------------------------------------- 1 | 2 | module.exports = function( req, res ){ 3 | 4 | // placeholder 5 | var ph = req.app.locals.ph; 6 | 7 | var ids = ( req.query.ids || '' ).split(',').map( function( id ){ 8 | return parseInt( id.trim(), 10 ); 9 | }).filter( function( id ){ 10 | return !isNaN( id ); 11 | }); 12 | 13 | var lang; 14 | if( 'string' === typeof req.query.lang && req.query.lang.length === 3 ){ 15 | lang = req.query.lang.toLowerCase(); 16 | } 17 | 18 | // load docs 19 | ph.store.getMany( ids, function( err, documents ){ 20 | if( err ){ return res.status(500).send({}); } 21 | if( !documents || !documents.length ){ return res.status(404).send({}); } 22 | 23 | var docs = {}; 24 | for( var i=0; i 23 | if [[ "${{ github.repository_owner }}" == "pelias" ]]; then 24 | curl "https://raw.githubusercontent.com/pelias/ci-tools/master/semantic-release.sh" | bash - 25 | fi 26 | build-docker-images: 27 | # run this job if the unit tests passed and the npm-publish job was a success or was skipped 28 | # note: github actions won't run a job if you don't call one of the status check functions, so `always()` is called since it evaluates to `true` 29 | if: ${{ always() && needs.unit-tests.result == 'success' && (needs.npm-publish.result == 'success' || needs.npm-publish.result == 'skipped') }} 30 | needs: [unit-tests, npm-publish] 31 | runs-on: ubuntu-24.04 32 | steps: 33 | - uses: actions/checkout@v4 34 | - name: Build Docker images 35 | env: 36 | DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }} 37 | DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} 38 | run: | 39 | curl "https://raw.githubusercontent.com/pelias/ci-tools/master/build-docker-images.sh" | bash - 40 | -------------------------------------------------------------------------------- /test/lib/sorted.js: -------------------------------------------------------------------------------- 1 | 2 | var sorted = require('../../lib/sorted'); 3 | 4 | // sort 5 | module.exports.sort = function(test, common) { 6 | 7 | // test runner 8 | var assert = function( actual, expected ){ 9 | test( 'sort', function(t) { 10 | t.deepEqual( sorted.sort( actual ), expected ); 11 | t.end(); 12 | }); 13 | }; 14 | 15 | assert([0, 10, 4, -1, 5, 5, 3], [ -1, 0, 3, 4, 5, 
5, 10 ]); 16 | assert([0, 9, 4, -10, 5, 5, 2], [ -10, 0, 2, 4, 5, 5, 9 ]); 17 | }; 18 | 19 | // sorted merge 20 | module.exports.merge = function(test, common) { 21 | 22 | // test runner 23 | var assert = function( a, b, expected ){ 24 | test( 'merge', function(t) { 25 | t.deepEqual( sorted.merge( a, b ), expected ); 26 | t.end(); 27 | }); 28 | }; 29 | 30 | assert( 31 | [ -1, 0, 3, 4, 5, 5, 10 ], 32 | [ -10, 0, 2, 4, 5, 5, 9 ], 33 | [ -10, -1, 0, 2, 3, 4, 5, 9, 10 ] 34 | ); 35 | }; 36 | 37 | // sorted intersect 38 | module.exports.intersect = function(test, common) { 39 | 40 | // test runner 41 | var assert = function( a, b, expected ){ 42 | test( 'intersect', function(t) { 43 | t.deepEqual( sorted.intersect([ a, b ]), expected ); 44 | t.end(); 45 | }); 46 | }; 47 | 48 | assert( 49 | [ -1, 0, 3, 4, 5, 5, 10 ], 50 | [ -10, 0, 2, 4, 5, 5, 9 ], 51 | [ 0, 4, 5, 5 ] 52 | ); 53 | }; 54 | 55 | // sorted unique 56 | module.exports.unique = function(test, common) { 57 | 58 | // test runner 59 | var assert = function( a, expected ){ 60 | test( 'unique', function(t) { 61 | t.deepEqual( sorted.unique( a ), expected ); 62 | t.end(); 63 | }); 64 | }; 65 | 66 | assert( 67 | [ -1, 0, 0, 3, 4, 5, 5, 10 ], 68 | [ -1, 0, 3, 4, 5, 10 ] 69 | ); 70 | }; 71 | -------------------------------------------------------------------------------- /cmd/s3_upload.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | # directory of this file 5 | DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) 6 | DATA_DIR=${PLACEHOLDER_DATA:-"${DIR}/../data"} 7 | BUCKET='s3://pelias-data.nextzen.org/placeholder' 8 | TODAY=`date +%Y-%m-%d` 9 | 10 | echo '--- gzipping data files ---' 11 | if type pigz >/dev/null 12 | then 13 | pigz -k -c --best "${DATA_DIR}/store.sqlite3" > "${DATA_DIR}/store.sqlite3.gz" 14 | pigz -k -c --best "${DATA_DIR}/wof.extract" > "${DATA_DIR}/wof.extract.gz" 15 | else 16 | gzip -c --best "${DATA_DIR}/store.sqlite3" > "${DATA_DIR}/store.sqlite3.gz" 17 | gzip -c --best "${DATA_DIR}/wof.extract" > "${DATA_DIR}/wof.extract.gz" 18 | fi 19 | 20 | echo '--- uploading archive ---' 21 | aws s3 cp "${DATA_DIR}/store.sqlite3.gz" "${BUCKET}/archive/${TODAY}/store.sqlite3.gz" --region us-east-1 --acl public-read 22 | aws s3 cp "${DATA_DIR}/wof.extract.gz" "${BUCKET}/archive/${TODAY}/wof.extract.gz" --region us-east-1 --acl public-read 23 | 24 | echo '--- list remote archive ---' 25 | aws s3 ls --human-readable "${BUCKET}/archive/${TODAY}/" 26 | 27 | echo -e "\n> would you like to promote this build to production (yes/no)?" 
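# note: the answer is read from stdin, so a non-interactive promotion could be
# scripted as `echo yes | ./cmd/s3_upload.sh` (hypothetical invocation)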
28 | read answer 29 | 30 | if [ "$answer" == "yes" ] || [ "$answer" == "y" ]; then 31 | echo '--- promoting build to production ---' 32 | aws s3 cp "${BUCKET}/archive/${TODAY}/store.sqlite3.gz" "${BUCKET}/store.sqlite3.gz" --region us-east-1 --acl public-read 33 | aws s3 cp "${BUCKET}/archive/${TODAY}/wof.extract.gz" "${BUCKET}/wof.extract.gz" --region us-east-1 --acl public-read 34 | 35 | echo '--- list remote production files ---' 36 | aws s3 ls --human-readable "${BUCKET}/" 37 | else 38 | echo 'you did not answer yes, the build was not promoted to production' 39 | fi 40 | -------------------------------------------------------------------------------- /cmd/repl.js: -------------------------------------------------------------------------------- 1 | 2 | var repl = require('repl'), 3 | Placeholder = require('../Placeholder'), 4 | ph = new Placeholder(); 5 | 6 | // init placeholder 7 | ph.load(); 8 | 9 | // commands 10 | var commands = { 11 | search: function( input, cb ){ 12 | console.time('took'); 13 | ph.query( input, ( err, res ) => { 14 | ph.store.getMany( res.getIdsAsArray(), ( err, docs ) => { 15 | if( err ){ return console.error( err ); } 16 | docs.forEach( doc => { 17 | console.log( ' -', [ doc.id, doc.placetype + ' ', doc.name ].join('\t') ); 18 | }); 19 | console.timeEnd('took'); 20 | cb(); 21 | }); 22 | }); 23 | }, 24 | tokenize: function( input, cb ){ 25 | console.time('took'); 26 | ph.tokenize( input, ( err, groups ) => { 27 | console.timeEnd('took'); 28 | console.log( groups ); 29 | cb(); 30 | }); 31 | }, 32 | token: function( body, cb ){ 33 | console.log( 'token', '"' + body + '"' ); 34 | console.time('took'); 35 | ph.index.matchSubjectDistinctSubjectIds( body, ( err, rows ) => { 36 | const subjectIds = rows.map( row => row.subjectId ); 37 | console.timeEnd('took'); 38 | console.log( subjectIds ); 39 | cb(); 40 | }); 41 | }, 42 | id: function( id, cb ){ 43 | console.time('took'); 44 | ph.store.get( id, ( err, doc ) => { 45 | if( err ){ return console.error( err ); } 46 | // console.log( ' -', [ doc.id, doc.placetype + ' ', doc.name ].join('\t') ); 47 | console.log( doc ); 48 | console.timeEnd('took'); 49 | cb(); 50 | }); 51 | } 52 | }; 53 | 54 | function myEval(cmd, context, filename, cb) { 55 | var split = cmd.trim().split(/\s+/g); 56 | if( commands.hasOwnProperty( split[0] ) ){ 57 | return commands[ split[0] ].call( null, split.splice(1).join(' '), cb ); 58 | } 59 | commands.search( split.join(' '), cb ); 60 | } 61 | 62 | // open the repl session 63 | var prompt = repl.start({ prompt: 'placeholder > ', eval: myEval }); 64 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pelias-placeholder", 3 | "version": "0.0.0-development", 4 | "engines": { 5 | "node": ">=10.0.0" 6 | }, 7 | "author": "mapzen", 8 | "license": "MIT", 9 | "main": "server.js", 10 | "scripts": { 11 | "test": "npm run units", 12 | "units": "./cmd/units", 13 | "integration": "./cmd/integration", 14 | "funcs": "for case in test/cases/*.txt; do node test/case.js $case; done", 15 | "all": "npm run units && npm run integration && npm run funcs", 16 | "start": "./cmd/server.sh", 17 | "extract": "bash ./cmd/extract.sh", 18 | "build": "bash ./cmd/build.sh", 19 | "gentests": "cat data/wof.extract | node cmd/generate_tests.js > test/cases/generated.txt", 20 | "repl": "node cmd/repl.js", 21 | "cli": "node cmd/cli.js", 22 | "lint": "jshint .", 23 | "validate": "npm ls", 24 | 
"ci": "./cmd/ci.sh" 25 | }, 26 | "repository": { 27 | "type": "git", 28 | "url": "https://github.com/pelias/placeholder.git" 29 | }, 30 | "bugs": { 31 | "url": "https://github.com/pelias/placeholder/issues" 32 | }, 33 | "homepage": "https://github.com/pelias/placeholder#readme", 34 | "dependencies": { 35 | "async": "^3.0.1", 36 | "better-sqlite3": "^12.2.0", 37 | "express": "^4.15.2", 38 | "lodash": "^4.17.21", 39 | "lower-case": "^2.0.0", 40 | "morgan": "^1.9.0", 41 | "pelias-blacklist-stream": "^1.1.0", 42 | "pelias-config": "^4.5.0", 43 | "pelias-logger": "^1.2.1", 44 | "pelias-whosonfirst": "^8.1.0", 45 | "regenerate": "^1.4.2", 46 | "remove-accents-diacritics": "^1.0.2", 47 | "require-dir": "^1.0.0", 48 | "sorted-intersect": "^0.1.4", 49 | "split2": "^3.0.0", 50 | "through2": "^3.0.0" 51 | }, 52 | "devDependencies": { 53 | "jshint": "^2.5.6", 54 | "precommit-hook": "^3.0.0", 55 | "tap-spec": "^5.0.0", 56 | "tape": "^5.0.0" 57 | }, 58 | "pre-commit": [ 59 | "lint", 60 | "validate", 61 | "test" 62 | ], 63 | "release": { 64 | "branch": "master", 65 | "success": [] 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /test/case.js: -------------------------------------------------------------------------------- 1 | 2 | var fs = require('fs'), 3 | path = require('path'), 4 | assert = require('assert'), 5 | split = require('split2'), 6 | through = require('through2'), 7 | Placeholder = require('../Placeholder'); 8 | 9 | /** 10 | this test reads the 'test/cases/*.txt' files (if present) and uses it's lines 11 | to generate test cases. 12 | 13 | see: README.md for more info on how to generate test cases. 14 | **/ 15 | 16 | // ensure the file is available in the filesystem 17 | var testcasePath = process.argv[2]; 18 | try { fs.statSync( testcasePath ); } 19 | catch( e ) { 20 | return console.error('%s not found, skipping test', testcasePath); 21 | } 22 | 23 | console.error( '----------- ' + testcasePath + ' -----------' ); 24 | 25 | // -------------- 26 | 27 | // load placeholder data 28 | var ph = new Placeholder(); 29 | ph.load(); 30 | 31 | // stream the test cases, run them one-by-one 32 | var stream = fs.createReadStream( testcasePath, 'utf8' ); 33 | stream.pipe( split() ) 34 | .pipe( through( function( line, _, next ){ 35 | if( !line.length ){ return; } // skip empty lines 36 | var split = line.toString('utf8').split(' '); 37 | var id = parseInt( split[0], 10 ); 38 | runner( ph, split.slice(1).join(' '), id, next ); 39 | }, function( done ){ 40 | console.log(); 41 | done(); 42 | })); 43 | 44 | // -------------- 45 | 46 | // convenience function for writing quick 'n easy test cases 47 | function runner( ph, actual, expected, next ){ 48 | ph.query( actual, ( err, res ) => { 49 | 50 | const ids = res.getIdsAsArray(); 51 | process.stderr.write('.'); 52 | 53 | try { 54 | assert.ok( -1 !== ids.indexOf( expected ), 'id found in results' ); 55 | } 56 | 57 | catch( e ){ 58 | console.log('\n'); 59 | console.log('input: ', actual); 60 | console.log('expected: ', expected); 61 | console.log('actual: ', ids.join(', ')); 62 | console.log(); 63 | } 64 | 65 | finally { 66 | next(); 67 | } 68 | 69 | }); 70 | } 71 | -------------------------------------------------------------------------------- /lib/Database.js: -------------------------------------------------------------------------------- 1 | 2 | var Sqlite3 = require('better-sqlite3'); 3 | 4 | // generic sqlite database 5 | function Database(){} 6 | 7 | Database.prototype.open = function( path, options ){ 
8 | 9 | // set up a safe environment for running tests. 10 | // note: usually in-memory databases using the same 11 | // path would share the same database reference. 12 | if( options && true === options.test ){ 13 | path = ':memory:'; 14 | } 15 | 16 | // open connection 17 | this.db = new Sqlite3( path, options ); 18 | 19 | // configure database tables 20 | this.configure(); 21 | 22 | // reset data (clear all previous data and recreate schemas) 23 | if( options && true === options.reset ){ 24 | this.reset(); 25 | this.optimize(); 26 | } 27 | }; 28 | 29 | Database.prototype.close = function(){ 30 | this.db.close(); 31 | }; 32 | 33 | Database.prototype.prepare = function( sql ){ 34 | if( !this.hasOwnProperty('stmt') ){ this.stmt = {}; } 35 | if( !this.stmt.hasOwnProperty( sql ) ){ 36 | this.stmt[ sql ] = this.db.prepare( sql ); 37 | } 38 | return this.stmt[ sql ]; 39 | }; 40 | 41 | Database.prototype.configure = function(){ 42 | this.db.pragma('foreign_keys=OFF'); // we don't enforce foreign key constraints 43 | this.db.pragma('page_size=4096'); // (default: 1024) 44 | this.db.pragma('cache_size=-2000'); // (default: -2000, i.e. ~2MB) 45 | this.db.pragma('synchronous=OFF'); 46 | this.db.pragma('journal_mode=MEMORY'); 47 | this.db.pragma('temp_store=MEMORY'); 48 | }; 49 | 50 | Database.prototype.reset = function(){ /* no-op */ }; 51 | Database.prototype.populate = function(){ /* no-op */ }; 52 | Database.prototype.checkSchema = function(){ /* no-op */ }; 53 | Database.prototype.optimize = function(){ 54 | this.db.exec('VACUUM'); 55 | }; 56 | 57 | // convenience function to validate a table schema against 58 | // an expected schema, throwing an error if they do not match. 59 | Database.assertSchema = function( db, tableName, expected ){ 60 | const actual = db.prepare('PRAGMA table_info(' + tableName + ')').all(); 61 | if( JSON.stringify(actual) !== JSON.stringify(expected) ){ 62 | throw new Error( 'schema invalid: table ' + tableName ); 63 | } 64 | }; 65 | 66 | module.exports = Database; 67 | -------------------------------------------------------------------------------- /test/functional.js: -------------------------------------------------------------------------------- 1 | 2 | var Placeholder = require('../Placeholder'); 3 | 4 | module.exports.functional = function(test, util) { 5 | 6 | // load data 7 | var ph = new Placeholder(); 8 | ph.load(); 9 | 10 | var assert = runner.bind(null, test, ph); 11 | 12 | assert('Kelburn Wellington New Zealand', [1729339019]); 13 | assert('North Sydney', [85784821, 101931469, 102048877, 404225393, 1310698409]); 14 | assert('Sydney New South Wales Australia', [101932003, 102049151, 404226357, 1376953385, 1377004395]); 15 | assert('ケープタウン 南アフリカ', [101928027]); 16 | 17 | // possible duplicates 18 | // see: https://github.com/whosonfirst-data/whosonfirst-data/issues/1841 19 | assert('경기도 광명시', [102026551, 890472589]); 20 | assert('부산광역시 부산진구', [890475779, 890476045]); 21 | 22 | assert('서울 마포구', [890473201]); 23 | assert('전라북도 전주시 완산구', [102026471]); 24 | 25 | assert('london on', [ 101735809 ]); 26 | assert('paris, tx', [ 101725293 ]); 27 | 28 | assert('123 apple bay ave neutral bay north sydney new south wales au', 29 | [ 101931387, 404225267 ] 30 | ); 31 | 32 | assert('30 w 26th st ny nyc 10117 ny usa', [ 85977539 ]); 33 | 34 | // should not include county: 102081377, or localadmin: 404482867 35 | assert('lancaster lancaster pa', [ 101718643, 404487183, 404487185, 1729458067, 1729466275 ]); 36 | 37 | // assertions from pelias acceptance-test suite 38 | assert('灣仔, 
香港', [85671779, 1243098523]); 39 | assert('new york city, usa', [85977539]); 40 | assert('sendai, japan', [102031919, 1108739995, 1125901991, 1243269829]); 41 | assert('Észak-Alföld', [404227483]); 42 | assert('Comunidad Foral De Navarra, ES', [404227391]); 43 | assert('Île-De-France, France', [404227465]); 44 | assert('Dél-Dunántúl, HU', [404227491]); 45 | assert('Sardegna, Italy', [404227535]); 46 | assert('Közép-Magyarország, Hungary', [404227489]); 47 | 48 | // All tokens should be in the same language 49 | // Parijs = Paris (nl); Francia = France (it) 50 | // see: https://github.com/pelias/placeholder/pull/195 51 | assert('Parijs Francia', [1225878855]); 52 | }; 53 | 54 | // convenience function for writing quick 'n easy test cases 55 | function runner( test, ph, actual, expected ){ 56 | test( actual, function(t) { 57 | ph.query( actual, ( err, res ) => { 58 | t.deepEqual( res.getIdsAsArray(), expected ); 59 | t.end(); 60 | }); 61 | }); 62 | } 63 | -------------------------------------------------------------------------------- /test/prototype/tokenize_integration.js: -------------------------------------------------------------------------------- 1 | 2 | var Placeholder = require('../../Placeholder'); 3 | 4 | module.exports.tokenize = function(test, util) { 5 | 6 | // load data 7 | var ph = new Placeholder(); 8 | ph.load(); 9 | 10 | var assert = runner.bind(null, test, ph); 11 | 12 | assert('Kelburn Wellington New Zealand', [['kelburn', 'wellington', 'new zealand']]); 13 | assert('Sydney New South Wales Australia', [['sydney', 'new south wales', 'australia']]); 14 | assert('ケープタウン 南アフリカ', [['ケーフタウン', '南アフリカ']]); 15 | 16 | // duplicates 17 | assert('lancaster lancaster pa', [['lancaster', 'lancaster', 'pa']]); 18 | 19 | // korean place names 20 | assert('세종특별자치시', [['세종특별자치시']]); 21 | 22 | // synonymous groupings 23 | // see: https://github.com/pelias/placeholder/issues/28 24 | // note: the 'Le Cros-d’Utelle, France' example (as at 20-09-17) no longer dedupes 25 | // to a single grouping due to the introduction of the token 'le' from 85685547 26 | assert('Le Cros-d’Utelle, France', [['le crosdutelle', 'france' ], [ 'le cros d utelle', 'france']]); 27 | assert('luxemburg luxemburg', [['luxemburg', 'luxemburg']]); // does not remove duplicate tokens 28 | 29 | // ambiguous parses 30 | // @note: these are the glorious future: 31 | 32 | // assert('Adams North Brunswick', [ 33 | // [ 'adams north', 'brunswick' ], 34 | // [ 'adams', 'north brunswick' ] 35 | // ]); 36 | // 37 | // assert('Heritage East San Jose', [ 38 | // [ 'heritage east', 'san jose' ], 39 | // [ 'heritage', 'east san jose' ] 40 | // ]); 41 | // 42 | // assert('bay ave neutral bay north sydney', [ 43 | // [ 'bay', 'neutral bay', 'north sydney' ], 44 | // [ 'bay', 'neutral bay', 'north', 'sydney' ] 45 | // ]); 46 | // 47 | // assert('mitte mitte berlin de', [ 48 | // [ 'mitte berlin', 'de' ], 49 | // [ 'mitte', 'mitte berlin', 'de' ], 50 | // [ 'mitte', 'mitte', 'berlin', 'de' ] 51 | // ]); 52 | // 53 | // assert('North Sydney', [ 54 | // [ 'north sydney' ], 55 | // [ 'north', 'sydney' ] 56 | // ]); 57 | // 58 | // assert('neutral bay north sydney', [ 59 | // [ 'neutral bay', 'north sydney' ], 60 | // [ 'neutral bay', 'north', 'sydney' ] 61 | // ]); 62 | }; 63 | 64 | // convenience function for writing quick 'n easy test cases 65 | function runner( test, ph, actual, expected ){ 66 | test( actual, function(t) { 67 | ph.tokenize( actual, ( err, queries ) => { 68 | t.deepEqual( queries, expected ); 69 | t.end(); 70 | }); 71 | 
}); 72 | } 73 | -------------------------------------------------------------------------------- /cmd/generate_tests.js: -------------------------------------------------------------------------------- 1 | 2 | var split = require('split2'), 3 | through = require('through2'), 4 | parser = require('../lib/jsonParseStream'), 5 | Placeholder = require('../Placeholder'), 6 | ph = new Placeholder(); 7 | 8 | ph.load(); // load data from disk 9 | 10 | var order = [ 11 | 'venue', 12 | 'address', 13 | 'building', 14 | 'campus', 15 | 'microhood', 16 | 'neighbourhood', 17 | 'macrohood', 18 | 'burough', 19 | 'postalcode', 20 | 'locality', 21 | 'metro area', 22 | 'localadmin', 23 | 'county', 24 | 'macrocounty', 25 | 'region', 26 | 'macroregion', 27 | 'country', 28 | 'empire', 29 | 'continent', 30 | 'ocean', 31 | 'planet' 32 | ]; 33 | 34 | // run test generation pipeline 35 | process.stdin.pipe( split() ) 36 | .pipe( parser() ) 37 | .pipe( through.obj( function insert( wof, _, next ){ 38 | 39 | var id = wof['wof:id']; 40 | if( 'string' === typeof id ){ id = parseInt( id, 10 ); } 41 | 42 | // sanity check; because WOF 43 | if( !ph.isValidWofRecord( id, wof ) ) { return next(); } 44 | 45 | // console.error( wof ); 46 | 47 | for( var h in wof['wof:hierarchy'] ){ 48 | 49 | // collect all parent ids for this hierarchy 50 | var parentIds = []; 51 | for( var o=0; o 0 ){ 55 | if( 'string' === typeof pid ){ pid = parseInt( pid, 10 ); } 56 | parentIds.push( pid ); 57 | } 58 | } 59 | 60 | print( ph, [ id, wof['wof:name'] ], parentIds ); 61 | } 62 | 63 | next(); 64 | })); 65 | 66 | function print( ph, line, parentIds ){ 67 | ph.store.getMany( parentIds, function( err, parents ){ 68 | 69 | if( err || !Array.isArray( parents ) || !parents.length ){ 70 | console.error( 'an error occurred', err, parents ); 71 | return; 72 | } 73 | 74 | var parentMap = {}; 75 | parents.forEach( function( parent ){ 76 | parentMap[ parent.id ] = parent; 77 | }); 78 | 79 | parentIds.forEach( function( pid ){ 80 | if( !parentMap.hasOwnProperty( pid ) ){ 81 | console.error( 'parent record of %s not found: %s', line[0], pid ); 82 | return; 83 | } 84 | line.push( parentMap[pid].name ); 85 | }); 86 | 87 | console.log( line.join(' ') ); 88 | }); 89 | } 90 | -------------------------------------------------------------------------------- /cmd/wof_extract_sqlite.js: -------------------------------------------------------------------------------- 1 | const path = require('path'); 2 | const fs = require('fs'); 3 | const whosonfirst = require('pelias-whosonfirst'); 4 | const config = require('pelias-config').generate().imports.whosonfirst; 5 | const SQLiteStream = whosonfirst.SQLiteStream; 6 | const through = require('through2'); 7 | const Placeholder = require('../Placeholder'); 8 | const combinedStream = require('combined-stream'); 9 | 10 | const SQLITE_REGEX = /whosonfirst-data-[a-z0-9-]+\.db$/; 11 | 12 | // Use WOF_DIR env variable when available, otherwise use the location specified in pelias.json 13 | const WOF_DIR = process.env.WOF_DIR || path.join(config.datapath, 'sqlite'); 14 | 15 | const layers = fs.readFileSync(path.join(__dirname, 'placetype.filter'), 'utf-8') 16 | .replace(/^.*\(/, '') // Removes all characters before the first parenthesis 17 | .match(/[a-z]+/g); // Get the layer list 18 | 19 | const jq_filter = new RegExp( 20 | fs.readFileSync(path.join(__dirname, 'jq.filter'), 'utf-8') 21 | .replace(/\n\s*/g, '') // Normalize multi-line 22 | .match(/test\(\s*"([^"]+(?:"\s*\+\s*"[^"]+)*)"\s*\)/)[1] // Extract pattern 23 | 
.replace(/"\s*\+\s*"/g, '') // Remove string concatenation 24 | ); 25 | 26 | const output = () => { 27 | if (process.argv.length > 2 && process.argv[2] === 'build') { 28 | const ph = new Placeholder(); 29 | ph.load({ reset: true }); 30 | return through.obj((row, _, next) => { 31 | ph.insertWofRecord(row, next); 32 | }, done => { 33 | console.error('populate fts...'); 34 | ph.populate(); 35 | console.error('optimize...'); 36 | ph.optimize(); 37 | console.error('close...'); 38 | ph.close(); 39 | done(); 40 | }); 41 | } else { 42 | return through.obj((row, _, next) => { 43 | console.log(JSON.stringify(row)); 44 | next(); 45 | }); 46 | } 47 | }; 48 | 49 | const sqliteStream = combinedStream.create(); 50 | fs.readdirSync(WOF_DIR) 51 | .filter(file => SQLITE_REGEX.test(file)) 52 | .map(file => path.join(WOF_DIR, file)) 53 | .forEach(dbPath => { 54 | sqliteStream.append(next => { 55 | next(new SQLiteStream( 56 | dbPath, 57 | config.importPlace ? 58 | SQLiteStream.findGeoJSONByPlacetypeAndWOFId(layers, config.importPlace) : 59 | SQLiteStream.findGeoJSONByPlacetype(layers) 60 | )); 61 | }); 62 | }); 63 | 64 | sqliteStream 65 | .pipe(whosonfirst.toJSONStream()) 66 | .pipe(through.obj((row, _, next) => { 67 | Object.keys(row.properties) 68 | .filter(key => !jq_filter.test(key)) 69 | .forEach(key => delete row.properties[key]); 70 | next(null, row.properties); 71 | })) 72 | .pipe(output()); 73 | -------------------------------------------------------------------------------- /test/server/routes/findbyid.js: -------------------------------------------------------------------------------- 1 | const findbyid = require('../../../server/routes/findbyid'); 2 | const _ = require('lodash'); 3 | const identity = () => {}; 4 | 5 | const makeRequest = opts => { 6 | const req = {}; 7 | _.set(req, 'app.locals.ph.store.getMany', opts.getMany); 8 | _.set(req, 'query.ids', opts.ids); 9 | _.set(req, 'query.lang', opts.lang); 10 | return req; 11 | }; 12 | 13 | const makeResponse = opts => { 14 | return { 15 | status: status => { 16 | if (opts.status) { opts.status(status); } 17 | return { send: opts.send || identity, json: opts.json || identity }; 18 | } 19 | }; 20 | }; 21 | 22 | module.exports.all = (test, common) => { 23 | test('parse ids - correct numbers list with spaces', t => { 24 | const req = makeRequest({ 25 | ids: '85682555, 85633111,102064231 , 85682523 , 102063845 ,', 26 | getMany: function(ids) { 27 | t.deepEqual(ids, [85682555, 85633111, 102064231, 85682523, 102063845]); 28 | t.end(); 29 | } 30 | }); 31 | findbyid(req, null); 32 | }); 33 | 34 | test('parse ids - incorrect numbers', t => { 35 | const req = makeRequest({ 36 | ids: 'not a number, 85633111a,1d02064231', 37 | getMany: function(ids) { 38 | t.deepEqual(ids, [85633111, 1]); 39 | t.end(); 40 | } 41 | }); 42 | findbyid(req, null); 43 | }); 44 | 45 | test('status code - 500', t => { 46 | const req = makeRequest({ 47 | getMany: (ids, cb) => { cb('Error'); } 48 | }); 49 | const res = makeResponse({ 50 | status: status => { 51 | t.deepEqual(status, 500); 52 | t.end(); 53 | } 54 | }); 55 | findbyid(req, res); 56 | }); 57 | 58 | test('status code - 404', t => { 59 | const req = makeRequest({ 60 | getMany: (ids, cb) => { cb(null, []); } 61 | }); 62 | const res = makeResponse({ 63 | status: status => { 64 | t.deepEqual(status, 404); 65 | t.end(); 66 | } 67 | }); 68 | findbyid(req, res); 69 | }); 70 | 71 | test('find by ids - without lang', t => { 72 | const req = makeRequest({ 73 | getMany: (ids, cb) => { 74 | cb(null, [{ 75 | id: 101751119, 76 | names: { 
fra: ['Paris'], eng: ['Paris'], ita: ['Parigi'] } 77 | }]); 78 | } 79 | }); 80 | const res = makeResponse({ 81 | status: status => { t.deepEqual(status, 200); }, 82 | json: docs => { 83 | t.deepEqual(docs, { 84 | 101751119: { 85 | id: 101751119, 86 | names: { fra: ['Paris'], eng: ['Paris'], ita: ['Parigi'] } 87 | } 88 | }); 89 | t.end(); 90 | } 91 | }); 92 | findbyid(req, res); 93 | }); 94 | 95 | test('find by ids - with lang', t => { 96 | const req = makeRequest({ 97 | lang: 'fra', 98 | getMany: (ids, cb) => { 99 | cb(null, [{ 100 | id: 101751119, 101 | names: { fra: ['Paris'], eng: ['Paris'], ita: ['Parigi'] } 102 | }]); 103 | } 104 | }); 105 | const res = makeResponse({ 106 | status: status => { t.deepEqual(status, 200); }, 107 | json: docs => { 108 | t.deepEqual(docs, { 109 | 101751119: { 110 | id: 101751119, 111 | names: { fra: ['Paris'] } 112 | } 113 | }); 114 | t.end(); 115 | } 116 | }); 117 | findbyid(req, res); 118 | }); 119 | }; -------------------------------------------------------------------------------- /test/prototype/io.js: -------------------------------------------------------------------------------- 1 | 2 | const path = require('path'); 3 | const io = require('../../prototype/io'); 4 | 5 | // Mock out placeholder 6 | const MockPlaceholder = function(){ 7 | this.store = {}; 8 | this.index = {}; 9 | }; 10 | MockPlaceholder.prototype = io; 11 | 12 | module.exports.exports = function(test, common) { 13 | test('exports', function(t) { 14 | t.equal( typeof io.load, 'function' ); 15 | t.equal( typeof io.populate, 'function' ); 16 | t.equal( typeof io.optimize, 'function' ); 17 | t.equal( typeof io.checkSchema, 'function' ); 18 | t.equal( typeof io.close, 'function' ); 19 | t.end(); 20 | }); 21 | }; 22 | 23 | module.exports.load = function(test, common) { 24 | test('load', function(t) { 25 | 26 | const ph = new MockPlaceholder(); 27 | const options = { foo: 'bar' }; 28 | 29 | t.plan(4); 30 | 31 | const expectedFilename = path.join(__dirname, '../../data/store.sqlite3'); 32 | 33 | // open store db 34 | ph.store.open = function( dbPath, opts ){ 35 | t.equals(dbPath, expectedFilename); 36 | t.deepEqual(opts, options); 37 | }; 38 | 39 | // open index db 40 | ph.index.open = function( dbPath, opts ){ 41 | t.equals(dbPath, expectedFilename); 42 | t.deepEqual(opts, options); 43 | }; 44 | 45 | ph.load(options); 46 | }); 47 | test('load - using env var', function(t) { 48 | 49 | const ph = new MockPlaceholder(); 50 | const options = { foo: 'bar' }; 51 | 52 | t.plan(4); 53 | 54 | process.env.PLACEHOLDER_DATA = '/my_data_dir/'; 55 | const expectedFilename = path.join(process.env.PLACEHOLDER_DATA, 'store.sqlite3'); 56 | 57 | // open store db 58 | ph.store.open = function( dbPath, opts ){ 59 | t.equals(dbPath, expectedFilename); 60 | t.deepEqual(opts, options); 61 | }; 62 | 63 | // open index db 64 | ph.index.open = function( dbPath, opts ){ 65 | t.equals(dbPath, expectedFilename); 66 | t.deepEqual(opts, options); 67 | }; 68 | 69 | ph.load(options); 70 | 71 | delete process.env.PLACEHOLDER_DATA; 72 | }); 73 | }; 74 | 75 | module.exports.populate = function(test, common) { 76 | test('populate', function(t) { 77 | 78 | const ph = new MockPlaceholder(); 79 | 80 | t.plan(2); 81 | 82 | // run 'populate' on both dbs 83 | ph.store.populate = t.false; 84 | ph.index.populate = t.false; 85 | 86 | ph.populate(); 87 | }); 88 | }; 89 | 90 | module.exports.optimize = function(test, common) { 91 | test('optimize', function(t) { 92 | 93 | const ph = new MockPlaceholder(); 94 | 95 | t.plan(1); 96 | 97 | // 
only run 'optimize' on one db 98 | ph.store.optimize = t.true; 99 | ph.index.optimize = t.false; 100 | 101 | ph.optimize(); 102 | }); 103 | }; 104 | 105 | module.exports.checkSchema = function(test, common) { 106 | test('checkSchema', function(t) { 107 | 108 | const ph = new MockPlaceholder(); 109 | 110 | t.plan(2); 111 | 112 | // run 'checkSchema' on both dbs 113 | ph.store.checkSchema = t.false; 114 | ph.index.checkSchema = t.false; 115 | 116 | ph.checkSchema(); 117 | }); 118 | }; 119 | 120 | module.exports.close = function(test, common) { 121 | test('close', function(t) { 122 | 123 | const ph = new MockPlaceholder(); 124 | 125 | t.plan(2); 126 | 127 | // run 'close' on both dbs 128 | ph.store.close = t.false; 129 | ph.index.close = t.false; 130 | 131 | ph.close(); 132 | }); 133 | }; 134 | -------------------------------------------------------------------------------- /test/lib/Database.js: -------------------------------------------------------------------------------- 1 | const _ = require('lodash'); 2 | const Database = require('../../lib/Database'); 3 | 4 | module.exports.constructor = function(test, common) { 5 | test('constructor', function(t) { 6 | var db = new Database(); 7 | t.equal( typeof db.open, 'function' ); 8 | t.equal( typeof db.close, 'function' ); 9 | t.equal( typeof db.prepare, 'function' ); 10 | t.equal( typeof db.configure, 'function' ); 11 | t.equal( typeof db.reset, 'function' ); 12 | t.equal( typeof db.populate, 'function' ); 13 | t.equal( typeof db.optimize, 'function' ); 14 | t.equal( typeof Database.assertSchema, 'function' ); 15 | t.end(); 16 | }); 17 | }; 18 | 19 | module.exports.open = function(test, common) { 20 | test('open', function(t) { 21 | var db = new Database(); 22 | t.false( db.db ); 23 | 24 | // ensure 'reset' is not run 25 | db.reset = t.end; 26 | 27 | // ensure 'optimize' is not run 28 | db.optimize = t.end; 29 | 30 | // open connection 31 | db.open('/tmp/db', { test: true }); 32 | t.equal( db.db.constructor.name, 'Database' ); 33 | t.deepLooseEqual( db.db, { 34 | inTransaction: false, 35 | open: true, 36 | memory: true, 37 | readonly: false, 38 | name: db.db.name 39 | }); 40 | 41 | t.end(); 42 | }); 43 | 44 | test('open - runs configure', function(t) { 45 | var db = new Database(); 46 | 47 | // ensure 'configure' is run 48 | db.configure = t.end; 49 | 50 | // open connection 51 | db.open('/tmp/db', { test: true }); 52 | }); 53 | 54 | test('open - runs reset', function(t) { 55 | var db = new Database(); 56 | 57 | // ensure 'reset' is run 58 | db.reset = t.end; 59 | 60 | // open connection 61 | db.open('/tmp/db', { test: true, reset: true }); 62 | }); 63 | 64 | test('open - runs optimize', function(t) { 65 | var db = new Database(); 66 | 67 | // ensure 'optimize' is run 68 | db.optimize = t.end; 69 | 70 | // open connection 71 | db.open('/tmp/db', { test: true, reset: true }); 72 | }); 73 | }; 74 | 75 | module.exports.close = function(test, common) { 76 | test('close', function(t) { 77 | var db = new Database(); 78 | db.open('/tmp/db', { test: true }); 79 | t.true( db.db.open ); 80 | db.close(); 81 | t.false( db.db.open ); 82 | t.end(); 83 | }); 84 | }; 85 | 86 | module.exports.prepare = function(test, common) { 87 | test('prepare', function(t) { 88 | var db = new Database(); 89 | db.open('/tmp/db', { test: true }); 90 | 91 | t.equal(typeof db.stmt, 'undefined'); 92 | 93 | const sql = 'SELECT * FROM sqlite_master'; 94 | db.prepare(sql); 95 | 96 | t.equal(typeof db.stmt, 'object'); 97 | t.true(db.stmt.hasOwnProperty(sql)); 98 | 
t.true(db.stmt[sql].reader); 99 | t.equal(db.stmt[sql].source, sql, 'sql query should be as expected'); 100 | 101 | t.end(); 102 | }); 103 | }; 104 | 105 | module.exports.configure = function(test, common) { 106 | test('configure', function(t) { 107 | var db = new Database(); 108 | db.open('/tmp/db', { test: true }); 109 | 110 | // configure 111 | const pragma_checks = { 112 | foreign_keys: 0, 113 | page_size: 4096, 114 | cache_size: -2000, 115 | synchronous: 0, 116 | // journal_mode: 'memory', 117 | temp_store: 2 118 | }; 119 | 120 | t.plan(_.size(pragma_checks)); 121 | _.forEach(pragma_checks, (value, key) => { 122 | const stmt = db.db.prepare(`PRAGMA ${key};`); 123 | t.deepEqual(stmt.get(), { [key]: value }); 124 | }); 125 | }); 126 | }; 127 | -------------------------------------------------------------------------------- /lib/TokenIndex.js: -------------------------------------------------------------------------------- 1 | 2 | var util = require('util'); 3 | var Database = require('./Database'); 4 | var Queries = require('./Queries'); 5 | 6 | // document store database 7 | function TokenIndex(){} 8 | util.inherits( TokenIndex, Database ); 9 | 10 | // @todo: more elegant polymorphism 11 | for( var method in Queries ){ 12 | TokenIndex.prototype[method] = Queries[method]; 13 | } 14 | 15 | TokenIndex.prototype.reset = function(){ 16 | this.db.exec('DROP TABLE IF EXISTS lineage;'); 17 | this.db.exec('CREATE TABLE lineage( id INTEGER, pid INTEGER );'); 18 | this.db.exec('CREATE INDEX IF NOT EXISTS lineage_cover_idx ON lineage(id, pid);'); 19 | 20 | this.db.exec('DROP TABLE IF EXISTS tokens;'); 21 | this.db.exec('CREATE TABLE tokens( id INTEGER, lang STRING, tag STRING, token STRING );'); 22 | this.db.exec('CREATE INDEX IF NOT EXISTS tokens_cover_idx ON tokens(id, lang, tag);'); 23 | this.db.exec('CREATE INDEX IF NOT EXISTS tokens_token_idx ON tokens(token);'); 24 | 25 | // FTS table options 26 | // see: https://sqlite.org/fts5.html 27 | var options = [ 28 | `tokenize="unicode61 remove_diacritics 0 tokenchars '_'"`, 29 | `prefix='1 2 3 4 5 6 7 8 9 10 11 12'`, 30 | 'columnsize=0' 31 | ].join(', '); 32 | this.db.exec('DROP TABLE IF EXISTS fulltext;'); 33 | this.db.exec('CREATE VIRTUAL TABLE fulltext USING fts5( token, ' + options + ');'); 34 | }; 35 | 36 | // ensure that the database schema matches what is expected by the codebase 37 | TokenIndex.prototype.checkSchema = function(){ 38 | Database.assertSchema(this.db, 'lineage', [ 39 | { cid: 0, name: 'id', type: 'INTEGER', notnull: 0, dflt_value: null, pk: 0 }, 40 | { cid: 1, name: 'pid', type: 'INTEGER', notnull: 0, dflt_value: null, pk: 0 } 41 | ]); 42 | Database.assertSchema(this.db, 'tokens', [ 43 | { cid: 0, name: 'id', type: 'INTEGER', notnull: 0, dflt_value: null, pk: 0 }, 44 | { cid: 1, name: 'lang', type: 'STRING', notnull: 0, dflt_value: null, pk: 0 }, 45 | { cid: 2, name: 'tag', type: 'STRING', notnull: 0, dflt_value: null, pk: 0 }, 46 | { cid: 3, name: 'token', type: 'STRING', notnull: 0, dflt_value: null, pk: 0 } 47 | ]); 48 | Database.assertSchema(this.db, 'fulltext', [ 49 | { cid: 0, name: 'token', type: '', notnull: 0, dflt_value: null, pk: 0 } 50 | ]); 51 | }; 52 | 53 | TokenIndex.prototype.populate = function(){ 54 | this.db.exec(`INSERT INTO fulltext(rowid, token) SELECT rowid, REPLACE(token,' ','_') FROM tokens;`); 55 | this.db.exec(`INSERT INTO fulltext(fulltext) VALUES('optimize');`); 56 | }; 57 | 58 | TokenIndex.prototype.setLineage = function( id, pids, cb ){ 59 | if( !Array.isArray( pids ) || !pids.length ){ return 
cb(); } 60 | 61 | // create prepared statement 62 | var stmt = this.prepare('INSERT INTO lineage ( id, pid ) VALUES ( $id, $pid )'); 63 | 64 | try { 65 | pids.forEach( pid => stmt.run({ id: id, pid: pid }) ); 66 | return cb( null ); 67 | } catch ( err ){ 68 | console.error( err ); 69 | console.error( stmt.source ); 70 | return cb( err ); 71 | } 72 | }; 73 | 74 | TokenIndex.prototype.setTokens = function( id, tokens, cb ){ 75 | if( !Array.isArray( tokens ) || !tokens.length ){ return cb(); } 76 | 77 | // create prepared statement 78 | var stmt = this.prepare( 79 | 'INSERT INTO tokens ( id, lang, tag, token ) VALUES ( $id, $lang, $tag, $token )' 80 | ); 81 | 82 | try { 83 | tokens.forEach( token => stmt.run({ 84 | id: id, 85 | lang: token.lang, 86 | tag: token.tag, 87 | token: token.body 88 | })); 89 | return cb( null ); 90 | } catch ( err ){ 91 | console.error( err ); 92 | console.error( stmt.source ); 93 | return cb( err ); 94 | } 95 | }; 96 | 97 | module.exports = TokenIndex; 98 | -------------------------------------------------------------------------------- /lib/Result.js: -------------------------------------------------------------------------------- 1 | 2 | const util = require('util'); 3 | const DEBUG = false; 4 | 5 | // convenience function for debugging 6 | function _debugRows( rows ){ 7 | rows = rows || []; 8 | console.log('found (' + rows.length + '):'); 9 | console.log( rows.map( row => { 10 | return ' - ' + util.format( 11 | '"%s" (%d) >>> "%s" (%d)', 12 | row.subject, 13 | row.subjectId, 14 | row.object, 15 | row.objectId 16 | ); 17 | }).join('\n')); 18 | } 19 | 20 | // 'reset' indicates that we failed to find any matches for the 21 | // object with any of the subjects; 22 | // in this case we will use the previous object value 23 | // as a 'seed' for the id pool 24 | 25 | function Result( group, done ){ 26 | this.group = Array.isArray( group ) ? group : []; 27 | this.ids = {}; 28 | this.mask = new Array( this.group.length ).fill( false ); 29 | this.pos = { 30 | subject: this.group.length -2, 31 | object: this.group.length -1 32 | }; 33 | this.reset = false; 34 | this.done = ('function' === typeof done) ? done : function(){}; 35 | } 36 | 37 | Result.prototype.getSubject = function(){ 38 | return this.group[ this.pos.subject ]; 39 | }; 40 | 41 | Result.prototype.getObject = function(){ 42 | return this.group[ this.pos.object ]; 43 | }; 44 | 45 | Result.prototype.getPreviousObject = function(){ 46 | return this.group[ this.pos.prev_object ]; 47 | }; 48 | 49 | Result.prototype.getIdsAsArray = function(){ 50 | return Object.keys( this.ids ).map( k => parseInt( k, 10 ) ); 51 | }; 52 | 53 | // return all the 'subjectId' values from rows returned from the db 54 | // optionally: use a function to filter which rows are included. 55 | Result.subjectIdsFromRows = function( rows, filter ){ 56 | return rows.reduce(( memo, row ) => { 57 | if( 'function' === typeof filter ){ 58 | if( !filter( row ) ){ return memo; } 59 | } 60 | if( row.hasOwnProperty('subjectId') ){ 61 | memo[ row.subjectId ] = true; 62 | } 63 | return memo; 64 | }, {}); 65 | }; 66 | 67 | // convenience function to set mask values 68 | Result.prototype.setMask = function( entity, bool ){ 69 | if( this.pos.hasOwnProperty(entity) && -1 < this.pos[entity] ){ 70 | this.mask[ this.pos[entity] ] = !!bool; 71 | } 72 | }; 73 | 74 | // intersect the current resultset with new matching rows from 75 | // the database.
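// on the first match the rows' subjectIds seed the pool of candidate ids;
// thereafter each round keeps only the subjectIds of rows whose objectId is
// already in the pool, or, when nothing matches, moves the subject cursor
// one token to the left and tries again.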
76 | Result.prototype.intersect = function( err, rows ){ 77 | 78 | // debugging 79 | if( DEBUG ){ _debugRows( rows ); } 80 | 81 | // no results were found 82 | if( err || !rows || !rows.length ){ 83 | 84 | // decrement iterator 85 | this.pos.subject--; 86 | return; 87 | } 88 | 89 | // first time we have found matching rows for the query 90 | if( !Object.keys( this.ids ).length ){ 91 | this.ids = Result.subjectIdsFromRows( rows ); 92 | this.setMask('object', true); 93 | this.setMask('subject', true); 94 | this.pos.object = this.pos.subject; 95 | this.pos.subject = this.pos.object-1; 96 | return; 97 | } 98 | 99 | // compute the intersection of the new rows and the past 100 | // matched ids. 101 | 102 | // find the results which are children of existing ids 103 | const children = Result.subjectIdsFromRows( 104 | rows, 105 | row => this.ids.hasOwnProperty( row.objectId ) 106 | ); 107 | 108 | // we found at least one valid child 109 | if( !!Object.keys( children ).length ){ 110 | this.ids = children; 111 | this.setMask('subject', true); 112 | this.pos.object = this.pos.subject; 113 | this.pos.subject = this.pos.object-1; 114 | return; 115 | } 116 | 117 | // we failed to find any valid children of existing ids 118 | if( DEBUG ){ console.error( 'failed!' ); } 119 | 120 | // decrement iterator 121 | this.pos.subject--; 122 | }; 123 | 124 | module.exports = Result; 125 | -------------------------------------------------------------------------------- /test/cases/capitalCities.txt: -------------------------------------------------------------------------------- 1 | 101877135 Andorra la Vella, Andorra 2 | 421168799 Kabul, Afghanistan 3 | 890445621 St. John's, Antigua and Barbuda 4 | 890441875 The Valley, Anguilla 5 | 421182367 Yerevan, Armenia 6 | 890432155 Luanda, Angola 7 | 101734459 Pago Pago, American Samoa 8 | 890432017 Oranjestad, Aruba 9 | 85667871 Mariehamn, Aland Islands 10 | 890518775 Sarajevo, Bosnia and Herzegovina 11 | 890452811 Bridgetown, Barbados 12 | 421190647 Manama, Bahrain 13 | 421204487 Bujumbura, Burundi 14 | 421168997 Porto-Novo, Benin 15 | 890442097 Hamilton, Bermuda 16 | 421188863 Bandar Seri Begawan, Brunei 17 | 101964877 Brasilia, Brazil 18 | 85669631 Gaborone, Botswana 19 | 890442105 Belmopan, Belize 20 | 101735873 Ottawa, Canada 21 | 101938929 West Island, Cocos Islands 22 | 421181445 Bangui, Central African Republic 23 | 85670067 Brazzaville, Republic of the Congo 24 | 101748453 Bern, Switzerland 25 | 421168957 Yamoussoukro, Ivory Coast 26 | 102016915 Santiago, Chile 27 | 85670331 Praia, Cape Verde 28 | 421187435 Willemstad, Curacao 29 | 101909779 Berlin, Germany 30 | 101749159 Copenhagen, Denmark 31 | 890442101 Roseau, Dominica 32 | 101748153 Tallinn, Estonia 33 | 421199769 Asmara, Eritrea 34 | 101748283 Madrid, Spain 35 | 101748417 Helsinki, Finland 36 | 101750367 London, United Kingdom 37 | 890451719 St. 
George's, Grenada 38 | 890442055 Cayenne, French Guiana 39 | 1125821075 St Peter Port, Guernsey 40 | 421168965 Accra, Ghana 41 | 101753853 Gibraltar, Gibraltar 42 | 101870623 Nuuk, Greenland 43 | 421167921 Banjul, Gambia 44 | 421189675 Conakry, Guinea 45 | 890420199 Basse-Terre, Guadeloupe 46 | 421178347 Malabo, Equatorial Guinea 47 | 421197943 Bissau, Guinea-Bissau 48 | 890437279 Hong Kong, Hong Kong 49 | 101751659 Zagreb, Croatia 50 | 101751703 Budapest, Hungary 51 | 101751737 Dublin, Ireland 52 | 1125918569 Diego Garcia, British Indian Ocean Territory 53 | 101751753 Reykjavik, Iceland 54 | 1125783915 Saint Helier, Jersey 55 | 421186515 Kingston, Jamaica 56 | 85672817 Tokyo, Japan 57 | 890440079 Basseterre, Saint Kitts and Nevis 58 | 102026327 Seoul, South Korea 59 | 890434949 George Town, Cayman Islands 60 | 421191125 Astana, Kazakhstan 61 | 85673679 Castries, Saint Lucia 62 | 101828603 Vaduz, Liechtenstein 63 | 421195189 Maseru, Lesotho 64 | 101753031 Vilnius, Lithuania 65 | 101751765 Luxembourg, Luxembourg 66 | 890444507 Rabat, Morocco 67 | 101831917 Monaco, Monaco 68 | 421181453 Antananarivo, Madagascar 69 | 890451463 Majuro, Marshall Islands 70 | 890491957 Skopje, Macedonia 71 | 85681291 Macao, Macao 72 | 1108960813 Plymouth, Montserrat 73 | 101752423 Valletta, Malta 74 | 85674093 Male, Maldives 75 | 421168781 Lilongwe, Malawi 76 | 102023407 Kuala Lumpur, Malaysia 77 | 1141909361 Windhoek, Namibia 78 | 890413117 Noumea, New Caledonia 79 | 890440179 Kingston, Norfolk Island 80 | 101751893 Amsterdam, Netherlands 81 | 1495123997 Oslo, Norway 82 | 85675677 Yaren, Nauru 83 | 1141909453 Alofi, Niue 84 | 890445081 Panama City, Panama 85 | 890435983 Saint-Pierre, Saint Pierre and Miquelon 86 | 85676471 Melekeok, Palau 87 | 421190363 Doha, Qatar 88 | 102003033 Moscow, Russia 89 | 890444217 Honiara, Solomon Islands 90 | 421202159 Victoria, Seychelles 91 | 101752307 Stockholm, Sweden 92 | 102032341 Singapore, Singapore 93 | 101752073 Ljubljana, Slovenia 94 | 1108800123 Bratislava, Slovakia 95 | 890452049 Freetown, Sierra Leone 96 | 85677205 San Marino, San Marino 97 | 890449737 Mogadishu, Somalia 98 | 85677301 Sao Tome, Sao Tome and Principe 99 | 890434823 Philipsburg, Sint Maarten 100 | 102025263 Bangkok, Thailand 101 | 421196557 Dili, East Timor 102 | 421167889 Ashgabat, Turkmenistan 103 | 85679123 Tunis, Tunisia 104 | 85679409 Ankara, Turkey 105 | 85679705 Dodoma, Tanzania 106 | 421168855 Tashkent, Uzbekistan 107 | 890434937 Road Town, British Virgin Islands 108 | 421177479 Hanoi, Vietnam 109 | 890416453 Port Vila, Vanuatu 110 | 890452045 Mata Utu, Wallis and Futuna 111 | 890416609 Apia, Samoa 112 | 421178937 Lusaka, Zambia 113 | 421201479 Harare, Zimbabwe 114 | -------------------------------------------------------------------------------- /test/lib/DocStore.js: -------------------------------------------------------------------------------- 1 | 2 | var DocStore = require('../../lib/DocStore'); 3 | 4 | module.exports.constructor = function(test, common) { 5 | test('constructor', function(t) { 6 | var db = new DocStore(); 7 | t.equal( db.constructor.super_.name, 'Database' ); 8 | t.equal( typeof db.reset, 'function' ); 9 | t.equal( typeof db.set, 'function' ); 10 | t.equal( typeof db.get, 'function' ); 11 | t.equal( typeof db.getMany, 'function' ); 12 | t.end(); 13 | }); 14 | }; 15 | 16 | module.exports.reset = function(test, common) { 17 | test('reset', function(t) { 18 | var db = new DocStore(); 19 | db.open('/tmp/db', { test: true, reset: true }); 20 | 21 | // ensure table has been 
created 22 | const sql = 'PRAGMA table_info(docs)'; 23 | t.deepEqual( db.prepare(sql).all(), [ 24 | { cid: 0, name: 'id', type: 'INTEGER', notnull: 0, dflt_value: null, pk: 1 }, 25 | { cid: 1, name: 'json', type: 'TEXT', notnull: 0, dflt_value: null, pk: 0 } 26 | ]); 27 | 28 | t.end(); 29 | }); 30 | }; 31 | 32 | module.exports.checkSchema = function(test, common) { 33 | test('checkSchema - empty', function(t) { 34 | var db = new DocStore(); 35 | db.open('/tmp/db', { test: true }); 36 | t.throws(() => { db.checkSchema(); }, /schema invalid: table docs/); 37 | t.end(); 38 | }); 39 | test('checkSchema - valid', function(t) { 40 | var db = new DocStore(); 41 | db.open('/tmp/db', { test: true, reset: true }); 42 | t.doesNotThrow(() => { db.checkSchema(); }); 43 | t.end(); 44 | }); 45 | test('checkSchema - invalid', function(t) { 46 | var db = new DocStore(); 47 | db.open('/tmp/db', { test: true }); 48 | db.db.exec('DROP TABLE IF EXISTS docs'); 49 | db.db.exec('CREATE TABLE docs( id INTEGER PRIMARY KEY, foo TEXT )'); 50 | t.throws(() => { db.checkSchema(); }); 51 | t.end(); 52 | }); 53 | }; 54 | 55 | module.exports.set = function(test, common) { 56 | test('set', function(t) { 57 | var db = new DocStore(); 58 | db.open('/tmp/db', { test: true, reset: true }); 59 | 60 | t.plan(1); 61 | 62 | const id = 100; 63 | const data = { test: { foo: 'bar' } }; 64 | 65 | db.set( id, data, (err) => { 66 | 67 | // ensure row has been created 68 | const sql = 'SELECT * FROM docs WHERE id = ? LIMIT 1'; 69 | t.deepEqual( db.prepare(sql).all(id), [ 70 | { id: id, json: DocStore.codec.encode( data ) } 71 | ]); 72 | 73 | }); 74 | }); 75 | }; 76 | 77 | module.exports.get = function(test, common) { 78 | test('get', function(t) { 79 | var db = new DocStore(); 80 | db.open('/tmp/db', { test: true, reset: true }); 81 | 82 | t.plan(1); 83 | 84 | const id = 100; 85 | const data = { test: { foo: 'bar' } }; 86 | 87 | // insert a row in the database 88 | db.prepare('INSERT INTO docs (id, json) VALUES ($id, $json)') 89 | .run({ id: id, json: DocStore.codec.encode( data ) }); 90 | 91 | // retrieve row 92 | db.get( id, (err, res) => { 93 | t.deepEqual( res, data ); 94 | }); 95 | }); 96 | }; 97 | 98 | module.exports.getMany = function(test, common) { 99 | test('getMany', function(t) { 100 | var db = new DocStore(); 101 | db.open('/tmp/db', { test: true, reset: true }); 102 | 103 | t.plan(1); 104 | 105 | // insert a row in the database 106 | var stmt = db.prepare('INSERT INTO docs (id, json) VALUES ($id, $json)'); 107 | stmt.run({ id: 100, json: DocStore.codec.encode({ test: 100 }) }); 108 | stmt.run({ id: 200, json: DocStore.codec.encode({ test: 200 }) }); 109 | stmt.run({ id: 300, json: DocStore.codec.encode({ test: 300 }) }); 110 | 111 | // retrieve rows 112 | db.getMany( [100, 300], (err, res) => { 113 | t.deepEqual( res, [ 114 | { test: 100 }, 115 | { test: 300 } 116 | ]); 117 | }); 118 | }); 119 | test('getMany - empty ids array', function(t) { 120 | var db = new DocStore(); 121 | db.open('/tmp/db', { test: true, reset: true }); 122 | 123 | t.plan(1); 124 | 125 | // retrieve rows 126 | db.getMany( [], (err, res) => { 127 | t.deepEqual( res, [] ); 128 | }); 129 | }); 130 | }; 131 | -------------------------------------------------------------------------------- /prototype/tokenize.js: -------------------------------------------------------------------------------- 1 | 2 | // plugin for tokenize 3 | const _ = require('lodash'); 4 | const async = require('async'); 5 | const analysis = require('../lib/analysis'); 6 | const 
permutations = require('../lib/permutations'); 7 | 8 | function tokenize(input, cb){ 9 | 10 | // tokenize input 11 | const synonyms = analysis.tokenize(input); 12 | 13 | // test each synonym against the database and select the best synonyms 14 | async.map( synonyms, _eachSynonym.bind(this), (err, queries) => { 15 | return cb( null, _queryFilter( queries ) ); 16 | }); 17 | } 18 | 19 | // test if a phrase exists in the index 20 | function _indexContainsPhrase(phrase, cb){ 21 | this.index.hasSubject( phrase, function( bool ){ 22 | return cb( null, bool ); 23 | }); 24 | } 25 | 26 | // expand each synonym into its permutations and check them against the database. 27 | function _eachSynonym(synonym, cb){ 28 | 29 | // expand token permutations 30 | const phrases = _permutations(synonym); 31 | 32 | // filter out permutations which do not match phrases in the index 33 | async.filterSeries( phrases, _indexContainsPhrase.bind(this), (err, matchedPhrases) => { 34 | return cb( null, _groups(synonym, matchedPhrases) ); 35 | }); 36 | } 37 | 38 | // expand token permutations 39 | function _permutations(tokens){ 40 | return _.uniq(permutations.expand(tokens).map(perm => perm.join(' '))); 41 | } 42 | 43 | // remove unwanted queries 44 | function _queryFilter(queries){ 45 | 46 | // remove empty arrays 47 | queries = queries.filter( function( query ){ 48 | return 0 !== query.length; 49 | }); 50 | 51 | // remove synonymous groupings 52 | queries = queries.filter( function( query, i ){ 53 | for( var j=0; j<i; j++ ){ 54 | if( _.isEqual( queries[j], query ) ){ return false; } 55 | } 56 | return true; 57 | }); 58 | 59 | return queries; 60 | } 61 | 62 | // select a single grouping of phrases which covers the input tokens, 63 | // preferring the longest phrase available at each position 64 | function _groups(tokens, phrases){ 65 | 66 | // sort the matched phrases longest-first 67 | phrases.sort((a, b) => b.length - a.length); 68 | 69 | // generate a map of matched phrases where the 70 | // key is a single word token (the first word in 71 | // the phrase) and the value is an array of 72 | // phrases which contain that word. 73 | const index = Object.create(null); 74 | phrases.forEach( phrase => { 75 | const words = phrase.split(/\s+/); 76 | const firstWord = words[0]; 77 | if( !index[ firstWord ] ){ 78 | index[ firstWord ] = []; 79 | } 80 | index[ firstWord ].push( words ); 81 | }); 82 | 83 | // an array of the chosen phrases 84 | const groups = []; 85 | 86 | // iterate over the input tokens 87 | for( var t=0; t<tokens.length; t++ ){ 88 | 89 | // find all matched phrases which begin with this token 90 | const candidates = index[ tokens[t] ] || []; 91 | 92 | // greedily select the longest phrase which exactly matches the upcoming tokens 93 | for( var c=0; c<candidates.length; c++ ){ 94 | const words = candidates[c]; 95 | if( _.isEqual( words, tokens.slice( t, t + words.length ) ) ){ 96 | groups.push( words.join(' ') ); 97 | t += words.length - 1; // skip the tokens consumed by this phrase 98 | break; 99 | } 100 | } 101 | } 102 | 103 | return groups; 104 | } 105 | 106 | module.exports.tokenize = tokenize; 107 | -------------------------------------------------------------------------------- /lib/analysis.js: -------------------------------------------------------------------------------- 86 | // generate additional synonyms with the official designation removed, 87 | // eg: 'county of durham' -> 'durham', 'city of london' -> 'london' 88 | if( synonyms.length ){ 89 | synonyms = synonyms.concat( 90 | synonyms.map( synonym => { 91 | return synonym 92 | .replace(/^county\s(of\s)?(.*)$/gi, '$2') 93 | .replace(/^(.*)\scounty$/gi, '$1') 94 | .replace(/^city\sof(?!\s?the)\s?(.*)$/gi, '$1') 95 | .replace(/^(.*\s)charter\s(township)$/gi, '$1$2'); 96 | }) 97 | ); 98 | } 99 | 100 | // replace multiple spaces with a single space and trim tokens 101 | return synonyms.map( function( synonym ){ 102 | return synonym.replace(/\s{2,}/g, ' ').trim(); 103 | }) 104 | // normalization 105 | .map( function( synonym ){ 106 | return lowercase( unicode.fold( synonym ) ); 107 | }) 108 | // remove empty synonyms 109 | .filter( function( synonym ){ 110 | return synonym && synonym.length; 111 | }) 112 | // remove duplicate synonyms 113 | .filter( function( synonym, pos, self ){ 114 | return self.indexOf(synonym) === pos; 115 | }); 116 | } 117 | 118 | // try to detect languages which write their addresses in the opposite order-of-presentation to how it's 119 | // done in the west.
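// eg: Korean '경기도 광명시' (Gyeonggi-do, Gwangmyeong-si) names the province
// before the city; tokens for such inputs are reversed below so they can be
// matched in the same minor-to-major order as western-style input.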
120 | // http://www.columbia.edu/~fdc/postal/#general 121 | const REGEX_MAJOR_TO_MINOR = /[\u0591-\u07FF\u1100-\u11FF\u3130-\u318F\uA960-\uA97F\uAC00-\uD7AF\uD7B0-\uD7FF\u0400-\u04FF]/; 122 | 123 | function tokenize( input ){ 124 | return normalize(input).map( function( synonym ){ 125 | // reverse tokens for major-to-minor address schemes 126 | if( REGEX_MAJOR_TO_MINOR.test( synonym ) ){ 127 | return synonym.split(/\s+/g).reverse(); 128 | } 129 | return synonym.split(/\s+/g); 130 | }); 131 | } 132 | 133 | module.exports.normalize = normalize; 134 | module.exports.tokenize = tokenize; 135 | module.exports.PARTIAL_TOKEN_SUFFIX = PARTIAL_TOKEN_SUFFIX; 136 | module.exports.REGEX_MAJOR_TO_MINOR = REGEX_MAJOR_TO_MINOR; 137 | -------------------------------------------------------------------------------- /lib/unicode.js: -------------------------------------------------------------------------------- 1 | const _ = require('lodash'); 2 | const regenerate = require('regenerate'); 3 | const accentsDiacritics = require('remove-accents-diacritics'); 4 | 5 | // non-printable control characters 6 | // ref: https://en.wikipedia.org/wiki/List_of_Unicode_characters 7 | const CONTROL_CODES = regenerate() 8 | .addRange(0x0000, 0x001F) // C0 (0000-001F) 9 | .add(0x007F) // Delete 10 | .addRange(0x0080, 0x009F) // C1 (0080-009F) 11 | .toRegExp('g'); 12 | 13 | // non-standard spaces 14 | // ref: http://jkorpela.fi/chars/spaces.html 15 | const ALTERNATE_SPACES = regenerate() 16 | .add(0x00A0) // NO-BREAK SPACE 17 | .add(0x1680) // OGHAM SPACE MARK 18 | .add(0x180E) // MONGOLIAN VOWEL SEPARATOR 19 | .addRange(0x2000, 0x200B) // EN QUAD - ZERO WIDTH SPACE 20 | .add(0x202F) // NARROW NO-BREAK SPACE 21 | .add(0x205F) // MEDIUM MATHEMATICAL SPACE 22 | .add(0x3000) // IDEOGRAPHIC SPACE 23 | .add(0xFEFF) // ZERO WIDTH NO-BREAK SPACE 24 | .toRegExp('g'); 25 | 26 | // pattern to match consecutive spaces 27 | // const CONSECUTIVE_SPACES = /\s{2,}/g; 28 | 29 | // unicode combining marks 30 | // see: https://github.com/pelias/pelias/issues/829#issuecomment-542614645 31 | // ref: https://en.wikipedia.org/wiki/Combining_character 32 | const COMBINING_MARKS = regenerate() 33 | .add(0x200D) // ZERO WIDTH JOINER (U+200D) 34 | .addRange(0x0300, 0x036F) // Combining Diacritical Marks (0300–036F) 35 | .addRange(0x1AB0, 0x1AFF) // Combining Diacritical Marks Extended (1AB0–1AFF) 36 | .addRange(0x1DC0, 0x1DFF) // Combining Diacritical Marks Supplement (1DC0–1DFF) 37 | .addRange(0x20D0, 0x20FF) // Combining Diacritical Marks for Symbols (20D0–20FF) 38 | .addRange(0xFE00, 0xFE0F) // Variation Selectors (FE00-FE0F) 39 | .addRange(0xFE20, 0xFE2F) // Combining Half Marks (FE20–FE2F) 40 | .add(0x3099) // combining dakuten (U+3099) 41 | .add(0x309A) // combining handakuten (U+309A) 42 | .toRegExp('g'); 43 | 44 | // miscellaneous symbols with no relevance to geocoding 45 | const MISC_UNSUPPORTED_SYMBOLS = regenerate() 46 | // Superscripts and Subscripts (2070-209F) 47 | // Currency Symbols (20A0-20CF) 48 | // Letterlike Symbols (2100-214F) 49 | // Number Forms (2150-218F) 50 | // Arrows (2190-21FF) 51 | // Mathematical Operators (2200-22FF) 52 | // Miscellaneous Technical (2300-23FF) 53 | // Control Pictures (2400-243F) 54 | // Optical Character Recognition (2440-245F) 55 | // Enclosed Alphanumerics (2460-24FF) 56 | // Box Drawing (2500-257F) 57 | // Block Elements (2580-259F) 58 | // Geometric Shapes (25A0-25FF) 59 | // Miscellaneous Symbols (2600-26FF) 60 | // Dingbats (2700-27BF) 61 | // Miscellaneous Mathematical Symbols-A (27C0-27EF) 
62 | // Supplemental Arrows-A (27F0-27FF) 63 | // Braille Patterns (2800-28FF) 64 | // Supplemental Arrows-B (2900-297F) 65 | // Miscellaneous Mathematical Symbols-B (2980-29FF) 66 | // Supplemental Mathematical Operators (2A00-2AFF) 67 | // Miscellaneous Symbols and Arrows (2B00-2BFF) 68 | .addRange(0x2070, 0x2BFF) // A Range Covering Consecutive Blocks Listed Above 69 | 70 | // symbols 71 | .addRange(0x02B0, 0x02FF) // Spacing Modifier Letters (02B0-02FF) 72 | .addRange(0x1400, 0x167F) // Unified Canadian Aboriginal Syllabics (1400-167F) 73 | .addRange(0x1D100, 0x1D1FF) // Musical Symbols (1D100-1D1FF) 74 | .addRange(0x1D400, 0x1D7FF) // Mathematical Alphanumeric Symbols (1D400-1D7FF) 75 | 76 | // emojis 77 | .addRange(0x1F300, 0x1F5FF) // Miscellaneous Symbols and Pictographs (1F300-1F5FF) 78 | .addRange(0x1F3FB, 0x1F3FF) // Emoji Modifier Fitzpatrick (skin tones) (1F3FB–1F3FF) 79 | .addRange(0x1F600, 0x1F64F) // Emoticons (1F600–1F64F) 80 | .addRange(0x1F680, 0x1F6FF) // Transport and Map Symbols (1F680-1F6FF) 81 | .addRange(0x1F900, 0x1F9FF) // Supplemental Symbols and Pictographs (1F900-1F9FF) 82 | .toRegExp('g'); 83 | 84 | function normalize(str) { 85 | 86 | // sanity checking 87 | if(!_.isString(str)){ return str; } 88 | 89 | return str 90 | .normalize('NFKC') 91 | .replace(CONTROL_CODES, '') 92 | .replace(ALTERNATE_SPACES, ' ') 93 | .replace(MISC_UNSUPPORTED_SYMBOLS, '') 94 | .replace(COMBINING_MARKS, ''); 95 | } 96 | 97 | /** 98 | * Converts alphabetic, numeric, and symbolic characters that are not 99 | * in the Basic Latin Unicode block (first 127 ASCII characters) to their 100 | * ASCII equivalent, if one exists. For example, the filter changes à to a. 101 | */ 102 | function fold(str) { 103 | 104 | // sanity checking 105 | if (!_.isString(str)) { return str; } 106 | 107 | return accentsDiacritics.remove(str) 108 | .normalize('NFD') 109 | .replace(COMBINING_MARKS, '') 110 | .normalize('NFKC'); 111 | } 112 | 113 | module.exports.normalize = normalize; 114 | module.exports.fold = fold; 115 | -------------------------------------------------------------------------------- /lib/DocStore.js: -------------------------------------------------------------------------------- 1 | 2 | var util = require('util'); 3 | var Database = require('./Database'); 4 | 5 | // document store database 6 | function DocStore(){} 7 | util.inherits( DocStore, Database ); 8 | 9 | DocStore.prototype.reset = function(){ 10 | this.db.exec('DROP TABLE IF EXISTS docs'); 11 | this.db.exec('DROP TABLE IF EXISTS rtree'); 12 | this.db.exec('CREATE TABLE docs( id INTEGER PRIMARY KEY, json TEXT )'); 13 | 14 | // create rtree table 15 | this.db.exec('CREATE VIRTUAL TABLE IF NOT EXISTS rtree USING rtree( id, minX, maxX, minY, maxY, minZ, maxZ )'); 16 | 17 | // triggers to keep the rtree index up-to-date 18 | var triggers = { 19 | insert: `INSERT INTO rtree ( id, minX, maxX, minY, maxY, minZ, maxZ ) VALUES ( 20 | new.id, 21 | json_extract( json( '[' || json_extract( new.json, '$.geom.bbox' ) || ']' ), '$[0]' ), 22 | json_extract( json( '[' || json_extract( new.json, '$.geom.bbox' ) || ']' ), '$[2]' ), 23 | json_extract( json( '[' || json_extract( new.json, '$.geom.bbox' ) || ']' ), '$[1]' ), 24 | json_extract( json( '[' || json_extract( new.json, '$.geom.bbox' ) || ']' ), '$[3]' ), 25 | json_extract( new.json, '$.rank.min' ), 26 | json_extract( new.json, '$.rank.max' ) 27 | )`, 28 | delete: 'DELETE FROM rtree WHERE id = old.id' 29 | }; 30 | 31 | this.db.exec(`CREATE TRIGGER IF NOT EXISTS rtree_insert_trigger 32 | AFTER 
INSERT ON docs 33 | BEGIN ${triggers.insert}; END`); 34 | 35 | this.db.exec(`CREATE TRIGGER IF NOT EXISTS rtree_delete_trigger 36 | AFTER DELETE ON docs 37 | BEGIN ${triggers.delete}; END`); 38 | 39 | this.db.exec(`CREATE TRIGGER IF NOT EXISTS rtree_update_trigger 40 | AFTER UPDATE ON docs 41 | BEGIN ${triggers.delete}; ${triggers.insert}; END`); 42 | }; 43 | 44 | // ensure that the database schema matches what is expected by the codebase 45 | DocStore.prototype.checkSchema = function(){ 46 | Database.assertSchema(this.db, 'docs', [ 47 | { cid: 0, name: 'id', type: 'INTEGER', notnull: 0, dflt_value: null, pk: 1 }, 48 | { cid: 1, name: 'json', type: 'TEXT', notnull: 0, dflt_value: null, pk: 0 } 49 | ]); 50 | Database.assertSchema(this.db, 'rtree', [ 51 | { cid: 0, name: 'id', type: 'INT', notnull: 0, dflt_value: null, pk: 0 }, 52 | { cid: 1, name: 'minX', type: 'REAL', notnull: 0, dflt_value: null, pk: 0 }, 53 | { cid: 2, name: 'maxX', type: 'REAL', notnull: 0, dflt_value: null, pk: 0 }, 54 | { cid: 3, name: 'minY', type: 'REAL', notnull: 0, dflt_value: null, pk: 0 }, 55 | { cid: 4, name: 'maxY', type: 'REAL', notnull: 0, dflt_value: null, pk: 0 }, 56 | { cid: 5, name: 'minZ', type: 'REAL', notnull: 0, dflt_value: null, pk: 0 }, 57 | { cid: 6, name: 'maxZ', type: 'REAL', notnull: 0, dflt_value: null, pk: 0 } 58 | ]); 59 | }; 60 | 61 | DocStore.prototype.set = function( id, doc, cb ){ 62 | 63 | // create prepared statement 64 | var stmt = this.prepare('INSERT INTO docs (id, json) VALUES ($id, $json)'); 65 | 66 | try { 67 | stmt.run({ id: id, json: DocStore.codec.encode( doc ) }); 68 | return cb( null ); 69 | } catch ( err ){ 70 | console.error( err ); 71 | console.error( stmt.source ); 72 | console.error( id, doc ); 73 | return cb( err ); 74 | } 75 | }; 76 | 77 | DocStore.prototype.get = function( id, cb ){ 78 | 79 | // create prepared statement 80 | var stmt = this.prepare('SELECT json FROM docs WHERE id = ? LIMIT 1'); 81 | 82 | try { 83 | var doc = stmt.get( id ); 84 | if( !doc ){ return cb( 'not found' ); } 85 | return cb( null, DocStore.codec.decode( doc ) ); 86 | } catch ( err ){ 87 | console.error( err ); 88 | console.error( stmt.source ); 89 | return cb( err ); 90 | } 91 | }; 92 | 93 | DocStore.prototype.getMany = function( ids, cb ){ 94 | 95 | if( !Array.isArray( ids ) || !ids.length ){ 96 | return cb( null, [] ); 97 | } 98 | 99 | // create prepared statement 100 | var stmt = this.prepare('SELECT json FROM docs WHERE id IN ' + 101 | '(' + Array(ids.length).fill('?').join(',') + ')' 102 | ); 103 | 104 | // var stmt = this.prepare('SELECT json FROM docs WHERE id IN ( ? 
)'); 105 | 106 | try { 107 | var docs = stmt.all( ids ); 108 | if( !docs ){ return cb( 'not found' ); } 109 | return cb( null, docs.map( DocStore.codec.decode )); 110 | } catch ( err ){ 111 | console.error( err ); 112 | console.error( stmt.source ); 113 | console.error( ids ); 114 | return cb( err ); 115 | } 116 | }; 117 | 118 | // encode/decode json strings 119 | DocStore.codec = { 120 | encode: ( decoded ) => { 121 | return JSON.stringify( decoded ); 122 | }, 123 | decode: ( encoded ) => { 124 | return JSON.parse( encoded.json ); 125 | } 126 | }; 127 | 128 | module.exports = DocStore; 129 | -------------------------------------------------------------------------------- /prototype/query.js: -------------------------------------------------------------------------------- 1 | 2 | var async = require('async'); 3 | var util = require('util'); 4 | var Result = require('../lib/Result'); 5 | var debug = false; 6 | 7 | function reduce( index, res ){ 8 | 9 | // we are on the last subject for this iteration 10 | if( -1 === res.pos.subject ){ 11 | 12 | // we still have more object tokens to try 13 | // so we reset the iterators. 14 | if( res.pos.object > 1 ){ 15 | 16 | // reset (move on to the next object) 17 | res.reset = true; 18 | 19 | // we have more values to try, update the positions 20 | // move on to the next object and start checking subjects to its left 21 | res.pos.prev_object = res.pos.object; 22 | res.pos.object--; 23 | res.pos.subject = res.pos.object-1; 24 | } 25 | 26 | // we have run out of tokens (all object tokens used up) 27 | else { 28 | 29 | // we didn't match anything, so simply return the ids for 30 | // the rightmost token. 31 | if( !Object.keys(res.ids).length ){ 32 | const lastToken = res.group[ res.group.length -1 ]; 33 | return index.matchSubjectDistinctSubjectIds( lastToken, ( err, rows ) => { 34 | res.intersect( err, rows ); 35 | return res.done( null, res ); 36 | }); 37 | } 38 | 39 | // we are done, return the result 40 | return res.done( null, res ); 41 | } 42 | } 43 | 44 | if( debug ){ 45 | if( res.reset ){ console.error( 'RESET!!' 
); } 46 | console.log( '---------------------------------------------------' ); 47 | console.log( util.format( '"%s" >>> "%s"', res.getSubject(), res.getObject() ) ); 48 | } 49 | 50 | // reset 51 | if( res.reset ){ 52 | res.reset = false; // return to default value 53 | index.matchSubjectDistinctSubjectIds( res.getPreviousObject(), (err, rows) => { 54 | res.intersect( err, rows ); 55 | reduce( index, res ); 56 | }); 57 | } 58 | 59 | // regular query 60 | else { 61 | index.matchSubjectObject( res.getSubject(), res.getObject(), (err, rows) => { 62 | 63 | // perform a query for nearby features and include them in the results 64 | if( !rows || rows.length === 0 ){ 65 | index.matchSubjectObjectGeomIntersects( res.getSubject(), res.getObject(), (err2, rows2) => { 66 | res.intersect( err2, rows2 ); 67 | reduce( index, res ); 68 | }); 69 | } 70 | 71 | // do not perform a nearby search 72 | else { 73 | res.intersect( err, rows ); 74 | reduce( index, res ); 75 | } 76 | }); 77 | } 78 | } 79 | 80 | // query a single group 81 | function _queryGroup( index, group, done ){ 82 | 83 | // handle empty group 84 | if( !group || !group.length ){ 85 | return done( null, new Result() ); 86 | } 87 | 88 | reduce( index, new Result( group, done ) ); 89 | } 90 | 91 | // query many groups & merge the result 92 | function _queryManyGroups( index, groups, done ){ 93 | 94 | // handle empty groups 95 | if( !groups || !groups.length ){ 96 | return done( null, new Result() ); 97 | } 98 | 99 | // query each group in parallel 100 | // note: parallel likely doesn't have much of a perf gain when 101 | // using the 'better-sqlite3' npm module. 102 | async.parallel( groups.map( group => cb => { 103 | _queryGroup( index, group, ( err, res ) => { 104 | cb( null, { err: err, res: res }); 105 | }); 106 | }), function mergeQueryGroupResults( err, batch ) { 107 | 108 | var merged = new Result(); 109 | merged.group = batch[0].res.group; 110 | merged.mask = batch[0].res.mask; 111 | 112 | // merge results 113 | batch.forEach( b => { 114 | if( b.err ){ return; } 115 | 116 | // merge ids 117 | for( var attr in b.res.ids ){ 118 | merged.ids[ attr ] = b.res.ids[ attr ]; 119 | } 120 | 121 | // merge mask 122 | b.res.mask.forEach(( bool, pos ) => { 123 | if( true === bool ){ merged.mask[ pos ] = bool; } 124 | }); 125 | }); 126 | 127 | // @todo find a way of returning all masks/groups 128 | // instead of only the first element 129 | return done( err, merged ); 130 | }); 131 | } 132 | 133 | function query( text, done ){ 134 | this.tokenize( text, function( err, groups ){ 135 | 136 | switch( groups.length ){ 137 | 138 | // in a failure case we didn't find any groups; abort now 139 | case 0: return done( null, new Result() ); 140 | 141 | // in most cases there is only one group to query 142 | case 1: return _queryGroup( this.index, groups[0], done ); 143 | 144 | // for queries with multiple groups, we query each 145 | // group and then merge the results together.
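// the merge is a union: the ids of every group which matched are pooled
// together, and a mask position is set wherever any group has set it.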
146 | default: return _queryManyGroups( this.index, groups, done ); 147 | } 148 | 149 | }.bind(this)); 150 | } 151 | 152 | module.exports.query = query; 153 | module.exports._queryGroup = _queryGroup; 154 | module.exports._queryManyGroups = _queryManyGroups; 155 | -------------------------------------------------------------------------------- /server/routes/search.js: -------------------------------------------------------------------------------- 1 | 2 | const _ = require('lodash'); 3 | const util = require('./_util'); 4 | const PARTIAL_TOKEN_SUFFIX = require('../../lib/analysis').PARTIAL_TOKEN_SUFFIX; 5 | 6 | module.exports = function( req, res ){ 7 | 8 | // placeholder 9 | var ph = req.app.locals.ph; 10 | 11 | // input text 12 | var text = req.query.text || ''; 13 | 14 | // placetype filter 15 | var filter = { placetype: util.arrayParam( req.query.placetype ) }; 16 | 17 | // live mode (autocomplete-style search) 18 | // we append a byte indicating the last word is potentially incomplete. 19 | // except where the last token is a space, then we simply trim the space. 20 | if( req.query.mode === 'live' ){ 21 | if( ' ' === text.slice(-1) ){ 22 | text = text.trim(); 23 | } else { 24 | text += PARTIAL_TOKEN_SUFFIX; 25 | } 26 | } 27 | 28 | // perform query 29 | console.time('took'); 30 | ph.query( text, ( err, result ) => { 31 | console.timeEnd('took'); 32 | 33 | // language property 34 | var lang; 35 | if( 'string' === typeof req.query.lang && req.query.lang.length === 3 ){ 36 | lang = req.query.lang.toLowerCase(); 37 | } 38 | 39 | // fetch all result docs by id 40 | ph.store.getMany( result.getIdsAsArray(), function( err, documents ){ 41 | if( err ){ return res.status(500).send(err); } 42 | if( !documents || !documents.length ){ return res.status(200).send([]); } 43 | 44 | // placetype filter 45 | if( Array.isArray( filter.placetype ) && filter.placetype.length ){ 46 | documents = documents.filter(res => _.includes( filter.placetype, res.placetype )); 47 | } 48 | 49 | // get a list of parent ids 50 | const parentIds = getParentIds( documents ); 51 | 52 | // load all the parents 53 | ph.store.getMany( parentIds, ( err, parentResults ) => { 54 | 55 | // a database error occurred 56 | if( err ){ console.error( 'error fetching parent ids', err ); } 57 | 58 | // handle case where the database was unable to return any rows 59 | parentResults = parentResults || []; 60 | 61 | // create a map of parents 62 | const parents = rowsToIdMap( parentResults ); 63 | 64 | // map documents to dict using id as key 65 | const docs = documents.map( function( result ){ 66 | return mapResult( ph, result, parents, lang ); 67 | }); 68 | 69 | // sort documents according to sorting rules 70 | docs.sort( sortingAlgorithm ); 71 | 72 | // send json 73 | res.status(200).json( docs ); 74 | }); 75 | }); 76 | }); 77 | }; 78 | 79 | /** 80 | sort highest 'population' first, using 'geom.area' as a second 81 | sorting condition where population data is not available. 
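eg: where two records both lack population data, the one covering the larger area sorts first.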
82 | **/ 83 | function sortingAlgorithm( a, b ){ 84 | 85 | // condition 1 - population 86 | const a1 = a.population || 0; 87 | const b1 = b.population || 0; 88 | 89 | // condition 2 - geom.area 90 | const a2 = a.geom && a.geom.area || 0; 91 | const b2 = b.geom && b.geom.area || 0; 92 | 93 | if( a1 < b1 ){ return +1; } 94 | if( a1 > b1 ){ return -1; } 95 | if( a2 < b2 ){ return +1; } 96 | if( a2 > b2 ){ return -1; } 97 | return 0; 98 | } 99 | 100 | function mapResult( ph, result, parents, lang ){ 101 | 102 | // swap languages 103 | if( Array.isArray( result.names[lang] ) && result.names[lang].length ){ 104 | result.name = result.names[lang][0]; 105 | result.languageDefaulted = false; 106 | } else { 107 | result.languageDefaulted = true; 108 | } 109 | 110 | // delete language properties 111 | delete result.names; 112 | 113 | // delete rank properties 114 | delete result.rank; 115 | 116 | result.lineage = result.lineage.map( function( lineage ){ 117 | return mapLineage( ph, lineage, parents, lang ); 118 | }); 119 | return result; 120 | } 121 | 122 | function mapLineage( ph, lineage, parents, lang ){ 123 | const res = {}; 124 | 125 | for( var attr in lineage ){ 126 | var parent = parents[ lineage[ attr ] ]; 127 | 128 | if( !parent ){ 129 | console.error( 'parent not found!', attr, lineage[ attr ] ); 130 | continue; 131 | } 132 | 133 | var name = parent.name; 134 | var languageDefaulted = true; 135 | 136 | // swap languages 137 | if( Array.isArray( parent.names[lang] ) && parent.names[lang].length ){ 138 | languageDefaulted = false; 139 | name = parent.names[lang][0]; 140 | } 141 | 142 | res[ parent.placetype ] = { 143 | id: parent.id, 144 | name: name, 145 | abbr: parent.abbr, 146 | languageDefaulted: languageDefaulted 147 | }; 148 | } 149 | 150 | return res; 151 | } 152 | 153 | // convert array of results to map using id as key 154 | function rowsToIdMap( rows ){ 155 | const map = {}; 156 | rows.forEach( function( row ){ 157 | map[ row.id ] = row; 158 | }); 159 | return map; 160 | } 161 | 162 | // get a unique array of parent ids 163 | function getParentIds( results ){ 164 | const parentIds = {}; 165 | results.forEach( function( row ){ 166 | row.lineage.forEach( function( lineage ){ 167 | for( var attr in lineage ){ 168 | parentIds[ lineage[attr] ] = true; 169 | } 170 | }); 171 | }); 172 | return Object.keys( parentIds ); 173 | } 174 | -------------------------------------------------------------------------------- /server/http.js: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | The http server improves performance on multicore machines by using the 4 | node core 'cluster' module to fork worker processes. 5 | 6 | The default setting is to use all available CPUs; this will spawn 32 child 7 | processes on a 32 core machine. 8 | 9 | If you would like to disable this feature (maybe because you are running 10 | inside a container) then you can do so by setting the env var CPUS=1 11 | 12 | You may also specify exactly how many child processes you would like to 13 | spawn by setting the env var to a numeric value >1, eg CPUS=4 14 | 15 | If the CPUS env var is set less than 1 or greater than os.cpus().length 16 | then the default setting will be used (using all available cores).
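eg: CPUS=1 node server/http.js (single process)
eg: CPUS=4 node server/http.js (master process plus 4 workers)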
17 | **/ 18 | 19 | const os = require('os'); 20 | const morgan = require('morgan'); 21 | const express = require('express'); 22 | const cluster = require('cluster'); 23 | const through = require('through2'); 24 | const _ = require('lodash'); 25 | 26 | const Placeholder = require('../Placeholder.js'); 27 | const logger = require('pelias-logger').get('placeholder'); 28 | 29 | // select the number of cpus we will use 30 | const envCpus = parseInt( process.env.CPUS, 10 ); 31 | const cpus = Math.min( Math.max( envCpus || Infinity, 1 ), os.cpus().length ); 32 | 33 | // optionally override port/host using env var 34 | var PORT = process.env.PORT || 3000; 35 | var HOST = process.env.HOST || undefined; 36 | var app = express(); 37 | 38 | // store the express http server so it can be terminated gracefully later 39 | let server; 40 | 41 | // record whether the service is terminating to control what events are worth logging 42 | let terminating = false; 43 | 44 | function log() { 45 | morgan.token('url', (req, res) => { 46 | // if there's a DNT header, just return '/' as the URL 47 | if (['DNT', 'dnt', 'do_not_track'].some(header => _.has(req.headers, header))) { 48 | return _.get(req, 'route.path'); 49 | } else { 50 | return req.originalUrl; 51 | } 52 | }); 53 | 54 | // 'short' format includes response time but leaves out date 55 | return morgan('short', { 56 | stream: through( function write( ln, _, next ){ 57 | logger.info( ln.toString().trim() ); 58 | next(); 59 | }) 60 | }); 61 | } 62 | 63 | // make sure that logging is the first thing that happens for all endpoints 64 | app.use(log()); 65 | 66 | // init placeholder 67 | var ph = new Placeholder({ readonly: true }); 68 | ph.load(); 69 | 70 | // ensure the database schemas match what is expected by the codebase.
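// (the schemas of both the document store and the token index are verified,
// see the checkSchema implementations in lib/DocStore.js and lib/TokenIndex.js)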
71 | try { ph.checkSchema(); } 72 | catch( e ){ 73 | console.info('------------------------------------------------------'); 74 | console.error('Database schema is out-of-date!'); 75 | console.info('Your database files do not match the expected schema.'); 76 | console.info('Please follow instructions in the README to obtain new database files.'); 77 | console.info('This is the expected behaviour for breaking schema updates.'); 78 | console.info('more info: https://github.com/pelias/placeholder'); 79 | console.info('------------------------------------------------------'); 80 | process.exit(1); 81 | } 82 | 83 | // store $ph on app 84 | app.locals.ph = ph; 85 | 86 | // generic http headers 87 | app.use((req, res, next) => { 88 | res.header('Charset','utf8'); 89 | res.header('Cache-Control','public, max-age=120'); 90 | next(); 91 | }); 92 | 93 | // routes 94 | app.get( '/parser/search', require( './routes/search' ) ); 95 | app.get( '/parser/findbyid', require( './routes/findbyid' ) ); 96 | app.get( '/parser/query', require( './routes/query' ) ); 97 | app.get( '/parser/tokenize', require( './routes/tokenize' ) ); 98 | 99 | // demo page 100 | app.use('/demo', express.static( __dirname + '/demo' )); 101 | app.use('/', (req, res) => { res.redirect('/demo#eng'); }); 102 | 103 | // handle SIGINT and SIGTERM (required for fast docker restarts) 104 | function handler() { 105 | ph.close(); 106 | 107 | terminating = true; 108 | if (cluster.isMaster) { 109 | logger.info('Placeholder service shutting down'); 110 | for (const id in cluster.workers) { 111 | cluster.workers[id].kill('SIGINT'); 112 | cluster.workers[id].disconnect(); 113 | } 114 | } 115 | 116 | if (server) { 117 | server.close(); 118 | } 119 | } 120 | 121 | process.on('SIGINT', handler); 122 | process.on('SIGTERM', handler); 123 | 124 | // start multi-threaded server 125 | if( cpus > 1 ){ 126 | if( cluster.isMaster ){ 127 | logger.info('[master] using %d cpus', cpus); 128 | 129 | // worker exit event 130 | cluster.on('exit', (worker, code, signal) => { 131 | if (!terminating) { 132 | logger.error('[master] worker died', worker.process.pid); 133 | } 134 | }); 135 | 136 | // worker fork event 137 | cluster.on('fork', (worker, code, signal) => { 138 | logger.info('[master] worker forked', worker.process.pid); 139 | }); 140 | 141 | // fork workers 142 | for( var c=0; c<cpus; c++ ){ cluster.fork(); } 143 | } 144 | 145 | // start worker processes 146 | else { 147 | server = app.listen( PORT, HOST, () => { 148 | logger.info('[worker %d] listening on %s:%s', process.pid, HOST||'0.0.0.0', PORT); 149 | }); 150 | } 151 | } 152 | 153 | // start single-threaded server 154 | else { 155 | logger.info('[master] using %d cpus', cpus); 156 | 157 | server = app.listen( PORT, HOST, () => { 158 | logger.info('[master] listening on %s:%s', HOST||'0.0.0.0', PORT); 159 | }); 160 | } 161 | -------------------------------------------------------------------------------- /test/lib/analysis.js: -------------------------------------------------------------------------------- 1 | 2 | var analysis = require('../../lib/analysis'); 3 | 4 | module.exports.normalize = function(test, common) { 5 | var assert = runner.bind(null, test, 'normalize'); 6 | 7 | // Germanic substitutions 8 | assert( 'Schöneberg', [ 'schoneberg', 'schoeneberg' ] ); 9 | 10 | // apostrophe s 11 | assert( 'St. George\'s', [ 'st georges', 'st george' ] ); 12 | assert( 'St. George\‘s', [ 'st georges', 'st george' ] ); 13 | assert( 'St. 
George\’s', [ 'st georges', 'st george' ] ); 14 | 15 | // Punctuation substitutions 16 | assert( 'Straße', [ 'strasse' ] ); 17 | assert( 'Jǿ œ̆', [ 'jo oe' ] ); 18 | assert( 'orilẹ́ede manamari', [ 'orileede manamari' ] ); 19 | assert( 'z︠h︡ovkva', [ 'zhovkva' ] ); 20 | assert( 'Žovkva', [ 'zovkva' ] ); 21 | assert( 'Żółkiew', [ 'zolkiew' ] ); 22 | assert( 'Trinidad & Tobago', [ 'trinidad and tobago' ] ); 23 | 24 | // Tests to confirm the order of function execution 25 | // see: https://github.com/pelias/placeholder/pull/12#issuecomment-302437570 26 | test('order of execution', function(t) { 27 | t.deepEqual( analysis.normalize( 'İnceyol' ), [ 'inceyol' ] ); 28 | t.equal( analysis.normalize( 'İnceyol' )[0].length, 7 ); 29 | t.equal( analysis.normalize( 'İ' )[0].length, 1 ); 30 | t.end(); 31 | }); 32 | 33 | // Synonym contractions 34 | assert( 'SainT token sAiNt value saInt', [ 'st token st value st' ] ); 35 | assert( 'SaintE token sAinTe value saINte', [ 'ste token ste value ste' ] ); 36 | assert( 'FoRt token fORt value fOrT', [ 'ft token ft value ft' ] ); 37 | assert( 'MoUNt token mOUNt value mouNT', [ 'mt token mt value mt' ] ); 38 | 39 | // Synonym contractions - hyphens 40 | assert( 'Foo-Sainte-Bar', [ 'foostebar', 'foo ste bar' ] ); 41 | assert( 'Foo-Saint-Bar', [ 'foostbar', 'foo st bar' ] ); 42 | assert( 'Foo-Mount-Bar', [ 'foomtbar', 'foo mt bar' ] ); 43 | assert( 'Foo-Fort-Bar', [ 'fooftbar', 'foo ft bar' ] ); 44 | 45 | // Synonym - with/without official designation 46 | assert( 'County', [ 'county' ] ); 47 | assert( 'County Durham', [ 'county durham', 'durham' ] ); 48 | assert( 'County of Durham', [ 'county of durham', 'durham' ] ); 49 | assert( 'Durham County', [ 'durham county', 'durham' ] ); 50 | assert( 'County Two Words', [ 'county two words', 'two words' ] ); 51 | assert( 'County of Two Words', [ 'county of two words', 'two words' ] ); 52 | assert( 'Two Words County', [ 'two words county', 'two words' ] ); 53 | 54 | assert( 'City', [ 'city' ] ); 55 | assert( 'City London', [ 'city london' ] ); 56 | assert( 'City of London', [ 'city of london', 'london' ] ); 57 | assert( 'London City', [ 'london city' ] ); 58 | assert( 'City Salt Lake', [ 'city salt lake' ] ); 59 | assert( 'City of Salt Lake', [ 'city of salt lake', 'salt lake' ] ); 60 | assert( 'New York City', [ 'new york city' ] ); 61 | assert( 'City New York', [ 'city new york' ] ); 62 | assert( 'City of New York', [ 'city of new york', 'new york' ] ); 63 | assert( 'New York City', [ 'new york city' ] ); 64 | 65 | assert( 'City of the Sun', [ 'city of the sun' ] ); 66 | assert( 'City of Sun', [ 'city of sun', 'sun' ] ); 67 | 68 | // https://en.wikipedia.org/wiki/Charter_township 69 | assert( 'Word Charter Township', [ 'word charter township', 'word township' ] ); 70 | assert( 'Two Words Charter Township', [ 'two words charter township', 'two words township' ] ); 71 | 72 | // remove 'disambiguation' tokens from name suffix 73 | // see: https://github.com/whosonfirst-data/whosonfirst-data/issues/885 74 | assert( 'St Kilda (Vic.)', [ 'st kilda' ] ); 75 | assert( 'Spring Mountain (Qld)', [ 'spring mountain' ] ); 76 | assert( 'Mónaco - Monaco', [ 'monaco' ] ); 77 | assert( 'Monako (peyi)', [ 'monako' ] ); 78 | assert( 'Monako [peyi]', [ 'monako' ] ); 79 | assert( 'Port Phillip (C)', [ 'port phillip' ] ); 80 | assert( 'Portland (Oregon)', [ 'portland' ] ); 81 | assert( 'Sutherland Shire (A)', [ 'sutherland shire' ] ); 82 | assert( 'Cocos- [Keeling] eilande', [ 'cocos' ] ); 83 | 84 | // remove tokens that *only* contain numbers 
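// (a token consisting solely of digits, such as a bare house number, is not
// useful as a coarse place name)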
85 | assert( '1', [] );
86 | assert( '22', [] );
87 | assert( '333', [] );
88 | assert( '22nd', ['22nd'] );
89 | assert( 'a12', ['a12'] );
90 | assert( '-1', [] ); // special case: handle '-1' values
91 | assert( '1 -1', [] );
92 | assert( '1 --1', [] );
93 | assert( '1 (foo)', [] );
94 | assert( '1 [foo]', [] );
95 | };
96 |
97 | module.exports.tokenize = function(test, common) {
98 | var assert = runner.bind(null, test, 'tokenize');
99 |
100 | // invalid type
101 | assert( [], [] );
102 | assert( {}, [] );
103 |
104 | // delimiters
105 | assert( 'Foo Bar', [[ 'foo', 'bar' ]] );
106 | assert( 'Foo,,Bar', [[ 'foo', 'bar' ]] );
107 | assert( 'Foo\'\'Bar', [[ 'foobar' ], [ 'foo', 'bar' ]] );
108 | assert( 'Foo‘‘Bar', [[ 'foobar' ], [ 'foo', 'bar' ]] );
109 | assert( 'Foo’’Bar', [[ 'foobar' ], [ 'foo', 'bar' ]] );
110 | assert( 'Foo\'’’Bar', [[ 'foobar' ], [ 'foo', 'bar' ]] );
111 | assert( 'Foo""Bar', [[ 'foo', 'bar' ]] );
112 |
113 | // not a delimiter
114 | assert( 'Foo..Bar', [[ 'foobar' ]] );
115 | assert( 'West L.A.', [[ 'west', 'la' ]] );
116 |
117 | // synonymous punctuation
118 | assert( 'Foo-Bar', [[ 'foobar' ], [ 'foo', 'bar' ]] );
119 | assert( 'Tol\'yatti', [[ 'tolyatti' ], [ 'tol', 'yatti' ]] );
120 | assert( 'Sendai-shi', [[ 'sendaishi' ], [ 'sendai', 'shi' ]] );
121 | };
122 |
123 | module.exports.minor_to_major = function(test, common) {
124 |
125 | var isMajorToMinor = function( str ){
126 | return analysis.REGEX_MAJOR_TO_MINOR.test( str );
127 | };
128 |
129 | test( 'minor-to-major', function(t) {
130 | t.false( isMajorToMinor('London, UK'), 'English' );
131 | t.false( isMajorToMinor('Köln Deutschland'), 'German' );
132 | t.false( isMajorToMinor('Orléans Nîmes Besançon'), 'French' );
133 | t.end();
134 | });
135 |
136 | test( 'major-to-minor', function(t) {
137 | t.true( isMajorToMinor('г.Москва'), 'Russian' );
138 | t.true( isMajorToMinor('경기도 광명시'), 'Korean' );
139 | t.true( isMajorToMinor('ישראל'), 'Hebrew' );
140 | t.true( isMajorToMinor('دبي'), 'Arabic' );
141 | t.end();
142 | });
143 | };
144 |
145 | // convenience function for writing quick 'n easy test cases
146 | function runner( test, method, actual, expected ){
147 | test( actual, function(t) {
148 | t.deepEqual( analysis[method]( actual ), expected );
149 | t.end();
150 | });
151 | }
152 |
--------------------------------------------------------------------------------
/lib/Queries.js:
--------------------------------------------------------------------------------
1 |
2 | // load SQL queries from filesystem
3 | const query = require('../query/index');
4 | const PARTIAL_TOKEN_SUFFIX = require('./analysis').PARTIAL_TOKEN_SUFFIX;
5 | const REMOVE_PARTIAL_TOKEN_REGEX = new RegExp(PARTIAL_TOKEN_SUFFIX, 'g');
6 | const MAX_RESULTS = 100;
7 | const DEBUG = false;
8 |
9 | // set threshold bounds between 0.0-1.0 (degrees), defaults to 0.2
10 | const RTREE_ENV = parseFloat( process.env.RTREE_THRESHOLD );
11 | const RTREE_THRESHOLD = !isNaN( RTREE_ENV ) ?
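// clamp the parsed value into [0,1]; e.g. RTREE_THRESHOLD=5 is treated as 1.0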
Math.max( 0, Math.min( 1, RTREE_ENV ) ) : 0.2; 12 | 13 | function debug( stmt, args, cb ){ 14 | if( !DEBUG ){ return cb; } 15 | var query = renderQuery( stmt, args ); 16 | var start = new Date().getTime(); 17 | return function() { 18 | var took = new Date().getTime() - start; 19 | console.error('\x1b[1m' + query + '\x1b[0m'); 20 | console.error('\x1b[1;93mtook', took + 'ms\x1b[0m'); 21 | console.error('---------------------------------------------------------'); 22 | cb.apply( null, Array.prototype.slice.call( arguments ) ); 23 | }; 24 | } 25 | 26 | // debug statement and args 27 | function renderQuery( stmt, args ){ 28 | var output = stmt.source; 29 | Object.keys( args ).forEach( key => { 30 | output = output.replace('$' + key, '\'' + args[ key ] + '\''); 31 | }); 32 | return output; 33 | } 34 | 35 | // generic boolean query 36 | module.exports._queryBool = function( stmt, args, cb ){ 37 | cb = debug( stmt, args, cb ); 38 | try { 39 | var row = stmt.get( args ); 40 | return cb( undefined !== row ); 41 | } catch ( err ){ 42 | console.error( err ); 43 | return cb( false ); 44 | } 45 | }; 46 | 47 | // generic all query 48 | module.exports._queryAll = function( stmt, args, cb ){ 49 | cb = debug( stmt, args, cb ); 50 | try { 51 | var rows = stmt.all( args ); 52 | if( !Array.isArray( rows ) ){ return cb( null, [] ); } 53 | return cb( null, rows ); 54 | } catch ( err ){ 55 | console.error( err ); 56 | return cb( err ); 57 | } 58 | }; 59 | 60 | // cb( bool ) whether a 'subject' value exists in the db 61 | module.exports.hasSubject = function( subject, cb ){ 62 | var isPartialToken = subject.slice(-1) === PARTIAL_TOKEN_SUFFIX; 63 | subject = subject.replace(/ /g, '_').replace(REMOVE_PARTIAL_TOKEN_REGEX, ''); 64 | 65 | // no-op for empty string 66 | if( '' === subject.trim() ){ return cb( null, [] ); } 67 | 68 | if( isPartialToken ){ 69 | this._queryBool( 70 | this.prepare( query.has_subject_autocomplete ), 71 | { subject: `"${subject}" OR "${subject}"*` }, 72 | cb 73 | ); 74 | } else { 75 | this._queryBool( 76 | this.prepare( query.has_subject_autocomplete ), 77 | { subject: `"${subject}"` }, 78 | cb 79 | ); 80 | } 81 | }; 82 | 83 | module.exports.matchSubjectDistinctSubjectIds = function( subject, cb ){ 84 | var isPartialToken = subject.slice(-1) === PARTIAL_TOKEN_SUFFIX; 85 | 86 | // no-op for empty string 87 | if( '' === subject.trim() ){ return cb( null, [] ); } 88 | 89 | if( isPartialToken ){ 90 | subject = subject.replace(/ /g, '_').replace(REMOVE_PARTIAL_TOKEN_REGEX, ''); 91 | if( '' === subject.trim() ){ return cb( null, [] ); } 92 | 93 | this._queryAll( 94 | this.prepare( query.match_subject_autocomplete_distinct_subject_ids ), 95 | { subject: `"${subject}" OR "${subject}"*`, limit: MAX_RESULTS }, 96 | cb 97 | ); 98 | } else { 99 | this._queryAll( 100 | this.prepare( query.match_subject_distinct_subject_ids ), 101 | { subject: subject, limit: MAX_RESULTS }, 102 | cb 103 | ); 104 | } 105 | }; 106 | 107 | module.exports.matchSubjectObject = function( subject, object, cb ){ 108 | var isPartialToken = object.slice(-1) === PARTIAL_TOKEN_SUFFIX; 109 | 110 | // no-op for empty string 111 | if( '' === subject.trim() ){ return cb( null, [] ); } 112 | if( '' === object.trim() ){ return cb( null, [] ); } 113 | 114 | if( isPartialToken ){ 115 | object = object.replace(/ /g, '_').replace(REMOVE_PARTIAL_TOKEN_REGEX, ''); 116 | if( '' === object.trim() ){ return cb( null, [] ); } 117 | 118 | if (this._hasTooManyCombinations(subject, object)) { return cb( null, [] ); } 119 | 120 | this._queryAll( 121 | 
this.prepare( query.match_subject_object_autocomplete ),
122 | {
123 | subject: subject,
124 | object: `${object}%`,
125 | limit: MAX_RESULTS
126 | },
127 | cb
128 | );
129 | } else {
130 | if (this._hasTooManyCombinations(subject, object)) { return cb( null, [] ); }
131 |
132 | this._queryAll(
133 | this.prepare( query.match_subject_object ),
134 | {
135 | subject: subject,
136 | object: object,
137 | limit: MAX_RESULTS
138 | },
139 | cb
140 | );
141 | }
142 | };
143 |
144 | module.exports._hasTooManyCombinations = function(subject, object) {
145 | const terms = [ subject, object ];
146 | const stmt = this.prepare(query.count_tokens);
147 |
148 | const counts = terms.map(token => stmt.get({ token_quoted: `"${token}"` }).cnt);
149 | const combinations = counts.reduce((a, b) => a * b, 1);
150 |
151 | return combinations >= 1e6;
152 | };
153 |
154 | module.exports.matchSubjectObjectGeomIntersects = function( subject, object, cb ){
155 | var isPartialToken = object.slice(-1) === PARTIAL_TOKEN_SUFFIX;
156 |
157 | // no-op for empty string
158 | if( '' === subject.trim() ){ return cb( null, [] ); }
159 | if( '' === object.trim() ){ return cb( null, [] ); }
160 |
161 | // no-op when threshold is less than 0
162 | if( 0 > RTREE_THRESHOLD ){ return cb( null, [] ); }
163 |
164 | if( isPartialToken ){
165 | object = object.replace(/ /g, '_').replace(REMOVE_PARTIAL_TOKEN_REGEX, '');
166 | if( '' === object.trim() ){ return cb( null, [] ); }
167 |
168 | if (this._hasTooManyCombinations(subject, object)) { return cb( null, [] ); }
169 |
170 | this._queryAll(
171 | this.prepare( query.match_subject_object_geom_intersects_autocomplete ),
172 | {
173 | subject,
174 | object,
175 | subject_quoted: `"${subject}"`,
176 | object_quoted: `"${object}"`,
177 | threshold: RTREE_THRESHOLD,
178 | limit: MAX_RESULTS
179 | },
180 | cb
181 | );
182 | } else {
183 | if (this._hasTooManyCombinations(subject, object)) { return cb( null, [] ); }
184 |
185 | this._queryAll(
186 | this.prepare( query.match_subject_object_geom_intersects ),
187 | {
188 | subject,
189 | object,
190 | subject_quoted: `"${subject}"`,
191 | object_quoted: `"${object}"`,
192 | threshold: RTREE_THRESHOLD,
193 | limit: MAX_RESULTS
194 | },
195 | cb
196 | );
197 | }
198 | };
199 |
--------------------------------------------------------------------------------
/test/lib/TokenIndex.js:
--------------------------------------------------------------------------------
1 |
2 | var TokenIndex = require('../../lib/TokenIndex');
3 |
4 | module.exports.constructor = function(test, common) {
5 | test('constructor', function(t) {
6 | var db = new TokenIndex();
7 | t.equal( db.constructor.super_.name, 'Database' );
8 | t.equal( typeof db.reset, 'function' );
9 | t.equal( typeof db.populate, 'function' );
10 |
11 | t.equal( typeof db.setLineage, 'function' );
12 | t.equal( typeof db.setTokens, 'function' );
13 | t.end();
14 | });
15 | };
16 |
17 | module.exports.reset = function(test, common) {
18 | test('reset', function(t) {
19 | var db = new TokenIndex();
20 | db.open('/tmp/db', { test: true, reset: true });
21 |
22 | // ensure table has been created
23 | var sql = 'PRAGMA table_info(lineage)';
24 | t.deepEqual( db.prepare(sql).all(), [
25 | { cid: 0, name: 'id', type: 'INTEGER', notnull: 0, dflt_value: null, pk: 0 },
26 | { cid: 1, name: 'pid', type: 'INTEGER', notnull: 0, dflt_value: null, pk: 0 }
27 | ]);
28 |
29 | // ensure table has been created
30 | sql = 'PRAGMA table_info(tokens)';
31 | t.deepEqual( db.prepare(sql).all(), [
32 | { cid: 0, name: 'id',
type: 'INTEGER', notnull: 0, dflt_value: null, pk: 0 },
33 | { cid: 1, name: 'lang', type: 'STRING', notnull: 0, dflt_value: null, pk: 0 },
34 | { cid: 2, name: 'tag', type: 'STRING', notnull: 0, dflt_value: null, pk: 0 },
35 | { cid: 3, name: 'token', type: 'STRING', notnull: 0, dflt_value: null, pk: 0 }
36 | ]);
37 |
38 | // ensure table has been created
39 | sql = 'PRAGMA table_info(fulltext)';
40 | t.deepEqual( db.prepare(sql).all(), [
41 | { cid: 0, name: 'token', type: '', notnull: 0, dflt_value: null, pk: 0 }
42 | ]);
43 |
44 | // ensure fts table has been created with the correct options
45 | sql = `SELECT * FROM sqlite_master WHERE type='table' AND name='fulltext'`;
46 | const expected =
47 | 'CREATE VIRTUAL TABLE fulltext USING fts5( token, ' + [
48 | `tokenize="unicode61 remove_diacritics 0 tokenchars '_'"`,
49 | `prefix='1 2 3 4 5 6 7 8 9 10 11 12'`,
50 | 'columnsize=0'
51 | ].join(', ') + ')';
52 |
53 | t.deepEqual( db.prepare(sql).get().sql, expected );
54 | t.end();
55 | });
56 | };
57 |
58 | module.exports.checkSchema = function(test, common) {
59 | test('checkSchema - empty', function(t) {
60 | var db = new TokenIndex();
61 | db.open('/tmp/db', { test: true });
62 | t.throws(() => { db.checkSchema(); }, /schema invalid: table lineage/);
63 | t.end();
64 | });
65 | test('checkSchema - valid', function(t) {
66 | var db = new TokenIndex();
67 | db.open('/tmp/db', { test: true, reset: true });
68 | t.doesNotThrow(() => { db.checkSchema(); });
69 | t.end();
70 | });
71 | test('checkSchema - invalid lineage', function(t) {
72 | var db = new TokenIndex();
73 | db.open('/tmp/db', { test: true, reset: true });
74 | db.db.exec('DROP TABLE IF EXISTS lineage');
75 | t.throws(() => { db.checkSchema(); }, /schema invalid: table lineage/);
76 | t.end();
77 | });
78 | test('checkSchema - invalid tokens', function(t) {
79 | var db = new TokenIndex();
80 | db.open('/tmp/db', { test: true, reset: true });
81 | db.db.exec('DROP TABLE IF EXISTS tokens');
82 | t.throws(() => { db.checkSchema(); }, /schema invalid: table tokens/);
83 | t.end();
84 | });
85 | test('checkSchema - invalid fulltext', function(t) {
86 | var db = new TokenIndex();
87 | db.open('/tmp/db', { test: true, reset: true });
88 | db.db.exec('DROP TABLE IF EXISTS fulltext');
89 | t.throws(() => { db.checkSchema(); }, /schema invalid: table fulltext/);
90 | t.end();
91 | });
92 | };
93 |
94 | module.exports.populate = function(test, common) {
95 | test('populate', function(t) {
96 | var db = new TokenIndex();
97 | db.open('/tmp/db', { test: true, reset: true });
98 |
99 | // prepare some sql statements
100 | const fulltext = {
101 | query: db.prepare('SELECT * FROM fulltext')
102 | };
103 | const tokens = {
104 | insert: db.prepare('INSERT INTO tokens ( id, lang, tag, token ) VALUES ( $id, $lang, $tag, $token )')
105 | };
106 |
107 | // add some rows to the tokens table
108 | tokens.insert.run({ id: 1, lang: 'en', tag: 'test', token: 'hello world' });
109 | tokens.insert.run({ id: 2, lang: 'fr', tag: 'test', token: 'a b c' });
110 |
111 | // no rows in fulltext table
112 | t.deepEqual( fulltext.query.all(), [] );
113 |
114 | // run populate
115 | db.populate();
116 |
117 | // rows now exist in the fulltext table
118 | t.deepEqual( fulltext.query.all(), [
119 | { token: 'hello_world' },
120 | { token: 'a_b_c' }
121 | ]);
122 |
123 | t.end();
124 | });
125 | };
126 |
127 | module.exports.setLineage = function(test, common) {
128 | test('setLineage', function(t) {
129 | var db = new TokenIndex();
130 | db.open('/tmp/db', { test: true, reset:
true });
131 |
132 | t.plan(1);
133 |
134 | const id = 100;
135 | const pids = [ 200, 300 ];
136 |
137 | db.setLineage( id, pids, (err) => {
138 |
139 | // ensure rows have been created
140 | const sql = 'SELECT * FROM lineage';
141 | t.deepEqual( db.prepare(sql).all(), [
142 | { id: 100, pid: 200 },
143 | { id: 100, pid: 300 }
144 | ]);
145 |
146 | });
147 | });
148 | test('setLineage - empty pids array', function(t) {
149 | var db = new TokenIndex();
150 | db.open('/tmp/db', { test: true, reset: true });
151 |
152 | t.plan(1);
153 |
154 | db.setLineage( 1, [], (err, res) => {
155 | t.deepEqual( db.prepare('SELECT * FROM lineage').all(), []);
156 | });
157 | });
158 | };
159 |
160 | module.exports.setTokens = function(test, common) {
161 | test('setTokens', function(t) {
162 | var db = new TokenIndex();
163 | db.open('/tmp/db', { test: true, reset: true });
164 |
165 | t.plan(1);
166 |
167 | const id = 100;
168 | const tokens = [
169 | { lang: 'en', tag: 'abbr', body: 'test1' },
170 | { lang: 'fr', tag: 'variant', body: 'test2' }
171 | ];
172 |
173 | db.setTokens( id, tokens, (err) => {
174 |
175 | // ensure rows have been created
176 | const sql = 'SELECT * FROM tokens';
177 | t.deepEqual( db.prepare(sql).all(), [
178 | { id: 100, lang: 'en', tag: 'abbr', token: 'test1' },
179 | { id: 100, lang: 'fr', tag: 'variant', token: 'test2' }
180 | ]);
181 |
182 | });
183 | });
184 | test('setTokens - empty tokens array', function(t) {
185 | var db = new TokenIndex();
186 | db.open('/tmp/db', { test: true, reset: true });
187 |
188 | t.plan(1);
189 |
190 | db.setTokens( 1, [], (err, res) => {
191 | t.deepEqual( db.prepare('SELECT * FROM tokens').all(), []);
192 | });
193 | });
194 | };
195 |
--------------------------------------------------------------------------------
/config/language/whitelist.js:
--------------------------------------------------------------------------------
1 | /**
2 | This whitelist lists all the language codes we accept for import into placeholder.
3 |
4 | The whosonfirst dataset contains many disused and rarely-used languages which
5 | can cause issues when the source data has been machine-transliterated.
6 |
7 | The list is non-exhaustive and was originally sourced from Wikipedia and various
8 | online sources; I aimed to include the most commonly spoken languages worldwide.
9 |
10 | If you feel a language code is wrong or missing, please feel free to edit this file.
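
Each entry below whitelists a single ISO 639-3 code via its object key, for
example `module.exports.eng = '';` enables English; the empty-string values
appear to be unused, with membership presumably checked against the keys.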
11 | **/ 12 | 13 | // Chinese - 汉语/漢語 Hànyǔ or 中文 Zhōngwén 14 | module.exports.chi = ''; 15 | module.exports.zho = ''; 16 | module.exports.cdo = ''; 17 | module.exports.cjy = ''; 18 | module.exports.cmn = ''; 19 | module.exports.cpx = ''; 20 | module.exports.czh = ''; 21 | module.exports.czo = ''; 22 | module.exports.gan = ''; 23 | module.exports.hak = ''; 24 | module.exports.hsn = ''; 25 | module.exports.mnp = ''; 26 | module.exports.nan = ''; 27 | module.exports.wuu = ''; 28 | module.exports.yue = ''; 29 | module.exports.och = ''; 30 | module.exports.ltc = ''; 31 | module.exports.lzh = ''; 32 | 33 | // Spanish - español ("Spanish") and castellano ("Castilian") 34 | module.exports.esp = ''; 35 | module.exports.spa = ''; 36 | 37 | // English 38 | module.exports.eng = ''; 39 | 40 | // Arabic - العربية 41 | module.exports.ara = ''; 42 | module.exports.arq = ''; 43 | module.exports.aao = ''; 44 | module.exports.bbz = ''; 45 | module.exports.abv = ''; 46 | module.exports.shu = ''; 47 | module.exports.acy = ''; 48 | module.exports.adf = ''; 49 | module.exports.avl = ''; 50 | module.exports.arz = ''; 51 | module.exports.afb = ''; 52 | module.exports.ayh = ''; 53 | module.exports.acw = ''; 54 | module.exports.ayl = ''; 55 | module.exports.acm = ''; 56 | module.exports.ary = ''; 57 | module.exports.ars = ''; 58 | module.exports.apc = ''; 59 | module.exports.ayp = ''; 60 | module.exports.acx = ''; 61 | module.exports.aec = ''; 62 | module.exports.ayn = ''; 63 | module.exports.ssh = ''; 64 | module.exports.ajp = ''; 65 | module.exports.arb = ''; 66 | module.exports.apb = ''; 67 | module.exports.pga = ''; 68 | module.exports.acq = ''; 69 | module.exports.abh = ''; 70 | module.exports.aeb = ''; 71 | module.exports.auz = ''; 72 | 73 | // Hindi - मानक हिन्दी - Mānak Hindī 74 | module.exports.hin = ''; 75 | 76 | // Bengali - বাংলা - Bangla 77 | module.exports.ben = ''; 78 | 79 | // Portuguese - português 80 | module.exports.por = ''; 81 | 82 | // Russian - ру́сский язы́к - russkij jazyk 83 | module.exports.rus = ''; 84 | 85 | // Japanese - 日本語 - Nihongo 86 | module.exports.jpn = ''; 87 | 88 | // Punjabi 89 | module.exports.pan = ''; 90 | module.exports.pnb = ''; 91 | 92 | // German - Deutsch 93 | module.exports.ger = ''; 94 | module.exports.deu = ''; 95 | module.exports.gmh = ''; 96 | module.exports.goh = ''; 97 | module.exports.gct = ''; 98 | module.exports.bar = ''; 99 | module.exports.cim = ''; 100 | module.exports.geh = ''; 101 | module.exports.ksh = ''; 102 | module.exports.nds = ''; 103 | module.exports.sli = ''; 104 | module.exports.ltz = ''; 105 | module.exports.vmf = ''; 106 | module.exports.mhn = ''; 107 | module.exports.pfl = ''; 108 | module.exports.pdc = ''; 109 | module.exports.pdt = ''; 110 | module.exports.swg = ''; 111 | module.exports.gsw = ''; 112 | module.exports.uln = ''; 113 | module.exports.sxu = ''; 114 | module.exports.wae = ''; 115 | module.exports.wep = ''; 116 | module.exports.hrx = ''; 117 | module.exports.yec = ''; 118 | 119 | // Javanese 120 | module.exports.jav = ''; 121 | module.exports.jvn = ''; 122 | module.exports.jas = ''; 123 | module.exports.osi = ''; 124 | module.exports.tes = ''; 125 | module.exports.kaw = ''; 126 | 127 | // Malay 128 | module.exports.msa = ''; 129 | module.exports.kxd = ''; 130 | module.exports.ind = ''; 131 | module.exports.zsm = ''; 132 | module.exports.jax = ''; 133 | module.exports.meo = ''; 134 | module.exports.kvr = ''; 135 | module.exports.xmm = ''; 136 | module.exports.min = ''; 137 | module.exports.mui = ''; 138 | module.exports.zmi = 
''; 139 | module.exports.max = ''; 140 | module.exports.mfa = ''; 141 | 142 | // Lahnda 143 | module.exports.lah = ''; 144 | module.exports.hnd = ''; 145 | module.exports.hno = ''; 146 | module.exports.jat = ''; 147 | module.exports.phr = ''; 148 | module.exports.skr = ''; 149 | module.exports.xhe = ''; 150 | 151 | // Telugu 152 | module.exports.tel = ''; 153 | 154 | // Vietnamese 155 | module.exports.vie = ''; 156 | 157 | // Marathi 158 | module.exports.mar = ''; 159 | module.exports.omr = ''; 160 | 161 | // French - le français 162 | module.exports.fra = ''; 163 | module.exports.fre = ''; 164 | 165 | // Korean - 한국어 - Hangugeo 166 | module.exports.kor = ''; 167 | module.exports.jje = ''; 168 | module.exports.okm = ''; 169 | module.exports.oko = ''; 170 | 171 | // Tamil 172 | module.exports.tam = ''; 173 | module.exports.oty = ''; 174 | module.exports.ptq = ''; 175 | 176 | // Italian 177 | module.exports.ita = ''; 178 | 179 | // Urdu 180 | module.exports.urd = ''; 181 | 182 | // Tai-Kadai - ภาษาไต - p̣hās̛̄ā tay 183 | module.exports.tai = ''; 184 | 185 | // Thai 186 | module.exports.tha = ''; 187 | 188 | // Tagalog 189 | module.exports.tgl = ''; 190 | module.exports.fil = ''; 191 | 192 | // Swedish 193 | module.exports.swe = ''; 194 | 195 | // Turkish 196 | module.exports.tur = ''; 197 | 198 | // Gujarati 199 | module.exports.guj = ''; 200 | 201 | // Persian 202 | module.exports.fas = ''; 203 | module.exports.pes = ''; 204 | module.exports.prs = ''; 205 | module.exports.tgk = ''; 206 | module.exports.aiq = ''; 207 | module.exports.bhh = ''; 208 | module.exports.haz = ''; 209 | module.exports.jpr = ''; 210 | module.exports.phv = ''; 211 | module.exports.deh = ''; 212 | module.exports.jdt = ''; 213 | module.exports.ttt = ''; 214 | 215 | // Polish 216 | module.exports.pol = ''; 217 | module.exports.szl = ''; 218 | 219 | // Pashto 220 | module.exports.pus = ''; 221 | module.exports.pst = ''; 222 | module.exports.pbu = ''; 223 | module.exports.pbt = ''; 224 | module.exports.wne = ''; 225 | 226 | // Kannada 227 | module.exports.kan = ''; 228 | 229 | // Malayalam 230 | module.exports.mal = ''; 231 | 232 | // Sundanese 233 | module.exports.sun = ''; 234 | 235 | // Hausa 236 | module.exports.hau = ''; 237 | 238 | // Odia 239 | module.exports.ori = ''; 240 | module.exports.ory = ''; 241 | module.exports.spv = ''; 242 | module.exports.bpv = ''; 243 | module.exports.ort = ''; 244 | module.exports.dso = ''; 245 | 246 | // Romanian 247 | module.exports.rum = ''; 248 | module.exports.ron = ''; 249 | 250 | // Dutch 251 | module.exports.dut = ''; 252 | module.exports.nld = ''; 253 | module.exports.vls = ''; 254 | module.exports.zea = ''; 255 | 256 | // Hungarian 257 | module.exports.hun = ''; 258 | module.exports.ohu = ''; 259 | 260 | // Greek 261 | module.exports.gre = ''; 262 | module.exports.ell = ''; 263 | module.exports.grc = ''; 264 | module.exports.cpg = ''; 265 | module.exports.gmy = ''; 266 | module.exports.pnt = ''; 267 | module.exports.tsd = ''; 268 | module.exports.yej = ''; 269 | 270 | // Czech 271 | module.exports.cze = ''; 272 | module.exports.ces = ''; 273 | -------------------------------------------------------------------------------- /test/lib/Result.js: -------------------------------------------------------------------------------- 1 | const Result = require('../../lib/Result'); 2 | 3 | module.exports.constructor = function(test, common) { 4 | test('constructor', function(t) { 5 | const res = new Result(); 6 | 7 | t.equal( typeof res.getSubject, 'function' ); 8 | t.equal( typeof 
res.getObject, 'function' ); 9 | t.equal( typeof res.getPreviousObject, 'function' ); 10 | t.equal( typeof res.getIdsAsArray, 'function' ); 11 | t.equal( typeof res.setMask, 'function' ); 12 | t.equal( typeof res.intersect, 'function' ); 13 | 14 | t.deepEqual( res.group, [] ); 15 | t.deepEqual( res.ids, {} ); 16 | t.deepEqual( res.mask, [] ); 17 | t.deepEqual( res.pos, { subject: -2, object: -1 }); 18 | t.deepEqual( res.reset, false ); 19 | t.equal( typeof res.done, 'function' ); 20 | 21 | t.end(); 22 | }); 23 | 24 | test('constructor - set group', function(t) { 25 | const res = new Result(['a','b','c']); 26 | t.deepEqual( res.group, ['a','b','c'] ); 27 | t.deepEqual( res.mask, [false, false, false] ); 28 | t.end(); 29 | }); 30 | 31 | test('constructor - set group - invalid', function(t) { 32 | const res = new Result({ 0: 'a' }); 33 | t.deepEqual( res.group, [] ); 34 | t.end(); 35 | }); 36 | 37 | test('constructor - set done', function(t) { 38 | const done = function(){ console.error('test'); }; 39 | const res = new Result(undefined, done); 40 | t.equal( res.done, done ); 41 | t.end(); 42 | }); 43 | 44 | test('constructor - set done - invalid', function(t) { 45 | const res = new Result(undefined, {}); 46 | t.equal( typeof res.done, 'function' ); 47 | t.end(); 48 | }); 49 | }; 50 | 51 | module.exports.getSubject = function(test, common) { 52 | test('getSubject', function(t) { 53 | const res = new Result(); 54 | t.equal(res.getSubject(), undefined); 55 | 56 | const res2 = new Result(['a','b','c']); 57 | t.equal(res2.getSubject(), 'b'); 58 | 59 | const res3 = new Result(['a','b','c']); 60 | res3.pos.subject = 0; 61 | t.equal(res3.getSubject(), 'a'); 62 | 63 | t.end(); 64 | }); 65 | }; 66 | 67 | module.exports.getObject = function(test, common) { 68 | test('getObject', function(t) { 69 | const res = new Result(); 70 | t.equal(res.getObject(), undefined); 71 | 72 | const res2 = new Result(['a','b','c']); 73 | t.equal(res2.getObject(), 'c'); 74 | 75 | const res3 = new Result(['a','b','c']); 76 | res3.pos.object = 1; 77 | t.equal(res3.getObject(), 'b'); 78 | 79 | t.end(); 80 | }); 81 | }; 82 | 83 | module.exports.getPreviousObject = function(test, common) { 84 | test('getPreviousObject', function(t) { 85 | const res = new Result(); 86 | t.equal(res.getPreviousObject(), undefined); 87 | 88 | const res2 = new Result(['a','b','c']); 89 | t.equal(res2.getPreviousObject(), undefined); 90 | 91 | const res3 = new Result(['a','b','c']); 92 | res3.pos.prev_object = 1; 93 | t.equal(res3.getPreviousObject(), 'b'); 94 | 95 | t.end(); 96 | }); 97 | }; 98 | 99 | module.exports.getIdsAsArray = function(test, common) { 100 | test('getIdsAsArray', function(t) { 101 | const res = new Result(); 102 | t.deepEqual(res.getIdsAsArray(), []); 103 | 104 | const res2 = new Result(); 105 | res2.ids = { '200': true, '201': true, '202': true }; 106 | t.deepEqual(res2.getIdsAsArray(), [200, 201, 202]); 107 | 108 | t.end(); 109 | }); 110 | }; 111 | 112 | module.exports.setMask = function(test, common) { 113 | test('default mask', function(t) { 114 | const res = new Result(['a','b','c']); 115 | t.deepEqual(res.mask, [false, false, false]); 116 | t.end(); 117 | }); 118 | test('setMask - invalid property', function(t) { 119 | const res = new Result(['a','b','c']); 120 | t.deepEqual(res.mask, [false, false, false]); 121 | res.setMask('invalidproperty', true); 122 | t.deepEqual(res.mask, [false, false, false]); 123 | t.end(); 124 | }); 125 | test('setMask - subject - true', function(t) { 126 | const res = new Result(['a','b','c']); 
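// pos.subject defaults to the second-to-last index, so masking 'subject' here flips index 1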
127 | res.setMask('subject', true); 128 | t.deepEqual(res.mask, [false, true, false]); 129 | t.end(); 130 | }); 131 | test('setMask - subject - truthy', function(t) { 132 | const res = new Result(['a','b','c']); 133 | res.setMask('subject', 'non null string'); 134 | t.deepEqual(res.mask, [false, true, false]); 135 | t.end(); 136 | }); 137 | test('setMask - subject - false', function(t) { 138 | const res = new Result(['a','b','c']); 139 | res.mask = [true, true, true]; 140 | res.setMask('subject', false); 141 | t.deepEqual(res.mask, [true, false, true]); 142 | t.end(); 143 | }); 144 | test('setMask - subject - falsy', function(t) { 145 | const res = new Result(['a','b','c']); 146 | res.mask = [true, true, true]; 147 | res.setMask('subject', null); 148 | t.deepEqual(res.mask, [true, false, true]); 149 | t.end(); 150 | }); 151 | test('setMask - object - true', function(t) { 152 | const res = new Result(['a','b','c']); 153 | res.setMask('object', true); 154 | t.deepEqual(res.mask, [false, false, true]); 155 | t.end(); 156 | }); 157 | test('setMask - object - truthy', function(t) { 158 | const res = new Result(['a','b','c']); 159 | res.setMask('object', 'non null string'); 160 | t.deepEqual(res.mask, [false, false, true]); 161 | t.end(); 162 | }); 163 | test('setMask - object - false', function(t) { 164 | const res = new Result(['a','b','c']); 165 | res.mask = [true, true, true]; 166 | res.setMask('object', false); 167 | t.deepEqual(res.mask, [true, true, false]); 168 | t.end(); 169 | }); 170 | test('setMask - object - falsy', function(t) { 171 | const res = new Result(['a','b','c']); 172 | res.mask = [true, true, true]; 173 | res.setMask('object', null); 174 | t.deepEqual(res.mask, [true, true, false]); 175 | t.end(); 176 | }); 177 | }; 178 | 179 | module.exports.intersect = function(test, common) { 180 | test('intersect - error', function(t) { 181 | const res = new Result(['a','b','c','d','e']); 182 | t.deepEqual( res.pos, { subject: 3, object: 4 }); 183 | res.intersect( 'an error' ); 184 | t.deepEqual( res.pos, { subject: 2, object: 4 }); 185 | t.end(); 186 | }); 187 | test('intersect - no results', function(t) { 188 | const res = new Result(['a','b','c','d','e']); 189 | t.deepEqual( res.pos, { subject: 3, object: 4 }); 190 | res.intersect( null, [] ); 191 | t.deepEqual( res.pos, { subject: 2, object: 4 }); 192 | t.end(); 193 | }); 194 | test('intersect - match', function(t) { 195 | const res = new Result(['a','b','c','d','e']); 196 | t.deepEqual( res.pos, { subject: 3, object: 4 }); 197 | res.intersect( null, [ 198 | { subjectId: 102, objectId: 202 }, 199 | { subjectId: 105, objectId: 205 }, 200 | { subjectId: 100, objectId: 200 } 201 | ]); 202 | t.deepEqual( res.pos, { subject: 2, object: 3 }); 203 | t.deepEqual( res.ids, { 100: true, 102: true, 105: true }); 204 | t.end(); 205 | }); 206 | test('intersect - match parent', function(t) { 207 | const res = new Result(['a','b','c','d','e']); 208 | res.ids = { 200: true, 201: true, 202: true }; 209 | t.deepEqual( res.pos, { subject: 3, object: 4 }); 210 | res.intersect( null, [ 211 | { subjectId: 102, objectId: 202 }, 212 | { subjectId: 100, objectId: 200 }, 213 | { subjectId: 105, objectId: 205 } 214 | ]); 215 | t.deepEqual( res.pos, { subject: 2, object: 3 }); 216 | t.deepEqual( res.ids, { 100: true, 102: true }); 217 | t.end(); 218 | }); 219 | }; 220 | -------------------------------------------------------------------------------- /test/prototype/tokenize.js: -------------------------------------------------------------------------------- 1 
|
2 | const tokenize = require('../../prototype/tokenize');
3 | const PARTIAL_TOKEN_SUFFIX = require('../../lib/analysis').PARTIAL_TOKEN_SUFFIX;
4 |
5 | module.exports.exports = function(test, common) {
6 | test('exports', function(t) {
7 | t.equal( typeof tokenize.tokenize, 'function' );
8 | t.equal( typeof tokenize._indexContainsPhrase, 'function' );
9 | t.equal( typeof tokenize._eachSynonym, 'function' );
10 | t.equal( typeof tokenize._permutations, 'function' );
11 | t.equal( typeof tokenize._queryFilter, 'function' );
12 | t.equal( typeof tokenize._isArrayRangeIsEqual, 'function' );
13 | t.equal( typeof tokenize._groups, 'function' );
14 | t.end();
15 | });
16 | };
17 |
18 | // test if a phrase exists in the index
19 | module.exports._indexContainsPhrase = function(test, common) {
20 | test('_indexContainsPhrase - true', function(t) {
21 | t.plan(3);
22 | var mock = tokenize._indexContainsPhrase.bind({
23 | index: { hasSubject: ( phrase, cb ) => {
24 | t.equals(phrase, 'hello world');
25 | return cb( true );
26 | }}
27 | });
28 |
29 | mock('hello world', (err, bool) => {
30 | t.false(err);
31 | t.true(bool);
32 | });
33 | });
34 | test('_indexContainsPhrase - false', function(t) {
35 | t.plan(3);
36 | var mock = tokenize._indexContainsPhrase.bind({
37 | index: { hasSubject: ( phrase, cb ) => {
38 | t.equals(phrase, 'hello world');
39 | return cb( false );
40 | }}
41 | });
42 |
43 | mock('hello world', (err, bool) => {
44 | t.false(err);
45 | t.false(bool);
46 | });
47 | });
48 | test('_indexContainsPhrase - partial token - true', function(t) {
49 | t.plan(3);
50 | var mock = tokenize._indexContainsPhrase.bind({
51 | index: { hasSubject: ( phrase, cb ) => {
52 | t.equals(phrase, 'hello world' + PARTIAL_TOKEN_SUFFIX);
53 | return cb( true );
54 | }}
55 | });
56 |
57 | mock('hello world' + PARTIAL_TOKEN_SUFFIX, (err, bool) => {
58 | t.false(err);
59 | t.true(bool);
60 | });
61 | });
62 | test('_indexContainsPhrase - partial token - false', function(t) {
63 | t.plan(3);
64 | var mock = tokenize._indexContainsPhrase.bind({
65 | index: { hasSubject: ( phrase, cb ) => {
66 | t.equals(phrase, 'hello world' + PARTIAL_TOKEN_SUFFIX);
67 | return cb( false );
68 | }}
69 | });
70 |
71 | mock('hello world' + PARTIAL_TOKEN_SUFFIX, (err, bool) => {
72 | t.false(err);
73 | t.false(bool);
74 | });
75 | });
76 | };
77 |
78 | // expand each synonym into its permutations and check them against the database.
79 | module.exports._eachSynonym = function(test, common) {
80 | test('_eachSynonym', function(t) {
81 |
82 | const synonym = ['hello', 'big', 'bright', 'new', 'world'];
83 | const expected = [ 'hello big', 'bright', 'new world' ];
84 |
85 | var mock = tokenize._eachSynonym.bind({
86 | index: { hasSubject: ( phrase, cb ) => {
87 | switch( phrase ){
88 | case 'hello big':
89 | case 'hello new':
90 | case 'new world':
91 | case 'bright':
92 | case 'world':
93 | return cb( true );
94 | default:
95 | return cb( false );
96 | }
97 | }}
98 | });
99 |
100 | mock(synonym, (err, phrases) => {
101 | t.false(err);
102 | t.deepEqual(phrases, expected);
103 | t.end();
104 | });
105 | });
106 | };
107 |
108 | // _permutations takes an array of input tokens and produces
109 | // an output array consisting of all the potential adjacent
110 | // groupings of the input tokens up to the defined threshold.
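// An illustrative sketch of the expected output (the test case below is the
// authoritative reference):
//
//   _permutations(['a', 'b', 'c']);
//   // => [ 'a b c', 'a b', 'a', 'b c', 'b', 'c' ]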
111 | module.exports._permutations = function(test, common) {
112 | test('_permutations', function(t) {
113 |
114 | const tokens = ['new', 'south', 'wales'];
115 | const expected = [
116 | 'new south wales',
117 | 'new south',
118 | 'new',
119 | 'south wales',
120 | 'south',
121 | 'wales'
122 | ];
123 |
124 | t.deepEqual(tokenize._permutations(tokens), expected);
125 | t.end();
126 | });
127 | };
128 |
129 | // _queryFilter removes unwanted queries from the array before
130 | // they are returned to the caller.
131 | module.exports._queryFilter = function(test, common) {
132 | test('_queryFilter - remove empty arrays', function(t) {
133 |
134 | const queries = [[], ['a'], [], ['b','c'], [], ['d'], []];
135 | const expected = [['a'], ['b','c'], ['d']];
136 |
137 | t.deepEqual(tokenize._queryFilter(queries), expected);
138 | t.end();
139 | });
140 |
141 | // synonymous groupings
142 | // this removes queries such as `[ B, C ]` where another group such as
143 | // `[ A, B, C ]` exists.
144 | // see: https://github.com/pelias/placeholder/issues/28
145 | test('_queryFilter - synonymous groupings', function(t) {
146 |
147 | const queries = [
148 | ['A','B','C','D'], ['B','C','D'], ['C','D'], ['D'],
149 | ['A','B','C'], ['B','C'], ['C'],
150 | ['A','B']
151 | ];
152 | const expected = [
153 | ['A','B','C','D'],
154 | ['A','B','C'],
155 | ['A','B']
156 | ];
157 |
158 | t.deepEqual(tokenize._queryFilter(queries), expected);
159 | t.end();
160 | });
161 | };
162 |
163 | // _groups takes an array of input tokens, the tokens are first run through
164 | // the _permutations function above, each permutation is looked up in the db.
165 | // this function aims to select the best permutations to use for the query.
166 | // note: it strongly favours the longer token groupings
167 | module.exports._groups = function(test, common) {
168 | test('_groups', function(t) {
169 |
170 | const tokens = ['north', 'sydney', 'new', 'south', 'wales', 'au'];
171 | const phrases = [
172 | 'south wales','new south wales', 'wales', 'north', 'sydney',
173 | 'north sydney', 'south', 'au'
174 | ];
175 | const expected = ['north sydney', 'new south wales', 'au'];
176 |
177 | t.deepEqual(tokenize._groups(tokens, phrases), expected);
178 | t.end();
179 | });
180 |
181 | // https://github.com/pelias/placeholder/issues/231
182 | test('_groups "constructor"', function(t) {
183 |
184 | const tokens = ['constructor'];
185 | const phrases = [];
186 | const expected = [];
187 |
188 | t.deepEqual(tokenize._groups(tokens, phrases), expected);
189 | t.end();
190 | });
191 | };
192 |
193 | // _isArrayRangeIsEqual checks whether the second array matches a contiguous slice of the first array, starting at the given offset (default 0)
194 | module.exports._isArrayRangeIsEqual = function(test, common) {
195 | test('_isArrayRangeIsEqual', function(t) {
196 |
197 | const A = [1, 2, 3, 1, 2, 3];
198 | const B = [1, 2];
199 | const C = [3];
200 |
201 | t.true(tokenize._isArrayRangeIsEqual(A, B));
202 | t.true(tokenize._isArrayRangeIsEqual(A, B, 0));
203 | t.true(tokenize._isArrayRangeIsEqual(A, B, 3));
204 | t.false(tokenize._isArrayRangeIsEqual(A, B, 1));
205 | t.false(tokenize._isArrayRangeIsEqual(A, B, 2));
206 | t.false(tokenize._isArrayRangeIsEqual(A, B, 4));
207 | t.false(tokenize._isArrayRangeIsEqual(A, B, 5));
208 | t.false(tokenize._isArrayRangeIsEqual(A, B, 6));
209 | t.false(tokenize._isArrayRangeIsEqual(A, B, -1));
210 | t.false(tokenize._isArrayRangeIsEqual(A, B, Infinity));
211 |
212 | t.true(tokenize._isArrayRangeIsEqual(A, C, 2));
213 | t.true(tokenize._isArrayRangeIsEqual(A, C, 5));
214 | t.false(tokenize._isArrayRangeIsEqual(A, C));
215 |
t.false(tokenize._isArrayRangeIsEqual(A, C, 0));
216 | t.false(tokenize._isArrayRangeIsEqual(A, C, 1));
217 | t.false(tokenize._isArrayRangeIsEqual(A, C, 3));
218 | t.false(tokenize._isArrayRangeIsEqual(A, C, 4));
219 | t.false(tokenize._isArrayRangeIsEqual(A, C, 6));
220 | t.false(tokenize._isArrayRangeIsEqual(A, C, -1));
221 | t.false(tokenize._isArrayRangeIsEqual(A, C, Infinity));
222 |
223 | t.end();
224 | });
225 | };
226 |
227 |
--------------------------------------------------------------------------------
/test/lib/Queries.js:
--------------------------------------------------------------------------------
1 |
2 | var TokenIndex = require('../../lib/TokenIndex');
3 |
4 | module.exports.constructor = function(test, common) {
5 | test('constructor', function(t) {
6 | var db = new TokenIndex();
7 | t.equal( typeof db._queryBool, 'function' );
8 | t.equal( typeof db._queryAll, 'function' );
9 | t.equal( typeof db.hasSubject, 'function' );
10 | t.equal( typeof db.matchSubjectDistinctSubjectIds, 'function' );
11 | t.equal( typeof db.matchSubjectObject, 'function' );
12 | t.end();
13 | });
14 | };
15 |
16 | module.exports.hasSubject = function(test, common) {
17 | test('hasSubject', function(t) {
18 | var db = new TokenIndex();
19 | db.open('/tmp/db', { test: true, reset: true });
20 |
21 | // prepare some sql statements
22 | const tokens = {
23 | insert: db.prepare('INSERT INTO tokens ( id, lang, tag, token ) VALUES ( $id, $lang, $tag, $token )')
24 | };
25 |
26 | // add some rows to the tokens table
27 | tokens.insert.run({ id: 1, lang: 'en', tag: 'test', token: 'hello world' });
28 | tokens.insert.run({ id: 2, lang: 'fr', tag: 'test', token: 'a b c' });
29 |
30 | // run populate
31 | db.populate();
32 |
33 | t.plan(7);
34 | db.hasSubject('hel', t.false );
35 | db.hasSubject('hello', t.false );
36 | db.hasSubject('hello wor', t.false );
37 | db.hasSubject('hello world', t.true );
38 | db.hasSubject('a', t.false );
39 | db.hasSubject('a b', t.false );
40 | db.hasSubject('a b c', t.true );
41 | });
42 | };
43 |
44 | module.exports.hasSubjectAutocomplete = function(test, common) {
45 | test('hasSubject - autocomplete', function(t) {
46 | var db = new TokenIndex();
47 | db.open('/tmp/db', { test: true, reset: true });
48 |
49 | // prepare some sql statements
50 | const tokens = {
51 | insert: db.prepare('INSERT INTO tokens ( id, lang, tag, token ) VALUES ( $id, $lang, $tag, $token )')
52 | };
53 |
54 | // add some rows to the tokens table
55 | tokens.insert.run({ id: 1, lang: '', tag: '', token: 'hello world' });
56 | tokens.insert.run({ id: 2, lang: '', tag: '', token: 'a b c' });
57 |
58 | // run populate
59 | db.populate();
60 |
61 | t.plan(7);
62 | db.hasSubject('hel\x26', t.true );
63 | db.hasSubject('hello\x26', t.true );
64 | db.hasSubject('hello wor\x26', t.true );
65 | db.hasSubject('hello world\x26', t.true );
66 | db.hasSubject('a\x26', t.true );
67 | db.hasSubject('a b\x26', t.true );
68 | db.hasSubject('a b c\x26', t.true );
69 | });
70 | };
71 |
72 | module.exports.matchSubjectDistinctSubjectIds = function(test, common) {
73 | test('matchSubjectDistinctSubjectIds', function(t) {
74 | var db = new TokenIndex();
75 | db.open('/tmp/db', { test: true, reset: true });
76 |
77 | // prepare some sql statements
78 | const tokens = {
79 | insert: db.prepare('INSERT INTO tokens ( id, lang, tag, token ) VALUES ( $id, $lang, $tag, $token )')
80 | };
81 |
82 | // add some rows to the tokens table
83 | tokens.insert.run({ id: 1, lang: '', tag: '', token: 'hello world' });
84 |
tokens.insert.run({ id: 2, lang: '', tag: '', token: 'a b c' });
85 | tokens.insert.run({ id: 3, lang: '', tag: '', token: 'hello world' });
86 |
87 | // run populate
88 | db.populate();
89 |
90 | // generic failure test
91 | const fail = (err, ids) => {
92 | t.false(err);
93 | t.deepEquals(ids, []);
94 | };
95 |
96 | t.plan(14);
97 | db.matchSubjectDistinctSubjectIds('hel', fail);
98 | db.matchSubjectDistinctSubjectIds('hello', fail);
99 | db.matchSubjectDistinctSubjectIds('hello wor', fail);
100 | db.matchSubjectDistinctSubjectIds('hello world', (err, ids) => {
101 | t.false(err);
102 | t.deepEquals(ids, [
103 | { subjectId: 1 },
104 | { subjectId: 3 }
105 | ]);
106 | });
107 | db.matchSubjectDistinctSubjectIds('a', fail);
108 | db.matchSubjectDistinctSubjectIds('a b', fail);
109 | db.matchSubjectDistinctSubjectIds('a b c', (err, ids) => {
110 | t.false(err);
111 | t.deepEquals(ids, [
112 | { subjectId: 2 }
113 | ]);
114 | });
115 | });
116 | };
117 |
118 | module.exports.matchSubjectAutocompleteDistinctSubjectIds = function(test, common) {
119 | test('matchSubjectDistinctSubjectIds - autocomplete', function(t) {
120 | var db = new TokenIndex();
121 | db.open('/tmp/db', { test: true, reset: true });
122 |
123 | // prepare some sql statements
124 | const tokens = {
125 | insert: db.prepare('INSERT INTO tokens ( id, lang, tag, token ) VALUES ( $id, $lang, $tag, $token )')
126 | };
127 |
128 | // add some rows to the tokens table
129 | tokens.insert.run({ id: 1, lang: '', tag: '', token: 'hello world' });
130 | tokens.insert.run({ id: 2, lang: '', tag: '', token: 'a b c' });
131 | tokens.insert.run({ id: 3, lang: '', tag: '', token: 'hello world' });
132 |
133 | // run populate
134 | db.populate();
135 |
136 | // generic failure test
137 | const fail = (err, ids) => {
138 | t.false(err);
139 | t.deepEquals(ids, []);
140 | };
141 |
142 | const passOne = (err, ids) => {
143 | t.false(err);
144 | t.deepEquals(ids, [
145 | { subjectId: 1 },
146 | { subjectId: 3 }
147 | ]);
148 | };
149 |
150 | const passTwo = (err, ids) => {
151 | t.false(err);
152 | t.deepEquals(ids, [
153 | { subjectId: 2 }
154 | ]);
155 | };
156 |
157 | t.plan(14);
158 | db.matchSubjectDistinctSubjectIds('hel\x26', passOne);
159 | db.matchSubjectDistinctSubjectIds('hello\x26', passOne);
160 | db.matchSubjectDistinctSubjectIds('hello wor\x26', passOne);
161 | db.matchSubjectDistinctSubjectIds('hello world\x26', passOne);
162 | db.matchSubjectDistinctSubjectIds('a\x26', passTwo);
163 | db.matchSubjectDistinctSubjectIds('a b\x26', passTwo);
164 | db.matchSubjectDistinctSubjectIds('a b c\x26', passTwo);
165 | });
166 | };
167 |
168 | module.exports.matchSubjectObject = function(test, common) {
169 | test('matchSubjectObject', function(t) {
170 | var db = new TokenIndex();
171 | db.open('/tmp/db', { test: true, reset: true });
172 |
173 | // prepare some sql statements
174 | const tokens = {
175 | insert: db.prepare('INSERT INTO tokens ( id, lang, tag, token ) VALUES ( $id, $lang, $tag, $token )')
176 | };
177 | const lineage = {
178 | insert: db.prepare('INSERT INTO lineage ( id, pid ) VALUES ( $id, $pid )')
179 | };
180 |
181 | // add some rows to the tokens table
182 | tokens.insert.run({ id: 1, lang: '', tag: '', token: 'paris' });
183 | tokens.insert.run({ id: 2, lang: '', tag: '', token: 'paris' });
184 | tokens.insert.run({ id: 3, lang: '', tag: '', token: 'france' });
185 | tokens.insert.run({ id: 4, lang: '', tag: '', token: 'texas' });
186 |
187 | // add some rows to the lineage table
188 | lineage.insert.run({ id: 1,
pid: 3 });
189 | lineage.insert.run({ id: 2, pid: 4 });
190 |
191 | // run populate
192 | db.populate();
193 |
194 | // generic failure test
195 | const fail = (err, ids) => {
196 | t.false(err);
197 | t.deepEquals(ids, []);
198 | };
199 |
200 | t.plan(10);
201 | db.matchSubjectObject('paris', 'paris', fail);
202 | db.matchSubjectObject('france', 'france', fail);
203 | db.matchSubjectObject('texas', 'texas', fail);
204 |
205 | db.matchSubjectObject('paris', 'france', (err, ids) => {
206 | t.false(err);
207 | t.deepEquals(ids, [
208 | { subjectId: 1, objectId: 3 }
209 | ]);
210 | });
211 |
212 | db.matchSubjectObject('paris', 'texas', (err, ids) => {
213 | t.false(err);
214 | t.deepEquals(ids, [
215 | { subjectId: 2, objectId: 4 }
216 | ]);
217 | });
218 | });
219 | };
220 |
221 | module.exports.matchSubjectObjectAutocomplete = function(test, common) {
222 | test('matchSubjectObject - autocomplete', function(t) {
223 | var db = new TokenIndex();
224 | db.open('/tmp/db', { test: true, reset: true });
225 |
226 | // prepare some sql statements
227 | const tokens = {
228 | insert: db.prepare('INSERT INTO tokens ( id, lang, tag, token ) VALUES ( $id, $lang, $tag, $token )')
229 | };
230 | const lineage = {
231 | insert: db.prepare('INSERT INTO lineage ( id, pid ) VALUES ( $id, $pid )')
232 | };
233 |
234 | // add some rows to the tokens table
235 | tokens.insert.run({ id: 1, lang: '', tag: '', token: 'paris' });
236 | tokens.insert.run({ id: 2, lang: '', tag: '', token: 'paris' });
237 | tokens.insert.run({ id: 3, lang: '', tag: '', token: 'france' });
238 | tokens.insert.run({ id: 4, lang: '', tag: '', token: 'texas' });
239 |
240 | // add some rows to the lineage table
241 | lineage.insert.run({ id: 1, pid: 3 });
242 | lineage.insert.run({ id: 2, pid: 4 });
243 |
244 | // run populate
245 | db.populate();
246 |
247 | // generic failure test
248 | const fail = (err, ids) => {
249 | t.false(err);
250 | t.deepEquals(ids, []);
251 | };
252 |
253 | t.plan(10);
254 | db.matchSubjectObject('paris', 'par\x26', fail);
255 | db.matchSubjectObject('france', 'franc\x26', fail);
256 | db.matchSubjectObject('texas', 'tex\x26', fail);
257 |
258 | db.matchSubjectObject('paris', 'fr\x26', (err, ids) => {
259 | t.false(err);
260 | t.deepEquals(ids, [
261 | { subjectId: 1, objectId: 3 }
262 | ]);
263 | });
264 |
265 | db.matchSubjectObject('paris', 't\x26', (err, ids) => {
266 | t.false(err);
267 | t.deepEquals(ids, [
268 | { subjectId: 2, objectId: 4 }
269 | ]);
270 | });
271 | });
272 | };
273 |
--------------------------------------------------------------------------------
/test/prototype/query.js:
--------------------------------------------------------------------------------
1 | const Result = require('../../lib/Result');
2 | const query = require('../../prototype/query');
3 |
4 | module.exports.exports = function(test, common) {
5 | test('exports', function(t) {
6 | t.equal( typeof query.query, 'function' );
7 | t.equal( typeof query._queryGroup, 'function' );
8 | t.equal( typeof query._queryManyGroups, 'function' );
9 | t.end();
10 | });
11 | };
12 |
13 | module.exports._queryGroup = function(test, common) {
14 | test('_queryGroup - empty group', function(t) {
15 |
16 | const group = [];
17 |
18 | const done = (err, res) => {
19 | t.deepEqual(err, null);
20 | t.deepEqual(res.constructor.name, 'Result');
21 | t.deepEqual(res.getIdsAsArray(), []);
22 | t.deepEqual(res.mask, []);
23 | t.deepEqual(res.group, group);
24 | t.end();
25 | };
26 |
27 | query._queryGroup(null, group, done);
28 | });
29 |
test('_queryGroup - single token - no matches', function(t) { 30 | 31 | const group = ['hello world']; 32 | t.plan(6); 33 | 34 | const index = { 35 | matchSubjectDistinctSubjectIds: ( phrase, cb ) => { 36 | t.equal(phrase, 'hello world'); 37 | return cb( null, new Result() ); 38 | } 39 | }; 40 | 41 | const done = (err, res) => { 42 | t.deepEqual(err, null); 43 | t.deepEqual(res.constructor.name, 'Result'); 44 | t.deepEqual(res.getIdsAsArray(), []); 45 | t.deepEqual(res.mask, [ false ]); 46 | t.deepEqual(res.group, group); 47 | }; 48 | 49 | query._queryGroup(index, group, done); 50 | }); 51 | test('_queryGroup - single token - with matches', function(t) { 52 | 53 | const group = ['hello world']; 54 | t.plan(6); 55 | 56 | const index = { 57 | matchSubjectDistinctSubjectIds: ( phrase, cb ) => { 58 | t.equal(phrase, 'hello world'); 59 | return cb( null, [ 60 | { subjectId: 100 }, 61 | { subjectId: 200 }, 62 | { subjectId: 300 }, 63 | ]); 64 | } 65 | }; 66 | 67 | const done = (err, res) => { 68 | t.deepEqual(err, null); 69 | t.deepEqual(res.constructor.name, 'Result'); 70 | t.deepEqual(res.getIdsAsArray(), [ 100, 200, 300 ]); 71 | t.deepEqual(res.mask, [ true ]); 72 | t.deepEqual(res.group, group); 73 | }; 74 | 75 | query._queryGroup(index, group, done); 76 | }); 77 | test('_queryGroup - multiple tokens - no matches', function(t) { 78 | 79 | const group = ['hello world', 'test', 'foo bar']; 80 | t.plan(10); 81 | 82 | const index = { 83 | matchSubjectObject: ( subject, object, cb ) => { 84 | t.ok(true); 85 | return cb( null, [] ); 86 | }, 87 | matchSubjectDistinctSubjectIds: ( subject, cb ) => { 88 | t.equal(subject, 'foo bar'); 89 | return cb( null, [ 90 | { subjectId: 100 }, 91 | { subjectId: 200 }, 92 | { subjectId: 300 }, 93 | ]); 94 | }, 95 | matchSubjectObjectGeomIntersects: ( subject, object, cb ) => { 96 | t.ok(true); 97 | return cb( null, [] ); 98 | } 99 | }; 100 | 101 | const done = (err, res) => { 102 | t.deepEqual(err, null); 103 | t.deepEqual(res.constructor.name, 'Result'); 104 | t.deepEqual(res.getIdsAsArray(), [ 100, 200, 300 ]); 105 | t.deepEqual(res.mask, [ true, true, false ]); 106 | t.deepEqual(res.group, group); 107 | }; 108 | 109 | query._queryGroup(index, group, done); 110 | }); 111 | test('_queryGroup - multiple tokens - matches', function(t) { 112 | 113 | const group = ['hello world', 'test', 'foo bar']; 114 | t.plan(7); 115 | 116 | const index = { 117 | matchSubjectObject: ( subject, object, cb ) => { 118 | t.ok(true); 119 | switch( subject ){ 120 | case 'hello world': 121 | return cb( null, [ 122 | { subjectId: 100, objectId: 300 }, 123 | { subjectId: 200, objectId: 410 }, 124 | ]); 125 | case 'test': 126 | return cb( null, [ 127 | { subjectId: 300, objectId: 800 }, 128 | { subjectId: 400, objectId: 900 }, 129 | ]); 130 | default: 131 | return cb( null, [ 132 | { subjectId: 800, objectId: 880 }, 133 | { subjectId: 900, objectId: 990 }, 134 | ]); 135 | } 136 | }, 137 | matchSubjectObjectGeomIntersects: ( subject, object, cb ) => { 138 | t.ok(true); 139 | return cb( null, [] ); 140 | } 141 | }; 142 | 143 | const done = (err, res) => { 144 | t.deepEqual(err, null); 145 | t.deepEqual(res.constructor.name, 'Result'); 146 | t.deepEqual(res.getIdsAsArray(), [ 100 ]); 147 | t.deepEqual(res.mask, [ true, true, true ]); 148 | t.deepEqual(res.group, group); 149 | }; 150 | 151 | query._queryGroup(index, group, done); 152 | }); 153 | }; 154 | 155 | module.exports._queryManyGroups = function(test, common) { 156 | test('_queryManyGroups - empty groups', function(t) { 157 | 158 | 
const groups = []; 159 | 160 | const done = (err, res) => { 161 | t.deepEqual(err, null); 162 | t.deepEqual(res.constructor.name, 'Result'); 163 | t.deepEqual(res.getIdsAsArray(), []); 164 | t.deepEqual(res.mask, []); 165 | t.deepEqual(res.group, []); 166 | t.end(); 167 | }; 168 | 169 | query._queryManyGroups(null, groups, done); 170 | }); 171 | test('_queryManyGroups - single group', function(t) { 172 | 173 | t.plan(6); 174 | const groups = [ 175 | ['hello world'], 176 | ]; 177 | 178 | const index = { 179 | matchSubjectDistinctSubjectIds: ( phrase, cb ) => { 180 | t.equal(phrase, 'hello world'); 181 | return cb( null, [ 182 | { subjectId: 100 }, 183 | { subjectId: 200 }, 184 | { subjectId: 300 }, 185 | ]); 186 | } 187 | }; 188 | 189 | const done = (err, res) => { 190 | t.deepEqual(err, null); 191 | t.deepEqual(res.constructor.name, 'Result'); 192 | t.deepEqual(res.getIdsAsArray(), [ 100, 200, 300 ]); 193 | t.deepEqual(res.mask, [ true ]); 194 | t.deepEqual(res.group, groups[0]); 195 | }; 196 | 197 | query._queryManyGroups(index, groups, done); 198 | }); 199 | test('_queryManyGroups - multiple groups', function(t) { 200 | 201 | t.plan(7); 202 | const groups = [ 203 | ['hello world'], 204 | ['hallo welt'], 205 | ]; 206 | 207 | const index = { 208 | matchSubjectDistinctSubjectIds: ( phrase, cb ) => { 209 | t.ok(true); 210 | switch( phrase ){ 211 | case 'hello world': 212 | return cb( null, [ 213 | { subjectId: 100, objectId: 300 }, 214 | { subjectId: 200, objectId: 410 }, 215 | ]); 216 | case 'hallo welt': 217 | return cb( null, [ 218 | { subjectId: 300, objectId: 800 }, 219 | { subjectId: 400, objectId: 900 }, 220 | ]); 221 | default: 222 | return cb( null, [ 223 | { subjectId: 800, objectId: 880 }, 224 | { subjectId: 900, objectId: 990 }, 225 | ]); 226 | } 227 | } 228 | }; 229 | 230 | const done = (err, res) => { 231 | t.deepEqual(err, null); 232 | t.deepEqual(res.constructor.name, 'Result'); 233 | t.deepEqual(res.getIdsAsArray(), [ 100, 200, 300, 400 ]); 234 | t.deepEqual(res.mask, [ true ]); 235 | t.deepEqual(res.group, groups[0]); 236 | }; 237 | 238 | query._queryManyGroups(index, groups, done); 239 | }); 240 | }; 241 | 242 | module.exports.query = function(test, common) { 243 | test('query - empty text', function(t) { 244 | 245 | const text = ''; 246 | const mock = { 247 | tokenize: ( t, cb ) => { 248 | cb( null, [] ); 249 | } 250 | }; 251 | 252 | const done = (err, res) => { 253 | t.deepEqual(err, null); 254 | t.deepEqual(res.constructor.name, 'Result'); 255 | t.deepEqual(res.getIdsAsArray(), []); 256 | t.deepEqual(res.mask, []); 257 | t.deepEqual(res.group, []); 258 | t.end(); 259 | }; 260 | 261 | query.query.call(mock, text, done); 262 | }); 263 | test('query - single group', function(t) { 264 | 265 | t.plan(6); 266 | const text = 'hello world'; 267 | const mock = { 268 | tokenize: ( t, cb ) => { 269 | cb( null, [['hello world']] ); 270 | }, 271 | index: { 272 | matchSubjectDistinctSubjectIds: ( phrase, cb ) => { 273 | t.equal(phrase, 'hello world'); 274 | return cb( null, [ 275 | { subjectId: 100 }, 276 | { subjectId: 200 }, 277 | { subjectId: 300 }, 278 | ]); 279 | } 280 | } 281 | }; 282 | 283 | const done = (err, res) => { 284 | t.deepEqual(err, null); 285 | t.deepEqual(res.constructor.name, 'Result'); 286 | t.deepEqual(res.getIdsAsArray(), [ 100, 200, 300 ]); 287 | t.deepEqual(res.mask, [ true ]); 288 | t.deepEqual(res.group, [ 'hello world' ]); 289 | }; 290 | 291 | query.query.call(mock, text, done); 292 | }); 293 | test('query - multiple groups', function(t) { 294 | 295 | 
t.plan(7);
296 | const text = 'hello world';
297 | const mock = {
298 | tokenize: ( t, cb ) => {
299 | cb( null, [['hello world'], ['hallo welt']] );
300 | },
301 | index: {
302 | matchSubjectDistinctSubjectIds: ( phrase, cb ) => {
303 | t.ok(true);
304 | return cb( null, [
305 | { subjectId: 100 },
306 | { subjectId: 200 },
307 | { subjectId: 300 },
308 | ]);
309 | }
310 | }
311 | };
312 |
313 | const done = (err, res) => {
314 | t.deepEqual(err, null);
315 | t.deepEqual(res.constructor.name, 'Result');
316 | t.deepEqual(res.getIdsAsArray(), [ 100, 200, 300 ]);
317 | t.deepEqual(res.mask, [ true ]);
318 | t.deepEqual(res.group, [ 'hello world' ]);
319 | };
320 |
321 | query.query.call(mock, text, done);
322 | });
323 | };
324 |
--------------------------------------------------------------------------------
/server/demo/index.html:
--------------------------------------------------------------------------------
(markup omitted from this extract)
--------------------------------------------------------------------------------
/prototype/wof.js:
--------------------------------------------------------------------------------
1 |
2 | // plugin for whosonfirst
3 | const _ = require('lodash');
4 | const dir = require('require-dir');
5 | const util = require('util');
6 | const blacklist = require('pelias-blacklist-stream/loader')();
7 | const analysis = require('../lib/analysis');
8 | const language = dir('../config/language');
9 | const LOW_POPULATION_THRESHOLD = 2000;
10 |
11 | // list of languages / tags we favour in cases of deduplication
12 | const LANG_PREFS = ['eng','und'];
13 | const TAG_PREFS = ['preferred','abbr','label','variant','colloquial'];
14 |
15 | // insert a wof record into the index
16 | function insertWofRecord( wof, next ){
17 |
18 | var id = wof['wof:id'];
19 | if( 'string' === typeof id ){ id = parseInt( id, 10 ); }
20 |
21 | // sanity check; because WOF
22 | if( !isValidWofRecord( id, wof ) ) { return next(); }
23 |
24 | // enforce pelias/blacklist-stream exclusions
25 | let peliasGID = util.format('whosonfirst:%s:%d', wof['wof:placetype'], id);
26 | if( blacklist && blacklist.hasOwnProperty( peliasGID ) ) { return next(); }
27 |
28 | // --- document which will be saved in the doc store ---
29 |
30 | const doc = {
31 | id: id,
32 | name: wof['wof:label'] || wof['wof:name'],
33 | abbr: getAbbreviation( wof ),
34 | placetype: wof['wof:placetype'],
35 | rank: getRank( wof['wof:placetype'] ),
36 | population: getPopulation( wof ),
37 | popularity: wof['qs:photo_sum'],
38 | lineage: wof['wof:hierarchy'],
39 | geom: {
40 | area: wof['geom:area'],
41 | bbox: validBoundingBox(wof['lbl:bbox']) || validBoundingBox(wof['geom:bbox']),
42 | lat: wof['lbl:latitude'] || wof['geom:latitude'],
43 | lon: wof['lbl:longitude'] || wof['geom:longitude']
44 | },
45 | names: {}
46 | };
47 |
48 | var tokens = [];
49 | var parentIds = [];
50 |
51 | // --- cast strings to numeric types ---
52 | // note: sometimes numeric properties in WOF can be encoded as strings.
53 |
54 | doc.population = _.toInteger( doc.population ) || undefined;
55 | doc.popularity = _.toInteger( doc.popularity ) || undefined;
56 | doc.geom.area = _.toFinite( doc.geom.area ) || undefined;
57 | doc.geom.lat = _.toFinite( doc.geom.lat );
58 | doc.geom.lon = _.toFinite( doc.geom.lon );
59 |
60 | // --- tokens ---
61 |
62 | // disable adding tokens to the index for the 'empire' placetype.
63 | // this ensures empire records are not retrieved via search.
64 | if( 'empire' !== doc.placetype ){
65 |
66 | // add 'wof:label'
67 | tokens.push({ lang: 'und', tag: 'label', body: wof['wof:label'] });
68 |
69 | // add 'wof:name'
70 | tokens.push({ lang: 'und', tag: 'label', body: wof['wof:name'] });
71 |
72 | // add 'wof:shortcode'
73 | // @todo: wof:abbreviation is deprecated, remove references to it
74 | tokens.push({ lang: 'und', tag: 'abbr', body: wof['wof:shortcode'] || wof['wof:abbreviation'] });
75 |
76 | // add 'ne:abbrev'
77 | // tokens.push({ lang: 'und', body: wof['ne:abbrev'] });
78 |
79 | // fields specific to countries & dependencies
80 | if( 'country' === doc.placetype || 'dependency' === doc.placetype ) {
81 | if( wof['iso:country'] && wof['iso:country'] !== 'XX' ){
82 |
83 | // add 'ne:iso_a2'
84 | tokens.push({ lang: 'und', tag: 'abbr', body: wof['ne:iso_a2'] });
85 |
86 | // add 'ne:iso_a3'
87 | tokens.push({ lang: 'und', tag: 'abbr', body: wof['ne:iso_a3'] });
88 |
89 | // add 'wof:country'
90 | // warning: eg.
        // tokens.push({ lang: 'und', tag: 'abbr', body: wof['wof:country'] });

        // add 'iso:country'
        tokens.push({ lang: 'und', tag: 'abbr', body: wof['iso:country'] });

        // add 'wof:country_alpha3'
        tokens.push({ lang: 'und', tag: 'abbr', body: wof['wof:country_alpha3'] });
      }
    }

    // note: skip all `name:*` fields when we suspect that they were sourced from
    // machine transliteration via WikiData.
    // see: https://github.com/whosonfirst-data/whosonfirst-data/issues/799
    const hasDeadOrObscureLanguages = _.has(wof, 'name:vol_x_preferred');
    const isLowOrUnknownPopulation = _.get(doc, 'population', 0) < LOW_POPULATION_THRESHOLD;
    const isMegaCity = _.get(wof, 'wof:megacity', 0) === 1;
    const isCapitalCity = !_.isEmpty(_.get(wof, 'wof:capital_of'));
    const isLikelyTransliterated = (
      hasDeadOrObscureLanguages && isLowOrUnknownPopulation && !isMegaCity && !isCapitalCity
    );
    if (!isLikelyTransliterated) {

      // add 'name:*' fields
      for( var attr in wof ){
        // https://github.com/whosonfirst/whosonfirst-names
        // names: preferred|colloquial|variant|unknown
        const match = attr.match(/^name:([a-z]{3})_x_(preferred|colloquial|variant)$/);
        if (!match) { continue; }

        // Fix for https://github.com/pelias/placeholder/pull/126
        // Transform iso codes 639-2/B to 639-2/T
        const lang = language.alternatives[match[1]] || match[1];

        // skip languages in the blacklist, see config file for more info
        if( language.blacklist.hasOwnProperty( match[1] ) ){ continue; }

        // skip if both iso codes 639-2/B and 639-2/T are present and the current iso is 639-2/B
        if ( lang !== match[1] && wof[ 'name:' + lang + '_x_' + match[2] ]) { continue; }

        // index each alternative name
        for( var n in wof[ attr ] ){
          tokens.push({
            lang: lang,
            tag: match[2],
            body: wof[ attr ][ n ]
          });
        }

        // doc - only store 'preferred' strings
        if( match[2] === 'preferred' ){
          doc.names[ lang ] = wof[ attr ];
        }
      }

    }
  }

  // In the USA we would like to favor the 'wof:label' property over the 'name:eng_x_preferred' property.
  if( 'US' === wof['iso:country'] && wof['wof:label'] ){
    doc.names.eng = [ wof['wof:label'] ];
  }

  // --- graph ---

  // parent_id property (some records have this property set but no hierarchy)
  var parentId;
  if( wof.hasOwnProperty('wof:parent_id') ){
    parentId = wof['wof:parent_id'];
    if( 'string' === typeof parentId ){ parentId = parseInt( parentId, 10 ); }
    if( !isNaN( parentId ) && parentId !== id && parentId > 0 ){
      parentIds.push( parentId ); // is child of
    }
  }

  // hierarchy properties
  for( var h in wof['wof:hierarchy'] ){
    for( var i in wof['wof:hierarchy'][h] ){
      var pid = wof['wof:hierarchy'][h][i];
      if( 'string' === typeof pid ){ pid = parseInt( pid, 10 ); }
      if( pid === id || pid <= 0 || pid === parentId ){ continue; }
      // parentIds.push( id, pid, 'p' ); // has parent
      parentIds.push( pid ); // is child of
    }
  }

  // ---- consume aggregates

  // normalize tokens
  tokens = tokens.reduce(( res, token ) => {
    analysis.normalize( token.body ).forEach( norm => {
      res.push({ lang: token.lang, tag: token.tag, body: norm });
    });
    return res;
  }, []);

  // sort tokens (for optimal deduplication)
  tokens.sort((i1, i2) => {

    // sort by language
    const l1 = LANG_PREFS.indexOf(i1.lang);
    const l2 = LANG_PREFS.indexOf(i2.lang);

    if (l1 === -1){ return +1; }
    if (l2 === -1){ return -1; }
    if (l1 > l2){ return +1; }
    if (l1 < l2){ return -1; }

    // sort by tag
    const t1 = TAG_PREFS.indexOf(i1.tag);
    const t2 = TAG_PREFS.indexOf(i2.tag);

    if (t1 === -1){ return +1; }
    if (t2 === -1){ return -1; }
    if (t1 > t2){ return +1; }
    if (t1 < t2){ return -1; }

    return 0;
  });

  // deduplicate tokens
  var seen = {};
  tokens = tokens.filter( token => {
    if( seen.hasOwnProperty( 'eng:' + token.body ) ){ return false; }
    if( seen.hasOwnProperty( 'und:' + token.body ) ){ return false; }
    const key = token.lang + ':' + token.body;
    return seen.hasOwnProperty( key ) ? false : ( seen[ key ] = true );
  });
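  // note: the sort above places 'eng' and 'und' tokens first (see LANG_PREFS),
  // so they are registered in `seen` first and win over duplicates in other languages.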

  // deduplicate parent ids
  parentIds = parentIds.filter(( pid, pos ) => {
    return parentIds.indexOf( pid ) === pos;
  });

  // save all data to the databases
  this.store.set( id, doc, ( err ) => {
    if( err ){ console.error( err ); }
    this.index.setTokens( id, tokens, ( err ) => {
      if( err ){ console.error( err ); }
      this.index.setLineage( id, parentIds, ( err ) => {
        if( err ){ console.error( err ); }
        next();
      });
    });
  });
}

// check if value is a valid number
function isFiniteNumber( value ){
  return !_.isEmpty(_.trim( value )) && _.isFinite(_.toNumber( value ));
}

function isValidWofRecord( id, wof ){

  // sanity check inputs
  if( !id || !wof ) { return false; }

  // sanity check; because WOF
  if( id <= 0 ) { return false; }

  // skip deprecated records
  const deprecated = _.trim( wof['edtf:deprecated'] );
  if( !_.isEmpty( deprecated ) && deprecated !== 'uuuu' ){
    return false;
  }

  // skip superseded records
  const superseded = wof['wof:superseded_by'];
  if( Array.isArray( superseded ) && superseded.length > 0 ){
    return false;
  }

  /**
    skip non-current records

    0 signifies a non-current record
    1 signifies a current record
    -1 signifies an indeterminate state, someone needs to look at this record and decide

    note: we are considering -1 values as current (for now)
  **/
  const isCurrent = wof['mz:is_current'];
  if( isCurrent === '0' || isCurrent === 0 ){
    return false;
  }

  // invalid latitude
  if( !isFiniteNumber(wof['lbl:latitude']) && !isFiniteNumber(wof['geom:latitude']) ){
    return false;
  }

  // invalid longitude
  if( !isFiniteNumber(wof['lbl:longitude']) && !isFiniteNumber(wof['geom:longitude']) ){
    return false;
  }

  return true;
}

// this function favors mz:population when available, falling back to other properties.
// see: https://github.com/whosonfirst-data/whosonfirst-data/issues/240#issuecomment-294907374
function getPopulation( wof ) {
  if( wof['mz:population'] ){ return wof['mz:population']; }
  else if( wof['wof:population'] ){ return wof['wof:population']; }
  else if( wof['wk:population'] ){ return wof['wk:population']; }
  else if( wof['gn:population'] ){ return wof['gn:population']; }
  else if( wof['gn:pop'] ){ return wof['gn:pop']; }
  else if( wof['qs:pop'] ){ return wof['qs:pop']; }
  else if( wof['qs:gn_pop'] ){ return wof['qs:gn_pop']; }
  else if( wof['zs:pop10'] ){ return wof['zs:pop10']; }
  else if( wof['meso:pop'] ){ return wof['meso:pop']; }
  else if( wof['statoids:population'] ){ return wof['statoids:population']; }
  else if( wof['ne:pop_est'] ){ return wof['ne:pop_est']; }
}

// abbreviations and ISO codes
// logic copied from: pelias/whosonfirst src/components/extractFields.js (since modified)
// @todo: wof:abbreviation is deprecated, remove references to it
function getAbbreviation( wof ) {
  if( 'country' === wof['wof:placetype'] || 'dependency' === wof['wof:placetype'] ) {
    return wof['wof:country_alpha3'] || wof['ne:iso_a3'];
  } else if( wof['wof:shortcode'] || wof['wof:abbreviation'] ) {
    return wof['wof:shortcode'] || wof['wof:abbreviation'];
  }
}

const PLACETYPE_RANK = [
  'venue', 'address', 'building', 'campus', 'microhood', 'neighbourhood', 'macrohood', 'borough', 'postalcode',
  'locality', 'metro area', 'localadmin', 'county', 'macrocounty', 'region', 'macroregion', 'marinearea',
  'disputed', 'dependency', 'country', 'empire', 'continent', 'ocean', 'planet'
];

// this function returns the numeric rank range of a placetype, based on its position in PLACETYPE_RANK
function getRank( placetype ){
  var rank = PLACETYPE_RANK.indexOf((placetype || '').toLowerCase().trim());
  return {
    min: rank,
    max: rank + 1
  };
}

// this function validates and returns the bbox property verbatim, else undefined
// see: https://github.com/pelias/placeholder/issues/183
// format: minx, miny, maxx, maxy
function validBoundingBox(bbox) {
  if (!_.isString(bbox)) { return; }
  const coords = bbox.split(',');
  if (coords.length !== 4) { return; }
  const floats = coords.map(c => parseFloat(c));
  if (floats.some(isNaN)) { return; }
  if (floats[0] > floats[2]) { return; }
  if (floats[1] > floats[3]) { return; }
  return bbox;
}

module.exports.insertWofRecord = insertWofRecord;
module.exports.isValidWofRecord = isValidWofRecord;
module.exports.getPopulation = getPopulation;
module.exports.getAbbreviation = getAbbreviation;
module.exports.validBoundingBox = validBoundingBox;
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
A modular, open-source search engine for our world.

Pelias is a geocoder powered completely by open data, available freely to everyone.

Local Installation · Cloud Webservice · Documentation · Community Chat

What is Pelias?

Pelias is a search engine for places worldwide, powered by open data. It turns addresses and place names into geographic coordinates, and turns geographic coordinates into places and addresses. With Pelias, you’re able to turn your users’ place searches into actionable geodata and transform your geodata into real places.

We think open data, open source, and open strategy win over proprietary solutions at any part of the stack and we want to ensure the services we offer are in line with that vision. We believe that an open geocoder improves over the long-term only if the community can incorporate truly representative local knowledge.

# Pelias coarse geocoder

This repository provides all the code & geographic data you'll need to run your own coarse geocoder.

Read our [An (almost) one line coarse geocoder with Docker](https://geocode.earth/blog/2019/almost-one-line-coarse-geocoding) blog post for a quick start guide and [check out our demo](https://placeholder.demo.geocode.earth).

This service is intended to be run as part of the [Pelias Geocoder](https://github.com/pelias/pelias) but can just as easily be run independently, as it has no external dependencies.

## Natural language parser for geographic text

The engine takes unstructured input text, such as 'Neutral Bay North Sydney New South Wales', and attempts to deduce the geographic area the user is referring to.

Human beings (familiar with Australian geography) are able to quickly scan the text and establish that there are 3 distinct token groups: 'Neutral Bay', 'North Sydney' & 'New South Wales'.

The engine uses a similar technique to our brains, scanning across the text, cycling through a dictionary of learned terms and trying to establish logical token groups.

Once token groups have been established, a reductive algorithm is used to ensure that the token groups are logical in a geographic context. We don't want to return New York City for a term such as 'nyc france', so we need to only return things called 'nyc' *inside* places called 'france'.

The engine starts from the rightmost group and works to the left, ensuring token groups represent geographic entities contained *within* those which came before. This process is repeated until it either runs out of groups or would return 0 results.

The best estimation is then returned, either as a set of integers representing the ids of those regions, or as a JSON structure which also contains additional information such as population counts etc.

The data is sourced from the [whosonfirst](https://github.com/whosonfirst-data/whosonfirst-data) project, which also includes translations of place names into different languages.

Placeholder supports searching on and retrieving tokens in different languages, and also offers support for synonyms and abbreviations.

The engine includes a rudimentary language detection algorithm which attempts to detect right-to-left languages and languages which write their addresses in major-to-minor format. It will then reverse the tokens to re-order them into minor-to-major ordering.
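you can also call the tokenizer directly from code. the following is a minimal sketch, assuming a built `data/store.sqlite3` and the `Placeholder` API used by `cmd/loadtest.js` and the repl examples further down:

```javascript
// sketch: group unstructured input text into logical token groups
const Placeholder = require('./Placeholder');

const ph = new Placeholder();
ph.load();

ph.tokenize('neutral bay north sydney new south wales', (err, groups) => {
  // each inner array is one candidate grouping of the input,
  // e.g. [ [ 'neutral bay', 'north sydney', 'new south wales' ] ]
  console.log(groups);
});
```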

---

## Requirements

Placeholder requires Node.js and SQLite.

See [Pelias software requirements](https://github.com/pelias/documentation/blob/master/requirements.md) for required and recommended versions.

## Install

```bash
$ git clone git@github.com:pelias/placeholder.git && cd placeholder
$ npm install
```

### Download the required database files

Data hosting is provided by [Geocode Earth](https://geocode.earth). Other Pelias related downloads are available at https://geocode.earth/data.

```bash
$ mkdir data
$ curl -s https://data.geocode.earth/placeholder/store.sqlite3.gz | gunzip > data/store.sqlite3;
```

### Confirm the build was successful

```bash
$ npm test
```

```bash
$ npm run cli -- san fran

> pelias-placeholder@1.0.0 cli
> node cmd/cli.js "san" "fran"

san fran

took: 3ms
 - 85922583 locality San Francisco
```

---

## Run server

```bash
$ PORT=6100 npm start;
```

#### Configuration via Environment Variables

The service supports additional environment variables that affect its operation:

| Environment Variable | Default | Description |
| -------------------- | ------- | ----------- |
| `HOST` | `undefined` | The network address that the placeholder service will bind to. Defaults to whatever the current Node.js default is, which is currently to listen on `0.0.0.0` (all interfaces). See the [Node.js Net documentation](https://nodejs.org/api/net.html#net_server_listen_port_host_backlog_callback) for more information. |
| `PORT` | `3000` | The TCP port that the placeholder service will use for incoming network connections |
| `PLACEHOLDER_DATA` | `../data/` | Path to the directory where the placeholder service will find the `store.sqlite3` database file. |

### Open browser

the server should now be running and you should be able to access the http API:

```bash
http://localhost:6100/
```

try the following paths:

```javascript
/demo
/parser/search?text=london
/parser/findbyid?ids=101748479
/parser/query?text=london
/parser/tokenize?text=sydney new south wales
```

### Changing languages

the `/parser/search` endpoint accepts a `?lang=xxx` parameter which can be used to vary the language of the data returned.

for example, the following urls will return strings in Japanese / Russian where available:

```javascript
/parser/search?text=germany&lang=jpn
/parser/search?text=germany&lang=rus
```

documents returned by `/parser/search` contain a boolean property named `languageDefaulted` which indicates whether the service was able to find a translation in the language you requested (false) or whether it returned the default language (true).

The `/parser/findbyid` endpoint also accepts a `?lang=xxx` parameter which will return the selected lang if the translation exists, and all translations otherwise.

for example, the following urls will return strings in French / Korean where available:

```javascript
/parser/findbyid?ids=85633147,102191581,85862899&lang=fra
/parser/findbyid?ids=85633147,102191581,85862899&lang=kor
```

the demo is also able to serve responses in different languages by providing the language code in the URL anchor:

```bash
/demo#jpn
/demo#chi
/demo#eng
/demo#fra
... etc.
```
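to check the language fallback programmatically, a sketch along these lines should work (assumes Node 18+ for the global `fetch`, a server running locally as above, and that the response is a JSON array of documents carrying the `languageDefaulted` property described here):

```javascript
// sketch: request German placenames in Japanese and inspect fallbacks
fetch('http://localhost:6100/parser/search?text=germany&lang=jpn')
  .then(res => res.json())
  .then(docs => {
    docs.forEach(doc => {
      // languageDefaulted === true means no 'jpn' translation was found,
      // so the default language was returned instead
      console.log(doc.name, doc.languageDefaulted);
    });
  });
```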

### Filtering by placetype

the `/parser/search` endpoint accepts a `?placetype=xxx` parameter which can be used to control the placetype of records which are returned.

the filter does not provide any performance benefit; it is simply a convenience API which matches results against a whitelist of placetypes.

you may specify multiple placetypes using a comma to separate them, such as `?placetype=xxx,yyy`; these are matched as OR conditions, eg: (xxx OR yyy)

for example:

the query `search?text=luxemburg` will return results for the `country`, `region`, `locality` etc.

you can use the placetype filter to control which records are returned:

```
# all matching results
search?text=luxemburg

# only return matching country records
search?text=luxemburg&placetype=country

# return matching country or region records
search?text=luxemburg&placetype=country,region
```

### Live mode (BETA)

the `/parser/search` endpoint accepts a `?mode=live` parameter which can be used to enable an autocomplete-style API.

in this mode the final token of each input text is considered 'incomplete', meaning that the user has potentially only typed part of a token.

this mode is currently in BETA; the interface and behaviour may change over time.

### Configuring the rtree threshold

the default matching strategy uses the `lineage` table to ensure that token pairs represent a valid child->parent relationship. this ensures that queries like 'London France' do not match, because there is no entry in the lineage table linking those two places together.

in some cases it's preferable to fall back to a matching strategy which considers geographically nearby places with a matching name, even if that relationship does not explicitly exist in the lineage table.

for example, 'Basel France' will return 'Basel Switzerland'. this is useful for handling user input errors, as well as errors and omissions in the lineage table.

in the example above, 'Basel France' only matches because the bounding box of 'Basel' overlaps the bounding box of 'France' and no other valid entry for 'Basel France' exists.

the definition of what is 'nearby' is configurable: the bbox for the minor term (left token) is expanded by a threshold (the threshold is added to or subtracted from each of the bbox vertices).

by default the threshold is set to `0.2` (degrees); any float value between 0 and 1 may be specified via the environment variable `RTREE_THRESHOLD`.

a setting of less than 0 will disable the rtree functionality completely. disabling the rtree will result in nearby queries such as 'Basel France' returning 'France' instead of 'Basel Switzerland'.
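for illustration, the expansion amounts to roughly the following (a sketch only; the bbox format `minx, miny, maxx, maxy` matches `validBoundingBox` in `prototype/wof.js`):

```javascript
// sketch: expand a bbox [minx, miny, maxx, maxy] by a threshold in degrees
const THRESHOLD = parseFloat(process.env.RTREE_THRESHOLD || '0.2');

function expandBBox([minx, miny, maxx, maxy], threshold = THRESHOLD) {
  // grow the box outwards on every side; 'nearby' candidates are those
  // whose own bbox overlaps this expanded box
  return [minx - threshold, miny - threshold, maxx + threshold, maxy + threshold];
}

console.log(expandBBox([7.55, 47.51, 7.63, 47.58])); // a bbox roughly around Basel
```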

---

## Run the interactive shell

```bash
$ npm run repl

> pelias-placeholder@1.0.0 repl
> node cmd/repl.js

placeholder >
```

try the following commands:

```javascript
placeholder > london on
 - 101735809 locality London

placeholder > search london on
 - 101735809 locality London

placeholder > tokenize sydney new south wales
[ [ 'sydney', 'new south wales' ] ]

placeholder > token kelburn
[ 1729339019 ]

placeholder > id 1729339019
{ name: 'Kelburn',
  placetype: 'neighbourhood',
  lineage:
   { continent_id: 102191583,
     country_id: 85633345,
     county_id: 102079339,
     locality_id: 101915529,
     neighbourhood_id: 1729339019,
     region_id: 85687233 },
  names: { eng: [ 'Kelburn' ] } }
```

---

## Configuration for pelias API

While Placeholder can be used as a stand-alone application or included with other geographic software / search engines, it is designed for the [Pelias geocoder](https://github.com/pelias/pelias).

To connect the Placeholder service to the Pelias API, [configure the pelias config file](https://github.com/pelias/api#configuration-via-pelias-config) with the port that placeholder is running on.
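for reference, a minimal sketch of that configuration might look like the following (this assumes the standard pelias config layout and the port used in the examples above; consult the pelias/api documentation linked above for the authoritative format):

```json
{
  "api": {
    "services": {
      "placeholder": {
        "url": "http://localhost:6100"
      }
    }
  }
}
```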

---

## Tests

### run the test suite

```bash
$ npm test
```

### Run the functional cases

there are more exhaustive test cases included in `test/cases/`.

to run all the test cases:

```bash
$ npm run funcs
```

### Generate a ~500,000 line test file

this command requires the `data/wof.extract` file mentioned below in the 'Building the database' section.

```bash
$ npm run gentests
```

once complete you can find the generated test cases in `test/cases/generated.txt`.

---

## Docker

### Build the service image

```bash
$ docker-compose build
```

### Run the service in the background

```bash
$ docker-compose up -d
```

---

## Building the database

### Prerequisites
- jq 1.5+ must be installed
  - on ubuntu: `sudo apt-get install jq`
  - on mac: `brew install jq`
- Who's on First data download
  - use the download script in [pelias/whosonfirst](https://github.com/pelias/whosonfirst#downloading-the-data)

### Steps
the database is created from geographic data sourced from the [whosonfirst](https://whosonfirst.org/) project.

the whosonfirst project is distributed as geojson files, so in order to speed up development we first extract the relevant data into a file: `data/wof.extract`.

the following command will iterate over all the `geojson` files downloaded by the Pelias whosonfirst importer, extracting the relevant properties into the file `data/wof.extract`.

this process can take 30-60 minutes to run and consumes ~350MB of disk space; you will only need to run this command once, or again whenever your local `whosonfirst-data` files are updated.

```bash
$ npm run extract

# alternative if you do not have a `pelias.json` file specifying where WOF data should be
$ WOF_DIR=/path/to/your/whosonfirst/data npm run extract
```

now you can rebuild the `data/store.sqlite3` file with the following command:

this should take 2-3 minutes to run:

```bash
$ npm run build
```

---

## Using the Docker image

### Rebuild the image

you can rebuild the image on any system with the following command:

```bash
$ docker build -t pelias/placeholder .
```

### Download pre-built image

Up to date Docker images are built and automatically pushed to Docker Hub from our continuous integration pipeline.

You can pull the latest stable image with:

```bash
$ docker pull pelias/placeholder
```

### Download custom image tags

We publish each commit and the latest of each branch to separate tags.

A list of all available tags to download can be found at https://hub.docker.com/r/pelias/placeholder/tags/
--------------------------------------------------------------------------------