├── .dockerignore ├── .github └── workflows │ ├── _test.yml │ ├── pull_request.yml │ └── push.yml ├── .gitignore ├── .jshintignore ├── .jshintrc ├── .npmrc ├── Dockerfile ├── LICENSE ├── README.md ├── bin ├── download ├── parallel ├── start └── units ├── import.js ├── lib ├── analysis │ ├── Token.js │ ├── dictionaries │ │ └── en │ │ │ ├── diagonal_contractions.txt │ │ │ ├── directional_expansions.txt │ │ │ ├── street_types_overrides.txt │ │ │ └── street_types_usps.txt │ ├── dictionary.js │ ├── ordinals.js │ └── synonyms.js ├── cleanup.js ├── cleanup_v2.js ├── importPipeline.js ├── isValidCsvRecord.js ├── parameters.js └── streams │ ├── cleanupStream.js │ ├── contentHashStream.js │ ├── documentStream.js │ ├── gnafMapperStream.js │ ├── isUSorCAHouseNumberZero.js │ ├── recordStream.js │ ├── unitSplittingMapperStream.js │ └── validRecordFilterStream.js ├── package.json ├── schema.js ├── test ├── analysis.js ├── cleanup_v2.js ├── data │ ├── au │ │ ├── input_file_3.csv │ │ └── input_file_4.csv │ ├── expected.json │ ├── input_file_1.csv │ └── input_file_2.csv ├── functional.js ├── import.js ├── importPipeline.js ├── isValidCsvRecord.js ├── openaddresses_bad_data.csv ├── openaddresses_sample.csv ├── parameters.js ├── schema.js ├── streams │ ├── cleanupStream.js │ ├── contentHashStream.js │ ├── documentStream.js │ ├── gnafMapperStream.js │ ├── isUSorCAHouseNumberZero.js │ ├── recordStream.js │ └── unitSplittingMapperStream.js └── test.js └── utils ├── OpenAddressesAPI.js ├── download_all.js ├── download_data.js └── download_filtered.js /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | node_modules 3 | -------------------------------------------------------------------------------- /.github/workflows/_test.yml: -------------------------------------------------------------------------------- 1 | name: Unit Tests 2 | on: workflow_call 3 | jobs: 4 | unit-tests: 5 | runs-on: '${{ matrix.os }}' 6 | strategy: 7 | matrix: 
8 | os: 9 | - ubuntu-22.04 10 | node-version: [ 18.x, 20.x, 22.x ] 11 | steps: 12 | - uses: actions/checkout@v4 13 | - name: 'Install node.js ${{ matrix.node-version }}' 14 | uses: actions/setup-node@v4 15 | with: 16 | node-version: '${{ matrix.node-version }}' 17 | - name: Run unit tests 18 | run: | 19 | [[ -f ./bin/ci-setup ]] && ./bin/ci-setup 20 | npm install 21 | npm run ci 22 | -------------------------------------------------------------------------------- /.github/workflows/pull_request.yml: -------------------------------------------------------------------------------- 1 | name: Continuous Integration 2 | on: pull_request 3 | jobs: 4 | unit-tests: 5 | # only run this job for forks 6 | if: github.event.pull_request.head.repo.full_name != github.repository 7 | uses: ./.github/workflows/_test.yml 8 | -------------------------------------------------------------------------------- /.github/workflows/push.yml: -------------------------------------------------------------------------------- 1 | name: Continuous Integration 2 | on: push 3 | jobs: 4 | unit-tests: 5 | uses: ./.github/workflows/_test.yml 6 | npm-publish: 7 | needs: unit-tests 8 | if: github.ref == 'refs/heads/master' && needs.unit-tests.result == 'success' 9 | runs-on: ubuntu-22.04 10 | steps: 11 | - uses: actions/checkout@v4 12 | - name: Install Node.js 13 | uses: actions/setup-node@v4 14 | with: 15 | node-version: 20.x 16 | - name: Run semantic-release 17 | env: 18 | GH_TOKEN: ${{ secrets.GH_SEMANTIC_RELEASE_TOKEN }} 19 | NPM_TOKEN: ${{ secrets.NPM_TOKEN }} 20 | run: > 21 | if [[ -n "$GH_TOKEN" && -n "$NPM_TOKEN" ]]; then 22 | curl "https://raw.githubusercontent.com/pelias/ci-tools/master/semantic-release.sh" | bash - 23 | fi 24 | build-docker-images: 25 | # run this job if the unit tests passed and the npm-publish job was a success or was skipped 26 | # note: github actions won't run a job if you don't call one of the status check functions, so `always()` is called since it evalutes to `true` 27 
| if: ${{ always() && needs.unit-tests.result == 'success' && (needs.npm-publish.result == 'success' || needs.npm-publish.result == 'skipped') }} 28 | needs: [unit-tests, npm-publish] 29 | runs-on: ubuntu-22.04 30 | steps: 31 | - uses: actions/checkout@v4 32 | - name: Build Docker images 33 | env: 34 | DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }} 35 | DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }} 36 | run: | 37 | curl "https://raw.githubusercontent.com/pelias/ci-tools/master/build-docker-images.sh" | bash - 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | *.log 3 | -------------------------------------------------------------------------------- /.jshintignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | -------------------------------------------------------------------------------- /.jshintrc: -------------------------------------------------------------------------------- 1 | { 2 | "esversion": 8, 3 | "node": true, 4 | "curly": true, 5 | "eqeqeq": true, 6 | "freeze": true, 7 | "immed": true, 8 | "indent": 2, 9 | "latedef": false, 10 | "newcap": true, 11 | "noarg": true, 12 | "noempty": true, 13 | "nonbsp": true, 14 | "nonew": true, 15 | "plusplus": false, 16 | "quotmark": "single", 17 | "undef": true, 18 | "unused": true, 19 | "maxparams": 4, 20 | "maxdepth": 4, 21 | "maxlen": 120 22 | } 23 | -------------------------------------------------------------------------------- /.npmrc: -------------------------------------------------------------------------------- 1 | package-lock=false 2 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # base image 2 | FROM pelias/baseimage 3 | 4 | # downloader apt dependencies 5 | # note: 
this is done in one command in order to keep down the size of intermediate containers 6 | RUN apt-get update && apt-get install --no-install-recommends -y unzip awscli && rm -rf /var/lib/apt/lists/* 7 | 8 | # change working dir 9 | ENV WORKDIR /code/pelias/openaddresses 10 | WORKDIR ${WORKDIR} 11 | 12 | # copy package.json first to prevent npm install being rerun when only code changes 13 | COPY ./package.json ${WORKDIR} 14 | RUN npm install && npm cache clean --force; 15 | 16 | # copy code into image 17 | ADD . ${WORKDIR} 18 | 19 | # run tests 20 | RUN npm test 21 | 22 | # run as the pelias user 23 | USER pelias 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Mapzen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 |

4 |

A modular, open-source search engine for our world.

5 |

Pelias is a geocoder powered completely by open data, available freely to everyone.

6 |

7 | 8 | 9 | 10 |

11 |

12 | Local Installation · 13 | Cloud Webservice · 14 | Documentation · 15 | Community Chat 16 |

17 |
18 | What is Pelias? 19 |
20 | Pelias is a search engine for places worldwide, powered by open data. It turns addresses and place names into geographic coordinates, and turns geographic coordinates into places and addresses. With Pelias, you’re able to turn your users’ place searches into actionable geodata and transform your geodata into real places. 21 |

22 | We think open data, open source, and open strategy win over proprietary solutions at any part of the stack and we want to ensure the services we offer are in line with that vision. We believe that an open geocoder improves over the long-term only if the community can incorporate truly representative local knowledge. 23 |
24 | 25 | # Pelias OpenAddresses importer 26 | 27 | [![Greenkeeper badge](https://badges.greenkeeper.io/pelias/openaddresses.svg)](https://greenkeeper.io/) 28 | 29 | ## Overview 30 | 31 | The OpenAddresses importer is used to process data from [OpenAddresses](http://openaddresses.io/) 32 | for import into the Pelias geocoder. 33 | 34 | ## Requirements 35 | 36 | Node.js is required. See [Pelias software requirements](https://github.com/pelias/documentation/blob/master/requirements.md) for supported versions. 37 | 38 | ## Installation 39 | 40 | > For instructions on setting up Pelias as a whole, see our [getting started guide](https://github.com/pelias/documentation/blob/master/getting_started_install.md). Further instructions here pertain to the OpenAddresses importer only 41 | 42 | ```bash 43 | git clone https://github.com/pelias/openaddresses 44 | cd openaddresses 45 | npm install 46 | ``` 47 | 48 | ## Data Download 49 | Use the `imports.openaddresses.files` configuration option to limit the download to just the OpenAddresses files of interest. 50 | Refer to the [OpenAddresses data listing]( http://results.openaddresses.io/?runs=all#runs) for file names. 51 | 52 | ```bash 53 | npm run download 54 | ``` 55 | 56 | ## Usage 57 | ```bash 58 | # show full command line options 59 | node import.js --help 60 | 61 | # run an import 62 | npm start 63 | ``` 64 | 65 | ## Admin Lookup 66 | OpenAddresses records do not contain information about which city, state (or 67 | other region like province), or country that they belong to. Pelias has the 68 | ability to compute these values from [Who's on First](http://whosonfirst.mapzen.com/) data. 69 | For more info on how admin lookup works, see the documentation for 70 | [pelias/wof-admin-lookup](https://github.com/pelias/wof-admin-lookup). By default, 71 | adminLookup is enabled. To disable, set `imports.adminLookup.enabled` to `false` in Pelias config. 
72 | 73 | **Note:** Admin lookup requires loading around 5GB of data into memory. 74 | 75 | ## Configuration 76 | This importer can be configured in [pelias-config](https://github.com/pelias/config), in the `imports.openaddresses` 77 | hash. A sample configuration file might look like this: 78 | 79 | ```javascript 80 | { 81 | "esclient": { 82 | "hosts": [ 83 | { 84 | "env": "development", 85 | "protocol": "http", 86 | "host": "localhost", 87 | "port": 9200 88 | } 89 | ] 90 | }, 91 | "logger": { 92 | "level": "debug" 93 | }, 94 | "imports": { 95 | "whosonfirst": { 96 | "datapath": "/mnt/data/whosonfirst/", 97 | "importPostalcodes": false, 98 | "importVenues": false 99 | }, 100 | "openaddresses": { 101 | "datapath": "/mnt/data/openaddresses/", 102 | "files": [ "us/ny/city_of_new_york.csv" ] 103 | } 104 | } 105 | } 106 | ``` 107 | 108 | The following configuration options are supported by this importer. 109 | 110 | ### `imports.openaddresses.datapath` 111 | 112 | * Required: yes 113 | * Default: `` 114 | 115 | The absolute path to a directory where OpenAddresses data is located. The download command will also automatically place downloaded files in this directory. 116 | 117 | ### `imports.openaddresses.files` 118 | 119 | * Required: no 120 | * Default: `[]` 121 | 122 | An array of OpenAddresses files to be downloaded (full list can be found on the 123 | [OpenAddresses results site](http://results.openaddresses.io/?runs=all#runs)). 124 | If no files are specified, the full planet data files (11GB+) will be 125 | downloaded. 126 | 127 | ### `imports.openaddresses.missingFilesAreFatal` 128 | 129 | * Required: no 130 | * Default: `false` 131 | 132 | If set to true, any missing files will immediately halt the importer with an 133 | error. Otherwise, the importer will continue processing with a warning. The 134 | data downloader will also continue if any download errors were encountered with this set to false. 
135 | 136 | ### `imports.openaddresses.dataHost` 137 | 138 | * Required: no 139 | * Default: `https://data.openaddresses.io` 140 | 141 | The location from which to download OpenAddresses data from. By default, the 142 | primary OpenAddresses servers will be used. This can be overrriden to allow 143 | downloading customized data. Paths are supported (for example, 144 | `https://yourhost.com/path/to/your/data`), but must not end with a trailing 145 | slash. 146 | 147 | S3 buckets are supported. Files will be downloaded using aws-cli. 148 | 149 | For example: `s3://data.openaddresses.io`. 150 | 151 | Note: When using s3, you might need authentcation (IAM instance role, env vars, etc.) 152 | 153 | ### `imports.openaddresses.s3Options` 154 | 155 | * Required: no 156 | 157 | If `imports.openaddresses.dataHost` is an s3 bucket, this will add options to the command. 158 | For example: `--profile my-profile` 159 | 160 | This is useful, for example, when downloading from `s3://data.openaddresses.io`, 161 | as they require the requester to pay for data transfer. 162 | You can then use the following option: `--request-payer` 163 | 164 | ### `imports.openaddresses.token` 165 | * Required: no 166 | * Default: Shared token for the pelias project 167 | 168 | Since openaddresses moved from [results.openaddresses.io](https://results.openaddresses.io) to [batch.openaddresses.io](https://batch.openaddresses.io), you need a token to access the data. There is a default shared token for the Pelias project, but if you want to use it seriously, create your own account and token on [batch.openaddresses.io. ](https://batch.openaddresses.io) to avoid possible throttling/bandwidth limit or (temporary) suspension. 169 | 170 | 171 | ## Parallel Importing 172 | 173 | Because OpenAddresses consists of many small files, this importer can be configured to run several instances in parallel that coordinate to import all the data. 
174 | 175 | To use this functionality, replace calls to `npm start` with 176 | 177 | ```bash 178 | npm run parallel 3 # replace 3 with your desired level of paralellism 179 | ``` 180 | 181 | Generally, a parallelism of 2 or 3 is suitable for most tasks. 182 | -------------------------------------------------------------------------------- /bin/download: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | exec node utils/download_data.js 4 | -------------------------------------------------------------------------------- /bin/parallel: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # grab the number of workers count 4 | count=$1 5 | 6 | # remove the first argument from the arguments array ($@) 7 | shift 8 | 9 | # only do anything if count is a valid integer >= 1 10 | if [[ $count -gt 1 ]]; then 11 | echo "openaddresses: starting $count parallel builds" 12 | 13 | # spawn $count parallel builds, passing correct params and all arguments 14 | for i in `seq 0 $(($count-1))`; do 15 | cmd="./bin/start --parallel-count $count --parallel-id $i $@" 16 | $cmd & 17 | done 18 | 19 | # don't let this script finish until all parallel builds have finished 20 | wait 21 | else 22 | # invalid count value, run normal build 23 | exec ./bin/start $@ 24 | fi 25 | -------------------------------------------------------------------------------- /bin/start: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | exec node --max_old_space_size=8000 import.js $@ 4 | -------------------------------------------------------------------------------- /bin/units: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # run tests with pipefail to avoid false passes 4 | # see https://github.com/pelias/pelias/issues/744 5 | set -euo pipefail 6 | 7 | node test/test.js | npx tap-spec 8 | 
-------------------------------------------------------------------------------- /import.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @file Entry-point script for the OpenAddresses import pipeline. 3 | */ 4 | 5 | var peliasConfig = require( 'pelias-config' ).generate(require('./schema')); 6 | 7 | var logger = require( 'pelias-logger' ).get( 'openaddresses' ); 8 | 9 | var parameters = require( './lib/parameters' ); 10 | var importPipeline = require( './lib/importPipeline' ); 11 | 12 | const adminLookupStream = require('pelias-wof-admin-lookup'); 13 | 14 | // Pretty-print the total time the import took. 15 | function startTiming() { 16 | var startTime = new Date().getTime(); 17 | process.on( 'exit', function (){ 18 | var totalTimeTaken = (new Date().getTime() - startTime).toString(); 19 | var seconds = totalTimeTaken.slice(0, totalTimeTaken.length - 3); 20 | var milliseconds = totalTimeTaken.slice(totalTimeTaken.length - 3); 21 | logger.info( 'Total time taken: %s.%ss', seconds, milliseconds ); 22 | }); 23 | } 24 | 25 | var args = parameters.interpretUserArgs( process.argv.slice( 2 ) ); 26 | 27 | const adminLayers = ['neighbourhood', 'borough', 'locality', 'localadmin', 28 | 'county', 'macrocounty', 'region', 'macroregion', 'dependency', 'country', 29 | 'empire', 'continent']; 30 | 31 | if( 'exitCode' in args ){ 32 | ((args.exitCode > 0) ? 
console.error : console.info)( args.errMessage ); 33 | process.exit( args.exitCode ); 34 | } else { 35 | startTiming(); 36 | 37 | if (peliasConfig.imports.openaddresses.hasOwnProperty('adminLookup')) { 38 | logger.info('imports.openaddresses.adminLookup has been deprecated, ' + 39 | 'enable adminLookup using imports.adminLookup.enabled = true'); 40 | } 41 | 42 | var files = parameters.getFileList(peliasConfig, args); 43 | 44 | const importer_id = args['parallel-id']; 45 | let importer_name = 'openaddresses'; 46 | 47 | if (importer_id !== undefined) { 48 | importer_name = `openaddresses-${importer_id}`; 49 | } 50 | 51 | importPipeline.create( files, args.dirPath, adminLookupStream.create(adminLayers), importer_name); 52 | } 53 | -------------------------------------------------------------------------------- /lib/analysis/Token.js: -------------------------------------------------------------------------------- 1 | const _ = require('lodash'); 2 | 3 | class Token { 4 | constructor(body) { 5 | this.body = _.isString(body) ? 
body : ''; 6 | } 7 | 8 | isValid() { 9 | return _.isString(this.body) && !_.isEmpty(this.body); 10 | } 11 | 12 | isNumeric() { 13 | return /^\d+$/.test(this.body); 14 | } 15 | 16 | findCase() { 17 | if (this.body === _.toLower(this.body)) { return Token.LOWERCASED; } 18 | if (this.body === _.toUpper(this.body)) { return Token.UPPERCASED; } 19 | return Token.MIXEDCASED; 20 | } 21 | 22 | removeLeadingZeros() { 23 | this.body = this.body.replace(/^(?:0*)([1-9]\d*(st|nd|rd|th))/, '$1'); 24 | } 25 | 26 | selectivelyLowerCase() { 27 | if (this.findCase() === Token.UPPERCASED) { 28 | this.body = _.toLower(this.body); 29 | } 30 | } 31 | 32 | selectivelyUpperCase() { 33 | if (this.findCase() === Token.LOWERCASED && this.body.endsWith('.')) { 34 | this.body = _.toUpper(this.body); 35 | } 36 | } 37 | 38 | selectivelyCapitalize() { 39 | if (this.findCase() === Token.LOWERCASED) { 40 | this.body = this.body.split(/\s+/).map(word => _.capitalize(word)).join(' '); 41 | } 42 | } 43 | } 44 | 45 | Token.LOWERCASED = 0; 46 | Token.UPPERCASED = 1; 47 | Token.MIXEDCASED = 2; 48 | 49 | module.exports = Token; 50 | -------------------------------------------------------------------------------- /lib/analysis/dictionaries/en/diagonal_contractions.txt: -------------------------------------------------------------------------------- 1 | SE|southeast 2 | SW|southwest 3 | NE|northeast 4 | NW|northwest 5 | -------------------------------------------------------------------------------- /lib/analysis/dictionaries/en/directional_expansions.txt: -------------------------------------------------------------------------------- 1 | north|n 2 | south|s 3 | east|e 4 | west|w 5 | SE|se 6 | NE|ne 7 | SW|sw 8 | NW|nw 9 | -------------------------------------------------------------------------------- /lib/analysis/dictionaries/en/street_types_overrides.txt: -------------------------------------------------------------------------------- 1 | concourse|conc 2 | 
-------------------------------------------------------------------------------- /lib/analysis/dictionaries/en/street_types_usps.txt: -------------------------------------------------------------------------------- 1 | alley|ally|aly 2 | anex|annx|anx 3 | arcade|arc 4 | avenue|av|aven|avenu|avn|avnue|ave 5 | bayou|bayoo|byu 6 | beach|bch 7 | bend|bnd 8 | bluff|bluf|blf 9 | bluffs|blfs 10 | bottom|bot|bottm|btm 11 | boulevard|boul|boulv|blvd 12 | branch|brnch|br 13 | bridge|brdge|brg 14 | brook|brk 15 | brooks|brks 16 | burg|bg 17 | burgs|bgs 18 | bypass|bypa|bypas|byps|byp 19 | camp|cmp|cp 20 | canyon|canyn|cnyn|cyn 21 | cape|cpe 22 | causeway|causwa|cswy 23 | center|cen|cent|centr|cnter|cntr|ctr 24 | centers|ctrs 25 | circle|circ|circl|crcl|crcle|cir 26 | circles|cirs 27 | cliff|clf 28 | cliffs|clfs 29 | club|clb 30 | common|cmn 31 | commons|cmns 32 | corner|cor 33 | corners|cors 34 | course|crse 35 | court|ct 36 | courts|cts 37 | cove|cv 38 | coves|cvs 39 | creek|crk 40 | crescent|crsent|crsnt|cres 41 | crest|crst 42 | crossing|crssng|xing 43 | crossroad|xrd 44 | crossroads|xrds 45 | curve|curv 46 | dale|dl 47 | dam|dm 48 | divide|div|dvd|dv 49 | drive|driv|drv|dr 50 | drives|drs 51 | estate|est 52 | estates|ests 53 | expressway|exp|expr|expw|expy 54 | extension|extn|extnsn|ext 55 | extensions|exts 56 | falls|fls 57 | field|fld 58 | fields|flds 59 | flat|flt 60 | flats|flts 61 | ford|frd 62 | fords|frds 63 | forest|frst 64 | forge|forg|frg 65 | forges|frgs 66 | fork|frk 67 | forks|frks 68 | fort|frt|ft 69 | freeway|freewy|frway|frwy|fwy 70 | garden|gardn|grden|grdn|gdn 71 | gardens|grdns|gdns 72 | gateway|gatewy|gatway|gtway|gtwy 73 | glen|gln 74 | glens|glns 75 | green|grn 76 | greens|grns 77 | grove|grov|grv 78 | groves|grvs 79 | harbor|harb|harbr|hrbor|hbr 80 | harbors|hbrs 81 | haven|hvn 82 | heights|ht|hts 83 | highway|highwy|hiway|hiwy|hway|hwy 84 | hill|hl 85 | hills|hls 86 | hollows|holws 87 | hollow|hllw|holw 88 | inlet|inlt 89 | island|islnd|is 90 | 
islands|islnds|iss 91 | junction|jction|jctn|junctn|juncton|jct 92 | junctions|jctns|jcts 93 | key|ky 94 | keys|kys 95 | knoll|knol|knl 96 | knolls|knls 97 | lake|lk 98 | lakes|lks 99 | landing|lndng|lndg 100 | lane|ln 101 | light|lgt 102 | lights|lgts 103 | loaf|lf 104 | lock|lck 105 | locks|lcks 106 | lodge|ldge|lodg|ldg 107 | manor|mnr 108 | manors|mnrs 109 | meadow|mdw 110 | meadows|medows|mdws 111 | mill|ml 112 | mills|mls 113 | mission|missn|mssn|msn 114 | motorway|mtwy 115 | mount|mnt|mt 116 | mountain|mntain|mntn|mountin|mtin|mtn 117 | mountains|mntns|mtns 118 | neck|nck 119 | orchard|orchrd|orch 120 | oval|ovl 121 | overpass|opas 122 | parkway|parkwy|pkway|pky|parkways|pkwys|pkwy 123 | passage|psge 124 | pine|pne 125 | pines|pnes 126 | place|pl 127 | plain|pln 128 | plains|plns 129 | plaza|plza|plz 130 | point|pt 131 | points|pts 132 | port|prt 133 | ports|prts 134 | prairie|prr|pr 135 | radial|rad|radiel|radl 136 | ranch|ranches|rnchs|rnch 137 | rapid|rpd 138 | rapids|rpds 139 | rest|rst 140 | ridge|rdge|rdg 141 | ridges|rdgs 142 | river|rvr|rivr|riv 143 | road|rd 144 | roads|rds 145 | route|rte 146 | shoal|shl 147 | shoals|shls 148 | shore|shoar|shr 149 | shores|shoars|shrs 150 | skyway|skwy 151 | spring|spng|sprng|spg 152 | springs|spngs|sprngs|spgs 153 | square|sqr|sqre|squ|sq 154 | squares|sqrs|sqs 155 | station|statn|stn|sta 156 | stravenue|strav|straven|stravn|strvn|strvnue|stra 157 | stream|streme|strm 158 | street|strt|str|st 159 | streets|sts 160 | summit|sumit|sumitt|smt 161 | terrace|terr|ter 162 | throughway|trwy 163 | trace|traces|trce 164 | track|tracks|trk|trks|trak 165 | trafficway|trfy 166 | trail|trails|trls|trl 167 | trailer|trlrs|trlr 168 | tunnel|tunel|tunls|tunnels|tunnl|tunl 169 | turnpike|trnpk|turnpk|tpke 170 | underpass|upas 171 | union|un 172 | unions|uns 173 | valley|vally|vlly|vly 174 | valleys|vlys 175 | viaduct|vdct|viadct|via 176 | view|vw 177 | views|vws 178 | village|vill|villag|villg|villiage|vlg 179 | villages|vlgs 180 
| ville|vl 181 | vista|vist|vst|vsta|vis 182 | way|wy 183 | well|wl 184 | wells|wls 185 | -------------------------------------------------------------------------------- /lib/analysis/dictionary.js: -------------------------------------------------------------------------------- 1 | const _ = require('lodash'); 2 | const fs = require('fs'); 3 | const path = require('path'); 4 | 5 | /** 6 | load a libpostal dictionary from disk 7 | eg: https://raw.githubusercontent.com/openvenues/libpostal/master/resources/dictionaries/en/street_types.txt 8 | 9 | libpostal format: 10 | The leftmost string is treated as the canonical/normalized version. 11 | Synonyms if any, are appended to the right, delimited by the pipe character. 12 | 13 | see: https://github.com/openvenues/libpostal/tree/master/resources/dictionaries 14 | 15 | output example: 16 | { 17 | 'bruecke': 'bruecke', 18 | 'brücke': 'bruecke', 19 | 'brucke': 'bruecke', 20 | 'br.': 'bruecke' 21 | } 22 | */ 23 | 24 | // regular expression to target removal of common punctuation 25 | const PUNCTUATION_REGEX = /[.,\/#!$%\^&\*;:{}=\-_`~()]/g; 26 | 27 | module.exports = (opts) => { 28 | 29 | /** 30 | * options 31 | * 32 | * countryCode (string) -- country-code corresponding to a subdirectory in the the ./directories folder 33 | * filename (string) -- the name of the file to load inside the directory mentioed above 34 | * includeSelfReferences (bool) -- whether to also include the canonical synonym in the map 35 | * minLength (int) -- minimum valid length for a synonym in the dictionary 36 | */ 37 | const options = _.defaults({}, opts, { 38 | includeSelfReferences: false, 39 | minLength: 0 40 | }); 41 | 42 | try { 43 | const filepath = path.resolve(__dirname, 'dictionaries', options.countryCode, options.filename); 44 | const file = fs.readFileSync(filepath).toString(); 45 | const lines = file.trim().split('\n'); 46 | 47 | const map = lines.reduce((obj, line) => { 48 | var cols = line.trim().split('|'); 49 | 50 | // remove 
multi-word synonyms from all but the first position 51 | cols = cols.filter((col, pos) => (pos === 0) || !/[\s]/.test(col)); 52 | 53 | cols.forEach((col, pos) => { 54 | if (!options.includeSelfReferences && 0 === pos) { return; } // skip first column ( the expansion ) 55 | if (col.replace(PUNCTUATION_REGEX).length < (options.minLength || 0)) { return; } // skip very short synonyms 56 | 57 | // warn user when duplicate terms are added to the map 58 | if (obj.hasOwnProperty(col)){ 59 | console.warn(`[${options.filename}] trying to replace ${col}=>${obj[col]} with ${col}=>${cols[0]}`); 60 | } 61 | 62 | obj[col] = cols[0]; 63 | }); 64 | return obj; 65 | }, {}); 66 | 67 | return map; 68 | } 69 | catch (e) { 70 | return {}; 71 | } 72 | }; 73 | -------------------------------------------------------------------------------- /lib/analysis/ordinals.js: -------------------------------------------------------------------------------- 1 | const _ = require('lodash'); 2 | 3 | // The ordinal function replaces all numeric street names (ie. 30 street) 4 | // with a version including ordinals (ie. 30th street). 
5 | // note: this is currently only configured for the English language 6 | 7 | function ordinals(opts) { 8 | return (tokens) => { 9 | 10 | // consider all but final token 11 | for (var o = 0; o < tokens.length-1; o++) { 12 | 13 | // token must be entirely numeric 14 | if (!tokens[o].isNumeric()) { continue; } 15 | 16 | // token must be followed by a street type token 17 | if (!_.has(opts.dict.streetTypes, _.toLower(tokens[o+1].body))) { continue; } 18 | 19 | // token must either be the leftmost token or be preceeded by a directional token 20 | if(o !== 0) { 21 | if (!_.has(opts.dict.directionalExpansions, _.toLower(tokens[o-1].body))) { 22 | continue; 23 | } 24 | } 25 | 26 | // append the english ordinal suffix 27 | tokens[o].body += englishOrdinalSuffix(tokens[o].body); 28 | 29 | // maximum of one replacement 30 | break; 31 | } 32 | 33 | return tokens; 34 | }; 35 | } 36 | 37 | function englishOrdinalSuffix(i) { 38 | const j = i % 10, k = i % 100; 39 | if (j === 1 && k !== 11) { return 'st'; } 40 | if (j === 2 && k !== 12) { return 'nd'; } 41 | if (j === 3 && k !== 13) { return 'rd'; } 42 | return 'th'; 43 | } 44 | 45 | module.exports = ordinals; 46 | -------------------------------------------------------------------------------- /lib/analysis/synonyms.js: -------------------------------------------------------------------------------- 1 | const _ = require('lodash'); 2 | 3 | // The synonyms function replaces all matching occurrences of tokens in the 4 | // supplied dictionary. 5 | // Some options are provided to control the iteration and termination behaviour 6 | // of the replacer. 
7 | // @todo: this does not currently handle matching onmulti-word synonyms 8 | // (although it's technically possible to do so if desired at a later date) 9 | 10 | function synonyms(opts) { 11 | /** 12 | * options 13 | * 14 | * dictionary -- the dictionary to use for looking up replacements 15 | * maxElements -- maximum elements to iterate 16 | * maxReplacements -- maximum replacements which can be made 17 | * direction -- default is iterating left-to-right through the array, use 'left' for the inverse 18 | * predicate -- after a match is found this function must return true before the substitution occurs 19 | * ignore -- run on each token before matching against the dictionary, must return true or the token is skipped 20 | * normalizer -- control how the token is normalized before matching occurs 21 | * 22 | * return function(tokens) => tokens 23 | */ 24 | const options = _.defaults({}, opts, { 25 | dictionary: {}, 26 | maxElements: Infinity, 27 | maxReplacements: Infinity, 28 | direction: 'right', 29 | predicate: () => true, 30 | ignore: () => false, 31 | normalizer: (body) => _.trim(_.toLower(body), '.') 32 | }); 33 | 34 | // iterate from right-to-left 35 | if (options.direction === 'left') { 36 | return (tokens) => { 37 | var seen = 0; // keep track of how many elements we've seen 38 | var replaced = 0; // keep track of how many elements we've replaced 39 | 40 | // iterate over tokens in reverse order 41 | for (var o = tokens.length - 1; o >= 0; o--) { 42 | 43 | // support $ignore 44 | if (options.ignore(tokens[o], o, tokens)) { continue; } 45 | 46 | // support $maxElements 47 | if (++seen > options.maxElements) { break; } 48 | 49 | // search for replacement in dictionary 50 | var replacement = _.get(options.dictionary, options.normalizer(tokens[o].body)); 51 | if (replacement) { 52 | 53 | // support $predicate 54 | if (!options.predicate(tokens[o], o, tokens)) { continue; } 55 | 56 | // perform replacement 57 | tokens[o].body = replacement; 58 | 59 | // 
support $maxReplacements 60 | replaced++; 61 | if (replaced >= options.maxReplacements) { break; } 62 | } 63 | } 64 | 65 | return tokens; 66 | }; 67 | } 68 | 69 | // iterate from left-to-right 70 | return (tokens) => { 71 | var seen = 0; // keep track of how many elements we've seen 72 | var replaced = 0; // keep track of how many elements we've replaced 73 | 74 | // iterate over tokens in normal order 75 | for (var o = 0; o < tokens.length; o++) { 76 | 77 | // support $ignore 78 | if (options.ignore(tokens[o], o, tokens)){ continue; } 79 | 80 | // support $maxElements 81 | if (++seen > options.maxElements) { break; } 82 | 83 | // search for replacement in dictionary 84 | var replacement = _.get(options.dictionary, options.normalizer(tokens[o].body)); 85 | if (replacement) { 86 | 87 | // support $predicate 88 | if (!options.predicate(tokens[o], o, tokens)) { continue; } 89 | 90 | // perform replacement 91 | tokens[o].body = replacement; 92 | 93 | // support $maxReplacements 94 | replaced++; 95 | if (replaced >= options.maxReplacements) { break; } 96 | } 97 | } 98 | 99 | return tokens; 100 | }; 101 | } 102 | 103 | module.exports = synonyms; 104 | -------------------------------------------------------------------------------- /lib/cleanup.js: -------------------------------------------------------------------------------- 1 | var _ = require('lodash'); 2 | 3 | function removeLeadingZerosFromStreet(token) { 4 | return token.replace(/^(?:0*)([1-9]\d*(st|nd|rd|th))/,'$1'); 5 | } 6 | 7 | const directionals = ['NE', 'NW', 'SE', 'SW']; 8 | 9 | function capitalizeProperly(token){ 10 | const lowercase = token.toLowerCase(); 11 | const uppercase = token.toUpperCase(); 12 | 13 | // token is a directional, return uppercase variant 14 | if (directionals.includes(uppercase)) { 15 | return uppercase; 16 | } 17 | 18 | // token is all lowercase or all uppercase, return capitalized variant 19 | if (token === lowercase || token === uppercase) { 20 | return _.capitalize(token); 21 | } 
22 | 23 | return token; 24 | } 25 | 26 | function cleanupStreetName(input) { 27 | // split streetname into tokens by whitespace 28 | return input.split(/\s/) 29 | .map(removeLeadingZerosFromStreet) 30 | // remove empty tokens 31 | .filter(function(part){ 32 | return part.length > 0; 33 | }).map(capitalizeProperly) 34 | .join(' '); 35 | } 36 | 37 | module.exports = { 38 | streetName: cleanupStreetName 39 | }; 40 | -------------------------------------------------------------------------------- /lib/cleanup_v2.js: -------------------------------------------------------------------------------- 1 | const _ = require('lodash'); 2 | const dictionary = require('./analysis/dictionary'); 3 | const synonyms = require('./analysis/synonyms'); 4 | const ordinals = require('./analysis/ordinals'); 5 | const Token = require('./analysis/Token'); 6 | 7 | /** 8 | * This file contains a street name normalization algorithm 9 | * which attempts to convert poorly formatted street names 10 | * into a more standardized and aesthetically pleasing form. 11 | * 12 | * I've written up some more information about the potential 13 | * pitfalls of doing this which explain why the code will always 14 | * tend to err on the side of caution. 15 | * 16 | * see: https://github.com/pelias/openaddresses/pull/477 17 | * 18 | * At time of writing the code follows this method: 19 | * 1. If the text is uppercase, with minor exceptions, lowercase it 20 | * 2. Expand the 'generic' portion of the name 21 | * 3. Expand the 'directional' portion of the name 22 | * 4. 
Capitalize all lowercased words 23 | */ 24 | 25 | // load dictionaries from disk 26 | const dict = { 27 | directionalExpansions: dictionary({ 28 | countryCode: 'en', 29 | filename: 'directional_expansions.txt', 30 | includeSelfReferences: true 31 | }), 32 | diagonalContractions: dictionary({ 33 | countryCode: 'en', 34 | filename: 'diagonal_contractions.txt', 35 | includeSelfReferences: false 36 | }), 37 | streetTypes: _.merge( 38 | dictionary({ 39 | countryCode: 'en', 40 | filename: 'street_types_usps.txt', 41 | includeSelfReferences: true, 42 | minLength: 2 43 | }), 44 | dictionary({ 45 | countryCode: 'en', 46 | filename: 'street_types_overrides.txt', 47 | includeSelfReferences: true, 48 | minLength: 2 49 | }) 50 | ) 51 | }; 52 | 53 | function cleanupStreetName(input) { 54 | // split by whitespace 55 | const words = input.split(/\s+/); 56 | 57 | // convert strings to objects 58 | var tokens = words.map(word => new Token(word)); 59 | 60 | // remove leading zeros from housenumbers 61 | tokens.forEach(token => token.removeLeadingZeros()); 62 | 63 | // if the token is all uppercase then try to lowercase it 64 | tokens.forEach(token => token.selectivelyLowerCase()); 65 | 66 | // if the token is identified as an abbreviation then uppercase it 67 | tokens.forEach(token => token.selectivelyUpperCase()); 68 | 69 | // street 'generic' expansion ie. the 'St.' or 'Rd.' 
portion 70 | if (tokens.length >= 2){ 71 | tokens = synonyms({ 72 | dictionary: dict.streetTypes, 73 | maxElements: 1, 74 | maxReplacements: 1, 75 | direction: 'left', 76 | 77 | // ignore tokens in the directionalExpansions dict 78 | ignore: (token) => _.has(dict.directionalExpansions, _.toLower(token.body)) 79 | })(tokens); 80 | } 81 | 82 | // directional expansions (leftmost token) 83 | if (tokens.length >= 3) { 84 | tokens = synonyms({ 85 | dictionary: dict.directionalExpansions, 86 | maxElements: 1, 87 | maxReplacements: 1, 88 | predicate: (token, pos, tokens) => { 89 | // perform a look-ahead on the next token 90 | // and ensure it's not in the streetTypes dict 91 | const next = tokens[pos+1]; 92 | if (!_.isObjectLike(next)){ return true; } 93 | return !_.has(dict.streetTypes, _.toLower(next.body)); 94 | } 95 | })(tokens); 96 | } 97 | 98 | // directional expansions (rightmost token) 99 | if (tokens.length >= 3) { 100 | tokens = synonyms({ 101 | dictionary: dict.directionalExpansions, 102 | maxElements: 1, 103 | maxReplacements: 1, 104 | direction: 'left' 105 | })(tokens); 106 | } 107 | 108 | // diagonal contractions (all tokens) 109 | if (tokens.length >= 3) { 110 | tokens = synonyms({ 111 | dictionary: dict.diagonalContractions, 112 | maxReplacements: 1, 113 | direction: 'left' 114 | })(tokens); 115 | } 116 | 117 | // capitalize lowercased tokens (leaving mixed case tokens unchanged) 118 | tokens.forEach(token => token.selectivelyCapitalize()); 119 | 120 | // add ordinals to english numeric street names 121 | tokens = ordinals({ dict })(tokens); 122 | 123 | // convert objects to strings and join by whitespace 124 | return tokens.map(token => token.body).join(' '); 125 | } 126 | 127 | module.exports = { 128 | streetName: cleanupStreetName 129 | }; 130 | -------------------------------------------------------------------------------- /lib/importPipeline.js: -------------------------------------------------------------------------------- 1 | const logger = 
require('pelias-logger').get('openaddresses'); 2 | const recordStream = require('./streams/recordStream'); 3 | const model = require('pelias-model'); 4 | const peliasDbclient = require('pelias-dbclient'); 5 | const blacklistStream = require('pelias-blacklist-stream'); 6 | const isUSorCAHouseNumberZero = require('./streams/isUSorCAHouseNumberZero'); 7 | 8 | /** 9 | * Import all OpenAddresses CSV files in a directory into Pelias elasticsearch. 10 | * 11 | * @param {array of string} files An array of the absolute file-paths to import. 12 | * @param {object} opts Options to configure the import. Supports the following 13 | * keys: 14 | * 15 | * adminValues: Add admin values to each address object (since 16 | * OpenAddresses doesn't contain any) using `admin-lookup`. See the 17 | * documentation: https://github.com/pelias/admin-lookup 18 | */ 19 | function createFullImportPipeline( files, dirPath, adminLookupStream, importerName ){ // jshint ignore:line 20 | logger.info( 'Importing %s files.', files.length ); 21 | 22 | recordStream.create(files, dirPath) 23 | .pipe(blacklistStream()) 24 | .pipe(adminLookupStream) 25 | .pipe(isUSorCAHouseNumberZero.create()) 26 | .pipe(model.createDocumentMapperStream()) 27 | .pipe(peliasDbclient({name: importerName})); 28 | } 29 | 30 | module.exports = { 31 | create: createFullImportPipeline 32 | }; 33 | -------------------------------------------------------------------------------- /lib/isValidCsvRecord.js: -------------------------------------------------------------------------------- 1 | const _ = require('lodash'); 2 | const NULL_ISLAND_THRESHOLD = 0.0005; 3 | 4 | /* 5 | * Return true if a record has all of LON, LAT, NUMBER and STREET defined 6 | */ 7 | function isValidCsvRecord( record ){ 8 | return hasAllProperties(record) && 9 | !houseNumberIsExclusionaryWord(record) && 10 | !streetContainsExclusionaryWord(record) && 11 | !latLonAreOnNullIsland(record); 12 | } 13 | 14 | /* 15 | * Return true if record.NUMBER is literal word 
'NULL', 'UNDEFINED', 16 | * or 'UNAVAILABLE' (case-insensitive) 17 | */ 18 | function houseNumberIsExclusionaryWord(record) { 19 | return ['NULL', 'UNDEFINED', 'UNAVAILABLE'].indexOf(_.toUpper(record.NUMBER)) !== -1; 20 | } 21 | 22 | /* 23 | * Return true if record.STREET contains literal word 'NULL', 'UNDEFINED', 24 | * or 'UNAVAILABLE' (case-insensitive) 25 | */ 26 | function streetContainsExclusionaryWord(record) { 27 | return /\b(NULL|UNDEFINED|UNAVAILABLE)\b/i.test(record.STREET); 28 | } 29 | 30 | function hasAllProperties(record) { 31 | return [ 'LON', 'LAT', 'NUMBER', 'STREET' ].every(function(prop) { 32 | return !_.isEmpty(record[ prop ]) || _.isNumber(record[ prop ]); 33 | }); 34 | } 35 | 36 | // returns true when LON and LAT are both parseable < $NULL_ISLAND_THRESHOLD 37 | // > parseFloat('0'); 38 | // 0 39 | // > parseFloat('0.000000'); 40 | // 0 41 | // > parseFloat('0.000001'); 42 | // 0.000001 43 | function latLonAreOnNullIsland(record) { 44 | return ['LON', 'LAT'].every(prop => Math.abs(parseFloat(record[prop])) < NULL_ISLAND_THRESHOLD); 45 | } 46 | 47 | module.exports = isValidCsvRecord; 48 | -------------------------------------------------------------------------------- /lib/parameters.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs'); 2 | const util = require('util'); 3 | const glob = require('glob'); 4 | const path = require('path'); 5 | const _ = require('lodash'); 6 | const minimist = require('minimist'); 7 | 8 | const peliasConfig = require('pelias-config').generate(); 9 | const OpenAddressesAPI = require('../utils/OpenAddressesAPI'); 10 | 11 | /** 12 | * Interprets the command-line arguments passed to the script. 13 | * 14 | * @param {array} argv Should be `process.argv.slice( 2 )`. 
15 | * @return {object} If arguments were succesfully parsed, an object that can be 16 | * used to call `importOpenAddressesDir`: 17 | * 18 | * { 19 | * dirPath: , 20 | * adminValues: , 21 | * } 22 | * 23 | * Otherwise, an error object. 24 | * 25 | * { 26 | * exitCode: , 27 | * errMessage: 28 | * } 29 | */ 30 | function interpretUserArgs( argv, config ){ 31 | config = config || peliasConfig; 32 | 33 | var usageMessage = [ 34 | 'A tool for importing OpenAddresses data into Pelias. Usage:', 35 | '', 36 | '\tnode import.js --help | [--admin-values] [OPENADDRESSES_DIR]', 37 | '', 38 | '', 39 | '\t--help: Print this help message.', 40 | '', 41 | '\tOPENADDRESSES_DIR: A directory containing OpenAddresses CSV files.', 42 | '\t\tIf none is specified, the path from your PELIAS_CONFIG\'s', 43 | '\t\t`imports.openaddresses.datapath` will be used.', 44 | ].join( '\n' ); 45 | 46 | argv = minimist(argv, {}); 47 | 48 | var validArgs = ['help', '_', 'parallel-count', 'parallel-id' ]; 49 | for( var arg in argv ){ 50 | if( validArgs.indexOf( arg ) === -1 ){ 51 | return { 52 | errMessage: util.format( '`%s` is not a recognized argument.', arg ), 53 | exitCode: 1 54 | }; 55 | } 56 | } 57 | 58 | if( argv.help ){ 59 | return { errMessage: usageMessage, exitCode: 0 }; 60 | } 61 | 62 | var opts = { 63 | 'parallel-count': argv['parallel-count'], 64 | 'parallel-id': argv['parallel-id'], 65 | dirPath: null 66 | }; 67 | if( argv._.length > 0 ){ 68 | opts.dirPath = argv._[ 0 ]; 69 | } 70 | else { 71 | opts.dirPath = config.imports.openaddresses.datapath; 72 | } 73 | 74 | opts.dirPath = path.normalize(opts.dirPath); 75 | 76 | if( !fs.existsSync( opts.dirPath ) ){ 77 | return { 78 | errMessage: util.format( 'Directory `%s` does not exist.', opts.dirPath ), 79 | exitCode: 2 80 | }; 81 | } 82 | else if( !fs.statSync( opts.dirPath ).isDirectory() ){ 83 | return { 84 | errMessage: util.format( '`%s` is not a directory.', opts.dirPath ), 85 | exitCode: 2 86 | }; 87 | } 88 | 89 | return opts; 90 | 91 
| } 92 | 93 | function getFullFileList(peliasConfig, args) { 94 | // get the files to process 95 | const files = _.get(peliasConfig.imports.openaddresses, 'files', []); 96 | 97 | if (_.isEmpty(files)) { 98 | // no specific files listed, so return all .csv and .geojson files 99 | return glob.sync( args.dirPath + '/**/*.{csv,geojson,geojson.gz,csv.gz}' ); 100 | } else { 101 | // otherwise return the requested files with full path 102 | return files.map(file => { 103 | 104 | // normalize source 105 | const source = OpenAddressesAPI.normalize(file); 106 | 107 | // search for files matching this source id, ending in either .geojson or .csv 108 | const found = glob.sync(`${source}.{csv,geojson}`, { cwd: args.dirPath, absolute: true }); 109 | if (!_.isEmpty(found)) { return _.last(found); } // results are sorted, prefer .geojson 110 | 111 | // no matching files were found, return a non-matching absolute path 112 | return path.join(args.dirPath, file); 113 | }); 114 | } 115 | } 116 | 117 | function getFileList(peliasConfig, args) { 118 | var files = getFullFileList(peliasConfig, args); 119 | 120 | if (args['parallel-count'] > 0 && args['parallel-id'] >= 0) { 121 | files = files.filter(function(element, index) { 122 | return index % args['parallel-count'] === args['parallel-id']; 123 | }); 124 | } 125 | 126 | return files; 127 | } 128 | 129 | module.exports = { 130 | interpretUserArgs: interpretUserArgs, 131 | getFileList: getFileList 132 | }; 133 | -------------------------------------------------------------------------------- /lib/streams/cleanupStream.js: -------------------------------------------------------------------------------- 1 | const _ = require('lodash'); 2 | const through2 = require('through2'); 3 | const cleanup = require('../cleanup'); 4 | const cleanupV2 = require('../cleanup_v2'); 5 | 6 | /* 7 | * create a stream that performs any needed cleanup on a record 8 | */ 9 | 10 | // mapping from openaddresses country codes (from the file names) 11 | // to a 
language code 'locale' required by next-gen analysis. 12 | const cc2LocaleMap = { 13 | 'us': 'en', 14 | 'ca': 'en', 15 | 'gb': 'en', 16 | 'ie': 'en', 17 | 'au': 'en', 18 | 'nz': 'en', 19 | }; 20 | 21 | function createCleanupStream(options) { 22 | const locale = _.get(cc2LocaleMap, _.get(options, 'countryCode', ''), '').toLowerCase(); 23 | 24 | // use 'cleanup_v2' when we know the locale is 'en', else use the existing 'cleanup' analyzer 25 | // note: this is a temporary solution to allow us to upgrade gradually without having to 26 | // test the entire world, with all it's different languages, all in the first release. 27 | const analyzer = (locale === 'en') ? cleanupV2.streetName : cleanup.streetName; 28 | 29 | // generate a stream 30 | return through2.obj(( record, enc, next ) => { 31 | 32 | // analyze street field 33 | record.STREET = analyzer(record.STREET, { locale }); 34 | 35 | // csvParse will only trim unquoted fields 36 | // so we have to do it ourselves to handle all whitespace 37 | Object.keys(record).forEach(key => { 38 | if (_.isFunction(_.get(record[key], 'trim'))) { 39 | record[key] = record[key].trim(); 40 | } 41 | }); 42 | 43 | next(null, record); 44 | }); 45 | } 46 | 47 | module.exports = { 48 | create: createCleanupStream 49 | }; 50 | -------------------------------------------------------------------------------- /lib/streams/contentHashStream.js: -------------------------------------------------------------------------------- 1 | const _ = require('lodash'); 2 | const crypto = require('crypto'); 3 | const through2 = require('through2'); 4 | 5 | /* 6 | * create a stream that generates a content-hash for each row 7 | */ 8 | 9 | function createContentHashStream() { 10 | return through2.obj((record, enc, next) => { 11 | record.HASH = hash(record); 12 | next(null, record); 13 | }); 14 | } 15 | 16 | const normalize = { 17 | float: (fl) => (Math.floor(parseFloat(fl||0.0)*1e7)/1e7).toFixed(7), 18 | string: (str) => (str||'').toString().replace(/\s+/g, ' 
').trim().toLowerCase() 19 | }; 20 | 21 | const fields = [ 22 | { key: 'LON', norm: normalize.float }, 23 | { key: 'LAT', norm: normalize.float }, 24 | { key: 'STREET', norm: normalize.string }, 25 | { key: 'NUMBER', norm: normalize.string }, 26 | { key: 'UNIT', norm: normalize.string } 27 | ]; 28 | 29 | function hash( record ) { 30 | // md5 is actually 128 bits, we only need 64 bits to match the 16x hex char 31 | // uuid4 implementation used by the openaddresses project, so half are discarded. 32 | // it was chosen due to its universal availability and maturity. 33 | // note: this algo need not be cryptographically secure, it's just more 34 | // convenient and reliable to use this method than using other methods. 35 | const h = crypto.createHash('md5'); 36 | 37 | // see: https://github.com/pelias/openaddresses/pull/442#issuecomment-535399779 38 | fields.forEach( field => { 39 | // write a null byte in place of an empty value 40 | // in order to preserve column positions. 41 | let str = '\0'; 42 | if (_.has(record, field.key)) { 43 | str = field.norm(_.get(record, field.key)); 44 | } 45 | h.update(str); 46 | }); 47 | 48 | // return a hexadecimal representation 49 | return h.digest('hex').substr(0, 16); 50 | } 51 | 52 | module.exports = { 53 | create: createContentHashStream, 54 | hash: hash 55 | }; 56 | -------------------------------------------------------------------------------- /lib/streams/documentStream.js: -------------------------------------------------------------------------------- 1 | const through = require( 'through2' ); 2 | const peliasModel = require( 'pelias-model' ); 3 | 4 | // pattern to match a two character country code from the directory prefix 5 | const COUNTRY_CODE_PATTERN = /^([A-Za-z]{2})\//; 6 | 7 | /* 8 | * Create a stream of Documents from valid, cleaned CSV records 9 | */ 10 | function createDocumentStream(id_prefix, stats) { 11 | /** 12 | * Used to track the UID of individual records passing through the stream if 13 | * there is no 
HASH that can be used as a more unique identifier. See 14 | * `peliasModel.Document.setId()` for information about UIDs. 15 | */ 16 | let uid = 0; 17 | 18 | return through.obj( 19 | function write( record, enc, next ){ 20 | const id_number = record.HASH || uid; 21 | const model_id = `${id_prefix}:${id_number}`; 22 | uid++; 23 | 24 | try { 25 | const doc = new peliasModel.Document('openaddresses', 'address', model_id) 26 | .setName('default', `${record.NUMBER} ${record.STREET}`) 27 | .setAddress('number', record.NUMBER) 28 | .setAddress('street', record.STREET) 29 | .setCentroid({ lon: record.LON, lat: record.LAT }); 30 | 31 | if (record.POSTCODE) { 32 | doc.setAddress('zip', record.POSTCODE); 33 | } 34 | 35 | // attempt to set the country code based on the directory prefix 36 | const match = id_prefix.match(COUNTRY_CODE_PATTERN); 37 | if (match && match[1]) { 38 | doc.setMeta('country_code', match[1].toUpperCase()); 39 | } 40 | 41 | // store a reference to the original OA record in a 'meta' 42 | // field, this is available through the pipeline but is not 43 | // saved to elasticsearch. 44 | doc.setMeta('oa', record); 45 | 46 | this.push(doc); 47 | } 48 | catch ( ex ){ 49 | stats.badRecordCount++; 50 | } 51 | 52 | next(); 53 | } 54 | ); 55 | } 56 | 57 | module.exports = { 58 | create: createDocumentStream 59 | }; 60 | -------------------------------------------------------------------------------- /lib/streams/gnafMapperStream.js: -------------------------------------------------------------------------------- 1 | /** 2 | The GNAF mapper is responsible for extracting Australian GNAF 3 | identifiers from the OA 'ID' property, where available. 
4 | **/ 5 | 6 | const _ = require('lodash'); 7 | const through = require('through2'); 8 | const logger = require('pelias-logger').get('openaddresses'); 9 | 10 | // examples: GAACT718519668, GASA_424005553 11 | const GNAF_PID_PATTERN = /^(GA)(NSW|VIC|QLD|SA_|WA_|TAS|NT_|ACT|OT_)([0-9]{9})$/; 12 | 13 | module.exports = function () { 14 | return through.obj((doc, enc, next) => { 15 | try { 16 | if (doc.getMeta('country_code') === 'AU') { 17 | 18 | // detect Australian G-NAF PID concordances 19 | const oaid = _.get(doc.getMeta('oa'), 'ID'); 20 | if (_.isString(oaid) && oaid.length === 14 && oaid.match(GNAF_PID_PATTERN)) { 21 | doc.setAddendum('concordances', { 'gnaf:pid': oaid }); 22 | } 23 | } 24 | } 25 | 26 | catch (e) { 27 | logger.error('gnaf_mapper error'); 28 | logger.error(e.stack); 29 | logger.error(JSON.stringify(doc, null, 2)); 30 | } 31 | 32 | return next(null, doc); 33 | }); 34 | }; 35 | -------------------------------------------------------------------------------- /lib/streams/isUSorCAHouseNumberZero.js: -------------------------------------------------------------------------------- 1 | var filter = require('through2-filter'); 2 | var _ = require('lodash'); 3 | 4 | var allZeros = /^0+$/; 5 | 6 | function isZeroHouseNumber(record) { 7 | return allZeros.test(record.address_parts.number); 8 | } 9 | 10 | function isUSorCA(record) { 11 | return _.isEqual(record.parent.country_a, ['USA']) || 12 | _.isEqual(record.parent.country_a, ['CAN']); 13 | } 14 | 15 | module.exports.create = function create() { 16 | return filter.obj(function(record) { 17 | if (isZeroHouseNumber(record) && isUSorCA(record)) { 18 | return false; 19 | } 20 | return true; 21 | }); 22 | }; 23 | -------------------------------------------------------------------------------- /lib/streams/recordStream.js: -------------------------------------------------------------------------------- 1 | const _ = require('lodash'); 2 | const fs = require('fs'); 3 | const path = require('path'); 4 | const 
csvParse = require('csv-parse').parse; 5 | const combinedStream = require('combined-stream'); 6 | const through = require('through2'); 7 | const split = require('split2'); 8 | const zlib = require('zlib'); 9 | 10 | const logger = require('pelias-logger').get('openaddresses'); 11 | const config = require('pelias-config').generate(); 12 | 13 | const CleanupStream = require('./cleanupStream'); 14 | const ContentHashStream = require('./contentHashStream'); 15 | const ValidRecordFilterStream = require('./validRecordFilterStream'); 16 | const DocumentStream = require('./documentStream'); 17 | const gnafMapperStreamFactory = require('./gnafMapperStream'); 18 | const unitSplittingMapperStreamFactory = require('./unitSplittingMapperStream'); 19 | 20 | /* 21 | * Construct a suitable id prefix for a CSV file given 22 | * its full filename and the base directory of all OA CSV files. 23 | */ 24 | function getIdPrefix(filename, dirPath) { 25 | if (filename && dirPath) { 26 | // if the file is within the dir path, use the structure 27 | // of the directory tree to create the id 28 | if (filename.indexOf(dirPath) !== -1) { 29 | var subpath = _.replace(filename, dirPath, ''); 30 | var prefix = _.replace(_.replace(subpath, /\.(csv|geojson)/, ''), /\.gz/, ''); 31 | return _.trim(prefix, '/'); 32 | } 33 | } 34 | 35 | // if the dirPath doesn't contain this file, return the basename without extension 36 | return path.basename(path.basename(path.basename(filename, '.gz'), '.csv'), '.geojson'); 37 | } 38 | 39 | /** 40 | * Create a stream of Documents from an OpenAddresses file. 41 | * 42 | * @param {string} filePath The path of an OpenAddresses CSV file. 43 | * @return {stream.Readable} A stream of `Document` objects, one 44 | * for every valid record inside the OA file. 45 | */ 46 | function createRecordStream( filePath, dirPath ){ 47 | /** 48 | * A stream to convert rows of a CSV to Document objects. 
49 | */ 50 | var stats = { 51 | badRecordCount: 0 52 | }; 53 | 54 | const contentHashStream = ContentHashStream.create(); 55 | const validRecordFilterStream = ValidRecordFilterStream.create(); 56 | const idPrefix = getIdPrefix(filePath, dirPath); 57 | const countryCode = idPrefix.replace(/\\/g, '/').split('/')[0]; 58 | const cleanupStream = CleanupStream.create({ countryCode }); 59 | const documentStream = DocumentStream.create(idPrefix, stats); 60 | 61 | documentStream._flush = function end( done ){ 62 | done(); 63 | }; 64 | 65 | return fileStreamDispatcher(fs.createReadStream( filePath ), filePath) 66 | .pipe( contentHashStream ) 67 | .pipe( validRecordFilterStream ) 68 | .pipe( cleanupStream ) 69 | .pipe( documentStream ) 70 | .pipe( gnafMapperStreamFactory() ) 71 | .pipe( unitSplittingMapperStreamFactory() ); 72 | } 73 | 74 | function geojsonStream(stream) { 75 | return stream 76 | .pipe(split()) 77 | .pipe(through.obj((line, _enc, next) => { 78 | let row; 79 | try { 80 | const geojson = JSON.parse(line); 81 | if (_.get(geojson, 'geometry.type') === 'Point') { 82 | row = { 83 | NUMBER: _.get(geojson, 'properties.number'), 84 | STREET: _.get(geojson, 'properties.street'), 85 | LON: _.get(geojson, 'geometry.coordinates[0]'), 86 | LAT: _.get(geojson, 'geometry.coordinates[1]'), 87 | POSTCODE: _.get(geojson, 'properties.postcode'), 88 | UNIT:_.get(geojson, 'properties.unit'), 89 | DISTRICT:_.get(geojson, 'properties.district'), 90 | REGION:_.get(geojson, 'properties.region'), 91 | CITY:_.get(geojson, 'properties.city') 92 | }; 93 | } 94 | } catch(e) { 95 | logger.error(e); 96 | } 97 | next(null, row); 98 | })); 99 | } 100 | 101 | function fileStreamDispatcher(stream, filePath) { 102 | if (filePath.endsWith('.gz')) { 103 | stream = stream.pipe(zlib.createGunzip()); 104 | } 105 | 106 | if (/\.geojson(\.gz)?/.test(filePath)) { 107 | return geojsonStream(stream); 108 | } 109 | 110 | return stream.pipe(csvParse({ 111 | bom: true, 112 | trim: true, 113 | 
skip_empty_lines: true, 114 | relax_column_count: true, 115 | relax: true, 116 | columns: true 117 | })); 118 | } 119 | 120 | /* 121 | * Create a single stream from many CSV files 122 | */ 123 | function createFullRecordStream(files, dirPath) { 124 | var recordStream = combinedStream.create(); 125 | 126 | files.forEach( function forEach( filePath ){ 127 | if (!fs.existsSync(filePath)) { 128 | if (config.get('imports.openaddresses.missingFilesAreFatal')) { 129 | logger.error(`File ${filePath} not found, quitting`); 130 | process.exit(1); 131 | } else { 132 | logger.warn(`File ${filePath} not found, skipping`); 133 | return; 134 | } 135 | } 136 | 137 | recordStream.append( function ( next ){ 138 | logger.info( 'Creating read stream for: ' + filePath ); 139 | next(createRecordStream( filePath, dirPath ) ); 140 | }); 141 | }); 142 | 143 | return recordStream; 144 | } 145 | 146 | module.exports = { 147 | getIdPrefix: getIdPrefix, 148 | create: createFullRecordStream 149 | }; 150 | -------------------------------------------------------------------------------- /lib/streams/unitSplittingMapperStream.js: -------------------------------------------------------------------------------- 1 | /** 2 | The unit splitting mapper is responsible for detecting when the address.number 3 | field contains the concatenation of the unit and the housenumber. 4 | 5 | eg. Flat 2 14 Smith St 6 | 7 | In this case we attempt to split the two terms into their consituent parts. 8 | 9 | note: Addressing formats vary between countries, it's unlikely that a pattern 10 | which works for one country will also work internationally. For this reason this 11 | mapper accepts a country code which can be used to select the appropriate pattern(s). 12 | 13 | Feel free to make changes to this mapping file! 
14 | **/ 15 | 16 | const _ = require('lodash'); 17 | const through = require('through2'); 18 | const logger = require('pelias-logger').get('openaddresses'); 19 | const mappers = {}; 20 | 21 | // Australasian Unit Number Mapper 22 | // https://auspost.com.au/content/dam/auspost_corp/media/documents/Appendix-01.pdf 23 | // https://www.nzpost.co.nz/sites/nz/files/2021-10/adv358-address-standards.pdf 24 | const australasian = (doc) =>{ 25 | const number = doc.getAddress('number'); 26 | if(!_.isString(number) || number.length < 3){ return; } 27 | 28 | // 2/14 29 | const solidus = number.match(/^(\d+)\s*\/\s*(\d+)$/); 30 | if (solidus) { 31 | doc.setAddress('unit', solidus[1]); 32 | doc.setAddress('number', solidus[2]); 33 | doc.setName('default', `${doc.getAddress('number')} ${doc.getAddress('street')}`); 34 | return; 35 | } 36 | 37 | // Flat 2 14 | F 2 14 | Unit 2 14 | APT 2 14 38 | const verbose = number.match(/^(flat|f|unit|apartment|apt)\s*(\d+)\s+(\d+)$/i); 39 | if (verbose) { 40 | doc.setAddress('unit', verbose[2]); 41 | doc.setAddress('number', verbose[3]); 42 | doc.setName('default', `${doc.getAddress('number')} ${doc.getAddress('street')}`); 43 | return; 44 | } 45 | }; 46 | 47 | // associate mappers with country codes 48 | mappers.AU = australasian; 49 | mappers.NZ = australasian; 50 | 51 | module.exports = function () { 52 | return through.obj((doc, enc, next) => { 53 | try { 54 | // only applies to records with a 'number' set and no 'unit' set (yet). 
55 | if (doc.hasAddress('number') && !doc.hasAddress('unit')) { 56 | 57 | // select the appropriate mapper based on country code 58 | const mapper = _.get(mappers, doc.getMeta('country_code')); 59 | if (_.isFunction(mapper)) { 60 | 61 | // run the country-specific mapper 62 | mapper(doc); 63 | } 64 | } 65 | } 66 | 67 | catch (e) { 68 | logger.error('unit_mapper error'); 69 | logger.error(e.stack); 70 | logger.error(JSON.stringify(doc, null, 2)); 71 | } 72 | 73 | return next(null, doc); 74 | }); 75 | }; 76 | -------------------------------------------------------------------------------- /lib/streams/validRecordFilterStream.js: -------------------------------------------------------------------------------- 1 | var through = require( 'through2' ); 2 | 3 | var logger = require( 'pelias-logger' ).get( 'openaddresses' ); 4 | var isValidCsvRecord = require('../isValidCsvRecord'); 5 | 6 | /* 7 | * Create a through2 stream to filter out invalid records 8 | */ 9 | function createValidRecordFilterStream() { 10 | var invalidCount = 0; 11 | return through.obj(function( record, enc, next ) { 12 | if (isValidCsvRecord(record)) { 13 | this.push(record); 14 | } else { 15 | invalidCount++; 16 | } 17 | next(); 18 | }, function(next) { 19 | logger.verbose('number of invalid records skipped: ' + invalidCount); 20 | next(); 21 | }); 22 | } 23 | 24 | module.exports = { 25 | create: createValidRecordFilterStream 26 | }; 27 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pelias-openaddresses", 3 | "version": "0.0.0-development", 4 | "description": "Pelias import pipeline for OpenAddresses.", 5 | "engines": { 6 | "node": ">=10.0.0" 7 | }, 8 | "main": "import.js", 9 | "dependencies": { 10 | "@hapi/joi": "^16.0.1", 11 | "async": "^3.1.0", 12 | "axios": "^1.2.2", 13 | "bottleneck": "^2.19.5", 14 | "combined-stream": "^1.0.7", 15 | "csv-parse": 
"^5.0.3", 16 | "fs-extra": "^8.1.0", 17 | "glob": "^7.0.0", 18 | "lodash": "^4.16.0", 19 | "minimist": "^1.2.0", 20 | "pelias-blacklist-stream": "^1.0.0", 21 | "pelias-config": "^6.0.0", 22 | "pelias-dbclient": "^3.1.0", 23 | "pelias-logger": "^1.2.1", 24 | "pelias-model": "^10.5.0", 25 | "pelias-wof-admin-lookup": "^7.12.0", 26 | "split2": "^3.2.2", 27 | "temp": "^0.9.1", 28 | "through2": "^3.0.0", 29 | "through2-filter": "^3.0.0", 30 | "through2-map": "^3.0.0", 31 | "through2-sink": "^1.0.0" 32 | }, 33 | "devDependencies": { 34 | "colors": "^1.4.0", 35 | "diff": "^5.0.0", 36 | "jshint": "^2.9.4", 37 | "precommit-hook": "^3.0.0", 38 | "proxyquire": "^2.0.0", 39 | "stream-mock": "^2.0.3", 40 | "tap-spec": "^5.0.0", 41 | "tape": "^5.0.0" 42 | }, 43 | "scripts": { 44 | "download": "./bin/download", 45 | "import": "./bin/start", 46 | "parallel": "./bin/parallel", 47 | "test": "NODE_ENV=test npm run units", 48 | "units": "./bin/units", 49 | "functional": "NODE_ENV=test node test/functional.js | tap-spec", 50 | "lint": "jshint .", 51 | "validate": "npm ls", 52 | "ci": "npm run test && npm run functional", 53 | "start": "./bin/start" 54 | }, 55 | "repository": { 56 | "type": "git", 57 | "url": "https://github.com/pelias/openaddresses.git" 58 | }, 59 | "keywords": [ 60 | "Pelias", 61 | "OpenAddresses", 62 | "import" 63 | ], 64 | "author": "mapzen", 65 | "license": "MIT", 66 | "bugs": { 67 | "url": "https://github.com/pelias/openaddresses/issues" 68 | }, 69 | "homepage": "https://github.com/pelias/openaddresses", 70 | "pre-commit": [ 71 | "lint", 72 | "validate", 73 | "test" 74 | ], 75 | "release": { 76 | "branch": "master", 77 | "success": [] 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /schema.js: -------------------------------------------------------------------------------- 1 | const Joi = require('@hapi/joi'); 2 | 3 | // Schema Configuration 4 | // datapath: string (required) 5 | // files: array of strings 6 | // 
adminLookup: boolean 7 | module.exports = Joi.object().keys({ 8 | imports: Joi.object().required().keys({ 9 | openaddresses: Joi.object().required().keys({ 10 | files: Joi.array().items(Joi.string()), 11 | datapath: Joi.string().required(true), 12 | dataHost: Joi.string(), 13 | s3Options: Joi.string(), 14 | adminLookup: Joi.boolean(), 15 | missingFilesAreFatal: Joi.boolean().default(false).truthy('yes').falsy('no'), 16 | token: Joi.string().required(true), 17 | }).unknown(false) 18 | }).unknown(true) 19 | }).unknown(true); 20 | -------------------------------------------------------------------------------- /test/analysis.js: -------------------------------------------------------------------------------- 1 | const _ = require('lodash'); 2 | const through = require('through2'); 3 | const split = require('split2'); 4 | require('colors'); 5 | const Diff = require('diff'); 6 | const delim = '|'; 7 | 8 | const analyzers = { 9 | streetName: require('../lib/cleanup').streetName, 10 | streetNameV2: require('../lib/cleanup_v2').streetName 11 | }; 12 | 13 | // print each line 14 | const stream = through((chunk, enc, next) => { 15 | const line = chunk.toString('utf8'); 16 | const columns = [line]; 17 | 18 | _.forEach(analyzers, analyzer => { 19 | columns.push(analyzer(line)); 20 | }); 21 | 22 | // both analyzers produced the same result 23 | // skip these lines as they are not helpful 24 | // for debugging. 25 | if (columns[1] === columns[2]) { 26 | return next(); 27 | } 28 | 29 | var diffString = ''; 30 | var hasRemoval = false; 31 | Diff.diffChars(columns[1], columns[2]).forEach((part) => { 32 | hasRemoval = (hasRemoval || part.removed); 33 | // green for additions, red for deletions, grey for common parts 34 | const color = part.added ? 'green' : (part.removed ? 
'red' : 'grey'); 35 | diffString += part.value[color]; 36 | }); 37 | 38 | columns.push(diffString); 39 | 40 | // only show lines where characters have been removed 41 | // if (!hasRemoval){ 42 | // return next(); 43 | // } 44 | 45 | console.log(columns.join(delim)); 46 | next(); 47 | }); 48 | 49 | // print header line 50 | stream.once('pipe', () => console.log(_.concat(['input'], _.keys(analyzers), ['diff']).join(delim))); 51 | 52 | process.stdin.pipe(split()).pipe(stream); 53 | -------------------------------------------------------------------------------- /test/cleanup_v2.js: -------------------------------------------------------------------------------- 1 | const tape = require('tape'); 2 | const analyzer = require('../lib/cleanup_v2').streetName; 3 | 4 | tape('analyzer', (t) => { 5 | t.equal(typeof analyzer, 'function', 'analyzer is a function'); 6 | t.equal(analyzer.length, 1, 'analyzer accepts body'); 7 | t.end(); 8 | }); 9 | 10 | // --- Letter Casing --- 11 | 12 | // fix casing on uppercased tokens 13 | tape('casing - fix uppercased tokens', (t) => { 14 | t.equal(analyzer('MAIN STREET'), 'Main Street'); 15 | t.equal(analyzer('DR M L KING JR BOULEVARD'), 'Dr M L King Jr Boulevard'); 16 | 17 | // uppercase tokens ending with a period 18 | t.equal(analyzer('DR MLK. JR. BOULEVARD'), 'Dr MLK. JR. Boulevard'); 19 | t.end(); 20 | }); 21 | 22 | // fix casing on lowercased tokens 23 | tape('casing - fix lowercased tokens', (t) => { 24 | t.equal(analyzer('main street'), 'Main Street'); 25 | t.equal(analyzer('dr m l king jr boulevard'), 'Dr M L King Jr Boulevard'); 26 | 27 | // uppercase tokens ending with a period 28 | t.equal(analyzer('dr mlk. jr. boulevard'), 'Dr MLK. JR. 
Boulevard'); 29 | t.end(); 30 | }); 31 | 32 | // ingore casing on mixedcase tokens 33 | tape('casing - ingore casing on mixedcase tokens', (t) => { 34 | t.equal(analyzer('Willie Mc Donald Way'), 'Willie Mc Donald Way'); 35 | t.equal(analyzer('McCallister Street'), 'McCallister Street'); 36 | t.equal(analyzer('Mc Callister Street'), 'Mc Callister Street'); 37 | t.end(); 38 | }); 39 | 40 | // --- Expanding the 'generic' part of the street name --- 41 | 42 | // expand contracted 'generic' term 43 | tape('generic expansion - final token position', (t) => { 44 | t.equal(analyzer('10 main street'), '10 Main Street'); 45 | t.equal(analyzer('10 main St.'), '10 Main Street'); 46 | t.equal(analyzer('10 main st.'), '10 Main Street'); 47 | t.equal(analyzer('10 main str'), '10 Main Street'); 48 | t.equal(analyzer('10 main st'), '10 Main Street'); 49 | 50 | t.equal(analyzer('10 main road'), '10 Main Road'); 51 | t.equal(analyzer('10 main Rd.'), '10 Main Road'); 52 | t.equal(analyzer('10 main rd.'), '10 Main Road'); 53 | t.equal(analyzer('10 main rd'), '10 Main Road'); 54 | 55 | t.equal(analyzer('10 main avenue'), '10 Main Avenue'); 56 | t.equal(analyzer('10 main Ave.'), '10 Main Avenue'); 57 | t.equal(analyzer('10 main ave.'), '10 Main Avenue'); 58 | t.equal(analyzer('10 main ave'), '10 Main Avenue'); 59 | 60 | t.equal(analyzer('10 main avenue'), '10 Main Avenue'); 61 | t.equal(analyzer('10 main Ave.'), '10 Main Avenue'); 62 | t.equal(analyzer('10 main ave.'), '10 Main Avenue'); 63 | t.equal(analyzer('10 main ave'), '10 Main Avenue'); 64 | t.end(); 65 | }); 66 | 67 | // do not expand 'generic' term when not in final token position 68 | tape('generic expansion - not final token position', (t) => { 69 | t.equal(analyzer('10 main st st'), '10 Main St Street'); 70 | t.equal(analyzer('10 main st junction'), '10 Main St Junction'); 71 | t.equal(analyzer('AVE ST RD ST PKWY ST'), 'Ave St Rd St Pkwy Street'); 72 | t.end(); 73 | }); 74 | 75 | // we should expand the 'generic' when 
directly before a directional 76 | tape('generic expansion - before directionals', (t) => { 77 | t.equal(analyzer('Main St N'), 'Main Street North'); 78 | t.equal(analyzer('Main St S'), 'Main Street South'); 79 | t.equal(analyzer('Main St E'), 'Main Street East'); 80 | t.equal(analyzer('Main St W'), 'Main Street West'); 81 | t.equal(analyzer('Main St North'), 'Main Street North'); 82 | t.equal(analyzer('Main St South'), 'Main Street South'); 83 | t.equal(analyzer('Main St East'), 'Main Street East'); 84 | t.equal(analyzer('Main St West'), 'Main Street West'); 85 | t.end(); 86 | }); 87 | 88 | // do not expand a 'generic' term when there is only one token 89 | // this is logical as the 'generic' should always be paired with 90 | // a 'specific'. 91 | // note: this is likely not nessesary but adds a little more safety 92 | // feel free to remove this restriction later if it doesn't make sense. 93 | tape('generic expansion - single token', (t) => { 94 | t.equal(analyzer('st'), 'St'); 95 | t.equal(analyzer('espl'), 'Espl'); 96 | t.end(); 97 | }); 98 | 99 | // @todo: what should we do when there are multiple 'generic' tokens? 100 | tape('generic expansion - multiple generic tokens', (t) => { 101 | t.equal(analyzer('W FARMS SQ PLZ'), 'West Farms Sq Plaza'); 102 | t.end(); 103 | }); 104 | 105 | // @todo: what should we do when the 'generic' preceeds the 'specific'? 106 | // @note: currently this expands 'Ave S' but not 'Ave X' because it thinks 107 | // that S refers to a directional. 108 | tape('generic expansion - multiple generic tokens', (t) => { 109 | t.equal(analyzer('AVE X'), 'Ave X'); 110 | t.equal(analyzer('AVE S'), 'Avenue S'); 111 | t.end(); 112 | }); 113 | 114 | // --- Expanding the 'directional' part of the street name --- 115 | 116 | // expand directionals 117 | // note: one issue with contracting directionals is getting 118 | // something like 'East Coast Road' to not change. 
119 | tape('expand directionals - first token position', (t) => { 120 | t.equal(analyzer('N Main Street'), 'North Main Street'); 121 | t.equal(analyzer('S Main Street'), 'South Main Street'); 122 | t.equal(analyzer('E Main Street'), 'East Main Street'); 123 | t.equal(analyzer('W Main Street'), 'West Main Street'); 124 | t.end(); 125 | }); 126 | tape('expand directionals - last token position', (t) => { 127 | t.equal(analyzer('Main Street N'), 'Main Street North'); 128 | t.equal(analyzer('Main Street S'), 'Main Street South'); 129 | t.equal(analyzer('Main Street E'), 'Main Street East'); 130 | t.equal(analyzer('Main Street W'), 'Main Street West'); 131 | t.end(); 132 | }); 133 | 134 | // do not expand NSEW directionals 135 | tape('expand directionals - first token position', (t) => { 136 | t.equal(analyzer('NE Main Street'), 'NE Main Street'); 137 | t.equal(analyzer('SE Main Street'), 'SE Main Street'); 138 | t.equal(analyzer('NW Main Street'), 'NW Main Street'); 139 | t.equal(analyzer('SW Main Street'), 'SW Main Street'); 140 | t.end(); 141 | }); 142 | tape('expand directionals - last token position', (t) => { 143 | t.equal(analyzer('Main Street NE'), 'Main Street NE'); 144 | t.equal(analyzer('Main Street SE'), 'Main Street SE'); 145 | t.equal(analyzer('Main Street NW'), 'Main Street NW'); 146 | t.equal(analyzer('Main Street SW'), 'Main Street SW'); 147 | t.end(); 148 | }); 149 | 150 | // do not expand directionals unless 3 or more tokens present 151 | tape('expand directionals - only when 3 or more tokens', (t) => { 152 | t.equal(analyzer('N Street'), 'N Street'); 153 | t.equal(analyzer('S Street'), 'S Street'); 154 | t.equal(analyzer('E Street'), 'E Street'); 155 | t.equal(analyzer('W Street'), 'W Street'); 156 | t.end(); 157 | }); 158 | 159 | // do not expand directionals when followed by a 'generic' 160 | tape('expand directionals - unless followed by a generic', (t) => { 161 | t.equal(analyzer('N St Station'), 'N St Station'); 162 | t.equal(analyzer('N Street 
Station'), 'N Street Station'); 163 | t.equal(analyzer('N Ave Junction'), 'N Ave Junction'); 164 | t.equal(analyzer('N Avenue Junction'), 'N Avenue Junction'); 165 | t.end(); 166 | }); 167 | 168 | // contract english diagonals (southwest,southeast...) 169 | tape('contract english diagonals - first token position', (t) => { 170 | t.equal(analyzer('Northeast Main Street'), 'NE Main Street'); 171 | t.equal(analyzer('Southeast Main Street'), 'SE Main Street'); 172 | t.equal(analyzer('Northwest Main Street'), 'NW Main Street'); 173 | t.equal(analyzer('Southwest Main Street'), 'SW Main Street'); 174 | t.end(); 175 | }); 176 | tape('contract english diagonals - last token position', (t) => { 177 | t.equal(analyzer('Main Street Northeast'), 'Main Street NE'); 178 | t.equal(analyzer('Main Street Southeast'), 'Main Street SE'); 179 | t.equal(analyzer('Main Street Northwest'), 'Main Street NW'); 180 | t.equal(analyzer('Main Street Southwest'), 'Main Street SW'); 181 | t.end(); 182 | }); 183 | 184 | // add missing English street name ordinals 185 | tape('add missing English street name ordinals', (t) => { 186 | t.equal(analyzer('W 26 St'), 'West 26th Street'); 187 | t.equal(analyzer('W 26th St'), 'West 26th Street'); 188 | t.equal(analyzer('1 St'), '1st Street'); 189 | t.equal(analyzer('2 Rd'), '2nd Road'); 190 | t.equal(analyzer('3 Ave'), '3rd Avenue'); 191 | t.equal(analyzer('4 Ln'), '4th Lane'); 192 | t.equal(analyzer('11 St'), '11th Street'); 193 | t.equal(analyzer('12 Rd'), '12th Road'); 194 | t.equal(analyzer('13 Ave'), '13th Avenue'); 195 | t.equal(analyzer('14 Ln'), '14th Lane'); 196 | t.equal(analyzer('101 St'), '101st Street'); 197 | t.equal(analyzer('102 Rd'), '102nd Road'); 198 | t.equal(analyzer('103 Ave'), '103rd Avenue'); 199 | t.equal(analyzer('104 Ln'), '104th Lane'); 200 | t.equal(analyzer('no 1 st'), 'No 1 Street'); 201 | t.equal(analyzer('no #1 st'), 'No #1 Street'); 202 | t.end(); 203 | }); 204 | 205 | // --- NOOP inputs which should never change --- 206 | 
207 | // no-ops, these inputs should not change regardless of the algorithm used 208 | tape('no-ops', (t) => { 209 | 210 | // street names composed entirely of 'generic' tokens 211 | t.equal(analyzer('Esplanade'), 'Esplanade'); 212 | t.equal(analyzer('Park Road'), 'Park Road'); 213 | 214 | // do not contract directionals which are part of the name 215 | t.equal(analyzer('East Coast Road'), 'East Coast Road'); 216 | 217 | // number prefix 218 | t.equal(analyzer('No 1 Road'), 'No 1 Road'); 219 | 220 | // spanish prefix 'la' should never be expanded to 'lane' 221 | t.equal(analyzer('La Bamba Road'), 'La Bamba Road'); 222 | 223 | // directional as street name 224 | t.equal(analyzer('N Street'), 'N Street'); 225 | t.equal(analyzer('No Street'), 'No Street'); 226 | t.equal(analyzer('North Street'), 'North Street'); 227 | t.equal(analyzer('Northe Street'), 'Northe Street'); 228 | 229 | // do not anglicise/de-anglicise names 230 | t.equal(analyzer('Centre Road'), 'Centre Road'); 231 | t.equal(analyzer('Center Road'), 'Center Road'); 232 | t.equal(analyzer('Annex Road'), 'Annex Road'); 233 | t.equal(analyzer('Anex Road'), 'Anex Road'); 234 | 235 | // personal title in middle of name 236 | t.equal(analyzer('Main Road St Arnaud'), 'Main Road St Arnaud'); 237 | t.equal(analyzer('Mount St John Avenue'), 'Mount St John Avenue'); 238 | 239 | t.end(); 240 | }); 241 | 242 | 243 | tape('misc', (t) => { 244 | t.equal(analyzer('YELLOWSTONE BLVD'), 'Yellowstone Boulevard'); 245 | t.equal(analyzer('YESHIVA LN'), 'Yeshiva Lane'); 246 | t.equal(analyzer('WYGANT PL'), 'Wygant Place'); 247 | t.equal(analyzer('W 262 ST'), 'West 262nd Street'); 248 | t.equal(analyzer('W 26TH ST'), 'West 26th Street'); 249 | t.equal(analyzer('WILLIE MC DONALD WAY'), 'Willie Mc Donald Way'); 250 | t.equal(analyzer('West 93rd Street'), 'West 93rd Street'); 251 | t.equal(analyzer('JFK AIRPORT'), 'Jfk Airport'); // this should really uppercase JFK 252 | t.equal(analyzer('DR M L KING JR BLVD'), 'Dr M L King Jr 
Boulevard'); // not perfect 253 | t.equal(analyzer('E HAMPTON BLVD'), 'East Hampton Boulevard'); 254 | t.equal(analyzer('MARATHON PKWY'), 'Marathon Parkway'); 255 | t.equal(analyzer('ANDREWS AVE S'), 'Andrews Avenue South'); 256 | t.equal(analyzer('W 13 ST'), 'West 13th Street'); 257 | t.end(); 258 | }); 259 | 260 | tape('misc directionals', (t) => { 261 | t.equal(analyzer('W KINGSBRIDGE RD'), 'West Kingsbridge Road'); 262 | t.equal(analyzer('W MOSHOLU PKWY S'), 'West Mosholu Parkway South'); 263 | t.equal(analyzer('WILLIAMSBURG ST E'), 'Williamsburg Street East'); 264 | t.equal(analyzer('W MOSHOLU PKWY N'), 'West Mosholu Parkway North'); 265 | t.equal(analyzer('W MOSHOLU PKWY SE'), 'West Mosholu Parkway SE'); 266 | t.equal(analyzer('S WILLIAM ST'), 'South William Street'); 267 | t.equal(analyzer('Foo ST South East'), 'Foo Street South East'); 268 | t.end(); 269 | }); 270 | 271 | // tape('prefix expansions', (t) => { 272 | // t.equal(analyzer('ST JAMES ST'), 'Saint James Street'); 273 | // t.equal(analyzer('ST JAMES AVE'), 'Saint James Avenue'); 274 | // t.equal(analyzer('ST. JAMES AVE'), 'Saint James Avenue'); 275 | // t.equal(analyzer('ST NICHOLAS TER'), 'Saint Nicholas Terrace'); 276 | // t.equal(analyzer('MT DOOM CRES'), 'Mount Doom Crescent'); 277 | // t.equal(analyzer('MT. DOOM CRES'), 'Mount Doom Crescent'); 278 | // t.equal(analyzer('FT IMPENETRABLE ROW'), 'Fort Impenetrable Row'); 279 | // t.equal(analyzer('FT. 
IMPENETRABLE ROW'), 'Fort Impenetrable Row'); 280 | // t.equal(analyzer('St Leonards Drive'), 'Saint Leonards Drive'); 281 | // t.equal(analyzer('St Andrew Street'), 'Saint Andrew Street'); 282 | // t.end(); 283 | // }); 284 | -------------------------------------------------------------------------------- /test/data/au/input_file_3.csv: -------------------------------------------------------------------------------- 1 | LON,LAT,NUMBER,STREET,UNIT,CITY,DISTRICT,REGION,POSTCODE,ID 2 | 144.931874,-37.791488,10,Smith Street,,input city,input district,input region,input postcode,GAVIC718519668 3 | -------------------------------------------------------------------------------- /test/data/au/input_file_4.csv: -------------------------------------------------------------------------------- 1 | LON,LAT,HASH,NUMBER,STREET,UNIT,CITY,DISTRICT,REGION,POSTCODE,ID 2 | 144.9804144,-37.8723977,710daac656ffd0c3,10/244,BARKLY STREET,,ST KILDA,,VIC,"3182","50579518" 3 | 145.0378718,-37.8637847,92862c98c20bbe3d,10/244-246,WATTLETREE ROAD,,MALVERN,,VIC,"3144","208518759" 4 | 145.0003807,-37.8289596,d0a21035cebcd8ab,10/244-246,MARY STREET,,RICHMOND,,VIC,"3121","51463974" 5 | 144.978361,-37.8002503,4e891155eb009dc3,10/244,BRUNSWICK STREET,,FITZROY,,VIC,"3065","210464257" 6 | 144.9591621,-37.8331898,e20c57c01d5d42c0,110/244,DORCAS STREET,,SOUTH MELBOURNE,,VIC,"3205","423672310" 7 | 144.9591621,-37.8331898,50c85f85cce9181f,210/244,DORCAS STREET,,SOUTH MELBOURNE,,VIC,"3205","423672321" 8 | 144.9591621,-37.8331898,4e737a8cc6ada9ec,310/244,DORCAS STREET,,SOUTH MELBOURNE,,VIC,"3205","423672332" 9 | 144.9591621,-37.8331898,d6ed0494e8c53ff8,410/244,DORCAS STREET,,SOUTH MELBOURNE,,VIC,"3205","423672343" 10 | 144.9591621,-37.8331898,fa0691071a173dab,510/244,DORCAS STREET,,SOUTH MELBOURNE,,VIC,"3205","423672353" 11 | 144.925714,-37.7516895,00be263cea28bea0,10/244,PASCOE VALE ROAD,,ESSENDON,,VIC,"3040","429232726" 12 | -------------------------------------------------------------------------------- 
/test/data/expected.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "_index": "pelias", 4 | "_id": "openaddresses:address:input_file_1:7552fdd1d9eb5765", 5 | "data": { 6 | "name": { 7 | "default": "100 Main St" 8 | }, 9 | "phrase": { 10 | "default": "100 Main St" 11 | }, 12 | "parent": { 13 | "country": [ 14 | "override country" 15 | ], 16 | "country_id": [ 17 | "1" 18 | ], 19 | "country_a": [ 20 | null 21 | ], 22 | "country_source": [ 23 | null 24 | ], 25 | "macroregion": [ 26 | "override macroregion" 27 | ], 28 | "macroregion_id": [ 29 | "2" 30 | ], 31 | "macroregion_a": [ 32 | null 33 | ], 34 | "macroregion_source": [ 35 | null 36 | ], 37 | "region": [ 38 | "override region" 39 | ], 40 | "region_id": [ 41 | "3" 42 | ], 43 | "region_a": [ 44 | null 45 | ], 46 | "region_source": [ 47 | null 48 | ], 49 | "macrocounty": [ 50 | "override macrocounty" 51 | ], 52 | "macrocounty_id": [ 53 | "4" 54 | ], 55 | "macrocounty_a": [ 56 | null 57 | ], 58 | "macrocounty_source": [ 59 | null 60 | ], 61 | "county": [ 62 | "override county" 63 | ], 64 | "county_id": [ 65 | "5" 66 | ], 67 | "county_a": [ 68 | null 69 | ], 70 | "county_source": [ 71 | null 72 | ], 73 | "borough": [ 74 | "override borough" 75 | ], 76 | "borough_id": [ 77 | "6" 78 | ], 79 | "borough_a": [ 80 | null 81 | ], 82 | "borough_source": [ 83 | null 84 | ], 85 | "locality": [ 86 | "override locality" 87 | ], 88 | "locality_id": [ 89 | "7" 90 | ], 91 | "locality_a": [ 92 | null 93 | ], 94 | "locality_source": [ 95 | null 96 | ], 97 | "localadmin": [ 98 | "override localadmin" 99 | ], 100 | "localadmin_id": [ 101 | "8" 102 | ], 103 | "localadmin_a": [ 104 | null 105 | ], 106 | "localadmin_source": [ 107 | null 108 | ], 109 | "neighbourhood": [ 110 | "override neighbourhood" 111 | ], 112 | "neighbourhood_id": [ 113 | "9" 114 | ], 115 | "neighbourhood_a": [ 116 | null 117 | ], 118 | "neighbourhood_source": [ 119 | null 120 | ] 121 | }, 122 | "address_parts": { 
123 | "number": "100", 124 | "street": "Main St", 125 | "zip": "input postcode" 126 | }, 127 | "center_point": { 128 | "lon": 21.212121, 129 | "lat": 12.121212 130 | }, 131 | "source": "openaddresses", 132 | "layer": "address", 133 | "source_id": "input_file_1:7552fdd1d9eb5765" 134 | } 135 | }, 136 | { 137 | "_index": "pelias", 138 | "_id": "openaddresses:address:input_file_1:e21716b47966b98a", 139 | "data": { 140 | "name": { 141 | "default": "200 Main St" 142 | }, 143 | "phrase": { 144 | "default": "200 Main St" 145 | }, 146 | "address_parts": { 147 | "number": "200", 148 | "street": "Main St" 149 | }, 150 | "center_point": { 151 | "lon": 31.313131, 152 | "lat": 13.131313 153 | }, 154 | "source": "openaddresses", 155 | "layer": "address", 156 | "source_id": "input_file_1:e21716b47966b98a" 157 | } 158 | }, 159 | { 160 | "_index": "pelias", 161 | "_id": "openaddresses:address:input_file_1:7456321cc7d6d352", 162 | "data": { 163 | "name": { 164 | "default": "0 Main St" 165 | }, 166 | "phrase": { 167 | "default": "0 Main St" 168 | }, 169 | "address_parts": { 170 | "number": "0", 171 | "street": "Main St" 172 | }, 173 | "center_point": { 174 | "lon": 41.414141, 175 | "lat": 14.141414 176 | }, 177 | "source": "openaddresses", 178 | "layer": "address", 179 | "source_id": "input_file_1:7456321cc7d6d352" 180 | } 181 | }, 182 | { 183 | "_index": "pelias", 184 | "_id": "openaddresses:address:input_file_1:f026cd5494a7e4f4", 185 | "data": { 186 | "name": { 187 | "default": "0 Elm St" 188 | }, 189 | "phrase": { 190 | "default": "0 Elm St" 191 | }, 192 | "address_parts": { 193 | "number": "0", 194 | "street": "Elm St" 195 | }, 196 | "center_point": { 197 | "lon": 51.515151, 198 | "lat": 15.151515 199 | }, 200 | "source": "openaddresses", 201 | "layer": "address", 202 | "source_id": "input_file_1:f026cd5494a7e4f4" 203 | } 204 | }, 205 | { 206 | "_index": "pelias", 207 | "_id": "openaddresses:address:input_file_1:4509c0194f1efaca", 208 | "data": { 209 | "name": { 210 | "default": 
"300 Main St" 211 | }, 212 | "phrase": { 213 | "default": "300 Main St" 214 | }, 215 | "address_parts": { 216 | "number": "300", 217 | "street": "Main St" 218 | }, 219 | "center_point": { 220 | "lon": 61.616161, 221 | "lat": 16.161616 222 | }, 223 | "source": "openaddresses", 224 | "layer": "address", 225 | "source_id": "input_file_1:4509c0194f1efaca" 226 | } 227 | }, 228 | { 229 | "_index": "pelias", 230 | "_id": "openaddresses:address:input_file_2:fc6d8b0a0e5cda70", 231 | "data": { 232 | "name": { 233 | "default": "400 Vireo Rd" 234 | }, 235 | "phrase": { 236 | "default": "400 Vireo Rd" 237 | }, 238 | "address_parts": { 239 | "number": "400", 240 | "street": "Vireo Rd" 241 | }, 242 | "center_point": { 243 | "lon": 71.717171, 244 | "lat": 17.171717 245 | }, 246 | "source": "openaddresses", 247 | "layer": "address", 248 | "source_id": "input_file_2:fc6d8b0a0e5cda70" 249 | } 250 | }, 251 | { 252 | "_index": "pelias", 253 | "_id": "openaddresses:address:input_file_2:b7c25b5e6eea7831", 254 | "data": { 255 | "name": { 256 | "default": "0 Vireo Rd" 257 | }, 258 | "phrase": { 259 | "default": "0 Vireo Rd" 260 | }, 261 | "address_parts": { 262 | "number": "0", 263 | "street": "Vireo Rd" 264 | }, 265 | "center_point": { 266 | "lon": 81.818181, 267 | "lat": 18.181818 268 | }, 269 | "source": "openaddresses", 270 | "layer": "address", 271 | "source_id": "input_file_2:b7c25b5e6eea7831" 272 | } 273 | }, 274 | { 275 | "_index": "pelias", 276 | "_id": "openaddresses:address:input_file_2:25d52af880bfefc4", 277 | "data": { 278 | "name": { 279 | "default": "500 Calle De Lago" 280 | }, 281 | "phrase": { 282 | "default": "500 Calle De Lago" 283 | }, 284 | "address_parts": { 285 | "number": "500", 286 | "street": "Calle De Lago" 287 | }, 288 | "center_point": { 289 | "lon": 91.919191, 290 | "lat": 19.191919 291 | }, 292 | "source": "openaddresses", 293 | "layer": "address", 294 | "source_id": "input_file_2:25d52af880bfefc4" 295 | } 296 | }, 297 | { 298 | "_index": "pelias", 299 | 
"_id": "openaddresses:address:input_file_2:0d9cb0ba093a3d23", 300 | "data": { 301 | "name": { 302 | "default": "500 Calle De Lago" 303 | }, 304 | "phrase": { 305 | "default": "500 Calle De Lago" 306 | }, 307 | "address_parts": { 308 | "number": "500", 309 | "street": "Calle De Lago" 310 | }, 311 | "center_point": { 312 | "lon": 92.929292, 313 | "lat": 29.292929 314 | }, 315 | "source": "openaddresses", 316 | "layer": "address", 317 | "source_id": "input_file_2:0d9cb0ba093a3d23" 318 | } 319 | }, 320 | { 321 | "_index": "pelias", 322 | "_id": "openaddresses:address:au/input_file_3:0c0641950f5693a0", 323 | "data": { 324 | "name": { 325 | "default": "10 Smith Street" 326 | }, 327 | "phrase": { 328 | "default": "10 Smith Street" 329 | }, 330 | "address_parts": { 331 | "number": "10", 332 | "street": "Smith Street", 333 | "zip": "input postcode" 334 | }, 335 | "center_point": { 336 | "lon": 144.931874, 337 | "lat": -37.791488 338 | }, 339 | "source": "openaddresses", 340 | "layer": "address", 341 | "source_id": "au/input_file_3:0c0641950f5693a0", 342 | "addendum": { 343 | "concordances": "{\"gnaf:pid\":\"GAVIC718519668\"}" 344 | } 345 | } 346 | }, 347 | { 348 | "_index": "pelias", 349 | "_id": "openaddresses:address:au/input_file_4:2e7dc83e6d7c43b5", 350 | "data": { 351 | "name": { 352 | "default": "244 Barkly Street" 353 | }, 354 | "phrase": { 355 | "default": "244 Barkly Street" 356 | }, 357 | "address_parts": { 358 | "number": "244", 359 | "street": "Barkly Street", 360 | "zip": "3182", 361 | "unit": "10" 362 | }, 363 | "center_point": { 364 | "lon": 144.980414, 365 | "lat": -37.872398 366 | }, 367 | "source": "openaddresses", 368 | "layer": "address", 369 | "source_id": "au/input_file_4:2e7dc83e6d7c43b5" 370 | } 371 | }, 372 | { 373 | "_index": "pelias", 374 | "_id": "openaddresses:address:au/input_file_4:b9d4e479b3787466", 375 | "data": { 376 | "name": { 377 | "default": "10/244-246 Wattletree Road" 378 | }, 379 | "phrase": { 380 | "default": "10/244-246 Wattletree 
Road" 381 | }, 382 | "address_parts": { 383 | "number": "10/244-246", 384 | "street": "Wattletree Road", 385 | "zip": "3144" 386 | }, 387 | "center_point": { 388 | "lon": 145.037872, 389 | "lat": -37.863785 390 | }, 391 | "source": "openaddresses", 392 | "layer": "address", 393 | "source_id": "au/input_file_4:b9d4e479b3787466" 394 | } 395 | }, 396 | { 397 | "_index": "pelias", 398 | "_id": "openaddresses:address:au/input_file_4:5d465daf4228aeae", 399 | "data": { 400 | "name": { 401 | "default": "10/244-246 Mary Street" 402 | }, 403 | "phrase": { 404 | "default": "10/244-246 Mary Street" 405 | }, 406 | "address_parts": { 407 | "number": "10/244-246", 408 | "street": "Mary Street", 409 | "zip": "3121" 410 | }, 411 | "center_point": { 412 | "lon": 145.000381, 413 | "lat": -37.82896 414 | }, 415 | "source": "openaddresses", 416 | "layer": "address", 417 | "source_id": "au/input_file_4:5d465daf4228aeae" 418 | } 419 | }, 420 | { 421 | "_index": "pelias", 422 | "_id": "openaddresses:address:au/input_file_4:19c92a8fab88d851", 423 | "data": { 424 | "name": { 425 | "default": "244 Brunswick Street" 426 | }, 427 | "phrase": { 428 | "default": "244 Brunswick Street" 429 | }, 430 | "address_parts": { 431 | "number": "244", 432 | "street": "Brunswick Street", 433 | "zip": "3065", 434 | "unit": "10" 435 | }, 436 | "center_point": { 437 | "lon": 144.978361, 438 | "lat": -37.80025 439 | }, 440 | "source": "openaddresses", 441 | "layer": "address", 442 | "source_id": "au/input_file_4:19c92a8fab88d851" 443 | } 444 | }, 445 | { 446 | "_index": "pelias", 447 | "_id": "openaddresses:address:au/input_file_4:2d93ffa5b82d5815", 448 | "data": { 449 | "name": { 450 | "default": "244 Dorcas Street" 451 | }, 452 | "phrase": { 453 | "default": "244 Dorcas Street" 454 | }, 455 | "address_parts": { 456 | "number": "244", 457 | "street": "Dorcas Street", 458 | "zip": "3205", 459 | "unit": "110" 460 | }, 461 | "center_point": { 462 | "lon": 144.959162, 463 | "lat": -37.83319 464 | }, 465 | 
"source": "openaddresses", 466 | "layer": "address", 467 | "source_id": "au/input_file_4:2d93ffa5b82d5815" 468 | } 469 | }, 470 | { 471 | "_index": "pelias", 472 | "_id": "openaddresses:address:au/input_file_4:a6e27ffafd14a0da", 473 | "data": { 474 | "name": { 475 | "default": "244 Dorcas Street" 476 | }, 477 | "phrase": { 478 | "default": "244 Dorcas Street" 479 | }, 480 | "address_parts": { 481 | "number": "244", 482 | "street": "Dorcas Street", 483 | "zip": "3205", 484 | "unit": "210" 485 | }, 486 | "center_point": { 487 | "lon": 144.959162, 488 | "lat": -37.83319 489 | }, 490 | "source": "openaddresses", 491 | "layer": "address", 492 | "source_id": "au/input_file_4:a6e27ffafd14a0da" 493 | } 494 | }, 495 | { 496 | "_index": "pelias", 497 | "_id": "openaddresses:address:au/input_file_4:4c18bc2fab1af1ee", 498 | "data": { 499 | "name": { 500 | "default": "244 Dorcas Street" 501 | }, 502 | "phrase": { 503 | "default": "244 Dorcas Street" 504 | }, 505 | "address_parts": { 506 | "number": "244", 507 | "street": "Dorcas Street", 508 | "zip": "3205", 509 | "unit": "310" 510 | }, 511 | "center_point": { 512 | "lon": 144.959162, 513 | "lat": -37.83319 514 | }, 515 | "source": "openaddresses", 516 | "layer": "address", 517 | "source_id": "au/input_file_4:4c18bc2fab1af1ee" 518 | } 519 | }, 520 | { 521 | "_index": "pelias", 522 | "_id": "openaddresses:address:au/input_file_4:d5236248ba736eba", 523 | "data": { 524 | "name": { 525 | "default": "244 Dorcas Street" 526 | }, 527 | "phrase": { 528 | "default": "244 Dorcas Street" 529 | }, 530 | "address_parts": { 531 | "number": "244", 532 | "street": "Dorcas Street", 533 | "zip": "3205", 534 | "unit": "410" 535 | }, 536 | "center_point": { 537 | "lon": 144.959162, 538 | "lat": -37.83319 539 | }, 540 | "source": "openaddresses", 541 | "layer": "address", 542 | "source_id": "au/input_file_4:d5236248ba736eba" 543 | } 544 | }, 545 | { 546 | "_index": "pelias", 547 | "_id": "openaddresses:address:au/input_file_4:7dbcebf7bd632ef8", 548 
| "data": { 549 | "name": { 550 | "default": "244 Dorcas Street" 551 | }, 552 | "phrase": { 553 | "default": "244 Dorcas Street" 554 | }, 555 | "address_parts": { 556 | "number": "244", 557 | "street": "Dorcas Street", 558 | "zip": "3205", 559 | "unit": "510" 560 | }, 561 | "center_point": { 562 | "lon": 144.959162, 563 | "lat": -37.83319 564 | }, 565 | "source": "openaddresses", 566 | "layer": "address", 567 | "source_id": "au/input_file_4:7dbcebf7bd632ef8" 568 | } 569 | }, 570 | { 571 | "_index": "pelias", 572 | "_id": "openaddresses:address:au/input_file_4:822280a9b8a92d85", 573 | "data": { 574 | "name": { 575 | "default": "244 Pascoe Vale Road" 576 | }, 577 | "phrase": { 578 | "default": "244 Pascoe Vale Road" 579 | }, 580 | "address_parts": { 581 | "number": "244", 582 | "street": "Pascoe Vale Road", 583 | "zip": "3040", 584 | "unit": "10" 585 | }, 586 | "center_point": { 587 | "lon": 144.925714, 588 | "lat": -37.751689 589 | }, 590 | "source": "openaddresses", 591 | "layer": "address", 592 | "source_id": "au/input_file_4:822280a9b8a92d85" 593 | } 594 | } 595 | ] -------------------------------------------------------------------------------- /test/data/input_file_1.csv: -------------------------------------------------------------------------------- 1 | LON,LAT,NUMBER,STREET,UNIT,CITY,DISTRICT,REGION,POSTCODE,ID 2 | 21.212121,12.121212,100,Main St,,input city,input district,input region,input postcode,GOOD RECORD 3 | 31.313131,13.131313, 200 , Main St ,,,,,,GOOD RECORD WITH FIELD TRIMMING 4 | 41.414141,14.141414,0,Main St,,,,,,WILL BE LOGGED BUT NOT SKIPPED (NUMBER IS REDUCEABLE TO 0) 5 | 51.515151,15.151515,00,Elm St,,,,,,WILL BE LOGGED BUT NOT SKIPPED (NUMBER IS REDUCEABLE TO 0) 6 | 61.616161,16.161616,00300,Main St,,,,,,MAINTAINS LEADING ZEROES 7 | -------------------------------------------------------------------------------- /test/data/input_file_2.csv: -------------------------------------------------------------------------------- 1 | 
LON,LAT,NUMBER,STREET,UNIT,CITY,DISTRICT,REGION,POSTCODE,ID 2 | 71.717171,17.171717,400,Vireo Rd,,,,,,GOOD RECORD 3 | 81.818181,18.181818,00000,Vireo Rd,,,,,,WILL BE LOGGED BUT NOT SKIPPED (NUMBER IS REDUCEABLE TO 0) 4 | 91.919191,19.191919,00500,Calle de Lago,,,,,,MAINTAINS LEADING ZEROES 5 | 92.929292,29.292929,00500,Calle de Lago,,,,,,THIS GETS FILTERED OUT BY DEDUPE 6 | -------------------------------------------------------------------------------- /test/functional.js: -------------------------------------------------------------------------------- 1 | require( './importPipeline' ); 2 | -------------------------------------------------------------------------------- /test/import.js: -------------------------------------------------------------------------------- 1 | const tape = require( 'tape' ); 2 | 3 | const proxyquire = require('proxyquire').noCallThru(); 4 | 5 | tape( 'config.generate throwing error should rethrow', (test) => { 6 | test.throws(() => { 7 | proxyquire('../import', { 8 | './schema': 'this is the schema', 9 | 'pelias-config': { 10 | generate: (schema) => { 11 | // the schema passed to generate should be the require'd schema 12 | test.equals(schema, 'this is the schema'); 13 | 14 | throw Error('config is not valid'); 15 | } 16 | } 17 | })(); 18 | 19 | }, /config is not valid/); 20 | 21 | test.end(); 22 | 23 | }); 24 | -------------------------------------------------------------------------------- /test/importPipeline.js: -------------------------------------------------------------------------------- 1 | const _ = require('lodash'); 2 | const path = require('path'); 3 | const tape = require('tape'); 4 | const map = require('through2-map'); 5 | const proxyquire = require('proxyquire'); 6 | const stream_mock = require('stream-mock'); 7 | 8 | const expectedPath = path.join(__dirname, 'data/expected.json'); 9 | const expected = require(expectedPath); 10 | 11 | tape('functional test of importing four small OA files', function(t) { 12 | // expect 
two assertions, one for the error and one for the data 13 | t.plan(2); 14 | 15 | const assert = (err, actual) => { 16 | // uncomment this to write the actual results to the expected file 17 | // make sure they look ok though. comma left off so jshint reminds you 18 | // not to commit this line 19 | // require('fs').writeFileSync(expectedPath, JSON.stringify(actual, null, 2)) 20 | 21 | t.error(err); 22 | t.deepEquals(actual, expected); 23 | t.end(); 24 | }; 25 | 26 | const importPipeline = proxyquire('../lib/importPipeline', { 27 | 'pelias-dbclient': () => { 28 | const dbclient = new stream_mock.ObjectWritableMock(); 29 | dbclient.on('error', (e) => assert(e)); 30 | dbclient.on('finish', () => assert(null, dbclient.data)); 31 | return dbclient; 32 | } 33 | }); 34 | 35 | // mock admin lookup stream to show that input file admin values are ignored 36 | // and replaced with overrides from adminLookup 37 | const adminLookupStream = map.obj((record) => { 38 | // we're only concerned about one record being modified 39 | if (_.isEqual(record.center_point, { lat: 12.121212, lon: 21.212121})) { 40 | record.addParent('country', 'override country', '1'); 41 | record.addParent('macroregion', 'override macroregion', '2'); 42 | record.addParent('region', 'override region', '3'); 43 | record.addParent('macrocounty', 'override macrocounty', '4'); 44 | record.addParent('county', 'override county', '5'); 45 | record.addParent('borough', 'override borough', '6'); 46 | record.addParent('locality', 'override locality', '7'); 47 | record.addParent('localadmin', 'override localadmin', '8'); 48 | record.addParent('neighbourhood', 'override neighbourhood', '9'); 49 | } 50 | 51 | return record; 52 | }); 53 | 54 | // test fixtures 55 | const dirPath = path.join(__dirname, 'data'); 56 | const inputFiles = [ 57 | path.join(dirPath, 'input_file_1.csv'), 58 | path.join(dirPath, 'input_file_2.csv'), 59 | path.join(dirPath, 'au/input_file_3.csv'), 60 | path.join(dirPath, 'au/input_file_4.csv') 61 | 
]; 62 | 63 | importPipeline.create(inputFiles, dirPath, adminLookupStream); 64 | }); 65 | -------------------------------------------------------------------------------- /test/isValidCsvRecord.js: -------------------------------------------------------------------------------- 1 | var tape = require( 'tape' ); 2 | 3 | var isValidCsvRecord = require( '../lib/isValidCsvRecord' ); 4 | 5 | tape( 'Identifies invalid CSV records.', function ( test ){ 6 | var records = [ 7 | {LON: '1', LAT: '2', STREET: '3', NUMBER: '', FOO: '', SOME_PROP: ''}, 8 | {LON: '', LAT: '2', STREET: '3', NUMBER: '', FOO: '', SOME_PROP: 'something'}, 9 | {LON: '', LAT: '2', STREET: '', NUMBER: '4', SOME_PROP: 'value'} 10 | ]; 11 | records.forEach( function ( rec ){ 12 | test.ok( !isValidCsvRecord( rec ), 'Record identified as invalid' ); 13 | }); 14 | 15 | var validRecord = {LON: '1', LAT: '2', STREET: '3', NUMBER: '4', SOME_PROP: 'abs'}; 16 | test.ok( isValidCsvRecord( validRecord ), 'Record identified as valid.' ); 17 | test.end(); 18 | }); 19 | 20 | tape( 'Identifies CSV files that have incorrect columns', function( test) { 21 | var record = { 'notLat': 'asdf', 'notLon': 5 }; 22 | 23 | test.ok( !isValidCsvRecord( record ), 'Record identified as invalid' ); 24 | test.end(); 25 | }); 26 | 27 | tape('complete record but house number is literal word `null` should return false', function(test) { 28 | var record = { 29 | LON: '1', LAT: '2', NUMBER: 'NuLl', STREET: 'Street' 30 | }; 31 | 32 | test.ok( !isValidCsvRecord(record), 'Record identified as invalid'); 33 | test.end(); 34 | 35 | }); 36 | 37 | tape('complete record but house number is literal word `undefined` should return false', function(test) { 38 | var record = { 39 | LON: '1', LAT: '2', NUMBER: 'uNdEfInEd', STREET: 'Street' 40 | }; 41 | 42 | test.ok( !isValidCsvRecord(record), 'Record identified as invalid'); 43 | test.end(); 44 | 45 | }); 46 | 47 | tape('complete record but house number is literal word `unavailable` should return false', 
function(test) { 48 | var record = { 49 | LON: '1', LAT: '2', NUMBER: 'uNaVaIlAbLe', STREET: 'Street' 50 | }; 51 | 52 | test.ok( !isValidCsvRecord(record), 'Record identified as invalid'); 53 | test.end(); 54 | 55 | }); 56 | 57 | tape('complete record but street contains literal word `null` should return false', function(test) { 58 | var records = [ 59 | { LON: '1', LAT: '2', NUMBER: 'Number', STREET: 'NuLl Name St' }, 60 | { LON: '1', LAT: '2', NUMBER: 'Number', STREET: 'South NULL St' }, 61 | { LON: '1', LAT: '2', NUMBER: 'Number', STREET: 'South Name null' } 62 | ]; 63 | 64 | records.forEach( function ( rec ){ 65 | test.ok( !isValidCsvRecord( rec ), 'Record identified as invalid' ); 66 | }); 67 | 68 | test.end(); 69 | 70 | }); 71 | 72 | tape('complete record but street contains literal word `undefined` should return false', function(test) { 73 | var records = [ 74 | { LON: '1', LAT: '2', NUMBER: 'Number', STREET: 'uNdEfInEd Name St' }, 75 | { LON: '1', LAT: '2', NUMBER: 'Number', STREET: 'South UNDEFINED St' }, 76 | { LON: '1', LAT: '2', NUMBER: 'Number', STREET: 'South Name undefined' } 77 | ]; 78 | 79 | records.forEach( function ( rec ){ 80 | test.ok( !isValidCsvRecord( rec ), 'Record identified as invalid' ); 81 | }); 82 | 83 | test.end(); 84 | 85 | }); 86 | 87 | tape('complete record but street contains literal word `unavailable` should return false', function(test) { 88 | var records = [ 89 | { LON: '1', LAT: '2', NUMBER: 'Number', STREET: 'uNaVaIlAbLe Name St' }, 90 | { LON: '1', LAT: '2', NUMBER: 'Number', STREET: 'South UNAVAILABLE St' }, 91 | { LON: '1', LAT: '2', NUMBER: 'Number', STREET: 'South Name unavailable' } 92 | ]; 93 | 94 | records.forEach( function ( rec ){ 95 | test.ok( !isValidCsvRecord( rec ), 'Record identified as invalid' ); 96 | }); 97 | 98 | test.end(); 99 | 100 | }); 101 | 102 | tape('street with substring `null` but not on word boundary should return true', function(test) { 103 | var record = { 104 | LON: '1', LAT: '2', NUMBER: 
'Number', STREET: 'Snull Street Nulls' 105 | }; 106 | 107 | test.ok( isValidCsvRecord(record), 'Record identified as valid'); 108 | test.end(); 109 | 110 | }); 111 | 112 | tape('street with substring `undefined` but not on word boundary should return true', function(test) { 113 | var record = { 114 | LON: '1', LAT: '2', NUMBER: 'Number', STREET: 'Sundefined Street Undefineds' 115 | }; 116 | 117 | test.ok( isValidCsvRecord(record), 'Record identified as valid'); 118 | test.end(); 119 | 120 | }); 121 | 122 | tape('street with substring `unavailable` but not on word boundary should return true', function(test) { 123 | var record = { 124 | LON: '1', LAT: '2', NUMBER: 'Number', STREET: 'Sunavailable Street Unavailables' 125 | }; 126 | 127 | test.ok( isValidCsvRecord(record), 'Record identified as valid'); 128 | test.end(); 129 | 130 | }); 131 | 132 | tape('record with lon/lat parseable as 0/0 should return false', test => { 133 | const record = { 134 | LON: '0.000000', 135 | LAT: '0.000000', 136 | NUMBER: 'Number', 137 | STREET: 'Street' 138 | }; 139 | 140 | test.notOk( isValidCsvRecord(record), 'should be rejected'); 141 | test.end(); 142 | 143 | }); 144 | 145 | tape('record with lon/lat parseable as 0/non-0 should return true', test => { 146 | const record = { 147 | LON: '0.0000', 148 | LAT: '0.0006', 149 | NUMBER: 'Number', 150 | STREET: 'Street' 151 | }; 152 | 153 | test.ok( isValidCsvRecord(record), 'should be accepted'); 154 | test.end(); 155 | 156 | }); 157 | 158 | tape('record with lon/lat parseable as non-0/0 should return true', test => { 159 | const record = { 160 | LON: '0.0006', 161 | LAT: '0.0000', 162 | NUMBER: 'Number', 163 | STREET: 'Street' 164 | }; 165 | 166 | test.ok( isValidCsvRecord(record), 'should be accepted'); 167 | test.end(); 168 | 169 | }); 170 | 171 | tape('record with lon/lat very close to 0,0 should return false', test => { 172 | const record = { 173 | LON: '0.000000', 174 | LAT: '0.000001', 175 | NUMBER: 'Number', 176 | STREET: 'Street' 
177 | }; 178 | 179 | test.notOk(isValidCsvRecord(record), 'should not be accepted - too near to 0,0'); 180 | test.end(); 181 | 182 | }); 183 | 184 | tape('record with lon/lat very close to 0,0 should return false', test => { 185 | const record = { 186 | LON: '0.000001', 187 | LAT: '0.000000', 188 | NUMBER: 'Number', 189 | STREET: 'Street' 190 | }; 191 | 192 | test.notOk(isValidCsvRecord(record), 'should not be accepted - too near to 0,0'); 193 | test.end(); 194 | 195 | }); 196 | -------------------------------------------------------------------------------- /test/openaddresses_bad_data.csv: -------------------------------------------------------------------------------- 1 | LON,LAT,NUMBER,STREET,FOOBAR,DEADBEEF 2 | ,-40,a,b,, 3 | -40,,a,b,, 4 | ,,,,, 5 | ,,30,b,, 6 | 40,40,a,,, 7 | ,,,30,,, 8 | ,..........,,,,,,,,,,, 9 | 40,40,5, ,, 10 | 40,40,5," ",, 11 | -------------------------------------------------------------------------------- /test/openaddresses_sample.csv: -------------------------------------------------------------------------------- 1 | LON, LAT, NUMBER, STREET,FOOOOBAR 2 | -118.0170157,55.546026835788886,23042,Twp Road 755 A,,,, 3 | -118.75318353,55.14959214890181,712046,Rge Road 34,, 4 | -118.8218384,55.15506788763259,712078,Rge Road 34,, 5 | -118.79719936,55.153343057595535,712068,Rge Road 34,,,,, 6 | -118.66743097,55.151807043809917,712060,Rge Road 34,,,, 7 | -118.74783569,55.155320792497442,712082,Rge Road 35,,,, 8 | 1,2,number,too many spaces, 9 | 1,2,trim , multiple spaces,,,, 10 | -------------------------------------------------------------------------------- /test/parameters.js: -------------------------------------------------------------------------------- 1 | var tape = require( 'tape' ); 2 | var path = require( 'path' ); 3 | var fs = require('fs'); 4 | 5 | var temp = require( 'temp' ).track(); 6 | 7 | var parameters = require( '../lib/parameters' ); 8 | 9 | tape( 'interpretUserArgs() correctly handles arguments', function ( test ){ 10 | 
var testCase = [ 11 | [ 'test' ], 12 | { dirPath: 'test', 'parallel-count': undefined, 'parallel-id': undefined }, 13 | ]; 14 | 15 | test.deepEqual( 16 | parameters.interpretUserArgs( testCase[ 0 ] ), testCase[ 1 ], 17 | 'Basic arguments case passes.' 18 | ); 19 | 20 | var badArguments = [ 21 | [ 'not an arg', 'some dir' ], 22 | [ 'not an arg', 'some dir' ], 23 | [ 'not a dir' ], 24 | [ 'package.json' ], 25 | ]; 26 | badArguments.forEach( function execTestCase( testCase, ind ){ 27 | var errorObj = parameters.interpretUserArgs( testCase ); 28 | test.ok( 29 | 'exitCode' in errorObj && 'errMessage' in errorObj, 30 | 'Invalid arguments yield an error object: ' + ind 31 | ); 32 | }); 33 | test.end(); 34 | }); 35 | 36 | tape('interpretUserArgs returns given path as dirPath', function(test) { 37 | temp.mkdir('tmpdir', function(err, temporary_dir) { 38 | 39 | var input = [temporary_dir]; 40 | var result = parameters.interpretUserArgs(input); 41 | 42 | test.equal(result.dirPath, temporary_dir, 'path should be equal to specified path'); 43 | test.end(); 44 | }); 45 | }); 46 | 47 | tape('intepretUserArgs normalizes path given as parameter', function(test) { 48 | temp.mkdir('tmpdir', function(err, temporary_dir) { 49 | var input_dir = temporary_dir + path.sep + path.sep; 50 | 51 | var input = [input_dir]; 52 | var result = parameters.interpretUserArgs(input); 53 | 54 | var expected_dir = path.normalize(input_dir); 55 | test.equal(result.dirPath, expected_dir, 'path should be equal to specified path'); 56 | test.end(); 57 | }); 58 | }); 59 | 60 | tape('interpretUserArgs returns dir from pelias config if no dir specified on command line', function(test) { 61 | temp.mkdir('tmpdir2', function(err, temporary_dir) { 62 | var peliasConfig = { 63 | imports: { 64 | openaddresses: { 65 | datapath: temporary_dir 66 | } 67 | } 68 | }; 69 | 70 | var input = []; 71 | var result = parameters.interpretUserArgs(input, peliasConfig); 72 | 73 | test.equal(result.dirPath, temporary_dir, 'path 
should be equal to path from config'); 74 | test.end(); 75 | }); 76 | }); 77 | 78 | tape('interpretUserArgs returns normalized path from config', function(test) { 79 | temp.mkdir('tmpdir2', function(err, temporary_dir) { 80 | var input_dir = path.sep + '.' + temporary_dir; 81 | var peliasConfig = { 82 | imports: { 83 | openaddresses: { 84 | datapath: input_dir 85 | } 86 | } 87 | }; 88 | 89 | var input = []; 90 | var result = parameters.interpretUserArgs(input, peliasConfig); 91 | 92 | var expected_dir = path.normalize(input_dir); 93 | test.equal(result.dirPath, expected_dir, 'path should be equal to path from config'); 94 | test.end(); 95 | }); 96 | }); 97 | 98 | tape('getFileList returns all .csv path names when config has empty files list', function(test) { 99 | temp.mkdir('multipleFiles', function(err, temp_dir) { 100 | // add some files to the data path to be globbed 101 | fs.mkdirSync(path.join(temp_dir, 'dirA')); 102 | fs.writeFileSync(path.join(temp_dir, 'dirA', 'fileA.csv'), ''); 103 | 104 | fs.mkdirSync(path.join(temp_dir, 'dirB')); 105 | fs.writeFileSync(path.join(temp_dir, 'dirB', 'fileB.csv'), ''); 106 | 107 | fs.writeFileSync(path.join(temp_dir, 'fileC.csv'), ''); 108 | 109 | // should not be included since it's not a .csv file 110 | fs.writeFileSync(path.join(temp_dir, 'fileD.txt'), ''); 111 | 112 | var peliasConfig = { 113 | imports: { 114 | openaddresses: { 115 | files: [] 116 | } 117 | } 118 | }; 119 | var args = { 120 | dirPath: temp_dir 121 | }; 122 | 123 | var actual = parameters.getFileList(peliasConfig, args); 124 | 125 | test.equal(actual.length, 3); 126 | test.ok(actual.find((f) => f === path.join(temp_dir, 'dirA', 'fileA.csv'))); 127 | test.ok(actual.find((f) => f === path.join(temp_dir, 'dirB', 'fileB.csv'))); 128 | test.ok(actual.find((f) => f === path.join(temp_dir, 'fileC.csv'))); 129 | test.end(); 130 | 131 | }); 132 | }); 133 | 134 | tape('getFileList returns all .csv path names when config doesn\'t have files property', 
function(test) { 135 | temp.mkdir('multipleFiles', function(err, temp_dir) { 136 | // add some files to the data path to be globbed 137 | fs.mkdirSync(path.join(temp_dir, 'dirA')); 138 | fs.writeFileSync(path.join(temp_dir, 'dirA', 'fileA.csv'), ''); 139 | 140 | fs.mkdirSync(path.join(temp_dir, 'dirB')); 141 | fs.writeFileSync(path.join(temp_dir, 'dirB', 'fileB.csv'), ''); 142 | 143 | fs.writeFileSync(path.join(temp_dir, 'fileC.csv'), ''); 144 | 145 | // should not be included since it's not a .csv file 146 | fs.writeFileSync(path.join(temp_dir, 'fileD.txt'), ''); 147 | 148 | var peliasConfig = { 149 | imports: { 150 | openaddresses: { 151 | } 152 | } 153 | }; 154 | var args = { 155 | dirPath: temp_dir 156 | }; 157 | 158 | var actual = parameters.getFileList(peliasConfig, args); 159 | 160 | test.equal(actual.length, 3); 161 | test.ok(actual.find((f) => f === path.join(temp_dir, 'dirA', 'fileA.csv'))); 162 | test.ok(actual.find((f) => f === path.join(temp_dir, 'dirB', 'fileB.csv'))); 163 | test.ok(actual.find((f) => f === path.join(temp_dir, 'fileC.csv'))); 164 | test.end(); 165 | 166 | }); 167 | }); 168 | 169 | tape('getFileList returns fully qualified path names when config has a files list', function(test) { 170 | temp.mkdir('multipleFiles', function(err, temporary_dir) { 171 | var peliasConfig = { 172 | imports: { 173 | openaddresses: { 174 | files: ['filea.csv', 'fileb.csv'] 175 | } 176 | } 177 | }; 178 | var args = { 179 | dirPath: temporary_dir 180 | }; 181 | 182 | var expected = [path.join(temporary_dir, 'filea.csv'), path.join(temporary_dir, 'fileb.csv')]; 183 | 184 | var actual = parameters.getFileList(peliasConfig, args); 185 | 186 | test.deepEqual(actual, expected, 'file names should be equal'); 187 | test.end(); 188 | }); 189 | }); 190 | 191 | tape('getFileList handles parallel builds', function(test) { 192 | var peliasConfig = { 193 | imports: { 194 | openaddresses: { 195 | files: ['filea.csv', 'fileb.csv', 'filec.csv'] 196 | } 197 | } 198 | }; 199 | 
200 | temp.mkdir('parallelBuilds', function(err, temporary_dir) { 201 | test.test('3 workers, id 0', function(t) { 202 | var args = { 203 | dirPath: temporary_dir, 204 | 'parallel-count': 3, 205 | 'parallel-id': 0 206 | }; 207 | 208 | var expected = [path.join(temporary_dir, 'filea.csv')]; 209 | 210 | var actual = parameters.getFileList(peliasConfig, args); 211 | 212 | t.deepEqual(actual, expected, 'only first file is indexed'); 213 | t.end(); 214 | }); 215 | 216 | test.test('3 workers, id 1', function(t) { 217 | var args = { 218 | dirPath: temporary_dir, 219 | 'parallel-count': 3, 220 | 'parallel-id': 1 221 | }; 222 | 223 | var expected = [path.join(temporary_dir, 'fileb.csv')]; 224 | 225 | var actual = parameters.getFileList(peliasConfig, args); 226 | 227 | t.deepEqual(actual, expected, 'only second file indexed'); 228 | t.end(); 229 | }); 230 | 231 | test.test('3 workers, id 2', function(t) { 232 | var args = { 233 | dirPath: temporary_dir, 234 | 'parallel-count': 3, 235 | 'parallel-id': 2 236 | }; 237 | 238 | var expected = [path.join(temporary_dir, 'filec.csv')]; 239 | 240 | var actual = parameters.getFileList(peliasConfig, args); 241 | 242 | t.deepEqual(actual, expected, 'only third file indexed'); 243 | t.end(); 244 | }); 245 | 246 | test.test('3 workers, id 3', function(t) { 247 | var args = { 248 | dirPath: temporary_dir, 249 | 'parallel-count': 3, 250 | 'parallel-id': 3 251 | }; 252 | 253 | var expected = []; 254 | 255 | var actual = parameters.getFileList(peliasConfig, args); 256 | 257 | t.deepEqual(actual, expected, 'file list is empty'); 258 | t.end(); 259 | }); 260 | }); 261 | }); 262 | -------------------------------------------------------------------------------- /test/schema.js: -------------------------------------------------------------------------------- 1 | const tape = require( 'tape' ); 2 | const schema = require( '../schema' ); 3 | 4 | function validate(config) { 5 | const result = schema.validate(config); 6 | if (result.error) { 7 | throw 
new Error(result.error.details[0].message); 8 | } 9 | } 10 | 11 | tape('missing imports should throw error', function(test) { 12 | const config = {}; 13 | 14 | test.throws(validate.bind(null, config), /"imports" is required/); 15 | test.end(); 16 | 17 | }); 18 | 19 | tape('non-object imports should throw error', function(test) { 20 | [null, 17, 'string', [], true].forEach((value) => { 21 | const config = { 22 | imports: value 23 | }; 24 | 25 | test.throws(validate.bind(null, config), /"imports" must be of type object/); 26 | }); 27 | 28 | test.end(); 29 | 30 | }); 31 | 32 | tape('missing imports.openaddresses should throw error', function(test) { 33 | const config = { 34 | imports: { 35 | } 36 | }; 37 | 38 | test.throws(validate.bind(null, config), /"imports.openaddresses" is required/); 39 | test.end(); 40 | 41 | }); 42 | 43 | tape('non-object imports.openaddresses should throw error', function(test) { 44 | [null, 17, 'string', [], true].forEach((value) => { 45 | const config = { 46 | imports: { 47 | openaddresses: value 48 | } 49 | }; 50 | 51 | test.throws(validate.bind(null, config), /"imports.openaddresses" must be of type object/); 52 | }); 53 | 54 | test.end(); 55 | 56 | }); 57 | 58 | tape( 'missing datapath should throw error', function(test) { 59 | const config = { 60 | imports: { 61 | openaddresses: {} 62 | } 63 | }; 64 | 65 | test.throws(validate.bind(null, config), /"imports.openaddresses.datapath" is required/); 66 | test.end(); 67 | 68 | }); 69 | 70 | tape( 'non-string datapath should throw error', function(test) { 71 | [null, 17, {}, [], false].forEach((value) => { 72 | const config = { 73 | imports: { 74 | openaddresses: { 75 | datapath: value 76 | } 77 | } 78 | }; 79 | 80 | test.throws(validate.bind(null, config), /"imports.openaddresses.datapath" must be a string/); 81 | 82 | }); 83 | 84 | test.end(); 85 | }); 86 | 87 | tape( 'non-array files should throw error', function(test) { 88 | [null, 17, {}, 'string', false].forEach((value) => { 89 | const 
config = { 90 | imports: { 91 | openaddresses: { 92 | datapath: 'this is the datapath', 93 | files: value 94 | } 95 | } 96 | }; 97 | 98 | test.throws(validate.bind(null, config), /"imports.openaddresses.files" must be an array/); 99 | }); 100 | 101 | test.end(); 102 | }); 103 | 104 | tape( 'non-string elements in files array should throw error', function(test) { 105 | [null, 17, {}, [], false].forEach((value) => { 106 | const config = { 107 | imports: { 108 | openaddresses: { 109 | datapath: 'this is the datapath', 110 | files: [value] 111 | } 112 | } 113 | }; 114 | 115 | test.throws(validate.bind(null, config), 116 | /"imports.openaddresses.files\[0\]" must be a string/, 'files elements must be strings'); 117 | }); 118 | 119 | test.end(); 120 | }); 121 | 122 | tape( 'non-boolean adminLookup should throw error', function(test) { 123 | [null, 17, {}, [], 'string'].forEach((value) => { 124 | const config = { 125 | imports: { 126 | openaddresses: { 127 | datapath: 'this is the datapath', 128 | adminLookup: value 129 | } 130 | } 131 | }; 132 | 133 | test.throws(validate.bind(null, config), 134 | /"imports.openaddresses.adminLookup" must be a boolean/); 135 | }); 136 | 137 | test.end(); 138 | }); 139 | 140 | tape( 'unknown config fields should throw error', function(test) { 141 | const config = { 142 | imports: { 143 | openaddresses: { 144 | datapath: 'this is the datapath', 145 | token: 'abc', 146 | unknown: 'value' 147 | } 148 | } 149 | }; 150 | 151 | test.throws(validate.bind(null, config), 152 | /"imports.openaddresses.unknown" is not allowed/, 'unknown fields should be disallowed'); 153 | test.end(); 154 | 155 | }); 156 | 157 | tape( 'configuration with only datapath & token should not throw error', function(test) { 158 | const config = { 159 | imports: { 160 | openaddresses: { 161 | datapath: 'this is the datapath', 162 | token: 'abc' 163 | } 164 | } 165 | }; 166 | 167 | test.doesNotThrow(validate.bind(null, config), 'config should be valid'); 168 | test.end(); 
169 | 170 | }); 171 | 172 | tape( 'valid configuration should not throw error', function(test) { 173 | const config = { 174 | imports: { 175 | openaddresses: { 176 | datapath: 'this is the datapath', 177 | token: 'abc', 178 | adminLookup: false, 179 | files: ['file 1', 'file 2'] 180 | } 181 | } 182 | }; 183 | 184 | test.doesNotThrow(validate.bind(null, config), 'config should be valid'); 185 | test.end(); 186 | 187 | }); 188 | 189 | tape( 'unknown children of imports should not throw error', function(test) { 190 | const config = { 191 | imports: { 192 | openaddresses: { 193 | datapath: 'this is the datapath', 194 | token: 'abc', 195 | adminLookup: false, 196 | files: ['file 1', 'file 2'] 197 | }, 198 | other: {} 199 | } 200 | }; 201 | 202 | test.doesNotThrow(validate.bind(null, config), 'config should be valid'); 203 | test.end(); 204 | 205 | }); 206 | 207 | tape( 'unknown children of root should not throw error', function(test) { 208 | const config = { 209 | imports: { 210 | openaddresses: { 211 | datapath: 'this is the datapath', 212 | token: 'abc', 213 | adminLookup: false, 214 | files: ['file 1', 'file 2'] 215 | } 216 | }, 217 | other: {} 218 | }; 219 | 220 | test.doesNotThrow(validate.bind(null, config), 'config should be valid'); 221 | test.end(); 222 | 223 | }); 224 | -------------------------------------------------------------------------------- /test/streams/cleanupStream.js: -------------------------------------------------------------------------------- 1 | var tape = require( 'tape' ); 2 | 3 | var CleanupStream = require( '../../lib/streams/cleanupStream' ); 4 | 5 | const stream_mock = require('stream-mock'); 6 | 7 | function test_stream(input, testedStream, callback) { 8 | const reader = new stream_mock.ObjectReadableMock(input); 9 | const writer = new stream_mock.ObjectWritableMock(); 10 | writer.on('error', (e) => callback(e)); 11 | writer.on('finish', () => callback(null, writer.data)); 12 | reader.pipe(testedStream).pipe(writer); 13 | } 14 | 15 | 
tape( 'cleanupStream trims whitespace from all fields', function(test) { 16 | var input = { 17 | NUMBER: '5 ', 18 | STREET: ' Abcd ', 19 | LAT: 5, 20 | LON: 6, 21 | POSTCODE: ' def ' 22 | }; 23 | 24 | var cleanupStream = CleanupStream.create({ countryCode: 'us' }); 25 | 26 | test_stream([input], cleanupStream, function(err, records) { 27 | test.equal(records.length, 1, 'stream length unchanged'); 28 | 29 | var record = records[0]; 30 | test.equal(record.NUMBER, '5', 'NUMBER field is trimmed'); 31 | test.equal(record.STREET, 'Abcd', 'STREET field is trimmed'); 32 | test.equal(record.POSTCODE, 'def', 'POSTCODE field is trimmed'); 33 | test.end(); 34 | }); 35 | }); 36 | 37 | tape( 'cleanupStream does NOT trim leading 0\'s from house numbers', function(test) { 38 | var inputs = [ 39 | { 40 | NUMBER: ' 0030 ', 41 | STREET: 'Street' 42 | }, 43 | { 44 | NUMBER: '0034560', 45 | STREET: 'Street' 46 | }, 47 | { 48 | NUMBER: '12340', 49 | STREET: 'Street' 50 | } 51 | ]; 52 | 53 | var expecteds = [ 54 | { 55 | NUMBER: '0030', 56 | STREET: 'Street' 57 | }, 58 | { 59 | NUMBER: '0034560', 60 | STREET: 'Street' 61 | }, 62 | { 63 | NUMBER: '12340', 64 | STREET: 'Street' 65 | } 66 | ]; 67 | 68 | var cleanupStream = CleanupStream.create({ countryCode: 'us' }); 69 | 70 | test_stream(inputs, cleanupStream, function(err, actual) { 71 | test.deepEqual(actual, expecteds, 'leading 0\'s should not have been trimmed from NUMBER'); 72 | test.end(); 73 | }); 74 | 75 | }); 76 | 77 | tape ( 'cleanupStream trims white space in street field', function(test){ 78 | var input = { 79 | STREET: '34 West\t 93rd \nSt' 80 | }; 81 | 82 | var cleanupStream = CleanupStream.create({ countryCode: 'us' }); 83 | 84 | test_stream([input],cleanupStream, function(err,records){ 85 | test.equal(records.length, 1, 'stream length unchanged'); 86 | test.equal(records[0].STREET, '34 West 93rd Street'); 87 | test.end(); 88 | }); 89 | }); 90 | 91 | tape( 'cleanupStream converts all-caps street names to Title Case', 
function(test){ 92 | var inputs = [{ 93 | NUMBER: '88', 94 | STREET: 'GLASGOW STREET' 95 | }, 96 | { 97 | NUMBER: '76', 98 | STREET : 'McCallister Street' //already capitalized street should be unchanged 99 | }, 100 | { 101 | NUMBER: '9923736', 102 | STREET: 'Macalester Street'//should also be unchanged 103 | }, 104 | { 105 | NUMBER: '314', 106 | STREET: 'timid street' //should capitalize first letter of each word 107 | }, 108 | { 109 | NUMBER: '4', 110 | STREET: 'é' 111 | }, 112 | { 113 | NUMBER: '9', 114 | STREET: '丁目' 115 | }]; 116 | var expecteds = [{ 117 | NUMBER: '88', 118 | STREET: 'Glasgow Street' 119 | }, 120 | { 121 | NUMBER: '76', 122 | STREET : 'McCallister Street' //already capitalized street should be unchanged 123 | }, 124 | { 125 | NUMBER: '9923736', 126 | STREET: 'Macalester Street'//should also be unchanged 127 | }, 128 | { 129 | NUMBER: '314', 130 | STREET: 'Timid Street' //should capitalize first letter of each word 131 | }, 132 | { 133 | NUMBER: '4', 134 | STREET: 'É' //should only capitalize when more than one char 135 | }, 136 | { 137 | NUMBER: '9', 138 | STREET: '丁目' //should handle non-latin characters 139 | }]; 140 | 141 | var cleanupStream = CleanupStream.create({ countryCode: 'us' }); 142 | 143 | test_stream(inputs,cleanupStream,function(err,actual){ 144 | test.deepEqual(actual, expecteds,'we expect proper capitalization'); 145 | test.end(); 146 | }); 147 | }); 148 | 149 | tape( 'cleanupStream expands directionals.', function(test){ 150 | var inputs = [{ 151 | NUMBER: '88', 152 | STREET: 'North East Glasgow Street' 153 | }, 154 | { 155 | NUMBER: '76', 156 | STREET: 'South West McCallister Street' 157 | }, 158 | { 159 | NUMBER: '9923736', 160 | STREET: 'Serenity Street'//should be unchanged even though the start matches a directional 161 | }]; 162 | var expecteds = [{ 163 | NUMBER: '88', 164 | STREET: 'North East Glasgow Street' 165 | }, 166 | { 167 | NUMBER: '76', 168 | STREET : 'South West McCallister Street' 169 | }, 170 | { 171 | 
NUMBER: '9923736', 172 | STREET: 'Serenity Street'//should also be unchanged 173 | }]; 174 | 175 | var cleanupStream = CleanupStream.create({ countryCode: 'us' }); 176 | 177 | test_stream(inputs,cleanupStream,function(err,actual){ 178 | test.deepEqual(actual, expecteds,'we expect proper capitalization of street directionals'); 179 | test.end(); 180 | }); 181 | }); 182 | -------------------------------------------------------------------------------- /test/streams/contentHashStream.js: -------------------------------------------------------------------------------- 1 | const tape = require('tape'); 2 | const stream_mock = require('stream-mock'); 3 | const ContentHashStream = require('../../lib/streams/contentHashStream'); 4 | const hash = ContentHashStream.hash; 5 | const DEFAULT_HASH = 'ca9c491ac66b2c62'; 6 | 7 | function test_stream(input, testedStream, callback) { 8 | const reader = new stream_mock.ObjectReadableMock(input); 9 | const writer = new stream_mock.ObjectWritableMock(); 10 | writer.on('error', (e) => callback(e)); 11 | writer.on('finish', () => callback(null, writer.data)); 12 | reader.pipe(testedStream).pipe(writer); 13 | } 14 | 15 | tape('contentHashStream generates new hash', function (test) { 16 | var input = { 17 | NUMBER: '5', 18 | STREET: 'Abcd', 19 | LAT: 5, 20 | LON: 6, 21 | POSTCODE: 'def' 22 | }; 23 | 24 | var contentHashStream = ContentHashStream.create(); 25 | 26 | test_stream([input], contentHashStream, function (err, records) { 27 | test.equal(records.length, 1, 'stream length unchanged'); 28 | 29 | var record = records[0]; 30 | test.equal(record.HASH, 'f44048507e8fb319', 'HASH field generated'); 31 | test.end(); 32 | }); 33 | }); 34 | 35 | tape('contentHashStream replaces existing hash', function (test) { 36 | var input = { 37 | NUMBER: '5 ', 38 | STREET: ' Abcd ', 39 | LAT: 5, 40 | LON: 6, 41 | POSTCODE: ' def ', 42 | HASH: '54830a0a5bbbca8f' 43 | }; 44 | 45 | var contentHashStream = ContentHashStream.create(); 46 | 47 | 
test_stream([input], contentHashStream, function (err, records) { 48 | test.equal(records.length, 1, 'stream length unchanged'); 49 | 50 | var record = records[0]; 51 | test.equal(record.HASH, 'f44048507e8fb319', 'HASH field generated'); 52 | test.end(); 53 | }); 54 | }); 55 | 56 | tape('hash: default value for non-object and empty objects', function (test) { 57 | test.equal(hash(null), DEFAULT_HASH, 'default hash'); 58 | test.equal(hash(1), DEFAULT_HASH, 'default hash'); 59 | test.equal(hash(false), DEFAULT_HASH, 'default hash'); 60 | test.equal(hash('string'), DEFAULT_HASH, 'default hash'); 61 | test.equal(hash([]), DEFAULT_HASH, 'default hash'); 62 | test.equal(hash({}), DEFAULT_HASH, 'default hash'); 63 | test.end(); 64 | }); 65 | 66 | tape('hash: 16 lowercase hexidecimal chars', function (test) { 67 | const conform = /^[0-9a-f]{16}$/; 68 | for( let i=-90.0; i<+90.0; i+=0.5 ){ 69 | let h = hash({ LON: i, LAT: i }); 70 | test.true(conform.test(h), h); 71 | } 72 | test.end(); 73 | }); 74 | 75 | tape('hash: strict equality', function (test) { 76 | test.equal( 77 | hash({ LON: '1.1', LAT: '2.2' }), 78 | hash({ LON: '1.1', LAT: '2.2' }) 79 | ); 80 | test.equal( 81 | hash({ LON: '1.1', LAT: '2.2', STREET: 'A ST' }), 82 | hash({ LON: '1.1', LAT: '2.2', STREET: 'A ST' }) 83 | ); 84 | test.equal( 85 | hash({ LON: '1.1', LAT: '2.2', STREET: 'A ST', NUMBER: '10' }), 86 | hash({ LON: '1.1', LAT: '2.2', STREET: 'A ST', NUMBER: '10' }) 87 | ); 88 | test.equal( 89 | hash({ LON: '1.1', LAT: '2.2', STREET: 'A ST', NUMBER: '10', UNIT: '6B' }), 90 | hash({ LON: '1.1', LAT: '2.2', STREET: 'A ST', NUMBER: '10', UNIT: '6B' }) 91 | ); 92 | test.end(); 93 | }); 94 | 95 | tape('hash: ingore existing hash field', function (test) { 96 | test.equal( 97 | hash({ LON: '1.1', LAT: '2.2', HASH: 'c2f8c35aa279ee7d' }), 98 | hash({ LON: '1.1', LAT: '2.2', HASH: 'deadb33fdeadb33f' }) 99 | ); 100 | test.end(); 101 | }); 102 | 103 | tape('hash: fuzzy equality', function (test) { 104 | test.equal( 
105 | hash({ STREET: 'A ST' }), 106 | hash({ STREET: 'a st' }), 107 | 'value case' 108 | ); 109 | test.equal( 110 | hash({ STREET: 'A ST' }), 111 | hash({ STREET: ' A ST ' }), 112 | 'value whitespace' 113 | ); 114 | test.equal( 115 | hash({ STREET: 1 }), 116 | hash({ STREET: '1' }), 117 | 'value type' 118 | ); 119 | test.equal( 120 | hash({ LON: 1.123456789 }), 121 | hash({ LON: 1.1234567 }), 122 | 'float precision' 123 | ); 124 | test.equal( 125 | hash({ LON: 1.12000000000 }), 126 | hash({ LON: 1.12 }), 127 | 'float precision' 128 | ); 129 | test.equal( 130 | hash({ LON: -1.000000000000 }), 131 | hash({ LON: -1 }), 132 | 'float precision' 133 | ); 134 | test.equal( 135 | hash({ LON: 0 }), 136 | hash({ LON: -0 }), 137 | 'negative zero' 138 | ); 139 | test.end(); 140 | }); 141 | 142 | tape('hash: strict inequality', function (test) { 143 | test.notEqual( 144 | hash({ LON: '1.1', LAT: '2.2', STREET: 'A ST', NUMBER: '10', UNIT: '6B' }), 145 | hash({ LON: '1.1', LAT: '2.2', STREET: 'A ST', NUMBER: '10', UNIT: '6' }) 146 | ); 147 | test.notEqual( 148 | hash({ LON: '1.1', LAT: '2.2', STREET: 'A ST', NUMBER: '10' }), 149 | hash({ LON: '1.1', LAT: '2.2', STREET: 'A ST', NUMBER: '11' }) 150 | ); 151 | test.notEqual( 152 | hash({ LON: '1.1', LAT: '2.2', STREET: 'A ST' }), 153 | hash({ LON: '1.1', LAT: '2.2', STREET: 'A RD' }) 154 | ); 155 | test.notEqual( 156 | hash({ LON: '1.1', LAT: '2.2' }), 157 | hash({ LON: '1.1', LAT: '2.1' }) 158 | ); 159 | test.notEqual( 160 | hash({ LON: '1.1' }), 161 | hash({ LON: '-1.1' }) 162 | ); 163 | test.notEqual( 164 | hash({ NUMBER: '10' }), 165 | hash({ UNIT: '10' }) 166 | ); 167 | test.end(); 168 | }); 169 | -------------------------------------------------------------------------------- /test/streams/documentStream.js: -------------------------------------------------------------------------------- 1 | const tape = require( 'tape' ); 2 | 3 | const stream_mock = require('stream-mock'); 4 | 5 | const DocumentStream = require( 
'../../lib/streams/documentStream' ); 6 | 7 | function test_stream(input, testedStream, callback) { 8 | const reader = new stream_mock.ObjectReadableMock(input); 9 | const writer = new stream_mock.ObjectWritableMock(); 10 | writer.on('error', (e) => callback(e)); 11 | writer.on('finish', () => callback(null, writer.data)); 12 | reader.pipe(testedStream).pipe(writer); 13 | } 14 | 15 | tape( 'documentStream catches records with no street', function(test) { 16 | const input = { 17 | NUMBER: 5 18 | }; 19 | const stats = { badRecordCount: 0 }; 20 | const documentStream = DocumentStream.create('prefix', stats); 21 | 22 | test_stream([input], documentStream, function(err, actual) { 23 | test.equal(actual.length, 0, 'no documents should be pushed' ); 24 | test.equal(stats.badRecordCount, 1, 'bad record count updated'); 25 | test.end(); 26 | }); 27 | }); 28 | 29 | tape( 'documentStream does not set zipcode if zipcode is emptystring', function(test) { 30 | const input = { 31 | NUMBER: '5', 32 | STREET: '101st Avenue', 33 | LAT: 5, 34 | LON: 6, 35 | POSTCODE: '' 36 | }; 37 | const stats = { badRecordCount: 0 }; 38 | const documentStream = DocumentStream.create('prefix', stats); 39 | 40 | test_stream([input], documentStream, function(err, actual) { 41 | test.equal(actual.length, 1, 'the document should be pushed' ); 42 | test.equal(stats.badRecordCount, 0, 'bad record count unchanged'); 43 | test.equal(actual[0].getAddress('zip'), undefined); 44 | test.end(); 45 | }); 46 | }); 47 | 48 | tape( 'documentStream creates id with filename-based prefix', function(test) { 49 | const input = { 50 | NUMBER: '5', 51 | STREET: '101st Avenue', 52 | LAT: 5, 53 | LON: 6, 54 | POSTCODE: '' 55 | }; 56 | 57 | const stats = { badRecordCount: 0 }; 58 | const documentStream = DocumentStream.create('prefix', stats); 59 | 60 | test_stream([input], documentStream, function(err, actual) { 61 | test.equal(actual.length, 1, 'the document should be pushed' ); 62 | test.equal(stats.badRecordCount, 0, 'bad 
record count unchanged'); 63 | test.equal(actual[0].getId(), 'prefix:0'); 64 | test.end(); 65 | }); 66 | }); 67 | 68 | tape('documentStream uses HASH value if present', function(test) { 69 | const input = { 70 | NUMBER: '5', 71 | STREET: '101st Avenue', 72 | LAT: 5, 73 | LON: 6, 74 | HASH: 'abcd' 75 | }; 76 | 77 | const stats = { badRecordCount: 0 }; 78 | const documentStream = DocumentStream.create('prefix', stats); 79 | 80 | test_stream([input], documentStream, function(err, actual) { 81 | test.equal(actual.length, 1, 'the document should be pushed' ); 82 | test.equal(stats.badRecordCount, 0, 'bad record count unchanged'); 83 | test.equal(actual[0].getId(), 'prefix:abcd'); 84 | test.end(); 85 | }); 86 | }); 87 | 88 | tape('documentStream valid country_code lowercase', function (test) { 89 | const input = { 90 | NUMBER: '5', 91 | STREET: '101st Avenue', 92 | LAT: 5, 93 | LON: 6, 94 | HASH: 'abcd' 95 | }; 96 | const stats = { badRecordCount: 0 }; 97 | const documentStream = DocumentStream.create('au/example', stats); 98 | 99 | test_stream([input], documentStream, function (err, actual) { 100 | test.equal(actual.length, 1, 'the document should be pushed'); 101 | test.equal(stats.badRecordCount, 0, 'bad record count unchanged'); 102 | test.deepEqual(actual[0].getMeta('country_code'), 'AU', 'country_code set'); 103 | test.end(); 104 | }); 105 | }); 106 | 107 | tape('documentStream valid country_code uppercase', function (test) { 108 | const input = { 109 | NUMBER: '5', 110 | STREET: '101st Avenue', 111 | LAT: 5, 112 | LON: 6, 113 | HASH: 'abcd' 114 | }; 115 | const stats = { badRecordCount: 0 }; 116 | const documentStream = DocumentStream.create('AU/example', stats); 117 | 118 | test_stream([input], documentStream, function (err, actual) { 119 | test.equal(actual.length, 1, 'the document should be pushed'); 120 | test.equal(stats.badRecordCount, 0, 'bad record count unchanged'); 121 | test.deepEqual(actual[0].getMeta('country_code'), 'AU', 'country_code set'); 122 | 
test.end(); 123 | }); 124 | }); 125 | 126 | tape('documentStream invalid country_code', function (test) { 127 | const input = { 128 | NUMBER: '5', 129 | STREET: '101st Avenue', 130 | LAT: 5, 131 | LON: 6, 132 | HASH: 'abcd' 133 | }; 134 | const stats = { badRecordCount: 0 }; 135 | const documentStream = DocumentStream.create('foo/example', stats); // note: does not match pattern 136 | 137 | test_stream([input], documentStream, function (err, actual) { 138 | test.equal(actual.length, 1, 'the document should be pushed'); 139 | test.equal(stats.badRecordCount, 0, 'bad record count unchanged'); 140 | test.deepEqual(actual[0].getMeta('country_code'), undefined, 'country_code not set'); 141 | test.end(); 142 | }); 143 | }); 144 | 145 | tape('documentStream store reference to OA object in meta', function (test) { 146 | const input = { 147 | NUMBER: '5', 148 | STREET: '101st Avenue', 149 | LAT: 5, 150 | LON: 6, 151 | HASH: 'abcd' 152 | }; 153 | const stats = { badRecordCount: 0 }; 154 | const documentStream = DocumentStream.create('example', stats); 155 | 156 | test_stream([input], documentStream, function (err, actual) { 157 | test.equal(actual.length, 1, 'the document should be pushed'); 158 | test.equal(stats.badRecordCount, 0, 'bad record count unchanged'); 159 | test.deepEqual(actual[0].getMeta('oa'), input, 'OA reference stored in meta'); 160 | test.end(); 161 | }); 162 | }); 163 | -------------------------------------------------------------------------------- /test/streams/gnafMapperStream.js: -------------------------------------------------------------------------------- 1 | var tape = require('tape'); 2 | const through = require('through2'); 3 | const mapper = require('../../lib/streams/gnafMapperStream'); 4 | const Document = require('pelias-model').Document; 5 | 6 | module.exports.tests = {}; 7 | 8 | // test exports 9 | module.exports.tests.interface = function (test) { 10 | test('interface: factory', t => { 11 | t.equal(typeof mapper, 'function', 'stream 
factory'); 12 | t.end(); 13 | }); 14 | test('interface: stream', t => { 15 | var stream = mapper(); 16 | t.equal(typeof stream, 'object', 'valid stream'); 17 | t.equal(typeof stream._read, 'function', 'valid readable'); 18 | t.equal(typeof stream._write, 'function', 'valid writeable'); 19 | t.end(); 20 | }); 21 | }; 22 | 23 | // ===================== GNAF ID mapping ====================== 24 | 25 | module.exports.tests.au_gnaf_id = function (test) { 26 | var doc = new Document('oa', 'a', 1); 27 | doc.setMeta('country_code', 'AU'); 28 | doc.setMeta('oa', { 29 | ID: 'GAVIC411412475', 30 | NUMBER: '360', 31 | STREET: 'BRUNSWICK STREET', 32 | LAT: -37.79647546, 33 | LON: 144.978997 34 | }); 35 | test('maps - GNAF ID', t => { 36 | var stream = mapper(); 37 | stream.pipe(through.obj((doc, enc, next) => { 38 | t.deepEqual(doc.getAddendum('concordances'), { 'gnaf:pid': 'GAVIC411412475' }, 'correctly mapped'); 39 | t.end(); 40 | next(); 41 | })); 42 | stream.write(doc); 43 | }); 44 | }; 45 | 46 | module.exports.tests.au_invalid_gnaf_id = function (test) { 47 | var doc = new Document('oa', 'a', 1); 48 | doc.setMeta('country_code', 'AU'); 49 | doc.setMeta('oa', { 50 | ID: 'invalid', // note: invalid GNAF ID 51 | NUMBER: '360', 52 | STREET: 'BRUNSWICK STREET', 53 | LAT: -37.79647546, 54 | LON: 144.978997 55 | }); 56 | test('maps - GNAF ID', t => { 57 | var stream = mapper(); 58 | stream.pipe(through.obj((doc, enc, next) => { 59 | t.deepEqual(doc.getAddendum('concordances'), undefined); 60 | t.end(); 61 | next(); 62 | })); 63 | stream.write(doc); 64 | }); 65 | }; 66 | 67 | module.exports.tests.au_missing_id_field = function (test) { 68 | var doc = new Document('oa', 'a', 1); 69 | doc.setMeta('country_code', 'AU'); 70 | doc.setMeta('oa', { 71 | ID: undefined, // note: missing ID field 72 | NUMBER: '360', 73 | STREET: 'BRUNSWICK STREET', 74 | LAT: -37.79647546, 75 | LON: 144.978997 76 | }); 77 | test('maps - GNAF ID', t => { 78 | var stream = mapper(); 79 | 
stream.pipe(through.obj((doc, enc, next) => { 80 | t.deepEqual(doc.getAddendum('concordances'), undefined); 81 | t.end(); 82 | next(); 83 | })); 84 | stream.write(doc); 85 | }); 86 | }; 87 | 88 | module.exports.tests.non_au_gnaf_id = function (test) { 89 | var doc = new Document('oa', 'a', 1); 90 | doc.setMeta('country_code', 'NZ'); // note: country code not AU 91 | doc.setMeta('oa', { 92 | ID: 'GAVIC411412475', 93 | NUMBER: '360', 94 | STREET: 'BRUNSWICK STREET', 95 | LAT: -37.79647546, 96 | LON: 144.978997 97 | }); 98 | test('maps - GNAF ID', t => { 99 | var stream = mapper(); 100 | stream.pipe(through.obj((doc, enc, next) => { 101 | t.deepEqual(doc.getAddendum('concordances'), undefined); 102 | t.end(); 103 | next(); 104 | })); 105 | stream.write(doc); 106 | }); 107 | }; 108 | 109 | 110 | function test(name, testFunction) { 111 | return tape('unit_splitting_mapper: ' + name, testFunction); 112 | } 113 | 114 | for (var testCase in module.exports.tests) { 115 | module.exports.tests[testCase](test); 116 | } 117 | -------------------------------------------------------------------------------- /test/streams/isUSorCAHouseNumberZero.js: -------------------------------------------------------------------------------- 1 | var tape = require('tape'); 2 | var isUSorCAHouseNumberZero = require('../../lib/streams/isUSorCAHouseNumberZero'); 3 | 4 | const stream_mock = require('stream-mock'); 5 | 6 | function test_stream(input, testedStream, callback) { 7 | const reader = new stream_mock.ObjectReadableMock(input); 8 | const writer = new stream_mock.ObjectWritableMock(); 9 | writer.on('error', (e) => callback(e)); 10 | writer.on('finish', () => callback(null, writer.data)); 11 | reader.pipe(testedStream).pipe(writer); 12 | } 13 | 14 | tape('isUSorCAHouseNumberZero', function(t) { 15 | t.test('non-0 house number in USA should return true', function(t) { 16 | var records = [ 17 | { 18 | parent: { 19 | country_a: ['USA'] 20 | }, 21 | address_parts: { 22 | number: '1007' 23 | } 24 
| }, 25 | { 26 | parent: { 27 | country_a: ['USA'] 28 | }, 29 | address_parts: { 30 | number: '0017' 31 | } 32 | }, 33 | { 34 | parent: { 35 | country_a: ['USA'] 36 | }, 37 | address_parts: { 38 | number: '1700' 39 | } 40 | } 41 | ]; 42 | 43 | var filter = isUSorCAHouseNumberZero.create(); 44 | 45 | test_stream(records, filter, function(err, actual) { 46 | t.deepEqual(actual, records, 'none should have been filtered out'); 47 | t.end(); 48 | }); 49 | 50 | }); 51 | 52 | t.test('non-0 house number in CAN should return true', function(t) { 53 | var records = [ 54 | { 55 | parent: { 56 | country_a: ['CAN'] 57 | }, 58 | address_parts: { 59 | number: '1007' 60 | } 61 | }, 62 | { 63 | parent: { 64 | country_a: ['CAN'] 65 | }, 66 | address_parts: { 67 | number: '0017' 68 | } 69 | }, 70 | { 71 | parent: { 72 | country_a: ['CAN'] 73 | }, 74 | address_parts: { 75 | number: '1700' 76 | } 77 | } 78 | ]; 79 | 80 | var filter = isUSorCAHouseNumberZero.create(); 81 | 82 | test_stream(records, filter, function(err, actual) { 83 | t.deepEqual(actual, records, 'none should have been filtered out'); 84 | t.end(); 85 | }); 86 | 87 | }); 88 | 89 | t.test('non-0 house number in non-USA/CAN should return true', function(t) { 90 | var records = [ 91 | { 92 | parent: { 93 | country_a: ['GBR'] 94 | }, 95 | address_parts: { 96 | number: '1007' 97 | } 98 | }, 99 | { 100 | parent: { 101 | country_a: ['GBR'] 102 | }, 103 | address_parts: { 104 | number: '0017' 105 | } 106 | }, 107 | { 108 | parent: { 109 | country_a: ['GBR'] 110 | }, 111 | address_parts: { 112 | number: '1700' 113 | } 114 | } 115 | ]; 116 | 117 | var filter = isUSorCAHouseNumberZero.create(); 118 | 119 | test_stream(records, filter, function(err, actual) { 120 | t.deepEqual(actual, records, 'none should have been filtered out'); 121 | t.end(); 122 | }); 123 | 124 | }); 125 | 126 | t.test('house number reduceable to 0 in USA should return false', function(t) { 127 | var records = [ 128 | { 129 | parent: { 130 | country_a: ['USA'] 
131 | }, 132 | address_parts: { 133 | number: '0' 134 | } 135 | }, 136 | { 137 | parent: { 138 | country_a: ['USA'] 139 | }, 140 | address_parts: { 141 | number: '00000' 142 | } 143 | } 144 | ]; 145 | 146 | var filter = isUSorCAHouseNumberZero.create(); 147 | 148 | test_stream(records, filter, function(err, actual) { 149 | t.deepEqual(actual, [], 'all should have been filtered out'); 150 | t.end(); 151 | }); 152 | 153 | }); 154 | 155 | t.test('house number reduceable to 0 in CAN should return false', function(t) { 156 | var records = [ 157 | { 158 | parent: { 159 | country_a: ['CAN'] 160 | }, 161 | address_parts: { 162 | number: '0' 163 | } 164 | }, 165 | { 166 | parent: { 167 | country_a: ['CAN'] 168 | }, 169 | address_parts: { 170 | number: '00000' 171 | } 172 | } 173 | ]; 174 | 175 | var filter = isUSorCAHouseNumberZero.create(); 176 | 177 | test_stream(records, filter, function(err, actual) { 178 | t.deepEqual(actual, [], 'all should have been filtered out'); 179 | t.end(); 180 | }); 181 | 182 | }); 183 | 184 | t.test('house number reduceable to 0 in non-USA/CAN should return true', function(t) { 185 | var records = [ 186 | { 187 | parent: { 188 | country_a: ['GBR'] 189 | }, 190 | address_parts: { 191 | number: '0' 192 | } 193 | }, 194 | { 195 | parent: { 196 | country_a: ['GBR'] 197 | }, 198 | address_parts: { 199 | number: '00000' 200 | } 201 | } 202 | ]; 203 | 204 | var filter = isUSorCAHouseNumberZero.create(); 205 | 206 | test_stream(records, filter, function(err, actual) { 207 | t.deepEqual(actual, records, 'none should have been filtered out'); 208 | t.end(); 209 | }); 210 | 211 | }); 212 | 213 | }); 214 | -------------------------------------------------------------------------------- /test/streams/recordStream.js: -------------------------------------------------------------------------------- 1 | var tape = require( 'tape' ); 2 | var through = require( 'through2' ); 3 | 4 | var peliasModel = require( 'pelias-model' ); 5 | 6 | var recordStream = 
require( '../../lib/streams/recordStream' ); 7 | 8 | /** 9 | * Tests whether records read from `test/openaddresses_sample.csv` are created 10 | * into Document objects with expected values. 11 | */ 12 | tape( 13 | 'importPipelines.createRecordStream() creates Document objects with expected values.', 14 | function ( test ){ 15 | function createTestRec( lon, lat, name ){ 16 | return { lon: lon, lat: lat, name: name }; 17 | } 18 | 19 | var expectedRecords = [ 20 | createTestRec( -118.0170157, 55.546026835788886, '23042 Twp Road 755 A' ), 21 | createTestRec( -118.75318353, 55.14959214890181, '712046 Rge Road 34' ), 22 | createTestRec( -118.8218384, 55.15506788763259, '712078 Rge Road 34' ), 23 | createTestRec( -118.79719936, 55.153343057595535, '712068 Rge Road 34' ), 24 | createTestRec( -118.66743097, 55.151807043809917, '712060 Rge Road 34' ), 25 | createTestRec( -118.74783569, 55.155320792497442, '712082 Rge Road 35' ), 26 | createTestRec( 1, 2, 'number Too Many Spaces' ), 27 | createTestRec( 1, 2, 'trim Multiple Spaces' ) 28 | ]; 29 | test.plan( expectedRecords.length * 4 + 1); 30 | 31 | var dataStream = recordStream.create(['test/openaddresses_sample.csv']); 32 | test.ok( dataStream.readable, 'Stream is readable.' ); 33 | var testStream = through.obj(function ( data, enc, next ){ 34 | test.ok( 35 | data instanceof peliasModel.Document, 'Data is a Document object.' 36 | ); 37 | 38 | var expected = expectedRecords.splice( 0, 1 )[ 0 ]; 39 | var centroid = data.getCentroid(); 40 | test.ok( expected.lon - centroid.lon < 1e-6, 'Longitude matches.' ); 41 | test.ok( expected.lat - centroid.lat < 1e-6, 'Latitude matches.' ); 42 | test.equal( data.getName( 'default' ), expected.name , 'Name matches.' 
); 43 | next(); 44 | }); 45 | dataStream.pipe( testStream ); 46 | } 47 | ); 48 | 49 | tape( 'Don\'t create records for invalid data.', function ( test ){ 50 | var dataStream = recordStream.create(['test/openaddresses_bad_data.csv']); 51 | 52 | dataStream.pipe( through.obj( 53 | function write( data, _, next ){ 54 | test.fail( 'Document was created from bad data: ' + JSON.stringify( data, undefined, 4 ) ); 55 | next(); 56 | }, 57 | function end( done ){ 58 | test.pass( 'No Documents were created from bad data.' ); 59 | test.end(); 60 | done(); 61 | } 62 | )); 63 | }); 64 | 65 | tape( 'getIdPrefix returns prefix based on OA directory structure - csv', function( test ) { 66 | var filename = '/base/path/us/ca/san_francisco.csv'; 67 | var basePath = '/base/path'; 68 | 69 | var actual = recordStream.getIdPrefix(filename, basePath); 70 | 71 | var expected = 'us/ca/san_francisco'; 72 | test.equal(actual, expected, 'correct prefix generated'); 73 | test.end(); 74 | }); 75 | 76 | tape( 'getIdPrefix handles multiple levels of heirarchy - csv', function ( test ) { 77 | var filename = '/base/path/cz/countrywide.csv'; 78 | var basePath = '/base/path'; 79 | 80 | var actual = recordStream.getIdPrefix(filename, basePath); 81 | 82 | var expected = 'cz/countrywide'; 83 | test.equal(actual, expected, 'correct prefix generated'); 84 | test.end(); 85 | }); 86 | 87 | tape( 'getIdPrefix returns basename without extension when invalid basepath given - csv', function( test ) { 88 | var filename = '/path/to/a/document.csv'; 89 | var basePath = '/somewhere/else'; 90 | 91 | var actual = recordStream.getIdPrefix(filename, basePath); 92 | var expected = 'document'; 93 | 94 | test.equal(actual, expected); 95 | test.end(); 96 | }); 97 | 98 | tape( 'getIdPrefix returns prefix based on OA directory structure - geojson', function( test ) { 99 | var filename = '/base/path/us/ca/san_francisco.geojson'; 100 | var basePath = '/base/path'; 101 | 102 | var actual = recordStream.getIdPrefix(filename, 
basePath); 103 | 104 | var expected = 'us/ca/san_francisco'; 105 | test.equal(actual, expected, 'correct prefix generated'); 106 | test.end(); 107 | }); 108 | 109 | tape( 'getIdPrefix handles multiple levels of heirarchy - geojson', function ( test ) { 110 | var filename = '/base/path/cz/countrywide.geojson'; 111 | var basePath = '/base/path'; 112 | 113 | var actual = recordStream.getIdPrefix(filename, basePath); 114 | 115 | var expected = 'cz/countrywide'; 116 | test.equal(actual, expected, 'correct prefix generated'); 117 | test.end(); 118 | }); 119 | 120 | tape( 'getIdPrefix returns basename without extension when invalid basepath given - geojson', function( test ) { 121 | var filename = '/path/to/a/document.geojson'; 122 | var basePath = '/somewhere/else'; 123 | 124 | var actual = recordStream.getIdPrefix(filename, basePath); 125 | var expected = 'document'; 126 | 127 | test.equal(actual, expected); 128 | test.end(); 129 | }); 130 | -------------------------------------------------------------------------------- /test/streams/unitSplittingMapperStream.js: -------------------------------------------------------------------------------- 1 | var tape = require('tape'); 2 | const through = require('through2'); 3 | const mapper = require('../../lib/streams/unitSplittingMapperStream'); 4 | const Document = require('pelias-model').Document; 5 | 6 | module.exports.tests = {}; 7 | 8 | // test exports 9 | module.exports.tests.interface = function (test) { 10 | test('interface: factory', t => { 11 | t.equal(typeof mapper, 'function', 'stream factory'); 12 | t.end(); 13 | }); 14 | test('interface: stream', t => { 15 | var stream = mapper(); 16 | t.equal(typeof stream, 'object', 'valid stream'); 17 | t.equal(typeof stream._read, 'function', 'valid readable'); 18 | t.equal(typeof stream._write, 'function', 'valid writeable'); 19 | t.end(); 20 | }); 21 | }; 22 | 23 | // ===================== australasian unit number mapping ====================== 24 | 25 | 
module.exports.tests.australasian_solidus = function (test) { 26 | var doc = new Document('oa', 'example', 1); 27 | doc.setName('default', '2/14 Smith Street'); 28 | doc.setAddress('number', '2/14'); 29 | doc.setAddress('street', 'Smith Street'); 30 | doc.setMeta('country_code', 'AU'); 31 | 32 | test('maps - split unit from housenumber', t => { 33 | var stream = mapper(); 34 | stream.pipe(through.obj((doc, enc, next) => { 35 | t.deepEqual(doc.getName('default'), '14 Smith Street', 'unchanged'); 36 | t.deepEqual(doc.getAddress('unit'), '2', 'mapped'); 37 | t.deepEqual(doc.getAddress('number'), '14', 'mapped'); 38 | t.deepEqual(doc.getAddress('street'), 'Smith Street', 'unchanged'); 39 | t.end(); 40 | next(); 41 | })); 42 | stream.write(doc); 43 | }); 44 | }; 45 | 46 | module.exports.tests.australasian_solidus_with_whitespace = function (test) { 47 | var doc = new Document('oa', 'example', 1); 48 | doc.setName('default', '2 / 14 Smith Street'); 49 | doc.setAddress('number', '2 / 14'); 50 | doc.setAddress('street', 'Smith Street'); 51 | doc.setMeta('country_code', 'AU'); 52 | 53 | test('maps - split unit from housenumber', t => { 54 | var stream = mapper(); 55 | stream.pipe(through.obj((doc, enc, next) => { 56 | t.deepEqual(doc.getName('default'), '14 Smith Street', 'unchanged'); 57 | t.deepEqual(doc.getAddress('unit'), '2', 'mapped'); 58 | t.deepEqual(doc.getAddress('number'), '14', 'mapped'); 59 | t.deepEqual(doc.getAddress('street'), 'Smith Street', 'unchanged'); 60 | t.end(); 61 | next(); 62 | })); 63 | stream.write(doc); 64 | }); 65 | }; 66 | 67 | module.exports.tests.australasian_flat_prefix = function (test) { 68 | var doc = new Document('oa', 'example', 1); 69 | doc.setName('default', 'Flat 2 14 Smith Street'); 70 | doc.setAddress('number', 'Flat 2 14'); 71 | doc.setAddress('street', 'Smith Street'); 72 | doc.setMeta('country_code', 'AU'); 73 | 74 | test('maps - split unit from housenumber', t => { 75 | var stream = mapper(); 76 | stream.pipe(through.obj((doc, 
enc, next) => { 77 | t.deepEqual(doc.getName('default'), '14 Smith Street', 'unchanged'); 78 | t.deepEqual(doc.getAddress('unit'), '2', 'mapped'); 79 | t.deepEqual(doc.getAddress('number'), '14', 'mapped'); 80 | t.deepEqual(doc.getAddress('street'), 'Smith Street', 'unchanged'); 81 | t.end(); 82 | next(); 83 | })); 84 | stream.write(doc); 85 | }); 86 | }; 87 | 88 | module.exports.tests.australasian_flat_prefix_abbreviated = function (test) { 89 | var doc = new Document('oa', 'example', 1); 90 | doc.setName('default', 'F 2 14 Smith Street'); 91 | doc.setAddress('number', 'F 2 14'); 92 | doc.setAddress('street', 'Smith Street'); 93 | doc.setMeta('country_code', 'AU'); 94 | 95 | test('maps - split unit from housenumber', t => { 96 | var stream = mapper(); 97 | stream.pipe(through.obj((doc, enc, next) => { 98 | t.deepEqual(doc.getName('default'), '14 Smith Street', 'unchanged'); 99 | t.deepEqual(doc.getAddress('unit'), '2', 'mapped'); 100 | t.deepEqual(doc.getAddress('number'), '14', 'mapped'); 101 | t.deepEqual(doc.getAddress('street'), 'Smith Street', 'unchanged'); 102 | t.end(); 103 | next(); 104 | })); 105 | stream.write(doc); 106 | }); 107 | }; 108 | 109 | module.exports.tests.australasian_unit_prefix = function (test) { 110 | var doc = new Document('oa', 'example', 1); 111 | doc.setName('default', 'Unit 2 14 Smith Street'); 112 | doc.setAddress('number', 'Unit 2 14'); 113 | doc.setAddress('street', 'Smith Street'); 114 | doc.setMeta('country_code', 'AU'); 115 | 116 | test('maps - split unit from housenumber', t => { 117 | var stream = mapper(); 118 | stream.pipe(through.obj((doc, enc, next) => { 119 | t.deepEqual(doc.getName('default'), '14 Smith Street', 'unchanged'); 120 | t.deepEqual(doc.getAddress('unit'), '2', 'mapped'); 121 | t.deepEqual(doc.getAddress('number'), '14', 'mapped'); 122 | t.deepEqual(doc.getAddress('street'), 'Smith Street', 'unchanged'); 123 | t.end(); 124 | next(); 125 | })); 126 | stream.write(doc); 127 | }); 128 | }; 129 | 130 | 
module.exports.tests.australasian_apartment_prefix = function (test) { 131 | var doc = new Document('oa', 'example', 1); 132 | doc.setName('default', 'Apartment 2 14 Smith Street'); 133 | doc.setAddress('number', 'Apartment 2 14'); 134 | doc.setAddress('street', 'Smith Street'); 135 | doc.setMeta('country_code', 'AU'); 136 | 137 | test('maps - split unit from housenumber', t => { 138 | var stream = mapper(); 139 | stream.pipe(through.obj((doc, enc, next) => { 140 | t.deepEqual(doc.getName('default'), '14 Smith Street', 'unchanged'); 141 | t.deepEqual(doc.getAddress('unit'), '2', 'mapped'); 142 | t.deepEqual(doc.getAddress('number'), '14', 'mapped'); 143 | t.deepEqual(doc.getAddress('street'), 'Smith Street', 'unchanged'); 144 | t.end(); 145 | next(); 146 | })); 147 | stream.write(doc); 148 | }); 149 | }; 150 | 151 | module.exports.tests.australasian_apartment_prefix_abbreviated = function (test) { 152 | var doc = new Document('oa', 'example', 1); 153 | doc.setName('default', 'APT 2 14 Smith Street'); 154 | doc.setAddress('number', 'APT 2 14'); 155 | doc.setAddress('street', 'Smith Street'); 156 | doc.setMeta('country_code', 'AU'); 157 | 158 | test('maps - split unit from housenumber', t => { 159 | var stream = mapper(); 160 | stream.pipe(through.obj((doc, enc, next) => { 161 | t.deepEqual(doc.getName('default'), '14 Smith Street', 'unchanged'); 162 | t.deepEqual(doc.getAddress('unit'), '2', 'mapped'); 163 | t.deepEqual(doc.getAddress('number'), '14', 'mapped'); 164 | t.deepEqual(doc.getAddress('street'), 'Smith Street', 'unchanged'); 165 | t.end(); 166 | next(); 167 | })); 168 | stream.write(doc); 169 | }); 170 | }; 171 | 172 | module.exports.tests.australasian_allow_no_space_after_flat_designation = function (test) { 173 | var doc = new Document('oa', 'example', 1); 174 | doc.setName('default', 'APT2 14 Smith Street'); // note: 'APT2' concatenated 175 | doc.setAddress('number', 'APT2 14'); 176 | doc.setAddress('street', 'Smith Street'); 177 | 
doc.setMeta('country_code', 'AU'); 178 | 179 | test('maps - split unit from housenumber', t => { 180 | var stream = mapper(); 181 | stream.pipe(through.obj((doc, enc, next) => { 182 | t.deepEqual(doc.getName('default'), '14 Smith Street', 'unchanged'); 183 | t.deepEqual(doc.getAddress('unit'), '2', 'mapped'); 184 | t.deepEqual(doc.getAddress('number'), '14', 'mapped'); 185 | t.deepEqual(doc.getAddress('street'), 'Smith Street', 'unchanged'); 186 | t.end(); 187 | next(); 188 | })); 189 | stream.write(doc); 190 | }); 191 | }; 192 | 193 | function test(name, testFunction) { 194 | return tape('unit_splitting_mapper: ' + name, testFunction); 195 | } 196 | 197 | for (var testCase in module.exports.tests) { 198 | module.exports.tests[testCase](test); 199 | } 200 | -------------------------------------------------------------------------------- /test/test.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @file The main entry point for the OpenAddresses importer's unit-tests. 
3 | */ 4 | 5 | require( './schema' ); 6 | require( './isValidCsvRecord' ); 7 | require( './import'); 8 | require( './importPipeline'); 9 | require( './parameters' ); 10 | require( './streams/cleanupStream' ); 11 | require( './cleanup_v2' ); 12 | require( './streams/contentHashStream' ); 13 | require( './streams/documentStream' ); 14 | require( './streams/gnafMapperStream' ); 15 | require( './streams/isUSorCAHouseNumberZero' ); 16 | require( './streams/recordStream' ); 17 | require( './streams/unitSplittingMapperStream' ); 18 | -------------------------------------------------------------------------------- /utils/OpenAddressesAPI.js: -------------------------------------------------------------------------------- 1 | const _ = require('lodash'); 2 | const axios = require('axios'); 3 | const config = require('pelias-config'); 4 | const logger = require('pelias-logger').get('openaddresses'); 5 | const HOST = 'https://batch.openaddresses.io'; 6 | 7 | class OpenAddressesAPI { 8 | constructor() { 9 | this.config = _.get(config.generate(), 'imports.openaddresses', {}); 10 | this.token = _.get(this.config, 'token'); 11 | } 12 | 13 | // remove file extensions from 'source' 14 | static normalize(source) { 15 | if (!_.isString(source)) { return source; } 16 | const norm = source.replace(/\.[^/.]+$/, ''); 17 | 18 | // source definitions previously required a file extension. 19 | // please remove file extensions from your ~/pelias.json file 20 | // to silence these warning messages. 
21 | if (source !== norm) { 22 | logger.warn(`source definitions no longer require a file extension '${source}'`); 23 | } 24 | 25 | return norm; 26 | } 27 | 28 | // return the http url for a specific job id 29 | static url(job) { 30 | return `${HOST}/api/job/${job}/output/source.geojson.gz`; 31 | } 32 | 33 | // if the 'validated' mode is enabled (for financial supporters only) 34 | isValidatedModeEnabled() { 35 | return _.get(this.config, 'validated') === true; 36 | } 37 | 38 | async lookup(source) { 39 | // support the 'validated' property for financial supporters 40 | const params = { 41 | source, 42 | layer: 'addresses', 43 | validated: this.isValidatedModeEnabled() ? 'true' : 'false' 44 | }; 45 | 46 | // request extended info and return the first result 47 | const versions = await axios.get(`${HOST}/api/data`, { params }); 48 | return _.isArray(versions.data) && !_.isEmpty(versions.data) ? _.head(versions.data) : {}; 49 | } 50 | } 51 | 52 | module.exports = OpenAddressesAPI; 53 | -------------------------------------------------------------------------------- /utils/download_all.js: -------------------------------------------------------------------------------- 1 | const child_process = require('child_process'); 2 | const async = require('async'); 3 | const fs = require('fs-extra'); 4 | const temp = require('temp'); 5 | const logger = require('pelias-logger').get('openaddresses-download'); 6 | const _ = require('lodash'); 7 | 8 | function downloadAll(config, callback) { 9 | logger.info('Attempting to download all data'); 10 | 11 | const targetDir = config.imports.openaddresses.datapath; 12 | 13 | fs.ensureDir(targetDir, (err) => { 14 | if (err) { 15 | logger.error(`error making directory ${targetDir}`, err); 16 | return callback(err); 17 | } 18 | 19 | const dataHost = config.get('imports.openaddresses.dataHost') || 'https://data.openaddresses.io'; 20 | 21 | async.eachSeries( 22 | [ 23 | // all non-share-alike data 24 | 
`${dataHost}/openaddr-collected-global.zip`, 25 | 26 | // all share-alike data 27 | `${dataHost}/openaddr-collected-global-sa.zip` 28 | ], 29 | downloadBundle.bind(null, targetDir, config), 30 | callback); 31 | }); 32 | } 33 | 34 | function downloadBundle(targetDir, config, sourceUrl, callback) { 35 | 36 | const tmpZipFile = temp.path({suffix: '.zip'}); 37 | const referer = config.get('imports.openaddresses.dataReferer') || 'https://pelias-results.openaddresses.io'; 38 | 39 | async.series( 40 | [ 41 | // download the zip file into the temp directory 42 | (callback) => { 43 | logger.debug(`downloading ${sourceUrl}`); 44 | if (_.startsWith(sourceUrl, 's3://')) { 45 | const s3Options = config.imports.openaddresses.s3Options || ''; 46 | child_process.exec(`aws s3 cp ${sourceUrl} ${tmpZipFile} --only-show-errors ${s3Options}`, callback); 47 | } else { 48 | const flags = [ 49 | '--request GET', // HTTP GET 50 | '--silent', // be quiet 51 | '--location', // follow redirects 52 | '--fail', // exit with a non-zero code for >=400 responses 53 | '--write-out "%{http_code}"', // print status code to STDOUT 54 | `--referer ${referer}`, // set referer header 55 | `--output ${tmpZipFile}`, // set output filepath 56 | '--retry 5', // retry this number of times before giving up 57 | '--retry-connrefused', // consider ECONNREFUSED as a transient error 58 | '--retry-delay 5' // sleep this many seconds between retry attempts 59 | ].join(' '); 60 | 61 | // the `--fail*` flags cause an error to be returned as the first arg with `error.code` 62 | // as the process exit status, the `-w "%{http_code}"` flag writes the HTTP status to STDOUT. 
63 | child_process.exec(`curl ${flags} ${sourceUrl}`, (error, stdout) => { 64 | if (!error) { return callback(); } 65 | 66 | // provide a more user-friendly error message 67 | error.message = `cURL request failed, HTTP status: ${stdout}, exit code: ${error.code}`; 68 | callback(error); 69 | }); 70 | } 71 | }, 72 | // unzip file into target directory 73 | (callback) => { 74 | logger.debug(`unzipping ${tmpZipFile} to ${targetDir}`); 75 | child_process.exec(`unzip -o -qq -d ${targetDir} ${tmpZipFile}`, callback); 76 | }, 77 | // delete the temp downloaded zip file 78 | fs.remove.bind(null, tmpZipFile) 79 | ], 80 | callback); 81 | } 82 | 83 | module.exports = downloadAll; 84 | -------------------------------------------------------------------------------- /utils/download_data.js: -------------------------------------------------------------------------------- 1 | const _ = require('lodash'); 2 | const config = require( 'pelias-config' ).generate(require('../schema')); 3 | const logger = require('pelias-logger').get('openaddresses-download'); 4 | 5 | const downloadAll = require('./download_all'); 6 | const downloadFiltered = require('./download_filtered'); 7 | 8 | if (require.main === module) { 9 | download((err) => { 10 | if (err) { 11 | logger.error('Failed to download data', err); 12 | process.exit(1); 13 | } 14 | logger.info('All done!'); 15 | }); 16 | } 17 | 18 | function download(callback) { 19 | if (!_.isEmpty(config.imports.openaddresses.files)) { 20 | downloadFiltered(config, callback); 21 | } 22 | else { 23 | downloadAll(config, callback); 24 | } 25 | } 26 | 27 | module.exports = download; 28 | -------------------------------------------------------------------------------- /utils/download_filtered.js: -------------------------------------------------------------------------------- 1 | const child_process = require('child_process'); 2 | const config = require('pelias-config').generate(); 3 | const async = require('async'); 4 | const fs = require('fs-extra'); 
5 | const path = require('path');
6 | const temp = require('temp');
7 | const logger = require('pelias-logger').get('openaddresses-download');
8 | const Bottleneck = require('bottleneck/es5');
9 | 
10 | const OpenAddressesAPI = require('./OpenAddressesAPI');
11 | const oa = new OpenAddressesAPI();
12 | 
/**
 * Download only the sources listed in `imports.openaddresses.files`.
 *
 * Ensures the target directory exists, resolves each configured file to a
 * current download URL via the OpenAddresses API, then downloads each valid
 * source through a rate limiter.
 *
 * @param {Object} config - pelias-config instance (reads
 *   `imports.openaddresses.datapath`, `.files`, `.missingFilesAreFatal`)
 * @param {Function} callback - node-style callback(err); called with the
 *   first source-validation error when `missingFilesAreFatal` is set
 */
13 | function downloadFiltered(config, callback) {
14 |   const targetDir = config.imports.openaddresses.datapath;
15 |   const errorsFatal = config.get('imports.openaddresses.missingFilesAreFatal');
16 | 
17 |   fs.ensureDir(targetDir, async (err) => {
18 |     if (err) {
19 |       logger.error(`error making directory ${targetDir}`, err);
20 |       return callback(err);
21 |     }
22 | 
23 |     // validate sources
24 |     const files = config.get('imports.openaddresses.files', []);
25 |     const sources = await getSources(files);
26 |     const validSources = sources.filter(source => source.url);
27 | 
28 |     // respect 'imports.openaddresses.missingFilesAreFatal' setting
29 |     if (errorsFatal && (sources.length !== validSources.length)) {
30 |       callback(sources.find(source => source.error)); // return first error
31 |       return;
32 |     }
33 | 
34 |     logger.info(`Attempting to download selected data sources: ${sources.map(source => source.id)}`);
35 | 
36 |     // limit requests to avoid being banned by openaddresses.io
37 |     // current policy is 10 requests per minute
38 |     // https://github.com/pelias/openaddresses/issues/433#issuecomment-527383976
39 |     // @todo: contact OA team to check if this is still required with the batch endpoint?
40 |     const options = {
41 |       maxConcurrent: 1,
42 |       minTime: 6000
43 |     };
44 |     const limiter = new Bottleneck(options);
    // invoked once per source; fires the final callback when the limiter
    // reports no remaining work.
    // NOTE(review): `done` ignores the error that downloadSource passes to it
    // (see `done(errorsFatal ? err : null)` below), so even with
    // 'missingFilesAreFatal' enabled a failed download never propagates an
    // error to `callback` — confirm whether this is intentional.
    // NOTE(review): also assumes `limiter.empty()` is synchronous — verify
    // against the installed Bottleneck version's API.
45 |     const done = () => {
46 |       if (limiter.empty()) {
47 |         callback();
48 |       }
49 |     };
50 |     validSources.map(source => {
51 |       limiter.submit(downloadSource, targetDir, source, done);
52 |     });
    // allow a clean Ctrl-C: drop queued downloads and exit immediately
53 |     process.on('SIGINT', () => {
54 |       limiter.stop({ dropWaitingJobs: true });
55 |       process.exit();
56 |     });
57 |   });
58 | 
59 | }
60 | 
/**
 * Resolve configured source names to download descriptors.
 *
 * For each file: normalize its id, then query the OpenAddresses API for the
 * most current job id. Resolves to an array (same order as `files`) of either
 * `{ id, url }` for valid sources or `{ id, error }` for invalid ones.
 *
 * @param {string[]} files - source names from configuration
 * @returns {Promise<Array<{id: string, url?: string, error?: string}>>}
 */
61 | async function getSources(files) {
62 |   return await Promise.all(files.map(async file => {
63 | 
64 |     // normalize source
65 |     let id = OpenAddressesAPI.normalize(file);
66 | 
67 |     // lookup the source using the OpenAddresses API
68 |     // to find the most current job id and ensure validity
69 |     const version = await oa.lookup(id);
70 |     const valid = (version && version.job);
71 | 
72 |     // invalid source
73 |     if (!valid) {
74 |       return { id, error: `invalid source '${file}'` };
75 |     }
76 | 
77 |     // valid source
78 |     return { id, url: OpenAddressesAPI.url(version.job) };
79 |   }));
80 | }
81 | 
/**
 * Download a single source: fetch the gzipped geojson with cURL into a temp
 * file, decompress it into `<targetDir>/<source.id>.geojson`, then remove the
 * temp file. Errors are logged; whether they are passed on depends on the
 * 'imports.openaddresses.missingFilesAreFatal' setting.
 *
 * NOTE(review): config values (`token`, `referer`) and the API-provided
 * `source.url` / `source.id` are interpolated directly into shell commands
 * run via child_process.exec — shell metacharacters in any of these would be
 * interpreted by the shell. Presumably these values are trusted; verify.
 *
 * @param {string} targetDir - directory to write the .geojson into
 * @param {{id: string, url: string}} source - descriptor from getSources()
 * @param {Function} done - callback(err|null), always invoked exactly once
 */
82 | function downloadSource(targetDir, source, done) {
83 | 
84 |   const errorsFatal = config.get('imports.openaddresses.missingFilesAreFatal');
85 |   const token = config.get('imports.openaddresses.token');
86 |   const referer = config.get('imports.openaddresses.dataReferer') || 'https://pelias-results.openaddresses.io';
87 |   logger.info(`Downloading ${source.id}`);
88 | 
89 |   const outFile = path.join(targetDir, `${source.id}.geojson`);
    // temp filename flattens path separators in the id so it stays a single
    // file name inside targetDir
90 |   const tmpFile = temp.path({
91 |     prefix: source.id.replace(new RegExp(path.sep, 'g'), '-'),
92 |     dir: targetDir,
93 |     suffix: '.gz'
94 |   });
95 | 
96 |   async.series(
97 |     [
98 |       // download the compressed file into the temp directory
99 |       (callback) => {
100 |         logger.debug(`downloading ${source.url}`);
101 |         const flags = [
102 |           '--request GET',                // HTTP GET
103 |           '--silent',                     // be quiet
104 |           '--location',                   // follow redirects
105 |           '--fail',                       // exit with a non-zero code for >=400 responses
106 |           '--write-out "%{http_code}"',   // print status code to STDOUT
107 |           `--referer ${referer}`,         // set referer header
108 |           `--output ${tmpFile}`,          // set output filepath
109 |           '--retry 5',                    // retry this number of times before giving up
110 |           '--retry-connrefused',          // consider ECONNREFUSED as a transient error
111 |           '--retry-delay 5',              // sleep this many seconds between retry attempts
112 |           `-H 'Authorization: Bearer ${token}'` // authorization token
113 |         ].join(' ');
114 | 
115 |         // the `--fail*` flags cause an error to be returned as the first arg with `error.code`
116 |         // as the process exit status, the `-w "%{http_code}"` flag writes the HTTP status to STDOUT.
117 |         child_process.exec(`curl ${flags} ${source.url}`, (error, stdout) => {
118 |           if (!error) { return callback(); }
119 | 
120 |           // provide a more user-friendly error message
121 |           error.message = `cURL request failed, HTTP status: ${stdout}, exit code: ${error.code}`;
122 |           callback(error);
123 |         });
124 |       },
125 |       // decompress file into target directory
126 |       (callback) => {
127 |         logger.debug(`decompress ${tmpFile} to ${outFile}`);
128 |         child_process.exec(`
129 |           mkdir -p ${path.dirname(outFile)};
130 |           gzip -d < ${tmpFile} > ${outFile};
131 |         `, (error, stdout) => {
132 |           if (!error) { return callback(); }
133 | 
134 |           // provide a more user-friendly error message
135 |           error.message = `decompress failed, ${stdout}`;
136 |           callback(error);
137 |         });
138 |       },
139 |     ],
140 |     (err) => {
141 |       if (err) {
142 |         logger.warn(`failed to download ${source.url}: ${err}`);
143 |       }
144 | 
145 |       // ensure temp files are cleaned up
146 |       if (fs.existsSync(tmpFile)) { fs.unlinkSync(tmpFile); }
147 | 
148 |       // honour 'imports.openaddresses.missingFilesAreFatal' setting
149 |       done(errorsFatal ? err : null);
150 |     }
151 |   );
152 | }
153 | 
154 | module.exports = downloadFiltered;
155 | 
--------------------------------------------------------------------------------