├── .dockerignore
├── .github
└── workflows
│ ├── _test.yml
│ ├── pull_request.yml
│ └── push.yml
├── .gitignore
├── .jshintignore
├── .jshintrc
├── .npmrc
├── Dockerfile
├── LICENSE
├── README.md
├── bin
├── download
├── parallel
├── start
└── units
├── import.js
├── lib
├── analysis
│ ├── Token.js
│ ├── dictionaries
│ │ └── en
│ │ │ ├── diagonal_contractions.txt
│ │ │ ├── directional_expansions.txt
│ │ │ ├── street_types_overrides.txt
│ │ │ └── street_types_usps.txt
│ ├── dictionary.js
│ ├── ordinals.js
│ └── synonyms.js
├── cleanup.js
├── cleanup_v2.js
├── importPipeline.js
├── isValidCsvRecord.js
├── parameters.js
└── streams
│ ├── cleanupStream.js
│ ├── contentHashStream.js
│ ├── documentStream.js
│ ├── gnafMapperStream.js
│ ├── isUSorCAHouseNumberZero.js
│ ├── recordStream.js
│ ├── unitSplittingMapperStream.js
│ └── validRecordFilterStream.js
├── package.json
├── schema.js
├── test
├── analysis.js
├── cleanup_v2.js
├── data
│ ├── au
│ │ ├── input_file_3.csv
│ │ └── input_file_4.csv
│ ├── expected.json
│ ├── input_file_1.csv
│ └── input_file_2.csv
├── functional.js
├── import.js
├── importPipeline.js
├── isValidCsvRecord.js
├── openaddresses_bad_data.csv
├── openaddresses_sample.csv
├── parameters.js
├── schema.js
├── streams
│ ├── cleanupStream.js
│ ├── contentHashStream.js
│ ├── documentStream.js
│ ├── gnafMapperStream.js
│ ├── isUSorCAHouseNumberZero.js
│ ├── recordStream.js
│ └── unitSplittingMapperStream.js
└── test.js
└── utils
├── OpenAddressesAPI.js
├── download_all.js
├── download_data.js
└── download_filtered.js
/.dockerignore:
--------------------------------------------------------------------------------
1 | .git
2 | node_modules
3 |
--------------------------------------------------------------------------------
/.github/workflows/_test.yml:
--------------------------------------------------------------------------------
1 | name: Unit Tests
2 | on: workflow_call
3 | jobs:
4 | unit-tests:
5 | runs-on: '${{ matrix.os }}'
6 | strategy:
7 | matrix:
8 | os:
9 | - ubuntu-22.04
10 | node-version: [ 18.x, 20.x, 22.x ]
11 | steps:
12 | - uses: actions/checkout@v4
13 | - name: 'Install node.js ${{ matrix.node-version }}'
14 | uses: actions/setup-node@v4
15 | with:
16 | node-version: '${{ matrix.node-version }}'
17 | - name: Run unit tests
18 | run: |
19 | [[ -f ./bin/ci-setup ]] && ./bin/ci-setup
20 | npm install
21 | npm run ci
22 |
--------------------------------------------------------------------------------
/.github/workflows/pull_request.yml:
--------------------------------------------------------------------------------
1 | name: Continuous Integration
2 | on: pull_request
3 | jobs:
4 | unit-tests:
5 | # only run this job for forks
6 | if: github.event.pull_request.head.repo.full_name != github.repository
7 | uses: ./.github/workflows/_test.yml
8 |
--------------------------------------------------------------------------------
/.github/workflows/push.yml:
--------------------------------------------------------------------------------
1 | name: Continuous Integration
2 | on: push
3 | jobs:
4 | unit-tests:
5 | uses: ./.github/workflows/_test.yml
6 | npm-publish:
7 | needs: unit-tests
8 | if: github.ref == 'refs/heads/master' && needs.unit-tests.result == 'success'
9 | runs-on: ubuntu-22.04
10 | steps:
11 | - uses: actions/checkout@v4
12 | - name: Install Node.js
13 | uses: actions/setup-node@v4
14 | with:
15 | node-version: 20.x
16 | - name: Run semantic-release
17 | env:
18 | GH_TOKEN: ${{ secrets.GH_SEMANTIC_RELEASE_TOKEN }}
19 | NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
20 | run: >
21 | if [[ -n "$GH_TOKEN" && -n "$NPM_TOKEN" ]]; then
22 | curl "https://raw.githubusercontent.com/pelias/ci-tools/master/semantic-release.sh" | bash -
23 | fi
24 | build-docker-images:
25 | # run this job if the unit tests passed and the npm-publish job was a success or was skipped
26 | # note: github actions won't run a job if you don't call one of the status check functions, so `always()` is called since it evaluates to `true`
27 | if: ${{ always() && needs.unit-tests.result == 'success' && (needs.npm-publish.result == 'success' || needs.npm-publish.result == 'skipped') }}
28 | needs: [unit-tests, npm-publish]
29 | runs-on: ubuntu-22.04
30 | steps:
31 | - uses: actions/checkout@v4
32 | - name: Build Docker images
33 | env:
34 | DOCKER_USERNAME: ${{ secrets.DOCKER_USERNAME }}
35 | DOCKER_PASSWORD: ${{ secrets.DOCKER_PASSWORD }}
36 | run: |
37 | curl "https://raw.githubusercontent.com/pelias/ci-tools/master/build-docker-images.sh" | bash -
38 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | *.log
3 |
--------------------------------------------------------------------------------
/.jshintignore:
--------------------------------------------------------------------------------
1 | node_modules
2 |
--------------------------------------------------------------------------------
/.jshintrc:
--------------------------------------------------------------------------------
1 | {
2 | "esversion": 8,
3 | "node": true,
4 | "curly": true,
5 | "eqeqeq": true,
6 | "freeze": true,
7 | "immed": true,
8 | "indent": 2,
9 | "latedef": false,
10 | "newcap": true,
11 | "noarg": true,
12 | "noempty": true,
13 | "nonbsp": true,
14 | "nonew": true,
15 | "plusplus": false,
16 | "quotmark": "single",
17 | "undef": true,
18 | "unused": true,
19 | "maxparams": 4,
20 | "maxdepth": 4,
21 | "maxlen": 120
22 | }
23 |
--------------------------------------------------------------------------------
/.npmrc:
--------------------------------------------------------------------------------
1 | package-lock=false
2 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # base image
2 | FROM pelias/baseimage
3 |
4 | # downloader apt dependencies
5 | # note: this is done in one command in order to keep down the size of intermediate containers
6 | RUN apt-get update && apt-get install --no-install-recommends -y unzip awscli && rm -rf /var/lib/apt/lists/*
7 |
8 | # change working dir
9 | ENV WORKDIR /code/pelias/openaddresses
10 | WORKDIR ${WORKDIR}
11 |
12 | # copy package.json first to prevent npm install being rerun when only code changes
13 | COPY ./package.json ${WORKDIR}
14 | RUN npm install && npm cache clean --force;
15 |
16 | # copy code into image
17 | ADD . ${WORKDIR}
18 |
19 | # run tests
20 | RUN npm test
21 |
22 | # run as the pelias user
23 | USER pelias
24 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2014 Mapzen
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | A modular, open-source search engine for our world.
5 | Pelias is a geocoder powered completely by open data, available freely to everyone.
6 |
7 |
8 |
9 |
10 |
11 |
12 | Local Installation ·
13 | Cloud Webservice ·
14 | Documentation ·
15 | Community Chat
16 |
17 |
18 | What is Pelias?
19 |
20 | Pelias is a search engine for places worldwide, powered by open data. It turns addresses and place names into geographic coordinates, and turns geographic coordinates into places and addresses. With Pelias, you’re able to turn your users’ place searches into actionable geodata and transform your geodata into real places.
21 |
22 | We think open data, open source, and open strategy win over proprietary solutions at any part of the stack and we want to ensure the services we offer are in line with that vision. We believe that an open geocoder improves over the long-term only if the community can incorporate truly representative local knowledge.
23 |
24 |
25 | # Pelias OpenAddresses importer
26 |
27 | [](https://greenkeeper.io/)
28 |
29 | ## Overview
30 |
31 | The OpenAddresses importer is used to process data from [OpenAddresses](http://openaddresses.io/)
32 | for import into the Pelias geocoder.
33 |
34 | ## Requirements
35 |
36 | Node.js is required. See [Pelias software requirements](https://github.com/pelias/documentation/blob/master/requirements.md) for supported versions.
37 |
38 | ## Installation
39 |
40 | > For instructions on setting up Pelias as a whole, see our [getting started guide](https://github.com/pelias/documentation/blob/master/getting_started_install.md). Further instructions here pertain to the OpenAddresses importer only
41 |
42 | ```bash
43 | git clone https://github.com/pelias/openaddresses
44 | cd openaddresses
45 | npm install
46 | ```
47 |
48 | ## Data Download
49 | Use the `imports.openaddresses.files` configuration option to limit the download to just the OpenAddresses files of interest.
50 | Refer to the [OpenAddresses data listing](http://results.openaddresses.io/?runs=all#runs) for file names.
51 |
52 | ```bash
53 | npm run download
54 | ```
55 |
56 | ## Usage
57 | ```bash
58 | # show full command line options
59 | node import.js --help
60 |
61 | # run an import
62 | npm start
63 | ```
64 |
65 | ## Admin Lookup
66 | OpenAddresses records do not contain information about which city, state (or
67 | other region like province), or country that they belong to. Pelias has the
68 | ability to compute these values from [Who's on First](http://whosonfirst.mapzen.com/) data.
69 | For more info on how admin lookup works, see the documentation for
70 | [pelias/wof-admin-lookup](https://github.com/pelias/wof-admin-lookup). By default,
71 | adminLookup is enabled. To disable, set `imports.adminLookup.enabled` to `false` in Pelias config.
72 |
73 | **Note:** Admin lookup requires loading around 5GB of data into memory.
74 |
75 | ## Configuration
76 | This importer can be configured in [pelias-config](https://github.com/pelias/config), in the `imports.openaddresses`
77 | hash. A sample configuration file might look like this:
78 |
79 | ```javascript
80 | {
81 | "esclient": {
82 | "hosts": [
83 | {
84 | "env": "development",
85 | "protocol": "http",
86 | "host": "localhost",
87 | "port": 9200
88 | }
89 | ]
90 | },
91 | "logger": {
92 | "level": "debug"
93 | },
94 | "imports": {
95 | "whosonfirst": {
96 | "datapath": "/mnt/data/whosonfirst/",
97 | "importPostalcodes": false,
98 | "importVenues": false
99 | },
100 | "openaddresses": {
101 | "datapath": "/mnt/data/openaddresses/",
102 | "files": [ "us/ny/city_of_new_york.csv" ]
103 | }
104 | }
105 | }
106 | ```
107 |
108 | The following configuration options are supported by this importer.
109 |
110 | ### `imports.openaddresses.datapath`
111 |
112 | * Required: yes
113 | * Default: ``
114 |
115 | The absolute path to a directory where OpenAddresses data is located. The download command will also automatically place downloaded files in this directory.
116 |
117 | ### `imports.openaddresses.files`
118 |
119 | * Required: no
120 | * Default: `[]`
121 |
122 | An array of OpenAddresses files to be downloaded (full list can be found on the
123 | [OpenAddresses results site](http://results.openaddresses.io/?runs=all#runs)).
124 | If no files are specified, the full planet data files (11GB+) will be
125 | downloaded.
126 |
127 | ### `imports.openaddresses.missingFilesAreFatal`
128 |
129 | * Required: no
130 | * Default: `false`
131 |
132 | If set to true, any missing files will immediately halt the importer with an
133 | error. Otherwise, the importer will continue processing with a warning. The
134 | data downloader will also continue if any download errors were encountered with this set to false.
135 |
136 | ### `imports.openaddresses.dataHost`
137 |
138 | * Required: no
139 | * Default: `https://data.openaddresses.io`
140 |
141 | The location from which to download OpenAddresses data from. By default, the
142 | primary OpenAddresses servers will be used. This can be overridden to allow
143 | downloading customized data. Paths are supported (for example,
144 | `https://yourhost.com/path/to/your/data`), but must not end with a trailing
145 | slash.
146 |
147 | S3 buckets are supported. Files will be downloaded using aws-cli.
148 |
149 | For example: `s3://data.openaddresses.io`.
150 |
151 | Note: When using s3, you might need authentication (IAM instance role, env vars, etc.)
152 |
153 | ### `imports.openaddresses.s3Options`
154 |
155 | * Required: no
156 |
157 | If `imports.openaddresses.dataHost` is an s3 bucket, this will add options to the command.
158 | For example: `--profile my-profile`
159 |
160 | This is useful, for example, when downloading from `s3://data.openaddresses.io`,
161 | as they require the requester to pay for data transfer.
162 | You can then use the following option: `--request-payer`
163 |
164 | ### `imports.openaddresses.token`
165 | * Required: no
166 | * Default: Shared token for the pelias project
167 |
168 | Since openaddresses moved from [results.openaddresses.io](https://results.openaddresses.io) to [batch.openaddresses.io](https://batch.openaddresses.io), you need a token to access the data. There is a default shared token for the Pelias project, but if you want to use it seriously, create your own account and token on [batch.openaddresses.io](https://batch.openaddresses.io) to avoid possible throttling/bandwidth limit or (temporary) suspension.
169 |
170 |
171 | ## Parallel Importing
172 |
173 | Because OpenAddresses consists of many small files, this importer can be configured to run several instances in parallel that coordinate to import all the data.
174 |
175 | To use this functionality, replace calls to `npm start` with
176 |
177 | ```bash
178 | npm run parallel 3 # replace 3 with your desired level of parallelism
179 | ```
180 |
181 | Generally, a parallelism of 2 or 3 is suitable for most tasks.
182 |
--------------------------------------------------------------------------------
/bin/download:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | exec node utils/download_data.js
4 |
--------------------------------------------------------------------------------
/bin/parallel:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # grab the number of workers count
4 | count=$1
5 | 
6 | # remove the first argument from the arguments array ($@)
7 | shift
8 | 
9 | # only spawn parallel builds when count is an integer >= 2;
10 | # anything else falls through to a single normal build
11 | if [[ $count -gt 1 ]]; then
12 |   echo "openaddresses: starting $count parallel builds"
13 | 
14 |   # spawn $count parallel builds, passing correct params and all arguments
15 |   for i in `seq 0 $(($count-1))`; do
16 |     # invoke directly (not via an unquoted command string) and quote "$@"
17 |     # so arguments containing whitespace survive word-splitting
18 |     ./bin/start --parallel-count "$count" --parallel-id "$i" "$@" &
19 |   done
20 | 
21 |   # don't let this script finish until all parallel builds have finished
22 |   wait
23 | else
24 |   # invalid count value, run normal build
25 |   exec ./bin/start "$@"
26 | fi
27 | 
--------------------------------------------------------------------------------
/bin/start:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # quote "$@" so arguments containing whitespace are forwarded intact
4 | exec node --max_old_space_size=8000 import.js "$@"
5 | 
--------------------------------------------------------------------------------
/bin/units:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # run tests with pipefail to avoid false passes
4 | # see https://github.com/pelias/pelias/issues/744
5 | set -euo pipefail
6 |
7 | node test/test.js | npx tap-spec
8 |
--------------------------------------------------------------------------------
/import.js:
--------------------------------------------------------------------------------
1 | /**
2 | * @file Entry-point script for the OpenAddresses import pipeline.
3 | */
4 |
5 | var peliasConfig = require( 'pelias-config' ).generate(require('./schema'));
6 |
7 | var logger = require( 'pelias-logger' ).get( 'openaddresses' );
8 |
9 | var parameters = require( './lib/parameters' );
10 | var importPipeline = require( './lib/importPipeline' );
11 |
12 | const adminLookupStream = require('pelias-wof-admin-lookup');
13 |
14 | // Pretty-print the total time the import took.
15 | function startTiming() {
16 | var startTime = new Date().getTime();
17 | process.on( 'exit', function (){
18 | var totalTimeTaken = (new Date().getTime() - startTime).toString();
19 | var seconds = totalTimeTaken.slice(0, totalTimeTaken.length - 3);
20 | var milliseconds = totalTimeTaken.slice(totalTimeTaken.length - 3);
21 | logger.info( 'Total time taken: %s.%ss', seconds, milliseconds );
22 | });
23 | }
24 |
25 | var args = parameters.interpretUserArgs( process.argv.slice( 2 ) );
26 |
27 | const adminLayers = ['neighbourhood', 'borough', 'locality', 'localadmin',
28 | 'county', 'macrocounty', 'region', 'macroregion', 'dependency', 'country',
29 | 'empire', 'continent'];
30 |
31 | if( 'exitCode' in args ){
32 | ((args.exitCode > 0) ? console.error : console.info)( args.errMessage );
33 | process.exit( args.exitCode );
34 | } else {
35 | startTiming();
36 |
37 | if (peliasConfig.imports.openaddresses.hasOwnProperty('adminLookup')) {
38 | logger.info('imports.openaddresses.adminLookup has been deprecated, ' +
39 | 'enable adminLookup using imports.adminLookup.enabled = true');
40 | }
41 |
42 | var files = parameters.getFileList(peliasConfig, args);
43 |
44 | const importer_id = args['parallel-id'];
45 | let importer_name = 'openaddresses';
46 |
47 | if (importer_id !== undefined) {
48 | importer_name = `openaddresses-${importer_id}`;
49 | }
50 |
51 | importPipeline.create( files, args.dirPath, adminLookupStream.create(adminLayers), importer_name);
52 | }
53 |
--------------------------------------------------------------------------------
/lib/analysis/Token.js:
--------------------------------------------------------------------------------
1 | const _ = require('lodash');
2 |
3 | class Token {
4 | constructor(body) {
5 | this.body = _.isString(body) ? body : '';
6 | }
7 |
8 | isValid() {
9 | return _.isString(this.body) && !_.isEmpty(this.body);
10 | }
11 |
12 | isNumeric() {
13 | return /^\d+$/.test(this.body);
14 | }
15 |
16 | findCase() {
17 | if (this.body === _.toLower(this.body)) { return Token.LOWERCASED; }
18 | if (this.body === _.toUpper(this.body)) { return Token.UPPERCASED; }
19 | return Token.MIXEDCASED;
20 | }
21 |
22 | removeLeadingZeros() {
23 | this.body = this.body.replace(/^(?:0*)([1-9]\d*(st|nd|rd|th))/, '$1');
24 | }
25 |
26 | selectivelyLowerCase() {
27 | if (this.findCase() === Token.UPPERCASED) {
28 | this.body = _.toLower(this.body);
29 | }
30 | }
31 |
32 | selectivelyUpperCase() {
33 | if (this.findCase() === Token.LOWERCASED && this.body.endsWith('.')) {
34 | this.body = _.toUpper(this.body);
35 | }
36 | }
37 |
38 | selectivelyCapitalize() {
39 | if (this.findCase() === Token.LOWERCASED) {
40 | this.body = this.body.split(/\s+/).map(word => _.capitalize(word)).join(' ');
41 | }
42 | }
43 | }
44 |
45 | Token.LOWERCASED = 0;
46 | Token.UPPERCASED = 1;
47 | Token.MIXEDCASED = 2;
48 |
49 | module.exports = Token;
50 |
--------------------------------------------------------------------------------
/lib/analysis/dictionaries/en/diagonal_contractions.txt:
--------------------------------------------------------------------------------
1 | SE|southeast
2 | SW|southwest
3 | NE|northeast
4 | NW|northwest
5 |
--------------------------------------------------------------------------------
/lib/analysis/dictionaries/en/directional_expansions.txt:
--------------------------------------------------------------------------------
1 | north|n
2 | south|s
3 | east|e
4 | west|w
5 | SE|se
6 | NE|ne
7 | SW|sw
8 | NW|nw
9 |
--------------------------------------------------------------------------------
/lib/analysis/dictionaries/en/street_types_overrides.txt:
--------------------------------------------------------------------------------
1 | concourse|conc
2 |
--------------------------------------------------------------------------------
/lib/analysis/dictionaries/en/street_types_usps.txt:
--------------------------------------------------------------------------------
1 | alley|ally|aly
2 | anex|annx|anx
3 | arcade|arc
4 | avenue|av|aven|avenu|avn|avnue|ave
5 | bayou|bayoo|byu
6 | beach|bch
7 | bend|bnd
8 | bluff|bluf|blf
9 | bluffs|blfs
10 | bottom|bot|bottm|btm
11 | boulevard|boul|boulv|blvd
12 | branch|brnch|br
13 | bridge|brdge|brg
14 | brook|brk
15 | brooks|brks
16 | burg|bg
17 | burgs|bgs
18 | bypass|bypa|bypas|byps|byp
19 | camp|cmp|cp
20 | canyon|canyn|cnyn|cyn
21 | cape|cpe
22 | causeway|causwa|cswy
23 | center|cen|cent|centr|cnter|cntr|ctr
24 | centers|ctrs
25 | circle|circ|circl|crcl|crcle|cir
26 | circles|cirs
27 | cliff|clf
28 | cliffs|clfs
29 | club|clb
30 | common|cmn
31 | commons|cmns
32 | corner|cor
33 | corners|cors
34 | course|crse
35 | court|ct
36 | courts|cts
37 | cove|cv
38 | coves|cvs
39 | creek|crk
40 | crescent|crsent|crsnt|cres
41 | crest|crst
42 | crossing|crssng|xing
43 | crossroad|xrd
44 | crossroads|xrds
45 | curve|curv
46 | dale|dl
47 | dam|dm
48 | divide|div|dvd|dv
49 | drive|driv|drv|dr
50 | drives|drs
51 | estate|est
52 | estates|ests
53 | expressway|exp|expr|expw|expy
54 | extension|extn|extnsn|ext
55 | extensions|exts
56 | falls|fls
57 | field|fld
58 | fields|flds
59 | flat|flt
60 | flats|flts
61 | ford|frd
62 | fords|frds
63 | forest|frst
64 | forge|forg|frg
65 | forges|frgs
66 | fork|frk
67 | forks|frks
68 | fort|frt|ft
69 | freeway|freewy|frway|frwy|fwy
70 | garden|gardn|grden|grdn|gdn
71 | gardens|grdns|gdns
72 | gateway|gatewy|gatway|gtway|gtwy
73 | glen|gln
74 | glens|glns
75 | green|grn
76 | greens|grns
77 | grove|grov|grv
78 | groves|grvs
79 | harbor|harb|harbr|hrbor|hbr
80 | harbors|hbrs
81 | haven|hvn
82 | heights|ht|hts
83 | highway|highwy|hiway|hiwy|hway|hwy
84 | hill|hl
85 | hills|hls
86 | hollows|holws
87 | hollow|hllw|holw
88 | inlet|inlt
89 | island|islnd|is
90 | islands|islnds|iss
91 | junction|jction|jctn|junctn|juncton|jct
92 | junctions|jctns|jcts
93 | key|ky
94 | keys|kys
95 | knoll|knol|knl
96 | knolls|knls
97 | lake|lk
98 | lakes|lks
99 | landing|lndng|lndg
100 | lane|ln
101 | light|lgt
102 | lights|lgts
103 | loaf|lf
104 | lock|lck
105 | locks|lcks
106 | lodge|ldge|lodg|ldg
107 | manor|mnr
108 | manors|mnrs
109 | meadow|mdw
110 | meadows|medows|mdws
111 | mill|ml
112 | mills|mls
113 | mission|missn|mssn|msn
114 | motorway|mtwy
115 | mount|mnt|mt
116 | mountain|mntain|mntn|mountin|mtin|mtn
117 | mountains|mntns|mtns
118 | neck|nck
119 | orchard|orchrd|orch
120 | oval|ovl
121 | overpass|opas
122 | parkway|parkwy|pkway|pky|parkways|pkwys|pkwy
123 | passage|psge
124 | pine|pne
125 | pines|pnes
126 | place|pl
127 | plain|pln
128 | plains|plns
129 | plaza|plza|plz
130 | point|pt
131 | points|pts
132 | port|prt
133 | ports|prts
134 | prairie|prr|pr
135 | radial|rad|radiel|radl
136 | ranch|ranches|rnchs|rnch
137 | rapid|rpd
138 | rapids|rpds
139 | rest|rst
140 | ridge|rdge|rdg
141 | ridges|rdgs
142 | river|rvr|rivr|riv
143 | road|rd
144 | roads|rds
145 | route|rte
146 | shoal|shl
147 | shoals|shls
148 | shore|shoar|shr
149 | shores|shoars|shrs
150 | skyway|skwy
151 | spring|spng|sprng|spg
152 | springs|spngs|sprngs|spgs
153 | square|sqr|sqre|squ|sq
154 | squares|sqrs|sqs
155 | station|statn|stn|sta
156 | stravenue|strav|straven|stravn|strvn|strvnue|stra
157 | stream|streme|strm
158 | street|strt|str|st
159 | streets|sts
160 | summit|sumit|sumitt|smt
161 | terrace|terr|ter
162 | throughway|trwy
163 | trace|traces|trce
164 | track|tracks|trk|trks|trak
165 | trafficway|trfy
166 | trail|trails|trls|trl
167 | trailer|trlrs|trlr
168 | tunnel|tunel|tunls|tunnels|tunnl|tunl
169 | turnpike|trnpk|turnpk|tpke
170 | underpass|upas
171 | union|un
172 | unions|uns
173 | valley|vally|vlly|vly
174 | valleys|vlys
175 | viaduct|vdct|viadct|via
176 | view|vw
177 | views|vws
178 | village|vill|villag|villg|villiage|vlg
179 | villages|vlgs
180 | ville|vl
181 | vista|vist|vst|vsta|vis
182 | way|wy
183 | well|wl
184 | wells|wls
185 |
--------------------------------------------------------------------------------
/lib/analysis/dictionary.js:
--------------------------------------------------------------------------------
1 | const _ = require('lodash');
2 | const fs = require('fs');
3 | const path = require('path');
4 |
5 | /**
6 | load a libpostal dictionary from disk
7 | eg: https://raw.githubusercontent.com/openvenues/libpostal/master/resources/dictionaries/en/street_types.txt
8 |
9 | libpostal format:
10 | The leftmost string is treated as the canonical/normalized version.
11 | Synonyms if any, are appended to the right, delimited by the pipe character.
12 |
13 | see: https://github.com/openvenues/libpostal/tree/master/resources/dictionaries
14 |
15 | output example:
16 | {
17 | 'bruecke': 'bruecke',
18 | 'brücke': 'bruecke',
19 | 'brucke': 'bruecke',
20 | 'br.': 'bruecke'
21 | }
22 | */
23 |
24 | // regular expression to target removal of common punctuation
25 | const PUNCTUATION_REGEX = /[.,\/#!$%\^&\*;:{}=\-_`~()]/g;
26 |
27 | module.exports = (opts) => {
28 |
29 | /**
30 | * options
31 | *
32 | * countryCode (string) -- country-code corresponding to a subdirectory in the the ./directories folder
33 | * filename (string) -- the name of the file to load inside the directory mentioed above
34 | * includeSelfReferences (bool) -- whether to also include the canonical synonym in the map
35 | * minLength (int) -- minimum valid length for a synonym in the dictionary
36 | */
37 | const options = _.defaults({}, opts, {
38 | includeSelfReferences: false,
39 | minLength: 0
40 | });
41 |
42 | try {
43 | const filepath = path.resolve(__dirname, 'dictionaries', options.countryCode, options.filename);
44 | const file = fs.readFileSync(filepath).toString();
45 | const lines = file.trim().split('\n');
46 |
47 | const map = lines.reduce((obj, line) => {
48 | var cols = line.trim().split('|');
49 |
50 | // remove multi-word synonyms from all but the first position
51 | cols = cols.filter((col, pos) => (pos === 0) || !/[\s]/.test(col));
52 |
53 | cols.forEach((col, pos) => {
54 | if (!options.includeSelfReferences && 0 === pos) { return; } // skip first column ( the expansion )
55 | if (col.replace(PUNCTUATION_REGEX).length < (options.minLength || 0)) { return; } // skip very short synonyms
56 |
57 | // warn user when duplicate terms are added to the map
58 | if (obj.hasOwnProperty(col)){
59 | console.warn(`[${options.filename}] trying to replace ${col}=>${obj[col]} with ${col}=>${cols[0]}`);
60 | }
61 |
62 | obj[col] = cols[0];
63 | });
64 | return obj;
65 | }, {});
66 |
67 | return map;
68 | }
69 | catch (e) {
70 | return {};
71 | }
72 | };
73 |
--------------------------------------------------------------------------------
/lib/analysis/ordinals.js:
--------------------------------------------------------------------------------
1 | const _ = require('lodash');
2 |
3 | // The ordinal function replaces all numeric street names (ie. 30 street)
4 | // with a version including ordinals (ie. 30th street).
5 | // note: this is currently only configured for the English language
6 |
7 | function ordinals(opts) {
8 | return (tokens) => {
9 |
10 | // consider all but final token
11 | for (var o = 0; o < tokens.length-1; o++) {
12 |
13 | // token must be entirely numeric
14 | if (!tokens[o].isNumeric()) { continue; }
15 |
16 | // token must be followed by a street type token
17 | if (!_.has(opts.dict.streetTypes, _.toLower(tokens[o+1].body))) { continue; }
18 |
19 | // token must either be the leftmost token or be preceded by a directional token
20 | if(o !== 0) {
21 | if (!_.has(opts.dict.directionalExpansions, _.toLower(tokens[o-1].body))) {
22 | continue;
23 | }
24 | }
25 |
26 | // append the english ordinal suffix
27 | tokens[o].body += englishOrdinalSuffix(tokens[o].body);
28 |
29 | // maximum of one replacement
30 | break;
31 | }
32 |
33 | return tokens;
34 | };
35 | }
36 |
37 | function englishOrdinalSuffix(i) {
38 | const j = i % 10, k = i % 100;
39 | if (j === 1 && k !== 11) { return 'st'; }
40 | if (j === 2 && k !== 12) { return 'nd'; }
41 | if (j === 3 && k !== 13) { return 'rd'; }
42 | return 'th';
43 | }
44 |
45 | module.exports = ordinals;
46 |
--------------------------------------------------------------------------------
/lib/analysis/synonyms.js:
--------------------------------------------------------------------------------
1 | const _ = require('lodash');
2 |
3 | // The synonyms function replaces all matching occurrences of tokens in the
4 | // supplied dictionary.
5 | // Some options are provided to control the iteration and termination behaviour
6 | // of the replacer.
7 | // @todo: this does not currently handle matching on multi-word synonyms
8 | // (although it's technically possible to do so if desired at a later date)
9 |
function synonyms(opts) {
  /**
   * Build a token-array transform which replaces token bodies found
   * in a dictionary.
   *
   * options
   *
   * dictionary -- the dictionary to use for looking up replacements
   * maxElements -- maximum elements to iterate
   * maxReplacements -- maximum replacements which can be made
   * direction -- default is iterating left-to-right through the array, use 'left' for the inverse
   * predicate -- after a match is found this function must return true before the substitution occurs
   * ignore -- run on each token before matching against the dictionary, must return true or the token is skipped
   * normalizer -- control how the token is normalized before matching occurs
   *
   * return function(tokens) => tokens
   */
  const options = _.defaults({}, opts, {
    dictionary: {},
    maxElements: Infinity,
    maxReplacements: Infinity,
    direction: 'right',
    predicate: () => true,
    ignore: () => false,
    normalizer: (body) => _.trim(_.toLower(body), '.')
  });

  // previously the two directions were implemented as duplicated loops;
  // they are unified here, differing only in the index order.
  const reverse = options.direction === 'left';

  return (tokens) => {
    var seen = 0; // keep track of how many elements we've seen
    var replaced = 0; // keep track of how many elements we've replaced

    for (var i = 0; i < tokens.length; i++) {
      // map the loop counter onto the requested iteration order
      var o = reverse ? (tokens.length - 1 - i) : i;

      // support $ignore
      if (options.ignore(tokens[o], o, tokens)) { continue; }

      // support $maxElements
      if (++seen > options.maxElements) { break; }

      // search for replacement in dictionary
      var replacement = _.get(options.dictionary, options.normalizer(tokens[o].body));
      if (replacement) {

        // support $predicate
        if (!options.predicate(tokens[o], o, tokens)) { continue; }

        // perform replacement
        tokens[o].body = replacement;

        // support $maxReplacements
        replaced++;
        if (replaced >= options.maxReplacements) { break; }
      }
    }

    return tokens;
  };
}
102 |
103 | module.exports = synonyms;
104 |
--------------------------------------------------------------------------------
/lib/cleanup.js:
--------------------------------------------------------------------------------
1 | var _ = require('lodash');
2 |
// Strip the zero-padding from an ordinal token, eg. '02nd' -> '2nd'.
// Tokens without an ordinal suffix (st/nd/rd/th) are returned unchanged.
function removeLeadingZerosFromStreet(token) {
  const paddedOrdinal = /^0*([1-9]\d*(?:st|nd|rd|th))/;
  return token.replace(paddedOrdinal, '$1');
}
6 |
7 | const directionals = ['NE', 'NW', 'SE', 'SW'];
8 |
// Return the canonical casing for a single street-name token.
function capitalizeProperly(token){
  const upper = token.toUpperCase();

  // directionals (NE/NW/SE/SW) are always written fully uppercased
  if (directionals.includes(upper)) {
    return upper;
  }

  // tokens written entirely in one case are title-cased;
  // mixed-case tokens are left untouched
  const singleCased = (token === token.toLowerCase()) || (token === upper);
  return singleCased ? _.capitalize(token) : token;
}
25 |
// Normalize a street name: strip zero-padding from ordinals, drop empty
// tokens, fix the casing of each remaining token and re-join with spaces.
function cleanupStreetName(input) {
  const cleaned = input
    .split(/\s/)
    .map(removeLeadingZerosFromStreet)
    .filter((part) => part.length > 0)
    .map(capitalizeProperly);

  return cleaned.join(' ');
}
36 |
37 | module.exports = {
38 | streetName: cleanupStreetName
39 | };
40 |
--------------------------------------------------------------------------------
/lib/cleanup_v2.js:
--------------------------------------------------------------------------------
1 | const _ = require('lodash');
2 | const dictionary = require('./analysis/dictionary');
3 | const synonyms = require('./analysis/synonyms');
4 | const ordinals = require('./analysis/ordinals');
5 | const Token = require('./analysis/Token');
6 |
7 | /**
8 | * This file contains a street name normalization algorithm
9 | * which attempts to convert poorly formatted street names
 * into a more standardized and aesthetically pleasing form.
 *
 * I've written up some more information about the potential
 * pitfalls of doing this which explain why the code will always
14 | * tend to err on the side of caution.
15 | *
16 | * see: https://github.com/pelias/openaddresses/pull/477
17 | *
18 | * At time of writing the code follows this method:
19 | * 1. If the text is uppercase, with minor exceptions, lowercase it
20 | * 2. Expand the 'generic' portion of the name
21 | * 3. Expand the 'directional' portion of the name
22 | * 4. Capitalize all lowercased words
23 | */
24 |
// load dictionaries from disk (once, at module load time)
const dict = {
  // directional tokens and their expansions (presumably eg. 'N' -> 'North');
  // self-references are included so already-expanded forms also match
  directionalExpansions: dictionary({
    countryCode: 'en',
    filename: 'directional_expansions.txt',
    includeSelfReferences: true
  }),
  // diagonal contractions (presumably eg. 'Northeast' -> 'NE');
  // no self-references: only actual contractions are mapped
  diagonalContractions: dictionary({
    countryCode: 'en',
    filename: 'diagonal_contractions.txt',
    includeSelfReferences: false
  }),
  // street 'generic' types (eg. 'St' -> 'Street'); the overrides file is
  // merged after the USPS list, so its entries win on conflicting keys
  streetTypes: _.merge(
    dictionary({
      countryCode: 'en',
      filename: 'street_types_usps.txt',
      includeSelfReferences: true,
      minLength: 2
    }),
    dictionary({
      countryCode: 'en',
      filename: 'street_types_overrides.txt',
      includeSelfReferences: true,
      minLength: 2
    })
  )
};
52 |
/**
 * Normalize a single street name (English locales).
 *
 * Note: the passes below are order-sensitive — casing is fixed first,
 * then the generic and directional expansions run, and capitalization
 * plus ordinals are applied last. See the header comment of this file
 * for rationale.
 *
 * @param {string} input raw street name
 * @returns {string} normalized street name
 */
function cleanupStreetName(input) {
  // split by whitespace
  const words = input.split(/\s+/);

  // convert strings to objects
  var tokens = words.map(word => new Token(word));

  // remove leading zeros from housenumbers
  tokens.forEach(token => token.removeLeadingZeros());

  // if the token is all uppercase then try to lowercase it
  tokens.forEach(token => token.selectivelyLowerCase());

  // if the token is identified as an abbreviation then uppercase it
  tokens.forEach(token => token.selectivelyUpperCase());

  // street 'generic' expansion ie. the 'St.' or 'Rd.' portion
  // (a single-token name offers no context, so it is skipped)
  if (tokens.length >= 2){
    tokens = synonyms({
      dictionary: dict.streetTypes,
      maxElements: 1,
      maxReplacements: 1,
      direction: 'left',

      // ignore tokens in the directionalExpansions dict
      ignore: (token) => _.has(dict.directionalExpansions, _.toLower(token.body))
    })(tokens);
  }

  // directional expansions (leftmost token)
  if (tokens.length >= 3) {
    tokens = synonyms({
      dictionary: dict.directionalExpansions,
      maxElements: 1,
      maxReplacements: 1,
      predicate: (token, pos, tokens) => {
        // perform a look-ahead on the next token
        // and ensure it's not in the streetTypes dict
        const next = tokens[pos+1];
        if (!_.isObjectLike(next)){ return true; }
        return !_.has(dict.streetTypes, _.toLower(next.body));
      }
    })(tokens);
  }

  // directional expansions (rightmost token)
  if (tokens.length >= 3) {
    tokens = synonyms({
      dictionary: dict.directionalExpansions,
      maxElements: 1,
      maxReplacements: 1,
      direction: 'left'
    })(tokens);
  }

  // diagonal contractions (all tokens)
  if (tokens.length >= 3) {
    tokens = synonyms({
      dictionary: dict.diagonalContractions,
      maxReplacements: 1,
      direction: 'left'
    })(tokens);
  }

  // capitalize lowercased tokens (leaving mixed case tokens unchanged)
  tokens.forEach(token => token.selectivelyCapitalize());

  // add ordinals to english numeric street names
  tokens = ordinals({ dict })(tokens);

  // convert objects to strings and join by whitespace
  return tokens.map(token => token.body).join(' ');
}
126 |
127 | module.exports = {
128 | streetName: cleanupStreetName
129 | };
130 |
--------------------------------------------------------------------------------
/lib/importPipeline.js:
--------------------------------------------------------------------------------
1 | const logger = require('pelias-logger').get('openaddresses');
2 | const recordStream = require('./streams/recordStream');
3 | const model = require('pelias-model');
4 | const peliasDbclient = require('pelias-dbclient');
5 | const blacklistStream = require('pelias-blacklist-stream');
6 | const isUSorCAHouseNumberZero = require('./streams/isUSorCAHouseNumberZero');
7 |
/**
 * Import all OpenAddresses files into Pelias elasticsearch.
 *
 * @param {array of string} files An array of the absolute file-paths to import.
 * @param {string} dirPath Base directory of the OpenAddresses data, used to
 *   derive per-record id prefixes.
 * @param {stream} adminLookupStream Stream which adds admin values to each
 *   address object (since OpenAddresses doesn't contain any). See the
 *   documentation: https://github.com/pelias/admin-lookup
 * @param {string} importerName Name reported by pelias-dbclient for this import.
 */
function createFullImportPipeline( files, dirPath, adminLookupStream, importerName ){ // jshint ignore:line
  logger.info( 'Importing %s files.', files.length );

  recordStream.create(files, dirPath)
    .pipe(blacklistStream())
    .pipe(adminLookupStream)
    .pipe(isUSorCAHouseNumberZero.create())
    .pipe(model.createDocumentMapperStream())
    .pipe(peliasDbclient({name: importerName}));
}
29 |
30 | module.exports = {
31 | create: createFullImportPipeline
32 | };
33 |
--------------------------------------------------------------------------------
/lib/isValidCsvRecord.js:
--------------------------------------------------------------------------------
1 | const _ = require('lodash');
2 | const NULL_ISLAND_THRESHOLD = 0.0005;
3 |
/*
 * Return true when a record has all of LON, LAT, NUMBER and STREET defined,
 * contains no exclusionary placeholder words and does not sit on Null Island.
 */
function isValidCsvRecord( record ){
  if (!hasAllProperties(record)) { return false; }
  if (houseNumberIsExclusionaryWord(record)) { return false; }
  if (streetContainsExclusionaryWord(record)) { return false; }
  return !latLonAreOnNullIsland(record);
}
13 |
/*
 * Return true if record.NUMBER is the literal word 'NULL', 'UNDEFINED',
 * or 'UNAVAILABLE' (case-insensitive)
 */
function houseNumberIsExclusionaryWord(record) {
  return ['NULL', 'UNDEFINED', 'UNAVAILABLE'].indexOf(_.toUpper(record.NUMBER)) !== -1;
}
21 |
/*
 * Return true if record.STREET contains the literal word 'NULL', 'UNDEFINED',
 * or 'UNAVAILABLE' (case-insensitive, matched on word boundaries)
 */
function streetContainsExclusionaryWord(record) {
  return /\b(NULL|UNDEFINED|UNAVAILABLE)\b/i.test(record.STREET);
}
29 |
// Return true when every required column is present: either a numeric
// value (so 0 is accepted) or a non-empty value per lodash semantics.
function hasAllProperties(record) {
  const required = [ 'LON', 'LAT', 'NUMBER', 'STREET' ];
  return required.every((prop) => {
    const value = record[ prop ];
    return _.isNumber(value) || !_.isEmpty(value);
  });
}
35 |
// returns true when LON and LAT both parse to an absolute value below
// $NULL_ISLAND_THRESHOLD, ie. the record effectively sits on "Null Island"
// > parseFloat('0');        // 0
// > parseFloat('0.000000'); // 0
// > parseFloat('0.000001'); // 0.000001
function latLonAreOnNullIsland(record) {
  const nearZero = (prop) => Math.abs(parseFloat(record[prop])) < NULL_ISLAND_THRESHOLD;
  return nearZero('LON') && nearZero('LAT');
}
46 |
47 | module.exports = isValidCsvRecord;
48 |
--------------------------------------------------------------------------------
/lib/parameters.js:
--------------------------------------------------------------------------------
1 | const fs = require('fs');
2 | const util = require('util');
3 | const glob = require('glob');
4 | const path = require('path');
5 | const _ = require('lodash');
6 | const minimist = require('minimist');
7 |
8 | const peliasConfig = require('pelias-config').generate();
9 | const OpenAddressesAPI = require('../utils/OpenAddressesAPI');
10 |
11 | /**
12 | * Interprets the command-line arguments passed to the script.
13 | *
14 | * @param {array} argv Should be `process.argv.slice( 2 )`.
15 | * @return {object} If arguments were succesfully parsed, an object that can be
16 | * used to call `importOpenAddressesDir`:
17 | *
18 | * {
19 | * dirPath: ,
20 | * adminValues: ,
21 | * }
22 | *
23 | * Otherwise, an error object.
24 | *
25 | * {
26 | * exitCode: ,
27 | * errMessage:
28 | * }
29 | */
function interpretUserArgs( argv, config ){
  // allow a config object to be injected (eg. by tests);
  // default to the generated pelias config
  config = config || peliasConfig;

  var usageMessage = [
    'A tool for importing OpenAddresses data into Pelias. Usage:',
    '',
    '\tnode import.js --help | [--admin-values] [OPENADDRESSES_DIR]',
    '',
    '',
    '\t--help: Print this help message.',
    '',
    '\tOPENADDRESSES_DIR: A directory containing OpenAddresses CSV files.',
    '\t\tIf none is specified, the path from your PELIAS_CONFIG\'s',
    '\t\t`imports.openaddresses.datapath` will be used.',
  ].join( '\n' );

  argv = minimist(argv, {});

  // reject any flag we don't recognize
  // (positional arguments arrive under the '_' key)
  var validArgs = ['help', '_', 'parallel-count', 'parallel-id' ];
  for( var arg in argv ){
    if( validArgs.indexOf( arg ) === -1 ){
      return {
        errMessage: util.format( '`%s` is not a recognized argument.', arg ),
        exitCode: 1
      };
    }
  }

  if( argv.help ){
    return { errMessage: usageMessage, exitCode: 0 };
  }

  var opts = {
    'parallel-count': argv['parallel-count'],
    'parallel-id': argv['parallel-id'],
    dirPath: null
  };
  // the first positional argument (when present) overrides the configured datapath
  if( argv._.length > 0 ){
    opts.dirPath = argv._[ 0 ];
  }
  else {
    opts.dirPath = config.imports.openaddresses.datapath;
  }

  opts.dirPath = path.normalize(opts.dirPath);

  // validate that the target exists and is a directory
  if( !fs.existsSync( opts.dirPath ) ){
    return {
      errMessage: util.format( 'Directory `%s` does not exist.', opts.dirPath ),
      exitCode: 2
    };
  }
  else if( !fs.statSync( opts.dirPath ).isDirectory() ){
    return {
      errMessage: util.format( '`%s` is not a directory.', opts.dirPath ),
      exitCode: 2
    };
  }

  return opts;

}
92 |
/**
 * Resolve the full list of absolute file paths to import.
 *
 * When `imports.openaddresses.files` is unset or empty, every
 * .csv/.geojson file (optionally gzipped) under args.dirPath is returned;
 * otherwise each configured source is resolved to a file on disk.
 */
function getFullFileList(peliasConfig, args) {
  // get the files to process
  const files = _.get(peliasConfig.imports.openaddresses, 'files', []);

  if (_.isEmpty(files)) {
    // no specific files listed, so return all .csv and .geojson files
    return glob.sync( args.dirPath + '/**/*.{csv,geojson,geojson.gz,csv.gz}' );
  } else {
    // otherwise return the requested files with full path
    return files.map(file => {

      // normalize source
      const source = OpenAddressesAPI.normalize(file);

      // search for files matching this source id, ending in either .geojson or .csv
      const found = glob.sync(`${source}.{csv,geojson}`, { cwd: args.dirPath, absolute: true });
      if (!_.isEmpty(found)) { return _.last(found); } // results are sorted, prefer .geojson

      // no matching files were found, return a non-matching absolute path
      // (downstream code decides whether a missing file is fatal)
      return path.join(args.dirPath, file);
    });
  }
}
116 |
// Resolve the file list, optionally reduced to this worker's share
// when running in parallel mode (round-robin by file index).
function getFileList(peliasConfig, args) {
  var files = getFullFileList(peliasConfig, args);
  var count = args['parallel-count'];
  var id = args['parallel-id'];

  if (count > 0 && id >= 0) {
    return files.filter(function(element, index) {
      return index % count === id;
    });
  }

  return files;
}
128 |
129 | module.exports = {
130 | interpretUserArgs: interpretUserArgs,
131 | getFileList: getFileList
132 | };
133 |
--------------------------------------------------------------------------------
/lib/streams/cleanupStream.js:
--------------------------------------------------------------------------------
1 | const _ = require('lodash');
2 | const through2 = require('through2');
3 | const cleanup = require('../cleanup');
4 | const cleanupV2 = require('../cleanup_v2');
5 |
6 | /*
7 | * create a stream that performs any needed cleanup on a record
8 | */
9 |
10 | // mapping from openaddresses country codes (from the file names)
11 | // to a language code 'locale' required by next-gen analysis.
12 | const cc2LocaleMap = {
13 | 'us': 'en',
14 | 'ca': 'en',
15 | 'gb': 'en',
16 | 'ie': 'en',
17 | 'au': 'en',
18 | 'nz': 'en',
19 | };
20 |
/**
 * Create a stream which normalizes the STREET field of each record and
 * trims surrounding whitespace from every string field.
 *
 * @param {object} options may contain a `countryCode` key (taken from the file path)
 */
function createCleanupStream(options) {
  // resolve the locale for this country code; unknown codes yield ''
  const locale = _.get(cc2LocaleMap, _.get(options, 'countryCode', ''), '').toLowerCase();

  // use 'cleanup_v2' when we know the locale is 'en', else use the existing 'cleanup' analyzer
  // note: this is a temporary solution to allow us to upgrade gradually without having to
  // test the entire world, with all its different languages, all in the first release.
  const analyzer = (locale === 'en') ? cleanupV2.streetName : cleanup.streetName;

  // generate a stream
  return through2.obj(( record, enc, next ) => {

    // analyze street field
    // note: both analyzers currently declare only a single parameter;
    // the { locale } option is passed for future use
    record.STREET = analyzer(record.STREET, { locale });

    // csvParse will only trim unquoted fields
    // so we have to do it ourselves to handle all whitespace
    Object.keys(record).forEach(key => {
      if (_.isFunction(_.get(record[key], 'trim'))) {
        record[key] = record[key].trim();
      }
    });

    next(null, record);
  });
}
46 |
47 | module.exports = {
48 | create: createCleanupStream
49 | };
50 |
--------------------------------------------------------------------------------
/lib/streams/contentHashStream.js:
--------------------------------------------------------------------------------
1 | const _ = require('lodash');
2 | const crypto = require('crypto');
3 | const through2 = require('through2');
4 |
5 | /*
6 | * create a stream that generates a content-hash for each row
7 | */
8 |
// annotate each record with a deterministic 16-char content hash,
// used downstream as a stable document id (see hash() below)
function createContentHashStream() {
  return through2.obj((record, enc, next) => {
    record.HASH = hash(record);
    next(null, record);
  });
}
14 | }
15 |
// canonical forms for hashed values, so trivially different representations
// (mixed case, repeated whitespace, varying float precision) hash identically
const normalize = {
  // truncate to 7 decimal places and render with a fixed width
  float: (fl) => (Math.floor(parseFloat(fl||0.0)*1e7)/1e7).toFixed(7),
  // collapse whitespace runs, trim and lowercase
  string: (str) => (str||'').toString().replace(/\s+/g, ' ').trim().toLowerCase()
};

// the fields which contribute to the hash, in fixed order
const fields = [
  { key: 'LON', norm: normalize.float },
  { key: 'LAT', norm: normalize.float },
  { key: 'STREET', norm: normalize.string },
  { key: 'NUMBER', norm: normalize.string },
  { key: 'UNIT', norm: normalize.string }
];
28 |
function hash( record ) {
  // an md5 digest is 128 bits (32 hex chars); we keep only the first
  // 16 hex chars (64 bits) to match the 16x hex char uuid4-derived id
  // format used by the openaddresses project, so half is discarded.
  // md5 was chosen due to its universal availability and maturity.
  // note: this algo need not be cryptographically secure, it's just more
  // convenient and reliable to use this method than using other methods.
  const h = crypto.createHash('md5');

  // see: https://github.com/pelias/openaddresses/pull/442#issuecomment-535399779
  fields.forEach( field => {
    // write a null byte in place of an empty value
    // in order to preserve column positions.
    let str = '\0';
    if (_.has(record, field.key)) {
      str = field.norm(_.get(record, field.key));
    }
    h.update(str);
  });

  // return a hexidecimal representation
  return h.digest('hex').substr(0, 16);
}
51 |
52 | module.exports = {
53 | create: createContentHashStream,
54 | hash: hash
55 | };
56 |
--------------------------------------------------------------------------------
/lib/streams/documentStream.js:
--------------------------------------------------------------------------------
1 | const through = require( 'through2' );
2 | const peliasModel = require( 'pelias-model' );
3 |
// pattern to match a two character country code from the directory prefix
5 | const COUNTRY_CODE_PATTERN = /^([A-Za-z]{2})\//;
6 |
7 | /*
8 | * Create a stream of Documents from valid, cleaned CSV records
9 | */
function createDocumentStream(id_prefix, stats) {
  /**
   * Used to track the UID of individual records passing through the stream if
   * there is no HASH that can be used as a more unique identifier. See
   * `peliasModel.Document.setId()` for information about UIDs.
   */
  let uid = 0;

  return through.obj(
    function write( record, enc, next ){
      // prefer the content hash; fall back to the per-stream counter
      const id_number = record.HASH || uid;
      const model_id = `${id_prefix}:${id_number}`;
      uid++;

      try {
        const doc = new peliasModel.Document('openaddresses', 'address', model_id)
          .setName('default', `${record.NUMBER} ${record.STREET}`)
          .setAddress('number', record.NUMBER)
          .setAddress('street', record.STREET)
          .setCentroid({ lon: record.LON, lat: record.LAT });

        // postcode is optional
        if (record.POSTCODE) {
          doc.setAddress('zip', record.POSTCODE);
        }

        // attempt to set the country code based on the directory prefix
        const match = id_prefix.match(COUNTRY_CODE_PATTERN);
        if (match && match[1]) {
          doc.setMeta('country_code', match[1].toUpperCase());
        }

        // store a reference to the original OA record in a 'meta'
        // field, this is available through the pipeline but is not
        // saved to elasticsearch.
        doc.setMeta('oa', record);

        this.push(doc);
      }
      catch ( ex ){
        // records which fail document construction are counted
        // and dropped rather than crashing the import
        stats.badRecordCount++;
      }

      next();
    }
  );
}
56 |
57 | module.exports = {
58 | create: createDocumentStream
59 | };
60 |
--------------------------------------------------------------------------------
/lib/streams/gnafMapperStream.js:
--------------------------------------------------------------------------------
1 | /**
2 | The GNAF mapper is responsible for extracting Australian GNAF
3 | identifiers from the OA 'ID' property, where available.
4 | **/
5 |
6 | const _ = require('lodash');
7 | const through = require('through2');
8 | const logger = require('pelias-logger').get('openaddresses');
9 |
10 | // examples: GAACT718519668, GASA_424005553
11 | const GNAF_PID_PATTERN = /^(GA)(NSW|VIC|QLD|SA_|WA_|TAS|NT_|ACT|OT_)([0-9]{9})$/;
12 |
13 | module.exports = function () {
14 | return through.obj((doc, enc, next) => {
15 | try {
16 | if (doc.getMeta('country_code') === 'AU') {
17 |
18 | // detect Australian G-NAF PID concordances
19 | const oaid = _.get(doc.getMeta('oa'), 'ID');
20 | if (_.isString(oaid) && oaid.length === 14 && oaid.match(GNAF_PID_PATTERN)) {
21 | doc.setAddendum('concordances', { 'gnaf:pid': oaid });
22 | }
23 | }
24 | }
25 |
26 | catch (e) {
27 | logger.error('gnaf_mapper error');
28 | logger.error(e.stack);
29 | logger.error(JSON.stringify(doc, null, 2));
30 | }
31 |
32 | return next(null, doc);
33 | });
34 | };
35 |
--------------------------------------------------------------------------------
/lib/streams/isUSorCAHouseNumberZero.js:
--------------------------------------------------------------------------------
1 | var filter = require('through2-filter');
2 | var _ = require('lodash');
3 |
4 | var allZeros = /^0+$/;
5 |
// return true when the housenumber consists solely of '0' characters
function isZeroHouseNumber(record) {
  return allZeros.test(record.address_parts.number);
}
9 |
// return true when the record's parent country abbreviation is USA or CAN
// (country_a is compared as a single-element array)
function isUSorCA(record) {
  return _.isEqual(record.parent.country_a, ['USA']) ||
    _.isEqual(record.parent.country_a, ['CAN']);
}
14 |
// Stream factory: drops US/CA records whose housenumber is all zeros;
// every other record passes through unchanged.
module.exports.create = function create() {
  return filter.obj(function(record) {
    return !(isZeroHouseNumber(record) && isUSorCA(record));
  });
};
23 |
--------------------------------------------------------------------------------
/lib/streams/recordStream.js:
--------------------------------------------------------------------------------
1 | const _ = require('lodash');
2 | const fs = require('fs');
3 | const path = require('path');
4 | const csvParse = require('csv-parse').parse;
5 | const combinedStream = require('combined-stream');
6 | const through = require('through2');
7 | const split = require('split2');
8 | const zlib = require('zlib');
9 |
10 | const logger = require('pelias-logger').get('openaddresses');
11 | const config = require('pelias-config').generate();
12 |
13 | const CleanupStream = require('./cleanupStream');
14 | const ContentHashStream = require('./contentHashStream');
15 | const ValidRecordFilterStream = require('./validRecordFilterStream');
16 | const DocumentStream = require('./documentStream');
17 | const gnafMapperStreamFactory = require('./gnafMapperStream');
18 | const unitSplittingMapperStreamFactory = require('./unitSplittingMapperStream');
19 |
20 | /*
21 | * Construct a suitable id prefix for a CSV file given
22 | * its full filename and the base directory of all OA CSV files.
23 | */
function getIdPrefix(filename, dirPath) {
  if (filename && dirPath) {
    // if the file is within the dir path, use the structure
    // of the directory tree to create the id
    if (filename.indexOf(dirPath) !== -1) {
      // note: _.replace removes only the first occurrence of each pattern
      var subpath = _.replace(filename, dirPath, '');
      var prefix = _.replace(_.replace(subpath, /\.(csv|geojson)/, ''), /\.gz/, '');
      return _.trim(prefix, '/');
    }
  }

  // if the dirPath doesn't contain this file, return the basename without extension
  // (strips .gz, then .csv, then .geojson, in that order)
  return path.basename(path.basename(path.basename(filename, '.gz'), '.csv'), '.geojson');
}
38 |
/**
 * Create a stream of Documents from an OpenAddresses file.
 *
 * @param {string} filePath The path of an OpenAddresses CSV (or geojson) file.
 * @param {string} dirPath The base directory of all OA files; used to derive
 *   the record id prefix and, from its first path segment, the country code.
 * @return {stream.Readable} A stream of `Document` objects, one
 *   for every valid record inside the OA file.
 */
function createRecordStream( filePath, dirPath ){
  /**
   * A stream to convert rows of a CSV to Document objects.
   */
  var stats = {
    badRecordCount: 0  // incremented by the document stream on invalid rows
  };

  const contentHashStream = ContentHashStream.create();
  const validRecordFilterStream = ValidRecordFilterStream.create();
  const idPrefix = getIdPrefix(filePath, dirPath);
  // the first path segment of the id prefix names the country directory
  const countryCode = idPrefix.replace(/\\/g, '/').split('/')[0];
  const cleanupStream = CleanupStream.create({ countryCode });
  const documentStream = DocumentStream.create(idPrefix, stats);

  documentStream._flush = function end( done ){
    done();
  };

  return fileStreamDispatcher(fs.createReadStream( filePath ), filePath)
    .pipe( contentHashStream )
    .pipe( validRecordFilterStream )
    .pipe( cleanupStream )
    .pipe( documentStream )
    .pipe( gnafMapperStreamFactory() )
    .pipe( unitSplittingMapperStreamFactory() );
}
73 |
// parse newline-delimited GeoJSON into row objects shaped like CSV records
function geojsonStream(stream) {
  return stream
    .pipe(split())
    .pipe(through.obj((line, _enc, next) => {
      let row;
      try {
        const geojson = JSON.parse(line);
        // only Point geometries are supported
        if (_.get(geojson, 'geometry.type') === 'Point') {
          row = {
            NUMBER: _.get(geojson, 'properties.number'),
            STREET: _.get(geojson, 'properties.street'),
            LON: _.get(geojson, 'geometry.coordinates[0]'),
            LAT: _.get(geojson, 'geometry.coordinates[1]'),
            POSTCODE: _.get(geojson, 'properties.postcode'),
            UNIT:_.get(geojson, 'properties.unit'),
            DISTRICT:_.get(geojson, 'properties.district'),
            REGION:_.get(geojson, 'properties.region'),
            CITY:_.get(geojson, 'properties.city')
          };
        }
      } catch(e) {
        logger.error(e);
      }
      // when row is undefined (parse error or non-Point geometry) the
      // Transform callback drops the row rather than forwarding it
      next(null, row);
    }));
}
100 |
// Select the parser for a file based on its extension:
// gunzip .gz inputs, route .geojson files to the geojson parser,
// and parse everything else as CSV.
function fileStreamDispatcher(stream, filePath) {
  if (filePath.endsWith('.gz')) {
    stream = stream.pipe(zlib.createGunzip());
  }

  // note: this pattern is unanchored, so '.geojson' appearing anywhere
  // in the path selects the geojson parser
  if (/\.geojson(\.gz)?/.test(filePath)) {
    return geojsonStream(stream);
  }

  // default: CSV with a header row ('columns: true' maps rows to objects)
  return stream.pipe(csvParse({
    bom: true,
    trim: true,
    skip_empty_lines: true,
    relax_column_count: true,
    relax: true,
    columns: true
  }));
}
119 |
120 | /*
121 | * Create a single stream from many CSV files
122 | */
function createFullRecordStream(files, dirPath) {
  var recordStream = combinedStream.create();

  files.forEach( function forEach( filePath ){
    // a missing file either aborts the whole import or is skipped,
    // depending on the `missingFilesAreFatal` config setting
    if (!fs.existsSync(filePath)) {
      if (config.get('imports.openaddresses.missingFilesAreFatal')) {
        logger.error(`File ${filePath} not found, quitting`);
        process.exit(1);
      } else {
        logger.warn(`File ${filePath} not found, skipping`);
        return;
      }
    }

    // append a factory function so each read stream is created lazily,
    // when combined-stream reaches it
    recordStream.append( function ( next ){
      logger.info( 'Creating read stream for: ' + filePath );
      next(createRecordStream( filePath, dirPath ) );
    });
  });

  return recordStream;
}
145 |
146 | module.exports = {
147 | getIdPrefix: getIdPrefix,
148 | create: createFullRecordStream
149 | };
150 |
--------------------------------------------------------------------------------
/lib/streams/unitSplittingMapperStream.js:
--------------------------------------------------------------------------------
1 | /**
2 | The unit splitting mapper is responsible for detecting when the address.number
3 | field contains the concatenation of the unit and the housenumber.
4 |
5 | eg. Flat 2 14 Smith St
6 |
7 | In this case we attempt to split the two terms into their consituent parts.
8 |
9 | note: Addressing formats vary between countries, it's unlikely that a pattern
10 | which works for one country will also work internationally. For this reason this
11 | mapper accepts a country code which can be used to select the appropriate pattern(s).
12 |
13 | Feel free to make changes to this mapping file!
14 | **/
15 |
16 | const _ = require('lodash');
17 | const through = require('through2');
18 | const logger = require('pelias-logger').get('openaddresses');
19 | const mappers = {};
20 |
// Australasian Unit Number Mapper
// mutates doc in place: splits a combined 'unit housenumber' value into
// separate unit and number fields, then rebuilds the default name.
// https://auspost.com.au/content/dam/auspost_corp/media/documents/Appendix-01.pdf
// https://www.nzpost.co.nz/sites/nz/files/2021-10/adv358-address-standards.pdf
const australasian = (doc) =>{
  const number = doc.getAddress('number');
  // a combined unit+number needs at least 3 chars (eg. '2/1')
  if(!_.isString(number) || number.length < 3){ return; }

  // 2/14
  const solidus = number.match(/^(\d+)\s*\/\s*(\d+)$/);
  if (solidus) {
    doc.setAddress('unit', solidus[1]);
    doc.setAddress('number', solidus[2]);
    // rebuild the display name to reflect the corrected housenumber
    doc.setName('default', `${doc.getAddress('number')} ${doc.getAddress('street')}`);
    return;
  }

  // Flat 2 14 | F 2 14 | Unit 2 14 | APT 2 14
  const verbose = number.match(/^(flat|f|unit|apartment|apt)\s*(\d+)\s+(\d+)$/i);
  if (verbose) {
    doc.setAddress('unit', verbose[2]);
    doc.setAddress('number', verbose[3]);
    doc.setName('default', `${doc.getAddress('number')} ${doc.getAddress('street')}`);
    return;
  }
};
46 |
47 | // associate mappers with country codes
48 | mappers.AU = australasian;
49 | mappers.NZ = australasian;
50 |
51 | module.exports = function () {
52 | return through.obj((doc, enc, next) => {
53 | try {
54 | // only applies to records with a 'number' set and no 'unit' set (yet).
55 | if (doc.hasAddress('number') && !doc.hasAddress('unit')) {
56 |
57 | // select the appropriate mapper based on country code
58 | const mapper = _.get(mappers, doc.getMeta('country_code'));
59 | if (_.isFunction(mapper)) {
60 |
61 | // run the country-specific mapper
62 | mapper(doc);
63 | }
64 | }
65 | }
66 |
67 | catch (e) {
68 | logger.error('unit_mapper error');
69 | logger.error(e.stack);
70 | logger.error(JSON.stringify(doc, null, 2));
71 | }
72 |
73 | return next(null, doc);
74 | });
75 | };
76 |
--------------------------------------------------------------------------------
/lib/streams/validRecordFilterStream.js:
--------------------------------------------------------------------------------
1 | var through = require( 'through2' );
2 |
3 | var logger = require( 'pelias-logger' ).get( 'openaddresses' );
4 | var isValidCsvRecord = require('../isValidCsvRecord');
5 |
6 | /*
7 | * Create a through2 stream to filter out invalid records
8 | */
// Create a through2 stream which forwards only valid records,
// counting (and logging at flush time) the invalid ones it drops.
function createValidRecordFilterStream() {
  var invalidCount = 0;
  return through.obj(
    function( record, enc, next ) {
      if (!isValidCsvRecord(record)) {
        invalidCount++;
        return next();
      }
      next(null, record);
    },
    function(next) {
      logger.verbose('number of invalid records skipped: ' + invalidCount);
      next();
    }
  );
}
23 |
24 | module.exports = {
25 | create: createValidRecordFilterStream
26 | };
27 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "pelias-openaddresses",
3 | "version": "0.0.0-development",
4 | "description": "Pelias import pipeline for OpenAddresses.",
5 | "engines": {
6 | "node": ">=10.0.0"
7 | },
8 | "main": "import.js",
9 | "dependencies": {
10 | "@hapi/joi": "^16.0.1",
11 | "async": "^3.1.0",
12 | "axios": "^1.2.2",
13 | "bottleneck": "^2.19.5",
14 | "combined-stream": "^1.0.7",
15 | "csv-parse": "^5.0.3",
16 | "fs-extra": "^8.1.0",
17 | "glob": "^7.0.0",
18 | "lodash": "^4.16.0",
19 | "minimist": "^1.2.0",
20 | "pelias-blacklist-stream": "^1.0.0",
21 | "pelias-config": "^6.0.0",
22 | "pelias-dbclient": "^3.1.0",
23 | "pelias-logger": "^1.2.1",
24 | "pelias-model": "^10.5.0",
25 | "pelias-wof-admin-lookup": "^7.12.0",
26 | "split2": "^3.2.2",
27 | "temp": "^0.9.1",
28 | "through2": "^3.0.0",
29 | "through2-filter": "^3.0.0",
30 | "through2-map": "^3.0.0",
31 | "through2-sink": "^1.0.0"
32 | },
33 | "devDependencies": {
34 | "colors": "^1.4.0",
35 | "diff": "^5.0.0",
36 | "jshint": "^2.9.4",
37 | "precommit-hook": "^3.0.0",
38 | "proxyquire": "^2.0.0",
39 | "stream-mock": "^2.0.3",
40 | "tap-spec": "^5.0.0",
41 | "tape": "^5.0.0"
42 | },
43 | "scripts": {
44 | "download": "./bin/download",
45 | "import": "./bin/start",
46 | "parallel": "./bin/parallel",
47 | "test": "NODE_ENV=test npm run units",
48 | "units": "./bin/units",
49 | "functional": "NODE_ENV=test node test/functional.js | tap-spec",
50 | "lint": "jshint .",
51 | "validate": "npm ls",
52 | "ci": "npm run test && npm run functional",
53 | "start": "./bin/start"
54 | },
55 | "repository": {
56 | "type": "git",
57 | "url": "https://github.com/pelias/openaddresses.git"
58 | },
59 | "keywords": [
60 | "Pelias",
61 | "OpenAddresses",
62 | "import"
63 | ],
64 | "author": "mapzen",
65 | "license": "MIT",
66 | "bugs": {
67 | "url": "https://github.com/pelias/openaddresses/issues"
68 | },
69 | "homepage": "https://github.com/pelias/openaddresses",
70 | "pre-commit": [
71 | "lint",
72 | "validate",
73 | "test"
74 | ],
75 | "release": {
76 | "branch": "master",
77 | "success": []
78 | }
79 | }
80 |
--------------------------------------------------------------------------------
/schema.js:
--------------------------------------------------------------------------------
1 | const Joi = require('@hapi/joi');
2 |
3 | // Schema Configuration
4 | // datapath: string (required)
5 | // files: array of strings
6 | // adminLookup: boolean
7 | module.exports = Joi.object().keys({
8 | imports: Joi.object().required().keys({
9 | openaddresses: Joi.object().required().keys({
10 | files: Joi.array().items(Joi.string()),
11 | datapath: Joi.string().required(true),
12 | dataHost: Joi.string(),
13 | s3Options: Joi.string(),
14 | adminLookup: Joi.boolean(),
15 | missingFilesAreFatal: Joi.boolean().default(false).truthy('yes').falsy('no'),
16 | token: Joi.string().required(true),
17 | }).unknown(false)
18 | }).unknown(true)
19 | }).unknown(true);
20 |
--------------------------------------------------------------------------------
/test/analysis.js:
--------------------------------------------------------------------------------
const _ = require('lodash');
const through = require('through2');
const split = require('split2');
require('colors');
const Diff = require('diff');
const delim = '|';

// the two street-name analyzers being compared, keyed by display name
const analyzers = {
  streetName: require('../lib/cleanup').streetName,
  streetNameV2: require('../lib/cleanup_v2').streetName
};

// debugging tool: reads street names from stdin (one per line), runs every
// analyzer over each line and prints a pipe-delimited row containing the
// input, each analyzer's output and a colourized character-level diff of
// the two outputs.
const stream = through((chunk, enc, next) => {
  const input = chunk.toString('utf8');
  const row = [input];

  _.forEach(analyzers, (analyzer) => {
    row.push(analyzer(input));
  });

  // both analyzers produced the same result; such lines are not
  // helpful for debugging, so skip them entirely.
  if (row[1] === row[2]) {
    return next();
  }

  // build the coloured diff:
  // green for additions, red for deletions, grey for common parts
  let diffString = '';
  let hasRemoval = false;
  _.forEach(Diff.diffChars(row[1], row[2]), (part) => {
    hasRemoval = (hasRemoval || part.removed);
    const color = part.added ? 'green' : (part.removed ? 'red' : 'grey');
    diffString += part.value[color];
  });

  row.push(diffString);

  // only show lines where characters have been removed
  // if (!hasRemoval){
  //   return next();
  // }

  console.log(row.join(delim));
  next();
});

// print header line
stream.once('pipe', () => console.log(_.concat(['input'], _.keys(analyzers), ['diff']).join(delim)));

process.stdin.pipe(split()).pipe(stream);
53 |
--------------------------------------------------------------------------------
/test/cleanup_v2.js:
--------------------------------------------------------------------------------
const tape = require('tape');
const analyzer = require('../lib/cleanup_v2').streetName;

// sanity check: the exported analyzer must be a function taking a single
// argument (the raw street-name string to be cleaned).
tape('analyzer', (t) => {
  t.equal(typeof analyzer, 'function', 'analyzer is a function');
  t.equal(analyzer.length, 1, 'analyzer accepts body');
  t.end();
});
9 |
10 | // --- Letter Casing ---
11 |
12 | // fix casing on uppercased tokens
13 | tape('casing - fix uppercased tokens', (t) => {
14 | t.equal(analyzer('MAIN STREET'), 'Main Street');
15 | t.equal(analyzer('DR M L KING JR BOULEVARD'), 'Dr M L King Jr Boulevard');
16 |
17 | // uppercase tokens ending with a period
18 | t.equal(analyzer('DR MLK. JR. BOULEVARD'), 'Dr MLK. JR. Boulevard');
19 | t.end();
20 | });
21 |
22 | // fix casing on lowercased tokens
23 | tape('casing - fix lowercased tokens', (t) => {
24 | t.equal(analyzer('main street'), 'Main Street');
25 | t.equal(analyzer('dr m l king jr boulevard'), 'Dr M L King Jr Boulevard');
26 |
27 | // uppercase tokens ending with a period
28 | t.equal(analyzer('dr mlk. jr. boulevard'), 'Dr MLK. JR. Boulevard');
29 | t.end();
30 | });
31 |
// ignore casing on mixedcase tokens: inputs already containing both upper
// and lower case are assumed to be deliberately cased and pass through
// unchanged. (test name previously misspelled 'ingore'.)
tape('casing - ignore casing on mixedcase tokens', (t) => {
  t.equal(analyzer('Willie Mc Donald Way'), 'Willie Mc Donald Way');
  t.equal(analyzer('McCallister Street'), 'McCallister Street');
  t.equal(analyzer('Mc Callister Street'), 'Mc Callister Street');
  t.end();
});
39 |
// --- Expanding the 'generic' part of the street name ---

// expand contracted 'generic' term
// the 'generic' is the street-type portion of the name (street/road/avenue);
// a contraction in the final token position is expanded to its full form.
tape('generic expansion - final token position', (t) => {
  t.equal(analyzer('10 main street'), '10 Main Street');
  t.equal(analyzer('10 main St.'), '10 Main Street');
  t.equal(analyzer('10 main st.'), '10 Main Street');
  t.equal(analyzer('10 main str'), '10 Main Street');
  t.equal(analyzer('10 main st'), '10 Main Street');

  t.equal(analyzer('10 main road'), '10 Main Road');
  t.equal(analyzer('10 main Rd.'), '10 Main Road');
  t.equal(analyzer('10 main rd.'), '10 Main Road');
  t.equal(analyzer('10 main rd'), '10 Main Road');

  t.equal(analyzer('10 main avenue'), '10 Main Avenue');
  t.equal(analyzer('10 main Ave.'), '10 Main Avenue');
  t.equal(analyzer('10 main ave.'), '10 Main Avenue');
  t.equal(analyzer('10 main ave'), '10 Main Avenue');

  // NOTE(review): the following four assertions exactly duplicate the
  // 'avenue' block directly above — likely a copy/paste left-over, or
  // variants such as 'av'/'avn' were intended instead. Verify and dedupe.
  t.equal(analyzer('10 main avenue'), '10 Main Avenue');
  t.equal(analyzer('10 main Ave.'), '10 Main Avenue');
  t.equal(analyzer('10 main ave.'), '10 Main Avenue');
  t.equal(analyzer('10 main ave'), '10 Main Avenue');
  t.end();
});
66 |
// do not expand 'generic' term when not in final token position
// only the very last generic in the name is treated as the street type;
// earlier occurrences are left contracted (title-cased only).
tape('generic expansion - not final token position', (t) => {
  t.equal(analyzer('10 main st st'), '10 Main St Street');
  t.equal(analyzer('10 main st junction'), '10 Main St Junction');
  t.equal(analyzer('AVE ST RD ST PKWY ST'), 'Ave St Rd St Pkwy Street');
  t.end();
});

// we should expand the 'generic' when directly before a directional
// (the generic then isn't strictly last, but the trailing token is a
// directional suffix rather than part of the name)
tape('generic expansion - before directionals', (t) => {
  t.equal(analyzer('Main St N'), 'Main Street North');
  t.equal(analyzer('Main St S'), 'Main Street South');
  t.equal(analyzer('Main St E'), 'Main Street East');
  t.equal(analyzer('Main St W'), 'Main Street West');
  t.equal(analyzer('Main St North'), 'Main Street North');
  t.equal(analyzer('Main St South'), 'Main Street South');
  t.equal(analyzer('Main St East'), 'Main Street East');
  t.equal(analyzer('Main St West'), 'Main Street West');
  t.end();
});
87 |
// do not expand a 'generic' term when there is only one token
// this is logical as the 'generic' should always be paired with
// a 'specific'.
// note: this is likely not necessary but adds a little more safety
// feel free to remove this restriction later if it doesn't make sense.
tape('generic expansion - single token', (t) => {
  t.equal(analyzer('st'), 'St');
  t.equal(analyzer('espl'), 'Espl');
  t.end();
});

// @todo: what should we do when there are multiple 'generic' tokens?
// (currently only the final one, 'PLZ', is expanded; 'SQ' stays contracted)
tape('generic expansion - multiple generic tokens', (t) => {
  t.equal(analyzer('W FARMS SQ PLZ'), 'West Farms Sq Plaza');
  t.end();
});
104 |
// @todo: what should we do when the 'generic' precedes the 'specific'?
// @note: currently this expands 'Ave S' but not 'Ave X' because it thinks
// that S refers to a directional.
// note: renamed from 'generic expansion - multiple generic tokens', which
// duplicated the previous test's name and made TAP output ambiguous.
tape('generic expansion - generic preceding specific', (t) => {
  t.equal(analyzer('AVE X'), 'Ave X');
  t.equal(analyzer('AVE S'), 'Avenue S');
  t.end();
});
113 |
// --- Expanding the 'directional' part of the street name ---

// expand directionals
// note: one issue with contracting directionals is getting
// something like 'East Coast Road' to not change.
// single-letter N/S/E/W prefixes and suffixes expand to their full form.
tape('expand directionals - first token position', (t) => {
  t.equal(analyzer('N Main Street'), 'North Main Street');
  t.equal(analyzer('S Main Street'), 'South Main Street');
  t.equal(analyzer('E Main Street'), 'East Main Street');
  t.equal(analyzer('W Main Street'), 'West Main Street');
  t.end();
});
tape('expand directionals - last token position', (t) => {
  t.equal(analyzer('Main Street N'), 'Main Street North');
  t.equal(analyzer('Main Street S'), 'Main Street South');
  t.equal(analyzer('Main Street E'), 'Main Street East');
  t.equal(analyzer('Main Street W'), 'Main Street West');
  t.end();
});
133 |
// do not expand diagonal (intercardinal) abbreviations: NE/SE/NW/SW are
// kept contracted, unlike the single-letter cardinal N/S/E/W tokens.
// note: both tests below previously reused the names of the cardinal
// 'expand directionals' tests; renamed to keep TAP output unambiguous.
tape('do not expand diagonals - first token position', (t) => {
  t.equal(analyzer('NE Main Street'), 'NE Main Street');
  t.equal(analyzer('SE Main Street'), 'SE Main Street');
  t.equal(analyzer('NW Main Street'), 'NW Main Street');
  t.equal(analyzer('SW Main Street'), 'SW Main Street');
  t.end();
});
tape('do not expand diagonals - last token position', (t) => {
  t.equal(analyzer('Main Street NE'), 'Main Street NE');
  t.equal(analyzer('Main Street SE'), 'Main Street SE');
  t.equal(analyzer('Main Street NW'), 'Main Street NW');
  t.equal(analyzer('Main Street SW'), 'Main Street SW');
  t.end();
});
149 |
// do not expand directionals unless 3 or more tokens present
// with only two tokens the single letter is likely the street name
// itself rather than a directional prefix.
tape('expand directionals - only when 3 or more tokens', (t) => {
  t.equal(analyzer('N Street'), 'N Street');
  t.equal(analyzer('S Street'), 'S Street');
  t.equal(analyzer('E Street'), 'E Street');
  t.equal(analyzer('W Street'), 'W Street');
  t.end();
});

// do not expand directionals when followed by a 'generic'
// (the letter is then the 'specific', eg. 'N St Station')
tape('expand directionals - unless followed by a generic', (t) => {
  t.equal(analyzer('N St Station'), 'N St Station');
  t.equal(analyzer('N Street Station'), 'N Street Station');
  t.equal(analyzer('N Ave Junction'), 'N Ave Junction');
  t.equal(analyzer('N Avenue Junction'), 'N Avenue Junction');
  t.end();
});
167 |
// contract english diagonals (southwest,southeast...)
// the inverse of the cardinal expansion above: fully spelled diagonals
// contract to their two-letter forms.
tape('contract english diagonals - first token position', (t) => {
  t.equal(analyzer('Northeast Main Street'), 'NE Main Street');
  t.equal(analyzer('Southeast Main Street'), 'SE Main Street');
  t.equal(analyzer('Northwest Main Street'), 'NW Main Street');
  t.equal(analyzer('Southwest Main Street'), 'SW Main Street');
  t.end();
});
tape('contract english diagonals - last token position', (t) => {
  t.equal(analyzer('Main Street Northeast'), 'Main Street NE');
  t.equal(analyzer('Main Street Southeast'), 'Main Street SE');
  t.equal(analyzer('Main Street Northwest'), 'Main Street NW');
  t.equal(analyzer('Main Street Southwest'), 'Main Street SW');
  t.end();
});
183 |
// add missing English street name ordinals
// a bare number directly before a generic gains an ordinal suffix
// (1 -> 1st, 2 -> 2nd, 3 -> 3rd, 11 -> 11th, 101 -> 101st ...);
// numbers preceded by 'no' or '#' are left as-is.
tape('add missing English street name ordinals', (t) => {
  t.equal(analyzer('W 26 St'), 'West 26th Street');
  t.equal(analyzer('W 26th St'), 'West 26th Street');
  t.equal(analyzer('1 St'), '1st Street');
  t.equal(analyzer('2 Rd'), '2nd Road');
  t.equal(analyzer('3 Ave'), '3rd Avenue');
  t.equal(analyzer('4 Ln'), '4th Lane');
  t.equal(analyzer('11 St'), '11th Street');
  t.equal(analyzer('12 Rd'), '12th Road');
  t.equal(analyzer('13 Ave'), '13th Avenue');
  t.equal(analyzer('14 Ln'), '14th Lane');
  t.equal(analyzer('101 St'), '101st Street');
  t.equal(analyzer('102 Rd'), '102nd Road');
  t.equal(analyzer('103 Ave'), '103rd Avenue');
  t.equal(analyzer('104 Ln'), '104th Lane');
  t.equal(analyzer('no 1 st'), 'No 1 Street');
  t.equal(analyzer('no #1 st'), 'No #1 Street');
  t.end();
});
204 |
// --- NOOP inputs which should never change ---

// no-ops, these inputs should not change regardless of the algorithm used
// (a regression safety-net against over-eager expansion/contraction)
tape('no-ops', (t) => {

  // street names composed entirely of 'generic' tokens
  t.equal(analyzer('Esplanade'), 'Esplanade');
  t.equal(analyzer('Park Road'), 'Park Road');

  // do not contract directionals which are part of the name
  t.equal(analyzer('East Coast Road'), 'East Coast Road');

  // number prefix
  t.equal(analyzer('No 1 Road'), 'No 1 Road');

  // spanish prefix 'la' should never be expanded to 'lane'
  t.equal(analyzer('La Bamba Road'), 'La Bamba Road');

  // directional as street name
  t.equal(analyzer('N Street'), 'N Street');
  t.equal(analyzer('No Street'), 'No Street');
  t.equal(analyzer('North Street'), 'North Street');
  t.equal(analyzer('Northe Street'), 'Northe Street');

  // do not anglicise/de-anglicise names
  t.equal(analyzer('Centre Road'), 'Centre Road');
  t.equal(analyzer('Center Road'), 'Center Road');
  t.equal(analyzer('Annex Road'), 'Annex Road');
  t.equal(analyzer('Anex Road'), 'Anex Road');

  // personal title in middle of name
  t.equal(analyzer('Main Road St Arnaud'), 'Main Road St Arnaud');
  t.equal(analyzer('Mount St John Avenue'), 'Mount St John Avenue');

  t.end();
});
241 |
242 |
// assorted real-world samples exercising several rules at once
tape('misc', (t) => {
  t.equal(analyzer('YELLOWSTONE BLVD'), 'Yellowstone Boulevard');
  t.equal(analyzer('YESHIVA LN'), 'Yeshiva Lane');
  t.equal(analyzer('WYGANT PL'), 'Wygant Place');
  t.equal(analyzer('W 262 ST'), 'West 262nd Street');
  t.equal(analyzer('W 26TH ST'), 'West 26th Street');
  t.equal(analyzer('WILLIE MC DONALD WAY'), 'Willie Mc Donald Way');
  t.equal(analyzer('West 93rd Street'), 'West 93rd Street');
  t.equal(analyzer('JFK AIRPORT'), 'Jfk Airport'); // this should really uppercase JFK
  t.equal(analyzer('DR M L KING JR BLVD'), 'Dr M L King Jr Boulevard'); // not perfect
  t.equal(analyzer('E HAMPTON BLVD'), 'East Hampton Boulevard');
  t.equal(analyzer('MARATHON PKWY'), 'Marathon Parkway');
  t.equal(analyzer('ANDREWS AVE S'), 'Andrews Avenue South');
  t.equal(analyzer('W 13 ST'), 'West 13th Street');
  t.end();
});

// combinations of cardinal and diagonal directionals in one name;
// note diagonals ('SE') stay contracted while cardinals expand.
tape('misc directionals', (t) => {
  t.equal(analyzer('W KINGSBRIDGE RD'), 'West Kingsbridge Road');
  t.equal(analyzer('W MOSHOLU PKWY S'), 'West Mosholu Parkway South');
  t.equal(analyzer('WILLIAMSBURG ST E'), 'Williamsburg Street East');
  t.equal(analyzer('W MOSHOLU PKWY N'), 'West Mosholu Parkway North');
  t.equal(analyzer('W MOSHOLU PKWY SE'), 'West Mosholu Parkway SE');
  t.equal(analyzer('S WILLIAM ST'), 'South William Street');
  t.equal(analyzer('Foo ST South East'), 'Foo Street South East');
  t.end();
});
270 |
271 | // tape('prefix expansions', (t) => {
272 | // t.equal(analyzer('ST JAMES ST'), 'Saint James Street');
273 | // t.equal(analyzer('ST JAMES AVE'), 'Saint James Avenue');
274 | // t.equal(analyzer('ST. JAMES AVE'), 'Saint James Avenue');
275 | // t.equal(analyzer('ST NICHOLAS TER'), 'Saint Nicholas Terrace');
276 | // t.equal(analyzer('MT DOOM CRES'), 'Mount Doom Crescent');
277 | // t.equal(analyzer('MT. DOOM CRES'), 'Mount Doom Crescent');
278 | // t.equal(analyzer('FT IMPENETRABLE ROW'), 'Fort Impenetrable Row');
279 | // t.equal(analyzer('FT. IMPENETRABLE ROW'), 'Fort Impenetrable Row');
280 | // t.equal(analyzer('St Leonards Drive'), 'Saint Leonards Drive');
281 | // t.equal(analyzer('St Andrew Street'), 'Saint Andrew Street');
282 | // t.end();
283 | // });
284 |
--------------------------------------------------------------------------------
/test/data/au/input_file_3.csv:
--------------------------------------------------------------------------------
1 | LON,LAT,NUMBER,STREET,UNIT,CITY,DISTRICT,REGION,POSTCODE,ID
2 | 144.931874,-37.791488,10,Smith Street,,input city,input district,input region,input postcode,GAVIC718519668
3 |
--------------------------------------------------------------------------------
/test/data/au/input_file_4.csv:
--------------------------------------------------------------------------------
1 | LON,LAT,HASH,NUMBER,STREET,UNIT,CITY,DISTRICT,REGION,POSTCODE,ID
2 | 144.9804144,-37.8723977,710daac656ffd0c3,10/244,BARKLY STREET,,ST KILDA,,VIC,"3182","50579518"
3 | 145.0378718,-37.8637847,92862c98c20bbe3d,10/244-246,WATTLETREE ROAD,,MALVERN,,VIC,"3144","208518759"
4 | 145.0003807,-37.8289596,d0a21035cebcd8ab,10/244-246,MARY STREET,,RICHMOND,,VIC,"3121","51463974"
5 | 144.978361,-37.8002503,4e891155eb009dc3,10/244,BRUNSWICK STREET,,FITZROY,,VIC,"3065","210464257"
6 | 144.9591621,-37.8331898,e20c57c01d5d42c0,110/244,DORCAS STREET,,SOUTH MELBOURNE,,VIC,"3205","423672310"
7 | 144.9591621,-37.8331898,50c85f85cce9181f,210/244,DORCAS STREET,,SOUTH MELBOURNE,,VIC,"3205","423672321"
8 | 144.9591621,-37.8331898,4e737a8cc6ada9ec,310/244,DORCAS STREET,,SOUTH MELBOURNE,,VIC,"3205","423672332"
9 | 144.9591621,-37.8331898,d6ed0494e8c53ff8,410/244,DORCAS STREET,,SOUTH MELBOURNE,,VIC,"3205","423672343"
10 | 144.9591621,-37.8331898,fa0691071a173dab,510/244,DORCAS STREET,,SOUTH MELBOURNE,,VIC,"3205","423672353"
11 | 144.925714,-37.7516895,00be263cea28bea0,10/244,PASCOE VALE ROAD,,ESSENDON,,VIC,"3040","429232726"
12 |
--------------------------------------------------------------------------------
/test/data/expected.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "_index": "pelias",
4 | "_id": "openaddresses:address:input_file_1:7552fdd1d9eb5765",
5 | "data": {
6 | "name": {
7 | "default": "100 Main St"
8 | },
9 | "phrase": {
10 | "default": "100 Main St"
11 | },
12 | "parent": {
13 | "country": [
14 | "override country"
15 | ],
16 | "country_id": [
17 | "1"
18 | ],
19 | "country_a": [
20 | null
21 | ],
22 | "country_source": [
23 | null
24 | ],
25 | "macroregion": [
26 | "override macroregion"
27 | ],
28 | "macroregion_id": [
29 | "2"
30 | ],
31 | "macroregion_a": [
32 | null
33 | ],
34 | "macroregion_source": [
35 | null
36 | ],
37 | "region": [
38 | "override region"
39 | ],
40 | "region_id": [
41 | "3"
42 | ],
43 | "region_a": [
44 | null
45 | ],
46 | "region_source": [
47 | null
48 | ],
49 | "macrocounty": [
50 | "override macrocounty"
51 | ],
52 | "macrocounty_id": [
53 | "4"
54 | ],
55 | "macrocounty_a": [
56 | null
57 | ],
58 | "macrocounty_source": [
59 | null
60 | ],
61 | "county": [
62 | "override county"
63 | ],
64 | "county_id": [
65 | "5"
66 | ],
67 | "county_a": [
68 | null
69 | ],
70 | "county_source": [
71 | null
72 | ],
73 | "borough": [
74 | "override borough"
75 | ],
76 | "borough_id": [
77 | "6"
78 | ],
79 | "borough_a": [
80 | null
81 | ],
82 | "borough_source": [
83 | null
84 | ],
85 | "locality": [
86 | "override locality"
87 | ],
88 | "locality_id": [
89 | "7"
90 | ],
91 | "locality_a": [
92 | null
93 | ],
94 | "locality_source": [
95 | null
96 | ],
97 | "localadmin": [
98 | "override localadmin"
99 | ],
100 | "localadmin_id": [
101 | "8"
102 | ],
103 | "localadmin_a": [
104 | null
105 | ],
106 | "localadmin_source": [
107 | null
108 | ],
109 | "neighbourhood": [
110 | "override neighbourhood"
111 | ],
112 | "neighbourhood_id": [
113 | "9"
114 | ],
115 | "neighbourhood_a": [
116 | null
117 | ],
118 | "neighbourhood_source": [
119 | null
120 | ]
121 | },
122 | "address_parts": {
123 | "number": "100",
124 | "street": "Main St",
125 | "zip": "input postcode"
126 | },
127 | "center_point": {
128 | "lon": 21.212121,
129 | "lat": 12.121212
130 | },
131 | "source": "openaddresses",
132 | "layer": "address",
133 | "source_id": "input_file_1:7552fdd1d9eb5765"
134 | }
135 | },
136 | {
137 | "_index": "pelias",
138 | "_id": "openaddresses:address:input_file_1:e21716b47966b98a",
139 | "data": {
140 | "name": {
141 | "default": "200 Main St"
142 | },
143 | "phrase": {
144 | "default": "200 Main St"
145 | },
146 | "address_parts": {
147 | "number": "200",
148 | "street": "Main St"
149 | },
150 | "center_point": {
151 | "lon": 31.313131,
152 | "lat": 13.131313
153 | },
154 | "source": "openaddresses",
155 | "layer": "address",
156 | "source_id": "input_file_1:e21716b47966b98a"
157 | }
158 | },
159 | {
160 | "_index": "pelias",
161 | "_id": "openaddresses:address:input_file_1:7456321cc7d6d352",
162 | "data": {
163 | "name": {
164 | "default": "0 Main St"
165 | },
166 | "phrase": {
167 | "default": "0 Main St"
168 | },
169 | "address_parts": {
170 | "number": "0",
171 | "street": "Main St"
172 | },
173 | "center_point": {
174 | "lon": 41.414141,
175 | "lat": 14.141414
176 | },
177 | "source": "openaddresses",
178 | "layer": "address",
179 | "source_id": "input_file_1:7456321cc7d6d352"
180 | }
181 | },
182 | {
183 | "_index": "pelias",
184 | "_id": "openaddresses:address:input_file_1:f026cd5494a7e4f4",
185 | "data": {
186 | "name": {
187 | "default": "0 Elm St"
188 | },
189 | "phrase": {
190 | "default": "0 Elm St"
191 | },
192 | "address_parts": {
193 | "number": "0",
194 | "street": "Elm St"
195 | },
196 | "center_point": {
197 | "lon": 51.515151,
198 | "lat": 15.151515
199 | },
200 | "source": "openaddresses",
201 | "layer": "address",
202 | "source_id": "input_file_1:f026cd5494a7e4f4"
203 | }
204 | },
205 | {
206 | "_index": "pelias",
207 | "_id": "openaddresses:address:input_file_1:4509c0194f1efaca",
208 | "data": {
209 | "name": {
210 | "default": "300 Main St"
211 | },
212 | "phrase": {
213 | "default": "300 Main St"
214 | },
215 | "address_parts": {
216 | "number": "300",
217 | "street": "Main St"
218 | },
219 | "center_point": {
220 | "lon": 61.616161,
221 | "lat": 16.161616
222 | },
223 | "source": "openaddresses",
224 | "layer": "address",
225 | "source_id": "input_file_1:4509c0194f1efaca"
226 | }
227 | },
228 | {
229 | "_index": "pelias",
230 | "_id": "openaddresses:address:input_file_2:fc6d8b0a0e5cda70",
231 | "data": {
232 | "name": {
233 | "default": "400 Vireo Rd"
234 | },
235 | "phrase": {
236 | "default": "400 Vireo Rd"
237 | },
238 | "address_parts": {
239 | "number": "400",
240 | "street": "Vireo Rd"
241 | },
242 | "center_point": {
243 | "lon": 71.717171,
244 | "lat": 17.171717
245 | },
246 | "source": "openaddresses",
247 | "layer": "address",
248 | "source_id": "input_file_2:fc6d8b0a0e5cda70"
249 | }
250 | },
251 | {
252 | "_index": "pelias",
253 | "_id": "openaddresses:address:input_file_2:b7c25b5e6eea7831",
254 | "data": {
255 | "name": {
256 | "default": "0 Vireo Rd"
257 | },
258 | "phrase": {
259 | "default": "0 Vireo Rd"
260 | },
261 | "address_parts": {
262 | "number": "0",
263 | "street": "Vireo Rd"
264 | },
265 | "center_point": {
266 | "lon": 81.818181,
267 | "lat": 18.181818
268 | },
269 | "source": "openaddresses",
270 | "layer": "address",
271 | "source_id": "input_file_2:b7c25b5e6eea7831"
272 | }
273 | },
274 | {
275 | "_index": "pelias",
276 | "_id": "openaddresses:address:input_file_2:25d52af880bfefc4",
277 | "data": {
278 | "name": {
279 | "default": "500 Calle De Lago"
280 | },
281 | "phrase": {
282 | "default": "500 Calle De Lago"
283 | },
284 | "address_parts": {
285 | "number": "500",
286 | "street": "Calle De Lago"
287 | },
288 | "center_point": {
289 | "lon": 91.919191,
290 | "lat": 19.191919
291 | },
292 | "source": "openaddresses",
293 | "layer": "address",
294 | "source_id": "input_file_2:25d52af880bfefc4"
295 | }
296 | },
297 | {
298 | "_index": "pelias",
299 | "_id": "openaddresses:address:input_file_2:0d9cb0ba093a3d23",
300 | "data": {
301 | "name": {
302 | "default": "500 Calle De Lago"
303 | },
304 | "phrase": {
305 | "default": "500 Calle De Lago"
306 | },
307 | "address_parts": {
308 | "number": "500",
309 | "street": "Calle De Lago"
310 | },
311 | "center_point": {
312 | "lon": 92.929292,
313 | "lat": 29.292929
314 | },
315 | "source": "openaddresses",
316 | "layer": "address",
317 | "source_id": "input_file_2:0d9cb0ba093a3d23"
318 | }
319 | },
320 | {
321 | "_index": "pelias",
322 | "_id": "openaddresses:address:au/input_file_3:0c0641950f5693a0",
323 | "data": {
324 | "name": {
325 | "default": "10 Smith Street"
326 | },
327 | "phrase": {
328 | "default": "10 Smith Street"
329 | },
330 | "address_parts": {
331 | "number": "10",
332 | "street": "Smith Street",
333 | "zip": "input postcode"
334 | },
335 | "center_point": {
336 | "lon": 144.931874,
337 | "lat": -37.791488
338 | },
339 | "source": "openaddresses",
340 | "layer": "address",
341 | "source_id": "au/input_file_3:0c0641950f5693a0",
342 | "addendum": {
343 | "concordances": "{\"gnaf:pid\":\"GAVIC718519668\"}"
344 | }
345 | }
346 | },
347 | {
348 | "_index": "pelias",
349 | "_id": "openaddresses:address:au/input_file_4:2e7dc83e6d7c43b5",
350 | "data": {
351 | "name": {
352 | "default": "244 Barkly Street"
353 | },
354 | "phrase": {
355 | "default": "244 Barkly Street"
356 | },
357 | "address_parts": {
358 | "number": "244",
359 | "street": "Barkly Street",
360 | "zip": "3182",
361 | "unit": "10"
362 | },
363 | "center_point": {
364 | "lon": 144.980414,
365 | "lat": -37.872398
366 | },
367 | "source": "openaddresses",
368 | "layer": "address",
369 | "source_id": "au/input_file_4:2e7dc83e6d7c43b5"
370 | }
371 | },
372 | {
373 | "_index": "pelias",
374 | "_id": "openaddresses:address:au/input_file_4:b9d4e479b3787466",
375 | "data": {
376 | "name": {
377 | "default": "10/244-246 Wattletree Road"
378 | },
379 | "phrase": {
380 | "default": "10/244-246 Wattletree Road"
381 | },
382 | "address_parts": {
383 | "number": "10/244-246",
384 | "street": "Wattletree Road",
385 | "zip": "3144"
386 | },
387 | "center_point": {
388 | "lon": 145.037872,
389 | "lat": -37.863785
390 | },
391 | "source": "openaddresses",
392 | "layer": "address",
393 | "source_id": "au/input_file_4:b9d4e479b3787466"
394 | }
395 | },
396 | {
397 | "_index": "pelias",
398 | "_id": "openaddresses:address:au/input_file_4:5d465daf4228aeae",
399 | "data": {
400 | "name": {
401 | "default": "10/244-246 Mary Street"
402 | },
403 | "phrase": {
404 | "default": "10/244-246 Mary Street"
405 | },
406 | "address_parts": {
407 | "number": "10/244-246",
408 | "street": "Mary Street",
409 | "zip": "3121"
410 | },
411 | "center_point": {
412 | "lon": 145.000381,
413 | "lat": -37.82896
414 | },
415 | "source": "openaddresses",
416 | "layer": "address",
417 | "source_id": "au/input_file_4:5d465daf4228aeae"
418 | }
419 | },
420 | {
421 | "_index": "pelias",
422 | "_id": "openaddresses:address:au/input_file_4:19c92a8fab88d851",
423 | "data": {
424 | "name": {
425 | "default": "244 Brunswick Street"
426 | },
427 | "phrase": {
428 | "default": "244 Brunswick Street"
429 | },
430 | "address_parts": {
431 | "number": "244",
432 | "street": "Brunswick Street",
433 | "zip": "3065",
434 | "unit": "10"
435 | },
436 | "center_point": {
437 | "lon": 144.978361,
438 | "lat": -37.80025
439 | },
440 | "source": "openaddresses",
441 | "layer": "address",
442 | "source_id": "au/input_file_4:19c92a8fab88d851"
443 | }
444 | },
445 | {
446 | "_index": "pelias",
447 | "_id": "openaddresses:address:au/input_file_4:2d93ffa5b82d5815",
448 | "data": {
449 | "name": {
450 | "default": "244 Dorcas Street"
451 | },
452 | "phrase": {
453 | "default": "244 Dorcas Street"
454 | },
455 | "address_parts": {
456 | "number": "244",
457 | "street": "Dorcas Street",
458 | "zip": "3205",
459 | "unit": "110"
460 | },
461 | "center_point": {
462 | "lon": 144.959162,
463 | "lat": -37.83319
464 | },
465 | "source": "openaddresses",
466 | "layer": "address",
467 | "source_id": "au/input_file_4:2d93ffa5b82d5815"
468 | }
469 | },
470 | {
471 | "_index": "pelias",
472 | "_id": "openaddresses:address:au/input_file_4:a6e27ffafd14a0da",
473 | "data": {
474 | "name": {
475 | "default": "244 Dorcas Street"
476 | },
477 | "phrase": {
478 | "default": "244 Dorcas Street"
479 | },
480 | "address_parts": {
481 | "number": "244",
482 | "street": "Dorcas Street",
483 | "zip": "3205",
484 | "unit": "210"
485 | },
486 | "center_point": {
487 | "lon": 144.959162,
488 | "lat": -37.83319
489 | },
490 | "source": "openaddresses",
491 | "layer": "address",
492 | "source_id": "au/input_file_4:a6e27ffafd14a0da"
493 | }
494 | },
495 | {
496 | "_index": "pelias",
497 | "_id": "openaddresses:address:au/input_file_4:4c18bc2fab1af1ee",
498 | "data": {
499 | "name": {
500 | "default": "244 Dorcas Street"
501 | },
502 | "phrase": {
503 | "default": "244 Dorcas Street"
504 | },
505 | "address_parts": {
506 | "number": "244",
507 | "street": "Dorcas Street",
508 | "zip": "3205",
509 | "unit": "310"
510 | },
511 | "center_point": {
512 | "lon": 144.959162,
513 | "lat": -37.83319
514 | },
515 | "source": "openaddresses",
516 | "layer": "address",
517 | "source_id": "au/input_file_4:4c18bc2fab1af1ee"
518 | }
519 | },
520 | {
521 | "_index": "pelias",
522 | "_id": "openaddresses:address:au/input_file_4:d5236248ba736eba",
523 | "data": {
524 | "name": {
525 | "default": "244 Dorcas Street"
526 | },
527 | "phrase": {
528 | "default": "244 Dorcas Street"
529 | },
530 | "address_parts": {
531 | "number": "244",
532 | "street": "Dorcas Street",
533 | "zip": "3205",
534 | "unit": "410"
535 | },
536 | "center_point": {
537 | "lon": 144.959162,
538 | "lat": -37.83319
539 | },
540 | "source": "openaddresses",
541 | "layer": "address",
542 | "source_id": "au/input_file_4:d5236248ba736eba"
543 | }
544 | },
545 | {
546 | "_index": "pelias",
547 | "_id": "openaddresses:address:au/input_file_4:7dbcebf7bd632ef8",
548 | "data": {
549 | "name": {
550 | "default": "244 Dorcas Street"
551 | },
552 | "phrase": {
553 | "default": "244 Dorcas Street"
554 | },
555 | "address_parts": {
556 | "number": "244",
557 | "street": "Dorcas Street",
558 | "zip": "3205",
559 | "unit": "510"
560 | },
561 | "center_point": {
562 | "lon": 144.959162,
563 | "lat": -37.83319
564 | },
565 | "source": "openaddresses",
566 | "layer": "address",
567 | "source_id": "au/input_file_4:7dbcebf7bd632ef8"
568 | }
569 | },
570 | {
571 | "_index": "pelias",
572 | "_id": "openaddresses:address:au/input_file_4:822280a9b8a92d85",
573 | "data": {
574 | "name": {
575 | "default": "244 Pascoe Vale Road"
576 | },
577 | "phrase": {
578 | "default": "244 Pascoe Vale Road"
579 | },
580 | "address_parts": {
581 | "number": "244",
582 | "street": "Pascoe Vale Road",
583 | "zip": "3040",
584 | "unit": "10"
585 | },
586 | "center_point": {
587 | "lon": 144.925714,
588 | "lat": -37.751689
589 | },
590 | "source": "openaddresses",
591 | "layer": "address",
592 | "source_id": "au/input_file_4:822280a9b8a92d85"
593 | }
594 | }
595 | ]
--------------------------------------------------------------------------------
/test/data/input_file_1.csv:
--------------------------------------------------------------------------------
1 | LON,LAT,NUMBER,STREET,UNIT,CITY,DISTRICT,REGION,POSTCODE,ID
2 | 21.212121,12.121212,100,Main St,,input city,input district,input region,input postcode,GOOD RECORD
3 | 31.313131,13.131313, 200 , Main St ,,,,,,GOOD RECORD WITH FIELD TRIMMING
4 | 41.414141,14.141414,0,Main St,,,,,,WILL BE LOGGED BUT NOT SKIPPED (NUMBER IS REDUCEABLE TO 0)
5 | 51.515151,15.151515,00,Elm St,,,,,,WILL BE LOGGED BUT NOT SKIPPED (NUMBER IS REDUCEABLE TO 0)
6 | 61.616161,16.161616,00300,Main St,,,,,,MAINTAINS LEADING ZEROES
7 |
--------------------------------------------------------------------------------
/test/data/input_file_2.csv:
--------------------------------------------------------------------------------
1 | LON,LAT,NUMBER,STREET,UNIT,CITY,DISTRICT,REGION,POSTCODE,ID
2 | 71.717171,17.171717,400,Vireo Rd,,,,,,GOOD RECORD
3 | 81.818181,18.181818,00000,Vireo Rd,,,,,,WILL BE LOGGED BUT NOT SKIPPED (NUMBER IS REDUCEABLE TO 0)
4 | 91.919191,19.191919,00500,Calle de Lago,,,,,,MAINTAINS LEADING ZEROES
5 | 92.929292,29.292929,00500,Calle de Lago,,,,,,THIS GETS FILTERED OUT BY DEDUPE
6 |
--------------------------------------------------------------------------------
/test/functional.js:
--------------------------------------------------------------------------------
1 | require( './importPipeline' ); // functional test entry point: delegates to the importPipeline end-to-end test
2 |
--------------------------------------------------------------------------------
/test/import.js:
--------------------------------------------------------------------------------
1 | const tape = require( 'tape' );
2 |
3 | const proxyquire = require('proxyquire').noCallThru(); // noCallThru: stubbed module paths are never resolved on disk
4 |
5 | tape( 'config.generate throwing error should rethrow', (test) => { // ../import must propagate config validation failures
6 |   test.throws(() => {
7 |     proxyquire('../import', {
8 |       './schema': 'this is the schema',
9 |       'pelias-config': {
10 |         generate: (schema) => {
11 |           // the schema passed to generate should be the require'd schema
12 |           test.equals(schema, 'this is the schema');
13 |
14 |           throw Error('config is not valid');
15 |         }
16 |       }
17 |     })();
18 |
19 |   }, /config is not valid/);
20 |
21 |   test.end();
22 |
23 | });
24 |
--------------------------------------------------------------------------------
/test/importPipeline.js:
--------------------------------------------------------------------------------
1 | const _ = require('lodash');
2 | const path = require('path');
3 | const tape = require('tape');
4 | const map = require('through2-map');
5 | const proxyquire = require('proxyquire');
6 | const stream_mock = require('stream-mock');
7 |
8 | const expectedPath = path.join(__dirname, 'data/expected.json');
9 | const expected = require(expectedPath);
10 |
11 | tape('functional test of importing four small OA files', function(t) {
12 |   // expect two assertions, one for the error and one for the data
13 |   t.plan(2);
14 |
15 |   const assert = (err, actual) => {
16 |     // uncomment this to write the actual results to the expected file
17 |     // make sure they look ok though. comma left off so jshint reminds you
18 |     // not to commit this line
19 |     // require('fs').writeFileSync(expectedPath, JSON.stringify(actual, null, 2))
20 |
21 |     t.error(err);
22 |     t.deepEquals(actual, expected);
23 |     t.end();
24 |   };
25 |
26 |   const importPipeline = proxyquire('../lib/importPipeline', {
27 |     'pelias-dbclient': () => { // stub the database client with an in-memory writable stream; no real ES needed
28 |       const dbclient = new stream_mock.ObjectWritableMock();
29 |       dbclient.on('error', (e) => assert(e));
30 |       dbclient.on('finish', () => assert(null, dbclient.data));
31 |       return dbclient;
32 |     }
33 |   });
34 |
35 |   // mock admin lookup stream to show that input file admin values are ignored
36 |   // and replaced with overrides from adminLookup
37 |   const adminLookupStream = map.obj((record) => {
38 |     // we're only concerned about one record being modified
39 |     if (_.isEqual(record.center_point, { lat: 12.121212, lon: 21.212121})) {
40 |       record.addParent('country', 'override country', '1');
41 |       record.addParent('macroregion', 'override macroregion', '2');
42 |       record.addParent('region', 'override region', '3');
43 |       record.addParent('macrocounty', 'override macrocounty', '4');
44 |       record.addParent('county', 'override county', '5');
45 |       record.addParent('borough', 'override borough', '6');
46 |       record.addParent('locality', 'override locality', '7');
47 |       record.addParent('localadmin', 'override localadmin', '8');
48 |       record.addParent('neighbourhood', 'override neighbourhood', '9');
49 |     }
50 |
51 |     return record;
52 |   });
53 |
54 |   // test fixtures
55 |   const dirPath = path.join(__dirname, 'data');
56 |   const inputFiles = [
57 |     path.join(dirPath, 'input_file_1.csv'),
58 |     path.join(dirPath, 'input_file_2.csv'),
59 |     path.join(dirPath, 'au/input_file_3.csv'),
60 |     path.join(dirPath, 'au/input_file_4.csv')
61 |   ];
62 |
63 |   importPipeline.create(inputFiles, dirPath, adminLookupStream); // kick off the pipeline; assertions fire when the mock dbclient finishes
64 | });
65 |
--------------------------------------------------------------------------------
/test/isValidCsvRecord.js:
--------------------------------------------------------------------------------
1 | var tape = require( 'tape' );
2 |
3 | var isValidCsvRecord = require( '../lib/isValidCsvRecord' ); // unit under test: row-level CSV record validation
4 |
5 | tape( 'Identifies invalid CSV records.', function ( test ){
6 |   var records = [
7 |     {LON: '1', LAT: '2', STREET: '3', NUMBER: '', FOO: '', SOME_PROP: ''},
8 |     {LON: '', LAT: '2', STREET: '3', NUMBER: '', FOO: '', SOME_PROP: 'something'},
9 |     {LON: '', LAT: '2', STREET: '', NUMBER: '4', SOME_PROP: 'value'}
10 |   ];
11 |   records.forEach( function ( rec ){
12 |     test.ok( !isValidCsvRecord( rec ), 'Record identified as invalid' );
13 |   });
14 |
15 |   var validRecord = {LON: '1', LAT: '2', STREET: '3', NUMBER: '4', SOME_PROP: 'abs'};
16 |   test.ok( isValidCsvRecord( validRecord ), 'Record identified as valid.' );
17 |   test.end();
18 | });
19 |
20 | tape( 'Identifies CSV files that have incorrect columns', function( test) {
21 |   var record = { 'notLat': 'asdf', 'notLon': 5 };
22 |
23 |   test.ok( !isValidCsvRecord( record ), 'Record identified as invalid' );
24 |   test.end();
25 | });
26 |
27 | tape('complete record but house number is literal word `null` should return false', function(test) {
28 |   var record = {
29 |     LON: '1', LAT: '2', NUMBER: 'NuLl', STREET: 'Street' // mixed case: the check is expected to be case-insensitive
30 |   };
31 |
32 |   test.ok( !isValidCsvRecord(record), 'Record identified as invalid');
33 |   test.end();
34 |
35 | });
36 |
37 | tape('complete record but house number is literal word `undefined` should return false', function(test) {
38 |   var record = {
39 |     LON: '1', LAT: '2', NUMBER: 'uNdEfInEd', STREET: 'Street'
40 |   };
41 |
42 |   test.ok( !isValidCsvRecord(record), 'Record identified as invalid');
43 |   test.end();
44 |
45 | });
46 |
47 | tape('complete record but house number is literal word `unavailable` should return false', function(test) {
48 |   var record = {
49 |     LON: '1', LAT: '2', NUMBER: 'uNaVaIlAbLe', STREET: 'Street'
50 |   };
51 |
52 |   test.ok( !isValidCsvRecord(record), 'Record identified as invalid');
53 |   test.end();
54 |
55 | });
56 |
57 | tape('complete record but street contains literal word `null` should return false', function(test) {
58 |   var records = [
59 |     { LON: '1', LAT: '2', NUMBER: 'Number', STREET: 'NuLl Name St' },
60 |     { LON: '1', LAT: '2', NUMBER: 'Number', STREET: 'South NULL St' },
61 |     { LON: '1', LAT: '2', NUMBER: 'Number', STREET: 'South Name null' }
62 |   ];
63 |
64 |   records.forEach( function ( rec ){
65 |     test.ok( !isValidCsvRecord( rec ), 'Record identified as invalid' );
66 |   });
67 |
68 |   test.end();
69 |
70 | });
71 |
72 | tape('complete record but street contains literal word `undefined` should return false', function(test) {
73 |   var records = [
74 |     { LON: '1', LAT: '2', NUMBER: 'Number', STREET: 'uNdEfInEd Name St' },
75 |     { LON: '1', LAT: '2', NUMBER: 'Number', STREET: 'South UNDEFINED St' },
76 |     { LON: '1', LAT: '2', NUMBER: 'Number', STREET: 'South Name undefined' }
77 |   ];
78 |
79 |   records.forEach( function ( rec ){
80 |     test.ok( !isValidCsvRecord( rec ), 'Record identified as invalid' );
81 |   });
82 |
83 |   test.end();
84 |
85 | });
86 |
87 | tape('complete record but street contains literal word `unavailable` should return false', function(test) {
88 |   var records = [
89 |     { LON: '1', LAT: '2', NUMBER: 'Number', STREET: 'uNaVaIlAbLe Name St' },
90 |     { LON: '1', LAT: '2', NUMBER: 'Number', STREET: 'South UNAVAILABLE St' },
91 |     { LON: '1', LAT: '2', NUMBER: 'Number', STREET: 'South Name unavailable' }
92 |   ];
93 |
94 |   records.forEach( function ( rec ){
95 |     test.ok( !isValidCsvRecord( rec ), 'Record identified as invalid' );
96 |   });
97 |
98 |   test.end();
99 |
100 | });
101 |
102 | tape('street with substring `null` but not on word boundary should return true', function(test) {
103 |   var record = {
104 |     LON: '1', LAT: '2', NUMBER: 'Number', STREET: 'Snull Street Nulls' // matching must respect word boundaries
105 |   };
106 |
107 |   test.ok( isValidCsvRecord(record), 'Record identified as valid');
108 |   test.end();
109 |
110 | });
111 |
112 | tape('street with substring `undefined` but not on word boundary should return true', function(test) {
113 |   var record = {
114 |     LON: '1', LAT: '2', NUMBER: 'Number', STREET: 'Sundefined Street Undefineds'
115 |   };
116 |
117 |   test.ok( isValidCsvRecord(record), 'Record identified as valid');
118 |   test.end();
119 |
120 | });
121 |
122 | tape('street with substring `unavailable` but not on word boundary should return true', function(test) {
123 |   var record = {
124 |     LON: '1', LAT: '2', NUMBER: 'Number', STREET: 'Sunavailable Street Unavailables'
125 |   };
126 |
127 |   test.ok( isValidCsvRecord(record), 'Record identified as valid');
128 |   test.end();
129 |
130 | });
131 |
132 | tape('record with lon/lat parseable as 0/0 should return false', test => { // null island coordinates are rejected
133 |   const record = {
134 |     LON: '0.000000',
135 |     LAT: '0.000000',
136 |     NUMBER: 'Number',
137 |     STREET: 'Street'
138 |   };
139 |
140 |   test.notOk( isValidCsvRecord(record), 'should be rejected');
141 |   test.end();
142 |
143 | });
144 |
145 | tape('record with lon/lat parseable as 0/non-0 should return true', test => {
146 |   const record = {
147 |     LON: '0.0000',
148 |     LAT: '0.0006',
149 |     NUMBER: 'Number',
150 |     STREET: 'Street'
151 |   };
152 |
153 |   test.ok( isValidCsvRecord(record), 'should be accepted');
154 |   test.end();
155 |
156 | });
157 |
158 | tape('record with lon/lat parseable as non-0/0 should return true', test => {
159 |   const record = {
160 |     LON: '0.0006',
161 |     LAT: '0.0000',
162 |     NUMBER: 'Number',
163 |     STREET: 'Street'
164 |   };
165 |
166 |   test.ok( isValidCsvRecord(record), 'should be accepted');
167 |   test.end();
168 |
169 | });
170 |
171 | tape('record with lon/lat very close to 0,0 should return false', test => {
172 |   const record = {
173 |     LON: '0.000000',
174 |     LAT: '0.000001',
175 |     NUMBER: 'Number',
176 |     STREET: 'Street'
177 |   };
178 |
179 |   test.notOk(isValidCsvRecord(record), 'should not be accepted - too near to 0,0');
180 |   test.end();
181 |
182 | });
183 |
184 | tape('record with lon/lat very close to 0,0 should return false', test => {
185 |   const record = {
186 |     LON: '0.000001',
187 |     LAT: '0.000000',
188 |     NUMBER: 'Number',
189 |     STREET: 'Street'
190 |   };
191 |
192 |   test.notOk(isValidCsvRecord(record), 'should not be accepted - too near to 0,0');
193 |   test.end();
194 |
195 | });
196 |
--------------------------------------------------------------------------------
/test/openaddresses_bad_data.csv:
--------------------------------------------------------------------------------
1 | LON,LAT,NUMBER,STREET,FOOBAR,DEADBEEF
2 | ,-40,a,b,,
3 | -40,,a,b,,
4 | ,,,,,
5 | ,,30,b,,
6 | 40,40,a,,,
7 | ,,,30,,,
8 | ,..........,,,,,,,,,,,
9 | 40,40,5, ,,
10 | 40,40,5," ",,
11 |
--------------------------------------------------------------------------------
/test/openaddresses_sample.csv:
--------------------------------------------------------------------------------
1 | LON, LAT, NUMBER, STREET,FOOOOBAR
2 | -118.0170157,55.546026835788886,23042,Twp Road 755 A,,,,
3 | -118.75318353,55.14959214890181,712046,Rge Road 34,,
4 | -118.8218384,55.15506788763259,712078,Rge Road 34,,
5 | -118.79719936,55.153343057595535,712068,Rge Road 34,,,,,
6 | -118.66743097,55.151807043809917,712060,Rge Road 34,,,,
7 | -118.74783569,55.155320792497442,712082,Rge Road 35,,,,
8 | 1,2,number,too many spaces,
9 | 1,2,trim , multiple spaces,,,,
10 |
--------------------------------------------------------------------------------
/test/parameters.js:
--------------------------------------------------------------------------------
1 | var tape = require( 'tape' );
2 | var path = require( 'path' );
3 | var fs = require('fs');
4 |
5 | var temp = require( 'temp' ).track(); // track(): temp dirs are cleaned up automatically on process exit
6 |
7 | var parameters = require( '../lib/parameters' ); // unit under test: CLI/config argument handling
8 |
9 | tape( 'interpretUserArgs() correctly handles arguments', function ( test ){
10 |   var testCase = [
11 |     [ 'test' ],
12 |     { dirPath: 'test', 'parallel-count': undefined, 'parallel-id': undefined },
13 |   ];
14 |
15 |   test.deepEqual(
16 |     parameters.interpretUserArgs( testCase[ 0 ] ), testCase[ 1 ],
17 |     'Basic arguments case passes.'
18 |   );
19 |
20 |   var badArguments = [
21 |     [ 'not an arg', 'some dir' ],
22 |     [ 'not an arg', 'some dir' ],
23 |     [ 'not a dir' ],
24 |     [ 'package.json' ],
25 |   ];
26 |   badArguments.forEach( function execTestCase( testCase, ind ){
27 |     var errorObj = parameters.interpretUserArgs( testCase );
28 |     test.ok(
29 |       'exitCode' in errorObj && 'errMessage' in errorObj,
30 |       'Invalid arguments yield an error object: ' + ind
31 |     );
32 |   });
33 |   test.end();
34 | });
35 |
36 | tape('interpretUserArgs returns given path as dirPath', function(test) {
37 |   temp.mkdir('tmpdir', function(err, temporary_dir) {
38 |
39 |     var input = [temporary_dir];
40 |     var result = parameters.interpretUserArgs(input);
41 |
42 |     test.equal(result.dirPath, temporary_dir, 'path should be equal to specified path');
43 |     test.end();
44 |   });
45 | });
46 |
47 | tape('intepretUserArgs normalizes path given as parameter', function(test) {
48 | temp.mkdir('tmpdir', function(err, temporary_dir) {
49 | var input_dir = temporary_dir + path.sep + path.sep;
50 |
51 | var input = [input_dir];
52 | var result = parameters.interpretUserArgs(input);
53 |
54 | var expected_dir = path.normalize(input_dir);
55 | test.equal(result.dirPath, expected_dir, 'path should be equal to specified path');
56 | test.end();
57 | });
58 | });
59 |
60 | tape('interpretUserArgs returns dir from pelias config if no dir specified on command line', function(test) {
61 |   temp.mkdir('tmpdir2', function(err, temporary_dir) {
62 |     var peliasConfig = {
63 |       imports: {
64 |         openaddresses: {
65 |           datapath: temporary_dir
66 |         }
67 |       }
68 |     };
69 |
70 |     var input = []; // no command-line dir given, so the config datapath should win
71 |     var result = parameters.interpretUserArgs(input, peliasConfig);
72 |
73 |     test.equal(result.dirPath, temporary_dir, 'path should be equal to path from config');
74 |     test.end();
75 |   });
76 | });
77 |
78 | tape('interpretUserArgs returns normalized path from config', function(test) {
79 |   temp.mkdir('tmpdir2', function(err, temporary_dir) {
80 |     var input_dir = path.sep + '.' + temporary_dir; // un-normalized form, e.g. '/.{tmp}'
81 |     var peliasConfig = {
82 |       imports: {
83 |         openaddresses: {
84 |           datapath: input_dir
85 |         }
86 |       }
87 |     };
88 |
89 |     var input = [];
90 |     var result = parameters.interpretUserArgs(input, peliasConfig);
91 |
92 |     var expected_dir = path.normalize(input_dir);
93 |     test.equal(result.dirPath, expected_dir, 'path should be equal to path from config');
94 |     test.end();
95 |   });
96 | });
97 |
98 | tape('getFileList returns all .csv path names when config has empty files list', function(test) {
99 |   temp.mkdir('multipleFiles', function(err, temp_dir) {
100 |     // add some files to the data path to be globbed
101 |     fs.mkdirSync(path.join(temp_dir, 'dirA'));
102 |     fs.writeFileSync(path.join(temp_dir, 'dirA', 'fileA.csv'), '');
103 |
104 |     fs.mkdirSync(path.join(temp_dir, 'dirB'));
105 |     fs.writeFileSync(path.join(temp_dir, 'dirB', 'fileB.csv'), '');
106 |
107 |     fs.writeFileSync(path.join(temp_dir, 'fileC.csv'), '');
108 |
109 |     // should not be included since it's not a .csv file
110 |     fs.writeFileSync(path.join(temp_dir, 'fileD.txt'), '');
111 |
112 |     var peliasConfig = {
113 |       imports: {
114 |         openaddresses: {
115 |           files: []
116 |         }
117 |       }
118 |     };
119 |     var args = {
120 |       dirPath: temp_dir
121 |     };
122 |
123 |     var actual = parameters.getFileList(peliasConfig, args);
124 |
125 |     test.equal(actual.length, 3);
126 |     test.ok(actual.find((f) => f === path.join(temp_dir, 'dirA', 'fileA.csv')));
127 |     test.ok(actual.find((f) => f === path.join(temp_dir, 'dirB', 'fileB.csv')));
128 |     test.ok(actual.find((f) => f === path.join(temp_dir, 'fileC.csv')));
129 |     test.end();
130 |
131 |   });
132 | });
133 |
134 | tape('getFileList returns all .csv path names when config doesn\'t have files property', function(test) {
135 |   temp.mkdir('multipleFiles', function(err, temp_dir) {
136 |     // add some files to the data path to be globbed
137 |     fs.mkdirSync(path.join(temp_dir, 'dirA'));
138 |     fs.writeFileSync(path.join(temp_dir, 'dirA', 'fileA.csv'), '');
139 |
140 |     fs.mkdirSync(path.join(temp_dir, 'dirB'));
141 |     fs.writeFileSync(path.join(temp_dir, 'dirB', 'fileB.csv'), '');
142 |
143 |     fs.writeFileSync(path.join(temp_dir, 'fileC.csv'), '');
144 |
145 |     // should not be included since it's not a .csv file
146 |     fs.writeFileSync(path.join(temp_dir, 'fileD.txt'), '');
147 |
148 |     var peliasConfig = {
149 |       imports: {
150 |         openaddresses: {
151 |         }
152 |       }
153 |     };
154 |     var args = {
155 |       dirPath: temp_dir
156 |     };
157 |
158 |     var actual = parameters.getFileList(peliasConfig, args);
159 |
160 |     test.equal(actual.length, 3);
161 |     test.ok(actual.find((f) => f === path.join(temp_dir, 'dirA', 'fileA.csv')));
162 |     test.ok(actual.find((f) => f === path.join(temp_dir, 'dirB', 'fileB.csv')));
163 |     test.ok(actual.find((f) => f === path.join(temp_dir, 'fileC.csv')));
164 |     test.end();
165 |
166 |   });
167 | });
168 |
169 | tape('getFileList returns fully qualified path names when config has a files list', function(test) {
170 |   temp.mkdir('multipleFiles', function(err, temporary_dir) {
171 |     var peliasConfig = {
172 |       imports: {
173 |         openaddresses: {
174 |           files: ['filea.csv', 'fileb.csv']
175 |         }
176 |       }
177 |     };
178 |     var args = {
179 |       dirPath: temporary_dir
180 |     };
181 |
182 |     var expected = [path.join(temporary_dir, 'filea.csv'), path.join(temporary_dir, 'fileb.csv')];
183 |
184 |     var actual = parameters.getFileList(peliasConfig, args);
185 |
186 |     test.deepEqual(actual, expected, 'file names should be equal');
187 |     test.end();
188 |   });
189 | });
190 |
191 | tape('getFileList handles parallel builds', function(test) { // expectations below imply worker id N of 3 gets only the Nth file; verify exact scheme in lib/parameters
192 |   var peliasConfig = {
193 |     imports: {
194 |       openaddresses: {
195 |         files: ['filea.csv', 'fileb.csv', 'filec.csv']
196 |       }
197 |     }
198 |   };
199 |
200 |   temp.mkdir('parallelBuilds', function(err, temporary_dir) {
201 |     test.test('3 workers, id 0', function(t) {
202 |       var args = {
203 |         dirPath: temporary_dir,
204 |         'parallel-count': 3,
205 |         'parallel-id': 0
206 |       };
207 |
208 |       var expected = [path.join(temporary_dir, 'filea.csv')];
209 |
210 |       var actual = parameters.getFileList(peliasConfig, args);
211 |
212 |       t.deepEqual(actual, expected, 'only first file is indexed');
213 |       t.end();
214 |     });
215 |
216 |     test.test('3 workers, id 1', function(t) {
217 |       var args = {
218 |         dirPath: temporary_dir,
219 |         'parallel-count': 3,
220 |         'parallel-id': 1
221 |       };
222 |
223 |       var expected = [path.join(temporary_dir, 'fileb.csv')];
224 |
225 |       var actual = parameters.getFileList(peliasConfig, args);
226 |
227 |       t.deepEqual(actual, expected, 'only second file indexed');
228 |       t.end();
229 |     });
230 |
231 |     test.test('3 workers, id 2', function(t) {
232 |       var args = {
233 |         dirPath: temporary_dir,
234 |         'parallel-count': 3,
235 |         'parallel-id': 2
236 |       };
237 |
238 |       var expected = [path.join(temporary_dir, 'filec.csv')];
239 |
240 |       var actual = parameters.getFileList(peliasConfig, args);
241 |
242 |       t.deepEqual(actual, expected, 'only third file indexed');
243 |       t.end();
244 |     });
245 |
246 |     test.test('3 workers, id 3', function(t) { // id out of range (>= count): worker has nothing to do
247 |       var args = {
248 |         dirPath: temporary_dir,
249 |         'parallel-count': 3,
250 |         'parallel-id': 3
251 |       };
252 |
253 |       var expected = [];
254 |
255 |       var actual = parameters.getFileList(peliasConfig, args);
256 |
257 |       t.deepEqual(actual, expected, 'file list is empty');
258 |       t.end();
259 |     });
260 |   });
261 | });
262 |
--------------------------------------------------------------------------------
/test/schema.js:
--------------------------------------------------------------------------------
1 | const tape = require( 'tape' );
2 | const schema = require( '../schema' ); // unit under test: Joi-style config schema
3 |
4 | function validate(config) { // throws the first validation error message, if any, so tests can use test.throws
5 |   const result = schema.validate(config);
6 |   if (result.error) {
7 |     throw new Error(result.error.details[0].message);
8 |   }
9 | }
10 |
11 | tape('missing imports should throw error', function(test) {
12 |   const config = {};
13 |
14 |   test.throws(validate.bind(null, config), /"imports" is required/);
15 |   test.end();
16 |
17 | });
18 |
19 | tape('non-object imports should throw error', function(test) {
20 |   [null, 17, 'string', [], true].forEach((value) => {
21 |     const config = {
22 |       imports: value
23 |     };
24 |
25 |     test.throws(validate.bind(null, config), /"imports" must be of type object/);
26 |   });
27 |
28 |   test.end();
29 |
30 | });
31 |
32 | tape('missing imports.openaddresses should throw error', function(test) {
33 |   const config = {
34 |     imports: {
35 |     }
36 |   };
37 |
38 |   test.throws(validate.bind(null, config), /"imports.openaddresses" is required/);
39 |   test.end();
40 |
41 | });
42 |
43 | tape('non-object imports.openaddresses should throw error', function(test) {
44 |   [null, 17, 'string', [], true].forEach((value) => {
45 |     const config = {
46 |       imports: {
47 |         openaddresses: value
48 |       }
49 |     };
50 |
51 |     test.throws(validate.bind(null, config), /"imports.openaddresses" must be of type object/);
52 |   });
53 |
54 |   test.end();
55 |
56 | });
57 |
58 | tape( 'missing datapath should throw error', function(test) {
59 |   const config = {
60 |     imports: {
61 |       openaddresses: {}
62 |     }
63 |   };
64 |
65 |   test.throws(validate.bind(null, config), /"imports.openaddresses.datapath" is required/);
66 |   test.end();
67 |
68 | });
69 |
70 | tape( 'non-string datapath should throw error', function(test) {
71 |   [null, 17, {}, [], false].forEach((value) => {
72 |     const config = {
73 |       imports: {
74 |         openaddresses: {
75 |           datapath: value
76 |         }
77 |       }
78 |     };
79 |
80 |     test.throws(validate.bind(null, config), /"imports.openaddresses.datapath" must be a string/);
81 |
82 |   });
83 |
84 |   test.end();
85 | });
86 |
87 | tape( 'non-array files should throw error', function(test) {
88 |   [null, 17, {}, 'string', false].forEach((value) => {
89 |     const config = {
90 |       imports: {
91 |         openaddresses: {
92 |           datapath: 'this is the datapath',
93 |           files: value
94 |         }
95 |       }
96 |     };
97 |
98 |     test.throws(validate.bind(null, config), /"imports.openaddresses.files" must be an array/);
99 |   });
100 |
101 |   test.end();
102 | });
103 |
104 | tape( 'non-string elements in files array should throw error', function(test) {
105 |   [null, 17, {}, [], false].forEach((value) => {
106 |     const config = {
107 |       imports: {
108 |         openaddresses: {
109 |           datapath: 'this is the datapath',
110 |           files: [value]
111 |         }
112 |       }
113 |     };
114 |
115 |     test.throws(validate.bind(null, config),
116 |       /"imports.openaddresses.files\[0\]" must be a string/, 'files elements must be strings');
117 |   });
118 |
119 |   test.end();
120 | });
121 |
122 | tape( 'non-boolean adminLookup should throw error', function(test) {
123 |   [null, 17, {}, [], 'string'].forEach((value) => {
124 |     const config = {
125 |       imports: {
126 |         openaddresses: {
127 |           datapath: 'this is the datapath',
128 |           adminLookup: value
129 |         }
130 |       }
131 |     };
132 |
133 |     test.throws(validate.bind(null, config),
134 |       /"imports.openaddresses.adminLookup" must be a boolean/);
135 |   });
136 |
137 |   test.end();
138 | });
139 |
140 | tape( 'unknown config fields should throw error', function(test) {
141 |   const config = {
142 |     imports: {
143 |       openaddresses: {
144 |         datapath: 'this is the datapath',
145 |         token: 'abc',
146 |         unknown: 'value'
147 |       }
148 |     }
149 |   };
150 |
151 |   test.throws(validate.bind(null, config),
152 |     /"imports.openaddresses.unknown" is not allowed/, 'unknown fields should be disallowed');
153 |   test.end();
154 |
155 | });
156 |
157 | tape( 'configuration with only datapath & token should not throw error', function(test) {
158 |   const config = {
159 |     imports: {
160 |       openaddresses: {
161 |         datapath: 'this is the datapath',
162 |         token: 'abc'
163 |       }
164 |     }
165 |   };
166 |
167 |   test.doesNotThrow(validate.bind(null, config), 'config should be valid');
168 |   test.end();
169 |
170 | });
171 |
172 | tape( 'valid configuration should not throw error', function(test) {
173 |   const config = {
174 |     imports: {
175 |       openaddresses: {
176 |         datapath: 'this is the datapath',
177 |         token: 'abc',
178 |         adminLookup: false,
179 |         files: ['file 1', 'file 2']
180 |       }
181 |     }
182 |   };
183 |
184 |   test.doesNotThrow(validate.bind(null, config), 'config should be valid');
185 |   test.end();
186 |
187 | });
188 |
189 | tape( 'unknown children of imports should not throw error', function(test) { // strictness applies only inside openaddresses
190 |   const config = {
191 |     imports: {
192 |       openaddresses: {
193 |         datapath: 'this is the datapath',
194 |         token: 'abc',
195 |         adminLookup: false,
196 |         files: ['file 1', 'file 2']
197 |       },
198 |       other: {}
199 |     }
200 |   };
201 |
202 |   test.doesNotThrow(validate.bind(null, config), 'config should be valid');
203 |   test.end();
204 |
205 | });
206 |
207 | tape( 'unknown children of root should not throw error', function(test) {
208 |   const config = {
209 |     imports: {
210 |       openaddresses: {
211 |         datapath: 'this is the datapath',
212 |         token: 'abc',
213 |         adminLookup: false,
214 |         files: ['file 1', 'file 2']
215 |       }
216 |     },
217 |     other: {}
218 |   };
219 |
220 |   test.doesNotThrow(validate.bind(null, config), 'config should be valid');
221 |   test.end();
222 |
223 | });
224 |
--------------------------------------------------------------------------------
/test/streams/cleanupStream.js:
--------------------------------------------------------------------------------
1 | var tape = require( 'tape' );
2 |
3 | var CleanupStream = require( '../../lib/streams/cleanupStream' ); // unit under test
4 |
5 | const stream_mock = require('stream-mock');
6 |
7 | function test_stream(input, testedStream, callback) { // pipe input objects through testedStream, collect output into callback(err, records)
8 |   const reader = new stream_mock.ObjectReadableMock(input);
9 |   const writer = new stream_mock.ObjectWritableMock();
10 |   writer.on('error', (e) => callback(e));
11 |   writer.on('finish', () => callback(null, writer.data));
12 |   reader.pipe(testedStream).pipe(writer);
13 | }
14 |
15 | tape( 'cleanupStream trims whitespace from all fields', function(test) {
16 |   var input = {
17 |     NUMBER: '5 ',
18 |     STREET: ' Abcd ',
19 |     LAT: 5,
20 |     LON: 6,
21 |     POSTCODE: ' def '
22 |   };
23 |
24 |   var cleanupStream = CleanupStream.create({ countryCode: 'us' });
25 |
26 |   test_stream([input], cleanupStream, function(err, records) {
27 |     test.equal(records.length, 1, 'stream length unchanged');
28 |
29 |     var record = records[0];
30 |     test.equal(record.NUMBER, '5', 'NUMBER field is trimmed');
31 |     test.equal(record.STREET, 'Abcd', 'STREET field is trimmed');
32 |     test.equal(record.POSTCODE, 'def', 'POSTCODE field is trimmed');
33 |     test.end();
34 |   });
35 | });
36 |
37 | tape( 'cleanupStream does NOT trim leading 0\'s from house numbers', function(test) {
38 |   var inputs = [
39 |     {
40 |       NUMBER: ' 0030 ',
41 |       STREET: 'Street'
42 |     },
43 |     {
44 |       NUMBER: '0034560',
45 |       STREET: 'Street'
46 |     },
47 |     {
48 |       NUMBER: '12340',
49 |       STREET: 'Street'
50 |     }
51 |   ];
52 |
53 |   var expecteds = [
54 |     {
55 |       NUMBER: '0030',
56 |       STREET: 'Street'
57 |     },
58 |     {
59 |       NUMBER: '0034560',
60 |       STREET: 'Street'
61 |     },
62 |     {
63 |       NUMBER: '12340',
64 |       STREET: 'Street'
65 |     }
66 |   ];
67 |
68 |   var cleanupStream = CleanupStream.create({ countryCode: 'us' });
69 |
70 |   test_stream(inputs, cleanupStream, function(err, actual) {
71 |     test.deepEqual(actual, expecteds, 'leading 0\'s should not have been trimmed from NUMBER');
72 |     test.end();
73 |   });
74 |
75 | });
76 |
77 | tape ( 'cleanupStream trims white space in street field', function(test){
78 |   var input = {
79 |     STREET: '34 West\t 93rd \nSt' // internal tabs/newlines collapse; 'St' is also expanded to 'Street'
80 |   };
81 |
82 |   var cleanupStream = CleanupStream.create({ countryCode: 'us' });
83 |
84 |   test_stream([input],cleanupStream, function(err,records){
85 |     test.equal(records.length, 1, 'stream length unchanged');
86 |     test.equal(records[0].STREET, '34 West 93rd Street');
87 |     test.end();
88 |   });
89 | });
90 |
91 | tape( 'cleanupStream converts all-caps street names to Title Case', function(test){
92 |   var inputs = [{
93 |     NUMBER: '88',
94 |     STREET: 'GLASGOW STREET'
95 |   },
96 |   {
97 |     NUMBER: '76',
98 |     STREET : 'McCallister Street' //already capitalized street should be unchanged
99 |   },
100 |   {
101 |     NUMBER: '9923736',
102 |     STREET: 'Macalester Street'//should also be unchanged
103 |   },
104 |   {
105 |     NUMBER: '314',
106 |     STREET: 'timid street' //should capitalize first letter of each word
107 |   },
108 |   {
109 |     NUMBER: '4',
110 |     STREET: 'é'
111 |   },
112 |   {
113 |     NUMBER: '9',
114 |     STREET: '丁目'
115 |   }];
116 |   var expecteds = [{
117 |     NUMBER: '88',
118 |     STREET: 'Glasgow Street'
119 |   },
120 |   {
121 |     NUMBER: '76',
122 |     STREET : 'McCallister Street' //already capitalized street should be unchanged
123 |   },
124 |   {
125 |     NUMBER: '9923736',
126 |     STREET: 'Macalester Street'//should also be unchanged
127 |   },
128 |   {
129 |     NUMBER: '314',
130 |     STREET: 'Timid Street' //should capitalize first letter of each word
131 |   },
132 |   {
133 |     NUMBER: '4',
134 |     STREET: 'É' //should only capitalize when more than one char
135 |   },
136 |   {
137 |     NUMBER: '9',
138 |     STREET: '丁目' //should handle non-latin characters
139 |   }];
140 |
141 |   var cleanupStream = CleanupStream.create({ countryCode: 'us' });
142 |
143 |   test_stream(inputs,cleanupStream,function(err,actual){
144 |     test.deepEqual(actual, expecteds,'we expect proper capitalization');
145 |     test.end();
146 |   });
147 | });
148 |
149 | tape( 'cleanupStream expands directionals.', function(test){ // inputs already expanded; verifies they pass through unchanged
150 |   var inputs = [{
151 |     NUMBER: '88',
152 |     STREET: 'North East Glasgow Street'
153 |   },
154 |   {
155 |     NUMBER: '76',
156 |     STREET: 'South West McCallister Street'
157 |   },
158 |   {
159 |     NUMBER: '9923736',
160 |     STREET: 'Serenity Street'//should be unchanged even though the start matches a directional
161 |   }];
162 |   var expecteds = [{
163 |     NUMBER: '88',
164 |     STREET: 'North East Glasgow Street'
165 |   },
166 |   {
167 |     NUMBER: '76',
168 |     STREET : 'South West McCallister Street'
169 |   },
170 |   {
171 |     NUMBER: '9923736',
172 |     STREET: 'Serenity Street'//should also be unchanged
173 |   }];
174 |
175 |   var cleanupStream = CleanupStream.create({ countryCode: 'us' });
176 |
177 |   test_stream(inputs,cleanupStream,function(err,actual){
178 |     test.deepEqual(actual, expecteds,'we expect proper capitalization of street directionals');
179 |     test.end();
180 |   });
181 | });
182 |
--------------------------------------------------------------------------------
/test/streams/contentHashStream.js:
--------------------------------------------------------------------------------
1 | const tape = require('tape');
2 | const stream_mock = require('stream-mock');
3 | const ContentHashStream = require('../../lib/streams/contentHashStream');
4 | const hash = ContentHashStream.hash;
5 | const DEFAULT_HASH = 'ca9c491ac66b2c62';
6 |
7 | function test_stream(input, testedStream, callback) {
8 | const reader = new stream_mock.ObjectReadableMock(input);
9 | const writer = new stream_mock.ObjectWritableMock();
10 | writer.on('error', (e) => callback(e));
11 | writer.on('finish', () => callback(null, writer.data));
12 | reader.pipe(testedStream).pipe(writer);
13 | }
14 |
15 | tape('contentHashStream generates new hash', function (test) {
16 | var input = {
17 | NUMBER: '5',
18 | STREET: 'Abcd',
19 | LAT: 5,
20 | LON: 6,
21 | POSTCODE: 'def'
22 | };
23 |
24 | var contentHashStream = ContentHashStream.create();
25 |
26 | test_stream([input], contentHashStream, function (err, records) {
27 | test.equal(records.length, 1, 'stream length unchanged');
28 |
29 | var record = records[0];
30 | test.equal(record.HASH, 'f44048507e8fb319', 'HASH field generated');
31 | test.end();
32 | });
33 | });
34 |
tape('contentHashStream replaces existing hash', function (test) {
  // input carries a stale HASH plus untrimmed whitespace; the stream should
  // recompute the hash from the normalised fields, discarding the old value
  var input = {
    NUMBER: '5 ',
    STREET: ' Abcd ',
    LAT: 5,
    LON: 6,
    POSTCODE: ' def ',
    HASH: '54830a0a5bbbca8f'
  };

  var contentHashStream = ContentHashStream.create();

  test_stream([input], contentHashStream, function (err, records) {
    test.equal(records.length, 1, 'stream length unchanged');

    var record = records[0];
    // fix(message): this test replaces an existing hash, it does not generate
    // a fresh one — the previous message was copy-pasted from the test above
    test.equal(record.HASH, 'f44048507e8fb319', 'HASH field replaced');
    test.end();
  });
});
55 |
tape('hash: default value for non-object and empty objects', function (test) {
  // every non-object (or contentless) input folds to the same sentinel hash
  const inputs = [null, 1, false, 'string', [], {}];
  inputs.forEach(function (input) {
    test.equal(hash(input), DEFAULT_HASH, 'default hash');
  });
  test.end();
});
65 |
// typo fix in test description: 'hexidecimal' -> 'hexadecimal'
tape('hash: 16 lowercase hexadecimal chars', function (test) {
  // every hash must be exactly 16 lowercase hex characters
  const conform = /^[0-9a-f]{16}$/;
  for( let i=-90.0; i<+90.0; i+=0.5 ){
    let h = hash({ LON: i, LAT: i });
    test.true(conform.test(h), h);
  }
  test.end();
});
74 |
tape('hash: strict equality', function (test) {
  // two independently-built but identical records must hash identically
  const fixtures = [
    { LON: '1.1', LAT: '2.2' },
    { LON: '1.1', LAT: '2.2', STREET: 'A ST' },
    { LON: '1.1', LAT: '2.2', STREET: 'A ST', NUMBER: '10' },
    { LON: '1.1', LAT: '2.2', STREET: 'A ST', NUMBER: '10', UNIT: '6B' }
  ];
  fixtures.forEach(function (fixture) {
    test.equal(hash(fixture), hash(Object.assign({}, fixture)));
  });
  test.end();
});
94 |
// typo fix in test description: 'ingore' -> 'ignore'
tape('hash: ignore existing hash field', function (test) {
  // a pre-existing HASH property must not influence the computed hash
  test.equal(
    hash({ LON: '1.1', LAT: '2.2', HASH: 'c2f8c35aa279ee7d' }),
    hash({ LON: '1.1', LAT: '2.2', HASH: 'deadb33fdeadb33f' })
  );
  test.end();
});
102 |
tape('hash: fuzzy equality', function (test) {
  // pairs of records that differ only cosmetically must hash the same
  const cases = [
    [{ STREET: 'A ST' }, { STREET: 'a st' }, 'value case'],
    [{ STREET: 'A ST' }, { STREET: ' A ST ' }, 'value whitespace'],
    [{ STREET: 1 }, { STREET: '1' }, 'value type'],
    [{ LON: 1.123456789 }, { LON: 1.1234567 }, 'float precision'],
    [{ LON: 1.12000000000 }, { LON: 1.12 }, 'float precision'],
    [{ LON: -1.000000000000 }, { LON: -1 }, 'float precision'],
    [{ LON: 0 }, { LON: -0 }, 'negative zero']
  ];
  cases.forEach(function (c) {
    test.equal(hash(c[0]), hash(c[1]), c[2]);
  });
  test.end();
});
141 |
tape('hash: strict inequality', function (test) {
  // pairs of records with a meaningful difference must hash differently
  const cases = [
    [{ LON: '1.1', LAT: '2.2', STREET: 'A ST', NUMBER: '10', UNIT: '6B' },
     { LON: '1.1', LAT: '2.2', STREET: 'A ST', NUMBER: '10', UNIT: '6' }],
    [{ LON: '1.1', LAT: '2.2', STREET: 'A ST', NUMBER: '10' },
     { LON: '1.1', LAT: '2.2', STREET: 'A ST', NUMBER: '11' }],
    [{ LON: '1.1', LAT: '2.2', STREET: 'A ST' },
     { LON: '1.1', LAT: '2.2', STREET: 'A RD' }],
    [{ LON: '1.1', LAT: '2.2' }, { LON: '1.1', LAT: '2.1' }],
    [{ LON: '1.1' }, { LON: '-1.1' }],
    [{ NUMBER: '10' }, { UNIT: '10' }]
  ];
  cases.forEach(function (pair) {
    test.notEqual(hash(pair[0]), hash(pair[1]));
  });
  test.end();
});
169 |
--------------------------------------------------------------------------------
/test/streams/documentStream.js:
--------------------------------------------------------------------------------
1 | const tape = require( 'tape' );
2 |
3 | const stream_mock = require('stream-mock');
4 |
5 | const DocumentStream = require( '../../lib/streams/documentStream' );
6 |
// Helper: pipe `input` (an array of objects) through `testedStream` and
// collect its output. Invokes `callback(err)` on stream error, otherwise
// `callback(null, writer.data)` once the writable mock finishes.
function test_stream(input, testedStream, callback) {
  const reader = new stream_mock.ObjectReadableMock(input);
  const writer = new stream_mock.ObjectWritableMock();
  writer.on('error', (e) => callback(e));
  writer.on('finish', () => callback(null, writer.data));
  reader.pipe(testedStream).pipe(writer);
}
14 |
tape( 'documentStream catches records with no street', function(test) {
  const stats = { badRecordCount: 0 };
  const documentStream = DocumentStream.create('prefix', stats);
  // record deliberately lacks a STREET property
  const record = { NUMBER: 5 };

  test_stream([record], documentStream, function(err, actual) {
    test.equal(actual.length, 0, 'no documents should be pushed' );
    test.equal(stats.badRecordCount, 1, 'bad record count updated');
    test.end();
  });
});
28 |
tape( 'documentStream does not set zipcode if zipcode is emptystring', function(test) {
  const stats = { badRecordCount: 0 };
  const record = {
    NUMBER: '5',
    STREET: '101st Avenue',
    LAT: 5,
    LON: 6,
    POSTCODE: ''
  };

  test_stream([record], DocumentStream.create('prefix', stats), function(err, actual) {
    test.equal(actual.length, 1, 'the document should be pushed' );
    test.equal(stats.badRecordCount, 0, 'bad record count unchanged');
    test.equal(actual[0].getAddress('zip'), undefined);
    test.end();
  });
});
47 |
tape( 'documentStream creates id with filename-based prefix', function(test) {
  const stats = { badRecordCount: 0 };
  // no HASH field, so the id falls back to the record's stream index
  const record = {
    NUMBER: '5',
    STREET: '101st Avenue',
    LAT: 5,
    LON: 6,
    POSTCODE: ''
  };

  test_stream([record], DocumentStream.create('prefix', stats), function(err, actual) {
    test.equal(actual.length, 1, 'the document should be pushed' );
    test.equal(stats.badRecordCount, 0, 'bad record count unchanged');
    test.equal(actual[0].getId(), 'prefix:0');
    test.end();
  });
});
67 |
tape('documentStream uses HASH value if present', function(test) {
  const stats = { badRecordCount: 0 };
  const record = {
    NUMBER: '5',
    STREET: '101st Avenue',
    LAT: 5,
    LON: 6,
    HASH: 'abcd'
  };

  test_stream([record], DocumentStream.create('prefix', stats), function(err, actual) {
    test.equal(actual.length, 1, 'the document should be pushed' );
    test.equal(stats.badRecordCount, 0, 'bad record count unchanged');
    test.equal(actual[0].getId(), 'prefix:abcd');
    test.end();
  });
});
87 |
tape('documentStream valid country_code lowercase', function (test) {
  const input = {
    NUMBER: '5',
    STREET: '101st Avenue',
    LAT: 5,
    LON: 6,
    HASH: 'abcd'
  };
  const stats = { badRecordCount: 0 };
  // prefix of the form '<country>/<source>' — country portion lowercase here
  const documentStream = DocumentStream.create('au/example', stats);

  test_stream([input], documentStream, function (err, actual) {
    test.equal(actual.length, 1, 'the document should be pushed');
    test.equal(stats.badRecordCount, 0, 'bad record count unchanged');
    // fix: use `equal` (strict ===) rather than `deepEqual` — country_code is
    // a scalar string, deep comparison adds nothing and can mask type drift
    test.equal(actual[0].getMeta('country_code'), 'AU', 'country_code set');
    test.end();
  });
});
106 |
tape('documentStream valid country_code uppercase', function (test) {
  const stats = { badRecordCount: 0 };
  const record = {
    NUMBER: '5',
    STREET: '101st Avenue',
    LAT: 5,
    LON: 6,
    HASH: 'abcd'
  };

  test_stream([record], DocumentStream.create('AU/example', stats), function (err, actual) {
    test.equal(actual.length, 1, 'the document should be pushed');
    test.equal(stats.badRecordCount, 0, 'bad record count unchanged');
    test.deepEqual(actual[0].getMeta('country_code'), 'AU', 'country_code set');
    test.end();
  });
});
125 |
tape('documentStream invalid country_code', function (test) {
  const stats = { badRecordCount: 0 };
  const record = {
    NUMBER: '5',
    STREET: '101st Avenue',
    LAT: 5,
    LON: 6,
    HASH: 'abcd'
  };

  // note: 'foo' does not match the expected country-code pattern
  test_stream([record], DocumentStream.create('foo/example', stats), function (err, actual) {
    test.equal(actual.length, 1, 'the document should be pushed');
    test.equal(stats.badRecordCount, 0, 'bad record count unchanged');
    test.deepEqual(actual[0].getMeta('country_code'), undefined, 'country_code not set');
    test.end();
  });
});
144 |
tape('documentStream store reference to OA object in meta', function (test) {
  const stats = { badRecordCount: 0 };
  const record = {
    NUMBER: '5',
    STREET: '101st Avenue',
    LAT: 5,
    LON: 6,
    HASH: 'abcd'
  };

  test_stream([record], DocumentStream.create('example', stats), function (err, actual) {
    test.equal(actual.length, 1, 'the document should be pushed');
    test.equal(stats.badRecordCount, 0, 'bad record count unchanged');
    test.deepEqual(actual[0].getMeta('oa'), record, 'OA reference stored in meta');
    test.end();
  });
});
163 |
--------------------------------------------------------------------------------
/test/streams/gnafMapperStream.js:
--------------------------------------------------------------------------------
1 | var tape = require('tape');
2 | const through = require('through2');
3 | const mapper = require('../../lib/streams/gnafMapperStream');
4 | const Document = require('pelias-model').Document;
5 |
module.exports.tests = {};

// test exports: the module must be a factory returning a duplex stream
module.exports.tests.interface = function (test) {
  test('interface: factory', function (t) {
    t.equal(typeof mapper, 'function', 'stream factory');
    t.end();
  });
  test('interface: stream', function (t) {
    const stream = mapper();
    t.equal(typeof stream, 'object', 'valid stream');
    t.equal(typeof stream._read, 'function', 'valid readable');
    t.equal(typeof stream._write, 'function', 'valid writeable');
    t.end();
  });
};
22 |
23 | // ===================== GNAF ID mapping ======================
24 |
module.exports.tests.au_gnaf_id = function (test) {
  test('maps - GNAF ID', t => {
    // fixture: AU record whose OA ID is a GNAF persistent identifier
    const doc = new Document('oa', 'a', 1);
    doc.setMeta('country_code', 'AU');
    doc.setMeta('oa', {
      ID: 'GAVIC411412475',
      NUMBER: '360',
      STREET: 'BRUNSWICK STREET',
      LAT: -37.79647546,
      LON: 144.978997
    });

    const stream = mapper();
    stream.pipe(through.obj((mapped, enc, next) => {
      t.deepEqual(mapped.getAddendum('concordances'), { 'gnaf:pid': 'GAVIC411412475' }, 'correctly mapped');
      t.end();
      next();
    }));
    stream.write(doc);
  });
};
45 |
module.exports.tests.au_invalid_gnaf_id = function (test) {
  test('maps - GNAF ID', t => {
    const doc = new Document('oa', 'a', 1);
    doc.setMeta('country_code', 'AU');
    doc.setMeta('oa', {
      ID: 'invalid', // note: invalid GNAF ID
      NUMBER: '360',
      STREET: 'BRUNSWICK STREET',
      LAT: -37.79647546,
      LON: 144.978997
    });

    const stream = mapper();
    stream.pipe(through.obj((mapped, enc, next) => {
      t.deepEqual(mapped.getAddendum('concordances'), undefined);
      t.end();
      next();
    }));
    stream.write(doc);
  });
};
66 |
module.exports.tests.au_missing_id_field = function (test) {
  test('maps - GNAF ID', t => {
    const doc = new Document('oa', 'a', 1);
    doc.setMeta('country_code', 'AU');
    doc.setMeta('oa', {
      ID: undefined, // note: missing ID field
      NUMBER: '360',
      STREET: 'BRUNSWICK STREET',
      LAT: -37.79647546,
      LON: 144.978997
    });

    const stream = mapper();
    stream.pipe(through.obj((mapped, enc, next) => {
      t.deepEqual(mapped.getAddendum('concordances'), undefined);
      t.end();
      next();
    }));
    stream.write(doc);
  });
};
87 |
module.exports.tests.non_au_gnaf_id = function (test) {
  test('maps - GNAF ID', t => {
    const doc = new Document('oa', 'a', 1);
    doc.setMeta('country_code', 'NZ'); // note: country code not AU
    doc.setMeta('oa', {
      ID: 'GAVIC411412475',
      NUMBER: '360',
      STREET: 'BRUNSWICK STREET',
      LAT: -37.79647546,
      LON: 144.978997
    });

    const stream = mapper();
    stream.pipe(through.obj((mapped, enc, next) => {
      t.deepEqual(mapped.getAddendum('concordances'), undefined);
      t.end();
      next();
    }));
    stream.write(doc);
  });
};
108 |
109 |
// Register each exported test case with tape, namespaced by the stream
// under test.
// fix: the prefix previously read 'unit_splitting_mapper' — a copy/paste
// error from the sibling suite; this file tests the GNAF mapper stream.
function test(name, testFunction) {
  return tape('gnaf_mapper: ' + name, testFunction);
}

for (var testCase in module.exports.tests) {
  module.exports.tests[testCase](test);
}
117 |
--------------------------------------------------------------------------------
/test/streams/isUSorCAHouseNumberZero.js:
--------------------------------------------------------------------------------
1 | var tape = require('tape');
2 | var isUSorCAHouseNumberZero = require('../../lib/streams/isUSorCAHouseNumberZero');
3 |
4 | const stream_mock = require('stream-mock');
5 |
// Helper: pipe `input` (an array of objects) through `testedStream` and
// collect its output. Invokes `callback(err)` on stream error, otherwise
// `callback(null, writer.data)` once the writable mock finishes.
function test_stream(input, testedStream, callback) {
  const reader = new stream_mock.ObjectReadableMock(input);
  const writer = new stream_mock.ObjectWritableMock();
  writer.on('error', (e) => callback(e));
  writer.on('finish', () => callback(null, writer.data));
  reader.pipe(testedStream).pipe(writer);
}
13 |
tape('isUSorCAHouseNumberZero', function(t) {

  // Build an array of minimal records, one per house number, all sharing the
  // same ISO3 country code. Removes the heavy fixture duplication that
  // previously repeated the same object literal eighteen times.
  function makeRecords(country, numbers) {
    return numbers.map(function(number) {
      return {
        parent: {
          country_a: [country]
        },
        address_parts: {
          number: number
        }
      };
    });
  }

  // Run `records` through a fresh filter stream and assert that exactly
  // `expected` records survive, then end the subtest.
  function assertFiltered(t, records, expected, message) {
    var filter = isUSorCAHouseNumberZero.create();
    test_stream(records, filter, function(err, actual) {
      t.deepEqual(actual, expected, message);
      t.end();
    });
  }

  t.test('non-0 house number in USA should return true', function(t) {
    var records = makeRecords('USA', ['1007', '0017', '1700']);
    assertFiltered(t, records, records, 'none should have been filtered out');
  });

  t.test('non-0 house number in CAN should return true', function(t) {
    var records = makeRecords('CAN', ['1007', '0017', '1700']);
    assertFiltered(t, records, records, 'none should have been filtered out');
  });

  t.test('non-0 house number in non-USA/CAN should return true', function(t) {
    var records = makeRecords('GBR', ['1007', '0017', '1700']);
    assertFiltered(t, records, records, 'none should have been filtered out');
  });

  t.test('house number reduceable to 0 in USA should return false', function(t) {
    var records = makeRecords('USA', ['0', '00000']);
    assertFiltered(t, records, [], 'all should have been filtered out');
  });

  t.test('house number reduceable to 0 in CAN should return false', function(t) {
    var records = makeRecords('CAN', ['0', '00000']);
    assertFiltered(t, records, [], 'all should have been filtered out');
  });

  t.test('house number reduceable to 0 in non-USA/CAN should return true', function(t) {
    var records = makeRecords('GBR', ['0', '00000']);
    assertFiltered(t, records, records, 'none should have been filtered out');
  });

});
214 |
--------------------------------------------------------------------------------
/test/streams/recordStream.js:
--------------------------------------------------------------------------------
1 | var tape = require( 'tape' );
2 | var through = require( 'through2' );
3 |
4 | var peliasModel = require( 'pelias-model' );
5 |
6 | var recordStream = require( '../../lib/streams/recordStream' );
7 |
8 | /**
9 | * Tests whether records read from `test/openaddresses_sample.csv` are created
10 | * into Document objects with expected values.
11 | */
tape(
  'importPipelines.createRecordStream() creates Document objects with expected values.',
  function ( test ){
    // shorthand for building an expected-record fixture
    function createTestRec( lon, lat, name ){
      return { lon: lon, lat: lat, name: name };
    }

    var expectedRecords = [
      createTestRec( -118.0170157, 55.546026835788886, '23042 Twp Road 755 A' ),
      createTestRec( -118.75318353, 55.14959214890181, '712046 Rge Road 34' ),
      createTestRec( -118.8218384, 55.15506788763259, '712078 Rge Road 34' ),
      createTestRec( -118.79719936, 55.153343057595535, '712068 Rge Road 34' ),
      createTestRec( -118.66743097, 55.151807043809917, '712060 Rge Road 34' ),
      createTestRec( -118.74783569, 55.155320792497442, '712082 Rge Road 35' ),
      createTestRec( 1, 2, 'number Too Many Spaces' ),
      createTestRec( 1, 2, 'trim Multiple Spaces' )
    ];
    // four assertions per record plus the initial `readable` check
    test.plan( expectedRecords.length * 4 + 1);

    var dataStream = recordStream.create(['test/openaddresses_sample.csv']);
    test.ok( dataStream.readable, 'Stream is readable.' );
    var testStream = through.obj(function ( data, enc, next ){
      test.ok(
        data instanceof peliasModel.Document, 'Data is a Document object.'
      );

      var expected = expectedRecords.splice( 0, 1 )[ 0 ];
      var centroid = data.getCentroid();
      // bugfix: compare absolute deltas — the raw difference `a - b` can be a
      // large negative number and still satisfy `< 1e-6`, passing wrongly
      test.ok( Math.abs( expected.lon - centroid.lon ) < 1e-6, 'Longitude matches.' );
      test.ok( Math.abs( expected.lat - centroid.lat ) < 1e-6, 'Latitude matches.' );
      test.equal( data.getName( 'default' ), expected.name , 'Name matches.' );
      next();
    });
    dataStream.pipe( testStream );
  }
);
48 |
tape( 'Don\'t create records for invalid data.', function ( test ){
  // any emitted document means an invalid row slipped through the filter
  const sink = through.obj(
    ( data, _, next ) => {
      test.fail( 'Document was created from bad data: ' + JSON.stringify( data, undefined, 4 ) );
      next();
    },
    ( done ) => {
      test.pass( 'No Documents were created from bad data.' );
      test.end();
      done();
    }
  );
  recordStream.create(['test/openaddresses_bad_data.csv']).pipe( sink );
});
64 |
tape( 'getIdPrefix returns prefix based on OA directory structure - csv', function( test ) {
  const actual = recordStream.getIdPrefix('/base/path/us/ca/san_francisco.csv', '/base/path');
  test.equal(actual, 'us/ca/san_francisco', 'correct prefix generated');
  test.end();
});
75 |
// typo fix in test description: 'heirarchy' -> 'hierarchy'
tape( 'getIdPrefix handles multiple levels of hierarchy - csv', function ( test ) {
  var filename = '/base/path/cz/countrywide.csv';
  var basePath = '/base/path';

  var actual = recordStream.getIdPrefix(filename, basePath);

  var expected = 'cz/countrywide';
  test.equal(actual, expected, 'correct prefix generated');
  test.end();
});
86 |
tape( 'getIdPrefix returns basename without extension when invalid basepath given - csv', function( test ) {
  // basePath does not contain the file, so only the basename survives
  const actual = recordStream.getIdPrefix('/path/to/a/document.csv', '/somewhere/else');
  test.equal(actual, 'document');
  test.end();
});
97 |
tape( 'getIdPrefix returns prefix based on OA directory structure - geojson', function( test ) {
  const actual = recordStream.getIdPrefix('/base/path/us/ca/san_francisco.geojson', '/base/path');
  test.equal(actual, 'us/ca/san_francisco', 'correct prefix generated');
  test.end();
});
108 |
// typo fix in test description: 'heirarchy' -> 'hierarchy'
tape( 'getIdPrefix handles multiple levels of hierarchy - geojson', function ( test ) {
  var filename = '/base/path/cz/countrywide.geojson';
  var basePath = '/base/path';

  var actual = recordStream.getIdPrefix(filename, basePath);

  var expected = 'cz/countrywide';
  test.equal(actual, expected, 'correct prefix generated');
  test.end();
});
119 |
tape( 'getIdPrefix returns basename without extension when invalid basepath given - geojson', function( test ) {
  // basePath does not contain the file, so only the basename survives
  const actual = recordStream.getIdPrefix('/path/to/a/document.geojson', '/somewhere/else');
  test.equal(actual, 'document');
  test.end();
});
130 |
--------------------------------------------------------------------------------
/test/streams/unitSplittingMapperStream.js:
--------------------------------------------------------------------------------
1 | var tape = require('tape');
2 | const through = require('through2');
3 | const mapper = require('../../lib/streams/unitSplittingMapperStream');
4 | const Document = require('pelias-model').Document;
5 |
module.exports.tests = {};

// test exports
// Verify the module exports a factory function, and that invoking it yields
// an object with both `_read` and `_write` implementations (a duplex stream).
module.exports.tests.interface = function (test) {
  test('interface: factory', t => {
    t.equal(typeof mapper, 'function', 'stream factory');
    t.end();
  });
  test('interface: stream', t => {
    var stream = mapper();
    t.equal(typeof stream, 'object', 'valid stream');
    t.equal(typeof stream._read, 'function', 'valid readable');
    t.equal(typeof stream._write, 'function', 'valid writeable');
    t.end();
  });
};
22 |
23 | // ===================== australasian unit number mapping ======================
24 |
module.exports.tests.australasian_solidus = function (test) {
  test('maps - split unit from housenumber', t => {
    // fixture: unit joined to the housenumber by a solidus
    const doc = new Document('oa', 'example', 1);
    doc.setName('default', '2/14 Smith Street');
    doc.setAddress('number', '2/14');
    doc.setAddress('street', 'Smith Street');
    doc.setMeta('country_code', 'AU');

    const stream = mapper();
    stream.pipe(through.obj((mapped, enc, next) => {
      t.deepEqual(mapped.getName('default'), '14 Smith Street', 'unchanged');
      t.deepEqual(mapped.getAddress('unit'), '2', 'mapped');
      t.deepEqual(mapped.getAddress('number'), '14', 'mapped');
      t.deepEqual(mapped.getAddress('street'), 'Smith Street', 'unchanged');
      t.end();
      next();
    }));
    stream.write(doc);
  });
};
45 |
module.exports.tests.australasian_solidus_with_whitespace = function (test) {
  test('maps - split unit from housenumber', t => {
    // fixture: solidus surrounded by whitespace
    const doc = new Document('oa', 'example', 1);
    doc.setName('default', '2 / 14 Smith Street');
    doc.setAddress('number', '2 / 14');
    doc.setAddress('street', 'Smith Street');
    doc.setMeta('country_code', 'AU');

    const stream = mapper();
    stream.pipe(through.obj((mapped, enc, next) => {
      t.deepEqual(mapped.getName('default'), '14 Smith Street', 'unchanged');
      t.deepEqual(mapped.getAddress('unit'), '2', 'mapped');
      t.deepEqual(mapped.getAddress('number'), '14', 'mapped');
      t.deepEqual(mapped.getAddress('street'), 'Smith Street', 'unchanged');
      t.end();
      next();
    }));
    stream.write(doc);
  });
};
66 |
module.exports.tests.australasian_flat_prefix = function (test) {
  test('maps - split unit from housenumber', t => {
    // fixture: 'Flat' designation preceding the unit number
    const doc = new Document('oa', 'example', 1);
    doc.setName('default', 'Flat 2 14 Smith Street');
    doc.setAddress('number', 'Flat 2 14');
    doc.setAddress('street', 'Smith Street');
    doc.setMeta('country_code', 'AU');

    const stream = mapper();
    stream.pipe(through.obj((mapped, enc, next) => {
      t.deepEqual(mapped.getName('default'), '14 Smith Street', 'unchanged');
      t.deepEqual(mapped.getAddress('unit'), '2', 'mapped');
      t.deepEqual(mapped.getAddress('number'), '14', 'mapped');
      t.deepEqual(mapped.getAddress('street'), 'Smith Street', 'unchanged');
      t.end();
      next();
    }));
    stream.write(doc);
  });
};
87 |
module.exports.tests.australasian_flat_prefix_abbreviated = function (test) {
  test('maps - split unit from housenumber', t => {
    // fixture: abbreviated 'F' flat designation
    const doc = new Document('oa', 'example', 1);
    doc.setName('default', 'F 2 14 Smith Street');
    doc.setAddress('number', 'F 2 14');
    doc.setAddress('street', 'Smith Street');
    doc.setMeta('country_code', 'AU');

    const stream = mapper();
    stream.pipe(through.obj((mapped, enc, next) => {
      t.deepEqual(mapped.getName('default'), '14 Smith Street', 'unchanged');
      t.deepEqual(mapped.getAddress('unit'), '2', 'mapped');
      t.deepEqual(mapped.getAddress('number'), '14', 'mapped');
      t.deepEqual(mapped.getAddress('street'), 'Smith Street', 'unchanged');
      t.end();
      next();
    }));
    stream.write(doc);
  });
};
108 |
module.exports.tests.australasian_unit_prefix = function (test) {
  test('maps - split unit from housenumber', t => {
    // fixture: 'Unit' designation preceding the unit number
    const doc = new Document('oa', 'example', 1);
    doc.setName('default', 'Unit 2 14 Smith Street');
    doc.setAddress('number', 'Unit 2 14');
    doc.setAddress('street', 'Smith Street');
    doc.setMeta('country_code', 'AU');

    const stream = mapper();
    stream.pipe(through.obj((mapped, enc, next) => {
      t.deepEqual(mapped.getName('default'), '14 Smith Street', 'unchanged');
      t.deepEqual(mapped.getAddress('unit'), '2', 'mapped');
      t.deepEqual(mapped.getAddress('number'), '14', 'mapped');
      t.deepEqual(mapped.getAddress('street'), 'Smith Street', 'unchanged');
      t.end();
      next();
    }));
    stream.write(doc);
  });
};
129 |
module.exports.tests.australasian_apartment_prefix = function (test) {
  test('maps - split unit from housenumber', t => {
    // fixture: 'Apartment' designation preceding the unit number
    const doc = new Document('oa', 'example', 1);
    doc.setName('default', 'Apartment 2 14 Smith Street');
    doc.setAddress('number', 'Apartment 2 14');
    doc.setAddress('street', 'Smith Street');
    doc.setMeta('country_code', 'AU');

    const stream = mapper();
    stream.pipe(through.obj((mapped, enc, next) => {
      t.deepEqual(mapped.getName('default'), '14 Smith Street', 'unchanged');
      t.deepEqual(mapped.getAddress('unit'), '2', 'mapped');
      t.deepEqual(mapped.getAddress('number'), '14', 'mapped');
      t.deepEqual(mapped.getAddress('street'), 'Smith Street', 'unchanged');
      t.end();
      next();
    }));
    stream.write(doc);
  });
};
150 |
module.exports.tests.australasian_apartment_prefix_abbreviated = function (test) {
  test('maps - split unit from housenumber', t => {
    // fixture: abbreviated 'APT' apartment designation
    const doc = new Document('oa', 'example', 1);
    doc.setName('default', 'APT 2 14 Smith Street');
    doc.setAddress('number', 'APT 2 14');
    doc.setAddress('street', 'Smith Street');
    doc.setMeta('country_code', 'AU');

    const stream = mapper();
    stream.pipe(through.obj((mapped, enc, next) => {
      t.deepEqual(mapped.getName('default'), '14 Smith Street', 'unchanged');
      t.deepEqual(mapped.getAddress('unit'), '2', 'mapped');
      t.deepEqual(mapped.getAddress('number'), '14', 'mapped');
      t.deepEqual(mapped.getAddress('street'), 'Smith Street', 'unchanged');
      t.end();
      next();
    }));
    stream.write(doc);
  });
};
171 |
module.exports.tests.australasian_allow_no_space_after_flat_designation = function (test) {
  test('maps - split unit from housenumber', t => {
    const doc = new Document('oa', 'example', 1);
    doc.setName('default', 'APT2 14 Smith Street'); // note: 'APT2' concatenated
    doc.setAddress('number', 'APT2 14');
    doc.setAddress('street', 'Smith Street');
    doc.setMeta('country_code', 'AU');

    const stream = mapper();
    stream.pipe(through.obj((mapped, enc, next) => {
      t.deepEqual(mapped.getName('default'), '14 Smith Street', 'unchanged');
      t.deepEqual(mapped.getAddress('unit'), '2', 'mapped');
      t.deepEqual(mapped.getAddress('number'), '14', 'mapped');
      t.deepEqual(mapped.getAddress('street'), 'Smith Street', 'unchanged');
      t.end();
      next();
    }));
    stream.write(doc);
  });
};
192 |
// Register each exported test case with tape, namespaced by the mapper
// under test.
function test(name, testFunction) {
  return tape('unit_splitting_mapper: ' + name, testFunction);
}

for (var testCase in module.exports.tests) {
  module.exports.tests[testCase](test);
}
200 |
--------------------------------------------------------------------------------
/test/test.js:
--------------------------------------------------------------------------------
1 | /**
2 | * @file The main entry point for the OpenAddresses importer's unit-tests.
3 | */
4 |
5 | require( './schema' );
6 | require( './isValidCsvRecord' );
7 | require( './import');
8 | require( './importPipeline');
9 | require( './parameters' );
10 | require( './streams/cleanupStream' );
11 | require( './cleanup_v2' );
12 | require( './streams/contentHashStream' );
13 | require( './streams/documentStream' );
14 | require( './streams/gnafMapperStream' );
15 | require( './streams/isUSorCAHouseNumberZero' );
16 | require( './streams/recordStream' );
17 | require( './streams/unitSplittingMapperStream' );
18 |
--------------------------------------------------------------------------------
/utils/OpenAddressesAPI.js:
--------------------------------------------------------------------------------
1 | const _ = require('lodash');
2 | const axios = require('axios');
3 | const config = require('pelias-config');
4 | const logger = require('pelias-logger').get('openaddresses');
5 | const HOST = 'https://batch.openaddresses.io';
6 |
// Thin client for the batch.openaddresses.io HTTP API.
class OpenAddressesAPI {
  constructor() {
    // read the 'imports.openaddresses' section of the pelias config (if any)
    this.config = _.get(config.generate(), 'imports.openaddresses', {});
    this.token = _.get(this.config, 'token');
  }

  // remove file extensions from 'source'
  static normalize(source) {
    if (!_.isString(source)) { return source; }

    const normalized = source.replace(/\.[^/.]+$/, '');

    // source definitions previously required a file extension.
    // please remove file extensions from your ~/pelias.json file
    // to silence these warning messages.
    if (normalized !== source) {
      logger.warn(`source definitions no longer require a file extension '${source}'`);
    }

    return normalized;
  }

  // return the http url for a specific job id
  static url(job) {
    return `${HOST}/api/job/${job}/output/source.geojson.gz`;
  }

  // if the 'validated' mode is enabled (for financial supporters only)
  isValidatedModeEnabled() {
    return _.get(this.config, 'validated') === true;
  }

  // fetch extended info for `source` and return the first matching version,
  // or an empty object when the API returns nothing usable
  async lookup(source) {
    // support the 'validated' property for financial supporters
    const params = {
      source,
      layer: 'addresses',
      validated: this.isValidatedModeEnabled() ? 'true' : 'false'
    };

    const response = await axios.get(`${HOST}/api/data`, { params });
    const rows = response.data;
    return (_.isArray(rows) && !_.isEmpty(rows)) ? _.head(rows) : {};
  }
}

module.exports = OpenAddressesAPI;
53 |
--------------------------------------------------------------------------------
/utils/download_all.js:
--------------------------------------------------------------------------------
1 | const child_process = require('child_process');
2 | const async = require('async');
3 | const fs = require('fs-extra');
4 | const temp = require('temp');
5 | const logger = require('pelias-logger').get('openaddresses-download');
6 | const _ = require('lodash');
7 |
8 | function downloadAll(config, callback) { // fetch both global OA bundle zips into the configured datapath
9 |   logger.info('Attempting to download all data');
10 | 
11 |   const targetDir = config.imports.openaddresses.datapath;
12 | 
13 |   fs.ensureDir(targetDir, (err) => { // create the target directory if missing
14 |     if (err) {
15 |       logger.error(`error making directory ${targetDir}`, err);
16 |       return callback(err);
17 |     }
18 | 
19 |     const dataHost = config.get('imports.openaddresses.dataHost') || 'https://data.openaddresses.io';
20 | 
21 |     async.eachSeries( // download bundles one at a time; abort on the first error
22 |       [
23 |         // all non-share-alike data
24 |         `${dataHost}/openaddr-collected-global.zip`,
25 | 
26 |         // all share-alike data
27 |         `${dataHost}/openaddr-collected-global-sa.zip`
28 |       ],
29 |       downloadBundle.bind(null, targetDir, config),
30 |       callback);
31 |   });
32 | }
33 |
34 | function downloadBundle(targetDir, config, sourceUrl, callback) {
35 |
36 | const tmpZipFile = temp.path({suffix: '.zip'});
37 | const referer = config.get('imports.openaddresses.dataReferer') || 'https://pelias-results.openaddresses.io';
38 |
39 | async.series(
40 | [
41 | // download the zip file into the temp directory
42 | (callback) => {
43 | logger.debug(`downloading ${sourceUrl}`);
44 | if (_.startsWith(sourceUrl, 's3://')) {
45 | const s3Options = config.imports.openaddresses.s3Options || '';
46 | child_process.exec(`aws s3 cp ${sourceUrl} ${tmpZipFile} --only-show-errors ${s3Options}`, callback);
47 | } else {
48 | const flags = [
49 | '--request GET', // HTTP GET
50 | '--silent', // be quiet
51 | '--location', // follow redirects
52 | '--fail', // exit with a non-zero code for >=400 responses
53 | '--write-out "%{http_code}"', // print status code to STDOUT
54 | `--referer ${referer}`, // set referer header
55 | `--output ${tmpZipFile}`, // set output filepath
56 | '--retry 5', // retry this number of times before giving up
57 | '--retry-connrefused', // consider ECONNREFUSED as a transient error
58 | '--retry-delay 5' // sleep this many seconds between retry attempts
59 | ].join(' ');
60 |
61 | // the `--fail*` flags cause an error to be returned as the first arg with `error.code`
62 | // as the process exit status, the `-w "%{http_code}"` flag writes the HTTP status to STDOUT.
63 | child_process.exec(`curl ${flags} ${sourceUrl}`, (error, stdout) => {
64 | if (!error) { return callback(); }
65 |
66 | // provide a more user-friendly error message
67 | error.message = `cURL request failed, HTTP status: ${stdout}, exit code: ${error.code}`;
68 | callback(error);
69 | });
70 | }
71 | },
72 | // unzip file into target directory
73 | (callback) => {
74 | logger.debug(`unzipping ${tmpZipFile} to ${targetDir}`);
75 | child_process.exec(`unzip -o -qq -d ${targetDir} ${tmpZipFile}`, callback);
76 | },
77 | // delete the temp downloaded zip file
78 | fs.remove.bind(null, tmpZipFile)
79 | ],
80 | callback);
81 | }
82 |
83 | module.exports = downloadAll;
84 |
--------------------------------------------------------------------------------
/utils/download_data.js:
--------------------------------------------------------------------------------
1 | const _ = require('lodash');
2 | const config = require( 'pelias-config' ).generate(require('../schema'));
3 | const logger = require('pelias-logger').get('openaddresses-download');
4 |
5 | const downloadAll = require('./download_all');
6 | const downloadFiltered = require('./download_filtered');
7 |
8 | if (require.main === module) { // CLI entry point: run the download when invoked directly
9 |   download((err) => {
10 |     if (err) {
11 |       logger.error('Failed to download data', err);
12 |       process.exit(1);
13 |     }
14 |     logger.info('All done!');
15 |   });
16 | }
17 | 
18 | function download(callback) { // choose filtered vs full download based on configuration
19 |   if (!_.isEmpty(config.imports.openaddresses.files)) { // a non-empty 'files' list selects specific sources
20 |     downloadFiltered(config, callback);
21 |   }
22 |   else {
23 |     downloadAll(config, callback);
24 |   }
25 | }
26 | 
27 | module.exports = download;
28 |
--------------------------------------------------------------------------------
/utils/download_filtered.js:
--------------------------------------------------------------------------------
1 | const child_process = require('child_process');
2 | const config = require('pelias-config').generate();
3 | const async = require('async');
4 | const fs = require('fs-extra');
5 | const path = require('path');
6 | const temp = require('temp');
7 | const logger = require('pelias-logger').get('openaddresses-download');
8 | const Bottleneck = require('bottleneck/es5');
9 |
10 | const OpenAddressesAPI = require('./OpenAddressesAPI');
11 | const oa = new OpenAddressesAPI();
12 |
13 | function downloadFiltered(config, callback) {
14 | const targetDir = config.imports.openaddresses.datapath;
15 | const errorsFatal = config.get('imports.openaddresses.missingFilesAreFatal');
16 |
17 | fs.ensureDir(targetDir, async (err) => {
18 | if (err) {
19 | logger.error(`error making directory ${targetDir}`, err);
20 | return callback(err);
21 | }
22 |
23 | // validate sources
24 | const files = config.get('imports.openaddresses.files', []);
25 | const sources = await getSources(files);
26 | const validSources = sources.filter(source => source.url);
27 |
28 | // respect 'imports.openaddresses.missingFilesAreFatal' setting
29 | if (errorsFatal && (sources.length !== validSources.length)) {
30 | callback(sources.find(source => source.error)); // return first error
31 | return;
32 | }
33 |
34 | logger.info(`Attempting to download selected data sources: ${sources.map(source => source.id)}`);
35 |
36 | // limit requests to avoid being banned by openaddresses.io
37 | // current policy is 10 request per minute
38 | // https://github.com/pelias/openaddresses/issues/433#issuecomment-527383976
39 | // @todo: contact OA team to check if this is still required with the batch. endpoint?
40 | const options = {
41 | maxConcurrent: 1,
42 | minTime: 6000
43 | };
44 | const limiter = new Bottleneck(options);
45 | const done = () => {
46 | if (limiter.empty()) {
47 | callback();
48 | }
49 | };
50 | validSources.map(source => {
51 | limiter.submit(downloadSource, targetDir, source, done);
52 | });
53 | process.on('SIGINT', () => {
54 | limiter.stop({ dropWaitingJobs: true });
55 | process.exit();
56 | });
57 | });
58 |
59 | }
60 |
61 | async function getSources(files) {
62 | return await Promise.all(files.map(async file => {
63 |
64 | // normalize source
65 | let id = OpenAddressesAPI.normalize(file);
66 |
67 | // lookup the source using the OpenAddresses API
68 | // to find the most current job id and ensure validity
69 | const version = await oa.lookup(id);
70 | const valid = (version && version.job);
71 |
72 | // invalid source
73 | if (!valid) {
74 | return { id, error: `invalid source '${file}'` };
75 | }
76 |
77 | // valid source
78 | return { id, url: OpenAddressesAPI.url(version.job) };
79 | }));
80 | }
81 |
82 | function downloadSource(targetDir, source, done) { // download one source's gzipped geojson, decompress it, then call done(err|null)
83 | 
84 |   const errorsFatal = config.get('imports.openaddresses.missingFilesAreFatal');
85 |   const token = config.get('imports.openaddresses.token'); // NOTE(review): may be undefined, yielding 'Bearer undefined' below — confirm the API tolerates this
86 |   const referer = config.get('imports.openaddresses.dataReferer') || 'https://pelias-results.openaddresses.io';
87 |   logger.info(`Downloading ${source.id}`);
88 | 
89 |   const outFile = path.join(targetDir, `${source.id}.geojson`);
90 |   const tmpFile = temp.path({
91 |     prefix: source.id.replace(new RegExp(path.sep, 'g'), '-'), // flatten path separators so the temp filename is valid
92 |     dir: targetDir,
93 |     suffix: '.gz'
94 |   });
95 | 
96 |   async.series(
97 |     [
98 |       // download the compressed file into the temp directory
99 |       (callback) => {
100 |         logger.debug(`downloading ${source.url}`);
101 |         const flags = [
102 |           '--request GET', // HTTP GET
103 |           '--silent', // be quiet
104 |           '--location', // follow redirects
105 |           '--fail', // exit with a non-zero code for >=400 responses
106 |           '--write-out "%{http_code}"', // print status code to STDOUT
107 |           `--referer ${referer}`, // set referer header
108 |           `--output ${tmpFile}`, // set output filepath
109 |           '--retry 5', // retry this number of times before giving up
110 |           '--retry-connrefused', // consider ECONNREFUSED as a transient error
111 |           '--retry-delay 5', // sleep this many seconds between retry attempts
112 |           `-H 'Authorization: Bearer ${token}'` // authorization token
113 |         ].join(' ');
114 | 
115 |         // the `--fail*` flags cause an error to be returned as the first arg with `error.code`
116 |         // as the process exit status, the `-w "%{http_code}"` flag writes the HTTP status to STDOUT.
117 |         child_process.exec(`curl ${flags} ${source.url}`, (error, stdout) => { // NOTE(review): url/flags are interpolated into a shell string; values come from config and the OA API
118 |           if (!error) { return callback(); }
119 | 
120 |           // provide a more user-friendly error message
121 |           error.message = `cURL request failed, HTTP status: ${stdout}, exit code: ${error.code}`;
122 |           callback(error);
123 |         });
124 |       },
125 |       // decompress file into target directory
126 |       (callback) => {
127 |         logger.debug(`decompress ${tmpFile} to ${outFile}`);
128 |         child_process.exec(`
129 |           mkdir -p ${path.dirname(outFile)};
130 |           gzip -d < ${tmpFile} > ${outFile};
131 |         `, (error, stdout) => {
132 |           if (!error) { return callback(); }
133 | 
134 |           // provide a more user-friendly error message
135 |           error.message = `decompress failed, ${stdout}`;
136 |           callback(error);
137 |         });
138 |       },
139 |     ],
140 |     (err) => { // final series callback: log, clean up, and report per the fatal-errors setting
141 |       if (err) {
142 |         logger.warn(`failed to download ${source.url}: ${err}`);
143 |       }
144 | 
145 |       // ensure temp files are cleaned up
146 |       if (fs.existsSync(tmpFile)) { fs.unlinkSync(tmpFile); }
147 | 
148 |       // honour 'imports.openaddresses.missingFilesAreFatal' setting
149 |       done(errorsFatal ? err : null); // non-fatal failures are reported as success (null)
150 |     }
151 |   );
152 | }
153 | 
154 | module.exports = downloadFiltered;
155 |
--------------------------------------------------------------------------------