├── .github └── workflows │ └── build.yml ├── .gitignore ├── CODEOWNERS ├── MIGRATING.md ├── README.md ├── cloud-config.yml ├── cmd ├── docker.sh ├── download.sh ├── elastic.sh ├── import.sh ├── prepare.sh ├── system.sh └── test.sh ├── docker-compose.yml ├── example.env ├── lib ├── cli.sh └── env.sh ├── nginx.conf ├── pelias ├── pelias.json ├── version.env └── wait_for_200.sh /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: 'Build and Deploy' 2 | on: 3 | push: 4 | branches: 5 | - 'master' 6 | workflow_dispatch: 7 | jobs: 8 | create-droplet: 9 | name: Create and provision droplet 10 | runs-on: ubuntu-24.04 11 | outputs: 12 | IPV4: ${{ steps.save.outputs.IPV4 }} 13 | env: 14 | SSH_FINGERPRINT: ${{ secrets.SSH_FINGERPRINT }} 15 | steps: 16 | - uses: actions/checkout@v2 17 | - id: install 18 | name: Install doctl 19 | uses: digitalocean/action-doctl@v2 20 | with: 21 | token: ${{ secrets.DIGITALOCEAN_ACCESS_TOKEN }} 22 | - id: create 23 | name: Create droplet 24 | run: doctl compute droplet create --enable-monitoring --image ubuntu-24-04-x64 --size s-4vcpu-8gb --region nyc3 --ssh-keys "${SSH_FINGERPRINT}" --tag-name labs --user-data-file ./cloud-config.yml --wait "geosearch-${GITHUB_RUN_ID}" 25 | - id: save 26 | name: Save IPv4 27 | run: echo "::set-output name=IPV4::$(doctl compute droplet get "geosearch-${GITHUB_RUN_ID}" --template "{{- .PublicIPv4 -}}")" 28 | healthcheck: 29 | name: Wait for healthcheck to pass 30 | runs-on: ubuntu-24.04 31 | needs: create-droplet 32 | env: 33 | IPV4: ${{needs.create-droplet.outputs.IPV4}} 34 | steps: 35 | - uses: actions/checkout@v2 36 | - id: healthcheck 37 | run: ./wait_for_200.sh "$IPV4" 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Local .terraform directories 2 | **/.terraform/* 3 | 4 | # .tfstate files 5 | *.tfstate 6 | *.tfstate.* 7 | 8 | # Crash log files 9 | crash.log 10 | 11 | # Exclude all .tfvars files, which are likely to contain sentitive data, such as 12 | # password, private keys, and other secrets. These should not be part of version 13 | # control as they are data points which are potentially sensitive and subject 14 | # to change depending on the environment. 
15 | #
16 | *.tfvars
17 | 
18 | # Ignore override files as they are usually used to override resources locally and so
19 | # are not checked in
20 | override.tf
21 | override.tf.json
22 | *_override.tf
23 | *_override.tf.json
24 | 
25 | # Include override files you do wish to add to version control using negated pattern
26 | #
27 | # !example_override.tf
28 | 
29 | # Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan
30 | # example: *tfplan*
31 | 
32 | # Ignore CLI configuration files
33 | .terraformrc
34 | terraform.rc
35 | 
36 | # Ignore credential file
37 | .env
38 | 
39 | .DS_Store
40 | data/**/*
--------------------------------------------------------------------------------
/CODEOWNERS:
--------------------------------------------------------------------------------
1 | * @NYCPlanning/Engineering
2 | 
--------------------------------------------------------------------------------
/MIGRATING.md:
--------------------------------------------------------------------------------
1 | # Migrating from Geosearch v1 to v2
2 | This document outlines why we are introducing "v2" of the Geosearch API and how to migrate client applications to use it, including details on breaking changes.
3 | 
4 | > This document assumes you are somewhat familiar with the underlying open source software that powers Geosearch. If you aren't, please read through `README.md` and then come back.
5 | 
6 | ## The "why?"
7 | 1. As of December 2022, the "v1" version of the API relied on end-of-life versions of several underlying languages and open source tools.
8 | 2. The work required to update those dependencies necessitated a switch to using Pelias' official [csv-importer](https://github.com/pelias/csv-importer) for importing our custom normalized PAD data into the Pelias ElasticSearch database.
9 | 3. When using that importer, the arbitrary data we attach to each record, such as BBL and BIN, is automatically nested within each feature's `properties` object in an object called `addendum`. Because this data is kept in a different property in "v1", we had to introduce minor breaking changes to the responses returned by the Geosearch API.
10 | 
11 | ## Breaking changes
12 | 1. The paths to the API endpoints are the same aside from switching `/v1` to `/v2`. For instance, `https://geosearch.planninglabs.nyc/v1/autocomplete?text=120%20broadway` becomes `https://geosearch.planninglabs.nyc/v2/autocomplete?text=120%20broadway`.
13 | 2. The "custom" data we add to each record is now found under `addendum` in each GeoJSON feature's `properties` object, under new keys. For instance, to get at the BBL for a particular feature, you would access `feature.properties.addendum.pad.bbl` instead of `feature.properties.pad_bbl`. You will also notice that some extraneous properties such as `pad_orig_stname` have been removed for brevity. Examples of the old and new response objects are included below for reference. If you are a user of Geosearch and have any questions regarding this migration, please reach out to OpenSource_DL@planning.nyc.gov.
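Before the full response examples below, here is a minimal sketch of the client-side change using `curl` and `jq`. This snippet is illustrative only and is not part of the repo; it assumes `jq` is installed and a Geosearch instance is reachable on `localhost`.

```
# v1 (old): PAD attributes sat directly on properties, e.g. .features[0].properties.pad_bbl
# v2 (new): the same value is nested under properties.addendum.pad
curl -s "http://localhost/v2/autocomplete?text=120%20broadway" \
  | jq -r '.features[0].properties.addendum.pad.bbl'
```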
14 | 15 | #### Old 16 | ``` 17 | { 18 | "type": "Feature", 19 | "geometry": { 20 | "type": "Point", 21 | "coordinates": [ 22 | -74.01054, 23 | 40.708225 24 | ] 25 | }, 26 | "properties": { 27 | "id": "3945", 28 | "gid": "nycpad:address:3945", 29 | "layer": "address", 30 | "source": "nycpad", 31 | "source_id": "3945", 32 | "name": "120 BROADWAY", 33 | "housenumber": "120", 34 | "street": "BROADWAY", 35 | "postalcode": "10271", 36 | "accuracy": "point", 37 | "country": "United States", 38 | "country_gid": "whosonfirst:country:85633793", 39 | "country_a": "USA", 40 | "region": "New York State", 41 | "region_gid": "whosonfirst:region:0", 42 | "region_a": "NY", 43 | "county": "New York County", 44 | "county_gid": "whosonfirst:county:061", 45 | "locality": "New York", 46 | "locality_gid": "whosonfirst:locality:0", 47 | "locality_a": "NYC", 48 | "borough": "Manhattan", 49 | "borough_gid": "whosonfirst:borough:1", 50 | "label": "120 BROADWAY, Manhattan, New York, NY, USA", 51 | "pad_low": "104", 52 | "pad_high": "124", 53 | "pad_bin": "1001026", 54 | "pad_bbl": "1000477501", 55 | "pad_geomtype": "bin", 56 | "pad_orig_stname": "BROADWAY" 57 | } 58 | } 59 | ``` 60 | 61 | #### New 62 | ``` 63 | { 64 | "type": "Feature", 65 | "geometry": { 66 | "type": "Point", 67 | "coordinates": [ 68 | -74.01052, 69 | 40.70822 70 | ] 71 | }, 72 | "properties": { 73 | "id": "3892", 74 | "gid": "nycpad:venue:3892", 75 | "layer": "venue", 76 | "source": "nycpad", 77 | "source_id": "3892", 78 | "country_code": "US", 79 | "name": "120 BROADWAY", 80 | "housenumber": "120", 81 | "street": "BROADWAY", 82 | "postalcode": "10271", 83 | "accuracy": "point", 84 | "country": "United States", 85 | "country_gid": "whosonfirst:country:85633793", 86 | "country_a": "USA", 87 | "region": "New York", 88 | "region_gid": "whosonfirst:region:85688543", 89 | "region_a": "NY", 90 | "county": "New York County", 91 | "county_gid": "whosonfirst:county:102081863", 92 | "locality": "New York", 93 | "locality_gid": "whosonfirst:locality:85977539", 94 | "locality_a": "NYC", 95 | "borough": "Manhattan", 96 | "borough_gid": "whosonfirst:borough:421205771", 97 | "neighbourhood": "Financial District", 98 | "neighbourhood_gid": "whosonfirst:neighbourhood:85865711", 99 | "label": "120 BROADWAY, New York, NY, USA", 100 | "addendum": { 101 | "pad": { 102 | "bbl": "1000477501", 103 | "bin": "1001026" 104 | } 105 | } 106 | } 107 | }, 108 | ``` -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Docker Compose project for NYC Geosearch service, built on the open source [Pelias](https://github.com/pelias/pelias) geocoder and [NYC's Property Address Directory (PAD)](https://www1.nyc.gov/site/planning/data-maps/open-data.page) 2 | 3 | ## Overview 4 | 5 | - [About](#about) 6 | - [Config-Driven](#config-driven) 7 | - [Pelias CLI tool](pelias-cli-tool) 8 | - [Running Geosearch Locally](#running-geosearch-locally) 9 | - [Deployment](#redeploying-geosearch-for-quarterly-data-updates) 10 | - [How exactly to deployments work?](#how-exactly-to-deployments-work) 11 | 12 | ## About 13 | 14 | This repo serves as "home base" for the GeoSearch project, as the docker compose project orchestrates a functioning set up. 
Other relevant code for our Pelias deployment:
15 | 
16 | - [geosearch-pad-normalize](https://github.com/NYCPlanning/labs-geosearch-pad-normalize) - an R script that ingests and transforms raw Property Address Directory (PAD) data, most significantly interpolating valid address ranges. This repo outputs a CSV that conforms to the data schema required by Pelias' official [CSV Importer](https://github.com/pelias/csv-importer). Note that this repo used to output data meant to be ingested by the now deprecated [PAD Importer](https://github.com/NYCPlanning/labs-geosearch-pad-importer) project.
17 | - [geosearch-docs](https://github.com/NYCPlanning/labs-geosearch-docs) - an interactive documentation site for the Geosearch API
18 | 
19 | Docker Compose allows us to quickly spin up the pelias services we need, and run scripts manually in the containers. It also makes use of volumes and internal hostnames so the various services can communicate with each other. The contents of `docker-compose.yml` are based on code from the [pelias/docker](https://github.com/pelias/docker) project.
20 | 
21 | > The one service in `docker-compose.yml` that did not come from the `pelias/docker` project is `nginx`. We added a simple [nginx](https://nginx.org/en/) server that uses the contents of `nginx.conf` as a reverse proxy, directing traffic either to the Geosearch docs [website](https://github.com/NYCPlanning/labs-geosearch-docs) or to the Pelias API itself.
22 | 
23 | For more information on Pelias services, including many which we are not using here at City Planning, check out the `pelias/docker` project or their [documentation](https://github.com/pelias/documentation).
24 | 
25 | ## Config-Driven
26 | 
27 | Much of this environment is config-driven, and the two files you should pay attention to are:
28 | 
29 | - [docker-compose.yml](https://github.com/NYCPlanning/labs-geosearch-docker/blob/master/docker-compose.yml) - configurations for each of the named services, including images to use, environment variable definitions, and volume mounts.
30 | - [pelias.json](https://github.com/NYCPlanning/labs-geosearch-docker/blob/master/pelias.json) - a shared config file used by all of the pelias services.
31 | 
32 | ## Pelias CLI tool
33 | 
34 | All steps needed to get an instance of Geosearch up and running are encapsulated within commands that can be run via the `pelias` CLI tool included in this repo. This CLI tool consists of the file `pelias` at the root of this repo, as well as the files found in `/lib` and `/cmd`. All of these files were taken directly from [this Pelias repo](https://github.com/pelias/docker) outlining how to run Pelias via docker and docker-compose. **Note that these files are up to date with that Pelias repo as of December 2022, but changes to that repo will not be automatically reflected in this repo.** If you would like to set up the CLI locally, see the docs in the `pelias/docker` repo.
35 | 
36 | > If you are having trouble setting up the CLI, or would just prefer not to add an entry to your `$PATH`, you should be able to call the file at `./pelias` directly. To do this when running the commands in the "Running Geosearch Locally" section below, just replace `pelias` with `./pelias` in the commands. For instance, `pelias compose pull` becomes `./pelias compose pull`.
37 | 
38 | ## Running Geosearch Locally
39 | 
40 | You can run Geosearch locally using the included `pelias` CLI and `docker-compose.yml` file. The following instructions assume that you have set up the Pelias CLI locally and have docker and docker-compose installed on your machine.
41 | 
42 | Run these commands from the root directory of this repo:
43 | 
44 | First, create the requisite folders for the docker volumes. Note that the `./data` folder and its contents will be gitignored.
45 | ```
46 | mkdir -p data/elasticsearch data/csv data/whosonfirst
47 | ```
48 | 
49 | Create a `.env` file and set the `DATA_DIR` environment variable for Pelias.
50 | ```
51 | echo "DATA_DIR=$(pwd)/data" > .env
52 | ```
53 | 
54 | Pull the images.
55 | ```
56 | pelias compose pull
57 | ```
58 | 
59 | Start the Elasticsearch service.
60 | ```
61 | pelias elastic start
62 | ```
63 | 
64 | Wait for it to come up. **This may take longer than the timeout period built into the pelias CLI. If you get a message saying Elasticsearch did not come up, try running this command a few times to see if you get the "Elasticsearch up!" message eventually.**
65 | ```
66 | pelias elastic wait
67 | ```
68 | 
69 | Create the index in Elasticsearch.
70 | ```
71 | pelias elastic create
72 | ```
73 | 
74 | Download the required Who's On First dataset.
75 | ```
76 | pelias download wof
77 | ```
78 | 
79 | Download the normalized PAD CSV.
80 | ```
81 | pelias download csv
82 | ```
83 | 
84 | Import the normalized PAD data into the Elasticsearch datastore. This will likely take a while.
85 | ```
86 | pelias import csv
87 | ```
88 | 
89 | Bring up the rest of the necessary docker services, including the Pelias API and nginx server.
90 | ```
91 | pelias compose up
92 | ```
93 | 
94 | To confirm that everything is up and running, you can try to hit the API. For instance, a `GET` call to `http://localhost/v2/autocomplete?text=120%20broadway` should return results for 120 Broadway.
95 | 
96 | ## Redeploying Geosearch for Quarterly Data Updates
97 | 
98 | > The following section is only relevant to members of DCP's Open Source Engineering team responsible for maintaining Geosearch.
99 | 
100 | When a new quarterly update of PAD becomes available on Bytes of the Big Apple:
101 | 
102 | 1. Head to [geosearch-pad-normalize](https://github.com/NYCPlanning/labs-geosearch-pad-normalize) and perform the process outlined there for building a new version of the normalized PAD data. Once you have merged a pull request into the `main` branch of that repo, you can monitor the progress of building and uploading the new data in the [actions for that repo](https://github.com/NYCPlanning/labs-geosearch-pad-normalize/actions). This will produce the latest version of normalized PAD and upload the new CSV file to the correct DigitalOcean Space.
103 | 
104 | 2. Confirm that the CSV output by geosearch-pad-normalize has been uploaded to the "latest" folder in Digital Ocean. You can see the exact URL that this repo will attempt to download the data from by looking at the value in `imports.csv.download` in `pelias.json`. **Note that you should not have to make changes to `pelias.json` in order to do data updates.**
105 | 
106 | 3. Run the "Build and Deploy" GH Action workflow. This workflow runs automatically on pushes to `main`. However, if you are only trying to deploy a new instance of Geosearch with a new version of PAD, you should not need to make any code changes to this repo, so the workflow can also be run manually. To do that, go to the "Actions" tab in the repo and select the "Build and Deploy" workflow from the list on the left-hand side. Then select "Run workflow" with the `main` branch selected.
107 | 
108 | 4. The workflow will create the new Droplet in Digital Ocean and run the commands in `cloud-config.yml`. This will initialize all of the containers in `docker-compose.yml`, download the PAD data, and import it into Pelias' ElasticSearch database. Finally, the workflow will run `wait_for_200.sh` every 30 seconds for up to 1 hour, so that the workflow ends with a successful status if and when your new Geosearch instance is up and ready to start receiving traffic.
109 | 
110 | > As of December 2022, it typically takes about 30-45 minutes for the droplet to be created and for the services to fully reach a "healthy" status with all of the data loaded in. In some cases, it is possible that the GH Action job that runs `wait_for_200.sh` will finish "successfully" even though there was a failure. If that job finishes successfully much more quickly than we would expect, manually test the `/v2/autocomplete` endpoint to make sure the normalized PAD data was properly loaded before going to production.
111 | 
112 | 5. Once the workflow finishes successfully, you should see a new geosearch droplet in Digital Ocean. You can verify that it is working properly by sending requests to its public IPv4 address. Traffic to the production geosearch URL (https://geosearch.planninglabs.nyc/) is sent to the IP associated with the "geosearch" load balancer. To put your new droplet in production, simply add it to that load balancer, remove the old droplet from the load balancer, and then delete the old droplet.
113 | 
114 | ## How exactly do deployments work?
115 | 
116 | > The following explains what happens when we deploy a new Droplet running the code in this repo to Digital Ocean. If you are only trying to deploy a new instance of Geosearch with a new version of PAD data, everything you need should be covered in the "Deployment" section above.
117 | 
118 | Deployments are primarily handled by two files: `/.github/workflows/build.yml` and `cloud-config.yml`. The "Build and Deploy" workflow in `build.yml` is run manually or triggered by pushes to the `main` branch (note that merging PRs into main constitutes a push). This workflow is responsible for a few things:
119 | 1. It uses `doctl` to create a new droplet. It will add an SSH public key saved in DO to that Droplet and tag it with `labs`. It will also point DO to the `cloud-config.yml` file for cloud-init to use when provisioning the droplet.
120 | 2. Once the droplet is up, it will use the script in `wait_for_200.sh` to wait for the droplet to be healthy. In this scenario, healthy is defined as having all Geosearch services up and ready to accept traffic. This can take a while, primarily due to the time it takes to download the normalized PAD CSV and import it into the ElasticSearch datastore.
121 | 
122 | Spinning up the services defined in `docker-compose.yml` and downloading and importing data is done via the tool [cloud-init](https://cloudinit.readthedocs.io/en/latest/). cloud-init uses the contents of `cloud-config.yml` to do the following:
123 | 1. Create a new sudo user called `pelias` on the new droplet. This is necessary because, following best practice, the Pelias CLI tool cannot be run as the `root` system user. It will assign this user to the correct groups and add the included public SSH key to it.
124 | 2. Disable root access. As a security measure, logging into the droplet as `root` will be disabled once it is initialized.
125 | 3. Install the `docker` and `docker-compose` packages.
126 | 4. 
Bring up Geosearch by running the commands under `runcmd`. Note that even though `cloud-config.yml` creates the pelias user, the commands in `runcmd` are executed **as root**. Most of these commands use `runuser` to execute commands as the pelias user. 127 | 128 | > If you find yourself needing to ssh into a deployed Geosearch droplet, please see your team lead for additional instructions. 129 | -------------------------------------------------------------------------------- /cloud-config.yml: -------------------------------------------------------------------------------- 1 | #cloud-config 2 | users: 3 | - name: pelias 4 | ssh-authorized-keys: 5 | - ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQCwHcIqMPtGJJT8xtjnuMBf4K/XGifjUb9yv/Dn2GzB5W7BBiOduHV87mg0Up6PoEC0+naOB++Elt5a+ufiIP+Lc5H7mN/X9MX68MJgIR9kRBTxuwP7FKzohHHJesbzpyZEMHuI6nTeC0NO1donF/L2oelU1O3Vqbr7vjPLY8QYu5Ra9Dcryinjwx1b3kk3jysZ3IqEkJ7ye364d2CtougoHtH+j4Xi+FhPVdbJvzLowBGGuohVcwsfm+lU122zRHwgeM+W2OR/QUsch1TDbA3GnwNOQSz8TSl+mYoEBcF00GenkskrE11Jyvq0e01lfgzBseL7kal6THcgq/YjNDRt3hpyvwCrF22jgcw7hlHyz4Dnwe4u6ua8u4maoYdm1f01MAU2UnEssO63tJ401RE8VzvOS2zHgC2a3MR2uPbxpVOvusalJJuuShxqrfB04XYX7QPpzcDSfOatc7LFgT75Ipr/qNLFda/UAEuXMNFT8gbis+m+tSaTE+eAyh14PzM= 6 | groups: sudo, docker 7 | shell: /bin/bash 8 | sudo: ['ALL=(ALL) NOPASSWD:ALL'] 9 | disable_root: true 10 | package-update: true 11 | package-upgrade: true 12 | packages: 13 | - docker 14 | - docker-compose 15 | runcmd: 16 | - 'runuser -l pelias -c "mkdir /home/pelias/geosearch"' 17 | - 'runuser -l pelias -c "cd /home/pelias/geosearch && git clone https://github.com/NYCPlanning/labs-geosearch-docker.git ."' 18 | - 'runuser -l pelias -c "cd /home/pelias/geosearch && mkdir -p data/elasticsearch data/csv data/whosonfirst"' 19 | - 'runuser -l pelias -c "cd /home/pelias/geosearch && echo "DATA_DIR=/home/pelias/geosearch/data" > .env"' 20 | - 'runuser -l pelias -c "cd /home/pelias/geosearch && ./pelias compose pull"' 21 | - 'runuser -l pelias -c "cd /home/pelias/geosearch && ./pelias elastic start"' 22 | - 'runuser -l pelias -c "cd /home/pelias/geosearch && ./pelias elastic wait"' 23 | - 'runuser -l pelias -c "cd /home/pelias/geosearch && ./pelias elastic create"' 24 | - 'runuser -l pelias -c "cd /home/pelias/geosearch && ./pelias download wof"' 25 | - 'runuser -l pelias -c "cd /home/pelias/geosearch && ./pelias download csv"' 26 | - 'runuser -l pelias -c "cd /home/pelias/geosearch && ./pelias import csv"' 27 | - 'runuser -l pelias -c "cd /home/pelias/geosearch && ./pelias compose up"' 28 | -------------------------------------------------------------------------------- /cmd/docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e; 3 | 4 | function net_init(){ 5 | docker network create ${COMPOSE_PROJECT_NAME}_default &>/dev/null || true 6 | } 7 | 8 | function compose_pull(){ compose_exec pull; } 9 | register 'compose' 'pull' 'update all docker images' compose_pull 10 | 11 | function compose_logs(){ compose_exec logs $@; } 12 | register 'compose' 'logs' 'display container logs' compose_logs 13 | 14 | function compose_ps(){ compose_exec ps $@; } 15 | register 'compose' 'ps' 'list containers' compose_ps 16 | 17 | function compose_top(){ compose_exec top $@; } 18 | register 'compose' 'top' 'display the running processes of a container' compose_top 19 | 20 | function compose_exec(){ docker-compose $@; } 21 | register 'compose' 'exec' 'execute an arbitrary docker-compose command' compose_exec 22 | 23 | function compose_run(){ net_init; 
docker-compose run --rm $@; } 24 | register 'compose' 'run' 'execute a docker-compose run command' compose_run 25 | 26 | function compose_up(){ docker-compose up -d $@; } 27 | register 'compose' 'up' 'start one or more docker-compose service(s)' compose_up 28 | 29 | function compose_kill(){ docker-compose kill $@; } 30 | register 'compose' 'kill' 'kill one or more docker-compose service(s)' compose_kill 31 | 32 | function compose_down(){ docker-compose down; } 33 | register 'compose' 'down' 'stop all docker-compose service(s)' compose_down 34 | 35 | -------------------------------------------------------------------------------- /cmd/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e; 3 | 4 | # per-source downloads 5 | function download_wof(){ compose_run -T 'whosonfirst' './bin/download'; } 6 | function download_oa(){ compose_run -T 'openaddresses' './bin/download'; } 7 | function download_osm(){ compose_run -T 'openstreetmap' './bin/download'; } 8 | function download_geonames(){ compose_run -T 'geonames' './bin/download'; } 9 | function download_tiger(){ compose_run -T 'interpolation' './bin/download-tiger'; } 10 | function download_transit(){ compose_run -T 'transit' './bin/download'; } 11 | function download_csv(){ compose_run -T 'csv-importer' './bin/download'; } 12 | 13 | register 'download' 'wof' '(re)download whosonfirst data' download_wof 14 | register 'download' 'oa' '(re)download openaddresses data' download_oa 15 | register 'download' 'osm' '(re)download openstreetmap data' download_osm 16 | register 'download' 'geonames' '(re)download geonames data' download_geonames 17 | register 'download' 'tiger' '(re)download TIGER data' download_tiger 18 | register 'download' 'transit' '(re)download transit data' download_transit 19 | register 'download' 'csv' '(re)download csv data' download_csv 20 | 21 | # download all the data to be used by imports 22 | function download_all(){ 23 | download_wof & 24 | download_oa & 25 | download_osm & 26 | 27 | if [[ "$ENABLE_GEONAMES" == "true" ]]; then 28 | download_geonames & 29 | fi 30 | 31 | download_tiger & 32 | download_transit & 33 | download_csv & 34 | wait 35 | } 36 | 37 | register 'download' 'all' '(re)download all data' download_all 38 | -------------------------------------------------------------------------------- /cmd/elastic.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e; 3 | 4 | function elastic_schema_drop(){ compose_run 'schema' node scripts/drop_index "$@" || true; } 5 | function elastic_schema_create(){ compose_run 'schema' ./bin/create_index; } 6 | function elastic_start(){ 7 | mkdir -p $DATA_DIR/elasticsearch 8 | # attemp to set proper permissions if running as root 9 | chown $DOCKER_USER $DATA_DIR/elasticsearch 2>/dev/null || true 10 | compose_exec up -d elasticsearch 11 | } 12 | 13 | function elastic_stop(){ compose_exec kill elasticsearch; } 14 | 15 | register 'elastic' 'drop' 'delete elasticsearch index & all data' elastic_schema_drop 16 | register 'elastic' 'create' 'create elasticsearch index with pelias mapping' elastic_schema_create 17 | register 'elastic' 'start' 'start elasticsearch server' elastic_start 18 | register 'elastic' 'stop' 'stop elasticsearch server' elastic_stop 19 | 20 | # to use this function: 21 | # if test $(elastic_status) -ne 200; then 22 | function elastic_status(){ 23 | curl \ 24 | --output /dev/null \ 25 | --silent \ 26 | --write-out "%{http_code}" \ 27 | 
"http://${ELASTIC_HOST:-localhost:9200}/_cluster/health?wait_for_status=yellow&timeout=1s" \ 28 | || true; 29 | } 30 | 31 | # the same function but with a trailing newline 32 | function elastic_status_newline(){ echo $(elastic_status); } 33 | register 'elastic' 'status' 'HTTP status code of the elasticsearch service' elastic_status_newline 34 | 35 | function elastic_wait(){ 36 | echo 'waiting for elasticsearch service to come up'; 37 | retry_count=30 38 | 39 | i=1 40 | while [[ "$i" -le "$retry_count" ]]; do 41 | if [[ $(elastic_status) -eq 200 ]]; then 42 | echo "Elasticsearch up!" 43 | exit 0 44 | elif [[ $(elastic_status) -eq 408 ]]; then 45 | # 408 indicates the server is up but not yet yellow status 46 | printf ":" 47 | else 48 | printf "." 49 | fi 50 | sleep 1 51 | i=$(($i + 1)) 52 | done 53 | 54 | echo -e "\n" 55 | echo "Elasticsearch did not come up, check configuration" 56 | exit 1 57 | } 58 | 59 | register 'elastic' 'wait' 'wait for elasticsearch to start up' elastic_wait 60 | 61 | function elastic_info(){ curl -s "http://${ELASTIC_HOST:-localhost:9200}/"; } 62 | register 'elastic' 'info' 'display elasticsearch version and build info' elastic_info 63 | 64 | function elastic_stats(){ 65 | curl -s "http://${ELASTIC_HOST:-localhost:9200}/pelias/_search?request_cache=true&timeout=10s&pretty=true" \ 66 | -H 'Content-Type: application/json' \ 67 | -d '{ 68 | "aggs": { 69 | "sources": { 70 | "terms": { 71 | "field": "source", 72 | "size": 100 73 | }, 74 | "aggs": { 75 | "layers": { 76 | "terms": { 77 | "field": "layer", 78 | "size": 100 79 | } 80 | } 81 | } 82 | } 83 | }, 84 | "size": 0 85 | }'; 86 | } 87 | register 'elastic' 'stats' 'display a summary of doc counts per source/layer' elastic_stats 88 | -------------------------------------------------------------------------------- /cmd/import.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e; 3 | 4 | # per-source imports 5 | function import_wof(){ compose_run 'whosonfirst' './bin/start'; } 6 | function import_oa(){ compose_run 'openaddresses' "./bin/parallel ${OPENADDRESSES_PARALLELISM:-1}"; } 7 | function import_osm(){ compose_run 'openstreetmap' './bin/start'; } 8 | function import_polylines(){ compose_run 'polylines' './bin/start'; } 9 | function import_geonames(){ compose_run 'geonames' './bin/start'; } 10 | function import_transit(){ compose_run 'transit' './bin/start'; } 11 | function import_csv(){ compose_run 'csv-importer' './bin/parallel' ${CSV_PARALLELISM:-1}; } 12 | 13 | register 'import' 'wof' '(re)import whosonfirst data' import_wof 14 | register 'import' 'oa' '(re)import openaddresses data' import_oa 15 | register 'import' 'osm' '(re)import openstreetmap data' import_osm 16 | register 'import' 'polylines' '(re)import polylines data' import_polylines 17 | register 'import' 'geonames' '(re)import geonames data' import_geonames 18 | register 'import' 'transit' '(re)import transit data' import_transit 19 | register 'import' 'csv' '(re)import csv data' import_csv 20 | 21 | # import all the data to be used by imports 22 | # note: running importers in parallel can cause issues due to high CPU & RAM requirements. 
23 | function import_all(){ 24 | import_wof 25 | import_oa 26 | import_osm 27 | import_polylines 28 | 29 | if [[ "$ENABLE_GEONAMES" == "true" ]]; then 30 | import_geonames 31 | fi 32 | 33 | import_transit 34 | import_csv 35 | } 36 | 37 | register 'import' 'all' '(re)import all data' import_all 38 | -------------------------------------------------------------------------------- /cmd/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e; 3 | 4 | # per-source prepares 5 | function prepare_polylines(){ compose_run -T 'polylines' bash ./docker_extract.sh; } 6 | function prepare_interpolation(){ compose_run -T 'interpolation' bash ./docker_build.sh; } 7 | function prepare_placeholder(){ 8 | compose_run -T 'placeholder' ./cmd/extract.sh; 9 | compose_run -T 'placeholder' ./cmd/build.sh; 10 | } 11 | 12 | register 'prepare' 'polylines' 'export road network from openstreetmap into polylines format' prepare_polylines 13 | register 'prepare' 'interpolation' 'build interpolation sqlite databases' prepare_interpolation 14 | register 'prepare' 'placeholder' 'build placeholder sqlite databases' prepare_placeholder 15 | 16 | # prepare all the data to be used by imports 17 | function prepare_all(){ 18 | prepare_polylines & 19 | prepare_placeholder & 20 | wait 21 | prepare_interpolation 22 | } 23 | 24 | register 'prepare' 'all' 'build all services which have a prepare step' prepare_all 25 | -------------------------------------------------------------------------------- /cmd/system.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e; 3 | 4 | function system_check(){ env_check; } 5 | register 'system' 'check' 'ensure the system is correctly configured' system_check 6 | 7 | function system_env(){ env; } 8 | register 'system' 'env' 'display environment variables' system_env 9 | 10 | function system_update(){ git -C $(dirname "${BASH_SOURCE[0]}") pull; } 11 | register 'system' 'update' 'update the pelias command by pulling the latest version' system_update -------------------------------------------------------------------------------- /cmd/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e; 3 | 4 | # run acceptance tests 5 | function test_fuzzy(){ compose_run 'fuzzy-tester' -e 'docker' $@; } 6 | 7 | register 'test' 'run' 'run fuzzy-tester test cases' test_fuzzy -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | networks: 3 | default: 4 | driver: bridge 5 | services: 6 | nginx: 7 | image: nginx:1.26.1-alpine3.19-slim 8 | restart: unless-stopped 9 | volumes: 10 | - ./nginx.conf:/etc/nginx/conf.d/nginx.conf 11 | ports: 12 | - "80:80" 13 | - "443:443" 14 | command: "/bin/sh -c 'while :; do sleep 6h & wait $${!}; nginx -s reload; done & nginx -g \"daemon off;\"'" 15 | logging: 16 | driver: "local" 17 | options: 18 | max-size: "200m" 19 | 20 | api: 21 | image: pelias/api:v5.53.0 22 | container_name: pelias_api 23 | user: "${DOCKER_USER}" 24 | restart: always 25 | environment: [ "PORT=4000" ] 26 | ports: [ "4000:4000" ] 27 | volumes: 28 | - "./pelias.json:/code/pelias.json" 29 | logging: 30 | driver: "local" 31 | options: 32 | max-size: "200m" 33 | 34 | schema: 35 | image: pelias/schema:v6.4.0 36 | container_name: pelias_schema 37 | user: "${DOCKER_USER}" 38 | volumes: 39 | - 
"./pelias.json:/code/pelias.json" 40 | 41 | libpostal: 42 | image: pelias/libpostal-service:latest 43 | container_name: pelias_libpostal 44 | user: "${DOCKER_USER}" 45 | restart: always 46 | ports: [ "4400:4400" ] 47 | logging: 48 | driver: "local" 49 | options: 50 | max-size: "200m" 51 | 52 | csv-importer: 53 | image: pelias/csv-importer:v2.13.0 54 | container_name: pelias_csv_importer 55 | user: "${DOCKER_USER}" 56 | volumes: 57 | - "./pelias.json:/code/pelias.json" 58 | - "${DATA_DIR}:/data" 59 | 60 | whosonfirst: 61 | image: pelias/whosonfirst:v5.5.1 62 | container_name: pelias_whosonfirst 63 | user: "${DOCKER_USER}" 64 | volumes: 65 | - "./pelias.json:/code/pelias.json" 66 | - "${DATA_DIR}:/data" 67 | 68 | pip: 69 | image: pelias/pip-service:v2.2.0 70 | container_name: pelias_pip-service 71 | user: "${DOCKER_USER}" 72 | restart: always 73 | environment: ["PORT=4200"] 74 | ports: ["4200:4200"] 75 | volumes: 76 | - "./pelias.json:/code/pelias.json" 77 | - "${DATA_DIR}:/data" 78 | logging: 79 | driver: "local" 80 | options: 81 | max-size: "200m" 82 | 83 | elasticsearch: 84 | image: pelias/elasticsearch:7.16.1 85 | container_name: pelias_elasticsearch 86 | user: "${DOCKER_USER}" 87 | restart: always 88 | ports: [ "9200:9200", "9300:9300" ] 89 | environment: 90 | - "ES_JAVA_OPTS=-Xms2g -Xmx2g" 91 | volumes: 92 | - "${DATA_DIR}/elasticsearch:/usr/share/elasticsearch/data" 93 | ulimits: 94 | memlock: 95 | soft: -1 96 | hard: -1 97 | nofile: 98 | soft: 65536 99 | hard: 65536 100 | cap_add: [ "IPC_LOCK" ] 101 | logging: 102 | driver: "local" 103 | options: 104 | max-size: "200m" 105 | -------------------------------------------------------------------------------- /example.env: -------------------------------------------------------------------------------- 1 | DATA_DIR=./data 2 | DOCKER_USER=1100 3 | DO_PAT=your_digitalocean_personal_access_token 4 | PVT_KEY=/path/to/your/private_key 5 | PASSWORD=some_password_for_pelias -------------------------------------------------------------------------------- /lib/cli.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e; 3 | 4 | declare -a commands 5 | declare -a actions 6 | declare -a hints 7 | declare -a functions 8 | 9 | function register(){ 10 | commands+=("$1") 11 | actions+=("$2") 12 | hints+=("$3") 13 | functions+=("$4") 14 | } 15 | 16 | function help(){ 17 | printf 'Usage: %s [command] [action] [options]\n\n' ${0} 18 | 19 | for (( i = 0; i < ${#commands[@]}; ++i )); do 20 | echo -e " ${commands[$i]}\t${actions[$i]}\t ${hints[$i]}" 21 | done | column -ts $'\t' 22 | 23 | echo 24 | } 25 | 26 | function cli(){ 27 | cmd="${1}"; shift || true 28 | action="${1}"; shift || true 29 | valid_command=false 30 | valid_action=false 31 | 32 | for (( i = 0; i < ${#commands[@]}; ++i )); do 33 | if [ "${cmd}" = "${commands[$i]}" ]; then 34 | valid_command=true 35 | if [ "${action}" = "${actions[$i]}" ]; then 36 | valid_action=true 37 | "${functions[$i]}" "$@" 38 | exit $? 
39 | fi 40 | fi 41 | done 42 | echo 43 | 44 | [ -z "${cmd}" ] || [ "$valid_command" = true ] || printf 'invalid command "%s"\n\n' "${cmd}" 45 | [ -z "${action}" ] || [ "$valid_action" = true ] || printf 'invalid action "%s"\n\n' "${action}" 46 | help 47 | 48 | exit 1 49 | } 50 | -------------------------------------------------------------------------------- /lib/env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e; 3 | 4 | # disable verbose logging 5 | ENV_DISPLAY_WARNINGS=false 6 | 7 | # ensure docker runs containers as the current user (even when running with sudo) 8 | # note: SUDO_USER is not portable to all systems but its the best we've got. 9 | function set_docker_user(){ 10 | CURRENT_USER=$(id -u ${SUDO_USER-${USER}}):$(id -g ${SUDO_USER-${USER}}) 11 | if [ ! -z "${DOCKER_USER}" ] && [ "${DOCKER_USER}" != "${CURRENT_USER}" ]; then 12 | 2>&1 printf "WARNING: The DOCKER_USER env var is deprecated, using %s.\n" ${CURRENT_USER} 13 | 2>&1 echo "Remove the DOCKER_USER line from your .env file to silence this message." 14 | fi 15 | export DOCKER_USER="${CURRENT_USER}"; 16 | } 17 | 18 | # ensure the user environment is correctly set up 19 | function env_check(){ 20 | if [ "${DOCKER_USER}" = "0:0" ]; then 21 | echo "You are running as root" 22 | echo "This is insecure and not supported by Pelias." 23 | echo "Please try again as a non-root user." 24 | exit 1 25 | fi 26 | 27 | if [ -z "${DATA_DIR}" ]; then 28 | echo "You must set the DATA_DIR env var to a valid directory on your local machine." 29 | echo 30 | echo "Edit the '.env' file in this repository, update the DATA_DIR to a valid path and try again." 31 | echo "Alternatively, you can set the variable in your environment using a command such as 'export DATA_DIR=/tmp'." 32 | exit 1 33 | elif [ ! -d "${DATA_DIR}" ]; then 34 | printf "The directory specified by DATA_DIR does not exist: %s\n" ${DATA_DIR} 35 | echo 36 | echo "Edit the '.env' file in this repository, update the DATA_DIR to a valid path and try again." 37 | echo "Alternatively, you can set the variable in your environment using a command such as 'export DATA_DIR=/tmp'." 38 | exit 1 39 | fi 40 | } 41 | 42 | # loads environment vars from a stream (such as a file) 43 | # example: env_load_stream < .env 44 | function env_load_stream(){ 45 | [[ -n $DATA_DIR ]] && printf "DATA_DIR is already set to '$DATA_DIR' - this may cause the DATA_DIR specified in the .env to be ignored\n" 46 | while IFS='=' read -r key value; do 47 | ([ -z $key ] || [ -z $value ]) && printf 'Invalid environment var "%s=%s"\n' $key $value && exit 1 48 | if [ -z ${!key} ]; then 49 | export "${key}=${value}" 50 | elif $ENV_DISPLAY_WARNINGS; then 51 | printf '[warn] skip setting environment var "%s=%s", already set "%s=%s"\n' $key $value $key ${!key} 52 | fi 53 | done 54 | } 55 | 56 | # ensure locale is correctly set? 57 | # export LC_ALL=en_US.UTF-8 58 | 59 | # load DATA_DIR and other vars from docker-compose .env file 60 | # note: strips comments and empty lines 61 | [ -f .env ] && env_load_stream < <(grep -v '^$\|^\s*$\#' .env) 62 | 63 | # use the default compose file unless one was specified 64 | # if [ -z "${COMPOSE_FILE}" ]; then 65 | # if [ ! 
-f "docker-compose.yml" ]; then 66 | # export COMPOSE_FILE="${BASEDIR}/docker-compose.yml" 67 | # fi 68 | # fi 69 | 70 | set_docker_user 71 | 72 | # ensure the user env is correctly set up 73 | env_check 74 | -------------------------------------------------------------------------------- /nginx.conf: -------------------------------------------------------------------------------- 1 | server { 2 | listen 80 default_server; 3 | server_name geosearch.planninglabs.nyc; 4 | location / { 5 | proxy_set_header X-Real-IP $remote_addr; 6 | proxy_set_header Host labs-geosearch-docs.netlify.app; 7 | proxy_pass https://labs-geosearch-docs.netlify.app; 8 | } 9 | location /v1 { 10 | default_type application/json; 11 | return 410 '{"message": "v1 API has been permanently removed. For details on migrating to the v2 API, see https://github.com/NYCPlanning/labs-geosearch-docker/blob/master/MIGRATING.md"}'; 12 | } 13 | location /v2 { 14 | if ($request_method != GET) { 15 | return 403; 16 | } 17 | proxy_set_header X-Real-IP $remote_addr; 18 | proxy_set_header Host $http_host; 19 | # point to the Pelias API 20 | proxy_pass http://api:4000/v1; 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /pelias: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # OSX comes bundled with versions of readlink, sed, parallel etc which are not 5 | # compatible with the linux tools. Force OSX users to install the GNU 6 | # compatible versions (prefixed with 'g', such as 'greadlink', 'gsed' etc.). 7 | export CMD_READLINK='readlink' 8 | if [[ "$OSTYPE" == "darwin"* ]]; then 9 | if [ -x "$(command -v greadlink)" ]; then 10 | CMD_READLINK='greadlink'; 11 | else 12 | 2>&1 echo 'OSX: you must install the gnu standard tooling using:' 13 | 2>&1 echo 'brew install coreutils' 14 | fi 15 | fi 16 | 17 | # resolve path to this file (following symlinks) and load libs 18 | BASEDIR=$( dirname $( ${CMD_READLINK} -f "${BASH_SOURCE[0]}" ) ) 19 | for f in ${BASEDIR}/lib/* ${BASEDIR}/cmd/*; do source $f; done 20 | 21 | # cli runner 22 | cli "$@" 23 | -------------------------------------------------------------------------------- /pelias.json: -------------------------------------------------------------------------------- 1 | { 2 | "esclient": { 3 | "apiVersion": "7.5", 4 | "hosts": [ 5 | { "host": "elasticsearch" } 6 | ] 7 | }, 8 | "schema": { 9 | "indexName": "pelias" 10 | }, 11 | "api": { 12 | "services": { 13 | "libpostal": { 14 | "url": "http://libpostal:4400" 15 | }, 16 | "pip": { "url": "http://pip:4200" } 17 | }, 18 | "targets": { 19 | "auto_discover": true 20 | }, 21 | "host": "api", 22 | "indexName": "pelias" 23 | }, 24 | "imports": { 25 | "adminLookup": { 26 | "enabled": true 27 | }, 28 | "csv": { 29 | "datapath": "/data/csv", 30 | "download": [ 31 | "https://planninglabs.nyc3.digitaloceanspaces.com/geosearch-data/latest/labs-geosearch-pad-normalized.csv" 32 | ] 33 | }, 34 | "whosonfirst": { 35 | "datapath": "/data/whosonfirst", 36 | "importPostalcodes": false, 37 | "countryCode": "US", 38 | "importPlace": [ 39 | "85977539" 40 | ] 41 | } 42 | }, 43 | "logger": { 44 | "level": "http", 45 | "timestamp": true, 46 | "colorize": true 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /version.env: -------------------------------------------------------------------------------- 1 | VERSION=22a1 2 | -------------------------------------------------------------------------------- 
/wait_for_200.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | set -e;
4 | 
5 | function health_status(){
6 |   curl \
7 |     --output /dev/null \
8 |     --silent \
9 |     --write-out "%{http_code}" \
10 |     "http://$1/v2/autocomplete?text=120%20broadway" \
11 |     || true;
12 | }
13 | 
14 | function health_wait(){
15 |   echo "waiting for healthcheck at $1 to return 200";
16 |   retry_count=120
17 | 
18 |   i=1
19 |   while [[ "$i" -le "$retry_count" ]]; do
20 |     if [[ $(health_status $1) -eq 200 ]]; then
21 |       echo "Geosearch is up!"
22 |       exit 0
23 |     else
24 |       echo "Healthcheck did not return 200 status code. Trying again in 30 seconds..."
25 |     fi
26 |     sleep 30
27 |     i=$(($i + 1))
28 |   done
29 | 
30 |   echo -e "\n"
31 |   echo "Geosearch did not come up. Check cloudinit logs for details."
32 |   exit 1
33 | }
34 | 
35 | for var in "$@"; do
36 |   health_wait "$var"
37 | done
--------------------------------------------------------------------------------