├── .github └── workflows │ └── build.yml ├── .gitignore ├── CODEOWNERS ├── MIGRATING.md ├── README.md ├── cloud-config.yml ├── cmd ├── docker.sh ├── download.sh ├── elastic.sh ├── import.sh ├── prepare.sh ├── system.sh └── test.sh ├── docker-compose.yml ├── example.env ├── lib ├── cli.sh └── env.sh ├── nginx.conf ├── pelias ├── pelias.json ├── version.env └── wait_for_200.sh /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: 'Build and Deploy' 2 | on: 3 | push: 4 | branches: 5 | - 'master' 6 | workflow_dispatch: 7 | jobs: 8 | create-droplet: 9 | name: Create and provision droplet 10 | runs-on: ubuntu-24.04 11 | outputs: 12 | IPV4: ${{ steps.save.outputs.IPV4 }} 13 | env: 14 | SSH_FINGERPRINT: ${{ secrets.SSH_FINGERPRINT }} 15 | steps: 16 | - uses: actions/checkout@v2 17 | - id: install 18 | name: Install doctl 19 | uses: digitalocean/action-doctl@v2 20 | with: 21 | token: ${{ secrets.DIGITALOCEAN_ACCESS_TOKEN }} 22 | - id: create 23 | name: Create droplet 24 | run: doctl compute droplet create --enable-monitoring --image ubuntu-24-04-x64 --size s-4vcpu-8gb --region nyc3 --ssh-keys "${SSH_FINGERPRINT}" --tag-name labs --user-data-file ./cloud-config.yml --wait "geosearch-${GITHUB_RUN_ID}" 25 | - id: save 26 | name: Save IPv4 27 | run: echo "::set-output name=IPV4::$(doctl compute droplet get "geosearch-${GITHUB_RUN_ID}" --template "{{- .PublicIPv4 -}}")" 28 | healthcheck: 29 | name: Wait for healthcheck to pass 30 | runs-on: ubuntu-24.04 31 | needs: create-droplet 32 | env: 33 | IPV4: ${{needs.create-droplet.outputs.IPV4}} 34 | steps: 35 | - uses: actions/checkout@v2 36 | - id: healthcheck 37 | run: ./wait_for_200.sh "$IPV4" 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Local .terraform directories 2 | **/.terraform/* 3 | 4 | # .tfstate files 5 | *.tfstate 6 | *.tfstate.* 7 | 8 | # Crash log files 9 | crash.log 10 | 11 | # Exclude all .tfvars files, which are likely to contain sentitive data, such as 12 | # password, private keys, and other secrets. These should not be part of version 13 | # control as they are data points which are potentially sensitive and subject 14 | # to change depending on the environment. 
15 | #
16 | *.tfvars
17 | 
18 | # Ignore override files as they are usually used to override resources locally and so
19 | # are not checked in
20 | override.tf
21 | override.tf.json
22 | *_override.tf
23 | *_override.tf.json
24 | 
25 | # Include override files you do wish to add to version control using negated pattern
26 | #
27 | # !example_override.tf
28 | 
29 | # Include tfplan files to ignore the plan output of command: terraform plan -out=tfplan
30 | # example: *tfplan*
31 | 
32 | # Ignore CLI configuration files
33 | .terraformrc
34 | terraform.rc
35 | 
36 | # Ignore credential file
37 | .env
38 | 
39 | .DS_Store
40 | data/**/*
--------------------------------------------------------------------------------
/CODEOWNERS:
--------------------------------------------------------------------------------
1 | * @NYCPlanning/Engineering
2 | 
--------------------------------------------------------------------------------
/MIGRATING.md:
--------------------------------------------------------------------------------
1 | # Migrating from Geosearch v1 to v2
2 | This document outlines why we are introducing "v2" of the Geosearch API and how to migrate client applications to use it, including details on breaking changes.
3 | 
4 | > This document assumes you are somewhat familiar with the underlying open source software that powers Geosearch. If you aren't, please read through `README.md` and then come back.
5 | 
6 | ## The "why?"
7 | 1. As of December 2022, the "v1" version of the API relied on end-of-life versions of several underlying languages and open source tools.
8 | 2. The work required to update those dependencies necessitated a switch to using Pelias' official [csv-importer](https://github.com/pelias/csv-importer) for importing our custom normalized PAD data into the Pelias ElasticSearch database.
9 | 3. When using that importer, the arbitrary data we attach to each record, such as BBL and BIN, is automatically nested within each feature's `properties` object in an object called `addendum`. Because this data is kept in a different property in "v1", we had to introduce minor breaking changes to the responses returned by the Geosearch API.
10 | 
11 | ## Breaking changes
12 | 1. The paths to the API endpoints are the same aside from switching `/v1` to `/v2`. For instance, `https://geosearch.planninglabs.nyc/v1/autocomplete?text=120%20broadway` becomes `https://geosearch.planninglabs.nyc/v2/autocomplete?text=120%20broadway`.
13 | 2. The "custom" data we add to each record is now found under `addendum` in each GeoJSON feature's `properties` object, under new keys. For instance, to get at the BBL for a particular feature, you would access `feature.properties.addendum.pad.bbl` instead of `feature.properties.pad_bbl`. You will also notice that some extraneous properties such as `pad_orig_stname` have been removed for brevity. Examples of the old and new response objects are included below for reference. If you are a user of Geosearch and have any questions regarding this migration, please reach out to OpenSource_DL@planning.nyc.gov.
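Before the full response examples below, here is a minimal sketch of the client-side change using `curl` and `jq`. This snippet is illustrative only and is not part of the repo; it assumes `jq` is installed and a Geosearch instance is reachable on `localhost`.

```
# v1 (old): PAD attributes sat directly on properties, e.g. .features[0].properties.pad_bbl
# v2 (new): the same value is nested under properties.addendum.pad
curl -s "http://localhost/v2/autocomplete?text=120%20broadway" \
  | jq -r '.features[0].properties.addendum.pad.bbl'
```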
14 | 15 | #### Old 16 | ``` 17 | { 18 | "type": "Feature", 19 | "geometry": { 20 | "type": "Point", 21 | "coordinates": [ 22 | -74.01054, 23 | 40.708225 24 | ] 25 | }, 26 | "properties": { 27 | "id": "3945", 28 | "gid": "nycpad:address:3945", 29 | "layer": "address", 30 | "source": "nycpad", 31 | "source_id": "3945", 32 | "name": "120 BROADWAY", 33 | "housenumber": "120", 34 | "street": "BROADWAY", 35 | "postalcode": "10271", 36 | "accuracy": "point", 37 | "country": "United States", 38 | "country_gid": "whosonfirst:country:85633793", 39 | "country_a": "USA", 40 | "region": "New York State", 41 | "region_gid": "whosonfirst:region:0", 42 | "region_a": "NY", 43 | "county": "New York County", 44 | "county_gid": "whosonfirst:county:061", 45 | "locality": "New York", 46 | "locality_gid": "whosonfirst:locality:0", 47 | "locality_a": "NYC", 48 | "borough": "Manhattan", 49 | "borough_gid": "whosonfirst:borough:1", 50 | "label": "120 BROADWAY, Manhattan, New York, NY, USA", 51 | "pad_low": "104", 52 | "pad_high": "124", 53 | "pad_bin": "1001026", 54 | "pad_bbl": "1000477501", 55 | "pad_geomtype": "bin", 56 | "pad_orig_stname": "BROADWAY" 57 | } 58 | } 59 | ``` 60 | 61 | #### New 62 | ``` 63 | { 64 | "type": "Feature", 65 | "geometry": { 66 | "type": "Point", 67 | "coordinates": [ 68 | -74.01052, 69 | 40.70822 70 | ] 71 | }, 72 | "properties": { 73 | "id": "3892", 74 | "gid": "nycpad:venue:3892", 75 | "layer": "venue", 76 | "source": "nycpad", 77 | "source_id": "3892", 78 | "country_code": "US", 79 | "name": "120 BROADWAY", 80 | "housenumber": "120", 81 | "street": "BROADWAY", 82 | "postalcode": "10271", 83 | "accuracy": "point", 84 | "country": "United States", 85 | "country_gid": "whosonfirst:country:85633793", 86 | "country_a": "USA", 87 | "region": "New York", 88 | "region_gid": "whosonfirst:region:85688543", 89 | "region_a": "NY", 90 | "county": "New York County", 91 | "county_gid": "whosonfirst:county:102081863", 92 | "locality": "New York", 93 | "locality_gid": "whosonfirst:locality:85977539", 94 | "locality_a": "NYC", 95 | "borough": "Manhattan", 96 | "borough_gid": "whosonfirst:borough:421205771", 97 | "neighbourhood": "Financial District", 98 | "neighbourhood_gid": "whosonfirst:neighbourhood:85865711", 99 | "label": "120 BROADWAY, New York, NY, USA", 100 | "addendum": { 101 | "pad": { 102 | "bbl": "1000477501", 103 | "bin": "1001026" 104 | } 105 | } 106 | } 107 | }, 108 | ``` -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Docker Compose project for NYC Geosearch service, built on the open source [Pelias](https://github.com/pelias/pelias) geocoder and [NYC's Property Address Directory (PAD)](https://www1.nyc.gov/site/planning/data-maps/open-data.page) 2 | 3 | ## Overview 4 | 5 | - [About](#about) 6 | - [Config-Driven](#config-driven) 7 | - [Pelias CLI tool](pelias-cli-tool) 8 | - [Running Geosearch Locally](#running-geosearch-locally) 9 | - [Deployment](#redeploying-geosearch-for-quarterly-data-updates) 10 | - [How exactly to deployments work?](#how-exactly-to-deployments-work) 11 | 12 | ## About 13 | 14 | This repo serves as "home base" for the GeoSearch project, as the docker compose project orchestrates a functioning set up. 
Other relevant code for our Pelias deployment:
15 | 
16 | - [geosearch-pad-normalize](https://github.com/NYCPlanning/labs-geosearch-pad-normalize) - an R script that ingests and transforms raw Property Address Directory (PAD) data, most significantly interpolating valid address ranges. This repo outputs a CSV that conforms to the data schema required by Pelias' official [CSV Importer](https://github.com/pelias/csv-importer). Note that this repo used to output data meant to be ingested by the now deprecated [PAD Importer](https://github.com/NYCPlanning/labs-geosearch-pad-importer) project.
17 | - [geosearch-docs](https://github.com/NYCPlanning/labs-geosearch-docs) - an interactive documentation site for the Geosearch API
18 | 
19 | Docker Compose allows us to quickly spin up the pelias services we need, and run scripts manually in the containers. It also makes use of volumes and internal hostnames so the various services can communicate with each other. The contents of `docker-compose.yml` are based on code from the [pelias/docker](https://github.com/pelias/docker) project.
20 | 
21 | > The one service in `docker-compose.yml` that did not come from the `pelias/docker` project is `nginx`. We added a simple [nginx](https://nginx.org/en/) server that uses the contents of `nginx.conf` as a reverse proxy, directing traffic either to the Geosearch docs [website](https://github.com/NYCPlanning/labs-geosearch-docs) or to the Pelias API itself.
22 | 
23 | For more information on Pelias services, including many which we are not using here at City Planning, check out the `pelias/docker` project or their [documentation](https://github.com/pelias/documentation).
24 | 
25 | ## Config-Driven
26 | 
27 | Much of this environment is config-driven, and the two files you should pay attention to are:
28 | 
29 | - [docker-compose.yml](https://github.com/NYCPlanning/labs-geosearch-docker/blob/master/docker-compose.yml) - configurations for each of the named services, including images to use, environment variable definitions, and volume mounts.
30 | - [pelias.json](https://github.com/NYCPlanning/labs-geosearch-docker/blob/master/pelias.json) - a shared config file used by all of the pelias services.
31 | 
32 | ## Pelias CLI tool
33 | 
34 | All steps needed to get an instance of Geosearch up and running are encapsulated within commands that can be run via the `pelias` CLI tool included in this repo. This CLI tool consists of the file `pelias` at the root of this repo, as well as the files found in `/lib` and `/cmd`. All of these files were taken directly from [this Pelias repo](https://github.com/pelias/docker) outlining how to run Pelias via docker and docker-compose. **Note that these files are up to date with that Pelias repo as of December 2022, but changes to that repo will not be automatically reflected in this repo.** If you would like to set up the CLI locally, see the docs in the `pelias/docker` repo.
35 | 
36 | > If you are having trouble setting up the CLI, or would just prefer not to add an entry to your `$PATH`, you should be able to call the file at `./pelias` directly. To do this when running the commands in the "Running Geosearch Locally" section below, just replace `pelias` with `./pelias` in the commands. For instance, `pelias compose pull` becomes `./pelias compose pull`.
37 | 
38 | ## Running Geosearch Locally
39 | 
40 | You can run Geosearch locally using the included `pelias` CLI and `docker-compose.yml` file. The following instructions assume that you have set up the Pelias CLI locally and have docker and docker-compose installed on your machine.
41 | 
42 | Run these commands from the root directory of this repo:
43 | 
44 | First, create the requisite folders for the docker volumes. Note that the `./data` folder and its contents will be gitignored.
45 | ```
46 | mkdir -p data/elasticsearch data/csv data/whosonfirst
47 | ```
48 | 
49 | Create a `.env` file and set the `DATA_DIR` environment variable for Pelias.
50 | ```
51 | echo "DATA_DIR=$(pwd)/data" > .env
52 | ```
53 | 
54 | Pull the images.
55 | ```
56 | pelias compose pull
57 | ```
58 | 
59 | Start the Elasticsearch service.
60 | ```
61 | pelias elastic start
62 | ```
63 | 
64 | Wait for it to come up. **This may take longer than the timeout period built into the pelias CLI. If you get a message saying Elasticsearch did not come up, try running this command a few times to see if you get the "Elasticsearch up!" message eventually.**
65 | ```
66 | pelias elastic wait
67 | ```
68 | 
69 | Create the index in Elasticsearch.
70 | ```
71 | pelias elastic create
72 | ```
73 | 
74 | Download the required Who's On First dataset.
75 | ```
76 | pelias download wof
77 | ```
78 | 
79 | Download the normalized PAD CSV.
80 | ```
81 | pelias download csv
82 | ```
83 | 
84 | Import the normalized PAD data into the Elasticsearch datastore. This will likely take a while.
85 | ```
86 | pelias import csv
87 | ```
88 | 
89 | Bring up the rest of the necessary docker services, including the Pelias API and nginx server.
90 | ```
91 | pelias compose up
92 | ```
93 | 
94 | To confirm that everything is up and running, you can try to hit the API. For instance, a `GET` call to `http://localhost/v2/autocomplete?text=120%20broadway` should return results for 120 Broadway.
95 | 
96 | ## Redeploying Geosearch for Quarterly Data Updates
97 | 
98 | > The following section is only relevant to members of DCP's Open Source Engineering team responsible for maintaining Geosearch.
99 | 
100 | When a new quarterly update of PAD becomes available on Bytes of the Big Apple:
101 | 
102 | 1. Head to [geosearch-pad-normalize](https://github.com/NYCPlanning/labs-geosearch-pad-normalize) and perform the process outlined there for building a new version of the normalized PAD data. Once you have merged a pull request into the `main` branch of that repo, you can monitor the progress of building and uploading the new data in the [actions for that repo](https://github.com/NYCPlanning/labs-geosearch-pad-normalize/actions). This will produce the latest version of normalized PAD and upload the new CSV file to the correct DigitalOcean Space.
103 | 
104 | 2. Confirm that the CSV output by geosearch-pad-normalize has been uploaded to the "latest" folder in Digital Ocean. You can see the exact URL that this repo will attempt to download the data from by looking at the value in `imports.csv.download` in `pelias.json`. **Note that you should not have to make changes to `pelias.json` in order to do data updates.**
105 | 
106 | 3. Run the "Build and Deploy" GH Action workflow. This workflow runs automatically on pushes to `main`. However, if you are only trying to deploy a new instance of Geosearch with a new version of PAD, you should not need to make any code changes to this repo, so the workflow can also be run manually. To do that, go to the "Actions" tab in the repo and select the "Build and Deploy" workflow from the list on the left-hand side. Then select "Run workflow" with the `main` branch selected.
107 | 
108 | 4. The workflow will create the new Droplet in Digital Ocean and run the commands in `cloud-config.yml`. This will initialize all of the containers in `docker-compose.yml`, download the PAD data, and import it into Pelias' ElasticSearch database. Finally, the workflow will run `wait_for_200.sh` every 30 seconds for up to 1 hour, so that the workflow ends with a successful status if and when your new Geosearch instance is up and ready to start receiving traffic.
109 | 
110 | > As of December 2022, it typically takes about 30-45 minutes for the droplet to be created and for the services to fully reach a "healthy" status with all of the data loaded in. In some cases, it is possible that the GH Action job that runs `wait_for_200.sh` will finish "successfully" even though there was a failure. If that job finishes successfully much more quickly than we would expect, manually test the `/v2/autocomplete` endpoint to make sure the normalized PAD data was properly loaded before going to production.
111 | 
112 | 5. Once the workflow finishes successfully, you should see a new geosearch droplet in Digital Ocean. You can verify that it is working properly by sending requests to its public IPv4 address. Traffic to the production geosearch URL (https://geosearch.planninglabs.nyc/) is sent to the IP associated with the "geosearch" load balancer. To put your new droplet in production, simply add it to that load balancer, remove the old droplet from the load balancer, and then delete the old droplet.
113 | 
114 | ## How exactly do deployments work?
115 | 
116 | > The following explains what happens when we deploy a new Droplet running the code in this repo to Digital Ocean. If you are only trying to deploy a new instance of Geosearch with a new version of PAD data, everything you need should be covered in the "Deployment" section above.
117 | 
118 | Deployments are primarily handled by two files: `/.github/workflows/build.yml` and `cloud-config.yml`. The "Build and Deploy" workflow in `build.yml` is run manually or triggered by pushes to the `main` branch (note that merging PRs into main constitutes a push). This workflow is responsible for a few things:
119 | 1. It uses `doctl` to create a new droplet. It will add an SSH public key saved in DO to that Droplet and tag it with `labs`. It will also point DO to the `cloud-config.yml` file for cloud-init to use when provisioning the droplet.
120 | 2. Once the droplet is up, it will use the script in `wait_for_200.sh` to wait for the droplet to be healthy. In this scenario, healthy is defined as having all Geosearch services up and ready to accept traffic. This can take a while, primarily due to the time it takes to download the normalized PAD CSV and import it into the ElasticSearch datastore.
121 | 
122 | Spinning up the services defined in `docker-compose.yml` and downloading and importing data is done via the tool [cloud-init](https://cloudinit.readthedocs.io/en/latest/). cloud-init uses the contents of `cloud-config.yml` to do the following:
123 | 1. Create a new sudo user called `pelias` on the new droplet. This is necessary because, following best practice, the Pelias CLI tool cannot be run as the `root` system user. It will assign this user to the correct groups and add the included public SSH key to it.
124 | 2. Disable root access. As a security measure, logging into the droplet as `root` will be disabled once it is initialized.
125 | 3. Install the `docker` and `docker-compose` packages.
126 | 4. 
Bring up Geosearch by running the commands under `runcmd`. Note that even though `cloud-config.yml` creates the pelias user, the commands in `runcmd` are executed **as root**. Most of these commands use `runuser` to execute commands as the pelias user. 127 | 128 | > If you find yourself needing to ssh into a deployed Geosearch droplet, please see your team lead for additional instructions. 129 | -------------------------------------------------------------------------------- /cloud-config.yml: -------------------------------------------------------------------------------- 1 | #cloud-config 2 | users: 3 | - name: pelias 4 | ssh-authorized-keys: 5 | - ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQCwHcIqMPtGJJT8xtjnuMBf4K/XGifjUb9yv/Dn2GzB5W7BBiOduHV87mg0Up6PoEC0+naOB++Elt5a+ufiIP+Lc5H7mN/X9MX68MJgIR9kRBTxuwP7FKzohHHJesbzpyZEMHuI6nTeC0NO1donF/L2oelU1O3Vqbr7vjPLY8QYu5Ra9Dcryinjwx1b3kk3jysZ3IqEkJ7ye364d2CtougoHtH+j4Xi+FhPVdbJvzLowBGGuohVcwsfm+lU122zRHwgeM+W2OR/QUsch1TDbA3GnwNOQSz8TSl+mYoEBcF00GenkskrE11Jyvq0e01lfgzBseL7kal6THcgq/YjNDRt3hpyvwCrF22jgcw7hlHyz4Dnwe4u6ua8u4maoYdm1f01MAU2UnEssO63tJ401RE8VzvOS2zHgC2a3MR2uPbxpVOvusalJJuuShxqrfB04XYX7QPpzcDSfOatc7LFgT75Ipr/qNLFda/UAEuXMNFT8gbis+m+tSaTE+eAyh14PzM= 6 | groups: sudo, docker 7 | shell: /bin/bash 8 | sudo: ['ALL=(ALL) NOPASSWD:ALL'] 9 | disable_root: true 10 | package-update: true 11 | package-upgrade: true 12 | packages: 13 | - docker 14 | - docker-compose 15 | runcmd: 16 | - 'runuser -l pelias -c "mkdir /home/pelias/geosearch"' 17 | - 'runuser -l pelias -c "cd /home/pelias/geosearch && git clone https://github.com/NYCPlanning/labs-geosearch-docker.git ."' 18 | - 'runuser -l pelias -c "cd /home/pelias/geosearch && mkdir -p data/elasticsearch data/csv data/whosonfirst"' 19 | - 'runuser -l pelias -c "cd /home/pelias/geosearch && echo "DATA_DIR=/home/pelias/geosearch/data" > .env"' 20 | - 'runuser -l pelias -c "cd /home/pelias/geosearch && ./pelias compose pull"' 21 | - 'runuser -l pelias -c "cd /home/pelias/geosearch && ./pelias elastic start"' 22 | - 'runuser -l pelias -c "cd /home/pelias/geosearch && ./pelias elastic wait"' 23 | - 'runuser -l pelias -c "cd /home/pelias/geosearch && ./pelias elastic create"' 24 | - 'runuser -l pelias -c "cd /home/pelias/geosearch && ./pelias download wof"' 25 | - 'runuser -l pelias -c "cd /home/pelias/geosearch && ./pelias download csv"' 26 | - 'runuser -l pelias -c "cd /home/pelias/geosearch && ./pelias import csv"' 27 | - 'runuser -l pelias -c "cd /home/pelias/geosearch && ./pelias compose up"' 28 | -------------------------------------------------------------------------------- /cmd/docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e; 3 | 4 | function net_init(){ 5 | docker network create ${COMPOSE_PROJECT_NAME}_default &>/dev/null || true 6 | } 7 | 8 | function compose_pull(){ compose_exec pull; } 9 | register 'compose' 'pull' 'update all docker images' compose_pull 10 | 11 | function compose_logs(){ compose_exec logs $@; } 12 | register 'compose' 'logs' 'display container logs' compose_logs 13 | 14 | function compose_ps(){ compose_exec ps $@; } 15 | register 'compose' 'ps' 'list containers' compose_ps 16 | 17 | function compose_top(){ compose_exec top $@; } 18 | register 'compose' 'top' 'display the running processes of a container' compose_top 19 | 20 | function compose_exec(){ docker-compose $@; } 21 | register 'compose' 'exec' 'execute an arbitrary docker-compose command' compose_exec 22 | 23 | function compose_run(){ net_init; 
docker-compose run --rm $@; } 24 | register 'compose' 'run' 'execute a docker-compose run command' compose_run 25 | 26 | function compose_up(){ docker-compose up -d $@; } 27 | register 'compose' 'up' 'start one or more docker-compose service(s)' compose_up 28 | 29 | function compose_kill(){ docker-compose kill $@; } 30 | register 'compose' 'kill' 'kill one or more docker-compose service(s)' compose_kill 31 | 32 | function compose_down(){ docker-compose down; } 33 | register 'compose' 'down' 'stop all docker-compose service(s)' compose_down 34 | 35 | -------------------------------------------------------------------------------- /cmd/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e; 3 | 4 | # per-source downloads 5 | function download_wof(){ compose_run -T 'whosonfirst' './bin/download'; } 6 | function download_oa(){ compose_run -T 'openaddresses' './bin/download'; } 7 | function download_osm(){ compose_run -T 'openstreetmap' './bin/download'; } 8 | function download_geonames(){ compose_run -T 'geonames' './bin/download'; } 9 | function download_tiger(){ compose_run -T 'interpolation' './bin/download-tiger'; } 10 | function download_transit(){ compose_run -T 'transit' './bin/download'; } 11 | function download_csv(){ compose_run -T 'csv-importer' './bin/download'; } 12 | 13 | register 'download' 'wof' '(re)download whosonfirst data' download_wof 14 | register 'download' 'oa' '(re)download openaddresses data' download_oa 15 | register 'download' 'osm' '(re)download openstreetmap data' download_osm 16 | register 'download' 'geonames' '(re)download geonames data' download_geonames 17 | register 'download' 'tiger' '(re)download TIGER data' download_tiger 18 | register 'download' 'transit' '(re)download transit data' download_transit 19 | register 'download' 'csv' '(re)download csv data' download_csv 20 | 21 | # download all the data to be used by imports 22 | function download_all(){ 23 | download_wof & 24 | download_oa & 25 | download_osm & 26 | 27 | if [[ "$ENABLE_GEONAMES" == "true" ]]; then 28 | download_geonames & 29 | fi 30 | 31 | download_tiger & 32 | download_transit & 33 | download_csv & 34 | wait 35 | } 36 | 37 | register 'download' 'all' '(re)download all data' download_all 38 | -------------------------------------------------------------------------------- /cmd/elastic.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e; 3 | 4 | function elastic_schema_drop(){ compose_run 'schema' node scripts/drop_index "$@" || true; } 5 | function elastic_schema_create(){ compose_run 'schema' ./bin/create_index; } 6 | function elastic_start(){ 7 | mkdir -p $DATA_DIR/elasticsearch 8 | # attemp to set proper permissions if running as root 9 | chown $DOCKER_USER $DATA_DIR/elasticsearch 2>/dev/null || true 10 | compose_exec up -d elasticsearch 11 | } 12 | 13 | function elastic_stop(){ compose_exec kill elasticsearch; } 14 | 15 | register 'elastic' 'drop' 'delete elasticsearch index & all data' elastic_schema_drop 16 | register 'elastic' 'create' 'create elasticsearch index with pelias mapping' elastic_schema_create 17 | register 'elastic' 'start' 'start elasticsearch server' elastic_start 18 | register 'elastic' 'stop' 'stop elasticsearch server' elastic_stop 19 | 20 | # to use this function: 21 | # if test $(elastic_status) -ne 200; then 22 | function elastic_status(){ 23 | curl \ 24 | --output /dev/null \ 25 | --silent \ 26 | --write-out "%{http_code}" \ 27 | 
"http://${ELASTIC_HOST:-localhost:9200}/_cluster/health?wait_for_status=yellow&timeout=1s" \ 28 | || true; 29 | } 30 | 31 | # the same function but with a trailing newline 32 | function elastic_status_newline(){ echo $(elastic_status); } 33 | register 'elastic' 'status' 'HTTP status code of the elasticsearch service' elastic_status_newline 34 | 35 | function elastic_wait(){ 36 | echo 'waiting for elasticsearch service to come up'; 37 | retry_count=30 38 | 39 | i=1 40 | while [[ "$i" -le "$retry_count" ]]; do 41 | if [[ $(elastic_status) -eq 200 ]]; then 42 | echo "Elasticsearch up!" 43 | exit 0 44 | elif [[ $(elastic_status) -eq 408 ]]; then 45 | # 408 indicates the server is up but not yet yellow status 46 | printf ":" 47 | else 48 | printf "." 49 | fi 50 | sleep 1 51 | i=$(($i + 1)) 52 | done 53 | 54 | echo -e "\n" 55 | echo "Elasticsearch did not come up, check configuration" 56 | exit 1 57 | } 58 | 59 | register 'elastic' 'wait' 'wait for elasticsearch to start up' elastic_wait 60 | 61 | function elastic_info(){ curl -s "http://${ELASTIC_HOST:-localhost:9200}/"; } 62 | register 'elastic' 'info' 'display elasticsearch version and build info' elastic_info 63 | 64 | function elastic_stats(){ 65 | curl -s "http://${ELASTIC_HOST:-localhost:9200}/pelias/_search?request_cache=true&timeout=10s&pretty=true" \ 66 | -H 'Content-Type: application/json' \ 67 | -d '{ 68 | "aggs": { 69 | "sources": { 70 | "terms": { 71 | "field": "source", 72 | "size": 100 73 | }, 74 | "aggs": { 75 | "layers": { 76 | "terms": { 77 | "field": "layer", 78 | "size": 100 79 | } 80 | } 81 | } 82 | } 83 | }, 84 | "size": 0 85 | }'; 86 | } 87 | register 'elastic' 'stats' 'display a summary of doc counts per source/layer' elastic_stats 88 | -------------------------------------------------------------------------------- /cmd/import.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e; 3 | 4 | # per-source imports 5 | function import_wof(){ compose_run 'whosonfirst' './bin/start'; } 6 | function import_oa(){ compose_run 'openaddresses' "./bin/parallel ${OPENADDRESSES_PARALLELISM:-1}"; } 7 | function import_osm(){ compose_run 'openstreetmap' './bin/start'; } 8 | function import_polylines(){ compose_run 'polylines' './bin/start'; } 9 | function import_geonames(){ compose_run 'geonames' './bin/start'; } 10 | function import_transit(){ compose_run 'transit' './bin/start'; } 11 | function import_csv(){ compose_run 'csv-importer' './bin/parallel' ${CSV_PARALLELISM:-1}; } 12 | 13 | register 'import' 'wof' '(re)import whosonfirst data' import_wof 14 | register 'import' 'oa' '(re)import openaddresses data' import_oa 15 | register 'import' 'osm' '(re)import openstreetmap data' import_osm 16 | register 'import' 'polylines' '(re)import polylines data' import_polylines 17 | register 'import' 'geonames' '(re)import geonames data' import_geonames 18 | register 'import' 'transit' '(re)import transit data' import_transit 19 | register 'import' 'csv' '(re)import csv data' import_csv 20 | 21 | # import all the data to be used by imports 22 | # note: running importers in parallel can cause issues due to high CPU & RAM requirements. 
23 | function import_all(){ 24 | import_wof 25 | import_oa 26 | import_osm 27 | import_polylines 28 | 29 | if [[ "$ENABLE_GEONAMES" == "true" ]]; then 30 | import_geonames 31 | fi 32 | 33 | import_transit 34 | import_csv 35 | } 36 | 37 | register 'import' 'all' '(re)import all data' import_all 38 | -------------------------------------------------------------------------------- /cmd/prepare.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e; 3 | 4 | # per-source prepares 5 | function prepare_polylines(){ compose_run -T 'polylines' bash ./docker_extract.sh; } 6 | function prepare_interpolation(){ compose_run -T 'interpolation' bash ./docker_build.sh; } 7 | function prepare_placeholder(){ 8 | compose_run -T 'placeholder' ./cmd/extract.sh; 9 | compose_run -T 'placeholder' ./cmd/build.sh; 10 | } 11 | 12 | register 'prepare' 'polylines' 'export road network from openstreetmap into polylines format' prepare_polylines 13 | register 'prepare' 'interpolation' 'build interpolation sqlite databases' prepare_interpolation 14 | register 'prepare' 'placeholder' 'build placeholder sqlite databases' prepare_placeholder 15 | 16 | # prepare all the data to be used by imports 17 | function prepare_all(){ 18 | prepare_polylines & 19 | prepare_placeholder & 20 | wait 21 | prepare_interpolation 22 | } 23 | 24 | register 'prepare' 'all' 'build all services which have a prepare step' prepare_all 25 | -------------------------------------------------------------------------------- /cmd/system.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e; 3 | 4 | function system_check(){ env_check; } 5 | register 'system' 'check' 'ensure the system is correctly configured' system_check 6 | 7 | function system_env(){ env; } 8 | register 'system' 'env' 'display environment variables' system_env 9 | 10 | function system_update(){ git -C $(dirname "${BASH_SOURCE[0]}") pull; } 11 | register 'system' 'update' 'update the pelias command by pulling the latest version' system_update -------------------------------------------------------------------------------- /cmd/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e; 3 | 4 | # run acceptance tests 5 | function test_fuzzy(){ compose_run 'fuzzy-tester' -e 'docker' $@; } 6 | 7 | register 'test' 'run' 'run fuzzy-tester test cases' test_fuzzy -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | networks: 3 | default: 4 | driver: bridge 5 | services: 6 | nginx: 7 | image: nginx:1.26.1-alpine3.19-slim 8 | restart: unless-stopped 9 | volumes: 10 | - ./nginx.conf:/etc/nginx/conf.d/nginx.conf 11 | ports: 12 | - "80:80" 13 | - "443:443" 14 | command: "/bin/sh -c 'while :; do sleep 6h & wait $${!}; nginx -s reload; done & nginx -g \"daemon off;\"'" 15 | logging: 16 | driver: "local" 17 | options: 18 | max-size: "200m" 19 | 20 | api: 21 | image: pelias/api:v5.53.0 22 | container_name: pelias_api 23 | user: "${DOCKER_USER}" 24 | restart: always 25 | environment: [ "PORT=4000" ] 26 | ports: [ "4000:4000" ] 27 | volumes: 28 | - "./pelias.json:/code/pelias.json" 29 | logging: 30 | driver: "local" 31 | options: 32 | max-size: "200m" 33 | 34 | schema: 35 | image: pelias/schema:v6.4.0 36 | container_name: pelias_schema 37 | user: "${DOCKER_USER}" 38 | volumes: 39 | - 
"./pelias.json:/code/pelias.json" 40 | 41 | libpostal: 42 | image: pelias/libpostal-service:latest 43 | container_name: pelias_libpostal 44 | user: "${DOCKER_USER}" 45 | restart: always 46 | ports: [ "4400:4400" ] 47 | logging: 48 | driver: "local" 49 | options: 50 | max-size: "200m" 51 | 52 | csv-importer: 53 | image: pelias/csv-importer:v2.13.0 54 | container_name: pelias_csv_importer 55 | user: "${DOCKER_USER}" 56 | volumes: 57 | - "./pelias.json:/code/pelias.json" 58 | - "${DATA_DIR}:/data" 59 | 60 | whosonfirst: 61 | image: pelias/whosonfirst:v5.5.1 62 | container_name: pelias_whosonfirst 63 | user: "${DOCKER_USER}" 64 | volumes: 65 | - "./pelias.json:/code/pelias.json" 66 | - "${DATA_DIR}:/data" 67 | 68 | pip: 69 | image: pelias/pip-service:v2.2.0 70 | container_name: pelias_pip-service 71 | user: "${DOCKER_USER}" 72 | restart: always 73 | environment: ["PORT=4200"] 74 | ports: ["4200:4200"] 75 | volumes: 76 | - "./pelias.json:/code/pelias.json" 77 | - "${DATA_DIR}:/data" 78 | logging: 79 | driver: "local" 80 | options: 81 | max-size: "200m" 82 | 83 | elasticsearch: 84 | image: pelias/elasticsearch:7.16.1 85 | container_name: pelias_elasticsearch 86 | user: "${DOCKER_USER}" 87 | restart: always 88 | ports: [ "9200:9200", "9300:9300" ] 89 | environment: 90 | - "ES_JAVA_OPTS=-Xms2g -Xmx2g" 91 | volumes: 92 | - "${DATA_DIR}/elasticsearch:/usr/share/elasticsearch/data" 93 | ulimits: 94 | memlock: 95 | soft: -1 96 | hard: -1 97 | nofile: 98 | soft: 65536 99 | hard: 65536 100 | cap_add: [ "IPC_LOCK" ] 101 | logging: 102 | driver: "local" 103 | options: 104 | max-size: "200m" 105 | -------------------------------------------------------------------------------- /example.env: -------------------------------------------------------------------------------- 1 | DATA_DIR=./data 2 | DOCKER_USER=1100 3 | DO_PAT=your_digitalocean_personal_access_token 4 | PVT_KEY=/path/to/your/private_key 5 | PASSWORD=some_password_for_pelias -------------------------------------------------------------------------------- /lib/cli.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e; 3 | 4 | declare -a commands 5 | declare -a actions 6 | declare -a hints 7 | declare -a functions 8 | 9 | function register(){ 10 | commands+=("$1") 11 | actions+=("$2") 12 | hints+=("$3") 13 | functions+=("$4") 14 | } 15 | 16 | function help(){ 17 | printf 'Usage: %s [command] [action] [options]\n\n' ${0} 18 | 19 | for (( i = 0; i < ${#commands[@]}; ++i )); do 20 | echo -e " ${commands[$i]}\t${actions[$i]}\t ${hints[$i]}" 21 | done | column -ts $'\t' 22 | 23 | echo 24 | } 25 | 26 | function cli(){ 27 | cmd="${1}"; shift || true 28 | action="${1}"; shift || true 29 | valid_command=false 30 | valid_action=false 31 | 32 | for (( i = 0; i < ${#commands[@]}; ++i )); do 33 | if [ "${cmd}" = "${commands[$i]}" ]; then 34 | valid_command=true 35 | if [ "${action}" = "${actions[$i]}" ]; then 36 | valid_action=true 37 | "${functions[$i]}" "$@" 38 | exit $? 
39 | fi 40 | fi 41 | done 42 | echo 43 | 44 | [ -z "${cmd}" ] || [ "$valid_command" = true ] || printf 'invalid command "%s"\n\n' "${cmd}" 45 | [ -z "${action}" ] || [ "$valid_action" = true ] || printf 'invalid action "%s"\n\n' "${action}" 46 | help 47 | 48 | exit 1 49 | } 50 | -------------------------------------------------------------------------------- /lib/env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e; 3 | 4 | # disable verbose logging 5 | ENV_DISPLAY_WARNINGS=false 6 | 7 | # ensure docker runs containers as the current user (even when running with sudo) 8 | # note: SUDO_USER is not portable to all systems but its the best we've got. 9 | function set_docker_user(){ 10 | CURRENT_USER=$(id -u ${SUDO_USER-${USER}}):$(id -g ${SUDO_USER-${USER}}) 11 | if [ ! -z "${DOCKER_USER}" ] && [ "${DOCKER_USER}" != "${CURRENT_USER}" ]; then 12 | 2>&1 printf "WARNING: The DOCKER_USER env var is deprecated, using %s.\n" ${CURRENT_USER} 13 | 2>&1 echo "Remove the DOCKER_USER line from your .env file to silence this message." 14 | fi 15 | export DOCKER_USER="${CURRENT_USER}"; 16 | } 17 | 18 | # ensure the user environment is correctly set up 19 | function env_check(){ 20 | if [ "${DOCKER_USER}" = "0:0" ]; then 21 | echo "You are running as root" 22 | echo "This is insecure and not supported by Pelias." 23 | echo "Please try again as a non-root user." 24 | exit 1 25 | fi 26 | 27 | if [ -z "${DATA_DIR}" ]; then 28 | echo "You must set the DATA_DIR env var to a valid directory on your local machine." 29 | echo 30 | echo "Edit the '.env' file in this repository, update the DATA_DIR to a valid path and try again." 31 | echo "Alternatively, you can set the variable in your environment using a command such as 'export DATA_DIR=/tmp'." 32 | exit 1 33 | elif [ ! -d "${DATA_DIR}" ]; then 34 | printf "The directory specified by DATA_DIR does not exist: %s\n" ${DATA_DIR} 35 | echo 36 | echo "Edit the '.env' file in this repository, update the DATA_DIR to a valid path and try again." 37 | echo "Alternatively, you can set the variable in your environment using a command such as 'export DATA_DIR=/tmp'." 38 | exit 1 39 | fi 40 | } 41 | 42 | # loads environment vars from a stream (such as a file) 43 | # example: env_load_stream < .env 44 | function env_load_stream(){ 45 | [[ -n $DATA_DIR ]] && printf "DATA_DIR is already set to '$DATA_DIR' - this may cause the DATA_DIR specified in the .env to be ignored\n" 46 | while IFS='=' read -r key value; do 47 | ([ -z $key ] || [ -z $value ]) && printf 'Invalid environment var "%s=%s"\n' $key $value && exit 1 48 | if [ -z ${!key} ]; then 49 | export "${key}=${value}" 50 | elif $ENV_DISPLAY_WARNINGS; then 51 | printf '[warn] skip setting environment var "%s=%s", already set "%s=%s"\n' $key $value $key ${!key} 52 | fi 53 | done 54 | } 55 | 56 | # ensure locale is correctly set? 57 | # export LC_ALL=en_US.UTF-8 58 | 59 | # load DATA_DIR and other vars from docker-compose .env file 60 | # note: strips comments and empty lines 61 | [ -f .env ] && env_load_stream < <(grep -v '^$\|^\s*$\#' .env) 62 | 63 | # use the default compose file unless one was specified 64 | # if [ -z "${COMPOSE_FILE}" ]; then 65 | # if [ ! 
-f "docker-compose.yml" ]; then 66 | # export COMPOSE_FILE="${BASEDIR}/docker-compose.yml" 67 | # fi 68 | # fi 69 | 70 | set_docker_user 71 | 72 | # ensure the user env is correctly set up 73 | env_check 74 | -------------------------------------------------------------------------------- /nginx.conf: -------------------------------------------------------------------------------- 1 | server { 2 | listen 80 default_server; 3 | server_name geosearch.planninglabs.nyc; 4 | location / { 5 | proxy_set_header X-Real-IP $remote_addr; 6 | proxy_set_header Host labs-geosearch-docs.netlify.app; 7 | proxy_pass https://labs-geosearch-docs.netlify.app; 8 | } 9 | location /v1 { 10 | default_type application/json; 11 | return 410 '{"message": "v1 API has been permanently removed. For details on migrating to the v2 API, see https://github.com/NYCPlanning/labs-geosearch-docker/blob/master/MIGRATING.md"}'; 12 | } 13 | location /v2 { 14 | if ($request_method != GET) { 15 | return 403; 16 | } 17 | proxy_set_header X-Real-IP $remote_addr; 18 | proxy_set_header Host $http_host; 19 | # point to the Pelias API 20 | proxy_pass http://api:4000/v1; 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /pelias: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # OSX comes bundled with versions of readlink, sed, parallel etc which are not 5 | # compatible with the linux tools. Force OSX users to install the GNU 6 | # compatible versions (prefixed with 'g', such as 'greadlink', 'gsed' etc.). 7 | export CMD_READLINK='readlink' 8 | if [[ "$OSTYPE" == "darwin"* ]]; then 9 | if [ -x "$(command -v greadlink)" ]; then 10 | CMD_READLINK='greadlink'; 11 | else 12 | 2>&1 echo 'OSX: you must install the gnu standard tooling using:' 13 | 2>&1 echo 'brew install coreutils' 14 | fi 15 | fi 16 | 17 | # resolve path to this file (following symlinks) and load libs 18 | BASEDIR=$( dirname $( ${CMD_READLINK} -f "${BASH_SOURCE[0]}" ) ) 19 | for f in ${BASEDIR}/lib/* ${BASEDIR}/cmd/*; do source $f; done 20 | 21 | # cli runner 22 | cli "$@" 23 | -------------------------------------------------------------------------------- /pelias.json: -------------------------------------------------------------------------------- 1 | { 2 | "esclient": { 3 | "apiVersion": "7.5", 4 | "hosts": [ 5 | { "host": "elasticsearch" } 6 | ] 7 | }, 8 | "schema": { 9 | "indexName": "pelias" 10 | }, 11 | "api": { 12 | "services": { 13 | "libpostal": { 14 | "url": "http://libpostal:4400" 15 | }, 16 | "pip": { "url": "http://pip:4200" } 17 | }, 18 | "targets": { 19 | "auto_discover": true 20 | }, 21 | "host": "api", 22 | "indexName": "pelias" 23 | }, 24 | "imports": { 25 | "adminLookup": { 26 | "enabled": true 27 | }, 28 | "csv": { 29 | "datapath": "/data/csv", 30 | "download": [ 31 | "https://planninglabs.nyc3.digitaloceanspaces.com/geosearch-data/latest/labs-geosearch-pad-normalized.csv" 32 | ] 33 | }, 34 | "whosonfirst": { 35 | "datapath": "/data/whosonfirst", 36 | "importPostalcodes": false, 37 | "countryCode": "US", 38 | "importPlace": [ 39 | "85977539" 40 | ] 41 | } 42 | }, 43 | "logger": { 44 | "level": "http", 45 | "timestamp": true, 46 | "colorize": true 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /version.env: -------------------------------------------------------------------------------- 1 | VERSION=22a1 2 | -------------------------------------------------------------------------------- 
/wait_for_200.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | set -e;
4 | 
5 | function health_status(){
6 |   curl \
7 |     --output /dev/null \
8 |     --silent \
9 |     --write-out "%{http_code}" \
10 |     "http://$1/v2/autocomplete?text=120%20broadway" \
11 |     || true;
12 | }
13 | 
14 | function health_wait(){
15 |   echo "waiting for healthcheck at $1 to return 200";
16 |   retry_count=120
17 | 
18 |   i=1
19 |   while [[ "$i" -le "$retry_count" ]]; do
20 |     if [[ $(health_status $1) -eq 200 ]]; then
21 |       echo "Geosearch is up!"
22 |       exit 0
23 |     else
24 |       echo "Healthcheck did not return 200 status code. Trying again in 30 seconds..."
25 |     fi
26 |     sleep 30
27 |     i=$(($i + 1))
28 |   done
29 | 
30 |   echo -e "\n"
31 |   echo "Geosearch did not come up. Check cloudinit logs for details."
32 |   exit 1
33 | }
34 | 
35 | for var in "$@"; do
36 |   health_wait "$var"
37 | done
--------------------------------------------------------------------------------