├── .dockerignore
├── .gitignore
├── .travis.yml
├── Dockerfile
├── LICENSE
├── Makefile
├── README.md
├── config
    └── geonotebook.ini
├── emr-docker
    ├── Makefile
    ├── README.md
    ├── bootstrap-geopyspark-docker.sh
    ├── config-aws.mk.template
    ├── config-emr.mk
    └── configurations.json
├── kernels
    ├── local
    │   └── kernel.json
    └── yarn
    │   └── kernel.json
├── notebooks
    ├── Getting the mask.ipynb
    ├── Landsat.ipynb
    ├── NLCD viewer.ipynb
    ├── Park citing.ipynb
    ├── Pine Habitat.ipynb
    ├── SRTM-emr.ipynb
    ├── SRTM-local.ipynb
    ├── libya.ipynb
    └── sanfranmvp.ipynb
├── rpms
    └── build
    │   ├── .dockerignore
    │   ├── Dockerfile.base
    │   ├── Dockerfile.gcc4
    │   ├── Makefile
    │   ├── README.md
    │   ├── archives
    │       └── .gitignore
    │   ├── blobs
    │       └── .gitignore
    │   ├── build.sh
    │   ├── configurable-http-proxy.mk
    │   ├── configurable-http-proxy
    │       └── .gitignore
    │   ├── configurable-http-server.mk
    │   ├── etc
    │       └── pam.d
    │       │   └── login
    │   ├── fetch.sh
    │   ├── gdal.mk
    │   ├── geopyspark.mk
    │   ├── patches
    │       └── patch.diff
    │   ├── publish.sh
    │   ├── rpmbuild
    │       ├── BUILD
    │       │   └── .gitignore
    │       ├── RPMS
    │       │   └── .gitignore
    │       ├── SOURCES
    │       │   └── .gitignore
    │       ├── SPECS
    │       │   ├── configurable-http-proxy.spec
    │       │   ├── gdal.spec
    │       │   ├── hdf5.spec
    │       │   ├── netcdf.spec
    │       │   ├── nodejs.spec
    │       │   ├── openjpeg.spec
    │       │   └── proj.spec
    │       └── SRPMS
    │       │   └── .gitignore
    │   ├── scripts
    │       ├── configurable-http-proxy.sh
    │       ├── gdal.sh
    │       ├── hdf5.sh
    │       ├── netcdf.sh
    │       ├── nodejs.sh
    │       ├── not.sh
    │       ├── openjpeg.sh
    │       ├── proj.sh
    │       └── wheel.sh
    │   ├── wheel
    │       └── requirements.txt
    │   └── wheels.mk
└── terraform
    ├── .gitignore
    ├── aws.tf
    ├── bootstrap.sh
    ├── cluster-configurations.json
    ├── emr.tf
    ├── s3.tf
    ├── security-group.tf
    └── variables.tf
/.dockerignore:
--------------------------------------------------------------------------------
 1 | archives/
 2 | bootstrap/
 3 | emr/
 4 | emr-nodocker/
 5 | geopyspark-*/
 6 | netcdf-backend/
 7 | notebooks/
 8 | rpms/
 9 | scratch/
10 | thredds-feature-s3-hdfs/
11 | .travis/
12 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | archives/*.jar
 2 | archives/*.tar.*
 3 | archives/*.whl
 4 | archives/*.zip
 5 | blobs/*
 6 | emr/config-aws.mk
 7 | geopyspark-*/
 8 | notebooks/.ipynb_checkpoints/
 9 | rpms/build/blobs/*
10 | rpms/build/rpmbuild/RPMS/x86_64/*
11 | rpms/build/rpmbuild/SOURCES/*
12 | rpms/build/wheel/http-requirements.txt
13 | rpms/build/wheel/*.whl
14 | scratch/*
15 | target/*
16 | terraform-docker/.terraform/*
17 | terraform-docker/.terraform*
18 | terraform-docker/terraform.tfstate*
19 | terraform-nodocker/.terraform/*
20 | terraform-nodocker/.terraform*
21 | terraform-nodocker/terraform.tfstate*
22 | 
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | sudo: required
 2 | 
 3 | language:
 4 |    - scala
 5 | 
 6 | scala:
 7 |    - "2.11.11"
 8 | 
 9 | jdk:
10 |    - openjdk8
11 | 
12 | services:
13 |    - docker
14 | 
15 | env:
16 |    global:
17 |       - CLEAN_TRAVIS_TAG=${TRAVIS_TAG/[[:space:]]/}
18 |       - TAG=${CLEAN_TRAVIS_TAG:-${TRAVIS_COMMIT:0:7}}
19 | 
20 | branches:
21 |    only:
22 |       - master
23 | 
24 | addons:
25 |    apt:
26 |       packages:
27 |          - make
28 | 
29 | script:
30 |    - TRAVIS=1 TAG=$TAG make image
31 | 
32 | after_success:
33 |    - if [ "$QUAY_USERNAME" != "" -a "$QUAY_PASSWORD" != "" ]; then
34 |      docker login -u="$QUAY_USERNAME" -p="$QUAY_PASSWORD" quay.io;
35 |      TRAVIS=1 TAG=$TAG make publish;
36 |      fi
37 | 
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM quay.io/geodocker/jupyter-geopyspark:base-8
 2 | 
 3 | ARG GEOPYSPARK_VERSION
 4 | ARG GEOPYSPARKSHA
 5 | 
 6 | ENV PYSPARK_PYTHON=python3.4
 7 | ENV PYSPARK_DRIVER_PYTHON=python3.4
 8 | 
 9 | # Set up Jupyter
10 | RUN mkdir /home/hadoop/notebooks && \
11 |     pip3 install --user pytest && \
12 |     jupyter nbextension enable --py widgetsnbextension
13 | COPY kernels/local/kernel.json /home/hadoop/.local/share/jupyter/kernels/pyspark/kernel.json
14 | 
15 | # Install GeoPySpark
16 | RUN pip3 install --user protobuf==3.3.0 traitlets==4.3.2 "https://github.com/locationtech-labs/geopyspark/archive/$GEOPYSPARKSHA.zip"
17 | 
18 | # Install Jars
19 | ADD https://s3.amazonaws.com/geopyspark-dependency-jars/geotrellis-backend-assembly-${GEOPYSPARK_VERSION}.jar /opt/jars/
20 | 
21 | USER root
22 | RUN chmod ugo+r /opt/jars/*
23 | RUN chown -R hadoop:hadoop /home/hadoop/.local/share
24 | USER hadoop
25 | 
26 | WORKDIR /tmp
27 | CMD ["jupyterhub", "--no-ssl", "--Spawner.notebook_dir=/home/hadoop/notebooks"]
28 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "{}"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright {yyyy} {name of copyright owner}
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: image clean cleaner cleanest mrproper
 2 | 
 3 | TAG ?= 8
 4 | FAMILY := quay.io/geodocker/jupyter-geopyspark
 5 | IMAGE := $(FAMILY):$(TAG)
 6 | GEOPYSPARK_SHA ?= a75dda65434b472045f3fd97baca1e9bdc1ac353
 7 | GEOPYSPARK_VERSION ?= 0.4.2
 8 | 
 9 | all: image
10 | 
11 | image: Dockerfile
12 | 	docker build \
13 |           --build-arg GEOPYSPARK_VERSION=$(GEOPYSPARK_VERSION) \
14 |           --build-arg GEOPYSPARKSHA=$(GEOPYSPARK_SHA) \
15 |           -t $(IMAGE) -f Dockerfile .
16 | 
17 | clean:
18 | 
19 | cleaner: clean
20 | 
21 | cleanest: cleaner
22 | 
23 | mrproper: cleanest
24 | 
25 | publish:
26 | 	docker tag $(IMAGE) "$(FAMILY):latest"
27 | 	docker push $(IMAGE)
28 | 	docker push "$(FAMILY):latest"
29 | 
30 | run:
31 | 	mkdir -p $(HOME)/.aws
32 | 	docker run -it --rm --name geopyspark \
33 |           -p 8000:8000 -p 4040:4040 \
34 |           $(EXTRA_FLAGS) \
35 |           -v $(shell pwd)/notebooks:/home/hadoop/notebooks:rw \
36 |           -v $(HOME)/.aws:/home/hadoop/.aws:ro \
37 |           $(IMAGE)
38 | 
39 | run-editable:
40 | 	mkdir -p $(HOME)/.aws
41 | 	docker run -it --rm --name geopyspark \
42 |           -p 8000:8000 -p 4040:4040 \
43 |           $(EXTRA_FLAGS) \
44 |           -v $(GEOPYSPARK_DIR):/home/hadoop/.local/lib/python3.4/site-packages/geopyspark:rw \
45 |           -v $(shell pwd)/notebooks:/home/hadoop/notebooks:rw \
46 |           -v $(HOME)/.aws:/home/hadoop/.aws:ro \
47 |           $(IMAGE)
48 | 
49 | shell:
50 | 	docker exec -it geopyspark bash
51 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Introduction #
  2 | 
  3 | This repository contains the configuration and build files necessary to produce the [`quay.io/geodocker/jupyter-geopyspark` Docker image](https://quay.io/repository/geodocker/jupyter-geopyspark?tab=tags).
  4 | The Docker image allows easy use of [GeoPySpark](https://github.com/locationtech-labs/geopyspark) in a web browser via [Jupyter](https://github.com/jupyter/jupyter/) and [GeoNotebook](https://github.com/opengeoscience/geonotebook) without having to modify or configure the host computer (beyond what is needed to run Docker).
  5 | 
  6 | The process of [using a pre-built container](#without-a-clone) is discussed in the next section,
  7 | and instructions for [building the image](#building-the-image) and [modifying it](#modifying-the-image-or-image-architecture) are also discussed.
  8 | 
  9 | # Using The Image #
 10 | 
 11 | You will be prompted for a username and a password when you direct your web browser to the container: the username and password are both `hadoop`.
 12 | 
 13 | One can use the image with or without making a clone of this repository.
 14 | 
 15 | ## Without A Clone ##
 16 | 
 17 | To use the image without (or from outside of) a clone of this repository,
 18 | first make sure that you are in possession of the image.
 19 | The command
 20 | ```
 21 | docker pull quay.io/geodocker/jupyter-geopyspark
 22 | ```
 23 | will pull the latest version of the image.
 24 | 
 25 | The container can then be started by typing
 26 | ```
 27 | docker run -it --rm --name geopyspark \
 28 |    -p 8000:8000 -p 4040:4040 \
 29 |    quay.io/geodocker/jupyter-geopyspark
 30 | ```
 31 | or perhaps
 32 | ```
 33 | docker run -it --rm --name geopyspark \
 34 |    -p 8000:8000 -p 4040:4040 \
 35 |    -v $HOME/.aws:/home/hadoop/.aws:ro \
 36 |    quay.io/geodocker/jupyter-geopyspark
 37 | ```
 38 | if you wish to have your AWS credentials available in the container (e.g. for pulling data from S3).
 39 | 
 40 | ## From A Clone ##
 41 | 
 42 | To use the image from within a clone of this repository,
 43 | there are [two useful targets in the Makefile: `run` and `run-editable`](Makefile#L30-L47).
 44 | To use the `run` target, type something like
 45 | ```
 46 | TAG=latest make run
 47 | ```
 48 | or to use the `run` target with some image other than the latest one, something like
 49 | ```
 50 | TAG=a1b78b9 make run
 51 | ```
 52 | will launch a container using the image `quay.io/geodocker/jupyter-geopyspark:a1b78b9`.
 53 | 
 54 | The `run-editable` target attempts to map one's local clone of GeoPySpark into the container so that the code can be edited and iterated on in a fairly convenient fashion.
 55 | By default, it is assumed that the GeoPySpark code is present in `../geopyspark/geopyspark`, but that assumption can be changed by passing in an alternate location through the `GEOPYSPARK_DIR` environment variable.
 56 | Here
 57 | ```
 58 | TAG=latest GEOPYSPARK_DIR=/tmp/geopyspark/geopyspark make run-editable
 59 | ```
 60 | is an example of that.
 61 | 
 62 | Both of those targets also honor the `EXTRA_FLAGS` environment variable, which can be used to pass additional flags to `docker run`, as shown below.
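For example, the following (purely illustrative) invocation mounts a read-only data directory into the container and sets an extra environment variable; the directory and variable names are assumptions, not part of this repository:
```bash
EXTRA_FLAGS="-v $HOME/data:/home/hadoop/data:ro -e MY_SETTING=1" TAG=latest make run
```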
 63 | 
 64 | # Building The Image #
 65 | 
 66 | To build the image, type `make all`, `make image`, or simply `make`.
 67 | 
 68 | Type
 69 | ```
 70 | make run
 71 | ```
 72 | to run the newly-built image.
 73 | If the `TAG` environment variable is not set, the `run` target will default to the same tag that was used for the newly-built image.
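For example, a typical build-and-run cycle, here using a hypothetical tag of your choosing, might look like this:
```bash
# Build the image under an explicit tag, then run a container from it.
TAG=my-dev-tag make image
TAG=my-dev-tag make run
```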
 74 | 
 75 | # Modifying The Image; or, Image Architecture #
 76 | 
 77 | In this section we describe the structure of the repository
 78 | and document how the various pieces interact as part of the build process.
 79 | 
 80 | ## Repository Structure ##
 81 | 
 82 |    - [`archives`](archives) is an initially-empty directory that is populated with source code and built artifacts as part of the build process.
 83 |    - [`blobs`](blobs) is an initially-empty directory that is populated with built artifacts from the `archives` directory.
 84 |      This directory exists because `archives` is listed in the [`.dockerignore`](.dockerignore) file
 85 |      (which was done to reduce the size of the [build context](https://docs.docker.com/engine/reference/commandline/build/) of the final image).
 86 |      Please see the [README](bootstrap/README.md) in that directory for more information.
 87 |    - [`config`](config) contains the [GeoNotebook configuration file](config/geonotebook.ini)
 88 |      and a [list of python dependencies](config/requirements.txt) that GeoNotebook requires.
 89 |    - [`emr-docker`](emr-docker) contains files useful for running the image on Amazon EMR (please see the [README](emr-docker/README.md) in that directory for more information).
 90 |    - [`terraform-docker`](terraform-docker) contains files useful for running the image on Amazon EMR using Terraform.  Its remit is similar to that of the directory mentioned in the previous bullet-point, but it uses Terraform instead of shell scripts.
 91 |    - [`kernels`](kernels) contains Jupyter kernel configuration files.
 92 |      The one most likely to be of interest is [the one](geonotebook/kernel.json) that enables GeoNotebook and GeoPySpark; the other two kernels are mostly vestigial/ceremonial.
 93 |    - [`notebooks`](notebooks) contains various sample notebooks.
 94 |    - [`scratch`](scratch) is a scratch directory used during the build process.
 95 |      The files that are added under this directory during the build can be harmlessly deleted after the build is complete,
 96 |      but keeping them will accelerate subsequent builds.
 97 |    - [`scripts`](scripts) contains various scripts used for building and installing artifacts.
 98 |       - [`netcdf.sh`](scripts/netcdf.sh) builds a jar from a [particular branch](https://github.com/Unidata/thredds/tree/feature/s3+hdfs) of the [Thredds](https://github.com/Unidata/thredds) project that provides support for reading NetCDF files.
 99 |       - [`build-python-blob1.sh`](build-python-blob1.sh) [runs in the context of the AWS build container](Makefile#L62-L70);
100 |         its purpose is to acquire most of the python dependencies needed by GeoPySpark and GeoNotebook and package them together into a tarball for later installation.
101 |       - [`build-python-blob2.sh`](build-python-blob2.sh) [runs in the context of the AWS build container](Makefile#L72-L80);
102 |         its purpose is to package GeoPySpark and [`GeoPySpark-NetCDF`](https://github.com/geotrellis/geopyspark-netcdf) into a tarball for later installation.
103 |       - [`install-blob1.sh`](scripts/install-blob1.sh) runs [in the context of the final image build](Dockerfile#L17).
104 |         Its purpose is to install the artifacts created earlier by `build-python-blob1.sh`.
105 |       - [`install-blob2.sh`](scripts/install-blob2.sh) runs [in the context of the final image build](Dockerfile#L40).
106 |         Its purpose is to install the artifacts created earlier by `build-python-blob2.sh`.
107 |    - [`Dockerfile`](Dockerfile) specifies the final image, the output of the build process.
108 |    - [`Makefile`](Makefile) coordinates the build process.
109 |    - [`README.md`](README.md) this file.
110 | 
111 | ## Build Process ##
112 | 
113 | The build process can be divided into three stages: the bootstrap image creation stage, the EMR-compatible artifact creation stage, and the final image build stage.
114 | 
115 | When the `all` makefile target is invoked, the last two stages of the three-stage build process are done.
116 | 
117 | ### Stage 0: Build Bootstrap Images ###
118 | 
119 | The first of the three stages is done using the contents of the [`rpms/build`](rpms/build) directory.
120 | Its results have already been pushed to the `quay.io/geodocker` docker repository, so unless the reader wishes to modify the bootstrap images, this stage can be considered complete.
121 | To rebuild the bootstrap images, the reader should navigate into the `rpms/build` directory and run the `./build.sh` script.
122 | 
123 | ### Stage 1: EMR-Compatible Artifacts  ###
124 | 
125 | The purpose of this stage is to build the python artifacts that need to be linked against binary dependencies; those artifacts are built
126 | in a context that resembles EMR (because we want the image to be usable on EMR).
127 | 
128 | First, a tarball containing python code linked against the binary dependencies mentioned above [is created](Makefile#L62-L70).
129 | Then, another python tarball containing GeoPySpark [is created](Makefile#L72-L80).
130 | The reason that there are two python tarballs instead of one is simply that the contents of the two tarballs change at different rates;
131 | over repeated builds, the first tarball is built less frequently than the second one.
132 | 
133 | ### Stage 2: Build Final Image ###
134 | 
135 | In the third of the three stages, the artifacts which were created earlier are brought together and installed into the final docker image.
136 | 
137 | ## Adding Binary Dependencies ##
138 | 
139 | As an example of how to make a meaningful modification to the image,
140 | in this section we will describe the process of adding new binary dependencies to the image.
141 | 
142 | Currently, all binary dependencies are located in the file [`gdal-and-friends.tar.gz`](bootstrap/Makefile#L123-L134), which comes in via the [`quay.io/geodocker/jupyter-geopyspark:base-8`](rpms/build/Dockerfile.base) image on which the final image is based.
143 | If we want to add an additional binary dependency inside of that file,
144 | then we only need to [download or otherwise acquire the source code](rpms/build/Makefile#L3-L17)
145 | and update the [build script](rpms/build/scripts/gdal.sh) to build and package the additional code.
146 | If we wish to add a binary dependency outside of the `gdal-and-friends.tar.gz` file, then the process is slightly more involved,
147 | but potentially faster because it is not necessary to rebuild bootstrap images.
148 | 
149 | The strategy for adding a new binary dependency, hypothetically `libHelloWorld` packaged in a file called `helloworld-and-friends.tar.gz`,
150 | will be to mirror the process for `gdal-and-friends.tar.gz` to the extent that we can.
151 | The difference is that this time we will add the binary to the final image rather than to a bootstrap image.
152 |    - First, augment the [`Makefile`](Makefile) to download or otherwise ensure the existence of the `libHelloWorld` source code.
153 |    - Next, we want to build and package `libHelloWorld` in the context of the AWS build image, so that it will be usable on EMR.
154 |      This would probably be done by first creating a script analogous to [the one for GDAL](rpms/build/scripts/gdal.sh) that builds, links, and archives the dependency.
155 |    - That script should run in the context of the AWS build container so that the created binaries are compiled and linked in an environment that resembles EMR.
156 |    - The resulting archived binary blob should then be added to the final image so that it can be distributed to the Spark executors.
157 |      That should probably be done by adding a `COPY` command to the Dockerfile to copy the new blob to the `/blobs` directory of the image.
158 |    - Finally, the image environment and the kernel should both be modified to make use of the new dependency.
159 |      The former will probably involve the addition of an `ENV` command to the Dockerfile that augments the `LD_LIBRARY_PATH` environment variable so that any new shared libraries can be found;
160 |      the latter is described below.
161 | 
162 | The changes to the kernel described in the last bullet-point would probably look something like this
163 | ```diff
164 | @@ -14,6 +14,6 @@
165 |          "PYTHONPATH": "/usr/local/spark-2.1.0-bin-hadoop2.7/python/lib/pyspark.zip:/usr/local/spark-2.1.0-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip",
166 |          "GEOPYSPARK_JARS_PATH": "/opt/jars",
167 |          "YARN_CONF_DIR": "/etc/hadoop/conf",
168 | -        "PYSPARK_SUBMIT_ARGS": "--archives /blobs/gdal-and-friends.tar.gz,/blobs/friends-of-geopyspark.tar.gz,/blobs/geopyspark-sans-friends.tar.gz --conf spark.executorEnv.LD_LIBRARY_PATH=gdal-and-friends.tar.gz/lib --conf spark.executorEnv.PYTHONPATH=friends-of-geopyspark.tar.gz/:geopyspark-sans-friends.tar.gz/ --conf hadoop.yarn.timeline-service.enabled=false pyspark-shell"
169 | +        "PYSPARK_SUBMIT_ARGS": "--archives /blobs/helloworld-and-friends.tar.gz,/blobs/gdal-and-friends.tar.gz,/blobs/friends-of-geopyspark.tar.gz,/blobs/geopyspark-sans-friends.tar.gz --conf spark.executorEnv.LD_LIBRARY_PATH=helloworld-and-friends.tar.gz/lib:gdal-and-friends.tar.gz/lib --conf spark.executorEnv.PYTHONPATH=friends-of-geopyspark.tar.gz/:geopyspark-sans-friends.tar.gz/ --conf hadoop.yarn.timeline-service.enabled=false pyspark-shell"
170 |      }
171 |  }
172 | ```
173 | 
174 | (The changes represented by the diff above have not been tested.)
175 | 
176 | The process for adding new distributed python dependencies is analogous to the one above,
177 | except that changes to the `LD_LIBRARY_PATH` variable on the executors might not be required,
178 | and additions will most probably need to be made to the `--conf spark.executorEnv.PYTHONPATH` configuration passed in via `PYSPARK_SUBMIT_ARGS` in the kernel, as sketched below.
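A minimal sketch of that process, assuming a hypothetical pure-python package `helloworld` distributed to the executors in a tarball named `helloworld-py.tar.gz` (both names are illustrative, not part of this repository):
```bash
# Install the hypothetical dependency into a staging directory and archive it.
pip3 install --target /tmp/helloworld-py helloworld
tar -czf blobs/helloworld-py.tar.gz -C /tmp/helloworld-py .

# The kernel would then list /blobs/helloworld-py.tar.gz in the --archives option
# of PYSPARK_SUBMIT_ARGS and append helloworld-py.tar.gz/ to
# --conf spark.executorEnv.PYTHONPATH so that the executors can import the package.
```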
179 | 
180 | # RPM-based Deployment #
181 | 
182 | ## Build RPMs ##
183 | 
184 | To build the RPMs, navigate into the [`rpms/build`](rpms/build/) directory and type `./build.sh`.
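That is, from the repository root:
```bash
cd rpms/build
./build.sh
```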
185 | 
186 | ## Terraform And AWS ##
187 | 
188 | To use the RPM-based deployment, navigate into the [`terraform-nodocker`](terraform-nodocker/) directory.
189 | The configuration in that directory requires [Terraform](https://www.terraform.io/) version 0.10.6 or greater.
190 | If you want to use Google OAuth, GitHub OAuth, or some supported generic type of OAuth, then type
191 | ```bash
192 | terraform init
193 | terraform apply
194 | ```
195 | and respond appropriately to the prompts.
196 | 
197 | Doing that will upload (or sync) the RPMs to the S3 location that you specify, and will also upload the [`terraform-nodocker/bootstrap.sh`](terraform-nodocker/bootstrap.sh) bootstrap script.
198 | 
199 | If you do not wish to use OAuth, then [some modifications to the bootstrap script](terraform-nodocker/bootstrap.sh#L84-L93) will be required.
200 | 
201 | # OAuth #
202 | 
203 | ## With The Docker Image ##
204 | 
205 | In order to use OAuth for login, two things are necessary:
206 | It is necessary to [set three environment variables](https://github.com/jupyterhub/oauthenticator/blame/f5e39b1ece62b8d075832054ed3213cc04f85030/README.md#L74-L78) inside of the container before the JupyterHub process is launched, and
207 | it is necessary to use a `jupyterhub_config.py` file that enables the desired OAuth setup.
208 | 
209 | ### Environment Variables ###
210 | 
211 | The three environment variables that must be set are `OAUTH_CALLBACK_URL`, `OAUTH_CLIENT_ID`, and `OAUTH_CLIENT_SECRET`.
212 | The first of those three variables should be set to `http://localhost:8000/hub/oauth_callback` for local testing and something like `http://$(hostname -f):8000/hub/oauth_callback` for deployment.
213 | The second and third are dependent on the OAuth provider.
214 | 
215 | ### `jupyterhub_config.py` ###
216 | 
217 | There are three such files already included in the image:
218 | One for [Google](config/jupyterhub_config_google.py) and related services,
219 | one for [GitHub](config/jupyterhub_config_github.py),
220 | and a [generic](config/jupyterhub_config_generic.py) one.
221 | There is some variability in the precise details of how OAuth providers work (e.g. some require variables to be passed in the URL of a POST request, whereas others require variables to be passed in the body of a POST request).
222 | For that reason, the generic configuration should be considered a starting point rather than something that is guaranteed to work in its unmodified state.
223 | 
224 | There are only two user accounts in the image: `root` and `hadoop`.
225 | All three of the configurations discussed above map all valid OAuth users to the `hadoop` account.
226 | That is done because -- without additional configuration -- Spark jobs on EMR must come from a user named "`hadoop`".
227 | (The users inside of the container are separate and distinct from those on the host instance,
228 | but the username is evidently part of a Spark job submission, so it must match that of the user that EMR is expecting submissions from.)
229 | 
230 | ### Using ###
231 | 
232 | To use OAuth, launch a container with the three variables supplied and with the appropriate `jupyterhub_config.py` selected.
233 | 
234 | ```bash
235 | docker run -it --rm --name geopyspark \
236 |    -p 8000:8000 \
237 |    -e OAUTH_CALLBACK_URL=http://localhost:8000/hub/oauth_callback \
238 |    -e OAUTH_CLIENT_ID=xyz \
239 |    -e OAUTH_CLIENT_SECRET=abc \
240 |    quay.io/geodocker/jupyter-geopyspark:latest \
241 |       jupyterhub \
242 |       -f /etc/jupyterhub/jupyterhub_config_github.py \
243 |       --no-ssl --Spawner.notebook_dir=/home/hadoop/notebooks
244 | ```
245 | 
246 | ## With The RPM-based Deployment ##
247 | 
248 | This was discussed [earlier](#terraform-and-aws).
249 | 
--------------------------------------------------------------------------------
/config/geonotebook.ini:
--------------------------------------------------------------------------------
 1 | [default]
 2 | vis_server = geotrellis
 3 | log_level = WARNING
 4 | 
 5 | [geotrellis]
 6 | url = http://127.0.0.1:8000/user/jack/geotrellis
 7 | 
 8 | [geoserver]
 9 | username = admin
10 | password = geoserver
11 | url = http://127.0.0.1:8080/geoserver
12 | 
13 | [ktile]
14 | url = http://127.0.0.1:8000/ktile
15 | default_cache = ktile_default_cache
16 | 
17 | [ktile_default_cache]
18 | name = Test
19 | path = /tmp/stache
20 | umask = 0000
21 | 
22 | [basemap]
23 | url = http://{s}.tile.openstreetmap.org/{z}/{x}/{y}.png
24 | attribution = Tile data ©  OpenStreetMap contributors
25 | 
--------------------------------------------------------------------------------
/emr-docker/Makefile:
--------------------------------------------------------------------------------
 1 | include config-aws.mk     # Vars related to AWS credentials and services used
 2 | include config-emr.mk     # Vars related to type and size of EMR cluster
 3 | 
 4 | SCRIPT_RUNNER := s3://elasticmapreduce/libs/script-runner/script-runner.jar
 5 | 
 6 | ifeq ($(USE_SPOT),true)
 7 | MASTER_BID_PRICE:=BidPrice=${MASTER_PRICE},
 8 | WORKER_BID_PRICE:=BidPrice=${WORKER_PRICE},
 9 | BACKEND=accumulo
10 | endif
11 | 
12 | ifndef CLUSTER_ID
13 | CLUSTER_ID=$(shell if [ -e "cluster-id.txt" ]; then cat cluster-id.txt; fi)
14 | endif
15 | 
16 | 
17 | upload-code:
18 | 	@aws s3 cp bootstrap-geopyspark-docker.sh ${S3_URI}/bootstrap-geopyspark-docker.sh
19 | 
20 | create-cluster:
21 | 	aws emr create-cluster --name "${NAME}" \
22 | --release-label emr-5.7.0 \
23 | --output text \
24 | --use-default-roles \
25 | --log-uri ${S3_URI}/logs \
26 | --ec2-attributes KeyName=${EC2_KEY},SubnetId=${SUBNET_ID} \
27 | --applications Name=Hadoop Name=Spark Name=Zeppelin \
28 | --instance-groups \
29 | Name=Master,${MASTER_BID_PRICE}InstanceCount=1,InstanceGroupType=MASTER,InstanceType=${MASTER_INSTANCE} \
30 | Name=Workers,${WORKER_BID_PRICE}InstanceCount=${WORKER_COUNT},InstanceGroupType=CORE,InstanceType=${WORKER_INSTANCE} \
31 | --bootstrap-actions Name=GeoPySpark,Path=${S3_URI}/bootstrap-geopyspark-docker.sh \
32 | | tee cluster-id.txt
33 | 
34 | wait: INTERVAL:=60
35 | wait: STEP_ID=$(shell cat last-step-id.txt)
36 | wait:
37 | 	@while (true); do \
38 | 	OUT=$$(aws emr describe-step --cluster-id ${CLUSTER_ID} --step-id ${STEP_ID}); \
39 | 	[[ $$OUT =~ (\"State\": \"([A-Z]+)\") ]]; \
40 | 	echo $${BASH_REMATCH[2]}; \
41 | 	case $${BASH_REMATCH[2]} in \
42 | 	                PENDING | RUNNING) sleep ${INTERVAL};; \
43 | 	                COMPLETED) exit 0;; \
44 | 	                *) exit 1;; \
45 | 	esac; \
46 | 	done
47 | 
48 | terminate-cluster:
49 | 	aws emr terminate-clusters --cluster-ids ${CLUSTER_ID}
50 | 	rm -f cluster-id.txt
51 | 	rm -f last-step-id.txt
52 | 
53 | proxy:
54 | 	aws emr socks --cluster-id ${CLUSTER_ID} --key-pair-file "${HOME}/${EC2_KEY}.pem"
55 | 
56 | ssh:
57 | 	aws emr ssh --cluster-id ${CLUSTER_ID} --key-pair-file "${HOME}/${EC2_KEY}.pem"
58 | 
59 | get-logs:
60 | 	@aws emr ssh --cluster-id $(CLUSTER_ID) --key-pair-file "${HOME}/${EC2_KEY}.pem" \
61 | 	        --command "rm -rf /tmp/spark-logs && hdfs dfs -copyToLocal /var/log/spark/apps /tmp/spark-logs"
62 | 	@mkdir -p  logs/$(CLUSTER_ID)
63 | 	@aws emr get --cluster-id $(CLUSTER_ID) --key-pair-file "${HOME}/${EC2_KEY}.pem" --src "/tmp/spark-logs/" --dest logs/$(CLUSTER_ID)
64 | 
65 | 
66 | .PHONY: create-cluster get-logs
67 | 
--------------------------------------------------------------------------------
/emr-docker/README.md:
--------------------------------------------------------------------------------
 1 | # Running Jupyter, GeoNotebook and GeoPySpark on EMR
 2 | 
 3 | This section of the repository contains a bootstrap script and a Makefile
 4 | that let you easily spin up an EMR cluster that is running the docker container
 5 | of this repository.
 6 | 
 7 | _Requires_: A reasonably up-to-date [`aws-cli`](https://aws.amazon.com/cli/); this document was written against version `1.10`.
 8 | 
 9 | ### Configuration
10 | 
11 | Configuration has been broken out into two files, which are imported by the `Makefile`.
12 | 
13 |  - __config-aws.mk__: AWS credentials, S3 staging bucket, subnet, etc
14 |  - __config-emr.mk__: EMR cluster type and size
15 | 
16 | You will need to create your `config-aws.mk` based on `config-aws.mk.template`, reflecting your credentials and your VPC configuration (see below).
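A minimal way to get started (the bracketed placeholders in the template must be replaced with your own values):
```bash
cp config-aws.mk.template config-aws.mk
# Edit config-aws.mk to fill in EC2_KEY, AWS_DEFAULT_REGION, S3_URI, and SUBNET_ID.
```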
17 | 
18 | `config-emr.mk` contains the following params:
19 | 
20 |  - __NAME__: The name of the EMR cluster
21 |  - __MASTER_INSTANCE__: The type of instance to use for the master node.
22 |  - __MASTER_PRICE__: The maximum bid price for the master node, if using spot instances.
23 |  - __WORKER_INSTANCE__: The type of instance to use for the worker nodes.
24 |  - __WORKER_PRICE__: The maximum bid price for the worker nodes, if using spot instances.
25 |  - __WORKER_COUNT__: The number of workers to include in this cluster.
26 |  - __USE_SPOT__: Set to `true` to use spot instances.
27 | 
28 | ### The bootstrap script
29 | 
30 | EMR allows you to specify a script to run on the creation of both the master and worker nodes.
31 | We supply a script here, `bootstrap-geopyspark-docker.sh`, that will set up and run
32 | this docker container with the proper configuration in the bootstrap step.
33 | 
34 | The script needs to be on S3 in order to be available to the EMR startup process;
35 | to place it on S3, use the Makefile command
36 | 
37 | ```bash
38 | $ make upload-code
39 | ```
40 | 
41 | ### Starting the cluster
42 | 
43 | Now all we have to do to interact with the cluster is use the following Makefile commands:
44 | 
45 | ```bash
46 | # Create the cluster
47 | $ make create-cluster
48 | 
49 | # Terminate the cluster
50 | $ make terminate-cluster
51 | 
52 | # SSH into the master node
53 | $ make ssh
54 | 
55 | # Create an SSH tunnel to the master for viewing EMR Application UIs
56 | $ make proxy
57 | 
58 | # Grab the logs from the master
59 | $ make get-logs
60 | ```
61 | 
62 | The `create-cluster` command will place a text file, `cluster-id.txt`, which holds the Cluster ID, in this directory.
63 | All the other commands use that ID to interact with the cluster; `terminate-cluster` will remove this text file.
64 | 
65 | ### Accessing JupyterHub
66 | 
67 | Grab the public DNS name for the master node of the cluster, and visit `http://[MASTER DNS]:8000`. You
68 | should see the JupyterHub login page. The username and password are both `hadoop`.
69 | 
70 | _Note_: Don't forget to open up port `8000` in the security group of the master node, or else you won't
71 | be able to access the JupyterHub endpoint.
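If you prefer the command line, something along these lines should work (a sketch; substitute the actual security group id of the master node and tighten the CIDR as appropriate):
```bash
aws ec2 authorize-security-group-ingress \
    --group-id sg-0123456789abcdef0 \
    --protocol tcp --port 8000 --cidr 0.0.0.0/0
```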
72 | 
--------------------------------------------------------------------------------
/emr-docker/bootstrap-geopyspark-docker.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | IMAGE=quay.io/geodocker/jupyter-geopyspark:${TAG:-"latest"}
 4 | 
 5 | # Parses a configuration file put in place by EMR to determine the role of this node
 6 | is_master() {
 7 |   if [ $(jq '.isMaster' /mnt/var/lib/info/instance.json) = 'true' ]; then
 8 |     return 0
 9 |   else
10 |     return 1
11 |   fi
12 | }
13 | 
14 | for i in "$@"
15 | do
16 |     case $i in
17 |         --continue)
18 |             CONTINUE=true
19 |             shift
20 |             ;;
21 |         -i=*|--image=*)
22 |             IMAGE="${i#*=}"
23 |             shift
24 |             ;;
25 |         -e=*|--env=*)
26 |             ENV_VARS+=("-e ${i#*=}")
27 |             shift
28 |             ;;
29 |         *)
30 |             ;;
31 |     esac
32 | done
33 | 
34 | ### MAIN ####
35 | 
36 | # EMR bootstrap runs before HDFS or YARN are initialized
37 | if [ ! $CONTINUE ]; then
38 |     sudo yum -y install docker
39 |     sudo usermod -aG docker hadoop
40 |     sudo service docker start
41 | 
42 |     THIS_SCRIPT="$(realpath "${BASH_SOURCE[0]}")"
43 |     TIMEOUT= is_master && TIMEOUT=3 || TIMEOUT=4
44 |     echo "bash -x $THIS_SCRIPT --continue $ARGS > /tmp/bootstrap-geopyspark-docker.log" | at now + $TIMEOUT min
45 |     exit 0 # Bail and let EMR finish initializing
46 | fi
47 | 
48 | YARN_RM=$(xmllint --xpath "//property[name='yarn.resourcemanager.hostname']/value/text()"  /etc/hadoop/conf/yarn-site.xml)
49 | 
50 | DOCKER_ENV="-e USER=hadoop \
51 | -e ZOOKEEPERS=$YARN_RM \
52 | ${ENV_VARS[@]} \
53 | -v /etc/hadoop/conf:/etc/hadoop/conf \
54 | -v /usr/lib/hadoop-hdfs/bin:/usr/lib/hadoop-hdfs/bin"
55 | 
56 | DOCKER_OPT="-d --net=host --restart=always --memory-swappiness=0"
57 | 
58 | if is_master ; then
59 |     sudo docker run $DOCKER_OPT --name=geopyspark $DOCKER_ENV $IMAGE
60 | fi
61 | 
--------------------------------------------------------------------------------
/emr-docker/config-aws.mk.template:
--------------------------------------------------------------------------------
1 | export EC2_KEY:=[KEY NAME]
2 | export AWS_DEFAULT_REGION:=us-east-1
3 | export S3_URI:=[S3 PATH FOR UPLOADING CODE]
4 | export SUBNET_ID:=[SUBNET ID]
5 | 
--------------------------------------------------------------------------------
/emr-docker/config-emr.mk:
--------------------------------------------------------------------------------
1 | export NAME := GeoPySpark ${USER}
2 | export MASTER_INSTANCE:=m3.xlarge
3 | export MASTER_PRICE := 0.5
4 | export WORKER_INSTANCE:=m3.xlarge
5 | export WORKER_PRICE := 0.5
6 | export WORKER_COUNT := 2
7 | export USE_SPOT := true
8 | 
--------------------------------------------------------------------------------
/emr-docker/configurations.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "Classification": "spark",
 4 |     "Properties": {
 5 |       "maximizeResourceAllocation": "false",
 6 |       "spark.dynamicAllocation.enabled": "true"
 7 |     }
 8 |   },
 9 |   {
10 |     "Classification": "spark-log4j",
11 |     "Properties": {
12 |       "log4j.logger.geotrellis.spark.tiling": "DEBUG"
13 |     }
14 |   },
15 | 
16 |   {
17 |     "Classification": "hdfs-site",
18 |     "Properties": {
19 |       "dfs.replication": "1",
20 |       "dfs.permissions": "false",
21 |       "dfs.datanode.max.xcievers": "16384",
22 |       "dfs.datanode.max.transfer.threads": "16384",
23 |       "dfs.datanode.balance.max.concurrent.moves": "1000",
24 |       "dfs.datanode.balance.bandwidthPerSec": "100000000"
25 |     }
26 |   },
27 |   {
28 |     "Classification": "yarn-site",
29 |     "Properties": {
30 |       "yarn.resourcemanager.am.max-attempts": "1"
31 |     }
32 |   },
33 |   {
34 |     "Classification": "hadoop-env",
35 |     "Configurations": [
36 |       {
37 |         "Classification": "export",
38 |         "Properties": {
39 |           "JAVA_HOME": "/usr/lib/jvm/java-1.8.0",
40 |           "GDAL_DATA": "/usr/local/share/gdal",
41 |           "LD_LIBRARY_PATH": "/usr/local/lib",
42 |           "PYSPARK_PYTHON": "python27",
43 |           "PYSPARK_DRIVER_PYTHON": "python27"
44 |         }
45 |       }
46 |     ]
47 |   },
48 |   {
49 |     "Classification": "spark-env",
50 |     "Configurations": [
51 |       {
52 |         "Classification": "export",
53 |         "Properties": {
54 |           "JAVA_HOME": "/usr/lib/jvm/java-1.8.0",
55 |           "GDAL_DATA": "/usr/local/share/gdal",
56 |           "LD_LIBRARY_PATH": "/usr/local/lib",
57 |           "PYSPARK_PYTHON": "python27",
58 |           "PYSPARK_DRIVER_PYTHON": "python27"
59 |         }
60 |       }
61 |     ]
62 |   },
63 |   {
64 |     "Classification": "yarn-env",
65 |     "Configurations": [
66 |       {
67 |         "Classification": "export",
68 |         "Properties": {
69 |           "JAVA_HOME": "/usr/lib/jvm/java-1.8.0",
70 |           "GDAL_DATA": "/usr/local/share/gdal",
71 |           "LD_LIBRARY_PATH": "/usr/local/lib",
72 |           "PYSPARK_PYTHON": "python27",
73 |           "PYSPARK_DRIVER_PYTHON": "python27"
74 |         }
75 |       }
76 |     ]
77 |   }
78 | ]
79 | 
--------------------------------------------------------------------------------
/kernels/local/kernel.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "argv": [
 3 |     "/usr/bin/python3.4",
 4 |     "-m",
 5 |     "ipykernel",
 6 |     "-f",
 7 |     "{connection_file}"
 8 |   ],
 9 |   "env": {
10 |       "LD_LIBRARY_PATH": "/usr/local/lib",
11 |       "PYSPARK_PYTHON": "/usr/bin/python3.4",
12 |       "SPARK_HOME": "/usr/local/spark-2.1.0-bin-hadoop2.7",
13 |       "GEOPYSPARK_JARS_PATH": "/opt/jars",
14 |       "PYTHONPATH": "/usr/local/spark-2.1.0-bin-hadoop2.7/python/lib/pyspark.zip:/usr/local/spark-2.1.0-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip",
15 |       "PYSPARK_SUBMIT_ARGS": "--master local[*] --conf spark.executorEnv.LD_LIBRARY_PATH=/home/hadoop/local/gdal/lib/ pyspark-shell"
16 |   },
17 |   "language": "python",
18 |   "display_name": "PySpark (local)"
19 | }
20 | 
--------------------------------------------------------------------------------
/kernels/yarn/kernel.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "argv": [
 3 |     "/usr/bin/python3.4",
 4 |     "-m",
 5 |     "ipykernel",
 6 |     "-f",
 7 |     "{connection_file}"
 8 |   ],
 9 |   "env": {
10 |       "PYSPARK_PYTHON": "/usr/bin/python3.4",
11 |       "SPARK_HOME": "/usr/local/spark-2.1.0-bin-hadoop2.7",
12 |       "PYTHONPATH": "/usr/local/spark-2.1.0-bin-hadoop2.7/python/lib/pyspark.zip:/usr/local/spark-2.1.0-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip",
13 |       "YARN_CONF_DIR": "/etc/hadoop/conf",
14 |       "PYSPARK_SUBMIT_ARGS": "--archives /blobs/gdal-and-friends.tar.gz,/blobs/friends-of-geopyspark.tar.gz,/blobs/geopyspark-sans-friends.tar.gz --conf spark.executorEnv.LD_LIBRARY_PATH=gdal-and-friends.tar.gz/lib --conf spark.executorEnv.PYTHONPATH=friends-of-geopyspark.tar.gz/:geopyspark-sans-friends.tar.gz/ pyspark-shell"
15 |   },
16 |   "language": "python",
17 |   "display_name": "GeoPySpark"
18 | }
19 | 
--------------------------------------------------------------------------------
/notebooks/Getting the mask.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import json\n",
 10 |     "import math\n",
 11 |     "from shapely.geometry import shape, mapping, Polygon"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": null,
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "!curl 'https://raw.githubusercontent.com/johan/world.geo.json/master/countries/USA/FL.geo.json' -o /tmp/FL.geo.json"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": null,
 26 |    "metadata": {},
 27 |    "outputs": [],
 28 |    "source": [
 29 |     "txt = open(\"/tmp/FL.geo.json\").read()"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "code",
 34 |    "execution_count": null,
 35 |    "metadata": {},
 36 |    "outputs": [],
 37 |    "source": [
 38 |     "geojson = json.loads(txt)\n"
 39 |    ]
 40 |   },
 41 |   {
 42 |    "cell_type": "code",
 43 |    "execution_count": null,
 44 |    "metadata": {},
 45 |    "outputs": [],
 46 |    "source": [
 47 |     "geoms = list(map(lambda x: shape(x[\"geometry\"]), geojson[\"features\"]))"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": null,
 53 |    "metadata": {},
 54 |    "outputs": [],
 55 |    "source": [
 56 |     "g1 = geoms[0]"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "code",
 61 |    "execution_count": null,
 62 |    "metadata": {},
 63 |    "outputs": [],
 64 |    "source": [
 65 |     "full = g1\n",
 66 |     "for g in geoms[1:]:\n",
 67 |     "    full = full.union(g)"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": null,
 73 |    "metadata": {},
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "len(geoms)"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": null,
 82 |    "metadata": {},
 83 |    "outputs": [],
 84 |    "source": [
 85 |     "s = set([])\n",
 86 |     "for g in geoms:\n",
 87 |     "    s.add(g.geom_type)\n",
 88 |     "s\n"
 89 |    ]
 90 |   },
 91 |   {
 92 |    "cell_type": "code",
 93 |    "execution_count": null,
 94 |    "metadata": {},
 95 |    "outputs": [],
 96 |    "source": [
 97 |     "lens= []\n",
 98 |     "for g in geoms:\n",
 99 |     "    g.geom_type"
100 |    ]
101 |   },
102 |   {
103 |    "cell_type": "code",
104 |    "execution_count": null,
105 |    "metadata": {},
106 |    "outputs": [],
107 |    "source": [
108 |     "def get_size(g):\n",
109 |     "    b = g.bounds\n",
110 |     "    w = b[2] - b[0]\n",
111 |     "    h = b[3] - b[1]\n",
112 |     "    return math.sqrt(w*w + h*h)\n",
113 |     "max_size = 0.0\n",
114 |     "max_size_geom = None\n",
115 |     "for g in geoms:\n",
116 |     "    s = get_size(g)\n",
117 |     "    if max_size < s:\n",
118 |     "        max_size = s\n",
119 |     "        max_size_geom = g\n",
120 |     "g"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "code",
125 |    "execution_count": null,
126 |    "metadata": {},
127 |    "outputs": [],
128 |    "source": [
129 |     "geoms.sort(key=lambda g: -get_size(g))"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": null,
135 |    "metadata": {},
136 |    "outputs": [],
137 |    "source": [
138 |     "gj = mapping(geoms[0])"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "code",
143 |    "execution_count": null,
144 |    "metadata": {},
145 |    "outputs": [],
146 |    "source": [
147 |     "open('/tmp/poly.json', 'w').write(json.dumps(gj))"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "code",
152 |    "execution_count": null,
153 |    "metadata": {},
154 |    "outputs": [],
155 |    "source": [
156 |     "bg = geoms[0]"
157 |    ]
158 |   },
159 |   {
160 |    "cell_type": "code",
161 |    "execution_count": null,
162 |    "metadata": {},
163 |    "outputs": [],
164 |    "source": [
165 |     "p = Polygon(bg.exterior)\n"
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "code",
170 |    "execution_count": null,
171 |    "metadata": {},
172 |    "outputs": [],
173 |    "source": [
174 |     "p"
175 |    ]
176 |   },
177 |   {
178 |    "cell_type": "code",
179 |    "execution_count": null,
180 |    "metadata": {},
181 |    "outputs": [],
182 |    "source": []
183 |   }
184 |  ],
185 |  "metadata": {
186 |   "kernelspec": {
187 |    "display_name": "GeoPySpark",
188 |    "language": "python",
189 |    "name": "gps"
190 |   },
191 |   "language_info": {
192 |    "codemirror_mode": {
193 |     "name": "ipython",
194 |     "version": 3
195 |    },
196 |    "file_extension": ".py",
197 |    "mimetype": "text/x-python",
198 |    "name": "python",
199 |    "nbconvert_exporter": "python",
200 |    "pygments_lexer": "ipython3",
201 |    "version": "3.4.6"
202 |   }
203 |  },
204 |  "nbformat": 4,
205 |  "nbformat_minor": 2
206 | }
207 | 
--------------------------------------------------------------------------------
/notebooks/Landsat.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import rasterio\n",
 10 |     "import rasterio.features\n",
 11 |     "import rasterio.warp\n",
 12 |     "import geopyspark as gps\n",
 13 |     "import numpy as np\n",
 14 |     "import csv\n",
 15 |     "import matplotlib.pyplot as plt\n",
 16 |     "\n",
 17 |     "from datetime import datetime\n",
 18 |     "from pyspark import SparkContext\n",
 19 |     "from osgeo import osr\n",
 20 |     "\n",
 21 |     "%matplotlib inline"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": null,
 27 |    "metadata": {},
 28 |    "outputs": [],
 29 |    "source": [
 30 |     "sc = SparkContext(conf=gps.geopyspark_conf(appName=\"Landsat\").set(\"spark.ui.enabled\",True))"
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "code",
 35 |    "execution_count": null,
 36 |    "metadata": {},
 37 |    "outputs": [],
 38 |    "source": [
 39 |     "csv_data = [{'uri': 's3://landsat-pds/L8/107/035/LC81070352015218LGN00/LC81070352015218LGN00_B2.TIF', 'scene_id': 'LC81070352015218LGN00', 'date': '2015218', 'band': '2'},\n",
 40 |     "            {'uri': 's3://landsat-pds/L8/107/035/LC81070352015218LGN00/LC81070352015218LGN00_B3.TIF', 'scene_id': 'LC81070352015218LGN00', 'date': '2015218', 'band': '3'},\n",
 41 |     "            {'uri': 's3://landsat-pds/L8/107/035/LC81070352015218LGN00/LC81070352015218LGN00_B4.TIF', 'scene_id': 'LC81070352015218LGN00', 'date': '2015218', 'band': '4'},\n",
 42 |     "            {'uri': 's3://landsat-pds/L8/107/035/LC81070352015218LGN00/LC81070352015218LGN00_B5.TIF', 'scene_id': 'LC81070352015218LGN00', 'date': '2015218', 'band': '5'}]"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": null,
 48 |    "metadata": {},
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "rdd0 = sc.parallelize(csv_data)"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": null,
 57 |    "metadata": {},
 58 |    "outputs": [],
 59 |    "source": [
 60 |     "def get_metadata(line):\n",
 61 |     "    \n",
 62 |     "    try:\n",
 63 |     "        with rasterio.open(line['uri']) as dataset:\n",
 64 |     "            bounds = dataset.bounds\n",
 65 |     "            height = height = dataset.height\n",
 66 |     "            width = dataset.width\n",
 67 |     "            crs = dataset.get_crs()\n",
 68 |     "            srs = osr.SpatialReference()\n",
 69 |     "            srs.ImportFromWkt(crs.wkt)\n",
 70 |     "            proj4 = srs.ExportToProj4()\n",
 71 |     "            ws = [w for (ij, w) in dataset.block_windows()]\n",
 72 |     "    except:\n",
 73 |     "            ws = []\n",
 74 |     "            \n",
 75 |     "    def windows(line, ws):\n",
 76 |     "        for w in ws:\n",
 77 |     "            ((row_start, row_stop), (col_start, col_stop)) = w\n",
 78 |     "\n",
 79 |     "            left  = bounds.left + (bounds.right - bounds.left)*(float(col_start)/width)\n",
 80 |     "            right = bounds.left + (bounds.right - bounds.left)*(float(col_stop)/ width)\n",
 81 |     "            bottom = bounds.top + (bounds.bottom - bounds.top)*(float(row_stop)/height)\n",
 82 |     "            top = bounds.top + (bounds.bottom - bounds.top)*(float(row_start)/height)\n",
 83 |     "            extent = gps.Extent(left,bottom,right,top)\n",
 84 |     "            instant = datetime.strptime(line['date'], '%Y%j')\n",
 85 |     "                \n",
 86 |     "            new_line = line.copy()\n",
 87 |     "            new_line.pop('date')\n",
 88 |     "            new_line.pop('scene_id')\n",
 89 |     "            new_line['window'] = w\n",
 90 |     "            new_line['projected_extent'] = gps.TemporalProjectedExtent(extent=extent, instant=instant, proj4=proj4)\n",
 91 |     "            yield new_line\n",
 92 |     "    \n",
 93 |     "    return [i for i in windows(line, ws)]\n"
 94 |    ]
 95 |   },
 96 |   {
 97 |    "cell_type": "code",
 98 |    "execution_count": null,
 99 |    "metadata": {},
100 |    "outputs": [],
101 |    "source": [
102 |     "rdd1 = rdd0.flatMap(get_metadata)\n",
103 |     "rdd1.first()"
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "code",
108 |    "execution_count": null,
109 |    "metadata": {},
110 |    "outputs": [],
111 |    "source": [
112 |     "def get_data(line):\n",
113 |     "    \n",
114 |     "    new_line = line.copy()\n",
115 |     "\n",
116 |     "    with rasterio.open(line['uri']) as dataset:\n",
117 |     "        new_line['data'] = dataset.read(1, window=line['window'])\n",
118 |     "        new_line.pop('window')\n",
119 |     "        new_line.pop('uri')\n",
120 |     "    \n",
121 |     "    return new_line"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": null,
127 |    "metadata": {},
128 |    "outputs": [],
129 |    "source": [
130 |     "rdd2 = rdd1.map(get_data)\n",
131 |     "rdd2.first()"
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "code",
136 |    "execution_count": null,
137 |    "metadata": {},
138 |    "outputs": [],
139 |    "source": [
140 |     "rdd3 = rdd2.groupBy(lambda line: line['projected_extent'])\n",
141 |     "rdd3.first()"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "code",
146 |    "execution_count": null,
147 |    "metadata": {},
148 |    "outputs": [],
149 |    "source": [
150 |     "def make_tiles(line):\n",
151 |     "    projected_extent = line[0]\n",
152 |     "    bands = sorted(line[1], key=lambda l: l['band'])\n",
153 |     "    array = np.array([l['data'] for l in bands])\n",
154 |     "    tile = gps.Tile.from_numpy_array(array, no_data_value=0)\n",
155 |     "    return (projected_extent, tile)\n",
156 |     "\n",
157 |     "def interesting_tile(line):\n",
158 |     "    [tpe, tile] = line\n",
159 |     "    return (np.sum(tile[0][0]) != 0)\n",
160 |     "\n",
161 |     "def square_tile(line):\n",
162 |     "    [tpe, tile] = line\n",
163 |     "    return tile[0][0].shape == (512,512)"
164 |    ]
165 |   },
166 |   {
167 |    "cell_type": "code",
168 |    "execution_count": null,
169 |    "metadata": {},
170 |    "outputs": [],
171 |    "source": [
172 |     "rdd4 = rdd3.map(make_tiles).filter(square_tile)\n",
173 |     "data = rdd4.filter(interesting_tile).first()\n",
174 |     "data"
175 |    ]
176 |   },
177 |   {
178 |    "cell_type": "code",
179 |    "execution_count": null,
180 |    "metadata": {},
181 |    "outputs": [],
182 |    "source": [
183 |     "plt.imshow(data[1][0][0])"
184 |    ]
185 |   },
186 |   {
187 |    "cell_type": "code",
188 |    "execution_count": null,
189 |    "metadata": {},
190 |    "outputs": [],
191 |    "source": [
192 |     "raster_layer = gps.RasterLayer.from_numpy_rdd(gps.LayerType.SPACETIME, rdd4)"
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "code",
197 |    "execution_count": null,
198 |    "metadata": {},
199 |    "outputs": [],
200 |    "source": [
201 |     "tiled_raster_layer = raster_layer.tile_to_layout(layout = gps.GlobalLayout(), target_crs=3857)"
202 |    ]
203 |   },
204 |   {
205 |    "cell_type": "code",
206 |    "execution_count": null,
207 |    "metadata": {},
208 |    "outputs": [],
209 |    "source": [
210 |     "pyramid = tiled_raster_layer.pyramid()"
211 |    ]
212 |   },
213 |   {
214 |    "cell_type": "code",
215 |    "execution_count": null,
216 |    "metadata": {},
217 |    "outputs": [],
218 |    "source": [
219 |     "for layer in pyramid.levels.values():\n",
220 |     "    gps.write(\"file:///tmp/catalog/\", \"landsat\", layer, time_unit=gps.TimeUnit.DAYS)"
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "markdown",
225 |    "metadata": {},
226 |    "source": [
227 |     "## Display (Optional) ##"
228 |    ]
229 |   },
230 |   {
231 |    "cell_type": "code",
232 |    "execution_count": null,
233 |    "metadata": {},
234 |    "outputs": [],
235 |    "source": [
236 |     "pyramid = tiled_raster_layer.to_spatial_layer().pyramid()"
237 |    ]
238 |   },
239 |   {
240 |    "cell_type": "code",
241 |    "execution_count": null,
242 |    "metadata": {},
243 |    "outputs": [],
244 |    "source": [
245 |     "for layer in pyramid.levels.values():\n",
246 |     "    gps.write(\"file:///tmp/catalog/\", \"landsat-spatial\", layer)"
247 |    ]
248 |   },
249 |   {
250 |    "cell_type": "code",
251 |    "execution_count": null,
252 |    "metadata": {},
253 |    "outputs": [],
254 |    "source": [
255 |     "from PIL import Image\n",
256 |     "\n",
257 |     "def render_tile(tile):\n",
258 |     "    norm = np.uint8(tile[0] / tile[0].max() * 255)\n",
259 |     "    mask = np.uint8((norm[0] != 0) * 255)\n",
260 |     "    return Image.fromarray(np.dstack([norm[2], norm[1], norm[0], mask]), mode='RGBA')"
261 |    ]
262 |   },
263 |   {
264 |    "cell_type": "code",
265 |    "execution_count": null,
266 |    "metadata": {},
267 |    "outputs": [],
268 |    "source": [
269 |     "tms_server = gps.TMS.build((\"file:///tmp/catalog/\", \"landsat-spatial\"), display=render_tile)\n",
270 |     "tms_server.bind('0.0.0.0')"
271 |    ]
272 |   },
273 |   {
274 |    "cell_type": "code",
275 |    "execution_count": null,
276 |    "metadata": {},
277 |    "outputs": [],
278 |    "source": [
279 |     "import folium\n",
280 |     "\n",
281 |     "m = folium.Map(tiles='Stamen Terrain', location=[35.6, 140.1], zoom_start=5)\n",
282 |     "folium.TileLayer(tiles=tms_server.url_pattern, attr='GeoPySpark').add_to(m)\n",
283 |     "m"
284 |    ]
285 |   },
286 |   {
287 |    "cell_type": "code",
288 |    "execution_count": null,
289 |    "metadata": {},
290 |    "outputs": [],
291 |    "source": []
292 |   }
293 |  ],
294 |  "metadata": {
295 |   "kernelspec": {
296 |    "display_name": "GeoPySpark",
297 |    "language": "python",
298 |    "name": "gps"
299 |   },
300 |   "language_info": {
301 |    "codemirror_mode": {
302 |     "name": "ipython",
303 |     "version": 3
304 |    },
305 |    "file_extension": ".py",
306 |    "mimetype": "text/x-python",
307 |    "name": "python",
308 |    "nbconvert_exporter": "python",
309 |    "pygments_lexer": "ipython3",
310 |    "version": "3.4.6"
311 |   }
312 |  },
313 |  "nbformat": 4,
314 |  "nbformat_minor": 2
315 | }
316 | 
--------------------------------------------------------------------------------
/notebooks/NLCD viewer.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "from functools import partial\n",
 10 |     "import geopyspark as gps\n",
 11 |     "import numpy as np\n",
 12 |     "import fiona\n",
 13 |     "import json\n",
 14 |     "import pyproj\n",
 15 |     "\n",
 16 |     "from pyspark import SparkContext\n",
 17 |     "from colortools import Color\n",
 18 |     "\n",
 19 |     "from shapely.geometry import mapping, shape\n",
 20 |     "from shapely.ops import transform\n",
 21 |     "\n",
 22 |     "from folium import Map, TileLayer, GeoJson"
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "code",
 27 |    "execution_count": null,
 28 |    "metadata": {},
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "conf = gps.geopyspark_conf(master=\"local[*]\", appName=\"NLCD Viewer\")\n",
 32 |     "sc = SparkContext(conf=conf)"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "code",
 37 |    "execution_count": null,
 38 |    "metadata": {},
 39 |    "outputs": [],
 40 |    "source": [
 41 |     "catalog_uri = \"s3://azavea-datahub/catalog\"\n",
 42 |     "layer_name = \"nlcd-2011-epsg3857\""
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "markdown",
 47 |    "metadata": {},
 48 |    "source": [
 49 |     "## Viewing NLCD"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "code",
 54 |    "execution_count": null,
 55 |    "metadata": {},
 56 |    "outputs": [],
 57 |    "source": [
 58 |     "nlcd_cmap = gps.ColorMap.nlcd_colormap()\n",
 59 |     "nlcd_tms_server = gps.TMS.build((catalog_uri, layer_name), display=nlcd_cmap)\n",
 60 |     "nlcd_tms_server.bind('0.0.0.0')\n",
 61 |     "nlcd_tms_server.url_pattern"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": null,
 67 |    "metadata": {},
 68 |    "outputs": [],
 69 |    "source": [
 70 |     "m = Map(tiles='Stamen Terrain', location=[37.1, -95.7], zoom_start=4)\n",
 71 |     "TileLayer(tiles=nlcd_tms_server.url_pattern, attr='GeoPySpark Tiles').add_to(m)\n",
 72 |     "m"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "markdown",
 77 |    "metadata": {},
 78 |    "source": [
 79 |     "## Viewing reclassified tiles\n",
 80 |     "\n",
 81 |     "This example shows how to do custom, on-the-fly display from an existing catalog using a callback to a Python rendering function.  This method is much slower than using color maps.  Please be patient during map display/zooming."
 82 |    ]
 83 |   },
 84 |   {
 85 |    "cell_type": "code",
 86 |    "execution_count": null,
 87 |    "metadata": {},
 88 |    "outputs": [],
 89 |    "source": [
 90 |     "import struct\n",
 91 |     "from PIL import Image\n",
 92 |     "\n",
 93 |     "def from_color_get_component(i):\n",
 94 |     "    def fn(x):\n",
 95 |     "        split = struct.Struct(\">I\").pack\n",
 96 |     "        r,g,b,a = split(x & 0xffffffff)\n",
 97 |     "        return np.array([r,g,b,a], dtype='uint8')[i]\n",
 98 |     "    return fn\n",
 99 |     "\n",
100 |     "def render_tile(tile):\n",
101 |     "    rr = np.vectorize(from_color_get_component(0))(tile)\n",
102 |     "    gg = np.vectorize(from_color_get_component(1))(tile)\n",
103 |     "    bb = np.vectorize(from_color_get_component(2))(tile)\n",
104 |     "    aa = np.vectorize(from_color_get_component(3))(tile)\n",
105 |     "    return Image.fromarray(np.dstack([rr, gg, bb, aa]), mode='RGBA')"
106 |    ]
107 |   },
108 |   {
109 |    "cell_type": "code",
110 |    "execution_count": null,
111 |    "metadata": {},
112 |    "outputs": [],
113 |    "source": [
114 |     "from PIL import Image\n",
115 |     "import struct\n",
116 |     "\n",
117 |     "def render_cultivated(tile):\n",
118 |     "    # NLCD codes in the 80's are Planted/Cultivated\n",
119 |     "    # See https://www.mrlc.gov/nlcd11_leg.php\n",
120 |     "    colorize = np.vectorize(lambda x: 0x7110b2aa if ((80 <= x) & (x < 90)) else 0x00000000)\n",
121 |     "    return render_tile(colorize(tile[0][0]))"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": null,
127 |    "metadata": {},
128 |    "outputs": [],
129 |    "source": [
130 |     "custom_nlcd_tms_server = gps.TMS.build((catalog_uri, layer_name), display=render_cultivated)\n",
131 |     "custom_nlcd_tms_server.bind('0.0.0.0')\n",
132 |     "custom_nlcd_tms_server.url_pattern"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": null,
138 |    "metadata": {},
139 |    "outputs": [],
140 |    "source": [
141 |     "m = Map(tiles='Stamen Terrain', location=[37.1, -95.7], zoom_start=4)\n",
142 |     "TileLayer(tiles=custom_nlcd_tms_server.url_pattern, attr='GeoPySpark Tiles').add_to(m)\n",
143 |     "m"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "markdown",
148 |    "metadata": {},
149 |    "source": [
150 |     "## Chattanooga geometry"
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "code",
155 |    "execution_count": null,
156 |    "metadata": {},
157 |    "outputs": [],
158 |    "source": [
159 |     "!curl -o /tmp/mask.json https://s3.amazonaws.com/chattademo/chatta_mask.json"
160 |    ]
161 |   },
162 |   {
163 |    "cell_type": "code",
164 |    "execution_count": null,
165 |    "metadata": {},
166 |    "outputs": [],
167 |    "source": [
168 |     "txt = open('/tmp/mask.json').read()\n",
169 |     "js = json.loads(txt)\n",
170 |     "geom = shape(js)\n",
171 |     "center = geom.centroid\n",
172 |     "chatta_center = [center.y, center.x] # Location in lat/long"
173 |    ]
174 |   },
175 |   {
176 |    "cell_type": "code",
177 |    "execution_count": null,
178 |    "metadata": {},
179 |    "outputs": [],
180 |    "source": [
181 |     "GeoJson('/tmp/mask.json', name='Chattanooga').add_to(m)"
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "code",
186 |    "execution_count": null,
187 |    "metadata": {},
188 |    "outputs": [],
189 |    "source": [
190 |     "m.location = chatta_center\n",
191 |     "m.zoom_start = 8\n",
192 |     "m"
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "markdown",
197 |    "metadata": {},
198 |    "source": [
199 |     "## Fetching an RDD of NLCD masked to Chattanooga"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "code",
204 |    "execution_count": null,
205 |    "metadata": {},
206 |    "outputs": [],
207 |    "source": [
208 |     "project = partial(\n",
209 |     "    pyproj.transform,\n",
210 |     "    pyproj.Proj(init='epsg:4326'),\n",
211 |     "    pyproj.Proj(init='epsg:3857'))\n",
212 |     "\n",
213 |     "chatta_poly = transform(project, geom)"
214 |    ]
215 |   },
216 |   {
217 |    "cell_type": "code",
218 |    "execution_count": null,
219 |    "metadata": {},
220 |    "outputs": [],
221 |    "source": [
222 |     "query_rdd = gps.query(catalog_uri,\n",
223 |     "                      layer_name,\n",
224 |     "                      12,\n",
225 |     "                      query_geom=chatta_poly)"
226 |    ]
227 |   },
228 |   {
229 |    "cell_type": "code",
230 |    "execution_count": null,
231 |    "metadata": {},
232 |    "outputs": [],
233 |    "source": [
234 |     "masked = query_rdd.mask([chatta_poly])\n",
235 |     "masked_tms_server = gps.TMS.build(masked.pyramid(), display=nlcd_cmap)\n",
236 |     "masked_tms_server.bind('0.0.0.0')"
237 |    ]
238 |   },
239 |   {
240 |    "cell_type": "code",
241 |    "execution_count": null,
242 |    "metadata": {},
243 |    "outputs": [],
244 |    "source": [
245 |     "chatta_map = Map(tiles='Stamen Terrain', location=chatta_center, zoom_start=8)\n",
246 |     "TileLayer(tiles=masked_tms_server.url_pattern, attr='GeoPySpark Tiles').add_to(chatta_map)\n",
247 |     "GeoJson('/tmp/mask.json', name='Chattanooga').add_to(chatta_map)\n",
248 |     "chatta_map"
249 |    ]
250 |   },
251 |   {
252 |    "cell_type": "markdown",
253 |    "metadata": {},
254 |    "source": [
255 |     "## Reclassifying an RDD"
256 |    ]
257 |   },
258 |   {
259 |    "cell_type": "code",
260 |    "execution_count": null,
261 |    "metadata": {},
262 |    "outputs": [],
263 |    "source": [
264 |     "reclassified = masked.reclassify({0: 1, 80: 2, 90: 1},\n",
265 |     "                                 int,\n",
266 |     "                                 gps.ClassificationStrategy.GREATER_THAN_OR_EQUAL_TO).repartition(150)"
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "code",
271 |    "execution_count": null,
272 |    "metadata": {},
273 |    "outputs": [],
274 |    "source": [
275 |     "colors = gps.get_colors_from_colors(\n",
276 |     "    [Color(\"#CA9146FF\"), Color(\"#00FFAA88\")])\n",
277 |     "\n",
278 |     "breaks = {\n",
279 |     "    1: colors[0],\n",
280 |     "    2: colors[1]\n",
281 |     "}\n",
282 |     "\n",
283 |     "reclassified_cmap = gps.ColorMap.build(breaks)"
284 |    ]
285 |   },
286 |   {
287 |    "cell_type": "code",
288 |    "execution_count": null,
289 |    "metadata": {},
290 |    "outputs": [],
291 |    "source": [
292 |     "reclassified_tms_server = gps.TMS.build(reclassified.pyramid(), display=reclassified_cmap)\n",
293 |     "reclassified_tms_server.bind('0.0.0.0')"
294 |    ]
295 |   },
296 |   {
297 |    "cell_type": "code",
298 |    "execution_count": null,
299 |    "metadata": {},
300 |    "outputs": [],
301 |    "source": [
302 |     "reclass_map = Map(tiles='Stamen Terrain', location=chatta_center, zoom_start=8)\n",
303 |     "TileLayer(tiles=reclassified_tms_server.url_pattern, attr='GeoPySpark Tiles').add_to(reclass_map)\n",
304 |     "GeoJson('/tmp/mask.json', name='Chattanooga').add_to(reclass_map)\n",
305 |     "reclass_map"
306 |    ]
307 |   },
308 |   {
309 |    "cell_type": "markdown",
310 |    "metadata": {},
311 |    "source": [
312 |     "## Saving the reclassified layer locally"
313 |    ]
314 |   },
315 |   {
316 |    "cell_type": "code",
317 |    "execution_count": null,
318 |    "metadata": {},
319 |    "outputs": [],
320 |    "source": [
321 |     "local_catalog_uri = \"file:///tmp/catalog\"\n",
322 |     "local_layer_name = \"cultivated-land-cover\""
323 |    ]
324 |   },
325 |   {
326 |    "cell_type": "code",
327 |    "execution_count": null,
328 |    "metadata": {},
329 |    "outputs": [],
330 |    "source": [
331 |     "for layer in reclassified.pyramid().levels.values():\n",
332 |     "    gps.write(local_catalog_uri, local_layer_name, layer)"
333 |    ]
334 |   },
335 |   {
336 |    "cell_type": "markdown",
337 |    "metadata": {},
338 |    "source": [
339 |     "## Viewing the local Layer"
340 |    ]
341 |   },
342 |   {
343 |    "cell_type": "code",
344 |    "execution_count": null,
345 |    "metadata": {},
346 |    "outputs": [],
347 |    "source": [
348 |     "nlcd_tms_server.unbind()\n",
349 |     "custom_nlcd_tms_server.unbind()\n",
350 |     "masked_tms_server.unbind()\n",
351 |     "reclassified_tms_server.unbind()"
352 |    ]
353 |   },
354 |   {
355 |    "cell_type": "code",
356 |    "execution_count": null,
357 |    "metadata": {},
358 |    "outputs": [],
359 |    "source": [
360 |     "local_tms_server = gps.TMS.build((local_catalog_uri, local_layer_name), reclassified_cmap)\n",
361 |     "local_tms_server.bind('0.0.0.0')"
362 |    ]
363 |   },
364 |   {
365 |    "cell_type": "code",
366 |    "execution_count": null,
367 |    "metadata": {
368 |     "scrolled": false
369 |    },
370 |    "outputs": [],
371 |    "source": [
372 |     "local_map = Map(tiles='Stamen Terrain', location=chatta_center, zoom_start=8)\n",
373 |     "TileLayer(tiles=local_tms_server.url_pattern, attr='GeoPySpark Tiles').add_to(local_map)\n",
374 |     "GeoJson('/tmp/mask.json', name='Chattanooga').add_to(local_map)\n",
375 |     "local_map"
376 |    ]
377 |   },
378 |   {
379 |    "cell_type": "code",
380 |    "execution_count": null,
381 |    "metadata": {},
382 |    "outputs": [],
383 |    "source": [
384 |     "layers = [gps.query(local_catalog_uri, local_layer_name, x) for x in range(0, 11)]"
385 |    ]
386 |   },
387 |   {
388 |    "cell_type": "code",
389 |    "execution_count": null,
390 |    "metadata": {},
391 |    "outputs": [],
392 |    "source": [
393 |     "read_in_pyramid = gps.Pyramid(layers)"
394 |    ]
395 |   },
396 |   {
397 |    "cell_type": "code",
398 |    "execution_count": null,
399 |    "metadata": {},
400 |    "outputs": [],
401 |    "source": [
402 |     "# This cannot display as well\n",
403 |     "server = gps.TMS.build(read_in_pyramid, reclassified_cmap)\n",
404 |     "server.bind('0.0.0.0')"
405 |    ]
406 |   },
407 |   {
408 |    "cell_type": "code",
409 |    "execution_count": null,
410 |    "metadata": {},
411 |    "outputs": [],
412 |    "source": [
413 |     "rdd_map = Map(tiles='Stamen Terrain', location=chatta_center, zoom_start=8)\n",
414 |     "TileLayer(tiles=server.url_pattern, attr='GeoPySpark Tiles').add_to(rdd_map)\n",
415 |     "GeoJson('/tmp/mask.json', name='Chattanooga').add_to(rdd_map)\n",
416 |     "rdd_map"
417 |    ]
418 |   }
419 |  ],
420 |  "metadata": {
421 |   "kernelspec": {
422 |    "display_name": "GeoPySpark",
423 |    "language": "python",
424 |    "name": "gps"
425 |   },
426 |   "language_info": {
427 |    "codemirror_mode": {
428 |     "name": "ipython",
429 |     "version": 3
430 |    },
431 |    "file_extension": ".py",
432 |    "mimetype": "text/x-python",
433 |    "name": "python",
434 |    "nbconvert_exporter": "python",
435 |    "pygments_lexer": "ipython3",
436 |    "version": "3.4.6"
437 |   }
438 |  },
439 |  "nbformat": 4,
440 |  "nbformat_minor": 2
441 | }
442 | 
--------------------------------------------------------------------------------
/notebooks/Park citing.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Finding the Optimal Location for a New Park\n",
  8 |     "\n",
  9 |     "This example notebook will show how to find the next potential location for a new park in San Fransisco. To accomplish this, three factors will be taken into consideration when deciding on a possible spot: existing parks, schools, and Bay Area Regional Transit (BART) stops. By calculating Euclidean Distance for these three factors and then weighing them together, we will be able to produce a visual representation of where is and is not a good location for a new park.\n",
 10 |     "\n",
 11 |     "## Importing the Libraries"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": null,
 17 |    "metadata": {},
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "import geopyspark as gps\n",
 21 |     "import fiona\n",
 22 |     "\n",
 23 |     "from pyspark import SparkContext, StorageLevel\n",
 24 |     "from shapely.geometry import MultiPoint, MultiPolygon, shape\n",
 25 |     "\n",
 26 |     "import folium"
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "markdown",
 31 |    "metadata": {},
 32 |    "source": [
 33 |     "## Setup the SparkContext"
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": null,
 39 |    "metadata": {},
 40 |    "outputs": [],
 41 |    "source": [
 42 |     "conf = gps.geopyspark_conf(appName=\"park-siting\", master=\"local[*]\")\n",
 43 |     "sc = SparkContext.getOrCreate(conf=conf)"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "markdown",
 48 |    "metadata": {},
 49 |    "source": [
 50 |     "## Set map display parameters"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": null,
 56 |    "metadata": {},
 57 |    "outputs": [],
 58 |    "source": [
 59 |     "center = [37.8, -122.2]\n",
 60 |     "zoom_start = 9.5"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "markdown",
 65 |    "metadata": {},
 66 |    "source": [
 67 |     "## Download the Geometries as GeoJsons"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": null,
 73 |    "metadata": {},
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "!curl -o /tmp/bart.geojson https://s3.amazonaws.com/geopyspark-demo/bayarea/bart.geojson\n",
 77 |     "!curl -o /tmp/school.geojson https://s3.amazonaws.com/geopyspark-demo/bayarea/school.geojson\n",
 78 |     "!curl -o /tmp/parks.geojson https://s3.amazonaws.com/geopyspark-demo/bayarea/parks.geojson"
 79 |    ]
 80 |   },
 81 |   {
 82 |    "cell_type": "markdown",
 83 |    "metadata": {},
 84 |    "source": [
 85 |     "## Read in the GeoJsons as Shapely Geometries"
 86 |    ]
 87 |   },
 88 |   {
 89 |    "cell_type": "code",
 90 |    "execution_count": null,
 91 |    "metadata": {},
 92 |    "outputs": [],
 93 |    "source": [
 94 |     "with fiona.open(\"/tmp/bart.geojson\") as source:\n",
 95 |     "    bart_crs = source.crs['init']\n",
 96 |     "    bart = MultiPoint([shape(f['geometry']) for f in source])\n",
 97 |     "\n",
 98 |     "with fiona.open(\"/tmp/school.geojson\") as source:\n",
 99 |     "    schools_crs = source.crs['init']\n",
100 |     "    schools = MultiPoint([shape(f['geometry']) for f in source])\n",
101 |     "\n",
102 |     "with fiona.open(\"/tmp/parks.geojson\") as source:\n",
103 |     "    parks_crs = source.crs['init']\n",
104 |     "    parks = MultiPolygon([shape(f['geometry']) for f in source])"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "markdown",
109 |    "metadata": {},
110 |    "source": [
111 |     "## Calculate Euclidean Distance for Each Geometry\n",
112 |     "\n",
113 |     "Three new `TiledRasterLayer`s will be produced from the Euclidean Distance calculations for each geometry. All resulting layers will have a `zoom_level` of 12."
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "code",
118 |    "execution_count": null,
119 |    "metadata": {},
120 |    "outputs": [],
121 |    "source": [
122 |     "bart_layer = gps.euclidean_distance(geometry=bart,\n",
123 |     "                                    source_crs=bart_crs,\n",
124 |     "                                    zoom=12)\n",
125 |     "\n",
126 |     "schools_layer = gps.euclidean_distance(geometry=schools,\n",
127 |     "                                       source_crs=schools_crs,\n",
128 |     "                                       zoom=12)\n",
129 |     "\n",
130 |     "parks_layer = gps.euclidean_distance(geometry=parks,\n",
131 |     "                                     source_crs=parks_crs,\n",
132 |     "                                     zoom=12)\n",
133 |     "\n",
134 |     "# Persists each layer to memory and disk\n",
135 |     "bart_layer.persist(StorageLevel.MEMORY_AND_DISK)\n",
136 |     "schools_layer.persist(StorageLevel.MEMORY_AND_DISK)\n",
137 |     "parks_layer.persist(StorageLevel.MEMORY_AND_DISK)"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "markdown",
142 |    "metadata": {},
143 |    "source": [
144 |     "## Weighing the Layers Together"
145 |    ]
146 |   },
147 |   {
148 |    "cell_type": "code",
149 |    "execution_count": null,
150 |    "metadata": {},
151 |    "outputs": [],
152 |    "source": [
153 |     "weighted_layer = -1 * bart_layer - schools_layer + 3 * parks_layer\n",
154 |     "\n",
155 |     "# Persists the weighted layer to memory and disk\n",
156 |     "weighted_layer.persist(StorageLevel.MEMORY_AND_DISK)"
157 |    ]
158 |   },
159 |   {
160 |    "cell_type": "markdown",
161 |    "metadata": {},
162 |    "source": [
163 |     "## Reprojecting, Pyramiding, and Calculating the Histogram"
164 |    ]
165 |   },
166 |   {
167 |    "cell_type": "code",
168 |    "execution_count": null,
169 |    "metadata": {},
170 |    "outputs": [],
171 |    "source": [
172 |     "# The following code may take awhile to complete\n",
173 |     "reprojected = weighted_layer.tile_to_layout(layout=gps.GlobalLayout(),\n",
174 |     "                                            target_crs=\"EPSG:3857\")\n",
175 |     "pyramid = reprojected.pyramid(resample_method=gps.ResampleMethod.AVERAGE)\n",
176 |     "histogram = pyramid.get_histogram()"
177 |    ]
178 |   },
179 |   {
180 |    "cell_type": "markdown",
181 |    "metadata": {},
182 |    "source": [
183 |     "## Creating the ColorMap\n",
184 |     "\n",
185 |     "The below code creates a `ColorMap` instance using the `Histogram` from `pyramid` for its `breaks`. For the color, the `matplotlib` color palette, `viridus` will be used."
186 |    ]
187 |   },
188 |   {
189 |    "cell_type": "code",
190 |    "execution_count": null,
191 |    "metadata": {},
192 |    "outputs": [],
193 |    "source": [
194 |     "color_map = gps.ColorMap.build(breaks=histogram,\n",
195 |     "                               colors='viridis')"
196 |    ]
197 |   },
198 |   {
199 |    "cell_type": "markdown",
200 |    "metadata": {},
201 |    "source": [
202 |     "## Running the Server"
203 |    ]
204 |   },
205 |   {
206 |    "cell_type": "code",
207 |    "execution_count": null,
208 |    "metadata": {},
209 |    "outputs": [],
210 |    "source": [
211 |     "tms = gps.TMS.build(source=pyramid,\n",
212 |     "                    display=color_map)\n",
213 |     "\n",
214 |     "tms.bind('0.0.0.0')"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": null,
220 |    "metadata": {},
221 |    "outputs": [],
222 |    "source": [
223 |     "m = folium.Map(tiles='OpenStreetMap', location=center, zoom_start=zoom_start)\n",
224 |     "folium.TileLayer(tiles=tms.url_pattern, overlay=True, attr='GeoPySpark tiles').add_to(m)\n",
225 |     "folium.GeoJson(data='/tmp/bart.geojson', name='BART stops').add_to(m)\n",
226 |     "folium.GeoJson(data='/tmp/parks.geojson', name='Parks').add_to(m)\n",
227 |     "folium.LayerControl().add_to(m)\n",
228 |     "m"
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "markdown",
233 |    "metadata": {},
234 |    "source": [
235 |     "## Cleaning up"
236 |    ]
237 |   },
238 |   {
239 |    "cell_type": "code",
240 |    "execution_count": null,
241 |    "metadata": {},
242 |    "outputs": [],
243 |    "source": [
244 |     "tms.unbind()"
245 |    ]
246 |   }
247 |  ],
248 |  "metadata": {
249 |   "kernelspec": {
250 |    "display_name": "GeoPySpark",
251 |    "language": "python",
252 |    "name": "gps"
253 |   },
254 |   "language_info": {
255 |    "codemirror_mode": {
256 |     "name": "ipython",
257 |     "version": 3
258 |    },
259 |    "file_extension": ".py",
260 |    "mimetype": "text/x-python",
261 |    "name": "python",
262 |    "nbconvert_exporter": "python",
263 |    "pygments_lexer": "ipython3",
264 |    "version": "3.4.6"
265 |   }
266 |  },
267 |  "nbformat": 4,
268 |  "nbformat_minor": 2
269 | }
270 | 
--------------------------------------------------------------------------------
/notebooks/Pine Habitat.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# This tutorial will show you how to find the suitable habitat range for Bristlecone pine using GeoPySpark\n",
  8 |     "\n",
  9 |     "This tutorial will focus on GeoPySpark functionality, but you can find more resources and tutorials about GeoNotebooks [here](https://github.com/OpenGeoscience/geonotebook/tree/master/notebooks).\n",
 10 |     "\n",
 11 |     "### Suitability analysis is a classic GIS case study that enables the combination of factors to return a desired result \n",
 12 |     "This tutorial sets the premise that you are interested in two factors for locating Bristlecone pines:\n",
 13 |     "- Located between 3,000 and 4,000 meters\n",
 14 |     "- Located on a south facing slope\n",
 15 |     "     "
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "code",
 20 |    "execution_count": null,
 21 |    "metadata": {},
 22 |    "outputs": [],
 23 |    "source": [
 24 |     "import geopyspark as gps\n",
 25 |     "from pyspark import SparkContext"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "markdown",
 30 |    "metadata": {},
 31 |    "source": [
 32 |     "You will need to set up a spark context. To learn more about what that means take a look [here](https://spark.apache.org/docs/latest/programming-guide.html#initializing-spark)"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "code",
 37 |    "execution_count": null,
 38 |    "metadata": {},
 39 |    "outputs": [],
 40 |    "source": [
 41 |     "conf=gps.geopyspark_conf(appName=\"BristleConePine\")\n",
 42 |     "conf.set('spark.ui.enabled', True)\n",
 43 |     "sc = SparkContext(conf = conf)"
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "markdown",
 48 |    "metadata": {},
 49 |    "source": [
 50 |     "Retrieving an elevation .tif from AWS S3:"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": null,
 56 |    "metadata": {},
 57 |    "outputs": [],
 58 |    "source": [
 59 |     "elev_rdd = gps.geotiff.get(\n",
 60 |     "    layer_type='spatial', \n",
 61 |     "    uri='s3://geopyspark-demo/elevation/ca-elevation.tif')"
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "markdown",
 66 |    "metadata": {},
 67 |    "source": [
 68 |     "## Tile, reproject, pyramid:"
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "code",
 73 |    "execution_count": null,
 74 |    "metadata": {},
 75 |    "outputs": [],
 76 |    "source": [
 77 |     "elev_tiled_rdd = elev_rdd.tile_to_layout(\n",
 78 |     "    layout=gps.GlobalLayout(), \n",
 79 |     "    target_crs=3857)\n",
 80 |     "elev_pyramided_rdd = elev_tiled_rdd.pyramid().cache()"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "markdown",
 85 |    "metadata": {},
 86 |    "source": [
 87 |     "Imports for creating a TMS server capable of serving layers with custom colormaps"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "code",
 92 |    "execution_count": null,
 93 |    "metadata": {},
 94 |    "outputs": [],
 95 |    "source": [
 96 |     "from geopyspark.geotrellis.color import get_colors_from_matplotlib\n",
 97 |     "elev_histo        = elev_pyramided_rdd.get_histogram()\n",
 98 |     "elev_colors       = get_colors_from_matplotlib('viridis', 100)\n",
 99 |     "elev_color_map    = gps.ColorMap.from_histogram(elev_histo, elev_colors)"
100 |    ]
101 |   },
102 |   {
103 |    "cell_type": "code",
104 |    "execution_count": null,
105 |    "metadata": {},
106 |    "outputs": [],
107 |    "source": [
108 |     "elev_tms = gps.TMS.build(elev_pyramided_rdd, elev_color_map)\n",
109 |     "elev_tms.bind('0.0.0.0')"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "markdown",
114 |    "metadata": {},
115 |    "source": [
116 |     "Display the tiles in an embedded [Folium](https://python-visualization.github.io/folium/) map:"
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "code",
121 |    "execution_count": null,
122 |    "metadata": {},
123 |    "outputs": [],
124 |    "source": [
125 |     "import folium\n",
126 |     "\n",
127 |     "map_center = [37.75, -118.85]\n",
128 |     "zoom = 7\n",
129 |     "\n",
130 |     "m = folium.Map(location=map_center, zoom_start=zoom)\n",
131 |     "folium.TileLayer(tiles=\"Stamen Terrain\", overlay=False).add_to(m)\n",
132 |     "folium.TileLayer(tiles=elev_tms.url_pattern, attr=\"GeoPySpark\", overlay=True).add_to(m)\n",
133 |     "m"
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "markdown",
138 |    "metadata": {},
139 |    "source": [
140 |     "Classify the elevation such that values of interest (between 3,000 and 4,000 meters) return a value of 1."
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "code",
145 |    "execution_count": null,
146 |    "metadata": {},
147 |    "outputs": [],
148 |    "source": [
149 |     "# use: elev_reprojected_rdd\n",
150 |     "elev_reclass_pre = elev_tiled_rdd.reclassify({1000:2, 2000:2, 3000:2, 4000:1, 5000:2}, int)\n",
151 |     "elev_reclass_rdd = elev_reclass_pre.reclassify({1:1}, int)\n",
152 |     "elev_reclass_pyramid_rdd = elev_reclass_rdd.pyramid()"
153 |    ]
154 |   },
155 |   {
156 |    "cell_type": "code",
157 |    "execution_count": null,
158 |    "metadata": {},
159 |    "outputs": [],
160 |    "source": [
161 |     "elev_reclass_histo = elev_reclass_pyramid_rdd.get_histogram()"
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "code",
166 |    "execution_count": null,
167 |    "metadata": {},
168 |    "outputs": [],
169 |    "source": [
170 |     "#elev_reclass_color_map = ColorMap.from_histogram(sc, elev_reclass_histo, get_breaks(sc, 'Viridis', num_colors=100))\n",
171 |     "elev_reclass_color_map = gps.ColorMap.from_colors(\n",
172 |     "    breaks =[1], \n",
173 |     "    color_list = [0xff000080])"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "code",
178 |    "execution_count": null,
179 |    "metadata": {},
180 |    "outputs": [],
181 |    "source": [
182 |     "elev_reclass_tms = gps.TMS.build(elev_reclass_pyramid_rdd, elev_reclass_color_map)\n",
183 |     "elev_reclass_tms.bind('0.0.0.0')"
184 |    ]
185 |   },
186 |   {
187 |    "cell_type": "code",
188 |    "execution_count": null,
189 |    "metadata": {},
190 |    "outputs": [],
191 |    "source": [
192 |     "m2 = folium.Map(location=map_center, zoom_start=zoom)\n",
193 |     "folium.TileLayer(tiles=\"Stamen Terrain\", overlay=False).add_to(m2)\n",
194 |     "folium.TileLayer(tiles=elev_tms.url_pattern, attr='GeoPySpark', name=\"Elevation\", overlay=True).add_to(m2)\n",
195 |     "folium.TileLayer(tiles=elev_reclass_tms.url_pattern, attr='GeoPySpark', name=\"High Elevation Areas\", overlay=True).add_to(m2)\n",
196 |     "folium.LayerControl().add_to(m2)\n",
197 |     "m2"
198 |    ]
199 |   },
200 |   {
201 |    "cell_type": "markdown",
202 |    "metadata": {},
203 |    "source": [
204 |     "Focal operation: aspect. To find south facing slopes"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "code",
209 |    "execution_count": null,
210 |    "metadata": {},
211 |    "outputs": [],
212 |    "source": [
213 |     "# square_neighborhood = Square(extent=1)\n",
214 |     "aspect_rdd = elev_tiled_rdd.focal(\n",
215 |     "    gps.Operation.ASPECT, \n",
216 |     "    gps.Neighborhood.SQUARE, 1)"
217 |    ]
218 |   },
219 |   {
220 |    "cell_type": "code",
221 |    "execution_count": null,
222 |    "metadata": {},
223 |    "outputs": [],
224 |    "source": [
225 |     "aspect_pyramid_rdd  = aspect_rdd.pyramid()\n",
226 |     "aspect_histo        = aspect_pyramid_rdd.get_histogram()\n",
227 |     "aspect_color_map    = gps.ColorMap.from_histogram(aspect_histo, get_colors_from_matplotlib('viridis', num_colors=256))"
228 |    ]
229 |   },
230 |   {
231 |    "cell_type": "code",
232 |    "execution_count": null,
233 |    "metadata": {},
234 |    "outputs": [],
235 |    "source": [
236 |     "aspect_tms = gps.TMS.build(aspect_pyramid_rdd, aspect_color_map)\n",
237 |     "aspect_tms.bind('0.0.0.0')"
238 |    ]
239 |   },
240 |   {
241 |    "cell_type": "code",
242 |    "execution_count": null,
243 |    "metadata": {},
244 |    "outputs": [],
245 |    "source": [
246 |     "m3 = folium.Map(tiles='Stamen Terrain', location=map_center, zoom_start=zoom)\n",
247 |     "folium.TileLayer(tiles=aspect_tms.url_pattern, attr='GeoPySpark', name=\"High Elevation Areas\", overlay=True).add_to(m3)\n",
248 |     "m3"
249 |    ]
250 |   },
251 |   {
252 |    "cell_type": "code",
253 |    "execution_count": null,
254 |    "metadata": {},
255 |    "outputs": [],
256 |    "source": [
257 |     "aspect_tms.unbind()"
258 |    ]
259 |   },
260 |   {
261 |    "cell_type": "markdown",
262 |    "metadata": {},
263 |    "source": [
264 |     "Reclassify values such that values between 120 and 240 degrees (south) have a value of 1"
265 |    ]
266 |   },
267 |   {
268 |    "cell_type": "code",
269 |    "execution_count": null,
270 |    "metadata": {},
271 |    "outputs": [],
272 |    "source": [
273 |     "aspect_reclass_pre  = aspect_rdd.reclassify({120:2, 240:1, 360: 2}, int)\n",
274 |     "aspect_reclass      = aspect_reclass_pre.reclassify({1:1}, int)"
275 |    ]
276 |   },
277 |   {
278 |    "cell_type": "code",
279 |    "execution_count": null,
280 |    "metadata": {},
281 |    "outputs": [],
282 |    "source": [
283 |     "aspect_reclass_pyramid_rdd = aspect_reclass.pyramid()\n",
284 |     "aspect_reclass_histo       = aspect_reclass_pyramid_rdd.get_histogram()\n",
285 |     "aspect_reclass_color_map   = gps.ColorMap.from_histogram(aspect_reclass_histo, get_colors_from_matplotlib('viridis', num_colors=256))"
286 |    ]
287 |   },
288 |   {
289 |    "cell_type": "code",
290 |    "execution_count": null,
291 |    "metadata": {},
292 |    "outputs": [],
293 |    "source": [
294 |     "aspect_reclass_tms = gps.TMS.build(aspect_reclass_pyramid_rdd, aspect_reclass_color_map)\n",
295 |     "aspect_reclass_tms.bind('0.0.0.0')"
296 |    ]
297 |   },
298 |   {
299 |    "cell_type": "code",
300 |    "execution_count": null,
301 |    "metadata": {},
302 |    "outputs": [],
303 |    "source": [
304 |     "m4 = folium.Map(tiles='Stamen Terrain', location=map_center, zoom_start=zoom)\n",
305 |     "folium.TileLayer(tiles=aspect_reclass_tms.url_pattern, attr='GeoPySpark', name=\"High Elevation Areas\", overlay=True).add_to(m4)\n",
306 |     "m4"
307 |    ]
308 |   },
309 |   {
310 |    "cell_type": "code",
311 |    "execution_count": null,
312 |    "metadata": {},
313 |    "outputs": [],
314 |    "source": [
315 |     "aspect_reclass_tms.unbind()"
316 |    ]
317 |   },
318 |   {
319 |    "cell_type": "markdown",
320 |    "metadata": {},
321 |    "source": [
322 |     "Now add the values togehter to find the suitable range:"
323 |    ]
324 |   },
325 |   {
326 |    "cell_type": "code",
327 |    "execution_count": null,
328 |    "metadata": {},
329 |    "outputs": [],
330 |    "source": [
331 |     "added = elev_reclass_pyramid_rdd + aspect_reclass_pyramid_rdd\n",
332 |     "added_histo = added.get_histogram()\n",
333 |     "added_color_map = gps.ColorMap.from_histogram(added_histo, get_colors_from_matplotlib('viridis', num_colors=256))"
334 |    ]
335 |   },
336 |   {
337 |    "cell_type": "code",
338 |    "execution_count": null,
339 |    "metadata": {},
340 |    "outputs": [],
341 |    "source": [
342 |     "added_tms = gps.TMS.build(added, added_color_map)\n",
343 |     "added_tms.bind('0.0.0.0')"
344 |    ]
345 |   },
346 |   {
347 |    "cell_type": "code",
348 |    "execution_count": null,
349 |    "metadata": {},
350 |    "outputs": [],
351 |    "source": [
352 |     "m5 = folium.Map(tiles='Stamen Terrain', location=map_center, zoom_start=zoom)\n",
353 |     "folium.TileLayer(tiles=added_tms.url_pattern, attr='GeoPySpark', name=\"High Elevation Areas\", overlay=True).add_to(m5)\n",
354 |     "m5"
355 |    ]
356 |   },
357 |   {
358 |    "cell_type": "code",
359 |    "execution_count": null,
360 |    "metadata": {},
361 |    "outputs": [],
362 |    "source": [
363 |     "import matplotlib.pyplot as plt\n",
364 |     "%matplotlib inline"
365 |    ]
366 |   },
367 |   {
368 |    "cell_type": "code",
369 |    "execution_count": null,
370 |    "metadata": {
371 |     "scrolled": true
372 |    },
373 |    "outputs": [],
374 |    "source": [
375 |     "v = elev_tiled_rdd.lookup(342,787)\n",
376 |     "plt.imshow(v[0].cells[0])"
377 |    ]
378 |   }
379 |  ],
380 |  "metadata": {
381 |   "kernelspec": {
382 |    "display_name": "GeoPySpark",
383 |    "language": "python",
384 |    "name": "gps"
385 |   },
386 |   "language_info": {
387 |    "codemirror_mode": {
388 |     "name": "ipython",
389 |     "version": 3
390 |    },
391 |    "file_extension": ".py",
392 |    "mimetype": "text/x-python",
393 |    "name": "python",
394 |    "nbconvert_exporter": "python",
395 |    "pygments_lexer": "ipython3",
396 |    "version": "3.4.6"
397 |   }
398 |  },
399 |  "nbformat": 4,
400 |  "nbformat_minor": 2
401 | }
402 | 
--------------------------------------------------------------------------------
/notebooks/SRTM-emr.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import rasterio\n",
 12 |     "import rasterio.features\n",
 13 |     "import rasterio.warp\n",
 14 |     "import geopyspark as gps\n",
 15 |     "import numpy as np\n",
 16 |     "import matplotlib.pyplot as plt\n",
 17 |     "\n",
 18 |     "from pyspark import SparkContext\n",
 19 |     "from osgeo import osr\n",
 20 |     "\n",
 21 |     "import os\n",
 22 |     "import math\n",
 23 |     "import boto3\n",
 24 |     "\n",
 25 |     "%matplotlib inline"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": null,
 31 |    "metadata": {
 32 |     "collapsed": true
 33 |    },
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "conf = gps.geopyspark_conf(\"yarn-client\", \"SRTM Ingest\") \\\n",
 37 |     "          .set(\"spark.dynamicAllocation.enabled\", False) \\\n",
 38 |     "          .set(\"spark.executor.instances\", \"50\") \\\n",
 39 |     "          .set(\"spark.executor.memory\", \"9472M\") \\\n",
 40 |     "          .set(\"spark.executor.cores\", \"4\") \\\n",
 41 |     "          .set(\"spark.ui.enabled\", True) \\\n",
 42 |     "          .set(\"spark.hadoop.yarn.timeline-service.enabled\", False)"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": null,
 48 |    "metadata": {
 49 |     "collapsed": true
 50 |    },
 51 |    "outputs": [],
 52 |    "source": [
 53 |     "sc = SparkContext(conf=conf)"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": null,
 59 |    "metadata": {
 60 |     "collapsed": true
 61 |    },
 62 |    "outputs": [],
 63 |    "source": [
 64 |     "s3 = boto3.client('s3')\n",
 65 |     "def get_raster_s3_objects(bucket, prefix, extension=\"hgt\"):\n",
 66 |     "    paginator = s3.get_paginator('list_objects_v2')\n",
 67 |     "    page_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix)\n",
 68 |     "    results = []\n",
 69 |     "    for page in page_iterator:\n",
 70 |     "        for item in page['Contents']:\n",
 71 |     "            if item['Key'].endswith(extension):\n",
 72 |     "                results.append(item)\n",
 73 |     "    return results\n"
 74 |    ]
 75 |   },
 76 |   {
 77 |    "cell_type": "code",
 78 |    "execution_count": null,
 79 |    "metadata": {
 80 |     "collapsed": true
 81 |    },
 82 |    "outputs": [],
 83 |    "source": [
 84 |     "object_names = get_raster_s3_objects(\"mrgeo-source\", \"srtm-v3-30\")"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": null,
 90 |    "metadata": {},
 91 |    "outputs": [],
 92 |    "source": [
 93 |     "file_names = list(map(lambda d: d['Key'][len('srtm-v3-30/'):], object_names))\n",
 94 |     "print(len(file_names))\n",
 95 |     "print(file_names[0:10])"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "code",
100 |    "execution_count": null,
101 |    "metadata": {
102 |     "collapsed": true
103 |    },
104 |    "outputs": [],
105 |    "source": [
106 |     "def get_metadata(uri):\n",
107 |     "    import rasterio\n",
108 |     "    from osgeo import osr\n",
109 |     "    import os\n",
110 |     "    \n",
111 |     "    if \"GDAL_DATA\" not in os.environ:\n",
112 |     "        os.environ[\"GDAL_DATA\"]=\"/usr/local/lib64/python3.4/site-packages/fiona/gdal_data\"\n",
113 |     "    \n",
114 |     "    try:\n",
115 |     "        with rasterio.open(uri) as dataset:\n",
116 |     "            bounds = dataset.bounds\n",
117 |     "            height = dataset.height\n",
118 |     "            width = dataset.width\n",
119 |     "            crs = dataset.get_crs()\n",
120 |     "            srs = osr.SpatialReference()\n",
121 |     "            srs.ImportFromWkt(crs.wkt)\n",
122 |     "            proj4 = srs.ExportToProj4()\n",
123 |     "            tile_cols = (int)(math.ceil(width/512)) * 512\n",
124 |     "            tile_rows = (int)(math.ceil(height/512)) * 512\n",
125 |     "            ws = [((x, min(width-1,x + 512)), (y, min(height-1,y + 512))) for x in range(0, tile_cols, 512) for y in range(0, tile_rows, 512)]\n",
126 |     "    except:\n",
127 |     "            ws = []\n",
128 |     "            \n",
129 |     "    def windows(uri, ws):\n",
130 |     "        for w in ws:\n",
131 |     "            ((row_start, row_stop), (col_start, col_stop)) = w\n",
132 |     "\n",
133 |     "            left  = bounds.left + (bounds.right - bounds.left)*(float(col_start)/width)\n",
134 |     "            right = bounds.left + (bounds.right - bounds.left)*(float(col_stop)/ width)\n",
135 |     "            bottom = bounds.top + (bounds.bottom - bounds.top)*(float(row_stop)/height)\n",
136 |     "            top = bounds.top + (bounds.bottom - bounds.top)*(float(row_start)/height)\n",
137 |     "            extent = gps.Extent(left,bottom,right,top)\n",
138 |     "                \n",
139 |     "            new_line = {}\n",
140 |     "            new_line['uri'] = uri\n",
141 |     "            new_line['window'] = w\n",
142 |     "            new_line['projected_extent'] = gps.ProjectedExtent(extent=extent, proj4=proj4)\n",
143 |     "            yield new_line\n",
144 |     "    \n",
145 |     "    return [i for i in windows(uri, ws)]\n"
146 |    ]
147 |   },
148 |   {
149 |    "cell_type": "code",
150 |    "execution_count": null,
151 |    "metadata": {
152 |     "collapsed": true
153 |    },
154 |    "outputs": [],
155 |    "source": [
156 |     "def get_data(line):\n",
157 |     "    import rasterio\n",
158 |     "    \n",
159 |     "    new_line = line.copy()\n",
160 |     "\n",
161 |     "    with rasterio.open(line['uri']) as dataset:\n",
162 |     "        new_line['data'] = dataset.read(1, window=line['window'])\n",
163 |     "        new_line.pop('window')\n",
164 |     "        new_line.pop('uri')\n",
165 |     "    \n",
166 |     "    return new_line"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "code",
171 |    "execution_count": null,
172 |    "metadata": {
173 |     "collapsed": true
174 |    },
175 |    "outputs": [],
176 |    "source": [
177 |     "def filename_to_data(filename):\n",
178 |     "    import os\n",
179 |     "    \n",
180 |     "    full_filename = \"/vsicurl/https://s3.amazonaws.com/mrgeo-source/srtm-v3-30/{}\".format(filename)\n",
181 |     "    data = [get_data(line) for line in get_metadata(full_filename)]\n",
182 |     "    return data"
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "code",
187 |    "execution_count": null,
188 |    "metadata": {},
189 |    "outputs": [],
190 |    "source": [
191 |     "rdd0 = sc.parallelize(file_names)\n",
192 |     "rdd1 = rdd0.flatMap(filename_to_data)\n",
193 |     "print(rdd1.count())"
194 |    ]
195 |   },
196 |   {
197 |    "cell_type": "code",
198 |    "execution_count": null,
199 |    "metadata": {
200 |     "collapsed": true
201 |    },
202 |    "outputs": [],
203 |    "source": [
204 |     "rdd2 = rdd1.groupBy(lambda line: line['projected_extent']) # XXX"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "code",
209 |    "execution_count": null,
210 |    "metadata": {
211 |     "collapsed": true
212 |    },
213 |    "outputs": [],
214 |    "source": [
215 |     "def make_tiles(line):\n",
216 |     "    projected_extent = line[0]\n",
217 |     "    array = np.array([l['data'] for l in line[1]])\n",
218 |     "    tile = gps.Tile.from_numpy_array(array, no_data_value=0)\n",
219 |     "    return (projected_extent, tile)"
220 |    ]
221 |   },
222 |   {
223 |    "cell_type": "code",
224 |    "execution_count": null,
225 |    "metadata": {
226 |     "collapsed": true
227 |    },
228 |    "outputs": [],
229 |    "source": [
230 |     "rdd3 = rdd2.repartition(50 * 1024).map(make_tiles)"
231 |    ]
232 |   },
233 |   {
234 |    "cell_type": "code",
235 |    "execution_count": null,
236 |    "metadata": {
237 |     "collapsed": true
238 |    },
239 |    "outputs": [],
240 |    "source": [
241 |     "raster_layer = gps.RasterLayer.from_numpy_rdd(gps.LayerType.SPATIAL, rdd3)"
242 |    ]
243 |   },
244 |   {
245 |    "cell_type": "code",
246 |    "execution_count": null,
247 |    "metadata": {
248 |     "collapsed": true
249 |    },
250 |    "outputs": [],
251 |    "source": [
252 |     "tiled_raster_layer = raster_layer.tile_to_layout(layout = gps.GlobalLayout(), target_crs=3857)"
253 |    ]
254 |   },
255 |   {
256 |    "cell_type": "code",
257 |    "execution_count": null,
258 |    "metadata": {
259 |     "collapsed": true
260 |    },
261 |    "outputs": [],
262 |    "source": [
263 |     "pyramid = tiled_raster_layer.pyramid()"
264 |    ]
265 |   },
266 |   {
267 |    "cell_type": "code",
268 |    "execution_count": null,
269 |    "metadata": {
270 |     "collapsed": true
271 |    },
272 |    "outputs": [],
273 |    "source": [
274 |     "for layer in pyramid.levels.values():\n",
275 |     "    gps.write(\"s3://geotrellis-test/dg-srtm/\", \"srtm-geopyspark\", layer)"
276 |    ]
277 |   },
278 |   {
279 |    "cell_type": "code",
280 |    "execution_count": null,
281 |    "metadata": {
282 |     "collapsed": true
283 |    },
284 |    "outputs": [],
285 |    "source": []
286 |   }
287 |  ],
288 |  "metadata": {
289 |   "kernelspec": {
290 |    "display_name": "GeoPySpark",
291 |    "language": "python",
292 |    "name": "gps"
293 |   },
294 |   "language_info": {
295 |    "codemirror_mode": {
296 |     "name": "ipython",
297 |     "version": 3
298 |    },
299 |    "file_extension": ".py",
300 |    "mimetype": "text/x-python",
301 |    "name": "python",
302 |    "nbconvert_exporter": "python",
303 |    "pygments_lexer": "ipython3",
304 |    "version": "3.4.6"
305 |   }
306 |  },
307 |  "nbformat": 4,
308 |  "nbformat_minor": 2
309 | }
310 | 
--------------------------------------------------------------------------------
/notebooks/SRTM-local.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import rasterio\n",
 10 |     "import rasterio.features\n",
 11 |     "import rasterio.warp\n",
 12 |     "import geopyspark as gps\n",
 13 |     "import numpy as np\n",
 14 |     "import matplotlib.pyplot as plt\n",
 15 |     "\n",
 16 |     "from pyspark import SparkContext\n",
 17 |     "from osgeo import osr\n",
 18 |     "\n",
 19 |     "import os\n",
 20 |     "import math\n",
 21 |     "import boto3\n",
 22 |     "\n",
 23 |     "%matplotlib inline"
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "code",
 28 |    "execution_count": null,
 29 |    "metadata": {},
 30 |    "outputs": [],
 31 |    "source": [
 32 |     "conf = gps.geopyspark_conf(\"local[*]\", \"SRTM Ingest\") \\\n",
 33 |     "          .set(\"spark.dynamicAllocation.enabled\", False) \\\n",
 34 |     "          .set(\"spark.ui.enabled\",True) \\\n",
 35 |     "          .set(\"spark.hadoop.yarn.timeline-service.enabled\", False)"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": null,
 41 |    "metadata": {},
 42 |    "outputs": [],
 43 |    "source": [
 44 |     "sc = SparkContext(conf=conf)"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "code",
 49 |    "execution_count": null,
 50 |    "metadata": {},
 51 |    "outputs": [],
 52 |    "source": [
 53 |     "default_gdal_data_dir='/usr/local/share/gdal'"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": null,
 59 |    "metadata": {},
 60 |    "outputs": [],
 61 |    "source": [
 62 |     "file_names = ['N00E006.hgt', 'N00E009.hgt', 'N00E010.hgt', 'N00E011.hgt', 'N00E012.hgt', 'N00E013.hgt', 'N00E014.hgt', 'N00E015.hgt', 'N00E016.hgt', 'N00E017.hgt']\n",
 63 |     "# file_names = file_names[0:2]\n",
 64 |     "print(len(file_names))\n",
 65 |     "print(file_names[0:10])"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "code",
 70 |    "execution_count": null,
 71 |    "metadata": {},
 72 |    "outputs": [],
 73 |    "source": [
 74 |     "def get_metadata(uri):\n",
 75 |     "    if \"GDAL_DATA\" not in os.environ:\n",
 76 |     "        os.environ[\"GDAL_DATA\"]=default_gdal_data_dir\n",
 77 |     "    \n",
 78 |     "    try:\n",
 79 |     "        with rasterio.open(uri) as dataset:\n",
 80 |     "            bounds = dataset.bounds\n",
 81 |     "            height = dataset.height\n",
 82 |     "            width = dataset.width\n",
 83 |     "            crs = dataset.get_crs()\n",
 84 |     "            srs = osr.SpatialReference()\n",
 85 |     "            srs.ImportFromWkt(crs.wkt)\n",
 86 |     "            proj4 = srs.ExportToProj4()\n",
 87 |     "            # ws = [w for (ij, w) in dataset.block_windows()]\n",
 88 |     "            tile_cols = (int)(math.ceil(width/512)) * 512\n",
 89 |     "            tile_rows = (int)(math.ceil(height/512)) * 512\n",
 90 |     "            ws = [((x, min(width-1,x + 512)), (y, min(height-1,y + 512))) for x in range(0, tile_cols, 512) for y in range(0, tile_rows, 512)]\n",
 91 |     "    except:\n",
 92 |     "            ws = []\n",
 93 |     "            \n",
 94 |     "    def windows(uri, ws):\n",
 95 |     "        for w in ws:\n",
 96 |     "            ((row_start, row_stop), (col_start, col_stop)) = w\n",
 97 |     "\n",
 98 |     "            left  = bounds.left + (bounds.right - bounds.left)*(float(col_start)/width)\n",
 99 |     "            right = bounds.left + (bounds.right - bounds.left)*(float(col_stop)/ width)\n",
100 |     "            bottom = bounds.top + (bounds.bottom - bounds.top)*(float(row_stop)/height)\n",
101 |     "            top = bounds.top + (bounds.bottom - bounds.top)*(float(row_start)/height)\n",
102 |     "            extent = gps.Extent(left,bottom,right,top)\n",
103 |     "                \n",
104 |     "            new_line = {}\n",
105 |     "            new_line['uri'] = uri\n",
106 |     "            new_line['window'] = w\n",
107 |     "            new_line['projected_extent'] = gps.ProjectedExtent(extent=extent, proj4=proj4)\n",
108 |     "            yield new_line\n",
109 |     "    \n",
110 |     "    return [i for i in windows(uri, ws)]\n"
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "code",
115 |    "execution_count": null,
116 |    "metadata": {},
117 |    "outputs": [],
118 |    "source": [
119 |     "def get_data(line):\n",
120 |     "    new_line = line.copy()\n",
121 |     "\n",
122 |     "    with rasterio.open(line['uri']) as dataset:\n",
123 |     "        new_line['data'] = dataset.read(1, window=line['window'])\n",
124 |     "        new_line.pop('window')\n",
125 |     "        new_line.pop('uri')\n",
126 |     "    \n",
127 |     "    return new_line"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": null,
133 |    "metadata": {},
134 |    "outputs": [],
135 |    "source": [
136 |     "def filename_to_data(filename):\n",
137 |     "    #full_filename = \"/vsicurl/https://s3.amazonaws.com/mrgeo-source/srtm-v3-30/{}\".format(filename)\n",
138 |     "    full_filename = \"s3://mrgeo-source/srtm-v3-30/{}\".format(filename)\n",
139 |     "    data = [get_data(line) for line in get_metadata(full_filename)]\n",
140 |     "    return data"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "code",
145 |    "execution_count": null,
146 |    "metadata": {},
147 |    "outputs": [],
148 |    "source": [
149 |     "rdd0 = sc.parallelize(file_names)\n",
150 |     "rdd1 = rdd0.flatMap(filename_to_data)\n",
151 |     "print(rdd1.count())"
152 |    ]
153 |   },
154 |   {
155 |    "cell_type": "code",
156 |    "execution_count": null,
157 |    "metadata": {},
158 |    "outputs": [],
159 |    "source": [
160 |     "rdd2 = rdd1.groupBy(lambda line: line['projected_extent']) # XXX"
161 |    ]
162 |   },
163 |   {
164 |    "cell_type": "code",
165 |    "execution_count": null,
166 |    "metadata": {},
167 |    "outputs": [],
168 |    "source": [
169 |     "def make_tiles(line):\n",
170 |     "    projected_extent = line[0]\n",
171 |     "    array = np.array([l['data'] for l in line[1]])\n",
172 |     "    tile = gps.Tile.from_numpy_array(array, no_data_value=0)\n",
173 |     "    return (projected_extent, tile)\n"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "code",
178 |    "execution_count": null,
179 |    "metadata": {},
180 |    "outputs": [],
181 |    "source": [
182 |     "rdd3 = rdd2.map(make_tiles)"
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "code",
187 |    "execution_count": null,
188 |    "metadata": {},
189 |    "outputs": [],
190 |    "source": [
191 |     "raster_layer = gps.RasterLayer.from_numpy_rdd(gps.LayerType.SPATIAL, rdd3)"
192 |    ]
193 |   },
194 |   {
195 |    "cell_type": "code",
196 |    "execution_count": null,
197 |    "metadata": {},
198 |    "outputs": [],
199 |    "source": [
200 |     "tiled_raster_layer = raster_layer.tile_to_layout(layout = gps.GlobalLayout(), target_crs=3857)"
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "code",
205 |    "execution_count": null,
206 |    "metadata": {},
207 |    "outputs": [],
208 |    "source": [
209 |     "pyramid = tiled_raster_layer.pyramid()"
210 |    ]
211 |   },
212 |   {
213 |    "cell_type": "code",
214 |    "execution_count": null,
215 |    "metadata": {},
216 |    "outputs": [],
217 |    "source": [
218 |     "for layer in pyramid.levels.values():\n",
219 |     "    gps.write(\"file:///tmp/dg-srtm/\", \"srtm-geopyspark-1\", layer)"
220 |    ]
221 |   },
222 |   {
223 |    "cell_type": "code",
224 |    "execution_count": null,
225 |    "metadata": {},
226 |    "outputs": [],
227 |    "source": [
228 |     "# pyramid2 = gps.Pyramid([gps.query(\"file:///tmp/dg-srtm\", \"srtm-geopyspark\", layer_zoom=n, num_partitions=1024*16) for n in range(0,13+1)])"
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "code",
233 |    "execution_count": null,
234 |    "metadata": {},
235 |    "outputs": [],
236 |    "source": [
237 |     "histogram = pyramid.get_histogram()\n",
238 |     "color_map = gps.ColorMap.build(breaks=histogram, colors='viridis')"
239 |    ]
240 |   },
241 |   {
242 |    "cell_type": "code",
243 |    "execution_count": null,
244 |    "metadata": {},
245 |    "outputs": [],
246 |    "source": [
247 |     "tms = gps.TMS.build(('file:///tmp/dg-srtm', 'srtm-geopyspark-1'), display=color_map)\n",
248 |     "tms.bind('0.0.0.0')"
249 |    ]
250 |   },
251 |   {
252 |    "cell_type": "code",
253 |    "execution_count": null,
254 |    "metadata": {},
255 |    "outputs": [],
256 |    "source": [
257 |     "import folium\n",
258 |     "\n",
259 |     "m = folium.Map(tiles='Stamen Terrain')\n",
260 |     "folium.TileLayer(tiles=tms.url_pattern, attr='GeoPySpark').add_to(m)\n",
261 |     "m"
262 |    ]
263 |   },
264 |   {
265 |    "cell_type": "code",
266 |    "execution_count": null,
267 |    "metadata": {},
268 |    "outputs": [],
269 |    "source": [
270 |     "tms.unbind()"
271 |    ]
272 |   }
273 |  ],
274 |  "metadata": {
275 |   "kernelspec": {
276 |    "display_name": "GeoPySpark",
277 |    "language": "python",
278 |    "name": "gps"
279 |   },
280 |   "language_info": {
281 |    "codemirror_mode": {
282 |     "name": "ipython",
283 |     "version": 3
284 |    },
285 |    "file_extension": ".py",
286 |    "mimetype": "text/x-python",
287 |    "name": "python",
288 |    "nbconvert_exporter": "python",
289 |    "pygments_lexer": "ipython3",
290 |    "version": "3.4.6"
291 |   }
292 |  },
293 |  "nbformat": 4,
294 |  "nbformat_minor": 2
295 | }
296 | 
--------------------------------------------------------------------------------
/notebooks/libya.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Finding the Cost of Traversing Through Libya\n",
  8 |     "\n",
  9 |     "In this notebook, we will be calculating and visulizing the cost distance of traveling from one population center to another by road while avoiding conflict zones in Libya."
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "markdown",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "## Import and Setup SparkContext"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "code",
 21 |    "execution_count": null,
 22 |    "metadata": {},
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "import os\n",
 26 |     "import json\n",
 27 |     "import requests\n",
 28 |     "from functools import partial\n",
 29 |     "import pyproj\n",
 30 |     "import geopyspark as gps\n",
 31 |     "\n",
 32 |     "from pyspark import SparkContext\n",
 33 |     "from shapely.geometry import shape, MultiPoint, MultiLineString\n",
 34 |     "from shapely.ops import transform"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": null,
 40 |    "metadata": {},
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "conf = gps.geopyspark_conf(appName=\"Libya Weighted Overlay\", master=\"local[*]\")\n",
 44 |     "conf.set(\"spark.hadoop.yarn.timeline-service.enabled\", False)\n",
 45 |     "pysc = SparkContext.getOrCreate(conf)"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "markdown",
 50 |    "metadata": {},
 51 |    "source": [
 52 |     "## Rasterize Libya Roads to RasterLayer"
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "code",
 57 |    "execution_count": null,
 58 |    "metadata": {
 59 |     "scrolled": true
 60 |    },
 61 |    "outputs": [],
 62 |    "source": [
 63 |     "libya_roads_json = requests.get('https://s3.amazonaws.com/geopyspark-demo/libya/roads.geojson').json()\n",
 64 |     "libya_roads = MultiLineString([shape(geom['geometry']) for geom in libya_roads_json['features']])"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": null,
 70 |    "metadata": {},
 71 |    "outputs": [],
 72 |    "source": [
 73 |     "# All execution time here is sending WKB over py4j socket\n",
 74 |     "ro = gps.RasterizerOptions(includePartial=True, sampleType='PixelIsArea')\n",
 75 |     "\n",
 76 |     "road_raster = gps.rasterize(geoms=list(libya_roads.geoms), \n",
 77 |     "                            crs=\"EPSG:3857\",\n",
 78 |     "                            zoom=8, \n",
 79 |     "                            fill_value=1,\n",
 80 |     "                            cell_type=gps.CellType.FLOAT32,\n",
 81 |     "                            options=ro)\n",
 82 |     "\n",
 83 |     "road_raster.layer_metadata.bounds"
 84 |    ]
 85 |   },
 86 |   {
 87 |    "cell_type": "markdown",
 88 |    "metadata": {},
 89 |    "source": [
 90 |     "## Show Rasterized Roads on a Map"
 91 |    ]
 92 |   },
 93 |   {
 94 |    "cell_type": "code",
 95 |    "execution_count": null,
 96 |    "metadata": {},
 97 |    "outputs": [],
 98 |    "source": [
 99 |     "# Pyramid up from base layer\n",
100 |     "road_pp = road_raster.pyramid(resample_method=gps.ResampleMethod.MAX).cache()"
101 |    ]
102 |   },
103 |   {
104 |    "cell_type": "code",
105 |    "execution_count": null,
106 |    "metadata": {
107 |     "scrolled": true
108 |    },
109 |    "outputs": [],
110 |    "source": [
111 |     "# color map roads 1 to red\n",
112 |     "roads_cm = gps.ColorMap.from_colors(breaks=[1], color_list=[0xff000080])\n",
113 |     "\n",
114 |     "# start JVM tile server and serve tiles to map\n",
115 |     "server = gps.TMS.build(source=road_pp, display=roads_cm)\n",
116 |     "server.bind(\"0.0.0.0\")\n",
117 |     "server.url_pattern"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "code",
122 |    "execution_count": null,
123 |    "metadata": {},
124 |    "outputs": [],
125 |    "source": [
126 |     "from folium import Map, TileLayer\n",
127 |     "\n",
128 |     "m = Map(tiles='Stamen Toner', location=[27.7351, 17.2283], zoom_start=5)\n",
129 |     "TileLayer(tiles=server.url_pattern, attr='GeoPySpark Tiles').add_to(m)\n",
130 |     "m"
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "markdown",
135 |    "metadata": {},
136 |    "source": [
137 |     "## Cost Distance Based on Road Network"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "code",
142 |    "execution_count": null,
143 |    "metadata": {},
144 |    "outputs": [],
145 |    "source": [
146 |     "# road network will shape our friction layer\n",
147 |     "road_friction = road_raster.reclassify(value_map={1:1},\n",
148 |     "                                       data_type=int,\n",
149 |     "                                       replace_nodata_with=10)"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "code",
154 |    "execution_count": null,
155 |    "metadata": {},
156 |    "outputs": [],
157 |    "source": [
158 |     "# starting points for cost distance operation\n",
159 |     "\n",
160 |     "population_json = requests.get('https://s3.amazonaws.com/geopyspark-demo/libya/population.geojson').json()\n",
161 |     "population_centers = MultiPoint([shape(geom['geometry']) for geom in population_json['features']])\n",
162 |     "\n",
163 |     "conflict_json = requests.get('https://s3.amazonaws.com/geopyspark-demo/libya/conflict.geojson').json()\n",
164 |     "conflict_centers = MultiPoint([shape(feature['geometry']) for feature in conflict_json['features'] if feature['geometry'] != None])\n",
165 |     "\n",
166 |     "conflict_centers"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "code",
171 |    "execution_count": null,
172 |    "metadata": {},
173 |    "outputs": [],
174 |    "source": [
175 |     "# Convert population centers data from EPSG:3857 to EPSG:4326 for display on map\n",
176 |     "project = partial(\n",
177 |     "    pyproj.transform,\n",
178 |     "    pyproj.Proj(init='epsg:3857'),\n",
179 |     "    pyproj.Proj(init='epsg:4326'))\n",
180 |     "\n",
181 |     "population_4326 = transform(project, population_centers)"
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "code",
186 |    "execution_count": null,
187 |    "metadata": {},
188 |    "outputs": [],
189 |    "source": [
190 |     "# Write reprojected data to file\n",
191 |     "\n",
192 |     "if 'VIRTUAL_ENV' in os.environ:\n",
193 |     "    !pip3 install geojson\n",
194 |     "else:\n",
195 |     "    !pip3 install --user geojson\n",
196 |     "    \n",
197 |     "import geojson\n",
198 |     "\n",
199 |     "with open('/tmp/population-4326.geojson', 'w') as f:\n",
200 |     "    geojson.dump(geojson.Feature(geometry=population_4326, properties={}), f)\n",
201 |     "    f.flush()"
202 |    ]
203 |   },
204 |   {
205 |    "cell_type": "markdown",
206 |    "metadata": {},
207 |    "source": [
208 |     "### Cost Distance Between Population Centers"
209 |    ]
210 |   },
211 |   {
212 |    "cell_type": "code",
213 |    "execution_count": null,
214 |    "metadata": {},
215 |    "outputs": [],
216 |    "source": [
217 |     "pop_cd = gps.cost_distance(\n",
218 |     "    friction_layer=road_friction,\n",
219 |     "    geometries=population_centers, \n",
220 |     "    max_distance=1400000.0\n",
221 |     ")\n",
222 |     "\n",
223 |     "pop_pp = pop_cd.pyramid()"
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "markdown",
228 |    "metadata": {},
229 |    "source": [
230 |     "### Cost Distance Between Conflict Centers"
231 |    ]
232 |   },
233 |   {
234 |    "cell_type": "code",
235 |    "execution_count": null,
236 |    "metadata": {},
237 |    "outputs": [],
238 |    "source": [
239 |     "con_cd = gps.cost_distance(\n",
240 |     "    friction_layer=road_friction,\n",
241 |     "    geometries=conflict_centers, \n",
242 |     "    max_distance=1400000.0\n",
243 |     ")\n",
244 |     "\n",
245 |     "con_pp = con_cd.pyramid()"
246 |    ]
247 |   },
248 |   {
249 |    "cell_type": "markdown",
250 |    "metadata": {},
251 |    "source": [
252 |     "## Displaying the Weighted Cost Distance Layer With Population Centers"
253 |    ]
254 |   },
255 |   {
256 |    "cell_type": "code",
257 |    "execution_count": null,
258 |    "metadata": {},
259 |    "outputs": [],
260 |    "source": [
261 |     "# prepare color map for weighted overlay based on max cost\n",
262 |     "breaks = [x for x in range(0, 1000000, 10000)]\n",
263 |     "colors = gps.get_colors_from_matplotlib(ramp_name='viridis', num_colors=len(breaks))\n",
264 |     "wo_cm = gps.ColorMap.from_colors(breaks=breaks, color_list=colors)"
265 |    ]
266 |   },
267 |   {
268 |    "cell_type": "code",
269 |    "execution_count": null,
270 |    "metadata": {},
271 |    "outputs": [],
272 |    "source": [
273 |     "# our weighted layer avoids conflict centers focusing on just population centers\n",
274 |     "weighted_overlay = (con_pp * 0.0) + (pop_pp * 1.0)\n",
275 |     "\n",
276 |     "server2 = gps.TMS.build(source=weighted_overlay, display=wo_cm)\n",
277 |     "server2.bind('0.0.0.0')"
278 |    ]
279 |   },
280 |   {
281 |    "cell_type": "code",
282 |    "execution_count": null,
283 |    "metadata": {},
284 |    "outputs": [],
285 |    "source": [
286 |     "from folium import GeoJson\n",
287 |     "\n",
288 |     "m2 = Map(tiles='Stamen Toner', location=[27.7351, 17.2283], zoom_start=5)\n",
289 |     "TileLayer(tiles=server2.url_pattern, attr='GeoPySpark Tiles').add_to(m2)\n",
290 |     "GeoJson(\"/tmp/population-4326.geojson\").add_to(m2)\n",
291 |     "m2"
292 |    ]
293 |   }
294 |  ],
295 |  "metadata": {
296 |   "kernelspec": {
297 |    "display_name": "GeoPySpark",
298 |    "language": "python",
299 |    "name": "gps"
300 |   },
301 |   "language_info": {
302 |    "codemirror_mode": {
303 |     "name": "ipython",
304 |     "version": 3
305 |    },
306 |    "file_extension": ".py",
307 |    "mimetype": "text/x-python",
308 |    "name": "python",
309 |    "nbconvert_exporter": "python",
310 |    "pygments_lexer": "ipython3",
311 |    "version": "3.4.6"
312 |   }
313 |  },
314 |  "nbformat": 4,
315 |  "nbformat_minor": 2
316 | }
317 | 
--------------------------------------------------------------------------------
/notebooks/sanfranmvp.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "from functools import partial\n",
 10 |     "import geopyspark as gps\n",
 11 |     "import fiona\n",
 12 |     "import pyproj\n",
 13 |     "\n",
 14 |     "from pyspark import SparkContext\n",
 15 |     "from shapely.geometry import MultiPoint, MultiLineString, shape\n",
 16 |     "from shapely.ops import transform\n",
 17 |     "import folium"
 18 |    ]
 19 |   },
 20 |   {
 21 |    "cell_type": "code",
 22 |    "execution_count": null,
 23 |    "metadata": {},
 24 |    "outputs": [],
 25 |    "source": [
 26 |     "# Set up our spark context \n",
 27 |     "conf = gps.geopyspark_conf(appName=\"San Fran MVP\", master=\"local[*]\") \n",
 28 |     "sc = SparkContext(conf=conf)"
 29 |    ]
 30 |   },
 31 |   {
 32 |    "cell_type": "code",
 33 |    "execution_count": null,
 34 |    "metadata": {},
 35 |    "outputs": [],
 36 |    "source": [
 37 |     "# Set the map center to be over San Francisco\n",
 38 |     "map_center = [37.75, -122.45]\n",
 39 |     "zoom = 11"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": null,
 45 |    "metadata": {},
 46 |    "outputs": [],
 47 |    "source": [
 48 |     "# Download the needed geojsons\n",
 49 |     "\n",
 50 |     "!curl -o /tmp/bars.geojson https://s3.amazonaws.com/geopyspark-demo/MVP_San_Francisco/bars.geojson\n",
 51 |     "!curl -o /tmp/cafes.geojson https://s3.amazonaws.com/geopyspark-demo/MVP_San_Francisco/cafes.geojson \n",
 52 |     "!curl -o /tmp/transit.geojson https://s3.amazonaws.com/geopyspark-demo/MVP_San_Francisco/transit.geojson \n",
 53 |     "!curl -o /tmp/roads.geojson https://s3.amazonaws.com/geopyspark-demo/MVP_San_Francisco/roads.geojson "
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": null,
 59 |    "metadata": {},
 60 |    "outputs": [],
 61 |    "source": [
 62 |     "# Read in all of the downloaded geojsons as Shapely geometries\n",
 63 |     "\n",
 64 |     "with fiona.open(\"/tmp/bars.geojson\") as source:\n",
 65 |     "    bars_crs = source.crs['init']\n",
 66 |     "    bars = MultiPoint([shape(f['geometry']) for f in source])\n",
 67 |     "\n",
 68 |     "with fiona.open(\"/tmp/cafes.geojson\") as source:\n",
 69 |     "    cafes_crs = source.crs['init']\n",
 70 |     "    cafes = MultiPoint([shape(f['geometry']) for f in source])\n",
 71 |     "    \n",
 72 |     "with fiona.open(\"/tmp/transit.geojson\") as source:\n",
 73 |     "    transit_crs = source.crs['init']\n",
 74 |     "    transit = MultiPoint([shape(f['geometry']) for f in source]) \n",
 75 |     "    \n",
 76 |     "with fiona.open(\"/tmp/roads.geojson\") as source:\n",
 77 |     "    roads_crs = source.crs['init']\n",
 78 |     "    roads = [MultiLineString(shape(line['geometry'])) for line in source]"
 79 |    ]
 80 |   },
 81 |   {
 82 |    "cell_type": "code",
 83 |    "execution_count": null,
 84 |    "metadata": {},
 85 |    "outputs": [],
 86 |    "source": [
 87 |     "# Reproject each Shapely geometry to EPSG:3857 so it can be\n",
 88 |     "# displayed on the map\n",
 89 |     "\n",
 90 |     "def create_partial_reprojection_func(crs):\n",
 91 |     "    return partial(pyproj.transform,\n",
 92 |     "                   pyproj.Proj(init=crs),\n",
 93 |     "                   pyproj.Proj(init='epsg:3857'))\n",
 94 |     "\n",
 95 |     "reprojected_bars = [transform(create_partial_reprojection_func(bars_crs), bar) for bar in bars]\n",
 96 |     "reprojected_cafes = [transform(create_partial_reprojection_func(cafes_crs), cafe) for cafe in cafes]\n",
 97 |     "reprojected_transit = [transform(create_partial_reprojection_func(transit_crs), trans) for trans in transit]\n",
 98 |     "reprojected_roads = [transform(create_partial_reprojection_func(roads_crs), road) for road in roads]"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": null,
104 |    "metadata": {},
105 |    "outputs": [],
106 |    "source": [
107 |     "# Rasterize the road vectors and create the road fricition\n",
108 |     "# layer.\n",
109 |     "\n",
110 |     "rasterize_options = gps.RasterizerOptions(includePartial=True, sampleType='PixelIsArea')\n",
111 |     "\n",
112 |     "road_raster = gps.rasterize(geoms=reprojected_roads,\n",
113 |     "                            crs=\"EPSG:3857\",\n",
114 |     "                            zoom=12,\n",
115 |     "                            fill_value=1,\n",
116 |     "                            cell_type=gps.CellType.FLOAT32,\n",
117 |     "                            options=rasterize_options)\n",
118 |     "\n",
119 |     "road_friction = road_raster.reclassify(value_map={1:1},\n",
120 |     "                                       data_type=int,\n",
121 |     "                                       replace_nodata_with=10)"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": null,
127 |    "metadata": {},
128 |    "outputs": [],
129 |    "source": [
130 |     "# Create the cost distance layer for bars based on the\n",
131 |     "# road network. Then pyramid the layer.\n",
132 |     "\n",
133 |     "bar_layer = gps.cost_distance(friction_layer=road_friction,\n",
134 |     "                              geometries=reprojected_bars,\n",
135 |     "                              max_distance=1500000.0)\n",
136 |     "\n",
137 |     "bar_pyramid = bar_layer.pyramid()"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "code",
142 |    "execution_count": null,
143 |    "metadata": {},
144 |    "outputs": [],
145 |    "source": [
146 |     "# Create the cost distance layer for cafes based on the\n",
147 |     "# road network. Then pyramid the layer.\n",
148 |     "\n",
149 |     "cafe_layer = gps.cost_distance(friction_layer=road_friction,\n",
150 |     "                               geometries=reprojected_cafes,\n",
151 |     "                               max_distance=1500000.0)\n",
152 |     "\n",
153 |     "cafe_pyramid = cafe_layer.pyramid()"
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "code",
158 |    "execution_count": null,
159 |    "metadata": {},
160 |    "outputs": [],
161 |    "source": [
162 |     "# Create the cost distance layer for the transit stops\n",
163 |     "# based on the road network. Then pyramid the layer.\n",
164 |     "\n",
165 |     "transit_layer = gps.cost_distance(friction_layer=road_friction,\n",
166 |     "                                  geometries=reprojected_transit,\n",
167 |     "                                  max_distance=1500000.0)\n",
168 |     "\n",
169 |     "transit_pyramid = transit_layer.pyramid()"
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "code",
174 |    "execution_count": null,
175 |    "metadata": {},
176 |    "outputs": [],
177 |    "source": [
178 |     "# Calculate the weighted layer based on our preferences.\n",
179 |     "\n",
180 |     "weighted_layer = (-1 * bar_pyramid) + (transit_pyramid * 5) + (cafe_pyramid * 1)"
181 |    ]
182 |   },
183 |   {
184 |    "cell_type": "code",
185 |    "execution_count": null,
186 |    "metadata": {},
187 |    "outputs": [],
188 |    "source": [
189 |     "# Calculate the histogram for the weighted layer and\n",
190 |     "# then create a ColorRamp from the histogram.\n",
191 |     "\n",
192 |     "weighted_histogram = weighted_layer.get_histogram()\n",
193 |     "weighted_color_map = gps.ColorMap.build(breaks=weighted_histogram,\n",
194 |     "                                        colors='viridis')"
195 |    ]
196 |   },
197 |   {
198 |    "cell_type": "code",
199 |    "execution_count": null,
200 |    "metadata": {},
201 |    "outputs": [],
202 |    "source": [
203 |     "# Build the TMS server from the weighted layer with its\n",
204 |     "# ColorMap\n",
205 |     "\n",
206 |     "tms = gps.TMS.build(source=weighted_layer,\n",
207 |     "                    display=weighted_color_map)\n",
208 |     "tms.bind('0.0.0.0')"
209 |    ]
210 |   },
211 |   {
212 |    "cell_type": "code",
213 |    "execution_count": null,
214 |    "metadata": {},
215 |    "outputs": [],
216 |    "source": [
217 |     "# Adds the weighted layer and all of the geometries to the map\n",
218 |     "# Bars are red\n",
219 |     "# Cafes are orange\n",
220 |     "# Transit stops are green\n",
221 |     "\n",
222 |     "#M.add_layer(TMSRasterData(tms), name=\"Weighted Layer\")\n",
223 |     "#M.add_layer(VectorData(\"/tmp/bars.geojson\"), name=\"Bars\", colors=[0xff0000])\n",
224 |     "#M.add_layer(VectorData(\"/tmp/cafes.geojson\"), name=\"Cafes\")\n",
225 |     "#M.add_layer(VectorData(\"/tmp/transit.geojson\"), name=\"Transit\", colors=[0x00FF00])\n",
226 |     "\n",
227 |     "m = folium.Map(tiles='OpenStreetMap', location=map_center, zoom_start=zoom)\n",
228 |     "folium.TileLayer(tiles=tms.url_pattern, attr='GeoPySpark', name='Weighted layer', overlay=True).add_to(m)\n",
229 |     "folium.GeoJson('/tmp/bars.geojson', name='Bars', style_function=lambda x: {'radius': 2, 'color': 'red'}, overlay=True).add_to(m)\n",
230 |     "folium.GeoJson('/tmp/cafes.geojson', name='Cafes', style_function=lambda x: {'fillColor': 'orange'}, overlay=True).add_to(m)\n",
231 |     "folium.GeoJson('/tmp/transit.geojson', name='Transit', style_function=lambda x: {'fillColor': 'green'}, overlay=True).add_to(m)\n",
232 |     "m"
233 |    ]
234 |   }
235 |  ],
236 |  "metadata": {
237 |   "kernelspec": {
238 |    "display_name": "GeoPySpark",
239 |    "language": "python",
240 |    "name": "gps"
241 |   },
242 |   "language_info": {
243 |    "codemirror_mode": {
244 |     "name": "ipython",
245 |     "version": 3
246 |    },
247 |    "file_extension": ".py",
248 |    "mimetype": "text/x-python",
249 |    "name": "python",
250 |    "nbconvert_exporter": "python",
251 |    "pygments_lexer": "ipython3",
252 |    "version": "3.4.6"
253 |   }
254 |  },
255 |  "nbformat": 4,
256 |  "nbformat_minor": 2
257 | }
258 | 
--------------------------------------------------------------------------------
/rpms/build/.dockerignore:
--------------------------------------------------------------------------------
1 | archives/
2 | rpmbuild/SOURCES/
3 | 
--------------------------------------------------------------------------------
/rpms/build/Dockerfile.base:
--------------------------------------------------------------------------------
 1 | FROM amazonlinux:2016.09.1.20161221
 2 | MAINTAINER James McClain 
 3 | 
 4 | RUN yum makecache fast
 5 | 
 6 | # Java
 7 | RUN yum update -y
 8 | RUN yum install -y java-1.8.0-openjdk
 9 | 
10 | # Spark
11 | ENV SPARK_HOME /usr/local/spark-2.1.0-bin-hadoop2.7
12 | ADD blobs/spark-2.1.0-bin-hadoop2.7.tgz /usr/local
13 | RUN ln -s /usr/local/spark-2.1.0-bin-hadoop2.7 /usr/local/spark
14 | 
15 | # Kit and caboodle: locally-built RPMs, served by the temporary rpm-server container (see the 'base' target in the Makefile)
16 | RUN yum install -y \
17 |     http://localhost:18080/hdf5-1.8.20-33.x86_64.rpm \
18 |     http://localhost:18080/netcdf-4.5.0-33.x86_64.rpm \
19 |     http://localhost:18080/openjpeg230-2.3.0-33.x86_64.rpm \
20 |     http://localhost:18080/gdal231-2.3.1-33.x86_64.rpm \
21 |     http://localhost:18080/nodejs-8.5.0-13.x86_64.rpm \
22 |     http://localhost:18080/proj493-lib-4.9.3-33.x86_64.rpm \
23 |     http://localhost:18080/configurable-http-proxy-0.0.0-13.x86_64.rpm
24 | 
25 | RUN echo /usr/local/lib >> /etc/ld.so.conf.d/local.conf && \
26 |     echo /usr/local/lib64 >> /etc/ld.so.conf.d/local.conf && \
27 |     ldconfig
28 | 
29 | # Create user
30 | RUN yum install -y shadow-utils && \
31 |     useradd hadoop -m && usermod -a -G root hadoop && (echo 'hadoop:hadoop' | chpasswd)
32 | 
33 | # Misc
34 | RUN yum install -y unzip python34 pam
35 | RUN curl https://bootstrap.pypa.io/get-pip.py | python3.4
36 | RUN ln -s /usr/local/share/jupyter /usr/share/jupyter
37 | COPY etc/pam.d/login /etc/pam.d/login
38 | 
39 | RUN pip3.4 install -r http://localhost:28080/http-requirements.txt
40 | 
41 | USER hadoop
42 | 
--------------------------------------------------------------------------------
/rpms/build/Dockerfile.gcc4:
--------------------------------------------------------------------------------
 1 | FROM amazonlinux:2016.09.1.20161221
 2 | MAINTAINER James McClain 
 3 | 
 4 | RUN yum -y groupinstall "Development Tools" || echo
 5 | RUN yum -y install python34-devel cmake less nano && yum clean all
 6 | RUN curl https://bootstrap.pypa.io/get-pip.py | python3.4
 7 | RUN yum update -y
 8 | RUN yum install -y java-1.8.0-openjdk-devel
 9 | RUN yum clean all -y && yum update -y && \
10 |     yum install -y \
11 |       bzip2-devel \
12 |       cairo-devel \
13 |       libjpeg-turbo-devel \
14 |       libpng-devel \
15 |       libtiff-devel \
16 |       make \
17 |       pkgconfig \
18 |       rpm-build \
19 |       which \
20 |       zlib-devel
21 | RUN ln -s /usr/include/python3.4m /usr/include/python3.4
22 | 
23 | RUN yum makecache fast
24 | ENV JAVA_HOME=/etc/alternatives/jre
25 | 
--------------------------------------------------------------------------------
/rpms/build/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: all gcc4 aws-build-gdal rpms src wheels
 2 | 
 3 | FAMILY := quay.io/geodocker/emr-build
 4 | VERSION := 8
 5 | GCC4IMAGE := $(FAMILY):gcc4-$(VERSION)
 6 | BASEIMAGE := quay.io/geodocker/jupyter-geopyspark:base-$(VERSION)
 7 | INTERFACE ?= eth0
 8 | IP_ADDR := $(shell ifconfig $(INTERFACE) | grep -i mask | awk '{print $$2}' | cut -f2 -d:)
 9 | 
10 | all:
11 | 	echo "see build.sh"
12 | 
13 | gcc4:
14 | 	docker build -t $(GCC4IMAGE) -f Dockerfile.$@ .
15 | 
16 | base: blobs/spark-2.1.0-bin-hadoop2.7.tgz rpms wheel/http-requirements.txt
17 | 	docker run -dit --rm --name rpm-server --hostname rpm-server -p "18080:80" -v $(shell pwd)/rpmbuild/RPMS/x86_64:/usr/local/apache2/htdocs httpd:2.4
18 | 	docker run -dit --rm --name whl-server --hostname whl-server -p "28080:80" -v $(shell pwd)/wheel:/usr/local/apache2/htdocs httpd:2.4
19 | 	docker build --no-cache --add-host="localhost:$(IP_ADDR)" -t $(BASEIMAGE) -f Dockerfile.base .
20 | 	docker stop whl-server
21 | 	docker stop rpm-server
22 | 
23 | rpms: rpmbuild/RPMS/x86_64/proj493-lib-4.9.3-33.x86_64.rpm \
24 | rpmbuild/RPMS/x86_64/hdf5-1.8.20-33.x86_64.rpm \
25 | rpmbuild/RPMS/x86_64/netcdf-4.5.0-33.x86_64.rpm \
26 | rpmbuild/RPMS/x86_64/gdal231-2.3.1-33.x86_64.rpm \
27 | rpmbuild/RPMS/x86_64/nodejs-8.5.0-13.x86_64.rpm \
28 | rpmbuild/RPMS/x86_64/configurable-http-proxy-0.0.0-13.x86_64.rpm
29 | 
30 | src: rpmbuild/SOURCES/curl-7.57.0.tar.bz2 rpmbuild/SOURCES/zlib-1.2.11.tar.gz \
31 | rpmbuild/SOURCES/libpng-1.6.30.tar.xz rpmbuild/SOURCES/geos-3.6.1.tar.bz2 \
32 | rpmbuild/SOURCES/lcms2-2.8.tar.gz rpmbuild/SOURCES/openjpeg-2.3.0.tar.gz \
33 | rpmbuild/SOURCES/hdf5-1.8.20.tar.bz2 rpmbuild/SOURCES/netcdf-4.5.0.tar.gz \
34 | rpmbuild/SOURCES/gdal-2.3.1.tar.gz rpmbuild/SOURCES/node-v8.5.0.tar.gz \
35 | rpmbuild/SOURCES/proj-4.9.3.tar.gz
36 | 
37 | blobs/spark-2.1.0-bin-hadoop2.7.tgz:
38 | 	curl -L "http://d3kbcqa49mib13.cloudfront.net/spark-2.1.0-bin-hadoop2.7.tgz" -o $@
39 | 
40 | rpmbuild/SOURCES/%.tar: %/
41 | 	tar cvf $@ $<
42 | 
43 | include configurable-http-proxy.mk
44 | include gdal.mk
45 | include wheels.mk
46 | 
47 | clean:
48 | 	rm -f rpmbuild/SOURCES/*.tar
49 | 
50 | cleaner: clean
51 | 
52 | cleanest: cleaner
53 | 	rm -f rpmbuild/RPMS/x86_64/*
54 | 
55 | mrproper: cleanest
56 | 	rm -f rpmbuild/SOURCES/*
57 | 	rm -f wheel/*.whl
58 | 
--------------------------------------------------------------------------------
/rpms/build/README.md:
--------------------------------------------------------------------------------
 1 | # Introduction #
 2 | 
 3 | This directory contains the configuration and build files needed to (re)build the base image and the RPMs.
 4 | 
 5 | # Inventory #
 6 | 
 7 | ## Images ##
 8 | 
 9 | The following images can be built from the materials in this directory:
10 | 
11 |    - [`quay.io/geodocker/jupyter-geopyspark:aws-build-gdal-3`](Dockerfile.aws-build-gdal) is an image used to build the `gdal-and-friends.tar.gz` binary blob.  This image is meant to mimic the environment of an EC2 (EMR) instance as closely as possible so as to create a compatible artifact.
12 |    - [`quay.io/geodocker/jupyter-geopyspark:base-3`](Dockerfile.base) is the ancestor image of the image produced in the root of this repository.  It contains mostly slow-rate-of-change binary dependencies.
13 |    - [`quay.io/geodocker/emr-build:gcc4-3`](Dockerfile.gcc4) is used to build RPMs with gcc 4.8.
14 |    - [`quay.io/geodocker/emr-build:gcc6-3`](Dockerfile.gcc6) is used to build RPMs with gcc 6.4.
15 | 
16 | ## Files and Directories ##
17 | 
18 |    - [`archives`](archives) is an initially-empty directory that is populated with source code, tarballs, and RPMs downloaded or produced during the build process.
19 |    - [`blobs`](blobs) is an initially-empty directory that is populated with archives and RPMS from the `archives` directory.
20 |    - [`rpmbuild`](rpmbuild) is a directory containing configuration files used to produce the RPMs.
21 |    - [`scripts`](scripts) is a directory containing scripts used to build the RPMs mentioned above, as well as the `gdal-and-friends.tar.gz` tarball.
22 |    - [`Makefile`](Makefile) coordinates the build process.
23 |    - [`etc`](etc) contains additional configuration files that are included in the base image.
24 |    - The various Dockerfiles specify the various images discussed above.
25 |    - `*.mk`: these are included in the Makefile.
26 |    - `README.md`: this file.
27 | 
28 | # RPMs #
29 | 
30 | ## Building ##
31 | 
32 | From within this directory, type `./build.sh` to build all of the RPMs (this could take a very long time).
33 | Once they are built, type `./publish.sh s3://bucket/prefix/` where `s3://bucket/prefix/` is a "directory" on S3 for which you have write permissions.
34 | The RPMs will be published to `s3://bucket/prefix/abc123/` where `abc123` is the present git SHA.
35 | 
36 | This will also produce all of the images described above (including the base image).
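
A typical end-to-end run might look like the following sketch; the bucket and prefix are the same placeholders used above, not a real location:

```bash
# build everything (RPMs, wheels, and the base image); see build.sh
./build.sh

# publish the artifacts under a git-SHA-suffixed prefix
./publish.sh s3://bucket/prefix/
```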
37 | 
38 | ## Fetching ##
39 | 
40 | From within this directory, type `./fetch.sh s3://bucket/prefix/abc123/` where `s3://bucket/prefix/` is the path to a "directory" on S3 where RPMs have been previously published, and `abc123` is the git SHA from which those RPMs were produced.
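
`build.sh` accepts the same argument and calls `fetch.sh` before building, so a previously-published set of RPMs can be reused in one step (the path below is the same placeholder as above):

```bash
# fetch previously-published RPMs and wheels, then build only what is missing
./build.sh s3://bucket/prefix/abc123/
```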
41 | 
42 | ## Refreshing GeoPySpark ##
43 | 
44 | With a complete set of RPMs already present, the GeoPySpark RPMs can be refreshed (for example to a newer version) by deleting the old GeoPySpark RPMs, then executing the `rpms` Makefile target.
45 | 
46 | ```bash
47 | rm -f rpmbuild/RPMS/x86_64/geopyspark-*.rpm
48 | make rpms
49 | ```
50 | 
--------------------------------------------------------------------------------
/rpms/build/archives/.gitignore:
--------------------------------------------------------------------------------
1 | *.jar
2 | *.rpm
3 | *.tar
4 | *.tar.bz2
5 | *.tar.gz
6 | *.tar.xz
7 | *.tgz
8 | *.zip
9 | 
--------------------------------------------------------------------------------
/rpms/build/blobs/.gitignore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geodocker/geodocker-jupyter-geopyspark/9432107c7de8c067bf44182ba9a9a27cc027a927/rpms/build/blobs/.gitignore
--------------------------------------------------------------------------------
/rpms/build/build.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | if [ ! -z "$1" ]
 4 | then
 5 |     ./fetch.sh $1
 6 | fi
 7 | 
 8 | make gcc4
 9 | make rpms
10 | make wheels
11 | make base
12 | 
--------------------------------------------------------------------------------
/rpms/build/configurable-http-proxy.mk:
--------------------------------------------------------------------------------
 1 | rpmbuild/SOURCES/node-v8.5.0.tar.gz:
 2 | 	curl -L "https://nodejs.org/dist/v8.5.0/node-v8.5.0.tar.gz" -o $@
 3 | 
 4 | archives/ipykernel-629ac54cae9767310616d47d769665453619ac64.zip:
 5 | 	curl -L "https://github.com/ipython/ipykernel/archive/629ac54cae9767310616d47d769665453619ac64.zip" -o $@
 6 | 
 7 | archives/ipykernel.zip: archives/ipykernel-629ac54cae9767310616d47d769665453619ac64.zip \
 8 | patches/patch.diff
 9 | 	rm -rf ipykernel-629ac54cae9767310616d47d769665453619ac64/
10 | 	unzip $<
11 | 	cd ipykernel-629ac54cae9767310616d47d769665453619ac64; patch -p1 < ../patches/patch.diff
12 | 	zip -r $@ ipykernel-629ac54cae9767310616d47d769665453619ac64/
13 | 	rm -r ipykernel-629ac54cae9767310616d47d769665453619ac64/
14 | 
15 | rpmbuild/RPMS/x86_64/nodejs-8.5.0-13.x86_64.rpm: rpmbuild/SPECS/nodejs.spec \
16 | scripts/nodejs.sh \
17 | rpmbuild/SOURCES/node-v8.5.0.tar.gz
18 | 	docker run -it --rm \
19 |           -v $(shell pwd)/rpmbuild:/tmp/rpmbuild:rw \
20 |           -v $(shell pwd)/scripts:/scripts:ro \
21 |           $(GCC4IMAGE) /scripts/nodejs.sh $(shell id -u) $(shell id -g)
22 | 
23 | rpmbuild/RPMS/x86_64/configurable-http-proxy-0.0.0-13.x86_64.rpm: rpmbuild/SPECS/configurable-http-proxy.spec \
24 | scripts/configurable-http-proxy.sh \
25 | rpmbuild/SOURCES/configurable-http-proxy.tar \
26 | rpmbuild/RPMS/x86_64/nodejs-8.5.0-13.x86_64.rpm
27 | 	docker run -it --rm \
28 |           -v $(shell pwd)/archives:/archives:ro \
29 |           -v $(shell pwd)/rpmbuild:/tmp/rpmbuild:rw \
30 |           -v $(shell pwd)/scripts:/scripts:ro \
31 |           $(GCC4IMAGE) /scripts/configurable-http-proxy.sh $(shell id -u) $(shell id -g)
32 | 
--------------------------------------------------------------------------------
/rpms/build/configurable-http-proxy/.gitignore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geodocker/geodocker-jupyter-geopyspark/9432107c7de8c067bf44182ba9a9a27cc027a927/rpms/build/configurable-http-proxy/.gitignore
--------------------------------------------------------------------------------
/rpms/build/configurable-http-server.mk:
--------------------------------------------------------------------------------
 1 | rpmbuild/SOURCES/node-v8.5.0.tar.gz:
 2 | 	curl -L "https://nodejs.org/dist/v8.5.0/node-v8.5.0.tar.gz" -o $@
 3 | 
 4 | archives/ipykernel-629ac54cae9767310616d47d769665453619ac64.zip:
 5 | 	curl -L "https://github.com/ipython/ipykernel/archive/629ac54cae9767310616d47d769665453619ac64.zip" -o $@
 6 | 
 7 | archives/ipykernel.zip: archives/ipykernel-629ac54cae9767310616d47d769665453619ac64.zip patches/patch.diff
 8 | 	rm -rf ipykernel-629ac54cae9767310616d47d769665453619ac64/
 9 | 	unzip $<
10 | 	cd ipykernel-629ac54cae9767310616d47d769665453619ac64; patch -p1 < ../patches/patch.diff
11 | 	zip $@ $(shell find ipykernel-629ac54cae9767310616d47d769665453619ac64)
12 | 	rm -r ipykernel-629ac54cae9767310616d47d769665453619ac64/
13 | 
14 | rpmbuild/RPMS/x86_64/nodejs-8.5.0-13.x86_64.rpm: rpmbuild/SPECS/nodejs.spec scripts/nodejs.sh rpmbuild/SOURCES/node-v8.5.0.tar.gz
15 | 	docker run -it --rm \
16 |           -v $(shell pwd)/rpmbuild:/tmp/rpmbuild:rw \
17 |           -v $(shell pwd)/scripts:/scripts:ro \
18 |           $(GCC4IMAGE) /scripts/nodejs.sh $(shell id -u) $(shell id -g)
19 | 
20 | rpmbuild/RPMS/x86_64/configurable-http-proxy-0.0.0-13.x86_64.rpm: rpmbuild/SPECS/configurable-http-proxy.spec rpmbuild/RPMS/x86_64/nodejs-8.5.0-13.x86_64.rpm
21 | 	docker run -it --rm \
22 |           -v $(shell pwd)/archives:/archives:ro \
23 |           -v $(shell pwd)/rpmbuild:/tmp/rpmbuild:rw \
24 |           -v $(shell pwd)/scripts:/scripts:ro \
25 |           $(GCC4IMAGE) /scripts/configurable-http-proxy.sh $(shell id -u) $(shell id -g)
26 | 
--------------------------------------------------------------------------------
/rpms/build/etc/pam.d/login:
--------------------------------------------------------------------------------
 1 | #%PAM-1.0
 2 | auth [user_unknown=ignore success=ok ignore=ignore default=bad] pam_securetty.so
 3 | auth       substack     system-auth
 4 | auth       include      postlogin
 5 | account    required     pam_nologin.so
 6 | account    include      system-auth
 7 | password   include      system-auth
 8 | # pam_selinux.so close should be the first session rule
 9 | session    required     pam_selinux.so close
10 | session    required     pam_loginuid.so
11 | session    optional     pam_console.so
12 | # pam_selinux.so open should only be followed by sessions to be executed in the user context
13 | session    required     pam_selinux.so open
14 | session    required     pam_namespace.so
15 | session    optional     pam_keyinit.so force revoke
16 | session    include      system-auth
17 | session    include      postlogin
18 | -session   optional     pam_ck_connector.so
19 | 
--------------------------------------------------------------------------------
/rpms/build/fetch.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | if [ ! -z "$1" ]
 4 | then
 5 |     URI=$(echo $1 | sed 's,/$,,')
 6 |     make src
 7 |     aws s3 sync $URI rpmbuild/RPMS/x86_64/
 8 |     mv -f rpmbuild/RPMS/x86_64/*.whl wheel/
 9 |     touch rpmbuild/RPMS/x86_64/*.rpm  # freshen mtimes (and again below, in dependency order) so make does not rebuild the fetched RPMs
10 |     touch rpmbuild/RPMS/x86_64/hdf5-*.rpm
11 |     touch rpmbuild/RPMS/x86_64/netcdf-*.rpm
12 |     touch rpmbuild/RPMS/x86_64/gdal231-*.rpm
13 |     touch rpmbuild/RPMS/x86_64/jupyterhub-*.rpm
14 | fi
15 | 
--------------------------------------------------------------------------------
/rpms/build/gdal.mk:
--------------------------------------------------------------------------------
 1 | rpmbuild/SOURCES/proj-4.9.3.tar.gz:
 2 | 	curl -L "http://download.osgeo.org/proj/proj-4.9.3.tar.gz" -o $@
 3 | 
 4 | rpmbuild/SOURCES/hdf5-1.8.20.tar.bz2:
 5 | 	curl -L "https://support.hdfgroup.org/ftp/HDF5/current18/src/hdf5-1.8.20.tar.bz2" -o $@
 6 | 
 7 | rpmbuild/SOURCES/netcdf-4.5.0.tar.gz:
 8 | 	curl -L "ftp://ftp.unidata.ucar.edu/pub/netcdf/netcdf-4.5.0.tar.gz" -o $@
 9 | 
10 | rpmbuild/SOURCES/openjpeg-2.3.0.tar.gz:
11 | 	curl -L "https://github.com/uclouvain/openjpeg/archive/v2.3.0.tar.gz" -o $@
12 | 
13 | rpmbuild/SOURCES/gdal-2.3.1.tar.gz:
14 | 	curl -L "http://download.osgeo.org/gdal/2.3.1/gdal-2.3.1.tar.gz" -o $@
15 | 
16 | rpmbuild/RPMS/x86_64/openjpeg-2.3.0-33.x86_64.rpm: rpmbuild/SPECS/openjpeg.spec rpmbuild/SOURCES/openjpeg-2.3.0.tar.gz
17 | 	docker run -it --rm \
18 |           -v $(shell pwd)/rpmbuild:/tmp/rpmbuild:rw \
19 |           -v $(shell pwd)/scripts:/scripts:ro \
20 |           $(GCC4IMAGE) /scripts/openjpeg.sh $(shell id -u) $(shell id -g)
21 | 
22 | rpmbuild/RPMS/x86_64/hdf5-1.8.20-33.x86_64.rpm: rpmbuild/SPECS/hdf5.spec rpmbuild/RPMS/x86_64/proj493-4.9.3-33.x86_64.rpm rpmbuild/SOURCES/hdf5-1.8.20.tar.bz2
23 | 	docker run -it --rm \
24 |           -v $(shell pwd)/rpmbuild:/tmp/rpmbuild:rw \
25 |           -v $(shell pwd)/scripts:/scripts:ro \
26 |           $(GCC4IMAGE) /scripts/hdf5.sh $(shell id -u) $(shell id -g)
27 | 
28 | rpmbuild/RPMS/x86_64/netcdf-4.5.0-33.x86_64.rpm: rpmbuild/SPECS/netcdf.spec rpmbuild/RPMS/x86_64/proj493-4.9.3-33.x86_64.rpm rpmbuild/SOURCES/netcdf-4.5.0.tar.gz
29 | 	docker run -it --rm \
30 |           -v $(shell pwd)/rpmbuild:/tmp/rpmbuild:rw \
31 |           -v $(shell pwd)/scripts:/scripts:ro \
32 |           $(GCC4IMAGE) /scripts/netcdf.sh $(shell id -u) $(shell id -g)
33 | 
34 | rpmbuild/RPMS/x86_64/proj493-4.9.3-33.x86_64.rpm rpmbuild/RPMS/x86_64/proj493-lib-4.9.3-33.x86_64.rpm: rpmbuild/SPECS/proj.spec rpmbuild/SOURCES/proj-4.9.3.tar.gz
35 | 	docker run -it --rm \
36 |           -v $(shell pwd)/rpmbuild:/tmp/rpmbuild:rw \
37 |           -v $(shell pwd)/scripts:/scripts:ro \
38 |           $(GCC4IMAGE) /scripts/proj.sh $(shell id -u) $(shell id -g)
39 | 
40 | rpmbuild/RPMS/x86_64/gdal231-2.3.1-33.x86_64.rpm rpmbuild/RPMS/x86_64/gdal231-lib-2.3.1-33.x86_64.rpm: rpmbuild/SPECS/gdal.spec rpmbuild/RPMS/x86_64/openjpeg-2.3.0-33.x86_64.rpm rpmbuild/RPMS/x86_64/proj493-4.9.3-33.x86_64.rpm rpmbuild/RPMS/x86_64/hdf5-1.8.20-33.x86_64.rpm rpmbuild/RPMS/x86_64/netcdf-4.5.0-33.x86_64.rpm rpmbuild/SOURCES/gdal-2.3.1.tar.gz
41 | 	docker run -it --rm \
42 |           -v $(shell pwd)/rpmbuild:/tmp/rpmbuild:rw \
43 |           -v $(shell pwd)/scripts:/scripts:ro \
44 |           $(GCC4IMAGE) /scripts/gdal.sh $(shell id -u) $(shell id -g)
45 | 
--------------------------------------------------------------------------------
/rpms/build/geopyspark.mk:
--------------------------------------------------------------------------------
 1 | rpmbuild/SOURCES/geopyspark.tar: geopyspark/
 2 | 	tar cvf $@ geopyspark/
 3 | 
 4 | rpmbuild/RPMS/x86_64/geopyspark-deps-0.0.0-13.x86_64.rpm: rpmbuild/SPECS/geopyspark.spec \
 5 | scripts/geopyspark.sh rpmbuild/SOURCES/geopyspark.tar
 6 | 	docker run -it --rm \
 7 |           -v $(shell pwd)/archives:/archives:ro \
 8 | 	  -v $(shell pwd)/rpmbuild:/tmp/rpmbuild:rw \
 9 |           -v $(shell pwd)/scripts:/scripts:ro \
10 |           $(GCC4IMAGE) /scripts/geopyspark.sh $(shell id -u) $(shell id -g)
11 | 
--------------------------------------------------------------------------------
/rpms/build/patches/patch.diff:
--------------------------------------------------------------------------------
 1 | diff --git a/ipykernel/jsonutil.py b/ipykernel/jsonutil.py
 2 | index 3121e53..01b5d34 100644
 3 | --- a/ipykernel/jsonutil.py
 4 | +++ b/ipykernel/jsonutil.py
 5 | @@ -164,7 +164,10 @@ def json_clean(obj):
 6 |          # If all OK, proceed by making the new dict that will be json-safe
 7 |          out = {}
 8 |          for k,v in iteritems(obj):
 9 | -            out[unicode_type(k)] = json_clean(v)
10 | +            if str(type(v)) == "":
11 | +                out[unicode_type(k)] = json_clean(list(v))
12 | +            else:
13 | +                out[unicode_type(k)] = json_clean(v)
14 |          return out
15 |      if isinstance(obj, datetime):
16 |          return obj.strftime(ISO8601)
17 | 
--------------------------------------------------------------------------------
/rpms/build/publish.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | if [ ! -z "$1" ]
 4 | then
 5 |     URI=$(echo $1 | sed 's,/$,,')/$(git rev-parse HEAD)/
 6 |     aws s3 sync rpmbuild/RPMS/x86_64/ $URI
 7 |     aws s3 sync wheel/ $URI
 8 | else
 9 |     echo "Need location"
10 | fi
11 | 
--------------------------------------------------------------------------------
/rpms/build/rpmbuild/BUILD/.gitignore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geodocker/geodocker-jupyter-geopyspark/9432107c7de8c067bf44182ba9a9a27cc027a927/rpms/build/rpmbuild/BUILD/.gitignore
--------------------------------------------------------------------------------
/rpms/build/rpmbuild/RPMS/.gitignore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geodocker/geodocker-jupyter-geopyspark/9432107c7de8c067bf44182ba9a9a27cc027a927/rpms/build/rpmbuild/RPMS/.gitignore
--------------------------------------------------------------------------------
/rpms/build/rpmbuild/SOURCES/.gitignore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geodocker/geodocker-jupyter-geopyspark/9432107c7de8c067bf44182ba9a9a27cc027a927/rpms/build/rpmbuild/SOURCES/.gitignore
--------------------------------------------------------------------------------
/rpms/build/rpmbuild/SPECS/configurable-http-proxy.spec:
--------------------------------------------------------------------------------
 1 | %define _topdir   /tmp/rpmbuild
 2 | %define name      configurable-http-proxy
 3 | %define release   13
 4 | %define version   0.0.0
 5 | 
 6 | %define debug_package %{nil}
 7 | 
 8 | BuildRoot: %{buildroot}
 9 | Summary:   configurable-http-proxy
10 | License:   BSD-3
11 | Name:      %{name}
12 | Version:   %{version}
13 | Release:   %{release}
14 | Source:    configurable-http-proxy.tar
15 | Prefix:    /usr/local
16 | Group:     Development
17 | AutoReq:   no
18 | Requires:  pam
19 | Requires:  nodejs
20 | BuildRequires: nodejs
21 | %global _enable_debug_package 0
22 | %global debug_package %{nil}
23 | %global __os_install_post /usr/lib/rpm/brp-compress %{nil}
24 | 
25 | %description
26 | configurable-http-proxy
27 | 
28 | %prep
29 | %setup -q -n configurable-http-proxy
30 | 
31 | %build
32 | echo
33 | 
34 | %install
35 | find /usr/local | sort > before.txt
36 | npm install -g configurable-http-proxy
37 | find /usr/local | sort > after.txt
38 | tar cf /tmp/packages.tar $(diff before.txt after.txt | grep '^>' | cut -f2 '-d ')
39 | cd %{buildroot}
40 | tar axf /tmp/packages.tar
41 | 
42 | %files
43 | %defattr(-,root,root)
44 | /usr/local/*
45 | 
--------------------------------------------------------------------------------
/rpms/build/rpmbuild/SPECS/gdal.spec:
--------------------------------------------------------------------------------
 1 | %define _topdir   /tmp/rpmbuild
 2 | %define name      gdal231
 3 | %define release   33
 4 | %define version   2.3.1
 5 | 
 6 | %define debug_package %{nil}
 7 | 
 8 | BuildRoot: %{buildroot}
 9 | Summary:   GDAL
10 | License:   X/MIT
11 | Name:      %{name}
12 | Version:   %{version}
13 | Release:   %{release}
14 | Source:    gdal-%{version}.tar.gz
15 | Prefix:    /usr/local
16 | Group:     Azavea
17 | Requires:  libpng
18 | Requires:  libcurl
19 | Requires:  libgeos
20 | Requires:  hdf5
21 | Requires:  netcdf
22 | BuildRequires: geos-devel
23 | BuildRequires: lcms2-devel
24 | BuildRequires: libcurl-devel
25 | BuildRequires: libpng-devel
26 | BuildRequires: openjpeg230
27 | BuildRequires: zlib-devel
28 | BuildRequires: hdf5
29 | BuildRequires: netcdf
30 | 
31 | %description
32 | GDAL
33 | 
34 | %prep
35 | %setup -q -n gdal-2.3.1
36 | 
37 | %build
38 | PKG_CONFIG_PATH=/usr/local/lib/pkgconfig LDFLAGS='-L/usr/local/lib -L/usr/local/lib64' CC='gcc48' ./configure --prefix=/usr/local --with-java --with-curl --with-openjpeg
39 | nice -n 19 make -k -j$(grep -c ^processor /proc/cpuinfo) || make
40 | make -C swig/java
41 | 
42 | %install
43 | nice -n 19 make DESTDIR=%{buildroot} install
44 | cp -L swig/java/.libs/libgdalalljni* %{buildroot}/usr/local/lib/
45 | cp swig/java/gdal.jar %{buildroot}/usr/local/share/
46 | 
47 | %package lib
48 | Group: Geography
49 | Summary: GDAL
50 | %description lib
51 | The libraries
52 | 
53 | %files lib
54 | %defattr(-,root,root)
55 | /usr/local/lib
56 | /usr/local/share/gdal.jar
57 | 
58 | %files
59 | %defattr(-,root,root)
60 | /usr/local/*
61 | /usr/local/share/gdal.jar
62 | 
--------------------------------------------------------------------------------
/rpms/build/rpmbuild/SPECS/hdf5.spec:
--------------------------------------------------------------------------------
 1 | %define _topdir   /tmp/rpmbuild
 2 | %define name      hdf5
 3 | %define release   33
 4 | %define version   1.8.20
 5 | 
 6 | %define debug_package %{nil}
 7 | 
 8 | BuildRoot: %{buildroot}
 9 | Summary:   HDF5
10 | License:   X/MIT
11 | Name:      %{name}
12 | Version:   %{version}
13 | Release:   %{release}
14 | Source:    hdf5-%{version}.tar.bz2
15 | Prefix:    /usr/local
16 | Group:     Azavea
17 | Requires:  libcurl
18 | BuildRequires: libcurl-devel
19 | 
20 | %description
21 | HDF5
22 | 
23 | %prep
24 | %setup -q -n hdf5-1.8.20
25 | 
26 | %build
27 | ./configure --prefix=/usr/local
28 | nice -n 19 make -j$(grep -c ^processor /proc/cpuinfo)
29 | 
30 | %install
31 | nice -n 19 make DESTDIR=%{buildroot} install
32 | 
33 | %files
34 | %defattr(-,root,root)
35 | /usr/local/*
36 | 
--------------------------------------------------------------------------------
/rpms/build/rpmbuild/SPECS/netcdf.spec:
--------------------------------------------------------------------------------
 1 | %define _topdir   /tmp/rpmbuild
 2 | %define name      netcdf
 3 | %define release   33
 4 | %define version   4.5.0
 5 | 
 6 | %define debug_package %{nil}
 7 | 
 8 | BuildRoot: %{buildroot}
 9 | Summary:   NetCDF
10 | License:   X/MIT
11 | Name:      %{name}
12 | Version:   %{version}
13 | Release:   %{release}
14 | Source:    netcdf-%{version}.tar.gz
15 | Prefix:    /usr/local
16 | Group:     Azavea
17 | Requires:  libcurl
18 | Requires:  hdf5
19 | BuildRequires: libcurl-devel
20 | BuildRequires: hdf5
21 | 
22 | %description
23 | NetCDF
24 | 
25 | %prep
26 | %setup -q -n netcdf-4.5.0
27 | 
28 | %build
29 | ./configure --prefix=/usr/local
30 | nice -n 19 make -j$(grep -c ^processor /proc/cpuinfo)
31 | 
32 | %install
33 | nice -n 19 make DESTDIR=%{buildroot} install
34 | 
35 | %files
36 | %defattr(-,root,root)
37 | /usr/local/*
38 | 
--------------------------------------------------------------------------------
/rpms/build/rpmbuild/SPECS/nodejs.spec:
--------------------------------------------------------------------------------
 1 | %define _topdir   /tmp/rpmbuild
 2 | %define name      nodejs
 3 | %define release   13
 4 | %define version   8.5.0
 5 | 
 6 | %define debug_package %{nil}
 7 | 
 8 | BuildRoot: %{buildroot}
 9 | Summary:   NodeJS
10 | License:   node.js
11 | Name:      %{name}
12 | Version:   %{version}
13 | Release:   %{release}
14 | Source:    node-v%{version}.tar.gz
15 | Prefix:    /usr/local
16 | Group:     Azavea
17 | 
18 | %description
19 | Node.js 8.5.0
20 | 
21 | %prep
22 | %setup -q -n node-v8.5.0
23 | 
24 | %build
25 | ./configure --prefix=/usr/local
26 | nice -n 19 make -j$(grep -c ^processor /proc/cpuinfo)
27 | 
28 | %install
29 | make DESTDIR=%{buildroot} install
30 | 
31 | %files
32 | %defattr(-,root,root)
33 | /usr/local/*
34 | 
--------------------------------------------------------------------------------
/rpms/build/rpmbuild/SPECS/openjpeg.spec:
--------------------------------------------------------------------------------
 1 | %define _topdir   /tmp/rpmbuild
 2 | %define name      openjpeg230
 3 | %define release   33
 4 | %define version   2.3.0
 5 | 
 6 | %define debug_package %{nil}
 7 | 
 8 | BuildRoot: %{buildroot}
 9 | Summary:   OpenJPEG
10 | License:   X/MIT
11 | Name:      %{name}
12 | Version:   %{version}
13 | Release:   %{release}
14 | Source:    openjpeg-%{version}.tar.gz
15 | Prefix:    /usr/local
16 | Group:     Azavea
17 | Requires:  libpng
18 | Requires:  libcurl
19 | Requires:  libgeos
20 | Requires:  hdf5
21 | Requires:  netcdf
22 | BuildRequires: cmake
23 | BuildRequires: lcms2-devel
24 | 
25 | %description
26 | OpenJPEG
27 | 
28 | %prep
29 | %setup -q -n openjpeg-2.3.0
30 | 
31 | %build
32 | mkdir build
33 | cd build
34 | cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr/local
35 | make
36 | 
37 | %install
38 | cd build && make DESTDIR=%{buildroot} install
39 | 
40 | %files
41 | %defattr(-,root,root)
42 | /usr/local/lib/openjpeg-2.3/*
43 | /usr/local/lib/pkgconfig/libopenjp2.pc
44 | /usr/local/lib/libopenjp2*
45 | /usr/local/bin/opj*
46 | /usr/local/include/openjpeg-2.3
47 | 
--------------------------------------------------------------------------------
/rpms/build/rpmbuild/SPECS/proj.spec:
--------------------------------------------------------------------------------
 1 | %define _topdir   /tmp/rpmbuild
 2 | %define name      proj493
 3 | %define release   33
 4 | %define version   4.9.3
 5 | 
 6 | %define debug_package %{nil}
 7 | 
 8 | BuildRoot: %{buildroot}
 9 | Summary:   Proj4
10 | License:   MIT
11 | Name:      %{name}
12 | Version:   %{version}
13 | Release:   %{release}
14 | Source:    proj-%{version}.tar.gz
15 | Prefix:    /usr/local
16 | Group:     Azavea
17 | 
18 | %description
19 | Proj 4.9.3
20 | 
21 | %prep
22 | %setup -q -n proj-4.9.3
23 | 
24 | %build
25 | ./configure --prefix=/usr/local
26 | nice -n 19 make -j$(grep -c ^processor /proc/cpuinfo)
27 | 
28 | %install
29 | make DESTDIR=%{buildroot} install
30 | 
31 | %package lib
32 | Group: Geography
33 | Summary: Proj 4.9.3 libraries
34 | %description lib
35 | The libraries
36 | 
37 | %files
38 | %defattr(-,root,root)
39 | /usr/local/*
40 | 
41 | %files lib
42 | %defattr(-,root,root)
43 | /usr/local/lib/*
44 | 
--------------------------------------------------------------------------------
/rpms/build/rpmbuild/SRPMS/.gitignore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geodocker/geodocker-jupyter-geopyspark/9432107c7de8c067bf44182ba9a9a27cc027a927/rpms/build/rpmbuild/SRPMS/.gitignore
--------------------------------------------------------------------------------
/rpms/build/scripts/configurable-http-proxy.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | USERID=$1
 4 | GROUPID=$2
 5 | 
 6 | yum install -y /tmp/rpmbuild/RPMS/x86_64/nodejs-8.5.0-13.x86_64.rpm
 7 | ldconfig
 8 | 
 9 | cd /tmp/rpmbuild
10 | chown -R root:root /tmp/rpmbuild/SOURCES/configurable-http-proxy.tar
11 | rpmbuild -v -bb --clean SPECS/configurable-http-proxy.spec
12 | chown -R $USERID:$GROUPID /tmp/rpmbuild
13 | 
--------------------------------------------------------------------------------
/rpms/build/scripts/gdal.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | USERID=$1
 4 | GROUPID=$2
 5 | 
 6 | yum install -y geos-devel lcms2-devel libcurl-devel libpng-devel zlib-devel swig
 7 | yum localinstall -y /tmp/rpmbuild/RPMS/x86_64/proj493-4.9.3-33.x86_64.rpm /tmp/rpmbuild/RPMS/x86_64/hdf5-1.8.20-33.x86_64.rpm /tmp/rpmbuild/RPMS/x86_64/netcdf-4.5.0-33.x86_64.rpm /tmp/rpmbuild/RPMS/x86_64/openjpeg230-2.3.0-33.x86_64.rpm
 8 | ldconfig
 9 | 
10 | curl -o /tmp/ant.zip -L http://apache.spinellicreations.com//ant/binaries/apache-ant-1.9.13-bin.zip
11 | unzip -d /tmp/apache-ant /tmp/ant.zip
12 | export ANT_HOME=/tmp/apache-ant/apache-ant-1.9.13
13 | export PATH=$PATH:$ANT_HOME/bin
14 | 
15 | cd /tmp/rpmbuild
16 | chown -R root:root /tmp/rpmbuild/SOURCES/gdal-2.3.1.tar.gz
17 | rpmbuild -v -bb --clean SPECS/gdal.spec
18 | chown -R $USERID:$GROUPID /tmp/rpmbuild
19 | 
--------------------------------------------------------------------------------
/rpms/build/scripts/hdf5.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | USERID=$1
 4 | GROUPID=$2
 5 | 
 6 | yum install -y libcurl-devel
 7 | ldconfig
 8 | 
 9 | cd /tmp/rpmbuild
10 | chown -R root:root /tmp/rpmbuild/SOURCES/hdf5-1.8.20.tar.bz2
11 | rpmbuild -v -bb --clean SPECS/hdf5.spec
12 | chown -R $USERID:$GROUPID /tmp/rpmbuild
13 | 
--------------------------------------------------------------------------------
/rpms/build/scripts/netcdf.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | USERID=$1
 4 | GROUPID=$2
 5 | 
 6 | yum install -y libcurl-devel
 7 | yum localinstall -y /tmp/rpmbuild/RPMS/x86_64/proj493-4.9.3-33.x86_64.rpm /tmp/rpmbuild/RPMS/x86_64/hdf5-1.8.20-33.x86_64.rpm
 8 | ldconfig
 9 | 
10 | cd /tmp/rpmbuild
11 | chown -R root:root /tmp/rpmbuild/SOURCES/netcdf-4.5.0.tar.gz
12 | rpmbuild -v -bb --clean SPECS/netcdf.spec
13 | chown -R $USERID:$GROUPID /tmp/rpmbuild
14 | 
--------------------------------------------------------------------------------
/rpms/build/scripts/nodejs.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | USERID=$1
 4 | GROUPID=$2
 5 | 
 6 | ldconfig
 7 | 
 8 | cd /tmp/rpmbuild
 9 | chown -R root:root /tmp/rpmbuild/SOURCES/node-v8.5.0.tar.gz
10 | rpmbuild -v -bb --clean SPECS/nodejs.spec
11 | chown -R $USERID:$GROUPID /tmp/rpmbuild
12 | 
--------------------------------------------------------------------------------
/rpms/build/scripts/not.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | if [ ! -f "$1" ]; then
4 |     echo "$1";
5 | fi
6 | 
--------------------------------------------------------------------------------
/rpms/build/scripts/openjpeg.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | USERID=$1
 4 | GROUPID=$2
 5 | 
 6 | yum install -y lcms2-devel libcurl-devel zlib-devel
 7 | ldconfig
 8 | 
 9 | cd /tmp/rpmbuild
10 | chown -R root:root /tmp/rpmbuild/SOURCES/openjpeg-2.3.0.tar.gz
11 | rpmbuild -v -bb --clean SPECS/openjpeg.spec
12 | chown -R $USERID:$GROUPID /tmp/rpmbuild
13 | 
--------------------------------------------------------------------------------
/rpms/build/scripts/proj.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | USERID=$1
 4 | GROUPID=$2
 5 | 
 6 | ldconfig
 7 | 
 8 | cd /tmp/rpmbuild
 9 | chown -R root:root /tmp/rpmbuild/SOURCES/proj-4.9.3.tar.gz
10 | rpmbuild -v -bb --clean SPECS/proj.spec
11 | chown -R $USERID:$GROUPID /tmp/rpmbuild
12 | 
--------------------------------------------------------------------------------
/rpms/build/scripts/wheel.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | USERID=$1
 4 | GROUPID=$2
 5 | 
 6 | yum localinstall -y \
 7 |     /tmp/rpmbuild/RPMS/x86_64/proj493-4.9.3-33.x86_64.rpm \
 8 |     /tmp/rpmbuild/RPMS/x86_64/hdf5-1.8.20-33.x86_64.rpm \
 9 |     /tmp/rpmbuild/RPMS/x86_64/netcdf-4.5.0-33.x86_64.rpm \
10 |     /tmp/rpmbuild/RPMS/x86_64/openjpeg230-2.3.0-33.x86_64.rpm \
11 |     /tmp/rpmbuild/RPMS/x86_64/gdal231-2.3.1-33.x86_64.rpm
12 | ldconfig
13 | 
14 | mkdir -p /usr/share/jupyter/kernels
15 | mkdir /tmp/wheel
16 | cd /tmp/wheel
17 | cp /wheel/requirements.txt .
18 | export CC=gcc48
19 | pip3.4 install numpy==1.12.1
20 | pip3.4 wheel -r requirements.txt
21 | pip3.4 install ipython==5.1.0 /archives/ipykernel.zip
22 | pip3.4 wheel ipython==5.1.0 /archives/ipykernel.zip
23 | 
24 | chown -R $USERID:$GROUPID .
25 | 
26 | # Clean up duplicate packages (leave most recent version)
27 | rm -f tornado-5*
28 | rm -f pyzmq-17*
29 | for f in $(ls -1r | sed -e 's/^\(.*\)$/\1 \1/' | sed -e 's/^\([a-zA-Z0-9_]*\)-[0-9].* \(.*\)$/\1 \2/' | awk '{ if (seen[$1]++){print $2} }')
30 | do
31 |     echo "rm -f $f"
32 |     rm -f $f
33 | done
34 | 
35 | echo "Final wheel list: =============================================="
36 | ls -1r
37 | 
38 | cp *.whl /wheel
39 | 
--------------------------------------------------------------------------------
/rpms/build/wheel/requirements.txt:
--------------------------------------------------------------------------------
 1 | affine==2.0.0.post1
 2 | alembic==0.8.9
 3 | appdirs==1.4.3
 4 | backports-abc==0.5
 5 | bleach==1.5.0
 6 | boto3==1.7.26
 7 | click==6.7
 8 | click-plugins==1.0.3
 9 | cligj==0.4.0
10 | colortools==0.1.2
11 | cycler==0.10.0
12 | decorator==4.0.10
13 | entrypoints==0.2.2
14 | Fiona==1.7.1
15 | Flask==0.12.1
16 | Flask-Cors==3.0.2
17 | folium==0.5.0
18 | futures==3.1.1
19 | GDAL==2.3.1
20 | gevent==1.2.1
21 | html5lib==0.9999999
22 | https://github.com/jupyterhub/oauthenticator/archive/84ab3cce8db8c599ebd3bbbd836724bea6eb93a1.zip
23 | ipython==5.1.0
24 | ipython-genutils==0.1.0
25 | ipywidgets==6.0.0
26 | itsdangerous==0.24
27 | Jinja2==2.9.4
28 | jsonschema==2.5.1
29 | jupyter-client==4.4.0
30 | jupyter-core==4.2.1
31 | jupyterhub==0.8.1
32 | lxml==3.7.3
33 | Mako==1.0.6
34 | MarkupSafe==0.23
35 | matplotlib==2.0.0
36 | mistune==0.7.3
37 | ModestMaps==1.4.7
38 | munch==2.1.1
39 | nbconvert==5.0.0
40 | nbformat==4.2.0
41 | networkx==1.11
42 | notebook==5.0.0
43 | numpy==1.12.1
44 | olefile==0.44
45 | packaging==16.8
46 | pamela==0.3.0
47 | pandas==0.19.2
48 | pandocfilters==1.4.1
49 | pexpect==4.2.1
50 | pickleshare==0.7.4
51 | Pillow==5.1.0
52 | promise==0.4.2
53 | prompt-toolkit==1.0.9
54 | protobuf==3.6.1
55 | ptyprocess==0.5.1
56 | Pygments==2.1.3
57 | pyparsing==2.2.0
58 | pyproj==1.9.5.1
59 | python-dateutil==2.6.1
60 | python-editor==1.0.3
61 | pytz==2017.2
62 | PyWavelets==0.5.2
63 | pyzmq==16.0.2
64 | rasterio==1.0.3
65 | requests==2.12.4
66 | rise==5.2.0
67 | s3contents==0.1.10
68 | scikit-image==0.13.0
69 | scipy==0.19.0
70 | setuptools==18.5
71 | Shapely==1.6b4
72 | simplegeneric==0.8.1
73 | simplejson==3.13.2
74 | six==1.10.0
75 | snuggs==1.4.1
76 | SQLAlchemy==1.1.4
77 | sudospawner==0.5.1
78 | terminado==0.6
79 | testpath==0.3
80 | tornado==4.4.2
81 | traitlets==4.3.2
82 | virtualenv==13.1.2
83 | wcwidth==0.1.7
84 | Werkzeug==0.11.13
85 | widgetsnbextension==2.0.0
86 | 
--------------------------------------------------------------------------------
/rpms/build/wheels.mk:
--------------------------------------------------------------------------------
 1 | wheels wheel/http-requirements.txt: archives/ipykernel.zip \
 2 | rpmbuild/RPMS/x86_64/proj493-4.9.3-33.x86_64.rpm \
 3 | rpmbuild/RPMS/x86_64/hdf5-1.8.20-33.x86_64.rpm \
 4 | rpmbuild/RPMS/x86_64/netcdf-4.5.0-33.x86_64.rpm \
 5 | rpmbuild/RPMS/x86_64/gdal231-2.3.1-33.x86_64.rpm \
 6 | wheel/requirements.txt
 7 | 	(cd wheel ; rm -f *.whl)
 8 | 	docker run -it --rm \
 9 |           -v $(shell pwd)/archives:/archives:ro \
10 |           -v $(shell pwd)/rpmbuild:/tmp/rpmbuild:ro \
11 |           -v $(shell pwd)/wheel:/wheel:rw \
12 |           -v $(shell pwd)/scripts:/scripts:ro \
13 |           $(GCC4IMAGE) /scripts/wheel.sh $(shell id -u) $(shell id -g)
14 | 	(cd wheel ; ls *.whl | sed 's,^,http://localhost:28080/,' > http-requirements.txt)
15 | 
--------------------------------------------------------------------------------
/terraform/.gitignore:
--------------------------------------------------------------------------------
1 | .terraform*
2 | terraform.tfstate*
3 | 
--------------------------------------------------------------------------------
/terraform/aws.tf:
--------------------------------------------------------------------------------
1 | provider "aws" {
2 |   region     = "${var.region}"
3 | }
4 | 
--------------------------------------------------------------------------------
/terraform/bootstrap.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | RPM_URI=$(echo $1 | sed 's,/$,,')
  4 | NB_BUCKET=$(echo $2 | sed 's,s3://\([^/]*\).*,\1,')
  5 | NB_PREFIX=$(echo $2 | sed 's,s3://[^/]*/,,' | sed 's,/$,,')
  6 | OAUTH_MODULE=$3
  7 | OAUTH_CLASS=$4
  8 | OAUTH_CLIENT_ID=$5
  9 | OAUTH_CLIENT_SECRET=$6
 10 | GEOPYSPARKJARS=$7
 11 | GEOPYSPARKURI=$8
 12 | 
 13 | # Parses a configuration file put in place by EMR to determine the role of this node
 14 | is_master() {
 15 |   if [ $(jq '.isMaster' /mnt/var/lib/info/instance.json) = 'true' ]; then
 16 |     return 0
 17 |   else
 18 |     return 1
 19 |   fi
 20 | }
 21 | 
 22 | if is_master; then
 23 | 
 24 |     # Download packages
 25 |     mkdir -p /tmp/blobs/
 26 |     aws s3 sync $RPM_URI /tmp/blobs/
 27 | 
 28 |     # Install binary packages
 29 |     (cd /tmp/blobs; sudo yum localinstall -y openjpeg230-2.3.0-33.x86_64.rpm gdal231-2.3.1-33.x86_64.rpm hdf5-1.8.20-33.x86_64.rpm netcdf-4.5.0-33.x86_64.rpm nodejs-8.5.0-13.x86_64.rpm proj493-lib-4.9.3-33.x86_64.rpm configurable-http-proxy-0.0.0-13.x86_64.rpm)
 30 | 
 31 |     # Install Python packages
 32 |     sudo pip-3.4 install --upgrade pip
 33 |     sudo ln -s /usr/local/bin/pip3 /usr/bin/
 34 |     sudo ln -s /usr/local/bin/pip3.4 /usr/bin/
 35 |     (cd /tmp/blobs ; sudo pip3.4 install *.whl)
 36 | 
 37 |     # Linkage
 38 |     echo '/usr/local/lib' > /tmp/local.conf
 39 |     echo '/usr/local/lib64' >> /tmp/local.conf
 40 |     sudo cp /tmp/local.conf /etc/ld.so.conf.d/local.conf
 41 |     sudo ldconfig
 42 |     rm -f /tmp/local.conf
 43 | 
 44 |     # Set up user account to manage JupyterHub
 45 |     sudo groupadd shadow
 46 |     sudo chgrp shadow /etc/shadow
 47 |     sudo chmod 640 /etc/shadow
 48 |     # sudo usermod -a -G shadow hadoop
 49 |     sudo useradd -G shadow -r hublauncher
 50 |     sudo groupadd jupyterhub
 51 | 
 52 |     # Ensure that all members of `jupyterhub` group may log in to JupyterHub
 53 |     echo 'hublauncher ALL=(%jupyterhub) NOPASSWD: /usr/local/bin/sudospawner' | sudo tee -a /etc/sudoers
 54 |     echo 'hublauncher ALL=(ALL) NOPASSWD: /usr/sbin/useradd' | sudo tee -a /etc/sudoers
 55 |     echo 'hublauncher ALL=(ALL) NOPASSWD: /bin/chown' | sudo tee -a /etc/sudoers
 56 |     echo 'hublauncher ALL=(ALL) NOPASSWD: /bin/mkdir' | sudo tee -a /etc/sudoers
 57 |     echo 'hublauncher ALL=(ALL) NOPASSWD: /bin/mv' | sudo tee -a /etc/sudoers
 58 |     echo 'hublauncher ALL=(hdfs) NOPASSWD: /usr/bin/hdfs' | sudo tee -a /etc/sudoers
 59 | 
 60 |     # Environment setup
 61 |     cat <<EOF > /tmp/oauth_profile.sh
 62 | export AWS_DNS_NAME=$(aws ec2 describe-network-interfaces --filters Name=private-ip-address,Values=$(hostname -i) | jq -r '.[] | .[] | .Association.PublicDnsName')
 63 | export OAUTH_CALLBACK_URL=http://\$AWS_DNS_NAME:8000/hub/oauth_callback
 64 | export OAUTH_CLIENT_ID=$OAUTH_CLIENT_ID
 65 | export OAUTH_CLIENT_SECRET=$OAUTH_CLIENT_SECRET
 66 | 
 67 | alias launch_hub='sudo -u hublauncher -E env "PATH=/usr/local/bin:$PATH" jupyterhub --JupyterHub.spawner_class=sudospawner.SudoSpawner --SudoSpawner.sudospawner_path=/usr/local/bin/sudospawner --Spawner.notebook_dir=/home/{username}'
 68 | EOF
 69 |     sudo mv /tmp/oauth_profile.sh /etc/profile.d
 70 |     . /etc/profile.d/oauth_profile.sh
 71 | 
 72 |     # Setup required scripts/configurations for launching JupyterHub
 73 |     cat <<EOF > /tmp/new_user
 74 | #!/bin/bash
 75 | 
 76 | export user=\$1
 77 | 
 78 | sudo useradd -m -G jupyterhub,hadoop \$user
 79 | sudo -u hdfs hdfs dfs -mkdir /user/\$user
 80 | 
 81 | sudo mkdir -p /home/\$user/.jupyter/
 82 | 
 83 | cat << LOL > /tmp/jupyter_notebook_config.py.\$user
 84 | from s3contents import S3ContentsManager
 85 | 
 86 | c.NotebookApp.contents_manager_class = S3ContentsManager
 87 | c.S3ContentsManager.bucket = "$NB_BUCKET"
 88 | c.S3ContentsManager.prefix = "$NB_PREFIX"
 89 | 
 90 | LOL
 91 | 
 92 | sudo mv /tmp/jupyter_notebook_config.py.\$user /home/\$user/.jupyter/jupyter_notebook_config.py
 93 | sudo chown \$user:\$user -R /home/\$user/.jupyter/
 94 | 
 95 | EOF
 96 |     chmod +x /tmp/new_user
 97 |     sudo chown root:root /tmp/new_user
 98 |     sudo mv /tmp/new_user /usr/local/bin
 99 | 
100 |     cat <<EOF > /tmp/jupyterhub_config.py
101 | from oauthenticator.$OAUTH_MODULE import $OAUTH_CLASS
102 | 
103 | c = get_config()
104 | 
105 | c.JupyterHub.authenticator_class = $OAUTH_CLASS
106 | c.${OAUTH_CLASS}.create_system_users = True
107 | 
108 | c.JupyterHub.spawner_class='sudospawner.SudoSpawner'
109 | c.SudoSpawner.sudospawner_path='/usr/local/bin/sudospawner'
110 | c.Spawner.notebook_dir='/home/{username}'
111 | c.LocalAuthenticator.add_user_cmd = ['new_user']
112 | 
113 | EOF
114 | 
115 |     # Install GeoPySpark
116 |     if [[ $GEOPYSPARKURI == s3* ]]; then
117 | 	aws s3 cp $GEOPYSPARKURI /tmp/geopyspark.zip
118 | 	GEOPYSPARKURI=/tmp/geopyspark.zip
119 |     fi
120 |     sudo -E env "PATH=/usr/local/bin:$PATH" pip3.4 install "$GEOPYSPARKURI"
121 |     sudo -E env "PATH=/usr/local/bin:$PATH" jupyter nbextension enable --py widgetsnbextension --system
122 |     sudo mkdir -p /opt/jars/
123 |     for url in $(echo $GEOPYSPARKJARS | tr , "\n")
124 |     do
125 | 	if [[ $url == s3* ]]; then
126 | 	    (cd /opt/jars ; sudo aws s3 cp $url . )
127 | 	else
128 | 	    (cd /opt/jars ; sudo curl -L -O -C - $url )
129 | 	fi
130 |     done
131 | 
132 |     # Install GeoPySpark kernel
133 |     cat <<EOF > /tmp/kernel.json
134 | {
135 |     "language": "python",
136 |     "display_name": "GeoPySpark",
137 |     "argv": [
138 |         "/usr/bin/python3.4",
139 |         "-m",
140 |         "ipykernel",
141 |         "-f",
142 |         "{connection_file}"
143 |     ],
144 |     "env": {
145 |         "PYSPARK_PYTHON": "/usr/bin/python3.4",
146 |         "PYSPARK_DRIVER_PYTHON": "/usr/bin/python3.4",
147 |         "SPARK_HOME": "/usr/lib/spark",
148 |         "PYTHONPATH": "/usr/lib/spark/python/lib/pyspark.zip:/usr/lib/spark/python/lib/py4j-0.10.6-src.zip",
149 |         "GEOPYSPARK_JARS_PATH": "/opt/jars",
150 |         "YARN_CONF_DIR": "/etc/hadoop/conf",
151 |         "LD_LIBRARY_PATH": "/usr/local/lib:/usr/lib",
152 |         "PYSPARK_SUBMIT_ARGS": "--conf spark.executorEnv.LD_LIBRARY_PATH=/usr/local/lib:/usr/lib --conf hadoop.yarn.timeline-service.enabled=false pyspark-shell"
153 |     }
154 | }
155 | EOF
156 |     sudo mkdir -p /usr/local/share/jupyter/kernels/geopyspark
157 |     sudo cp /tmp/kernel.json /usr/local/share/jupyter/kernels/geopyspark/kernel.json
158 |     rm -f /tmp/kernel.json
159 | 
160 |     # Execute
161 |     cd /tmp
162 |     sudo -u hublauncher -E env "PATH=/usr/local/bin:$PATH" jupyterhub --JupyterHub.spawner_class=sudospawner.SudoSpawner --SudoSpawner.sudospawner_path=/usr/local/bin/sudospawner --Spawner.notebook_dir=/home/{username} -f /tmp/jupyterhub_config.py &
163 | 
164 | else
165 |     # Download packages
166 |     mkdir -p /tmp/blobs/
167 |     aws s3 sync $RPM_URI /tmp/blobs/
168 | 
169 |     # Install packages
170 |     (cd /tmp/blobs; sudo yum localinstall -y openjpeg230-2.3.0-33.x86_64.rpm gdal231-2.3.1-33.x86_64.rpm hdf5-1.8.20-33.x86_64.rpm netcdf-4.5.0-33.x86_64.rpm proj493-lib-4.9.3-33.x86_64.rpm)
171 | 
172 |     # Install Python packages
173 |     sudo pip-3.4 install --upgrade pip
174 |     sudo ln -s /usr/local/bin/pip3 /usr/bin/
175 |     sudo ln -s /usr/local/bin/pip3.4 /usr/bin/
176 |     (cd /tmp/blobs ; sudo pip3.4 install *.whl)
177 | 
178 |     # Install GeoPySpark
179 |     if [[ $GEOPYSPARKURI == s3* ]]; then
180 | 	aws s3 cp $GEOPYSPARKURI /tmp/geopyspark.zip
181 | 	GEOPYSPARKURI=/tmp/geopyspark.zip
182 |     fi
183 |     sudo -E env "PATH=/usr/local/bin:$PATH" pip3.4 install "$GEOPYSPARKURI"
184 | 
185 |     # Linkage
186 |     echo '/usr/local/lib' > /tmp/local.conf
187 |     echo '/usr/local/lib64' >> /tmp/local.conf
188 |     sudo cp /tmp/local.conf /etc/ld.so.conf.d/local.conf
189 |     sudo ldconfig
190 |     rm -f /tmp/local.conf
191 | fi
192 | 
--------------------------------------------------------------------------------
/terraform/cluster-configurations.json:
--------------------------------------------------------------------------------
 1 | [
 2 |     {
 3 |         "Classification": "spark",
 4 |         "Properties": {
 5 |             "maximizeResourceAllocation": "true"
 6 |         }
 7 |     },
 8 |     {
 9 |         "Classification": "spark-defaults",
10 |         "Properties": {
11 |             "spark.driver.maxResultSize": "3G",
12 |             "spark.dynamicAllocation.enabled": "true",
13 |             "spark.shuffle.service.enabled": "true",
14 |             "spark.shuffle.compress": "true",
15 |             "spark.shuffle.spill.compress": "true",
16 |             "spark.rdd.compress": "true",
17 |             "spark.yarn.executor.memoryOverhead": "1G",
18 |             "spark.yarn.driver.memoryOverhead": "1G",
19 |             "spark.driver.maxResultSize": "3G",
20 |             "spark.executor.extraJavaOptions" : "-XX:+UseParallelGC -Dgeotrellis.s3.threads.rdd.write=64"
21 |         }
22 |     },
23 |     {
24 |         "Classification": "yarn-site",
25 |         "Properties": {
26 |             "yarn.resourcemanager.am.max-attempts": "1",
27 |             "yarn.nodemanager.vmem-check-enabled": "false",
28 |             "yarn.nodemanager.pmem-check-enabled": "false"
29 |         }
30 |     }
31 | ]
32 | 
--------------------------------------------------------------------------------
/terraform/emr.tf:
--------------------------------------------------------------------------------
 1 | resource "aws_emr_cluster" "emr-spark-cluster" {
 2 |   name          = "GeoPySpark Cluster"
 3 |   applications  = ["Hadoop", "Spark", "Ganglia"]
 4 |   log_uri       = "${var.s3_log_uri}"
 5 |   release_label = "emr-5.13.0"
 6 |   service_role  = "${var.emr_service_role}"
 7 | 
 8 |   ec2_attributes {
 9 |     # subnet_id        = "subnet-xxxxxxxx"
10 |     instance_profile = "${var.emr_instance_profile}"
11 |     key_name         = "${var.key_name}"
12 | 
13 |     emr_managed_master_security_group = "${aws_security_group.security-group.id}"
14 |     emr_managed_slave_security_group  = "${aws_security_group.security-group.id}"
15 |   }
16 | 
17 |   instance_group {
18 |     # bid_price      = "${var.bid_price}"
19 |     instance_count = 1
20 |     instance_role  = "MASTER"
21 |     instance_type  = "m3.xlarge"
22 |     name           = "geopyspark-master"
23 |   }
24 | 
25 |   instance_group {
26 |     bid_price      = "${var.bid_price}"
27 |     instance_count = "${var.worker_count}"
28 |     instance_role  = "CORE"
29 |     instance_type  = "m3.xlarge"
30 |     name           = "geopyspark-core"
31 |   }
32 | 
33 |   bootstrap_action {
34 |     path = "s3://${var.bs_bucket}/${var.bs_prefix}/bootstrap.sh"
35 |     name = "geopyspark"
36 |     args = [
37 |       "${var.s3_rpm_uri}",
38 |       "${var.s3_notebook_uri}",
39 |       "${var.jupyterhub_oauth_module}",
40 |       "${var.jupyterhub_oauth_class}",
41 |       "${var.oauth_client_id}",
42 |       "${var.oauth_client_secret}",
43 |       "${var.geopyspark_jars}",
44 |       "${var.geopyspark_uri}"
45 |     ]
46 |   }
47 | 
48 |   configurations = "cluster-configurations.json"
49 | 
50 |   depends_on = ["aws_s3_bucket_object.bootstrap"]
51 | }
52 | 
53 | output "emr-id" {
54 |   value = "${aws_emr_cluster.emr-spark-cluster.id}"
55 | }
56 | 
57 | output "emr-master" {
58 |   value = "${aws_emr_cluster.emr-spark-cluster.master_public_dns}"
59 | }
60 | 
--------------------------------------------------------------------------------
/terraform/s3.tf:
--------------------------------------------------------------------------------
1 | # bootstrap.sh
2 | resource "aws_s3_bucket_object" "bootstrap" {
3 |   bucket = "${var.bs_bucket}"
4 |   key    = "${var.bs_prefix}/bootstrap.sh"
5 |   source = "./bootstrap.sh"
6 |   etag   = "${md5(file("./bootstrap.sh"))}"
7 | }
8 | 
--------------------------------------------------------------------------------
/terraform/security-group.tf:
--------------------------------------------------------------------------------
 1 | resource "aws_security_group" "security-group" {
 2 |   ingress {
 3 |     from_port = 0
 4 |     to_port   = 0
 5 |     protocol  = "-1"
 6 |     self      = true
 7 |   }
 8 | 
 9 |   ingress {
10 |     from_port = 49152
11 |     to_port   = 65535
12 |     protocol  = "tcp"
13 |     cidr_blocks = ["0.0.0.0/0"]
14 |     description = "Allow TMS ports"
15 |   }
16 | 
17 |   ingress {
18 |     from_port   = "22"
19 |     to_port     = "22"
20 |     protocol    = "tcp"
21 |     cidr_blocks = ["0.0.0.0/0"]
22 |   }
23 | 
24 |   ingress {
25 |     from_port   = "${var.jupyterhub_port}"
26 |     to_port     = "${var.jupyterhub_port}"
27 |     protocol    = "tcp"
28 |     cidr_blocks = ["0.0.0.0/0"]
29 |   }
30 | 
31 |   egress {
32 |     from_port   = 0
33 |     to_port     = 0
34 |     protocol    = "-1"
35 |     cidr_blocks = ["0.0.0.0/0"]
36 |   }
37 | 
38 |   lifecycle {
39 |     create_before_destroy = true
40 |   }
41 | }
42 | 
--------------------------------------------------------------------------------
/terraform/variables.tf:
--------------------------------------------------------------------------------
  1 | variable "region" {
  2 |     type        = "string"
  3 |     description = "AWS Region"
  4 |     default     = "us-east-1"
  5 | }
  6 | 
  7 | variable "key_name" {
  8 |     type        = "string"
  9 |     description = "The name of the EC2 secret key (primarily for SSH access)"
 10 | }
 11 | 
 12 | variable "s3_log_uri" {
 13 |     type        = "string"
 14 |     description = "Where EMR logs will be sent"
 15 | }
 16 | 
 17 | variable "ecs_ami" {
 18 |     type        = "string"
 19 |     description = "AMI to use for the ECS Instance"
 20 |     default     = "ami-9eb4b1e5"
 21 | }
 22 | 
 23 | variable "jupyterhub_port" {
 24 |     type        = "string"
 25 |     description = "The port on which to connect to JupyterHub"
 26 |     default     = "8000"
 27 | }
 28 | 
 29 | variable "worker_count" {
 30 |     type        = "string"
 31 |     description = "The number of worker nodes"
 32 |     default     = "1"
 33 | }
 34 | 
 35 | variable "emr_service_role" {
 36 |   type        = "string"
 37 |   description = "EMR service role"
 38 |   default     = "EMR_DefaultRole"
 39 | }
 40 | 
 41 | variable "emr_instance_profile" {
 42 |   type        = "string"
 43 |   description = "EMR instance profile"
 44 |   default     = "EMR_EC2_DefaultRole"
 45 | }
 46 | 
 47 | variable "ecs_instance_profile" {
 48 |   type        = "string"
 49 |   description = "ECS instance profile"
 50 |   default     = "ecsInstanceRole"
 51 | }
 52 | 
 53 | variable "bs_bucket" {
 54 |   type        = "string"
 55 |   description = "S3 Bucket containing the bootstrap script (e.g. bucket if the whole path is s3://bucket/containing/bootstrap)"
 56 | }
 57 | 
 58 | variable "bs_prefix" {
 59 |   type        = "string"
 60 |   description = "The prefix of the bootstrap script within the s3 bucket (e.g. containing/bootstrap if the whole path is s3://bucket/containing/bootstrap/bootstrap.sh)"
 61 | }
 62 | 
 63 | variable "s3_rpm_uri" {
 64 |   type        = "string"
 65 |   description = "S3 path containing RPMs (e.g. s3://bucket/containing/rpms/)"
 66 | }
 67 | 
 68 | variable "s3_notebook_uri" {
 69 |   type        = "string"
 70 |   description = "S3 path for notebooks (e.g. s3://bucket/containing/notebooks/)"
 71 | }
 72 | 
 73 | variable "jupyterhub_oauth_module" {
 74 |   type        = "string"
 75 |   description = "Name of the jupyterhub/oauthenticator module to import the jupyterhub_oauth_class from"
 76 |   default     = "github"
 77 | }
 78 | 
 79 | variable "jupyterhub_oauth_class" {
 80 |   type        = "string"
 81 |   description = "Name of the OAuth class provided by jupyterhub/oauthenticator"
 82 |   default     = "LocalGitHubOAuthenticator"
 83 | }
 84 | 
 85 | variable "oauth_client_id" {
 86 |   type        = "string"
 87 |   description = "Client ID token for OAuth server"
 88 | }
 89 | 
 90 | variable "oauth_client_secret" {
 91 |   type        = "string"
 92 |   description = "Client secret token for OAuth server"
 93 | }
 94 | 
 95 | variable "geopyspark_jars" {
 96 |   type        = "string"
 97 |   description = "Comma-separated list of URIs pointing to GeoPySpark jars"
 98 |   default     = "s3://geopyspark-dependency-jars/geotrellis-backend-assembly-0.4.1.jar"
 99 | }
100 | 
101 | variable "geopyspark_uri" {
102 |   type        = "string"
103 |   description = "URI from which the GeoPySpark Python code is to be obtained"
104 |   default     = "https://github.com/locationtech-labs/geopyspark/archive/d03d95fcd0e24cfca7df81fa56dcd84e30035a0f.zip"
105 | }
106 | 
107 | variable "bid_price" {
108 |   type        = "string"
109 |   description = "Bid Price"
110 |   default     = "0.07"
111 | }
112 | 
--------------------------------------------------------------------------------
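Note: the variables in /terraform/variables.tf that lack defaults must be supplied at plan/apply time, typically via a terraform.tfvars file or -var flags. Below is a minimal, hypothetical terraform.tfvars sketch; every bucket name, key pair, and OAuth token is a placeholder to be replaced with your own values, and the remaining variables (region, worker_count, bid_price, geopyspark_jars, geopyspark_uri, etc.) fall back to the defaults declared in variables.tf.

    # terraform.tfvars -- hypothetical example values (placeholders only)
    key_name            = "my-ec2-keypair"                        # existing EC2 key pair for SSH access
    s3_log_uri          = "s3://my-bucket/emr-logs/"              # destination for EMR logs
    bs_bucket           = "my-bucket"                             # bucket holding bootstrap.sh
    bs_prefix           = "geopyspark/bootstrap"                  # prefix under that bucket
    s3_rpm_uri          = "s3://my-bucket/geopyspark/rpms/"       # RPMs and wheels produced by rpms/build
    s3_notebook_uri     = "s3://my-bucket/geopyspark/notebooks/"  # notebook storage used by s3contents
    oauth_client_id     = "0123456789abcdef0123"                  # OAuth app client ID (GitHub by default)
    oauth_client_secret = "0123456789abcdef0123456789abcdef01234567"  # OAuth app client secret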