├── .dockerignore ├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── config └── geonotebook.ini ├── emr-docker ├── Makefile ├── README.md ├── bootstrap-geopyspark-docker.sh ├── config-aws.mk.template ├── config-emr.mk └── configurations.json ├── kernels ├── local │ └── kernel.json └── yarn │ └── kernel.json ├── notebooks ├── Getting the mask.ipynb ├── Landsat.ipynb ├── NLCD viewer.ipynb ├── Park citing.ipynb ├── Pine Habitat.ipynb ├── SRTM-emr.ipynb ├── SRTM-local.ipynb ├── libya.ipynb └── sanfranmvp.ipynb ├── rpms └── build │ ├── .dockerignore │ ├── Dockerfile.base │ ├── Dockerfile.gcc4 │ ├── Makefile │ ├── README.md │ ├── archives │ └── .gitignore │ ├── blobs │ └── .gitignore │ ├── build.sh │ ├── configurable-http-proxy.mk │ ├── configurable-http-proxy │ └── .gitignore │ ├── configurable-http-server.mk │ ├── etc │ └── pam.d │ │ └── login │ ├── fetch.sh │ ├── gdal.mk │ ├── geopyspark.mk │ ├── patches │ └── patch.diff │ ├── publish.sh │ ├── rpmbuild │ ├── BUILD │ │ └── .gitignore │ ├── RPMS │ │ └── .gitignore │ ├── SOURCES │ │ └── .gitignore │ ├── SPECS │ │ ├── configurable-http-proxy.spec │ │ ├── gdal.spec │ │ ├── hdf5.spec │ │ ├── netcdf.spec │ │ ├── nodejs.spec │ │ ├── openjpeg.spec │ │ └── proj.spec │ └── SRPMS │ │ └── .gitignore │ ├── scripts │ ├── configurable-http-proxy.sh │ ├── gdal.sh │ ├── hdf5.sh │ ├── netcdf.sh │ ├── nodejs.sh │ ├── not.sh │ ├── openjpeg.sh │ ├── proj.sh │ └── wheel.sh │ ├── wheel │ └── requirements.txt │ └── wheels.mk └── terraform ├── .gitignore ├── aws.tf ├── bootstrap.sh ├── cluster-configurations.json ├── emr.tf ├── s3.tf ├── security-group.tf └── variables.tf /.dockerignore: -------------------------------------------------------------------------------- 1 | archives/ 2 | bootstrap/ 3 | emr/ 4 | emr-nodocker/ 5 | geopyspark-*/ 6 | netcdf-backend/ 7 | notebooks/ 8 | rpms/ 9 | scratch/ 10 | thredds-feature-s3-hdfs/ 11 | .travis/ 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | archives/*.jar 2 | archives/*.tar.* 3 | archives/*.whl 4 | archives/*.zip 5 | blobs/* 6 | emr/config-aws.mk 7 | geopyspark-*/ 8 | notebooks/.ipynb_checkpoints/ 9 | rpms/build/blobs/* 10 | rpms/build/rpmbuild/RPMS/x86_64/* 11 | rpms/build/rpmbuild/SOURCES/* 12 | rpms/build/wheel/http-requirements.txt 13 | rpms/build/wheel/*.whl 14 | scratch/* 15 | target/* 16 | terraform-docker/.terraform/* 17 | terraform-docker/.terraform* 18 | terraform-docker/terraform.tfstate* 19 | terraform-nodocker/.terraform/* 20 | terraform-nodocker/.terraform* 21 | terraform-nodocker/terraform.tfstate* 22 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | 3 | language: 4 | - scala 5 | 6 | scala: 7 | - "2.11.11" 8 | 9 | jdk: 10 | - openjdk8 11 | 12 | services: 13 | - docker 14 | 15 | env: 16 | global: 17 | - CLEAN_TRAVIS_TAG=${TRAVIS_TAG/[[:space:]]/} 18 | - TAG=${CLEAN_TRAVIS_TAG:-${TRAVIS_COMMIT:0:7}} 19 | 20 | branches: 21 | only: 22 | - master 23 | 24 | addons: 25 | apt: 26 | packages: 27 | - make 28 | 29 | script: 30 | - TRAVIS=1 TAG=$TAG make image 31 | 32 | after_success: 33 | - if [ "$QUAY_USERNAME" != "" -a "$QUAY_PASSWORD" != "" ]; then 34 | docker login -u="$QUAY_USERNAME" -p="$QUAY_PASSWORD" quay.io; 35 | TRAVIS=1 TAG=$TAG make publish; 36 | fi 37 | 
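The `env` block in the `.travis.yml` above derives the Docker image tag from Travis build metadata using bash parameter expansion. The following standalone snippet (an illustration only, not part of the repository; the variable values are made up) shows the effect of those expansions:

```bash
# Hypothetical stand-ins for the variables Travis exports:
TRAVIS_TAG="v0.4.2 "            # the git tag, possibly empty or containing a stray space
TRAVIS_COMMIT="a75dda65434b472045f3fd97baca1e9bdc1ac353"

# The same expansions used in .travis.yml:
CLEAN_TRAVIS_TAG=${TRAVIS_TAG/[[:space:]]/}      # strip whitespace from the tag (first match)
TAG=${CLEAN_TRAVIS_TAG:-${TRAVIS_COMMIT:0:7}}    # fall back to the 7-character short commit SHA

echo "$TAG"   # prints "v0.4.2" here; prints "a75dda6" when TRAVIS_TAG is empty
```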
-------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM quay.io/geodocker/jupyter-geopyspark:base-8 2 | 3 | ARG GEOPYSPARK_VERSION 4 | ARG GEOPYSPARKSHA 5 | 6 | ENV PYSPARK_PYTHON=python3.4 7 | ENV PYSPARK_DRIVER_PYTHON=python3.4 8 | 9 | # Set up Jupyter 10 | RUN mkdir /home/hadoop/notebooks && \ 11 | pip3 install --user pytest && \ 12 | jupyter nbextension enable --py widgetsnbextension 13 | COPY kernels/local/kernel.json /home/hadoop/.local/share/jupyter/kernels/pyspark/kernel.json 14 | 15 | # Install GeoPySpark 16 | RUN pip3 install --user protobuf==3.3.0 traitlets==4.3.2 "https://github.com/locationtech-labs/geopyspark/archive/$GEOPYSPARKSHA.zip" 17 | 18 | # Install Jars 19 | ADD https://s3.amazonaws.com/geopyspark-dependency-jars/geotrellis-backend-assembly-${GEOPYSPARK_VERSION}.jar /opt/jars/ 20 | 21 | USER root 22 | RUN chmod ugo+r /opt/jars/* 23 | RUN chown -R hadoop:hadoop /home/hadoop/.local/share 24 | USER hadoop 25 | 26 | WORKDIR /tmp 27 | CMD ["jupyterhub", "--no-ssl", "--Spawner.notebook_dir=/home/hadoop/notebooks"] 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: image clean cleaner cleanest mrproper 2 | 3 | TAG ?= 8 4 | FAMILY := quay.io/geodocker/jupyter-geopyspark 5 | IMAGE := $(FAMILY):$(TAG) 6 | GEOPYSPARK_SHA ?= a75dda65434b472045f3fd97baca1e9bdc1ac353 7 | GEOPYSPARK_VERSION ?= 0.4.2 8 | 9 | all: image 10 | 11 | image: Dockerfile 12 | docker build \ 13 | --build-arg GEOPYSPARK_VERSION=$(GEOPYSPARK_VERSION) \ 14 | --build-arg GEOPYSPARKSHA=$(GEOPYSPARK_SHA) \ 15 | -t $(IMAGE) -f Dockerfile . 16 | 17 | clean: 18 | 19 | cleaner: clean 20 | 21 | cleanest: cleaner 22 | 23 | mrproper: cleanest 24 | 25 | publish: 26 | docker tag $(IMAGE) "$(FAMILY):latest" 27 | docker push $(IMAGE) 28 | docker push "$(FAMILY):latest" 29 | 30 | run: 31 | mkdir -p $(HOME)/.aws 32 | docker run -it --rm --name geopyspark \ 33 | -p 8000:8000 -p 4040:4040 \ 34 | $(EXTRA_FLAGS) \ 35 | -v $(shell pwd)/notebooks:/home/hadoop/notebooks:rw \ 36 | -v $(HOME)/.aws:/home/hadoop/.aws:ro \ 37 | $(IMAGE) 38 | 39 | run-editable: 40 | mkdir -p $(HOME)/.aws 41 | docker run -it --rm --name geopyspark \ 42 | -p 8000:8000 -p 4040:4040 \ 43 | $(EXTRA_FLAGS) \ 44 | -v $(GEOPYSPARK_DIR):/home/hadoop/.local/lib/python3.4/site-packages/geopyspark:rw \ 45 | -v $(shell pwd)/notebooks:/home/hadoop/notebooks:rw \ 46 | -v $(HOME)/.aws:/home/hadoop/.aws:ro \ 47 | $(IMAGE) 48 | 49 | shell: 50 | docker exec -it geopyspark bash 51 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction # 2 | 3 | This repository contains the configuration and build files necessary to produce the [`quay.io/geodocker/jupyter-geopyspark` Docker image](https://quay.io/repository/geodocker/jupyter-geopyspark?tab=tags). 4 | The Docker image allows easy use of [GeoPySpark](https://github.com/locationtech-labs/geopyspark) in a web browser via [Jupyter](https://github.com/jupyter/jupyter/) and [GeoNotebook](https://github.com/opengeoscience/geonotebook) without having to modify or configure the host computer (beyond what is needed to run Docker). 5 | 6 | The process of [using a pre-built container](#without-a-clone) is discussed in the next section, 7 | and instructions for [building the image](#building-the-image) and [modifying it](#modifying-the-image-or-image-architecture) are also discussed. 8 | 9 | # Using The Image # 10 | 11 | You will be prompted for a username and a password when you direct your web browser to the container: the username and password are both `hadoop`. 12 | 13 | One can use the image with or without making a clone of this repository. 14 | 15 | ## Without A Clone ## 16 | 17 | To use the image without (or from outside of) a clone of this repository, 18 | first make sure that you are in possession of the image. 19 | The command 20 | ``` 21 | docker pull quay.io/geodocker/jupyter-geopyspark 22 | ``` 23 | will pull the latest version of the image. 
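To pull a specific version rather than the latest one, append a tag to the image name; for example (using the sample tag that appears later in this README -- substitute whichever tag you actually want)
```
docker pull quay.io/geodocker/jupyter-geopyspark:a1b78b9
```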
24 | 25 | The container can then be started by typing 26 | ``` 27 | docker run -it --rm --name geopyspark \ 28 | -p 8000:8000 -p 4040:4040 \ 29 | quay.io/geodocker/jupyter-geopyspark 30 | ``` 31 | or perhaps 32 | ``` 33 | docker run -it --rm --name geopyspark \ 34 | -p 8000:8000 -p 4040:4040 \ 35 | -v $HOME/.aws:/home/hadoop/.aws:ro \ 36 | quay.io/geodocker/jupyter-geopyspark 37 | ``` 38 | if you wish to have your AWS credentials available in the container (e.g. for pulling data from S3). 39 | 40 | ## From A Clone ## 41 | 42 | To use the image from within a clone of this repository, 43 | there are [two useful targets in the Makefile: `run` and `run-editable`](Makefile#L149-L166). 44 | To use the `run` target, type something like 45 | ``` 46 | TAG=latest make run 47 | ``` 48 | To use the `run` target with some image other than the latest one, type something like 49 | ``` 50 | TAG=a1b78b9 make run 51 | ``` 52 | which will launch a container using the image `quay.io/geodocker/jupyter-geopyspark:a1b78b9`. 53 | 54 | The `run-editable` target also exists, which attempts to map one's local clone of GeoPySpark into the container so that that code can be edited and iterated on in a fairly convenient fashion. 55 | By default, it is assumed that the GeoPySpark code is present in `../geopyspark/geopyspark`, but that assumption can be changed by passing in an alternate location through the `GEOPYSPARK_DIR` environment variable. 56 | Here 57 | ``` 58 | TAG=latest GEOPYSPARK_DIR=/tmp/geopyspark/geopyspark make run-editable 59 | ``` 60 | is an example of that. 61 | 62 | Both of those targets also pay attention to the `EXTRA_FLAGS` environment variable, which can be used to pass additional flags to docker. 63 | 64 | # Building The Image # 65 | 66 | To build the image, type `make all`, `make image`, or simply `make`. 67 | 68 | Type 69 | ``` 70 | make run 71 | ``` 72 | to run the newly-built image. 73 | If the `TAG` environment variable is not set, the `run` target will default to the tag of the newly-built image. 74 | 75 | # Modifying The Image; or, Image Architecture # 76 | 77 | In this section we describe the structure of the repository 78 | and document how the various pieces interact as part of the build process. 79 | 80 | ## Repository Structure ## 81 | 82 | - [`archives`](archives) is an initially-empty directory that is populated with source code and built artifacts as part of the build process. 83 | - [`blobs`](blobs) is an initially-empty directory that is populated with built artifacts from the `archives` directory. 84 | This directory exists because `archives` is listed in the [`.dockerignore`](.dockerignore) file 85 | (which was done to reduce the size of the [build context](https://docs.docker.com/engine/reference/commandline/build/) of the final image). 86 | Please see the [README](bootstrap/README.md) in that directory for more information. 87 | - [`config`](config) contains the [GeoNotebook configuration file](config/geonotebook.ini) 88 | and a [list of python dependencies](config/requirements.txt) that GeoNotebook requires. 89 | - [`emr-docker`](emr-docker) contains files useful for running the image on Amazon EMR (please see the [README](emr-docker/README.md) in that directory for more information). 90 | - [`terraform-docker`](terraform-docker) contains files useful for running the image on Amazon EMR using Terraform. Its remit is similar to that of the directory mentioned in the previous bullet-point, but it uses Terraform instead of shell scripts. 
91 | - [`kernels`](kernels) contains Jupyter kernel configuration files. 92 | The one most likely to be of interest is [the one](geonotebook/kernel.json) that enables GeoNotebook and GeoPySpark; the other two kernels are mostly vestigial/ceremonial. 93 | - [`notebooks`](notebooks) contains various sample notebooks. 94 | - [`scratch`](scratch) is a scratch directory used during the build process. 95 | The files that are added under this directory during the build can be harmlessly deleted after the build is complete, 96 | but not doing so will accelerate subsequent builds. 97 | - [`scripts`](scripts) contains various scripts used for building and installing artifacts. 98 | - [`netcdf.sh`](scripts/netcdf.sh) builds a jar from a [particular branch](https://github.com/Unidata/thredds/tree/feature/s3+hdfs) of the [Thredds](https://github.com/Unidata/thredds) project that provides support for reading NetCDF files. 99 | - [`build-python-blob1.sh`](build-python-blob1.sh) [runs in the context of the AWS build container](Makefile#L62-L70); 100 | its purpose is to acquire most of the python dependencies needed by GeoPySpark and GeoNotebook and package them together into a tarball for later installation. 101 | - [`build-python-blob2.sh`](build-python-blob2.sh) [runs in the context of the AWS build container](Makefile#L72-L80); 102 | its purpose is to package GeoPySpark and [`GeoPySpark-NetCDF`](https://github.com/geotrellis/geopyspark-netcdf) into a tarball for later installation. 103 | - [`install-blob1.sh`](scripts/install-blob1.sh) runs [in the context of the final image build](Dockerfile#L17). 104 | Its purpose is to install the artifacts created earlier by `build-python-blob1.sh`. 105 | - [`install-blob2.sh`](scripts/install-blob2.sh) runs [in the context of the final image build](Dockerfile#L40). 106 | Its purpose is to install the artifacts created earlier by `build-python-blob2.sh`. 107 | - [`Dockerfile`](Dockerfile) specifies the final image, the output of the build process. 108 | - [`Makefile`](Makefile) coordinates the build process. 109 | - [`README.md`](README.md) is this file. 110 | 111 | ## Build Process ## 112 | 113 | The build process can be divided into three stages: the bootstrap image creation stage, the EMR-compatible artifact creation stage, and the final image build stage. 114 | 115 | When the `all` makefile target is invoked, the last two stages of the three-stage build process are done. 116 | 117 | ### Stage 0: Build Bootstrap Images ### 118 | 119 | The first of the three stages is done using the contents of the [`rpms/build`](rpms/build) directory. 120 | Its results have already been pushed to the `quay.io/geodocker` docker repository, so unless the reader wishes to modify the bootstrap images, this stage can be considered complete. 121 | To rebuild the bootstrap images, the reader should navigate into the `rpms/build` directory and run the `./build.sh` script. 122 | 123 | ### Stage 1: EMR-Compatible Artifacts ### 124 | 125 | The purpose of this stage is to build python artifacts that need to be linked against those binary dependencies which have been built 126 | in a context that resembles EMR (because we want the image to be usable on EMR). 127 | 128 | First, a tarball containing python code linked against the binary dependencies mentioned above [is created](Makefile#L62-L70). 129 | Then, another python tarball containing GeoPySpark [is created](Makefile#L72-L80). 
130 | The reason that there are two python tarballs instead of one is simply because the contents of the two tarballs change at different rates; 131 | over repeated builds, the first tarball is built less frequently than the second one. 132 | 133 | ### Stage 2: Build Final Image ### 134 | 135 | In the third of the three stages, the artifacts which were created earlier are brought together and installed into the final docker image. 136 | 137 | ## Adding Binary Dependencies ## 138 | 139 | As an example of how to make a meaningful modification to the image, 140 | in this section we will describe the process of adding new binary dependencies to the image. 141 | 142 | Currently, all binary dependencies are located in the file [`gdal-and-friends.tar.gz`](bootstrap/Makefile#L123-L134) which comes in via the [`quay.io/geodocker/jupyter-geopyspark:base-2`](rpms/build/Dockerfile.base) image on which the final image is based. 143 | If we want to add an additional binary dependency inside of that file, 144 | then we only need to [download or otherwise acquire the source code](rpms/build/Makefile#L3-L17) 145 | and update the [build script](rpms/build/scripts/build-gdal.sh) to build and package the additional code. 146 | If we wish to add a binary dependency outside of the `gdal-and-friends.tar.gz` file, then the process is slightly more involved, 147 | but potentially faster because it is not necessary to rebuild bootstrap images. 148 | 149 | The strategy for adding a new binary dependency, hypothetically `libHelloWorld` packaged in a file called `helloworld-and-friends.tar.gz`, 150 | will be to mirror the process for `gdal-and-friends.tar.gz` to the extent that we can. 151 | The difference is that this time we will add the binary to the final image rather than to a bootstrap image. 152 | - First, augment the [`Makefile`](Makefile) to download or otherwise ensure the existence of the `libHelloWorld` source code. 153 | - Next, we want to build and package `libHelloWorld` in the context of the AWS build image, so that it will be usable on EMR. 154 | This would probably be done by first creating a script analogous to [the one for GDAL](rpms/build/scripts/build-gdal.sh) that builds, links, and archives the dependency. 155 | - That script should run in the context of the AWS build container so that the created binaries are compiled and linked in an environment that resembles EMR. 156 | - The resulting archived binary blob should then be added to the final image so that it can be distributed to the Spark executors. 157 | That should probably be done by adding a `COPY` command to the Dockerfile to copy the new blob to the `/blobs` directory of the image. 158 | - Finally, the image environment and the kernel should both be modified to make use of the new dependency. 159 | The former will probably involve the addition of an `ENV` command to the Dockerfile to augment the `LD_LIBRARY_PATH` environment variable to be able to find any new shared libraries. 160 | The latter is described below. 
161 | 162 | The changes to the kernel described in the last bullet-point would probably look something like this: 163 | ```diff 164 | @@ -14,6 +14,6 @@ 165 | "PYTHONPATH": "/usr/local/spark-2.1.0-bin-hadoop2.7/python/lib/pyspark.zip:/usr/local/spark-2.1.0-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip", 166 | "GEOPYSPARK_JARS_PATH": "/opt/jars", 167 | "YARN_CONF_DIR": "/etc/hadoop/conf", 168 | - "PYSPARK_SUBMIT_ARGS": "--archives /blobs/gdal-and-friends.tar.gz,/blobs/friends-of-geopyspark.tar.gz,/blobs/geopyspark-sans-friends.tar.gz --conf spark.executorEnv.LD_LIBRARY_PATH=gdal-and-friends.tar.gz/lib --conf spark.executorEnv.PYTHONPATH=friends-of-geopyspark.tar.gz/:geopyspark-sans-friends.tar.gz/ --conf hadoop.yarn.timeline-service.enabled=false pyspark-shell" 169 | + "PYSPARK_SUBMIT_ARGS": "--archives /blobs/helloworld-and-friends.tar.gz,/blobs/gdal-and-friends.tar.gz,/blobs/friends-of-geopyspark.tar.gz,/blobs/geopyspark-sans-friends.tar.gz --conf spark.executorEnv.LD_LIBRARY_PATH=helloworld-and-friends.tar.gz/lib:gdal-and-friends.tar.gz/lib --conf spark.executorEnv.PYTHONPATH=friends-of-geopyspark.tar.gz/:geopyspark-sans-friends.tar.gz/ --conf hadoop.yarn.timeline-service.enabled=false pyspark-shell" 170 | } 171 | } 172 | ``` 173 | 174 | (The changes represented by the diff above have not been tested.) 175 | 176 | The process for adding new distributed python dependencies is analogous to the one above, 177 | except that changes to the `LD_LIBRARY_PATH` variable on the executors might not be required, 178 | and additions most-probably will need to be made to the `--conf spark.executorEnv.PYTHONPATH` configuration passed in via `PYSPARK_SUBMIT_ARGS` in the kernel. 179 | 180 | # RPM-based Deployment # 181 | 182 | ## Build RPMs ## 183 | 184 | To build the RPMs, navigate into the [`rpms/build`](rpms/build/) directory and type `./build.sh`. 185 | 186 | ## Terraform And AWS ## 187 | 188 | To use the RPM-based deployment, navigate into the [`terraform-nodocker`](terraform-nodocker/) directory. 189 | The configuration in that directory requires [Terraform](https://www.terraform.io/) version 0.10.6 or greater. 190 | If you want to use Google OAuth, GitHub OAuth, or some supported generic type of OAuth, then type 191 | ```bash 192 | terraform init 193 | terraform apply 194 | ``` 195 | and respond appropriately to the prompts. 196 | 197 | Doing that will upload (or sync) the RPMs to the S3 location that you specify, and will also upload the [`terraform-nodocker/bootstrap.sh`](terraform-nodocker/bootstrap.sh) bootstrap script. 198 | 199 | If you do not wish to use OAuth, then [some modifications to the bootstrap script](terraform-nodocker/bootstrap.sh#L84-L93) will be required. 200 | 201 | # OAuth # 202 | 203 | ## With The Docker Image ## 204 | 205 | In order to use OAuth for login, two things are necessary: 206 | it is necessary to [set three environment](https://github.com/jupyterhub/oauthenticator/blame/f5e39b1ece62b8d075832054ed3213cc04f85030/README.md#L74-L78) variables inside of the container before the JupyterHub process is launched, and 207 | it is necessary to use a `jupyterhub_config.py` file that enables the desired OAuth setup. 208 | 209 | ### Environment Variables ### 210 | 211 | The three environment variables that must be set are `OAUTH_CALLBACK_URL`, `OAUTH_CLIENT_ID`, and `OAUTH_CLIENT_SECRET`. 212 | The first of those three variables should be set to `http://localhost:8000/hub/oauth_callback` for local testing and something like `http://$(hostname -f):8000/hub/oauth_callback` for deployment. 
213 | The second and third are dependent on the OAuth provider. 214 | 215 | ### `jupyterhub_config.py` ### 216 | 217 | There are three such files already included in the image: 218 | one for [Google](config/jupyterhub_config_google.py) and related services, 219 | one for [GitHub](config/jupyterhub_config_github.py), 220 | and a [generic](config/jupyterhub_config_generic.py) one. 221 | There is some variability in the precise details of how OAuth providers work (e.g. some require variables to be passed in the URL of a POST request, whereas others require variables to be passed in the body of a POST request). 222 | For that reason, the generic configuration should be considered a starting point rather than something that is guaranteed to work in its unmodified state. 223 | 224 | There are only two user accounts in the image: `root` and `hadoop`. 225 | All three of the configurations discussed above map all valid OAuth users to the `hadoop` account. 226 | That is done because -- without additional configuration -- Spark jobs on EMR must come from a user named "`hadoop`". 227 | (The users inside of the container are separate and distinct from those on the host instance, 228 | but the username is evidently part of a Spark job submission, so it must match that of the user that EMR is expecting submissions from.) 229 | 230 | ### Using ### 231 | 232 | To use OAuth, launch a container with the three variables supplied and with the appropriate `jupyterhub_config.py` used. 233 | 234 | ```bash 235 | docker run -it --rm --name geopyspark \ 236 | -p 8000:8000 \ 237 | -e OAUTH_CALLBACK_URL=http://localhost:8000/hub/oauth_callback \ 238 | -e OAUTH_CLIENT_ID=xyz \ 239 | -e OAUTH_CLIENT_SECRET=abc \ 240 | quay.io/geodocker/jupyter-geopyspark:latest \ 241 | jupyterhub \ 242 | -f /etc/jupterhub/jupyterhub_config_github.py \ 243 | --no-ssl --Spawner.notebook_dir=/home/hadoop/notebooks 244 | ``` 245 | 246 | ## With The RPM-based Deployment ## 247 | 248 | This was discussed [earlier](#terraform-and-aws). 
249 | -------------------------------------------------------------------------------- /config/geonotebook.ini: -------------------------------------------------------------------------------- 1 | [default] 2 | vis_server = geotrellis 3 | log_level = WARNING 4 | 5 | [geotrellis] 6 | url = http://127.0.0.1:8000/user/jack/geotrellis 7 | 8 | [geoserver] 9 | username = admin 10 | password = geoserver 11 | url = http://127.0.0.1:8080/geoserver 12 | 13 | [ktile] 14 | url = http://127.0.0.1:8000/ktile 15 | default_cache = ktile_default_cache 16 | 17 | [ktile_default_cache] 18 | name = Test 19 | path = /tmp/stache 20 | umask = 0000 21 | 22 | [basemap] 23 | url = http://{s}.tile.openstreetmap.org/{z}/{x}/{y}.png 24 | attribution = Tile data © OpenStreetMap contributors 25 | -------------------------------------------------------------------------------- /emr-docker/Makefile: -------------------------------------------------------------------------------- 1 | include config-aws.mk # Vars related to AWS credentials and services used 2 | include config-emr.mk # Vars related to type and size of EMR cluster 3 | 4 | SCRIPT_RUNNER := s3://elasticmapreduce/libs/script-runner/script-runner.jar 5 | 6 | ifeq ($(USE_SPOT),true) 7 | MASTER_BID_PRICE:=BidPrice=${MASTER_PRICE}, 8 | WORKER_BID_PRICE:=BidPrice=${WORKER_PRICE}, 9 | BACKEND=accumulo 10 | endif 11 | 12 | ifndef CLUSTER_ID 13 | CLUSTER_ID=$(shell if [ -e "cluster-id.txt" ]; then cat cluster-id.txt; fi) 14 | endif 15 | 16 | 17 | upload-code: 18 | @aws s3 cp bootstrap-geopyspark-docker.sh ${S3_URI}/bootstrap-geopyspark-docker.sh 19 | 20 | create-cluster: 21 | aws emr create-cluster --name "${NAME}" \ 22 | --release-label emr-5.7.0 \ 23 | --output text \ 24 | --use-default-roles \ 25 | --log-uri ${S3_URI}/logs \ 26 | --ec2-attributes KeyName=${EC2_KEY},SubnetId=${SUBNET_ID} \ 27 | --applications Name=Hadoop Name=Spark Name=Zeppelin \ 28 | --instance-groups \ 29 | Name=Master,${MASTER_BID_PRICE}InstanceCount=1,InstanceGroupType=MASTER,InstanceType=${MASTER_INSTANCE} \ 30 | Name=Workers,${WORKER_BID_PRICE}InstanceCount=${WORKER_COUNT},InstanceGroupType=CORE,InstanceType=${WORKER_INSTANCE} \ 31 | --bootstrap-actions Name=GeoPySpark,Path=${S3_URI}/bootstrap-geopyspark-docker.sh \ 32 | | tee cluster-id.txt 33 | 34 | wait: INTERVAL:=60 35 | wait: STEP_ID=$(shell cat last-step-id.txt) 36 | wait: 37 | @while (true); do \ 38 | OUT=$$(aws emr describe-step --cluster-id ${CLUSTER_ID} --step-id ${STEP_ID}); \ 39 | [[ $$OUT =~ (\"State\": \"([A-Z]+)\") ]]; \ 40 | echo $${BASH_REMATCH[2]}; \ 41 | case $${BASH_REMATCH[2]} in \ 42 | PENDING | RUNNING) sleep ${INTERVAL};; \ 43 | COMPLETED) exit 0;; \ 44 | *) exit 1;; \ 45 | esac; \ 46 | done 47 | 48 | terminate-cluster: 49 | aws emr terminate-clusters --cluster-ids ${CLUSTER_ID} 50 | rm -f cluster-id.txt 51 | rm -f last-step-id.txt 52 | 53 | proxy: 54 | aws emr socks --cluster-id ${CLUSTER_ID} --key-pair-file "${HOME}/${EC2_KEY}.pem" 55 | 56 | ssh: 57 | aws emr ssh --cluster-id ${CLUSTER_ID} --key-pair-file "${HOME}/${EC2_KEY}.pem" 58 | 59 | get-logs: 60 | @aws emr ssh --cluster-id $(CLUSTER_ID) --key-pair-file "${HOME}/${EC2_KEY}.pem" \ 61 | --command "rm -rf /tmp/spark-logs && hdfs dfs -copyToLocal /var/log/spark/apps /tmp/spark-logs" 62 | @mkdir -p logs/$(CLUSTER_ID) 63 | @aws emr get --cluster-id $(CLUSTER_ID) --key-pair-file "${HOME}/${EC2_KEY}.pem" --src "/tmp/spark-logs/" --dest logs/$(CLUSTER_ID) 64 | 65 | 66 | .PHONY: create-cluster get-logs 67 | 
-------------------------------------------------------------------------------- /emr-docker/README.md: -------------------------------------------------------------------------------- 1 | # Running Jupyter, GeoNotebook and GeoPySpark on EMR 2 | 3 | This section of the repository contains a bootstrap script and a Makefile 4 | that let you easily spin up an EMR cluster running the docker container 5 | of this repository. 6 | 7 | _Requires_: A reasonably up-to-date [`aws-cli`](https://aws.amazon.com/cli/); this document was written with version `1.10`. 8 | 9 | ### Configuration 10 | 11 | Configuration has been broken out into two files which are imported by the `Makefile`. 12 | 13 | - __config-aws.mk__: AWS credentials, S3 staging bucket, subnet, etc. 14 | - __config-emr.mk__: EMR cluster type and size 15 | 16 | You will need to create your `config-aws.mk` based off of `config-aws.mk.template` to reflect your credentials and your VPC configuration. 17 | 18 | `config-emr.mk` contains the following parameters: 19 | 20 | - __NAME__: The name of the EMR cluster 21 | - __MASTER_INSTANCE__: The type of instance to use for the master node. 22 | - __MASTER_PRICE__: The maximum bid price for the master node, if using spot instances. 23 | - __WORKER_INSTANCE__: The type of instance to use for the worker nodes. 24 | - __WORKER_PRICE__: The maximum bid price for the worker nodes, if using spot instances. 25 | - __WORKER_COUNT__: The number of workers to include in this cluster. 26 | - __USE_SPOT__: Set to `true` to use spot instances. 27 | 28 | ### The bootstrap script 29 | 30 | EMR allows you to specify a script to run on the creation of both the master and worker nodes. 31 | We supply a script here, `bootstrap-geopyspark-docker.sh`, that will set up and run 32 | this docker container with the proper configuration in the bootstrap step. 33 | 34 | The script needs to be on S3 in order to be available to the EMR startup process; 35 | to place it on S3, use the Makefile command 36 | 37 | ```bash 38 | $ make upload-code 39 | ``` 40 | 41 | ### Starting the cluster 42 | 43 | Now all we have to do to interact with the cluster is use the following Makefile commands: 44 | 45 | ```bash 46 | # Create the cluster 47 | $ make create-cluster 48 | 49 | # Terminate the cluster 50 | $ make terminate-cluster 51 | 52 | # SSH into the master node 53 | $ make ssh 54 | 55 | # Create an SSH tunnel to the master for viewing EMR Application UIs 56 | $ make proxy 57 | 58 | # Grab the logs from the master 59 | $ make get-logs 60 | ``` 61 | 62 | The create-cluster command will place a text file, `cluster-id.txt`, which holds the Cluster ID, in this directory. 63 | All the other commands use that ID to interact with the cluster. `terminate-cluster` will remove this text file. 64 | 65 | ### Accessing JupyterHub 66 | 67 | Grab the public DNS name for the master node of the cluster, and visit `http://[MASTER DNS]:8000`. You 68 | should see the JupyterHub login page. The username and password are both `hadoop`. 69 | 70 | _Note_: Don't forget to open up port `8000` in the security group of the master node, or else you won't 71 | be able to access the JupyterHub endpoint. 
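One way to open that port is with the AWS CLI; the sketch below is only an example (the security group ID and CIDR are placeholders -- use the ID of your master node's security group and an address range you trust):

```bash
# Allow inbound TCP traffic on port 8000 (JupyterHub) from a trusted address range
$ aws ec2 authorize-security-group-ingress \
    --group-id sg-0123456789abcdef0 \
    --protocol tcp \
    --port 8000 \
    --cidr 203.0.113.0/24
```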
72 | -------------------------------------------------------------------------------- /emr-docker/bootstrap-geopyspark-docker.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | IMAGE=quay.io/geodocker/jupyter-geopyspark:${TAG:-"latest"} 4 | 5 | # Parses a configuration file put in place by EMR to determine the role of this node 6 | is_master() { 7 | if [ $(jq '.isMaster' /mnt/var/lib/info/instance.json) = 'true' ]; then 8 | return 0 9 | else 10 | return 1 11 | fi 12 | } 13 | 14 | for i in "$@" 15 | do 16 | case $i in 17 | --continue) 18 | CONTINUE=true 19 | shift 20 | ;; 21 | -i=*|--image=*) 22 | IMAGE="${i#*=}" 23 | shift 24 | ;; 25 | -e=*|--env=*) 26 | ENV_VARS+=("-e ${i#*=}") 27 | shift 28 | ;; 29 | *) 30 | ;; 31 | esac 32 | done 33 | 34 | ### MAIN #### 35 | 36 | # EMR bootstrap runs before HDFS or YARN are initilized 37 | if [ ! $CONTINUE ]; then 38 | sudo yum -y install docker 39 | sudo usermod -aG docker hadoop 40 | sudo service docker start 41 | 42 | THIS_SCRIPT="$(realpath "${BASH_SOURCE[0]}")" 43 | TIMEOUT= is_master && TIMEOUT=3 || TIMEOUT=4 44 | echo "bash -x $THIS_SCRIPT --continue $ARGS > /tmp/bootstrap-geopyspark-docker.log" | at now + $TIMEOUT min 45 | exit 0 # Bail and let EMR finish initializing 46 | fi 47 | 48 | YARN_RM=$(xmllint --xpath "//property[name='yarn.resourcemanager.hostname']/value/text()" /etc/hadoop/conf/yarn-site.xml) 49 | 50 | DOCKER_ENV="-e USER=hadoop \ 51 | -e ZOOKEEPERS=$YARN_RM \ 52 | ${ENV_VARS[@]} \ 53 | -v /etc/hadoop/conf:/etc/hadoop/conf \ 54 | -v /usr/lib/hadoop-hdfs/bin:/usr/lib/hadoop-hdfs/bin" 55 | 56 | DOCKER_OPT="-d --net=host --restart=always --memory-swappiness=0" 57 | 58 | if is_master ; then 59 | sudo docker run $DOCKER_OPT --name=geopyspark $DOCKER_ENV $IMAGE 60 | fi 61 | -------------------------------------------------------------------------------- /emr-docker/config-aws.mk.template: -------------------------------------------------------------------------------- 1 | export EC2_KEY:=[KEY NAME] 2 | export AWS_DEFAULT_REGION:=us-east-1 3 | export S3_URI:=[S3 PATH FOR UPLOADING CODE] 4 | export SUBNET_ID:=[SUBNET ID] 5 | -------------------------------------------------------------------------------- /emr-docker/config-emr.mk: -------------------------------------------------------------------------------- 1 | export NAME := GeoPySpark ${USER} 2 | export MASTER_INSTANCE:=m3.xlarge 3 | export MASTER_PRICE := 0.5 4 | export WORKER_INSTANCE:=m3.xlarge 5 | export WORKER_PRICE := 0.5 6 | export WORKER_COUNT := 2 7 | export USE_SPOT := true 8 | -------------------------------------------------------------------------------- /emr-docker/configurations.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "Classification": "spark", 4 | "Properties": { 5 | "maximizeResourceAllocation": "false", 6 | "spark.dynamicAllocation.enabled": "true" 7 | } 8 | }, 9 | { 10 | "Classification": "spark-log4j", 11 | "Properties": { 12 | "log4j.logger.geotrellis.spark.tiling": "DEBUG" 13 | } 14 | }, 15 | 16 | { 17 | "Classification": "hdfs-site", 18 | "Properties": { 19 | "dfs.replication": "1", 20 | "dfs.permissions": "false", 21 | "dfs.datanode.max.xcievers": "16384", 22 | "dfs.datanode.max.transfer.threads": "16384", 23 | "dfs.datanode.balance.max.concurrent.moves": "1000", 24 | "dfs.datanode.balance.bandwidthPerSec": "100000000" 25 | } 26 | }, 27 | { 28 | "Classification": "yarn-site", 29 | "Properties": { 30 | 
"yarn.resourcemanager.am.max-attempts": "1" 31 | } 32 | }, 33 | { 34 | "Classification": "hadoop-env", 35 | "Configurations": [ 36 | { 37 | "Classification": "export", 38 | "Properties": { 39 | "JAVA_HOME": "/usr/lib/jvm/java-1.8.0", 40 | "GDAL_DATA": "/usr/local/share/gdal", 41 | "LD_LIBRARY_PATH": "/usr/local/lib", 42 | "PYSPARK_PYTHON": "python27", 43 | "PYSPARK_DRIVER_PYTHON": "python27" 44 | } 45 | } 46 | ] 47 | }, 48 | { 49 | "Classification": "spark-env", 50 | "Configurations": [ 51 | { 52 | "Classification": "export", 53 | "Properties": { 54 | "JAVA_HOME": "/usr/lib/jvm/java-1.8.0", 55 | "GDAL_DATA": "/usr/local/share/gdal", 56 | "LD_LIBRARY_PATH": "/usr/local/lib", 57 | "PYSPARK_PYTHON": "python27", 58 | "PYSPARK_DRIVER_PYTHON": "python27" 59 | } 60 | } 61 | ] 62 | }, 63 | { 64 | "Classification": "yarn-env", 65 | "Configurations": [ 66 | { 67 | "Classification": "export", 68 | "Properties": { 69 | "JAVA_HOME": "/usr/lib/jvm/java-1.8.0", 70 | "GDAL_DATA": "/usr/local/share/gdal", 71 | "LD_LIBRARY_PATH": "/usr/local/lib", 72 | "PYSPARK_PYTHON": "python27", 73 | "PYSPARK_DRIVER_PYTHON": "python27" 74 | } 75 | } 76 | ] 77 | } 78 | ] 79 | -------------------------------------------------------------------------------- /kernels/local/kernel.json: -------------------------------------------------------------------------------- 1 | { 2 | "argv": [ 3 | "/usr/bin/python3.4", 4 | "-m", 5 | "ipykernel", 6 | "-f", 7 | "{connection_file}" 8 | ], 9 | "env": { 10 | "LD_LIBRARY_PATH": "/usr/local/lib", 11 | "PYSPARK_PYTHON": "/usr/bin/python3.4", 12 | "SPARK_HOME": "/usr/local/spark-2.1.0-bin-hadoop2.7", 13 | "GEOPYSPARK_JARS_PATH": "/opt/jars", 14 | "PYTHONPATH": "/usr/local/spark-2.1.0-bin-hadoop2.7/python/lib/pyspark.zip:/usr/local/spark-2.1.0-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip", 15 | "PYSPARK_SUBMIT_ARGS": "--master local[*] --conf spark.executorEnv.LD_LIBRARY_PATH=/home/hadoop/local/gdal/lib/ pyspark-shell" 16 | }, 17 | "language": "python", 18 | "display_name": "PySpark (local)" 19 | } 20 | -------------------------------------------------------------------------------- /kernels/yarn/kernel.json: -------------------------------------------------------------------------------- 1 | { 2 | "argv": [ 3 | "/usr/bin/python3.4", 4 | "-m", 5 | "ipykernel", 6 | "-f", 7 | "{connection_file}" 8 | ], 9 | "env": { 10 | "PYSPARK_PYTHON": "/usr/bin/python3.4", 11 | "SPARK_HOME": "/usr/local/spark-2.1.0-bin-hadoop2.7", 12 | "PYTHONPATH": "/usr/local/spark-2.1.0-bin-hadoop2.7/python/lib/pyspark.zip:/usr/local/spark-2.1.0-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip", 13 | "YARN_CONF_DIR": "/etc/hadoop/conf", 14 | "PYSPARK_SUBMIT_ARGS": "--archives /blobs/gdal-and-friends.tar.gz,/blobs/friends-of-geopyspark.tar.gz,/blobs/geopyspark-sans-friends.tar.gz --conf spark.executorEnv.LD_LIBRARY_PATH=gdal-and-friends.tar.gz/lib --conf spark.executorEnv.PYTHONPATH=friends-of-geopyspark.tar.gz/:geopyspark-sans-friends.tar.gz/ pyspark-shell" 15 | }, 16 | "language": "python", 17 | "display_name": "GeoPySpark" 18 | } 19 | -------------------------------------------------------------------------------- /notebooks/Getting the mask.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import json\n", 10 | "import math\n", 11 | "from shapely.geometry import shape, mapping, Polygon" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | 
"execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "!curl 'https://raw.githubusercontent.com/johan/world.geo.json/master/countries/USA/FL.geo.json' -o /tmp/FL.geo.json" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "txt = open(\"/tmp/FL.geo.json\").read()" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "geojson = json.loads(txt)\n" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "geoms = list(map(lambda x: shape(x[\"geometry\"]), geojson[\"features\"]))" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "g1 = geoms[0]" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "full = g1\n", 66 | "for g in geoms[1:]:\n", 67 | " full = full.union(g)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "len(geoms)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "s = set([])\n", 86 | "for g in geoms:\n", 87 | " s.add(g.geom_type)\n", 88 | "s\n" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "lens= []\n", 98 | "for g in geoms:\n", 99 | " g.geom_type" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "def get_size(g):\n", 109 | " b = g.bounds\n", 110 | " w = b[2] - b[0]\n", 111 | " h = b[3] - b[1]\n", 112 | " return math.sqrt(w*w + h*h)\n", 113 | "max_size = 0.0\n", 114 | "max_size_geom = None\n", 115 | "for g in geoms:\n", 116 | " s = get_size(g)\n", 117 | " if max_size < s:\n", 118 | " max_size = s\n", 119 | " max_size_geom = g\n", 120 | "g" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "geoms.sort(key=lambda g: -get_size(g))" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "gj = mapping(geoms[0])" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "open('/tmp/poly.json', 'w').write(json.dumps(gj))" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "bg = geoms[0]" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "p = Polygon(bg.exterior)\n" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "p" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [] 183 | } 184 | ], 185 | "metadata": { 186 | "kernelspec": { 187 | "display_name": "GeoPySpark", 188 | "language": 
"python", 189 | "name": "gps" 190 | }, 191 | "language_info": { 192 | "codemirror_mode": { 193 | "name": "ipython", 194 | "version": 3 195 | }, 196 | "file_extension": ".py", 197 | "mimetype": "text/x-python", 198 | "name": "python", 199 | "nbconvert_exporter": "python", 200 | "pygments_lexer": "ipython3", 201 | "version": "3.4.6" 202 | } 203 | }, 204 | "nbformat": 4, 205 | "nbformat_minor": 2 206 | } 207 | -------------------------------------------------------------------------------- /notebooks/Landsat.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import rasterio\n", 10 | "import rasterio.features\n", 11 | "import rasterio.warp\n", 12 | "import geopyspark as gps\n", 13 | "import numpy as np\n", 14 | "import csv\n", 15 | "import matplotlib.pyplot as plt\n", 16 | "\n", 17 | "from datetime import datetime\n", 18 | "from pyspark import SparkContext\n", 19 | "from osgeo import osr\n", 20 | "\n", 21 | "%matplotlib inline" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "sc = SparkContext(conf=gps.geopyspark_conf(appName=\"Landsat\").set(\"spark.ui.enabled\",True))" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": null, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "csv_data = [{'uri': 's3://landsat-pds/L8/107/035/LC81070352015218LGN00/LC81070352015218LGN00_B2.TIF', 'scene_id': 'LC81070352015218LGN00', 'date': '2015218', 'band': '2'},\n", 40 | " {'uri': 's3://landsat-pds/L8/107/035/LC81070352015218LGN00/LC81070352015218LGN00_B3.TIF', 'scene_id': 'LC81070352015218LGN00', 'date': '2015218', 'band': '3'},\n", 41 | " {'uri': 's3://landsat-pds/L8/107/035/LC81070352015218LGN00/LC81070352015218LGN00_B4.TIF', 'scene_id': 'LC81070352015218LGN00', 'date': '2015218', 'band': '4'},\n", 42 | " {'uri': 's3://landsat-pds/L8/107/035/LC81070352015218LGN00/LC81070352015218LGN00_B5.TIF', 'scene_id': 'LC81070352015218LGN00', 'date': '2015218', 'band': '5'}]" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "rdd0 = sc.parallelize(csv_data)" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "def get_metadata(line):\n", 61 | " \n", 62 | " try:\n", 63 | " with rasterio.open(line['uri']) as dataset:\n", 64 | " bounds = dataset.bounds\n", 65 | " height = height = dataset.height\n", 66 | " width = dataset.width\n", 67 | " crs = dataset.get_crs()\n", 68 | " srs = osr.SpatialReference()\n", 69 | " srs.ImportFromWkt(crs.wkt)\n", 70 | " proj4 = srs.ExportToProj4()\n", 71 | " ws = [w for (ij, w) in dataset.block_windows()]\n", 72 | " except:\n", 73 | " ws = []\n", 74 | " \n", 75 | " def windows(line, ws):\n", 76 | " for w in ws:\n", 77 | " ((row_start, row_stop), (col_start, col_stop)) = w\n", 78 | "\n", 79 | " left = bounds.left + (bounds.right - bounds.left)*(float(col_start)/width)\n", 80 | " right = bounds.left + (bounds.right - bounds.left)*(float(col_stop)/ width)\n", 81 | " bottom = bounds.top + (bounds.bottom - bounds.top)*(float(row_stop)/height)\n", 82 | " top = bounds.top + (bounds.bottom - bounds.top)*(float(row_start)/height)\n", 83 | " extent = gps.Extent(left,bottom,right,top)\n", 84 | " instant = 
datetime.strptime(line['date'], '%Y%j')\n", 85 | " \n", 86 | " new_line = line.copy()\n", 87 | " new_line.pop('date')\n", 88 | " new_line.pop('scene_id')\n", 89 | " new_line['window'] = w\n", 90 | " new_line['projected_extent'] = gps.TemporalProjectedExtent(extent=extent, instant=instant, proj4=proj4)\n", 91 | " yield new_line\n", 92 | " \n", 93 | " return [i for i in windows(line, ws)]\n" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "rdd1 = rdd0.flatMap(get_metadata)\n", 103 | "rdd1.first()" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "def get_data(line):\n", 113 | " \n", 114 | " new_line = line.copy()\n", 115 | "\n", 116 | " with rasterio.open(line['uri']) as dataset:\n", 117 | " new_line['data'] = dataset.read(1, window=line['window'])\n", 118 | " new_line.pop('window')\n", 119 | " new_line.pop('uri')\n", 120 | " \n", 121 | " return new_line" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "rdd2 = rdd1.map(get_data)\n", 131 | "rdd2.first()" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "rdd3 = rdd2.groupBy(lambda line: line['projected_extent'])\n", 141 | "rdd3.first()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "def make_tiles(line):\n", 151 | " projected_extent = line[0]\n", 152 | " bands = sorted(line[1], key=lambda l: l['band'])\n", 153 | " array = np.array([l['data'] for l in bands])\n", 154 | " tile = gps.Tile.from_numpy_array(array, no_data_value=0)\n", 155 | " return (projected_extent, tile)\n", 156 | "\n", 157 | "def interesting_tile(line):\n", 158 | " [tpe, tile] = line\n", 159 | " return (np.sum(tile[0][0]) != 0)\n", 160 | "\n", 161 | "def square_tile(line):\n", 162 | " [tpe, tile] = line\n", 163 | " return tile[0][0].shape == (512,512)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "rdd4 = rdd3.map(make_tiles).filter(square_tile)\n", 173 | "data = rdd4.filter(interesting_tile).first()\n", 174 | "data" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "plt.imshow(data[1][0][0])" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "raster_layer = gps.RasterLayer.from_numpy_rdd(gps.LayerType.SPACETIME, rdd4)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "tiled_raster_layer = raster_layer.tile_to_layout(layout = gps.GlobalLayout(), target_crs=3857)" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "pyramid = tiled_raster_layer.pyramid()" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": {}, 217 | "outputs": [], 218 | "source": [ 219 | "for layer in pyramid.levels.values():\n", 220 | " gps.write(\"file:///tmp/catalog/\", 
\"landsat\", layer, time_unit=gps.TimeUnit.DAYS)" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "## Display (Optional) ##" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "pyramid = tiled_raster_layer.to_spatial_layer().pyramid()" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "for layer in pyramid.levels.values():\n", 246 | " gps.write(\"file:///tmp/catalog/\", \"landsat-spatial\", layer)" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "from PIL import Image\n", 256 | "\n", 257 | "def render_tile(tile):\n", 258 | " norm = np.uint8(tile[0] / tile[0].max() * 255)\n", 259 | " mask = np.uint8((norm[0] != 0) * 255)\n", 260 | " return Image.fromarray(np.dstack([norm[2], norm[1], norm[0], mask]), mode='RGBA')" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "tms_server = gps.TMS.build((\"file:///tmp/catalog/\", \"landsat-spatial\"), display=render_tile)\n", 270 | "tms_server.bind('0.0.0.0')" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "import folium\n", 280 | "\n", 281 | "m = folium.Map(tiles='Stamen Terrain', location=[35.6, 140.1], zoom_start=5)\n", 282 | "folium.TileLayer(tiles=tms_server.url_pattern, attr='GeoPySpark').add_to(m)\n", 283 | "m" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [] 292 | } 293 | ], 294 | "metadata": { 295 | "kernelspec": { 296 | "display_name": "GeoPySpark", 297 | "language": "python", 298 | "name": "gps" 299 | }, 300 | "language_info": { 301 | "codemirror_mode": { 302 | "name": "ipython", 303 | "version": 3 304 | }, 305 | "file_extension": ".py", 306 | "mimetype": "text/x-python", 307 | "name": "python", 308 | "nbconvert_exporter": "python", 309 | "pygments_lexer": "ipython3", 310 | "version": "3.4.6" 311 | } 312 | }, 313 | "nbformat": 4, 314 | "nbformat_minor": 2 315 | } 316 | -------------------------------------------------------------------------------- /notebooks/NLCD viewer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from functools import partial\n", 10 | "import geopyspark as gps\n", 11 | "import numpy as np\n", 12 | "import fiona\n", 13 | "import json\n", 14 | "import pyproj\n", 15 | "\n", 16 | "from pyspark import SparkContext\n", 17 | "from colortools import Color\n", 18 | "\n", 19 | "from shapely.geometry import mapping, shape\n", 20 | "from shapely.ops import transform\n", 21 | "\n", 22 | "from folium import Map, TileLayer, GeoJson" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "conf = gps.geopyspark_conf(master=\"local[*]\", appName=\"NLCD Viewer\")\n", 32 | "sc = SparkContext(conf=conf)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 
41 | "catalog_uri = \"s3://azavea-datahub/catalog\"\n", 42 | "layer_name = \"nlcd-2011-epsg3857\"" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "## Viewing NLCD" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "nlcd_cmap = gps.ColorMap.nlcd_colormap()\n", 59 | "nlcd_tms_server = gps.TMS.build((catalog_uri, layer_name), display=nlcd_cmap)\n", 60 | "nlcd_tms_server.bind('0.0.0.0')\n", 61 | "nlcd_tms_server.url_pattern" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "m = Map(tiles='Stamen Terrain', location=[37.1, -95.7], zoom_start=4)\n", 71 | "TileLayer(tiles=nlcd_tms_server.url_pattern, attr='GeoPySpark Tiles').add_to(m)\n", 72 | "m" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "## Viewing reclassified tiles\n", 80 | "\n", 81 | "This example shows how to do custom, on-the-fly display from an existing catalog using a callback to a Python rendering function. This method is much slower than using color maps. Please be patient during map display/zooming." 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "import struct\n", 91 | "from PIL import Image\n", 92 | "\n", 93 | "def from_color_get_component(i):\n", 94 | " def fn(x):\n", 95 | " split = struct.Struct(\">I\").pack\n", 96 | " r,g,b,a = split(x & 0xffffffff)\n", 97 | " return np.array([r,g,b,a], dtype='uint8')[i]\n", 98 | " return fn\n", 99 | "\n", 100 | "def render_tile(tile):\n", 101 | " rr = np.vectorize(from_color_get_component(0))(tile)\n", 102 | " gg = np.vectorize(from_color_get_component(1))(tile)\n", 103 | " bb = np.vectorize(from_color_get_component(2))(tile)\n", 104 | " aa = np.vectorize(from_color_get_component(3))(tile)\n", 105 | " return Image.fromarray(np.dstack([rr, gg, bb, aa]), mode='RGBA')" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "from PIL import Image\n", 115 | "import struct\n", 116 | "\n", 117 | "def render_cultivated(tile):\n", 118 | " # NLCD codes in the 80's are Planted/Cultivated\n", 119 | " # See https://www.mrlc.gov/nlcd11_leg.php\n", 120 | " colorize = np.vectorize(lambda x: 0x7110b2aa if ((80 <= x) & (x < 90)) else 0x00000000)\n", 121 | " return render_tile(colorize(tile[0][0]))" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "custom_nlcd_tms_server = gps.TMS.build((catalog_uri, layer_name), display=render_cultivated)\n", 131 | "custom_nlcd_tms_server.bind('0.0.0.0')\n", 132 | "custom_nlcd_tms_server.url_pattern" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "m = Map(tiles='Stamen Terrain', location=[37.1, -95.7], zoom_start=4)\n", 142 | "TileLayer(tiles=custom_nlcd_tms_server.url_pattern, attr='GeoPySpark Tiles').add_to(m)\n", 143 | "m" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "## Chattanooga geometry" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "!curl -o 
/tmp/mask.json https://s3.amazonaws.com/chattademo/chatta_mask.json" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "txt = open('/tmp/mask.json').read()\n", 169 | "js = json.loads(txt)\n", 170 | "geom = shape(js)\n", 171 | "center = geom.centroid\n", 172 | "chatta_center = [center.y, center.x] # Location in lat/long" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "GeoJson('/tmp/mask.json', name='Chattanooga').add_to(m)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "m.location = chatta_center\n", 191 | "m.zoom_start = 8\n", 192 | "m" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "## Fetching an RDD of NLCD masked to Chattanooga" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "project = partial(\n", 209 | " pyproj.transform,\n", 210 | " pyproj.Proj(init='epsg:4326'),\n", 211 | " pyproj.Proj(init='epsg:3857'))\n", 212 | "\n", 213 | "chatta_poly = transform(project, geom)" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "query_rdd = gps.query(catalog_uri,\n", 223 | " layer_name,\n", 224 | " 12,\n", 225 | " query_geom=chatta_poly)" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "masked = query_rdd.mask([chatta_poly])\n", 235 | "masked_tms_server = gps.TMS.build(masked.pyramid(), display=nlcd_cmap)\n", 236 | "masked_tms_server.bind('0.0.0.0')" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "chatta_map = Map(tiles='Stamen Terrain', location=chatta_center, zoom_start=8)\n", 246 | "TileLayer(tiles=masked_tms_server.url_pattern, attr='GeoPySpark Tiles').add_to(chatta_map)\n", 247 | "GeoJson('/tmp/mask.json', name='Chattanooga').add_to(chatta_map)\n", 248 | "chatta_map" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "metadata": {}, 254 | "source": [ 255 | "## Reclassifying an RDD" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "reclassified = masked.reclassify({0: 1, 80: 2, 90: 1},\n", 265 | " int,\n", 266 | " gps.ClassificationStrategy.GREATER_THAN_OR_EQUAL_TO).repartition(150)" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "colors = gps.get_colors_from_colors(\n", 276 | " [Color(\"#CA9146FF\"), Color(\"#00FFAA88\")])\n", 277 | "\n", 278 | "breaks = {\n", 279 | " 1: colors[0],\n", 280 | " 2: colors[1]\n", 281 | "}\n", 282 | "\n", 283 | "reclassified_cmap = gps.ColorMap.build(breaks)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "reclassified_tms_server = gps.TMS.build(reclassified.pyramid(), display=reclassified_cmap)\n", 293 | "reclassified_tms_server.bind('0.0.0.0')" 294 | ] 295 | }, 296 | { 297 | "cell_type": 
"code", 298 | "execution_count": null, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "reclass_map = Map(tiles='Stamen Terrain', location=chatta_center, zoom_start=8)\n", 303 | "TileLayer(tiles=reclassified_tms_server.url_pattern, attr='GeoPySpark Tiles').add_to(reclass_map)\n", 304 | "GeoJson('/tmp/mask.json', name='Chattanooga').add_to(reclass_map)\n", 305 | "reclass_map" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "## Saving the reclassified layer locally" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "local_catalog_uri = \"file:///tmp/catalog\"\n", 322 | "local_layer_name = \"cultivated-land-cover\"" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "for layer in reclassified.pyramid().levels.values():\n", 332 | " gps.write(local_catalog_uri, local_layer_name, layer)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "## Viewing the local Layer" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "nlcd_tms_server.unbind()\n", 349 | "custom_nlcd_tms_server.unbind()\n", 350 | "masked_tms_server.unbind()\n", 351 | "reclassified_tms_server.unbind()" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "local_tms_server = gps.TMS.build((local_catalog_uri, local_layer_name), reclassified_cmap)\n", 361 | "local_tms_server.bind('0.0.0.0')" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "metadata": { 368 | "scrolled": false 369 | }, 370 | "outputs": [], 371 | "source": [ 372 | "local_map = Map(tiles='Stamen Terrain', location=chatta_center, zoom_start=8)\n", 373 | "TileLayer(tiles=local_tms_server.url_pattern, attr='GeoPySpark Tiles').add_to(local_map)\n", 374 | "GeoJson('/tmp/mask.json', name='Chattanooga').add_to(local_map)\n", 375 | "local_map" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "metadata": {}, 382 | "outputs": [], 383 | "source": [ 384 | "layers = [gps.query(local_catalog_uri, local_layer_name, x) for x in range(0, 11)]" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": null, 390 | "metadata": {}, 391 | "outputs": [], 392 | "source": [ 393 | "read_in_pyramid = gps.Pyramid(layers)" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "# This cannot display as well\n", 403 | "server = gps.TMS.build(read_in_pyramid, reclassified_cmap)\n", 404 | "server.bind('0.0.0.0')" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "metadata": {}, 411 | "outputs": [], 412 | "source": [ 413 | "rdd_map = Map(tiles='Stamen Terrain', location=chatta_center, zoom_start=8)\n", 414 | "TileLayer(tiles=server.url_pattern, attr='GeoPySpark Tiles').add_to(rdd_map)\n", 415 | "GeoJson('/tmp/mask.json', name='Chattanooga').add_to(rdd_map)\n", 416 | "rdd_map" 417 | ] 418 | } 419 | ], 420 | "metadata": { 421 | "kernelspec": { 422 | "display_name": "GeoPySpark", 423 | "language": "python", 424 | "name": "gps" 425 | }, 426 | "language_info": { 427 | 
"codemirror_mode": { 428 | "name": "ipython", 429 | "version": 3 430 | }, 431 | "file_extension": ".py", 432 | "mimetype": "text/x-python", 433 | "name": "python", 434 | "nbconvert_exporter": "python", 435 | "pygments_lexer": "ipython3", 436 | "version": "3.4.6" 437 | } 438 | }, 439 | "nbformat": 4, 440 | "nbformat_minor": 2 441 | } 442 | -------------------------------------------------------------------------------- /notebooks/Park citing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Finding the Optimal Location for a New Park\n", 8 | "\n", 9 | "This example notebook will show how to find the next potential location for a new park in San Fransisco. To accomplish this, three factors will be taken into consideration when deciding on a possible spot: existing parks, schools, and Bay Area Regional Transit (BART) stops. By calculating Euclidean Distance for these three factors and then weighing them together, we will be able to produce a visual representation of where is and is not a good location for a new park.\n", 10 | "\n", 11 | "## Importing the Libraries" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import geopyspark as gps\n", 21 | "import fiona\n", 22 | "\n", 23 | "from pyspark import SparkContext, StorageLevel\n", 24 | "from shapely.geometry import MultiPoint, MultiPolygon, shape\n", 25 | "\n", 26 | "import folium" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "## Setup the SparkContext" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "conf = gps.geopyspark_conf(appName=\"park-siting\", master=\"local[*]\")\n", 43 | "sc = SparkContext.getOrCreate(conf=conf)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## Set map display parameters" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "center = [37.8, -122.2]\n", 60 | "zoom_start = 9.5" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | "source": [ 67 | "## Download the Geometries as GeoJsons" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "!curl -o /tmp/bart.geojson https://s3.amazonaws.com/geopyspark-demo/bayarea/bart.geojson\n", 77 | "!curl -o /tmp/school.geojson https://s3.amazonaws.com/geopyspark-demo/bayarea/school.geojson\n", 78 | "!curl -o /tmp/parks.geojson https://s3.amazonaws.com/geopyspark-demo/bayarea/parks.geojson" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "## Read in the GeoJsons as Shapely Geometries" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "with fiona.open(\"/tmp/bart.geojson\") as source:\n", 95 | " bart_crs = source.crs['init']\n", 96 | " bart = MultiPoint([shape(f['geometry']) for f in source])\n", 97 | "\n", 98 | "with fiona.open(\"/tmp/school.geojson\") as source:\n", 99 | " schools_crs = source.crs['init']\n", 100 | " schools = MultiPoint([shape(f['geometry']) for f in source])\n", 101 | "\n", 102 | "with 
fiona.open(\"/tmp/parks.geojson\") as source:\n", 103 | " parks_crs = source.crs['init']\n", 104 | " parks = MultiPolygon([shape(f['geometry']) for f in source])" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "## Calculate Euclidean Distance for Each Geometry\n", 112 | "\n", 113 | "Three new `TiledRasterLayer`s will be produced from the Euclidean Distance calculations for each geometry. All resulting layers will have a `zoom_level` of 12." 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "bart_layer = gps.euclidean_distance(geometry=bart,\n", 123 | " source_crs=bart_crs,\n", 124 | " zoom=12)\n", 125 | "\n", 126 | "schools_layer = gps.euclidean_distance(geometry=schools,\n", 127 | " source_crs=schools_crs,\n", 128 | " zoom=12)\n", 129 | "\n", 130 | "parks_layer = gps.euclidean_distance(geometry=parks,\n", 131 | " source_crs=parks_crs,\n", 132 | " zoom=12)\n", 133 | "\n", 134 | "# Persists each layer to memory and disk\n", 135 | "bart_layer.persist(StorageLevel.MEMORY_AND_DISK)\n", 136 | "schools_layer.persist(StorageLevel.MEMORY_AND_DISK)\n", 137 | "parks_layer.persist(StorageLevel.MEMORY_AND_DISK)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "## Weighing the Layers Together" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "weighted_layer = -1 * bart_layer - schools_layer + 3 * parks_layer\n", 154 | "\n", 155 | "# Persists the weighted layer to memory and disk\n", 156 | "weighted_layer.persist(StorageLevel.MEMORY_AND_DISK)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "## Reprojecting, Pyramiding, and Calculating the Histogram" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "# The following code may take awhile to complete\n", 173 | "reprojected = weighted_layer.tile_to_layout(layout=gps.GlobalLayout(),\n", 174 | " target_crs=\"EPSG:3857\")\n", 175 | "pyramid = reprojected.pyramid(resample_method=gps.ResampleMethod.AVERAGE)\n", 176 | "histogram = pyramid.get_histogram()" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "## Creating the ColorMap\n", 184 | "\n", 185 | "The below code creates a `ColorMap` instance using the `Histogram` from `pyramid` for its `breaks`. For the color, the `matplotlib` color palette, `viridus` will be used." 
186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "color_map = gps.ColorMap.build(breaks=histogram,\n", 195 | " colors='viridis')" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "## Running the Server" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "tms = gps.TMS.build(source=pyramid,\n", 212 | " display=color_map)\n", 213 | "\n", 214 | "tms.bind('0.0.0.0')" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": {}, 221 | "outputs": [], 222 | "source": [ 223 | "m = folium.Map(tiles='OpenStreetMap', location=center, zoom_start=zoom_start)\n", 224 | "folium.TileLayer(tiles=tms.url_pattern, overlay=True, attr='GeoPySpark tiles').add_to(m)\n", 225 | "folium.GeoJson(data='/tmp/bart.geojson', name='BART stops').add_to(m)\n", 226 | "folium.GeoJson(data='/tmp/parks.geojson', name='Parks').add_to(m)\n", 227 | "folium.LayerControl().add_to(m)\n", 228 | "m" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "## Cleaning up" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "tms.unbind()" 245 | ] 246 | } 247 | ], 248 | "metadata": { 249 | "kernelspec": { 250 | "display_name": "GeoPySpark", 251 | "language": "python", 252 | "name": "gps" 253 | }, 254 | "language_info": { 255 | "codemirror_mode": { 256 | "name": "ipython", 257 | "version": 3 258 | }, 259 | "file_extension": ".py", 260 | "mimetype": "text/x-python", 261 | "name": "python", 262 | "nbconvert_exporter": "python", 263 | "pygments_lexer": "ipython3", 264 | "version": "3.4.6" 265 | } 266 | }, 267 | "nbformat": 4, 268 | "nbformat_minor": 2 269 | } 270 | -------------------------------------------------------------------------------- /notebooks/Pine Habitat.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# This tutorial will show you how to find the suitable habitat range for Bristlecone pine using GeoPySpark\n", 8 | "\n", 9 | "This tutorial will focus on GeoPySpark functionality, but you can find more resources and tutorials about GeoNotebooks [here](https://github.com/OpenGeoscience/geonotebook/tree/master/notebooks).\n", 10 | "\n", 11 | "### Suitability analysis is a classic GIS case study that enables the combination of factors to return a desired result \n", 12 | "This tutorial sets the premise that you are interested in two factors for locating Bristlecone pines:\n", 13 | "- Located between 3,000 and 4,000 meters\n", 14 | "- Located on a south facing slope\n", 15 | " " 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "import geopyspark as gps\n", 25 | "from pyspark import SparkContext" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "You will need to set up a spark context. 
To learn more about what that means take a look [here](https://spark.apache.org/docs/latest/programming-guide.html#initializing-spark)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "conf=gps.geopyspark_conf(appName=\"BristleConePine\")\n", 42 | "conf.set('spark.ui.enabled', True)\n", 43 | "sc = SparkContext(conf = conf)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "Retrieving an elevation .tif from AWS S3:" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "elev_rdd = gps.geotiff.get(\n", 60 | " layer_type='spatial', \n", 61 | " uri='s3://geopyspark-demo/elevation/ca-elevation.tif')" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "metadata": {}, 67 | "source": [ 68 | "## Tile, reproject, pyramid:" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "elev_tiled_rdd = elev_rdd.tile_to_layout(\n", 78 | " layout=gps.GlobalLayout(), \n", 79 | " target_crs=3857)\n", 80 | "elev_pyramided_rdd = elev_tiled_rdd.pyramid().cache()" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "Imports for creating a TMS server capable of serving layers with custom colormaps" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "from geopyspark.geotrellis.color import get_colors_from_matplotlib\n", 97 | "elev_histo = elev_pyramided_rdd.get_histogram()\n", 98 | "elev_colors = get_colors_from_matplotlib('viridis', 100)\n", 99 | "elev_color_map = gps.ColorMap.from_histogram(elev_histo, elev_colors)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "elev_tms = gps.TMS.build(elev_pyramided_rdd, elev_color_map)\n", 109 | "elev_tms.bind('0.0.0.0')" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "Display the tiles in an embedded [Folium](https://python-visualization.github.io/folium/) map:" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "import folium\n", 126 | "\n", 127 | "map_center = [37.75, -118.85]\n", 128 | "zoom = 7\n", 129 | "\n", 130 | "m = folium.Map(location=map_center, zoom_start=zoom)\n", 131 | "folium.TileLayer(tiles=\"Stamen Terrain\", overlay=False).add_to(m)\n", 132 | "folium.TileLayer(tiles=elev_tms.url_pattern, attr=\"GeoPySpark\", overlay=True).add_to(m)\n", 133 | "m" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "Classify the elevation such that values of interest (between 3,000 and 4,000 meters) return a value of 1." 
141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "# use: elev_reprojected_rdd\n", 150 | "elev_reclass_pre = elev_tiled_rdd.reclassify({1000:2, 2000:2, 3000:2, 4000:1, 5000:2}, int)\n", 151 | "elev_reclass_rdd = elev_reclass_pre.reclassify({1:1}, int)\n", 152 | "elev_reclass_pyramid_rdd = elev_reclass_rdd.pyramid()" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "elev_reclass_histo = elev_reclass_pyramid_rdd.get_histogram()" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "#elev_reclass_color_map = ColorMap.from_histogram(sc, elev_reclass_histo, get_breaks(sc, 'Viridis', num_colors=100))\n", 171 | "elev_reclass_color_map = gps.ColorMap.from_colors(\n", 172 | " breaks =[1], \n", 173 | " color_list = [0xff000080])" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "elev_reclass_tms = gps.TMS.build(elev_reclass_pyramid_rdd, elev_reclass_color_map)\n", 183 | "elev_reclass_tms.bind('0.0.0.0')" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "m2 = folium.Map(location=map_center, zoom_start=zoom)\n", 193 | "folium.TileLayer(tiles=\"Stamen Terrain\", overlay=False).add_to(m2)\n", 194 | "folium.TileLayer(tiles=elev_tms.url_pattern, attr='GeoPySpark', name=\"Elevation\", overlay=True).add_to(m2)\n", 195 | "folium.TileLayer(tiles=elev_reclass_tms.url_pattern, attr='GeoPySpark', name=\"High Elevation Areas\", overlay=True).add_to(m2)\n", 196 | "folium.LayerControl().add_to(m2)\n", 197 | "m2" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "Focal operation: aspect. 
To find south facing slopes" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "# square_neighborhood = Square(extent=1)\n", 214 | "aspect_rdd = elev_tiled_rdd.focal(\n", 215 | " gps.Operation.ASPECT, \n", 216 | " gps.Neighborhood.SQUARE, 1)" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "aspect_pyramid_rdd = aspect_rdd.pyramid()\n", 226 | "aspect_histo = aspect_pyramid_rdd.get_histogram()\n", 227 | "aspect_color_map = gps.ColorMap.from_histogram(aspect_histo, get_colors_from_matplotlib('viridis', num_colors=256))" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "aspect_tms = gps.TMS.build(aspect_pyramid_rdd, aspect_color_map)\n", 237 | "aspect_tms.bind('0.0.0.0')" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "m3 = folium.Map(tiles='Stamen Terrain', location=map_center, zoom_start=zoom)\n", 247 | "folium.TileLayer(tiles=aspect_tms.url_pattern, attr='GeoPySpark', name=\"High Elevation Areas\", overlay=True).add_to(m3)\n", 248 | "m3" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "aspect_tms.unbind()" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": {}, 263 | "source": [ 264 | "Reclassify values such that values between 120 and 240 degrees (south) have a value of 1" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "aspect_reclass_pre = aspect_rdd.reclassify({120:2, 240:1, 360: 2}, int)\n", 274 | "aspect_reclass = aspect_reclass_pre.reclassify({1:1}, int)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "aspect_reclass_pyramid_rdd = aspect_reclass.pyramid()\n", 284 | "aspect_reclass_histo = aspect_reclass_pyramid_rdd.get_histogram()\n", 285 | "aspect_reclass_color_map = gps.ColorMap.from_histogram(aspect_reclass_histo, get_colors_from_matplotlib('viridis', num_colors=256))" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "aspect_reclass_tms = gps.TMS.build(aspect_reclass_pyramid_rdd, aspect_reclass_color_map)\n", 295 | "aspect_reclass_tms.bind('0.0.0.0')" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "m4 = folium.Map(tiles='Stamen Terrain', location=map_center, zoom_start=zoom)\n", 305 | "folium.TileLayer(tiles=aspect_reclass_tms.url_pattern, attr='GeoPySpark', name=\"High Elevation Areas\", overlay=True).add_to(m4)\n", 306 | "m4" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [ 315 | "aspect_reclass_tms.unbind()" 316 | ] 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "metadata": {}, 321 | "source": [ 322 | "Now add the values togehter to find the suitable range:" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": 
{}, 329 | "outputs": [], 330 | "source": [ 331 | "added = elev_reclass_pyramid_rdd + aspect_reclass_pyramid_rdd\n", 332 | "added_histo = added.get_histogram()\n", 333 | "added_color_map = gps.ColorMap.from_histogram(added_histo, get_colors_from_matplotlib('viridis', num_colors=256))" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "added_tms = gps.TMS.build(added, added_color_map)\n", 343 | "added_tms.bind('0.0.0.0')" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": {}, 350 | "outputs": [], 351 | "source": [ 352 | "m5 = folium.Map(tiles='Stamen Terrain', location=map_center, zoom_start=zoom)\n", 353 | "folium.TileLayer(tiles=added_tms.url_pattern, attr='GeoPySpark', name=\"High Elevation Areas\", overlay=True).add_to(m5)\n", 354 | "m5" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "import matplotlib.pyplot as plt\n", 364 | "%matplotlib inline" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": null, 370 | "metadata": { 371 | "scrolled": true 372 | }, 373 | "outputs": [], 374 | "source": [ 375 | "v = elev_tiled_rdd.lookup(342,787)\n", 376 | "plt.imshow(v[0].cells[0])" 377 | ] 378 | } 379 | ], 380 | "metadata": { 381 | "kernelspec": { 382 | "display_name": "GeoPySpark", 383 | "language": "python", 384 | "name": "gps" 385 | }, 386 | "language_info": { 387 | "codemirror_mode": { 388 | "name": "ipython", 389 | "version": 3 390 | }, 391 | "file_extension": ".py", 392 | "mimetype": "text/x-python", 393 | "name": "python", 394 | "nbconvert_exporter": "python", 395 | "pygments_lexer": "ipython3", 396 | "version": "3.4.6" 397 | } 398 | }, 399 | "nbformat": 4, 400 | "nbformat_minor": 2 401 | } 402 | -------------------------------------------------------------------------------- /notebooks/SRTM-emr.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import rasterio\n", 12 | "import rasterio.features\n", 13 | "import rasterio.warp\n", 14 | "import geopyspark as gps\n", 15 | "import numpy as np\n", 16 | "import matplotlib.pyplot as plt\n", 17 | "\n", 18 | "from pyspark import SparkContext\n", 19 | "from osgeo import osr\n", 20 | "\n", 21 | "import os\n", 22 | "import math\n", 23 | "import boto3\n", 24 | "\n", 25 | "%matplotlib inline" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "conf = gps.geopyspark_conf(\"yarn-client\", \"SRTM Ingest\") \\\n", 37 | " .set(\"spark.dynamicAllocation.enabled\", False) \\\n", 38 | " .set(\"spark.executor.instances\", \"50\") \\\n", 39 | " .set(\"spark.executor.memory\", \"9472M\") \\\n", 40 | " .set(\"spark.executor.cores\", \"4\") \\\n", 41 | " .set(\"spark.ui.enabled\", True) \\\n", 42 | " .set(\"spark.hadoop.yarn.timeline-service.enabled\", False)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": { 49 | "collapsed": true 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "sc = SparkContext(conf=conf)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": { 60 | "collapsed": true 
61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "s3 = boto3.client('s3')\n", 65 | "def get_raster_s3_objects(bucket, prefix, extension=\"hgt\"):\n", 66 | " paginator = s3.get_paginator('list_objects_v2')\n", 67 | " page_iterator = paginator.paginate(Bucket=bucket, Prefix=prefix)\n", 68 | " results = []\n", 69 | " for page in page_iterator:\n", 70 | " for item in page['Contents']:\n", 71 | " if item['Key'].endswith(extension):\n", 72 | " results.append(item)\n", 73 | " return results\n" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": { 80 | "collapsed": true 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "object_names = get_raster_s3_objects(\"mrgeo-source\", \"srtm-v3-30\")" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "file_names = list(map(lambda d: d['Key'][len('srtm-v3-30/'):], object_names))\n", 94 | "print(len(file_names))\n", 95 | "print(file_names[0:10])" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": { 102 | "collapsed": true 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "def get_metadata(uri):\n", 107 | " import rasterio\n", 108 | " from osgeo import osr\n", 109 | " import os\n", 110 | " \n", 111 | " if \"GDAL_DATA\" not in os.environ:\n", 112 | " os.environ[\"GDAL_DATA\"]=\"/usr/local/lib64/python3.4/site-packages/fiona/gdal_data\"\n", 113 | " \n", 114 | " try:\n", 115 | " with rasterio.open(uri) as dataset:\n", 116 | " bounds = dataset.bounds\n", 117 | " height = dataset.height\n", 118 | " width = dataset.width\n", 119 | " crs = dataset.get_crs()\n", 120 | " srs = osr.SpatialReference()\n", 121 | " srs.ImportFromWkt(crs.wkt)\n", 122 | " proj4 = srs.ExportToProj4()\n", 123 | " tile_cols = (int)(math.ceil(width/512)) * 512\n", 124 | " tile_rows = (int)(math.ceil(height/512)) * 512\n", 125 | " ws = [((x, min(width-1,x + 512)), (y, min(height-1,y + 512))) for x in range(0, tile_cols, 512) for y in range(0, tile_rows, 512)]\n", 126 | " except:\n", 127 | " ws = []\n", 128 | " \n", 129 | " def windows(uri, ws):\n", 130 | " for w in ws:\n", 131 | " ((row_start, row_stop), (col_start, col_stop)) = w\n", 132 | "\n", 133 | " left = bounds.left + (bounds.right - bounds.left)*(float(col_start)/width)\n", 134 | " right = bounds.left + (bounds.right - bounds.left)*(float(col_stop)/ width)\n", 135 | " bottom = bounds.top + (bounds.bottom - bounds.top)*(float(row_stop)/height)\n", 136 | " top = bounds.top + (bounds.bottom - bounds.top)*(float(row_start)/height)\n", 137 | " extent = gps.Extent(left,bottom,right,top)\n", 138 | " \n", 139 | " new_line = {}\n", 140 | " new_line['uri'] = uri\n", 141 | " new_line['window'] = w\n", 142 | " new_line['projected_extent'] = gps.ProjectedExtent(extent=extent, proj4=proj4)\n", 143 | " yield new_line\n", 144 | " \n", 145 | " return [i for i in windows(uri, ws)]\n" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": { 152 | "collapsed": true 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "def get_data(line):\n", 157 | " import rasterio\n", 158 | " \n", 159 | " new_line = line.copy()\n", 160 | "\n", 161 | " with rasterio.open(line['uri']) as dataset:\n", 162 | " new_line['data'] = dataset.read(1, window=line['window'])\n", 163 | " new_line.pop('window')\n", 164 | " new_line.pop('uri')\n", 165 | " \n", 166 | " return new_line" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 
null, 172 | "metadata": { 173 | "collapsed": true 174 | }, 175 | "outputs": [], 176 | "source": [ 177 | "def filename_to_data(filename):\n", 178 | " import os\n", 179 | " \n", 180 | " full_filename = \"/vsicurl/https://s3.amazonaws.com/mrgeo-source/srtm-v3-30/{}\".format(filename)\n", 181 | " data = [get_data(line) for line in get_metadata(full_filename)]\n", 182 | " return data" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "rdd0 = sc.parallelize(file_names)\n", 192 | "rdd1 = rdd0.flatMap(filename_to_data)\n", 193 | "print(rdd1.count())" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": { 200 | "collapsed": true 201 | }, 202 | "outputs": [], 203 | "source": [ 204 | "rdd2 = rdd1.groupBy(lambda line: line['projected_extent']) # XXX" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": { 211 | "collapsed": true 212 | }, 213 | "outputs": [], 214 | "source": [ 215 | "def make_tiles(line):\n", 216 | " projected_extent = line[0]\n", 217 | " array = np.array([l['data'] for l in line[1]])\n", 218 | " tile = gps.Tile.from_numpy_array(array, no_data_value=0)\n", 219 | " return (projected_extent, tile)" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": { 226 | "collapsed": true 227 | }, 228 | "outputs": [], 229 | "source": [ 230 | "rdd3 = rdd2.repartition(50 * 1024).map(make_tiles)" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": { 237 | "collapsed": true 238 | }, 239 | "outputs": [], 240 | "source": [ 241 | "raster_layer = gps.RasterLayer.from_numpy_rdd(gps.LayerType.SPATIAL, rdd3)" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": { 248 | "collapsed": true 249 | }, 250 | "outputs": [], 251 | "source": [ 252 | "tiled_raster_layer = raster_layer.tile_to_layout(layout = gps.GlobalLayout(), target_crs=3857)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": { 259 | "collapsed": true 260 | }, 261 | "outputs": [], 262 | "source": [ 263 | "pyramid = tiled_raster_layer.pyramid()" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "metadata": { 270 | "collapsed": true 271 | }, 272 | "outputs": [], 273 | "source": [ 274 | "for layer in pyramid.levels.values():\n", 275 | " gps.write(\"s3://geotrellis-test/dg-srtm/\", \"srtm-geopyspark\", layer)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "metadata": { 282 | "collapsed": true 283 | }, 284 | "outputs": [], 285 | "source": [] 286 | } 287 | ], 288 | "metadata": { 289 | "kernelspec": { 290 | "display_name": "GeoPySpark", 291 | "language": "python", 292 | "name": "gps" 293 | }, 294 | "language_info": { 295 | "codemirror_mode": { 296 | "name": "ipython", 297 | "version": 3 298 | }, 299 | "file_extension": ".py", 300 | "mimetype": "text/x-python", 301 | "name": "python", 302 | "nbconvert_exporter": "python", 303 | "pygments_lexer": "ipython3", 304 | "version": "3.4.6" 305 | } 306 | }, 307 | "nbformat": 4, 308 | "nbformat_minor": 2 309 | } 310 | -------------------------------------------------------------------------------- /notebooks/SRTM-local.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 
5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import rasterio\n", 10 | "import rasterio.features\n", 11 | "import rasterio.warp\n", 12 | "import geopyspark as gps\n", 13 | "import numpy as np\n", 14 | "import matplotlib.pyplot as plt\n", 15 | "\n", 16 | "from pyspark import SparkContext\n", 17 | "from osgeo import osr\n", 18 | "\n", 19 | "import os\n", 20 | "import math\n", 21 | "import boto3\n", 22 | "\n", 23 | "%matplotlib inline" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "conf = gps.geopyspark_conf(\"local[*]\", \"SRTM Ingest\") \\\n", 33 | " .set(\"spark.dynamicAllocation.enabled\", False) \\\n", 34 | " .set(\"spark.ui.enabled\",True) \\\n", 35 | " .set(\"spark.hadoop.yarn.timeline-service.enabled\", False)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "sc = SparkContext(conf=conf)" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "default_gdal_data_dir='/usr/local/share/gdal'" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "file_names = ['N00E006.hgt', 'N00E009.hgt', 'N00E010.hgt', 'N00E011.hgt', 'N00E012.hgt', 'N00E013.hgt', 'N00E014.hgt', 'N00E015.hgt', 'N00E016.hgt', 'N00E017.hgt']\n", 63 | "# file_names = file_names[0:2]\n", 64 | "print(len(file_names))\n", 65 | "print(file_names[0:10])" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "def get_metadata(uri):\n", 75 | " if \"GDAL_DATA\" not in os.environ:\n", 76 | " os.environ[\"GDAL_DATA\"]=default_gdal_data_dir\n", 77 | " \n", 78 | " try:\n", 79 | " with rasterio.open(uri) as dataset:\n", 80 | " bounds = dataset.bounds\n", 81 | " height = dataset.height\n", 82 | " width = dataset.width\n", 83 | " crs = dataset.get_crs()\n", 84 | " srs = osr.SpatialReference()\n", 85 | " srs.ImportFromWkt(crs.wkt)\n", 86 | " proj4 = srs.ExportToProj4()\n", 87 | " # ws = [w for (ij, w) in dataset.block_windows()]\n", 88 | " tile_cols = (int)(math.ceil(width/512)) * 512\n", 89 | " tile_rows = (int)(math.ceil(height/512)) * 512\n", 90 | " ws = [((x, min(width-1,x + 512)), (y, min(height-1,y + 512))) for x in range(0, tile_cols, 512) for y in range(0, tile_rows, 512)]\n", 91 | " except:\n", 92 | " ws = []\n", 93 | " \n", 94 | " def windows(uri, ws):\n", 95 | " for w in ws:\n", 96 | " ((row_start, row_stop), (col_start, col_stop)) = w\n", 97 | "\n", 98 | " left = bounds.left + (bounds.right - bounds.left)*(float(col_start)/width)\n", 99 | " right = bounds.left + (bounds.right - bounds.left)*(float(col_stop)/ width)\n", 100 | " bottom = bounds.top + (bounds.bottom - bounds.top)*(float(row_stop)/height)\n", 101 | " top = bounds.top + (bounds.bottom - bounds.top)*(float(row_start)/height)\n", 102 | " extent = gps.Extent(left,bottom,right,top)\n", 103 | " \n", 104 | " new_line = {}\n", 105 | " new_line['uri'] = uri\n", 106 | " new_line['window'] = w\n", 107 | " new_line['projected_extent'] = gps.ProjectedExtent(extent=extent, proj4=proj4)\n", 108 | " yield new_line\n", 109 | " \n", 110 | " return [i for i in windows(uri, ws)]\n" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": 
[], 118 | "source": [ 119 | "def get_data(line):\n", 120 | " new_line = line.copy()\n", 121 | "\n", 122 | " with rasterio.open(line['uri']) as dataset:\n", 123 | " new_line['data'] = dataset.read(1, window=line['window'])\n", 124 | " new_line.pop('window')\n", 125 | " new_line.pop('uri')\n", 126 | " \n", 127 | " return new_line" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "def filename_to_data(filename):\n", 137 | " #full_filename = \"/vsicurl/https://s3.amazonaws.com/mrgeo-source/srtm-v3-30/{}\".format(filename)\n", 138 | " full_filename = \"s3://mrgeo-source/srtm-v3-30/{}\".format(filename)\n", 139 | " data = [get_data(line) for line in get_metadata(full_filename)]\n", 140 | " return data" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "rdd0 = sc.parallelize(file_names)\n", 150 | "rdd1 = rdd0.flatMap(filename_to_data)\n", 151 | "print(rdd1.count())" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "rdd2 = rdd1.groupBy(lambda line: line['projected_extent']) # XXX" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "def make_tiles(line):\n", 170 | " projected_extent = line[0]\n", 171 | " array = np.array([l['data'] for l in line[1]])\n", 172 | " tile = gps.Tile.from_numpy_array(array, no_data_value=0)\n", 173 | " return (projected_extent, tile)\n" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "rdd3 = rdd2.map(make_tiles)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "raster_layer = gps.RasterLayer.from_numpy_rdd(gps.LayerType.SPATIAL, rdd3)" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "tiled_raster_layer = raster_layer.tile_to_layout(layout = gps.GlobalLayout(), target_crs=3857)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "pyramid = tiled_raster_layer.pyramid()" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "for layer in pyramid.levels.values():\n", 219 | " gps.write(\"file:///tmp/dg-srtm/\", \"srtm-geopyspark-1\", layer)" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "# pyramid2 = gps.Pyramid([gps.query(\"file:///tmp/dg-srtm\", \"srtm-geopyspark\", layer_zoom=n, num_partitions=1024*16) for n in range(0,13+1)])" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "histogram = pyramid.get_histogram()\n", 238 | "color_map = gps.ColorMap.build(breaks=histogram, colors='viridis')" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": null, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "tms = gps.TMS.build(('file:///tmp/dg-srtm', 
'srtm-geopyspark-1'), display=color_map)\n", 248 | "tms.bind('0.0.0.0')" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": null, 254 | "metadata": {}, 255 | "outputs": [], 256 | "source": [ 257 | "import folium\n", 258 | "\n", 259 | "m = folium.Map(tiles='Stamen Terrain')\n", 260 | "folium.TileLayer(tiles=tms.url_pattern, attr='GeoPySpark').add_to(m)\n", 261 | "m" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "tms.unbind()" 271 | ] 272 | } 273 | ], 274 | "metadata": { 275 | "kernelspec": { 276 | "display_name": "GeoPySpark", 277 | "language": "python", 278 | "name": "gps" 279 | }, 280 | "language_info": { 281 | "codemirror_mode": { 282 | "name": "ipython", 283 | "version": 3 284 | }, 285 | "file_extension": ".py", 286 | "mimetype": "text/x-python", 287 | "name": "python", 288 | "nbconvert_exporter": "python", 289 | "pygments_lexer": "ipython3", 290 | "version": "3.4.6" 291 | } 292 | }, 293 | "nbformat": 4, 294 | "nbformat_minor": 2 295 | } 296 | -------------------------------------------------------------------------------- /notebooks/libya.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Finding the Cost of Traversing Through Libya\n", 8 | "\n", 9 | "In this notebook, we will be calculating and visualizing the cost distance of traveling from one population center to another by road while avoiding conflict zones in Libya." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Import and Setup SparkContext" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import os\n", 26 | "import json\n", 27 | "import requests\n", 28 | "from functools import partial\n", 29 | "import pyproj\n", 30 | "import geopyspark as gps\n", 31 | "\n", 32 | "from pyspark import SparkContext\n", 33 | "from shapely.geometry import shape, MultiPoint, MultiLineString\n", 34 | "from shapely.ops import transform" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "conf = gps.geopyspark_conf(appName=\"Libya Weighted Overlay\", master=\"local[*]\")\n", 44 | "conf.set(\"spark.hadoop.yarn.timeline-service.enabled\", False)\n", 45 | "pysc = SparkContext.getOrCreate(conf)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "## Rasterize Libya Roads to RasterLayer" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": { 59 | "scrolled": true 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "libya_roads_json = requests.get('https://s3.amazonaws.com/geopyspark-demo/libya/roads.geojson').json()\n", 64 | "libya_roads = MultiLineString([shape(geom['geometry']) for geom in libya_roads_json['features']])" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "# All execution time here is sending WKB over py4j socket\n", 74 | "ro = gps.RasterizerOptions(includePartial=True, sampleType='PixelIsArea')\n", 75 | "\n", 76 | "road_raster = gps.rasterize(geoms=list(libya_roads.geoms), \n", 77 | "                            crs=\"EPSG:3857\",\n", 78 | "                            zoom=8, \n", 79 | "                            fill_value=1,\n", 80 | "                            
cell_type=gps.CellType.FLOAT32,\n", 81 | " options=ro)\n", 82 | "\n", 83 | "road_raster.layer_metadata.bounds" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "## Show Rasterized Roads on a Map" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "# Pyramid up from base layer\n", 100 | "road_pp = road_raster.pyramid(resample_method=gps.ResampleMethod.MAX).cache()" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "scrolled": true 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "# color map roads 1 to red\n", 112 | "roads_cm = gps.ColorMap.from_colors(breaks=[1], color_list=[0xff000080])\n", 113 | "\n", 114 | "# start JVM tile server and serve tiles to map\n", 115 | "server = gps.TMS.build(source=road_pp, display=roads_cm)\n", 116 | "server.bind(\"0.0.0.0\")\n", 117 | "server.url_pattern" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "from folium import Map, TileLayer\n", 127 | "\n", 128 | "m = Map(tiles='Stamen Toner', location=[27.7351, 17.2283], zoom_start=5)\n", 129 | "TileLayer(tiles=server.url_pattern, attr='GeoPySpark Tiles').add_to(m)\n", 130 | "m" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "## Cost Distance Based on Road Network" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "# road network will shape our friction layer\n", 147 | "road_friction = road_raster.reclassify(value_map={1:1},\n", 148 | " data_type=int,\n", 149 | " replace_nodata_with=10)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "# starting points for cost distance operation\n", 159 | "\n", 160 | "population_json = requests.get('https://s3.amazonaws.com/geopyspark-demo/libya/population.geojson').json()\n", 161 | "population_centers = MultiPoint([shape(geom['geometry']) for geom in population_json['features']])\n", 162 | "\n", 163 | "conflict_json = requests.get('https://s3.amazonaws.com/geopyspark-demo/libya/conflict.geojson').json()\n", 164 | "conflict_centers = MultiPoint([shape(feature['geometry']) for feature in conflict_json['features'] if feature['geometry'] != None])\n", 165 | "\n", 166 | "conflict_centers" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "# Convert population centers data from EPSG:3857 to EPSG:4326 for display on map\n", 176 | "project = partial(\n", 177 | " pyproj.transform,\n", 178 | " pyproj.Proj(init='epsg:3857'),\n", 179 | " pyproj.Proj(init='epsg:4326'))\n", 180 | "\n", 181 | "population_4326 = transform(project, population_centers)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "# Write reprojected data to file\n", 191 | "\n", 192 | "if 'VIRTUAL_ENV' in os.environ:\n", 193 | " !pip3 install geojson\n", 194 | "else:\n", 195 | " !pip3 install --user geojson\n", 196 | " \n", 197 | "import geojson\n", 198 | "\n", 199 | "with open('/tmp/population-4326.geojson', 'w') as f:\n", 200 | " 
geojson.dump(geojson.Feature(geometry=population_4326, properties={}), f)\n", 201 | " f.flush()" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "### Cost Distance Between Population Centers" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "pop_cd = gps.cost_distance(\n", 218 | " friction_layer=road_friction,\n", 219 | " geometries=population_centers, \n", 220 | " max_distance=1400000.0\n", 221 | ")\n", 222 | "\n", 223 | "pop_pp = pop_cd.pyramid()" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": {}, 229 | "source": [ 230 | "### Cost Distance Between Conflict Centers" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "con_cd = gps.cost_distance(\n", 240 | " friction_layer=road_friction,\n", 241 | " geometries=conflict_centers, \n", 242 | " max_distance=1400000.0\n", 243 | ")\n", 244 | "\n", 245 | "con_pp = con_cd.pyramid()" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "## Displaying the Weighted Cost Distance Layer With Population Centers" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "# prepare color map for weighted overlay based on max cost\n", 262 | "breaks = [x for x in range(0, 1000000, 10000)]\n", 263 | "colors = gps.get_colors_from_matplotlib(ramp_name='viridis', num_colors=len(breaks))\n", 264 | "wo_cm = gps.ColorMap.from_colors(breaks=breaks, color_list=colors)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "# our weighted layer avoids conflict centers focusing on just population centers\n", 274 | "weighted_overlay = (con_pp * 0.0) + (pop_pp * 1.0)\n", 275 | "\n", 276 | "server2 = gps.TMS.build(source=weighted_overlay, display=wo_cm)\n", 277 | "server2.bind('0.0.0.0')" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "from folium import GeoJson\n", 287 | "\n", 288 | "m2 = Map(tiles='Stamen Toner', location=[27.7351, 17.2283], zoom_start=5)\n", 289 | "TileLayer(tiles=server2.url_pattern, attr='GeoPySpark Tiles').add_to(m2)\n", 290 | "GeoJson(\"/tmp/population-4326.geojson\").add_to(m2)\n", 291 | "m2" 292 | ] 293 | } 294 | ], 295 | "metadata": { 296 | "kernelspec": { 297 | "display_name": "GeoPySpark", 298 | "language": "python", 299 | "name": "gps" 300 | }, 301 | "language_info": { 302 | "codemirror_mode": { 303 | "name": "ipython", 304 | "version": 3 305 | }, 306 | "file_extension": ".py", 307 | "mimetype": "text/x-python", 308 | "name": "python", 309 | "nbconvert_exporter": "python", 310 | "pygments_lexer": "ipython3", 311 | "version": "3.4.6" 312 | } 313 | }, 314 | "nbformat": 4, 315 | "nbformat_minor": 2 316 | } 317 | -------------------------------------------------------------------------------- /notebooks/sanfranmvp.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from functools import partial\n", 10 | "import geopyspark as gps\n", 11 | "import fiona\n", 12 | 
"import pyproj\n", 13 | "\n", 14 | "from pyspark import SparkContext\n", 15 | "from shapely.geometry import MultiPoint, MultiLineString, shape\n", 16 | "from shapely.ops import transform\n", 17 | "import folium" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "# Set up our spark context \n", 27 | "conf = gps.geopyspark_conf(appName=\"San Fran MVP\", master=\"local[*]\") \n", 28 | "sc = SparkContext(conf=conf)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# Set the map center to be over San Francisco\n", 38 | "map_center = [37.75, -122.45]\n", 39 | "zoom = 11" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "# Download the needed geojsons\n", 49 | "\n", 50 | "!curl -o /tmp/bars.geojson https://s3.amazonaws.com/geopyspark-demo/MVP_San_Francisco/bars.geojson\n", 51 | "!curl -o /tmp/cafes.geojson https://s3.amazonaws.com/geopyspark-demo/MVP_San_Francisco/cafes.geojson \n", 52 | "!curl -o /tmp/transit.geojson https://s3.amazonaws.com/geopyspark-demo/MVP_San_Francisco/transit.geojson \n", 53 | "!curl -o /tmp/roads.geojson https://s3.amazonaws.com/geopyspark-demo/MVP_San_Francisco/roads.geojson " 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "# Read in all of the downloaded geojsons as Shapely geometries\n", 63 | "\n", 64 | "with fiona.open(\"/tmp/bars.geojson\") as source:\n", 65 | " bars_crs = source.crs['init']\n", 66 | " bars = MultiPoint([shape(f['geometry']) for f in source])\n", 67 | "\n", 68 | "with fiona.open(\"/tmp/cafes.geojson\") as source:\n", 69 | " cafes_crs = source.crs['init']\n", 70 | " cafes = MultiPoint([shape(f['geometry']) for f in source])\n", 71 | " \n", 72 | "with fiona.open(\"/tmp/transit.geojson\") as source:\n", 73 | " transit_crs = source.crs['init']\n", 74 | " transit = MultiPoint([shape(f['geometry']) for f in source]) \n", 75 | " \n", 76 | "with fiona.open(\"/tmp/roads.geojson\") as source:\n", 77 | " roads_crs = source.crs['init']\n", 78 | " roads = [MultiLineString(shape(line['geometry'])) for line in source]" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "# Reproject each Shapely geometry to EPSG:3857 so it can be\n", 88 | "# displayed on the map\n", 89 | "\n", 90 | "def create_partial_reprojection_func(crs):\n", 91 | " return partial(pyproj.transform,\n", 92 | " pyproj.Proj(init=crs),\n", 93 | " pyproj.Proj(init='epsg:3857'))\n", 94 | "\n", 95 | "reprojected_bars = [transform(create_partial_reprojection_func(bars_crs), bar) for bar in bars]\n", 96 | "reprojected_cafes = [transform(create_partial_reprojection_func(cafes_crs), cafe) for cafe in cafes]\n", 97 | "reprojected_transit = [transform(create_partial_reprojection_func(transit_crs), trans) for trans in transit]\n", 98 | "reprojected_roads = [transform(create_partial_reprojection_func(roads_crs), road) for road in roads]" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "# Rasterize the road vectors and create the road fricition\n", 108 | "# layer.\n", 109 | "\n", 110 | "rasterize_options = gps.RasterizerOptions(includePartial=True, 
sampleType='PixelIsArea')\n", 111 | "\n", 112 | "road_raster = gps.rasterize(geoms=reprojected_roads,\n", 113 | " crs=\"EPSG:3857\",\n", 114 | " zoom=12,\n", 115 | " fill_value=1,\n", 116 | " cell_type=gps.CellType.FLOAT32,\n", 117 | " options=rasterize_options)\n", 118 | "\n", 119 | "road_friction = road_raster.reclassify(value_map={1:1},\n", 120 | " data_type=int,\n", 121 | " replace_nodata_with=10)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "# Create the cost distance layer for bars based on the\n", 131 | "# road network. Then pyramid the layer.\n", 132 | "\n", 133 | "bar_layer = gps.cost_distance(friction_layer=road_friction,\n", 134 | " geometries=reprojected_bars,\n", 135 | " max_distance=1500000.0)\n", 136 | "\n", 137 | "bar_pyramid = bar_layer.pyramid()" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "# Create the cost distance layer for cafes based on the\n", 147 | "# road network. Then pyramid the layer.\n", 148 | "\n", 149 | "cafe_layer = gps.cost_distance(friction_layer=road_friction,\n", 150 | " geometries=reprojected_cafes,\n", 151 | " max_distance=1500000.0)\n", 152 | "\n", 153 | "cafe_pyramid = cafe_layer.pyramid()" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "# Create the cost distance layer for the transit stops\n", 163 | "# based on the road network. Then pyramid the layer.\n", 164 | "\n", 165 | "transit_layer = gps.cost_distance(friction_layer=road_friction,\n", 166 | " geometries=reprojected_transit,\n", 167 | " max_distance=1500000.0)\n", 168 | "\n", 169 | "transit_pyramid = transit_layer.pyramid()" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "# Calculate the weighted layer based on our preferences.\n", 179 | "\n", 180 | "weighted_layer = (-1 * bar_pyramid) + (transit_pyramid * 5) + (cafe_pyramid * 1)" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "# Calculate the histogram for the weighted layer and\n", 190 | "# then create a ColorRamp from the histogram.\n", 191 | "\n", 192 | "weighted_histogram = weighted_layer.get_histogram()\n", 193 | "weighted_color_map = gps.ColorMap.build(breaks=weighted_histogram,\n", 194 | " colors='viridis')" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "# Build the TMS server from the weighted layer with its\n", 204 | "# ColorMap\n", 205 | "\n", 206 | "tms = gps.TMS.build(source=weighted_layer,\n", 207 | " display=weighted_color_map)\n", 208 | "tms.bind('0.0.0.0')" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "# Adds the weighted layer and all of the geometries to the map\n", 218 | "# Bars are red\n", 219 | "# Cafes are orange\n", 220 | "# Transit stops are green\n", 221 | "\n", 222 | "#M.add_layer(TMSRasterData(tms), name=\"Weighted Layer\")\n", 223 | "#M.add_layer(VectorData(\"/tmp/bars.geojson\"), name=\"Bars\", colors=[0xff0000])\n", 224 | "#M.add_layer(VectorData(\"/tmp/cafes.geojson\"), name=\"Cafes\")\n", 225 
| "#M.add_layer(VectorData(\"/tmp/transit.geojson\"), name=\"Transit\", colors=[0x00FF00])\n", 226 | "\n", 227 | "m = folium.Map(tiles='OpenStreetMap', location=map_center, zoom_start=zoom)\n", 228 | "folium.TileLayer(tiles=tms.url_pattern, attr='GeoPySpark', name='Weighted layer', overlay=True).add_to(m)\n", 229 | "folium.GeoJson('/tmp/bars.geojson', name='Bars', style_function=lambda x: {'radius': 2, 'color': 'red'}, overlay=True).add_to(m)\n", 230 | "folium.GeoJson('/tmp/cafes.geojson', name='Cafes', style_function=lambda x: {'fillColor': 'orange'}, overlay=True).add_to(m)\n", 231 | "folium.GeoJson('/tmp/transit.geojson', name='Transit', style_function=lambda x: {'fillColor': 'green'}, overlay=True).add_to(m)\n", 232 | "m" 233 | ] 234 | } 235 | ], 236 | "metadata": { 237 | "kernelspec": { 238 | "display_name": "GeoPySpark", 239 | "language": "python", 240 | "name": "gps" 241 | }, 242 | "language_info": { 243 | "codemirror_mode": { 244 | "name": "ipython", 245 | "version": 3 246 | }, 247 | "file_extension": ".py", 248 | "mimetype": "text/x-python", 249 | "name": "python", 250 | "nbconvert_exporter": "python", 251 | "pygments_lexer": "ipython3", 252 | "version": "3.4.6" 253 | } 254 | }, 255 | "nbformat": 4, 256 | "nbformat_minor": 2 257 | } 258 | -------------------------------------------------------------------------------- /rpms/build/.dockerignore: -------------------------------------------------------------------------------- 1 | archives/ 2 | rpmbuild/SOURCES/ 3 | -------------------------------------------------------------------------------- /rpms/build/Dockerfile.base: -------------------------------------------------------------------------------- 1 | FROM amazonlinux:2016.09.1.20161221 2 | MAINTAINER James McClain 3 | 4 | RUN yum makecache fast 5 | 6 | # Java 7 | RUN yum update -y 8 | RUN yum install -y java-1.8.0-openjdk 9 | 10 | # Spark 11 | ENV SPARK_HOME /usr/local/spark-2.1.0-bin-hadoop2.7 12 | ADD blobs/spark-2.1.0-bin-hadoop2.7.tgz /usr/local 13 | RUN ln -s /usr/local/spark-2.1.0-bin-hadoop2.7 /usr/local/spark 14 | 15 | # kit, caboodle 16 | RUN yum install -y \ 17 | http://localhost:18080/hdf5-1.8.20-33.x86_64.rpm \ 18 | http://localhost:18080/netcdf-4.5.0-33.x86_64.rpm \ 19 | http://localhost:18080/openjpeg230-2.3.0-33.x86_64.rpm \ 20 | http://localhost:18080/gdal231-2.3.1-33.x86_64.rpm \ 21 | http://localhost:18080/nodejs-8.5.0-13.x86_64.rpm \ 22 | http://localhost:18080/proj493-lib-4.9.3-33.x86_64.rpm \ 23 | http://localhost:18080/configurable-http-proxy-0.0.0-13.x86_64.rpm 24 | 25 | RUN echo /usr/local/lib >> /etc/ld.so.conf.d/local.conf && \ 26 | echo /usr/local/lib64 >> /etc/ld.so.conf.d/local.conf && \ 27 | ldconfig 28 | 29 | # Create user 30 | RUN yum install -y shadow-utils && \ 31 | useradd hadoop -m && usermod -a -G root hadoop && (echo 'hadoop:hadoop' | chpasswd) 32 | 33 | # Misc 34 | RUN yum install -y unzip python34 pam 35 | RUN curl https://bootstrap.pypa.io/get-pip.py | python3.4 36 | RUN ln -s /usr/local/share/jupyter /usr/share/jupyter 37 | COPY etc/pam.d/login /etc/pam.d/login 38 | 39 | RUN pip3.4 install -r http://localhost:28080/http-requirements.txt 40 | 41 | USER hadoop 42 | -------------------------------------------------------------------------------- /rpms/build/Dockerfile.gcc4: -------------------------------------------------------------------------------- 1 | FROM amazonlinux:2016.09.1.20161221 2 | MAINTAINER James McClain 3 | 4 | RUN yum -y groupinstall "Development Tools" || echo 5 | RUN yum -y install python34-devel cmake less nano && 
yum clean all 6 | RUN curl https://bootstrap.pypa.io/get-pip.py | python3.4 7 | RUN yum update -y 8 | RUN yum install -y java-1.8.0-openjdk-devel 9 | RUN yum clean all -y && yum update -y && \ 10 | yum install -y \ 11 | bzip2-devel \ 12 | cairo-devel \ 13 | libjpeg-turbo-devel \ 14 | libpng-devel \ 15 | libtiff-devel \ 16 | make \ 17 | pkgconfig \ 18 | rpm-build \ 19 | which \ 20 | zlib-devel 21 | RUN ln -s /usr/include/python3.4m /usr/include/python3.4 22 | 23 | RUN yum makecache fast 24 | ENV JAVA_HOME=/etc/alternatives/jre 25 | -------------------------------------------------------------------------------- /rpms/build/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all gcc4 aws-build-gdal rpms src wheels 2 | 3 | FAMILY := quay.io/geodocker/emr-build 4 | VERSION := 8 5 | GCC4IMAGE := $(FAMILY):gcc4-$(VERSION) 6 | BASEIMAGE := quay.io/geodocker/jupyter-geopyspark:base-$(VERSION) 7 | INTERFACE ?= eth0 8 | IP_ADDR := $(shell ifconfig $(INTERFACE) | grep -i mask | awk '{print $$2}' | cut -f2 -d:) 9 | 10 | all: 11 | echo "see build.sh" 12 | 13 | gcc4: 14 | docker build -t $(GCC4IMAGE) -f Dockerfile.$@ . 15 | 16 | base: blobs/spark-2.1.0-bin-hadoop2.7.tgz rpms wheel/http-requirements.txt 17 | docker run -dit --rm --name rpm-server --hostname rpm-server -p "18080:80" -v $(shell pwd)/rpmbuild/RPMS/x86_64:/usr/local/apache2/htdocs httpd:2.4 18 | docker run -dit --rm --name whl-server --hostname whl-server -p "28080:80" -v $(shell pwd)/wheel:/usr/local/apache2/htdocs httpd:2.4 19 | docker build --no-cache --add-host="localhost:$(IP_ADDR)" -t $(BASEIMAGE) -f Dockerfile.base . 20 | docker stop whl-server 21 | docker stop rpm-server 22 | 23 | rpms: rpmbuild/RPMS/x86_64/proj493-lib-4.9.3-33.x86_64.rpm \ 24 | rpmbuild/RPMS/x86_64/hdf5-1.8.20-33.x86_64.rpm \ 25 | rpmbuild/RPMS/x86_64/netcdf-4.5.0-33.x86_64.rpm \ 26 | rpmbuild/RPMS/x86_64/gdal231-2.3.1-33.x86_64.rpm \ 27 | rpmbuild/RPMS/x86_64/nodejs-8.5.0-13.x86_64.rpm \ 28 | rpmbuild/RPMS/x86_64/configurable-http-proxy-0.0.0-13.x86_64.rpm 29 | 30 | src: rpmbuild/SOURCES/curl-7.57.0.tar.bz2 rpmbuild/SOURCES/zlib-1.2.11.tar.gz \ 31 | rpmbuild/SOURCES/libpng-1.6.30.tar.xz rpmbuild/SOURCES/geos-3.6.1.tar.bz2 \ 32 | rpmbuild/SOURCES/lcms2-2.8.tar.gz rpmbuild/SOURCES/openjpeg-2.3.0.tar.gz \ 33 | rpmbuild/SOURCES/hdf5-1.8.20.tar.bz2 rpmbuild/SOURCES/netcdf-4.5.0.tar.gz \ 34 | rpmbuild/SOURCES/gdal-2.3.1.tar.gz rpmbuild/SOURCES/node-v8.5.0.tar.gz \ 35 | rpmbuild/SOURCES/proj-4.9.3.tar.gz 36 | 37 | blobs/spark-2.1.0-bin-hadoop2.7.tgz: 38 | curl -L "http://d3kbcqa49mib13.cloudfront.net/spark-2.1.0-bin-hadoop2.7.tgz" -o $@ 39 | 40 | rpmbuild/SOURCES/%.tar: %/ 41 | tar cvf $@ $< 42 | 43 | include configurable-http-proxy.mk 44 | include gdal.mk 45 | include wheels.mk 46 | 47 | clean: 48 | rm -f rpmbuild/SOURCES/*.tar 49 | 50 | cleaner: clean 51 | 52 | cleanest: cleaner 53 | rm -f rpmbuild/RPMS/x86_64/* 54 | 55 | mrproper: cleanest 56 | rm -f rpmbuild/SOURCES/SOURCES/* 57 | rm -f wheel/*.whl 58 | -------------------------------------------------------------------------------- /rpms/build/README.md: -------------------------------------------------------------------------------- 1 | # Introduction # 2 | 3 | This directory contains the configuration and build files needed to (re)build the base image and the RPMs. 
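
A typical end-to-end cycle looks roughly like the sketch below: build everything locally, publish the artifacts to S3, and pull them back down on a later run. Here `s3://my-bucket/my-prefix/` is only a placeholder for a location to which you have write access; each step is described in the sections that follow.

```bash
# Build the RPMs and Python wheels, then the base image (see "Building" below)
./build.sh

# Publish the resulting RPMs and wheels to S3 under the current git SHA
./publish.sh s3://my-bucket/my-prefix/

# Later, pull the previously published artifacts back down instead of rebuilding
# them from scratch (see "Fetching" below)
./fetch.sh s3://my-bucket/my-prefix/$(git rev-parse HEAD)/
```
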
4 | 5 | # Inventory # 6 | 7 | ## Images ## 8 | 9 | The following images can be built from the materials in this directory: 10 | 11 | - [`quay.io/geodocker/jupyter-geopyspark:aws-build-gdal-3`](Dockerfile.aws-build-gdal) is an image used to build the `gdal-and-friends.tar.gz` binary blob. This image is meant to mimic the environment of an EC2 (EMR) instance as closely as possible so as to create a compatible artifact. 12 | - [`quay.io/geodocker/jupyter-geopyspark:base-3`](Dockerfile.base) is the ancestor image of the image produced in the root of this repository. It contains mostly slow-rate-of-change binary dependencies. 13 | - [`quay.io/geodocker/emr-build:gcc4-3`](Dockerfile.gcc4) is used to build RPMs with gcc 4.8. 14 | - [`quay.io/geodocker/emr-build:gcc6-3`](Dockerfile.gcc6) is used to build RPMs with gcc 6.4. 15 | 16 | ## Files and Directories ## 17 | 18 | - [`archives`](archives) is an initially-empty directory that is populated with source code, tarballs, and RPMs downloaded or produced during the build process. 19 | - [`blobs`](blobs) is an initially-empty directory that is populated with archives and RPMs from the `archives` directory. 20 | - [`rpmbuild`](rpmbuild) is a directory containing configuration files used to produce the RPMs. 21 | - [`scripts`](scripts) is a directory containing scripts used to build the RPMs mentioned above, as well as the `gdal-and-friends.tar.gz` tarball. 22 | - [`Makefile`](Makefile) coordinates the build process. 23 | - [`etc`](etc) contains additional configuration files that are included in the base image. 24 | - The various Dockerfiles specify the various images discussed above. 25 | - `*.mk`: these are included in the Makefile. 26 | - `README.md`: this file. 27 | 28 | # RPMs # 29 | 30 | ## Building ## 31 | 32 | From within this directory, type `./build.sh` to build all of the RPMs (this could take a very long time). 33 | Once they are built, type `./publish.sh s3://bucket/prefix/` where `s3://bucket/prefix/` is a "directory" on S3 for which you have write permissions. 34 | The RPMs will be published to `s3://bucket/prefix/abc123/` where `abc123` is the present git SHA. 35 | 36 | This will also produce all of the images described above (including the base image). 37 | 38 | ## Fetching ## 39 | 40 | From within this directory, type `./fetch.sh s3://bucket/prefix/abc123/` where `s3://bucket/prefix/` is the path to a "directory" on S3 where RPMs have been previously published, and `abc123` is the git SHA from which those RPMs were produced. 41 | 42 | ## Refreshing GeoPySpark ## 43 | 44 | With a complete set of RPMs already present, the GeoPySpark RPMs can be refreshed (for example to a newer version) by deleting the old GeoPySpark RPMs, then executing the `rpms` Makefile target. 
45 | 46 | ```bash 47 | rm -f rpmbuild/RPMS/x86_64/geopyspark-*.rpm 48 | make rpms 49 | ``` 50 | -------------------------------------------------------------------------------- /rpms/build/archives/.gitignore: -------------------------------------------------------------------------------- 1 | *.jar 2 | *.rpm 3 | *.tar 4 | *.tar.bz2 5 | *.tar.gz 6 | *.tar.xz 7 | *.tgz 8 | *.zip 9 | -------------------------------------------------------------------------------- /rpms/build/blobs/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geodocker/geodocker-jupyter-geopyspark/9432107c7de8c067bf44182ba9a9a27cc027a927/rpms/build/blobs/.gitignore -------------------------------------------------------------------------------- /rpms/build/build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ ! -z "$1" ] 4 | then 5 | ./fetch.sh $1 6 | fi 7 | 8 | make gcc4 9 | make rpms 10 | make wheels 11 | make base 12 | -------------------------------------------------------------------------------- /rpms/build/configurable-http-proxy.mk: -------------------------------------------------------------------------------- 1 | rpmbuild/SOURCES/node-v8.5.0.tar.gz: 2 | curl -L "https://nodejs.org/dist/v8.5.0/node-v8.5.0.tar.gz" -o $@ 3 | 4 | archives/ipykernel-629ac54cae9767310616d47d769665453619ac64.zip: 5 | curl -L "https://github.com/ipython/ipykernel/archive/629ac54cae9767310616d47d769665453619ac64.zip" -o $@ 6 | 7 | archives/ipykernel.zip: archives/ipykernel-629ac54cae9767310616d47d769665453619ac64.zip \ 8 | patches/patch.diff 9 | rm -rf ipykernel-629ac54cae9767310616d47d769665453619ac64/ 10 | unzip $< 11 | cd ipykernel-629ac54cae9767310616d47d769665453619ac64; patch -p1 < ../patches/patch.diff 12 | zip -r $@ ipykernel-629ac54cae9767310616d47d769665453619ac64/ 13 | rm -r ipykernel-629ac54cae9767310616d47d769665453619ac64/ 14 | 15 | rpmbuild/RPMS/x86_64/nodejs-8.5.0-13.x86_64.rpm: rpmbuild/SPECS/nodejs.spec \ 16 | scripts/nodejs.sh \ 17 | rpmbuild/SOURCES/node-v8.5.0.tar.gz 18 | docker run -it --rm \ 19 | -v $(shell pwd)/rpmbuild:/tmp/rpmbuild:rw \ 20 | -v $(shell pwd)/scripts:/scripts:ro \ 21 | $(GCC4IMAGE) /scripts/nodejs.sh $(shell id -u) $(shell id -g) 22 | 23 | rpmbuild/RPMS/x86_64/configurable-http-proxy-0.0.0-13.x86_64.rpm: rpmbuild/SPECS/configurable-http-proxy.spec \ 24 | scripts/configurable-http-proxy.sh \ 25 | rpmbuild/SOURCES/configurable-http-proxy.tar \ 26 | rpmbuild/RPMS/x86_64/nodejs-8.5.0-13.x86_64.rpm 27 | docker run -it --rm \ 28 | -v $(shell pwd)/archives:/archives:ro \ 29 | -v $(shell pwd)/rpmbuild:/tmp/rpmbuild:rw \ 30 | -v $(shell pwd)/scripts:/scripts:ro \ 31 | $(GCC4IMAGE) /scripts/configurable-http-proxy.sh $(shell id -u) $(shell id -g) 32 | -------------------------------------------------------------------------------- /rpms/build/configurable-http-proxy/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geodocker/geodocker-jupyter-geopyspark/9432107c7de8c067bf44182ba9a9a27cc027a927/rpms/build/configurable-http-proxy/.gitignore -------------------------------------------------------------------------------- /rpms/build/configurable-http-server.mk: -------------------------------------------------------------------------------- 1 | rpmbuild/SOURCES/node-v8.5.0.tar.gz: 2 | curl -L "https://nodejs.org/dist/v8.5.0/node-v8.5.0.tar.gz" -o $@ 3 | 4 | 
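# The two rules below fetch a pinned ipykernel commit, apply patches/patch.diff to it,
# and re-zip the result; wheel.sh later builds that patched kernel into a wheel.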
archives/ipykernel-629ac54cae9767310616d47d769665453619ac64.zip: 5 | curl -L "https://github.com/ipython/ipykernel/archive/629ac54cae9767310616d47d769665453619ac64.zip" -o $@ 6 | 7 | archives/ipykernel.zip: archives/ipykernel-629ac54cae9767310616d47d769665453619ac64.zip patches/patch.diff 8 | rm -rf ipykernel-629ac54cae9767310616d47d769665453619ac64/ 9 | unzip $< 10 | cd ipykernel-629ac54cae9767310616d47d769665453619ac64; patch -p1 < ../patches/patch.diff 11 | zip $@ $(shell find ipykernel-629ac54cae9767310616d47d769665453619ac64) 12 | rm -r ipykernel-629ac54cae9767310616d47d769665453619ac64/ 13 | 14 | rpmbuild/RPMS/x86_64/nodejs-8.5.0-13.x86_64.rpm: rpmbuild/SPECS/nodejs.spec scripts/nodejs.sh rpmbuild/SOURCES/node-v8.5.0.tar.gz 15 | docker run -it --rm \ 16 | -v $(shell pwd)/rpmbuild:/tmp/rpmbuild:rw \ 17 | -v $(shell pwd)/scripts:/scripts:ro \ 18 | $(GCC4IMAGE) /scripts/nodejs.sh $(shell id -u) $(shell id -g) 19 | 20 | rpmbuild/RPMS/x86_64/configurable-http-proxy-0.0.0-13.x86_64.rpm: rpmbuild/SPECS/configurable-http-proxy.spec rpmbuild/RPMS/x86_64/nodejs-8.5.0-13.x86_64.rpm 21 | docker run -it --rm \ 22 | -v $(shell pwd)/archives:/archives:ro \ 23 | -v $(shell pwd)/rpmbuild:/tmp/rpmbuild:rw \ 24 | -v $(shell pwd)/scripts:/scripts:ro \ 25 | $(GCC4IMAGE) /scripts/configurable-http-proxy.sh $(shell id -u) $(shell id -g) 26 | -------------------------------------------------------------------------------- /rpms/build/etc/pam.d/login: -------------------------------------------------------------------------------- 1 | #%PAM-1.0 2 | auth [user_unknown=ignore success=ok ignore=ignore default=bad] pam_securetty.so 3 | auth substack system-auth 4 | auth include postlogin 5 | account required pam_nologin.so 6 | account include system-auth 7 | password include system-auth 8 | # pam_selinux.so close should be the first session rule 9 | session required pam_selinux.so close 10 | session required pam_loginuid.so 11 | session optional pam_console.so 12 | # pam_selinux.so open should only be followed by sessions to be executed in the user context 13 | session required pam_selinux.so open 14 | session required pam_namespace.so 15 | session optional pam_keyinit.so force revoke 16 | session include system-auth 17 | session include postlogin 18 | -session optional pam_ck_connector.so 19 | -------------------------------------------------------------------------------- /rpms/build/fetch.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ ! 
-z "$1" ] 4 | then 5 | URI=$(echo $1 | sed 's,/$,,') 6 | make src 7 | aws s3 sync $URI rpmbuild/RPMS/x86_64/ 8 | mv -f rpmbuild/RPMS/x86_64/*.whl wheel/ 9 | touch rpmbuild/RPMS/x86_64/*.rpm 10 | touch rpmbuild/RPMS/x86_64/hdf5-*.rpm 11 | touch rpmbuild/RPMS/x86_64/netcdf-*.rpm 12 | touch rpmbuild/RPMS/x86_64/gdal231-*.rpm 13 | touch rpmbuild/RPMS/x86_64/jupyterhub-*.rpm 14 | fi 15 | -------------------------------------------------------------------------------- /rpms/build/gdal.mk: -------------------------------------------------------------------------------- 1 | rpmbuild/SOURCES/proj-4.9.3.tar.gz: 2 | curl -L "http://download.osgeo.org/proj/proj-4.9.3.tar.gz" -o $@ 3 | 4 | rpmbuild/SOURCES/hdf5-1.8.20.tar.bz2: 5 | curl -L "https://support.hdfgroup.org/ftp/HDF5/current18/src/hdf5-1.8.20.tar.bz2" -o $@ 6 | 7 | rpmbuild/SOURCES/netcdf-4.5.0.tar.gz: 8 | curl -L "ftp://ftp.unidata.ucar.edu/pub/netcdf/netcdf-4.5.0.tar.gz" -o $@ 9 | 10 | rpmbuild/SOURCES/openjpeg-2.3.0.tar.gz: 11 | curl -L "https://github.com/uclouvain/openjpeg/archive/v2.3.0.tar.gz" -o $@ 12 | 13 | rpmbuild/SOURCES/gdal-2.3.1.tar.gz: 14 | curl -L "http://download.osgeo.org/gdal/2.3.1/gdal-2.3.1.tar.gz" -o $@ 15 | 16 | rpmbuild/RPMS/x86_64/openjpeg-2.3.0-33.x86_64.rpm: rpmbuild/SPECS/openjpeg.spec rpmbuild/SOURCES/openjpeg-2.3.0.tar.gz 17 | docker run -it --rm \ 18 | -v $(shell pwd)/rpmbuild:/tmp/rpmbuild:rw \ 19 | -v $(shell pwd)/scripts:/scripts:ro \ 20 | $(GCC4IMAGE) /scripts/openjpeg.sh $(shell id -u) $(shell id -g) 21 | 22 | rpmbuild/RPMS/x86_64/hdf5-1.8.20-33.x86_64.rpm: rpmbuild/SPECS/hdf5.spec rpmbuild/RPMS/x86_64/proj493-4.9.3-33.x86_64.rpm rpmbuild/SOURCES/hdf5-1.8.20.tar.bz2 23 | docker run -it --rm \ 24 | -v $(shell pwd)/rpmbuild:/tmp/rpmbuild:rw \ 25 | -v $(shell pwd)/scripts:/scripts:ro \ 26 | $(GCC4IMAGE) /scripts/hdf5.sh $(shell id -u) $(shell id -g) 27 | 28 | rpmbuild/RPMS/x86_64/netcdf-4.5.0-33.x86_64.rpm: rpmbuild/SPECS/netcdf.spec rpmbuild/RPMS/x86_64/proj493-4.9.3-33.x86_64.rpm rpmbuild/RPMS/x86_64/proj493-4.9.3-33.x86_64.rpm rpmbuild/SOURCES/netcdf-4.5.0.tar.gz 29 | docker run -it --rm \ 30 | -v $(shell pwd)/rpmbuild:/tmp/rpmbuild:rw \ 31 | -v $(shell pwd)/scripts:/scripts:ro \ 32 | $(GCC4IMAGE) /scripts/netcdf.sh $(shell id -u) $(shell id -g) 33 | 34 | rpmbuild/RPMS/x86_64/proj493-4.9.3-33.x86_64.rpm rpmbuild/RPMS/x86_64/proj493-lib-4.9.3-33.x86_64.rpm: rpmbuild/SPECS/proj.spec rpmbuild/SOURCES/proj-4.9.3.tar.gz 35 | docker run -it --rm \ 36 | -v $(shell pwd)/rpmbuild:/tmp/rpmbuild:rw \ 37 | -v $(shell pwd)/scripts:/scripts:ro \ 38 | $(GCC4IMAGE) /scripts/proj.sh $(shell id -u) $(shell id -g) 39 | 40 | rpmbuild/RPMS/x86_64/gdal231-2.3.1-33.x86_64.rpm rpmbuild/RPMS/x86_64/gdal231-lib-2.3.1-33.x86_64.rpm: rpmbuild/SPECS/gdal.spec rpmbuild/RPMS/x86_64/openjpeg-2.3.0-33.x86_64.rpm rpmbuild/RPMS/x86_64/proj493-4.9.3-33.x86_64.rpm rpmbuild/RPMS/x86_64/hdf5-1.8.20-33.x86_64.rpm rpmbuild/RPMS/x86_64/netcdf-4.5.0-33.x86_64.rpm rpmbuild/SOURCES/gdal-2.3.1.tar.gz 41 | docker run -it --rm \ 42 | -v $(shell pwd)/rpmbuild:/tmp/rpmbuild:rw \ 43 | -v $(shell pwd)/scripts:/scripts:ro \ 44 | $(GCC4IMAGE) /scripts/gdal.sh $(shell id -u) $(shell id -g) 45 | -------------------------------------------------------------------------------- /rpms/build/geopyspark.mk: -------------------------------------------------------------------------------- 1 | rpmbuild/SOURCES/geopyspark.tar: geopyspark/ 2 | tar cvf $@ geopyspark/ 3 | 4 | rpmbuild/RPMS/x86_64/geopyspark-deps-0.0.0-13.x86_64.rpm: rpmbuild/SPECS/geopyspark.spec \ 5 | 
scripts/geopyspark.sh rpmbuild/SOURCES/geopyspark.tar 6 | docker run -it --rm \ 7 | -v $(shell pwd)/archives:/archives:ro \ 8 | -v $(shell pwd)/rpmbuild:/tmp/rpmbuild:rw \ 9 | -v $(shell pwd)/scripts:/scripts:ro \ 10 | $(GCC4IMAGE) /scripts/geopyspark.sh $(shell id -u) $(shell id -g) 11 | -------------------------------------------------------------------------------- /rpms/build/patches/patch.diff: -------------------------------------------------------------------------------- 1 | diff --git a/ipykernel/jsonutil.py b/ipykernel/jsonutil.py 2 | index 3121e53..01b5d34 100644 3 | --- a/ipykernel/jsonutil.py 4 | +++ b/ipykernel/jsonutil.py 5 | @@ -164,7 +164,10 @@ def json_clean(obj): 6 | # If all OK, proceed by making the new dict that will be json-safe 7 | out = {} 8 | for k,v in iteritems(obj): 9 | - out[unicode_type(k)] = json_clean(v) 10 | + if str(type(v)) == "": 11 | + out[unicode_type(k)] = json_clean(list(v)) 12 | + else: 13 | + out[unicode_type(k)] = json_clean(v) 14 | return out 15 | if isinstance(obj, datetime): 16 | return obj.strftime(ISO8601) 17 | -------------------------------------------------------------------------------- /rpms/build/publish.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ ! -z "$1" ] 4 | then 5 | URI=$(echo $1 | sed 's,/$,,')/$(git rev-parse HEAD)/ 6 | aws s3 sync rpmbuild/RPMS/x86_64/ $URI 7 | aws s3 sync wheel/ $URI 8 | else 9 | echo "Need location" 10 | fi 11 | -------------------------------------------------------------------------------- /rpms/build/rpmbuild/BUILD/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geodocker/geodocker-jupyter-geopyspark/9432107c7de8c067bf44182ba9a9a27cc027a927/rpms/build/rpmbuild/BUILD/.gitignore -------------------------------------------------------------------------------- /rpms/build/rpmbuild/RPMS/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geodocker/geodocker-jupyter-geopyspark/9432107c7de8c067bf44182ba9a9a27cc027a927/rpms/build/rpmbuild/RPMS/.gitignore -------------------------------------------------------------------------------- /rpms/build/rpmbuild/SOURCES/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geodocker/geodocker-jupyter-geopyspark/9432107c7de8c067bf44182ba9a9a27cc027a927/rpms/build/rpmbuild/SOURCES/.gitignore -------------------------------------------------------------------------------- /rpms/build/rpmbuild/SPECS/configurable-http-proxy.spec: -------------------------------------------------------------------------------- 1 | %define _topdir /tmp/rpmbuild 2 | %define name configurable-http-proxy 3 | %define release 13 4 | %define version 0.0.0 5 | 6 | %define debug_package %{nil} 7 | 8 | BuildRoot: %{buildroot} 9 | Summary: configurable-http-proxy 10 | License: BSD-3 11 | Name: %{name} 12 | Version: %{version} 13 | Release: %{release} 14 | Source: configurable-http-proxy.tar 15 | Prefix: /usr/local 16 | Group: Development 17 | AutoReq: no 18 | Requires: pam 19 | Requires: nodejs 20 | BuildRequires: nodejs 21 | %global _enable_debug_package 0 22 | %global debug_package %{nil} 23 | %global __os_install_post /usr/lib/rpm/brp-compress %{nil} 24 | 25 | %description 26 | configurable-http-proxy 27 | 28 | %prep 29 | %setup -q -n configurable-http-proxy 30 | 31 | %build 32 | echo 33 | 34 | 
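# The %install step below records the files added under /usr/local by a global
# `npm install -g configurable-http-proxy` (by diffing before/after listings of the
# tree) and unpacks only those files into the build root.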
%install 35 | find /usr/local | sort > before.txt 36 | npm install -g configurable-http-proxy 37 | find /usr/local | sort > after.txt 38 | tar cf /tmp/packages.tar $(diff before.txt after.txt | grep '^>' | cut -f2 '-d ') 39 | cd %{buildroot} 40 | tar axf /tmp/packages.tar 41 | 42 | %files 43 | %defattr(-,root,root) 44 | /usr/local/* 45 | -------------------------------------------------------------------------------- /rpms/build/rpmbuild/SPECS/gdal.spec: -------------------------------------------------------------------------------- 1 | %define _topdir /tmp/rpmbuild 2 | %define name gdal231 3 | %define release 33 4 | %define version 2.3.1 5 | 6 | %define debug_package %{nil} 7 | 8 | BuildRoot: %{buildroot} 9 | Summary: GDAL 10 | License: X/MIT 11 | Name: %{name} 12 | Version: %{version} 13 | Release: %{release} 14 | Source: gdal-%{version}.tar.gz 15 | Prefix: /usr/local 16 | Group: Azavea 17 | Requires: libpng 18 | Requires: libcurl 19 | Requires: libgeos 20 | Requires: hdf5 21 | Requires: netcdf 22 | BuildRequires: geos-devel 23 | BuildRequires: lcms2-devel 24 | BuildRequires: libcurl-devel 25 | BuildRequires: libpng-devel 26 | BuildRequires: openjpeg230 27 | BuildRequires: zlib-devel 28 | BuildRequires: hdf5 29 | BuildRequires: netcdf 30 | 31 | %description 32 | GDAL 33 | 34 | %prep 35 | %setup -q -n gdal-2.3.1 36 | 37 | %build 38 | PKG_CONFIG_PATH=/usr/local/lib/pkgconfig LDFLAGS='-L/usr/local/lib -L/usr/local/lib64' CC='gcc48' ./configure --prefix=/usr/local --with-java --with-curl --with-openjpeg 39 | nice -n 19 make -k -j$(grep -c ^processor /proc/cpuinfo) || make 40 | make -C swig/java 41 | 42 | %install 43 | nice -n 19 make DESTDIR=%{buildroot} install 44 | cp -L swig/java/.libs/libgdalalljni* %{buildroot}/usr/local/lib/ 45 | cp swig/java/gdal.jar %{buildroot}/usr/local/share/ 46 | 47 | %package lib 48 | Group: Geography 49 | Summary: GDAL 50 | %description lib 51 | The libraries 52 | 53 | %files lib 54 | %defattr(-,root,root) 55 | /usr/local/lib 56 | /usr/local/share/gdal.jar 57 | 58 | %files 59 | %defattr(-,root,root) 60 | /usr/local/* 61 | /usr/local/share/gdal.jar 62 | -------------------------------------------------------------------------------- /rpms/build/rpmbuild/SPECS/hdf5.spec: -------------------------------------------------------------------------------- 1 | %define _topdir /tmp/rpmbuild 2 | %define name hdf5 3 | %define release 33 4 | %define version 1.8.20 5 | 6 | %define debug_package %{nil} 7 | 8 | BuildRoot: %{buildroot} 9 | Summary: HDF5 10 | License: X/MIT 11 | Name: %{name} 12 | Version: %{version} 13 | Release: %{release} 14 | Source: hdf5-%{version}.tar.bz2 15 | Prefix: /usr/local 16 | Group: Azavea 17 | Requires: libcurl 18 | BuildRequires: libcurl-devel 19 | 20 | %description 21 | HDF5 22 | 23 | %prep 24 | %setup -q -n hdf5-1.8.20 25 | 26 | %build 27 | ./configure --prefix=/usr/local 28 | nice -n 19 make -j$(grep -c ^processor /proc/cpuinfo) 29 | 30 | %install 31 | nice -n 19 make DESTDIR=%{buildroot} install 32 | 33 | %files 34 | %defattr(-,root,root) 35 | /usr/local/* 36 | -------------------------------------------------------------------------------- /rpms/build/rpmbuild/SPECS/netcdf.spec: -------------------------------------------------------------------------------- 1 | %define _topdir /tmp/rpmbuild 2 | %define name netcdf 3 | %define release 33 4 | %define version 4.5.0 5 | 6 | %define debug_package %{nil} 7 | 8 | BuildRoot: %{buildroot} 9 | Summary: NetCDF 10 | License: X/MIT 11 | Name: %{name} 12 | Version: %{version} 13 | Release: %{release} 
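# The source tarball named below is fetched into rpmbuild/SOURCES/ by gdal.mk, and
# the package is built inside the gcc4 container by scripts/netcdf.sh.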
14 | Source: netcdf-%{version}.tar.gz 15 | Prefix: /usr/local 16 | Group: Azavea 17 | Requires: libcurl 18 | Requires: hdf5 19 | BuildRequires: libcurl-devel 20 | BuildRequires: hdf5 21 | 22 | %description 23 | NetCDF 24 | 25 | %prep 26 | %setup -q -n netcdf-4.5.0 27 | 28 | %build 29 | ./configure --prefix=/usr/local 30 | nice -n 19 make -j$(grep -c ^processor /proc/cpuinfo) 31 | 32 | %install 33 | nice -n 19 make DESTDIR=%{buildroot} install 34 | 35 | %files 36 | %defattr(-,root,root) 37 | /usr/local/* 38 | -------------------------------------------------------------------------------- /rpms/build/rpmbuild/SPECS/nodejs.spec: -------------------------------------------------------------------------------- 1 | %define _topdir /tmp/rpmbuild 2 | %define name nodejs 3 | %define release 13 4 | %define version 8.5.0 5 | 6 | %define debug_package %{nil} 7 | 8 | BuildRoot: %{buildroot} 9 | Summary: NodeJS 10 | License: node.js 11 | Name: %{name} 12 | Version: %{version} 13 | Release: %{release} 14 | Source: node-v%{version}.tar.gz 15 | Prefix: /usr/local 16 | Group: Azavea 17 | 18 | %description 19 | Node.js 8.5.0 20 | 21 | %prep 22 | %setup -q -n node-v8.5.0 23 | 24 | %build 25 | ./configure --prefix=/usr/local 26 | nice -n 19 make -j$(grep -c ^processor /proc/cpuinfo) 27 | 28 | %install 29 | make DESTDIR=%{buildroot} install 30 | 31 | %files 32 | %defattr(-,root,root) 33 | /usr/local/* 34 | -------------------------------------------------------------------------------- /rpms/build/rpmbuild/SPECS/openjpeg.spec: -------------------------------------------------------------------------------- 1 | %define _topdir /tmp/rpmbuild 2 | %define name openjpeg230 3 | %define release 33 4 | %define version 2.3.0 5 | 6 | %define debug_package %{nil} 7 | 8 | BuildRoot: %{buildroot} 9 | Summary: OpenJPEG 10 | License: X/MIT 11 | Name: %{name} 12 | Version: %{version} 13 | Release: %{release} 14 | Source: openjpeg-%{version}.tar.gz 15 | Prefix: /usr/local 16 | Group: Azavea 17 | Requires: libpng 18 | Requires: libcurl 19 | Requires: libgeos 20 | Requires: hdf5 21 | Requires: netcdf 22 | BuildRequires: cmake 23 | BuildRequires: lcms2-devel 24 | 25 | %description 26 | OpenJPEG 27 | 28 | %prep 29 | %setup -q -n openjpeg-2.3.0 30 | 31 | %build 32 | mkdir build 33 | cd build 34 | cmake .. 
-DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/usr/local 35 | make 36 | 37 | %install 38 | cd build && make DESTDIR=%{buildroot} install 39 | 40 | %files 41 | %defattr(-,root,root) 42 | /usr/local/lib/openjpeg-2.3/* 43 | /usr/local/lib/pkgconfig/libopenjp2.pc 44 | /usr/local/lib/libopenjp2* 45 | /usr/local/bin/opj* 46 | /usr/local/include/openjpeg-2.3 47 | -------------------------------------------------------------------------------- /rpms/build/rpmbuild/SPECS/proj.spec: -------------------------------------------------------------------------------- 1 | %define _topdir /tmp/rpmbuild 2 | %define name proj493 3 | %define release 33 4 | %define version 4.9.3 5 | 6 | %define debug_package %{nil} 7 | 8 | BuildRoot: %{buildroot} 9 | Summary: Proj4 10 | License: MIT 11 | Name: %{name} 12 | Version: %{version} 13 | Release: %{release} 14 | Source: proj-%{version}.tar.gz 15 | Prefix: /usr/local 16 | Group: Azavea 17 | 18 | %description 19 | Proj 4.9.3 20 | 21 | %prep 22 | %setup -q -n proj-4.9.3 23 | 24 | %build 25 | ./configure --prefix=/usr/local 26 | nice -n 19 make -j$(grep -c ^processor /proc/cpuinfo) 27 | 28 | %install 29 | make DESTDIR=%{buildroot} install 30 | 31 | %package lib 32 | Group: Geography 33 | Summary: Proj 4.9.3 libraries 34 | %description lib 35 | The libraries 36 | 37 | %files 38 | %defattr(-,root,root) 39 | /usr/local/* 40 | 41 | %files lib 42 | %defattr(-,root,root) 43 | /usr/local/lib/* 44 | -------------------------------------------------------------------------------- /rpms/build/rpmbuild/SRPMS/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/geodocker/geodocker-jupyter-geopyspark/9432107c7de8c067bf44182ba9a9a27cc027a927/rpms/build/rpmbuild/SRPMS/.gitignore -------------------------------------------------------------------------------- /rpms/build/scripts/configurable-http-proxy.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | USERID=$1 4 | GROUPID=$2 5 | 6 | yum install -y /tmp/rpmbuild/RPMS/x86_64/nodejs-8.5.0-13.x86_64.rpm 7 | ldconfig 8 | 9 | cd /tmp/rpmbuild 10 | chown -R root:root /tmp/rpmbuild/SOURCES/configurable-http-proxy.tar 11 | rpmbuild -v -bb --clean SPECS/configurable-http-proxy.spec 12 | chown -R $USERID:$GROUPID /tmp/rpmbuild 13 | -------------------------------------------------------------------------------- /rpms/build/scripts/gdal.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | USERID=$1 4 | GROUPID=$2 5 | 6 | yum install -y geos-devel lcms2-devel libcurl-devel libpng-devel zlib-devel swig 7 | yum localinstall -y /tmp/rpmbuild/RPMS/x86_64/proj493-4.9.3-33.x86_64.rpm /tmp/rpmbuild/RPMS/x86_64/hdf5-1.8.20-33.x86_64.rpm /tmp/rpmbuild/RPMS/x86_64/netcdf-4.5.0-33.x86_64.rpm /tmp/rpmbuild/RPMS/x86_64/openjpeg230-2.3.0-33.x86_64.rpm 8 | ldconfig 9 | 10 | curl -o /tmp/ant.zip -L http://apache.spinellicreations.com//ant/binaries/apache-ant-1.9.13-bin.zip 11 | unzip -d /tmp/apache-ant /tmp/ant.zip 12 | export ANT_HOME=/tmp/apache-ant/apache-ant-1.9.13 13 | export PATH=$PATH:$ANT_HOME/bin 14 | 15 | cd /tmp/rpmbuild 16 | chown -R root:root /tmp/rpmbuild/SOURCES/gdal-2.3.1.tar.gz 17 | rpmbuild -v -bb --clean SPECS/gdal.spec 18 | chown -R $USERID:$GROUPID /tmp/rpmbuild 19 | -------------------------------------------------------------------------------- /rpms/build/scripts/hdf5.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | USERID=$1 4 | GROUPID=$2 5 | 6 | yum install -y libcurl-devel 7 | ldconfig 8 | 9 | cd /tmp/rpmbuild 10 | chown -R root:root /tmp/rpmbuild/SOURCES/hdf5-1.8.20.tar.bz2 11 | rpmbuild -v -bb --clean SPECS/hdf5.spec 12 | chown -R $USERID:$GROUPID /tmp/rpmbuild 13 | -------------------------------------------------------------------------------- /rpms/build/scripts/netcdf.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | USERID=$1 4 | GROUPID=$2 5 | 6 | yum install -y libcurl-devel 7 | yum localinstall -y /tmp/rpmbuild/RPMS/x86_64/proj493-4.9.3-33.x86_64.rpm /tmp/rpmbuild/RPMS/x86_64/hdf5-1.8.20-33.x86_64.rpm 8 | ldconfig 9 | 10 | cd /tmp/rpmbuild 11 | chown -R root:root /tmp/rpmbuild/SOURCES/netcdf-4.5.0.tar.gz 12 | rpmbuild -v -bb --clean SPECS/netcdf.spec 13 | chown -R $USERID:$GROUPID /tmp/rpmbuild 14 | -------------------------------------------------------------------------------- /rpms/build/scripts/nodejs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | USERID=$1 4 | GROUPID=$2 5 | 6 | ldconfig 7 | 8 | cd /tmp/rpmbuild 9 | chown -R root:root /tmp/rpmbuild/SOURCES/node-v8.5.0.tar.gz 10 | rpmbuild -v -bb --clean SPECS/nodejs.spec 11 | chown -R $USERID:$GROUPID /tmp/rpmbuild 12 | -------------------------------------------------------------------------------- /rpms/build/scripts/not.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ ! -f $1 ]; then 4 | echo $1; 5 | fi 6 | -------------------------------------------------------------------------------- /rpms/build/scripts/openjpeg.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | USERID=$1 4 | GROUPID=$2 5 | 6 | yum install -y lcms2-devel libcurl-devel zlib-devel 7 | ldconfig 8 | 9 | cd /tmp/rpmbuild 10 | chown -R root:root /tmp/rpmbuild/SOURCES/openjpeg-2.3.0.tar.gz 11 | rpmbuild -v -bb --clean SPECS/openjpeg.spec 12 | chown -R $USERID:$GROUPID /tmp/rpmbuild 13 | -------------------------------------------------------------------------------- /rpms/build/scripts/proj.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | USERID=$1 4 | GROUPID=$2 5 | 6 | ldconfig 7 | 8 | cd /tmp/rpmbuild 9 | chown -R root:root /tmp/rpmbuild/SOURCES/proj-4.9.3.tar.gz 10 | rpmbuild -v -bb --clean SPECS/proj.spec 11 | chown -R $USERID:$GROUPID /tmp/rpmbuild 12 | -------------------------------------------------------------------------------- /rpms/build/scripts/wheel.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | USERID=$1 4 | GROUPID=$2 5 | 6 | yum localinstall -y \ 7 | /tmp/rpmbuild/RPMS/x86_64/proj493-4.9.3-33.x86_64.rpm \ 8 | /tmp/rpmbuild/RPMS/x86_64/hdf5-1.8.20-33.x86_64.rpm \ 9 | /tmp/rpmbuild/RPMS/x86_64/netcdf-4.5.0-33.x86_64.rpm \ 10 | /tmp/rpmbuild/RPMS/x86_64/openjpeg230-2.3.0-33.x86_64.rpm \ 11 | /tmp/rpmbuild/RPMS/x86_64/gdal231-2.3.1-33.x86_64.rpm 12 | ldconfig 13 | 14 | mkdir -p /usr/share/jupyter/kernels 15 | mkdir /tmp/wheel 16 | cd /tmp/wheel 17 | cp /wheel/requirements.txt . 
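# Build the wheels with the gcc 4.8 toolchain. numpy is installed first so that
# packages which build against it (GDAL, rasterio, scipy, and friends) can find it;
# requirements.txt and the patched ipykernel archive are then wheeled, and duplicate
# wheel versions are pruned further below.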
18 | export CC=gcc48 19 | pip3.4 install numpy==1.12.1 20 | pip3.4 wheel -r requirements.txt 21 | pip3.4 install ipython==5.1.0 /archives/ipykernel.zip 22 | pip3.4 wheel ipython==5.1.0 /archives/ipykernel.zip 23 | 24 | chown -R $USERID:$GROUPID . 25 | 26 | # Clean up duplicate packages (leave most recent version) 27 | rm -f tornado-5* 28 | rm -f pyzmq-17* 29 | for f in $(ls -1r | sed -e 's/^\(.*\)$/\1 \1/' | sed -e 's/^\([a-zA-Z0-9_]*\)-[0-9].* \(.*\)$/\1 \2/' | awk '{ if (seen[$1]++){print $2} }') 30 | do 31 | echo "rm -f $f" 32 | rm -f $f 33 | done 34 | 35 | echo "Final wheel list: ==============================================" 36 | ls -1r 37 | 38 | cp *.whl /wheel 39 | -------------------------------------------------------------------------------- /rpms/build/wheel/requirements.txt: -------------------------------------------------------------------------------- 1 | affine==2.0.0.post1 2 | alembic==0.8.9 3 | appdirs==1.4.3 4 | backports-abc==0.5 5 | bleach==1.5.0 6 | boto3==1.7.26 7 | click==6.7 8 | click-plugins==1.0.3 9 | cligj==0.4.0 10 | colortools==0.1.2 11 | cycler==0.10.0 12 | decorator==4.0.10 13 | entrypoints==0.2.2 14 | Fiona==1.7.1 15 | Flask==0.12.1 16 | Flask-Cors==3.0.2 17 | folium==0.5.0 18 | futures==3.1.1 19 | GDAL==2.3.1 20 | gevent==1.2.1 21 | html5lib==0.9999999 22 | https://github.com/jupyterhub/oauthenticator/archive/84ab3cce8db8c599ebd3bbbd836724bea6eb93a1.zip 23 | ipython==5.1.0 24 | ipython-genutils==0.1.0 25 | ipywidgets==6.0.0 26 | itsdangerous==0.24 27 | Jinja2==2.9.4 28 | jsonschema==2.5.1 29 | jupyter-client==4.4.0 30 | jupyter-core==4.2.1 31 | jupyterhub==0.8.1 32 | lxml==3.7.3 33 | Mako==1.0.6 34 | MarkupSafe==0.23 35 | matplotlib==2.0.0 36 | mistune==0.7.3 37 | ModestMaps==1.4.7 38 | munch==2.1.1 39 | nbconvert==5.0.0 40 | nbformat==4.2.0 41 | networkx==1.11 42 | notebook==5.0.0 43 | numpy==1.12.1 44 | olefile==0.44 45 | packaging==16.8 46 | pamela==0.3.0 47 | pandas==0.19.2 48 | pandocfilters==1.4.1 49 | pexpect==4.2.1 50 | pickleshare==0.7.4 51 | Pillow==5.1.0 52 | promise==0.4.2 53 | prompt-toolkit==1.0.9 54 | protobuf==3.6.1 55 | ptyprocess==0.5.1 56 | Pygments==2.1.3 57 | pyparsing==2.2.0 58 | pyproj==1.9.5.1 59 | python-dateutil==2.6.1 60 | python-editor==1.0.3 61 | pytz==2017.2 62 | PyWavelets==0.5.2 63 | pyzmq==16.0.2 64 | rasterio==1.0.3 65 | requests==2.12.4 66 | rise==5.2.0 67 | s3contents==0.1.10 68 | scikit-image==0.13.0 69 | scipy==0.19.0 70 | setuptools==18.5 71 | Shapely==1.6b4 72 | simplegeneric==0.8.1 73 | simplejson==3.13.2 74 | six==1.10.0 75 | snuggs==1.4.1 76 | SQLAlchemy==1.1.4 77 | sudospawner==0.5.1 78 | terminado==0.6 79 | testpath==0.3 80 | tornado==4.4.2 81 | traitlets==4.3.2 82 | virtualenv==13.1.2 83 | wcwidth==0.1.7 84 | Werkzeug==0.11.13 85 | widgetsnbextension==2.0.0 86 | -------------------------------------------------------------------------------- /rpms/build/wheels.mk: -------------------------------------------------------------------------------- 1 | wheels wheel/http-requirements.txt: archives/ipykernel.zip \ 2 | rpmbuild/RPMS/x86_64/proj493-4.9.3-33.x86_64.rpm \ 3 | rpmbuild/RPMS/x86_64/hdf5-1.8.20-33.x86_64.rpm \ 4 | rpmbuild/RPMS/x86_64/netcdf-4.5.0-33.x86_64.rpm \ 5 | rpmbuild/RPMS/x86_64/gdal231-2.3.1-33.x86_64.rpm \ 6 | wheel/requirements.txt 7 | (cd wheel ; rm -f *.whl) 8 | docker run -it --rm \ 9 | -v $(shell pwd)/archives:/archives:ro \ 10 | -v $(shell pwd)/rpmbuild:/tmp/rpmbuild:ro \ 11 | -v $(shell pwd)/wheel:/wheel:rw \ 12 | -v $(shell pwd)/scripts:/scripts:ro \ 13 | $(GCC4IMAGE) /scripts/wheel.sh 
$(shell id -u) $(shell id -g) 14 | (cd wheel ; ls *.whl | sed 's,^,http://localhost:28080/,' > http-requirements.txt) 15 | -------------------------------------------------------------------------------- /terraform/.gitignore: -------------------------------------------------------------------------------- 1 | .terraform* 2 | terraform.tfstate* 3 | -------------------------------------------------------------------------------- /terraform/aws.tf: -------------------------------------------------------------------------------- 1 | provider "aws" { 2 | region = "${var.region}" 3 | } 4 | -------------------------------------------------------------------------------- /terraform/bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | RPM_URI=$(echo $1 | sed 's,/$,,') 4 | NB_BUCKET=$(echo $2 | sed 's,s3://\([^/]*\).*,\1,') 5 | NB_PREFIX=$(echo $2 | sed 's,s3://[^/]*/,,' | sed 's,/$,,') 6 | OAUTH_MODULE=$3 7 | OAUTH_CLASS=$4 8 | OAUTH_CLIENT_ID=$5 9 | OAUTH_CLIENT_SECRET=$6 10 | GEOPYSPARKJARS=$7 11 | GEOPYSPARKURI=$8 12 | 13 | # Parses a configuration file put in place by EMR to determine the role of this node 14 | is_master() { 15 | if [ $(jq '.isMaster' /mnt/var/lib/info/instance.json) = 'true' ]; then 16 | return 0 17 | else 18 | return 1 19 | fi 20 | } 21 | 22 | if is_master; then 23 | 24 | # Download packages 25 | mkdir -p /tmp/blobs/ 26 | aws s3 sync $RPM_URI /tmp/blobs/ 27 | 28 | # Install binary packages 29 | (cd /tmp/blobs; sudo yum localinstall -y openjpeg230-2.3.0-33.x86_64.rpm gdal231-2.3.1-33.x86_64.rpm hdf5-1.8.20-33.x86_64.rpm netcdf-4.5.0-33.x86_64.rpm nodejs-8.5.0-13.x86_64.rpm proj493-lib-4.9.3-33.x86_64.rpm configurable-http-proxy-0.0.0-13.x86_64.rpm) 30 | 31 | # Install Python packages 32 | sudo pip-3.4 install --upgrade pip 33 | sudo ln -s /usr/local/bin/pip3 /usr/bin/ 34 | sudo ln -s /usr/local/bin/pip3.4 /usr/bin/ 35 | (cd /tmp/blobs ; sudo pip3.4 install *.whl) 36 | 37 | # Linkage 38 | echo '/usr/local/lib' > /tmp/local.conf 39 | echo '/usr/local/lib64' >> /tmp/local.conf 40 | sudo cp /tmp/local.conf /etc/ld.so.conf.d/local.conf 41 | sudo ldconfig 42 | rm -f /tmp/local.conf 43 | 44 | # Set up user account to manage JupyterHub 45 | sudo groupadd shadow 46 | sudo chgrp shadow /etc/shadow 47 | sudo chmod 640 /etc/shadow 48 | # sudo usermod -a -G shadow hadoop 49 | sudo useradd -G shadow -r hublauncher 50 | sudo groupadd jupyterhub 51 | 52 | # Ensure that all members of `jupyterhub` group may log in to JupyterHub 53 | echo 'hublauncher ALL=(%jupyterhub) NOPASSWD: /usr/local/bin/sudospawner' | sudo tee -a /etc/sudoers 54 | echo 'hublauncher ALL=(ALL) NOPASSWD: /usr/sbin/useradd' | sudo tee -a /etc/sudoers 55 | echo 'hublauncher ALL=(ALL) NOPASSWD: /bin/chown' | sudo tee -a /etc/sudoers 56 | echo 'hublauncher ALL=(ALL) NOPASSWD: /bin/mkdir' | sudo tee -a /etc/sudoers 57 | echo 'hublauncher ALL=(ALL) NOPASSWD: /bin/mv' | sudo tee -a /etc/sudoers 58 | echo 'hublauncher ALL=(hdfs) NOPASSWD: /usr/bin/hdfs' | sudo tee -a /etc/sudoers 59 | 60 | # Environment setup 61 | cat < /tmp/oauth_profile.sh 62 | export AWS_DNS_NAME=$(aws ec2 describe-network-interfaces --filters Name=private-ip-address,Values=$(hostname -i) | jq -r '.[] | .[] | .Association.PublicDnsName') 63 | export OAUTH_CALLBACK_URL=http://\$AWS_DNS_NAME:8000/hub/oauth_callback 64 | export OAUTH_CLIENT_ID=$OAUTH_CLIENT_ID 65 | export OAUTH_CLIENT_SECRET=$OAUTH_CLIENT_SECRET 66 | 67 | alias launch_hub='sudo -u hublauncher -E env "PATH=/usr/local/bin:$PATH" 
jupyterhub --JupyterHub.spawner_class=sudospawner.SudoSpawner --SudoSpawner.sudospawner_path=/usr/local/bin/sudospawner --Spawner.notebook_dir=/home/{username}' 68 | EOF 69 | sudo mv /tmp/oauth_profile.sh /etc/profile.d 70 | . /etc/profile.d/oauth_profile.sh 71 | 72 | # Setup required scripts/configurations for launching JupyterHub 73 | cat < /tmp/new_user 74 | #!/bin/bash 75 | 76 | export user=\$1 77 | 78 | sudo useradd -m -G jupyterhub,hadoop \$user 79 | sudo -u hdfs hdfs dfs -mkdir /user/\$user 80 | 81 | sudo mkdir -p /home/\$user/.jupyter/ 82 | 83 | cat << LOL > /tmp/jupyter_notebook_config.py.\$user 84 | from s3contents import S3ContentsManager 85 | 86 | c.NotebookApp.contents_manager_class = S3ContentsManager 87 | c.S3ContentsManager.bucket = "$NB_BUCKET" 88 | c.S3ContentsManager.prefix = "$NB_PREFIX" 89 | 90 | LOL 91 | 92 | sudo mv /tmp/jupyter_notebook_config.py.\$user /home/\$user/.jupyter/jupyter_notebook_config.py 93 | sudo chown \$user:\$user -R /home/\$user/.jupyter/ 94 | 95 | EOF 96 | chmod +x /tmp/new_user 97 | sudo chown root:root /tmp/new_user 98 | sudo mv /tmp/new_user /usr/local/bin 99 | 100 | cat < /tmp/jupyterhub_config.py 101 | from oauthenticator.$OAUTH_MODULE import $OAUTH_CLASS 102 | 103 | c = get_config() 104 | 105 | c.JupyterHub.authenticator_class = $OAUTH_CLASS 106 | c.${OAUTH_CLASS}.create_system_users = True 107 | 108 | c.JupyterHub.spawner_class='sudospawner.SudoSpawner' 109 | c.SudoSpawner.sudospawner_path='/usr/local/bin/sudospawner' 110 | c.Spawner.notebook_dir='/home/{username}' 111 | c.LocalAuthenticator.add_user_cmd = ['new_user'] 112 | 113 | EOF 114 | 115 | # Install GeoPySpark 116 | if [[ $GEOPYSPARKURI == s3* ]]; then 117 | aws s3 cp $GEOPYSPARKURI /tmp/geopyspark.zip 118 | GEOPYSPARKURI=/tmp/geopyspark.zip 119 | fi 120 | sudo -E env "PATH=/usr/local/bin:$PATH" pip3.4 install "$GEOPYSPARKURI" 121 | sudo -E env "PATH=/usr/local/bin:$PATH" jupyter nbextension enable --py widgetsnbextension --system 122 | sudo mkdir -p /opt/jars/ 123 | for url in $(echo $GEOPYSPARKJARS | tr , "\n") 124 | do 125 | if [[ $url == s3* ]]; then 126 | (cd /opt/jars ; sudo aws s3 cp $url . 
) 127 | else 128 | (cd /opt/jars ; sudo curl -L -O -C - $url ) 129 | fi 130 | done 131 | 132 | # Install GeoPySpark kernel 133 | cat < /tmp/kernel.json 134 | { 135 | "language": "python", 136 | "display_name": "GeoPySpark", 137 | "argv": [ 138 | "/usr/bin/python3.4", 139 | "-m", 140 | "ipykernel", 141 | "-f", 142 | "{connection_file}" 143 | ], 144 | "env": { 145 | "PYSPARK_PYTHON": "/usr/bin/python3.4", 146 | "PYSPARK_DRIVER_PYTHON": "/usr/bin/python3.4", 147 | "SPARK_HOME": "/usr/lib/spark", 148 | "PYTHONPATH": "/usr/lib/spark/python/lib/pyspark.zip:/usr/lib/spark/python/lib/py4j-0.10.6-src.zip", 149 | "GEOPYSPARK_JARS_PATH": "/opt/jars", 150 | "YARN_CONF_DIR": "/etc/hadoop/conf", 151 | "LD_LIBRARY_PATH": "/usr/local/lib:/usr/lib", 152 | "PYSPARK_SUBMIT_ARGS": "--conf spark.executorEnv.LD_LIBRARY_PATH=/usr/local/lib:/usr/lib --conf hadoop.yarn.timeline-service.enabled=false pyspark-shell" 153 | } 154 | } 155 | EOF 156 | sudo mkdir -p /usr/local/share/jupyter/kernels/geopyspark 157 | sudo cp /tmp/kernel.json /usr/local/share/jupyter/kernels/geopyspark/kernel.json 158 | rm -f /tmp/kernel.json 159 | 160 | # Execute 161 | cd /tmp 162 | sudo -u hublauncher -E env "PATH=/usr/local/bin:$PATH" jupyterhub --JupyterHub.spawner_class=sudospawner.SudoSpawner --SudoSpawner.sudospawner_path=/usr/local/bin/sudospawner --Spawner.notebook_dir=/home/{username} -f /tmp/jupyterhub_config.py & 163 | 164 | else 165 | # Download packages 166 | mkdir -p /tmp/blobs/ 167 | aws s3 sync $RPM_URI /tmp/blobs/ 168 | 169 | # Install packages 170 | (cd /tmp/blobs; sudo yum localinstall -y openjpeg230-2.3.0-33.x86_64.rpm gdal231-2.3.1-33.x86_64.rpm hdf5-1.8.20-33.x86_64.rpm netcdf-4.5.0-33.x86_64.rpm proj493-lib-4.9.3-33.x86_64.rpm) 171 | 172 | # Install Python packages 173 | sudo pip-3.4 install --upgrade pip 174 | sudo ln -s /usr/local/bin/pip3 /usr/bin/ 175 | sudo ln -s /usr/local/bin/pip3.4 /usr/bin/ 176 | (cd /tmp/blobs ; sudo pip3.4 install *.whl) 177 | 178 | # Install GeoPySpark 179 | if [[ $GEOPYSPARKURI == s3* ]]; then 180 | aws s3 cp $GEOPYSPARKURI /tmp/geopyspark.zip 181 | GEOPYSPARKURI=/tmp/geopyspark.zip 182 | fi 183 | sudo -E env "PATH=/usr/local/bin:$PATH" pip3.4 install "$GEOPYSPARKURI" 184 | 185 | # Linkage 186 | echo '/usr/local/lib' > /tmp/local.conf 187 | echo '/usr/local/lib64' >> /tmp/local.conf 188 | sudo cp /tmp/local.conf /etc/ld.so.conf.d/local.conf 189 | sudo ldconfig 190 | rm -f /tmp/local.conf 191 | fi 192 | -------------------------------------------------------------------------------- /terraform/cluster-configurations.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "Classification": "spark", 4 | "Properties": { 5 | "maximizeResourceAllocation": "true" 6 | } 7 | }, 8 | { 9 | "Classification": "spark-defaults", 10 | "Properties": { 11 | "spark.driver.maxResultSize": "3G", 12 | "spark.dynamicAllocation.enabled": "true", 13 | "spark.shuffle.service.enabled": "true", 14 | "spark.shuffle.compress": "true", 15 | "spark.shuffle.spill.compress": "true", 16 | "spark.rdd.compress": "true", 17 | "spark.yarn.executor.memoryOverhead": "1G", 18 | "spark.yarn.driver.memoryOverhead": "1G", 19 | "spark.driver.maxResultSize": "3G", 20 | "spark.executor.extraJavaOptions" : "-XX:+UseParallelGC -Dgeotrellis.s3.threads.rdd.write=64" 21 | } 22 | }, 23 | { 24 | "Classification": "yarn-site", 25 | "Properties": { 26 | "yarn.resourcemanager.am.max-attempts": "1", 27 | "yarn.nodemanager.vmem-check-enabled": "false", 28 | "yarn.nodemanager.pmem-check-enabled": 
"false" 29 | } 30 | } 31 | ] 32 | -------------------------------------------------------------------------------- /terraform/emr.tf: -------------------------------------------------------------------------------- 1 | resource "aws_emr_cluster" "emr-spark-cluster" { 2 | name = "GeoPySpark Cluster" 3 | applications = ["Hadoop", "Spark", "Ganglia"] 4 | log_uri = "${var.s3_log_uri}" 5 | release_label = "emr-5.13.0" 6 | service_role = "${var.emr_service_role}" 7 | 8 | ec2_attributes { 9 | # subnet_id = "subnet-xxxxxxxx" 10 | instance_profile = "${var.emr_instance_profile}" 11 | key_name = "${var.key_name}" 12 | 13 | emr_managed_master_security_group = "${aws_security_group.security-group.id}" 14 | emr_managed_slave_security_group = "${aws_security_group.security-group.id}" 15 | } 16 | 17 | instance_group { 18 | # bid_price = "${var.bid_price}" 19 | instance_count = 1 20 | instance_role = "MASTER" 21 | instance_type = "m3.xlarge" 22 | name = "geopyspark-master" 23 | } 24 | 25 | instance_group { 26 | bid_price = "${var.bid_price}" 27 | instance_count = "${var.worker_count}" 28 | instance_role = "CORE" 29 | instance_type = "m3.xlarge" 30 | name = "geopyspark-core" 31 | } 32 | 33 | bootstrap_action { 34 | path = "s3://${var.bs_bucket}/${var.bs_prefix}/bootstrap.sh" 35 | name = "geopyspark" 36 | args = [ 37 | "${var.s3_rpm_uri}", 38 | "${var.s3_notebook_uri}", 39 | "${var.jupyterhub_oauth_module}", 40 | "${var.jupyterhub_oauth_class}", 41 | "${var.oauth_client_id}", 42 | "${var.oauth_client_secret}", 43 | "${var.geopyspark_jars}", 44 | "${var.geopyspark_uri}" 45 | ] 46 | } 47 | 48 | configurations = "cluster-configurations.json" 49 | 50 | depends_on = ["aws_s3_bucket_object.bootstrap"] 51 | } 52 | 53 | output "emr-id" { 54 | value = "${aws_emr_cluster.emr-spark-cluster.id}" 55 | } 56 | 57 | output "emr-master" { 58 | value = "${aws_emr_cluster.emr-spark-cluster.master_public_dns}" 59 | } 60 | -------------------------------------------------------------------------------- /terraform/s3.tf: -------------------------------------------------------------------------------- 1 | # bootstrap.sh 2 | resource "aws_s3_bucket_object" "bootstrap" { 3 | bucket = "${var.bs_bucket}" 4 | key = "${var.bs_prefix}/bootstrap.sh" 5 | source = "./bootstrap.sh" 6 | etag = "${md5(file("./bootstrap.sh"))}" 7 | } 8 | -------------------------------------------------------------------------------- /terraform/security-group.tf: -------------------------------------------------------------------------------- 1 | resource "aws_security_group" "security-group" { 2 | ingress { 3 | from_port = 0 4 | to_port = 0 5 | protocol = "-1" 6 | self = true 7 | } 8 | 9 | ingress { 10 | from_port = 49152 11 | to_port = 65535 12 | protocol = "tcp" 13 | cidr_blocks = ["0.0.0.0/0"] 14 | description = "Allow TMS ports" 15 | } 16 | 17 | ingress { 18 | from_port = "22" 19 | to_port = "22" 20 | protocol = "tcp" 21 | cidr_blocks = ["0.0.0.0/0"] 22 | } 23 | 24 | ingress { 25 | from_port = "${var.jupyterhub_port}" 26 | to_port = "${var.jupyterhub_port}" 27 | protocol = "tcp" 28 | cidr_blocks = ["0.0.0.0/0"] 29 | } 30 | 31 | egress { 32 | from_port = 0 33 | to_port = 0 34 | protocol = "-1" 35 | cidr_blocks = ["0.0.0.0/0"] 36 | } 37 | 38 | lifecycle { 39 | create_before_destroy = true 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /terraform/variables.tf: -------------------------------------------------------------------------------- 1 | variable "region" { 2 | type = "string" 3 | description = "AWS 
Region" 4 | default = "us-east-1" 5 | } 6 | 7 | variable "key_name" { 8 | type = "string" 9 | description = "The name of the EC2 key pair (primarily for SSH access)" 10 | } 11 | 12 | variable "s3_log_uri" { 13 | type = "string" 14 | description = "Where EMR logs will be sent" 15 | } 16 | 17 | variable "ecs_ami" { 18 | type = "string" 19 | description = "AMI to use for the ECS Instance" 20 | default = "ami-9eb4b1e5" 21 | } 22 | 23 | variable "jupyterhub_port" { 24 | type = "string" 25 | description = "The port on which to connect to JupyterHub" 26 | default = "8000" 27 | } 28 | 29 | variable "worker_count" { 30 | type = "string" 31 | description = "The number of worker nodes" 32 | default = "1" 33 | } 34 | 35 | variable "emr_service_role" { 36 | type = "string" 37 | description = "EMR service role" 38 | default = "EMR_DefaultRole" 39 | } 40 | 41 | variable "emr_instance_profile" { 42 | type = "string" 43 | description = "EMR instance profile" 44 | default = "EMR_EC2_DefaultRole" 45 | } 46 | 47 | variable "ecs_instance_profile" { 48 | type = "string" 49 | description = "ECS instance profile" 50 | default = "ecsInstanceRole" 51 | } 52 | 53 | variable "bs_bucket" { 54 | type = "string" 55 | description = "S3 Bucket containing the bootstrap script (e.g. bucket if the whole path is s3://bucket/containing/bootstrap/bootstrap.sh)" 56 | } 57 | 58 | variable "bs_prefix" { 59 | type = "string" 60 | description = "The prefix of the bootstrap script within the s3 bucket (e.g. containing/bootstrap if the whole path is s3://bucket/containing/bootstrap/bootstrap.sh)" 61 | } 62 | 63 | variable "s3_rpm_uri" { 64 | type = "string" 65 | description = "S3 path containing RPMs (e.g. s3://bucket/containing/rpms/)" 66 | } 67 | 68 | variable "s3_notebook_uri" { 69 | type = "string" 70 | description = "S3 path for notebooks (e.g. s3://bucket/containing/notebooks/)" 71 | } 72 | 73 | variable "jupyterhub_oauth_module" { 74 | type = "string" 75 | description = "Name of the jupyterhub/oauthenticator module to import the jupyterhub_oauth_class from" 76 | default = "github" 77 | } 78 | 79 | variable "jupyterhub_oauth_class" { 80 | type = "string" 81 | description = "Name of the OAuth class provided by jupyterhub/oauthenticator" 82 | default = "LocalGitHubOAuthenticator" 83 | } 84 | 85 | variable "oauth_client_id" { 86 | type = "string" 87 | description = "Client ID token for OAuth server" 88 | } 89 | 90 | variable "oauth_client_secret" { 91 | type = "string" 92 | description = "Client secret token for OAuth server" 93 | } 94 | 95 | variable "geopyspark_jars" { 96 | type = "string" 97 | description = "Comma-separated list of URIs pointing to GeoPySpark jars" 98 | default = "s3://geopyspark-dependency-jars/geotrellis-backend-assembly-0.4.1.jar" 99 | } 100 | 101 | variable "geopyspark_uri" { 102 | type = "string" 103 | description = "URI from which the GeoPySpark Python code is to be obtained" 104 | default = "https://github.com/locationtech-labs/geopyspark/archive/d03d95fcd0e24cfca7df81fa56dcd84e30035a0f.zip" 105 | } 106 | 107 | variable "bid_price" { 108 | type = "string" 109 | description = "Spot bid price for the worker (CORE) instance group" 110 | default = "0.07" 111 | } 112 | --------------------------------------------------------------------------------
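
Usage note: the variables above that have no default (key_name, s3_log_uri, bs_bucket, bs_prefix, s3_rpm_uri, s3_notebook_uri, oauth_client_id, oauth_client_secret) must be supplied when the cluster is created, for example through a terraform.tfvars file in the terraform/ directory. The following is a minimal sketch only; every value is a hypothetical placeholder and not something shipped with this repository.

    # terraform.tfvars -- hypothetical example values only
    key_name            = "my-ec2-keypair"            # existing EC2 key pair used for SSH access
    s3_log_uri          = "s3://my-bucket/emr-logs/"  # destination for EMR cluster logs
    bs_bucket           = "my-bucket"                 # bucket that will receive bootstrap.sh (see s3.tf)
    bs_prefix           = "geopyspark/bootstrap"      # prefix under that bucket for bootstrap.sh
    s3_rpm_uri          = "s3://my-bucket/rpms/"      # S3 path holding the RPMs built under rpms/build
    s3_notebook_uri     = "s3://my-bucket/notebooks/" # S3 path used for notebook storage
    oauth_client_id     = "<github-oauth-client-id>"      # OAuth application credentials for JupyterHub
    oauth_client_secret = "<github-oauth-client-secret>"
    worker_count        = "4"                         # optional: overrides the default of one CORE node

    # Then, from the terraform/ directory:
    #   terraform init && terraform apply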