├── deploy ├── .gitignore ├── secrets.yaml.example ├── README.md ├── Makefile ├── config.yaml ├── manual.yaml └── node.yaml ├── .flake8 ├── data-catalog.png ├── pyproject.toml ├── .pre-commit-config.yaml ├── README.md ├── hurricane-florence-animation.ipynb ├── introduction.ipynb └── crop-prediction.ipynb /deploy/.gitignore: -------------------------------------------------------------------------------- 1 | secrets.yaml 2 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E402,W503 3 | max-line-length = 90 4 | -------------------------------------------------------------------------------- /data-catalog.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TomAugspurger/pc-cng-outreach-2022/HEAD/data-catalog.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.nbqa.addopts] 2 | flake8 = [ 3 | "--max-line-length=94" 4 | ] 5 | 6 | [tool.nbqa.exclude] 7 | black = "reading-stac-r.ipynb" 8 | flake8 = "reading-stac-r.ipynb" 9 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/nbQA-dev/nbQA 3 | rev: 0.12.0 4 | hooks: 5 | - id: nbqa-black 6 | additional_dependencies: [black==22.3.0] 7 | args: [--nbqa-mutate] 8 | - id: nbqa-flake8 9 | additional_dependencies: [flake8==3.9.2] 10 | -------------------------------------------------------------------------------- /deploy/secrets.yaml.example: -------------------------------------------------------------------------------- 1 | jupyterhub: 2 | hub: 3 | services: 4 | dask-gateway: 5 | # generate with 
openssl rand -hex 32 6 | apiToken: "" 7 | 8 | config: 9 | DummyAuthenticator: 10 | # You probably should use a real authenticator. 11 | password: "" 12 | 13 | dask-gateway: 14 | gateway: 15 | auth: 16 | jupyterhub: 17 | # This should match the apiToken from above. 18 | apiToken: "" 19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Planetary Computer @ Cloud Native Geospatial Outreach Event 2022 2 | 3 | This tutorial was presented at the 2022 Cloud Native Geospatial Outreach event. 4 | 5 | It collects examples from various places. Visit https://planetarycomputer.microsoft.com/docs and https://github.com/microsoft/planetarycomputerexamples for up-to-date materials. 6 | 7 | 1. introduction.ipynb 8 | 2. data-access.ipynb 9 | 3. hurricane-florence-animation.ipynb 10 | 4. crop-prediction.ipynb 11 | 12 | ### Backup Binder 13 | 14 | If the link above isn't working, you can launch this notebook on [mybinder.org](https://mybinder.org/). 15 | Note that the Jupyter kernel will *not* be running in Azure, so read operations will be slower. 16 | 17 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/TomAugspurger/pc-binder-python/main?urlpath=git-pull%3Frepo%3Dhttps%253A%252F%252Fgithub.com%252FTomAugspurger%252Fpc-cng-outreach-2022%26urlpath%3Dlab%252Ftree%252Fpc-cng-outreach-2022%252Fintroduction.ipynb%26branch%3Dmain) 18 | 19 | ### Deployment 20 | 21 | This repository also contains the code to deploy the hub. See [deploy/README.md](deploy/README.md) for more. 22 | 23 | [cng]: ... 24 | [hub]: ... 25 | 26 | -------------------------------------------------------------------------------- /deploy/README.md: -------------------------------------------------------------------------------- 1 | # Deploy 2 | 3 | Deploying a pangeo / Planetary-Computer-style JupyterHub on Azure. 
4 | This is a mostly standard [daskhub](https://github.com/dask/helm-chart/tree/main/daskhub) deployment. 5 | 6 | ## Prerequisites 7 | 8 | * An Azure subscription 9 | * [Helm](https://helm.sh/) 10 | * A `secrets.yaml` file filled in. Use `secrets.yaml.example` as a template. 11 | 12 | ## Authentication 13 | 14 | We used the `dummy` authenticator for the workshop. You'll probably want to use one of JupyterHub's [real authenticators](https://jupyterhub.readthedocs.io/en/stable/reference/authenticators.html). 15 | 16 | ## Helm configuration 17 | 18 | There are a few Azure-specific things in the configuration: 19 | 20 | * `jupyterhub.proxy.service.annotations.service.beta.kubernetes.io/azure-dns-label-name`: Set this if you want to use AKS's automatic domain name feature. Otherwise, just delete it. 21 | * `jupyterhub.proxy.https.hosts`: Set this to your hub URL 22 | 23 | ## Deployment 24 | 25 | ``` 26 | $ make group 27 | $ make cluster 28 | $ make hub 29 | $ make userpools 30 | $ NODE_COUNT=1 make scale # your number of users 31 | ``` 32 | 33 | That'll get you a multi-user, Dask-enabled hub up and running in 10-15 minutes. 34 | 35 | ## Capacity notes 36 | 37 | We're assuming ~100 users for the tutorial. We're using a `Standard_D8s_v3` for the user pool, and assigning two users per node (4 CPU, 16 GiB of RAM). 
-------------------------------------------------------------------------------- /deploy/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: group cluster hub clean storage acr image 2 | 3 | LOCATION?=westeurope 4 | GROUP?=rg-pc-cng 5 | CLUSTER?=cngcluster 6 | ACR_NAME?=cngacr 7 | STORAGE_ACCOUNT?=$(GROUP) 8 | STORAGE_CONTAINER?=$(GROUP) 9 | MAX_USER_NODE_COUNT?=500 10 | # 00 11 | NODE_COUNT?=0 12 | SUBSCRIPTION="Planetary Computer" 13 | 14 | group: 15 | az group create --name $(GROUP) --location $(LOCATION) 16 | 17 | cluster: 18 | az aks create --resource-group $(GROUP) --name $(CLUSTER) \ 19 | --generate-ssh-keys \ 20 | --node-count=1 \ 21 | --nodepool-name core \ 22 | --nodepool-labels hub.jupyter.org/node-purpose=core 23 | az aks get-credentials --name $(CLUSTER) --resource-group $(GROUP) 24 | az aks nodepool add \ 25 | --name users \ 26 | --cluster-name $(CLUSTER) \ 27 | --resource-group $(GROUP) \ 28 | --enable-cluster-autoscaler \ 29 | --node-count 1 \ 30 | --min-count 0 --max-count $(MAX_USER_NODE_COUNT) \ 31 | --node-vm-size Standard_D8s_v3 \ 32 | --labels hub.jupyter.org/node-purpose=user 33 | az aks nodepool add \ 34 | --name manualusers \ 35 | --cluster-name $(CLUSTER) \ 36 | --resource-group $(GROUP) \ 37 | --node-count 0 \ 38 | --node-vm-size Standard_D8s_v3 \ 39 | --labels hub.jupyter.org/node-purpose=user 40 | az aks nodepool add \ 41 | --name workers \ 42 | --cluster-name $(CLUSTER) \ 43 | --resource-group $(GROUP) \ 44 | --enable-cluster-autoscaler \ 45 | --node-count 1 \ 46 | --min-count 0 --max-count 200 \ 47 | --node-vm-size Standard_E16_v3 \ 48 | --priority Spot \ 49 | --eviction-policy Delete \ 50 | --spot-max-price -1 \ 51 | --labels="k8s.dask.org/dedicated=worker" 52 | 53 | hub: 54 | helm upgrade --wait --install --create-namespace \ 55 | dask dask/daskhub \ 56 | --version=2021.7.0 \ 57 | --namespace=dhub \ 58 | --values=config.yaml \ 59 | --values=secrets.yaml \ 60 | 
--kube-context=$(CLUSTER) 61 | 62 | scale: 63 | az aks nodepool scale \ 64 | --name=manualusers \ 65 | --cluster-name=$(CLUSTER) \ 66 | --resource-group=$(GROUP) \ 67 | --node-count=$(NODE_COUNT) 68 | 69 | clean: 70 | az group delete -n $(GROUP) 71 | 72 | acr: 73 | # az acr create --resource-group=$(GROUP) --name=$(ACR_NAME) --sku=Standard --location=$(LOCATION) 74 | az aks update -g $(GROUP) -n $(CLUSTER) --attach-acr $(ACR_NAME) 75 | 76 | image: 77 | az acr build -g $(GROUP) -r $(ACR_NAME) -t pccng:latest .. -f ../binder/Dockerfile 78 | -------------------------------------------------------------------------------- /deploy/config.yaml: -------------------------------------------------------------------------------- 1 | jupyterhub: 2 | proxy: 3 | https: 4 | enabled: true 5 | hosts: 6 | - "pc-cng.westeurope.cloudapp.azure.com" 7 | letsencrypt: 8 | contactEmail: "taugspurger@microsoft.com" 9 | service: 10 | annotations: 11 | # Update this with your hub's name. 12 | service.beta.kubernetes.io/azure-dns-label-name: "pc-cng" 13 | 14 | hub: 15 | # Disable hub network Policy, so that the dask gateway server API can reach the hub directly 16 | # Not required for dask-gateway>0.9.0 17 | # https://github.com/dask/helm-chart/issues/142 18 | networkPolicy: 19 | enabled: false 20 | 21 | # dask-gateway service added in secrets.yaml 22 | 23 | config: 24 | JupyterHub: 25 | authenticator_class: dummy 26 | # password is set in secrets.yaml 27 | 28 | prePuller: 29 | continuous: 30 | enabled: true 31 | 32 | scheduling: 33 | userPlaceholder: 34 | enabled: true 35 | podPriority: 36 | enabled: true 37 | userPlaceholder: 38 | enabled: true 39 | replicas: 4 40 | 41 | singleuser: 42 | image: 43 | name: "cngacr.azurecr.io/pccng" 44 | tag: "latest" 45 | startTimeout: 1200 # 20 * 60s = 20 minutes 46 | cpu: 47 | guarantee: 1.65 # below 3.78 48 | # guarantee: 1.65 # below 3.78 49 | limit: 4 50 | memory: 51 | guarantee: "6.5G" 52 | limit: "7.5G" 53 | 54 | storage: 55 | capacity: "15Gi" 56 | 57 
| defaultUrl: "/lab/tree/pc-cng-outreach-2022/introduction.ipynb" 58 | lifecycleHooks: 59 | postStart: 60 | exec: 61 | command: 62 | [ 63 | "/srv/conda/envs/notebook/bin/gitpuller", 64 | "https://github.com/TomAugspurger/pc-cng-outreach-2022", 65 | "main", 66 | "pc-cng-outreach-2022", 67 | ] 68 | 69 | extraEnv: 70 | DASK_GATEWAY__CLUSTER__OPTIONS__IMAGE: '{JUPYTER_IMAGE_SPEC}' 71 | DASK_DISTRIBUTED__DASHBOARD__LINK: '/user/{JUPYTERHUB_USER}/proxy/{port}/status' 72 | DASK_LABEXTENSION__FACTORY__MODULE: 'dask_gateway' 73 | DASK_LABEXTENSION__FACTORY__CLASS: 'GatewayCluster' 74 | # GDAL / Rasterio environment variables for performance 75 | GDAL_DISABLE_READDIR_ON_OPEN: "EMPTY_DIR" 76 | GDAL_HTTP_MERGE_CONSECUTIVE_RANGES: "YES" 77 | GDAL_HTTP_MAX_RETRY: "5" 78 | 79 | dask-gateway: 80 | gateway: 81 | # auth set in secrets.yaml 82 | backend: 83 | worker: 84 | # Ensure workers are scheduled on the worker pool 85 | extraPodConfig: 86 | affinity: 87 | nodeAffinity: 88 | requiredDuringSchedulingIgnoredDuringExecution: 89 | nodeSelectorTerms: 90 | - matchExpressions: 91 | - key: "k8s.dask.org/dedicated" 92 | operator: "In" 93 | values: 94 | - "worker" 95 | 96 | tolerations: 97 | # allow workers to be scheduled on the worker pool, which has preemptible nodes. 
98 | - key: "k8s.dask.org/dedicated" 99 | operator: "Equal" 100 | value: "worker" 101 | effect: "NoSchedule" 102 | - key: "k8s.dask.org_dedicated" 103 | operator: "Equal" 104 | value: "worker" 105 | effect: "NoSchedule" 106 | - key: "kubernetes.azure.com/scalesetpriority" 107 | operator: "Equal" 108 | value: "spot" 109 | effect: "NoSchedule" 110 | 111 | extraConfig: 112 | 00-clusterconfig: | 113 | c.KubeClusterConfig.idle_timeout = 10 * 60 # in seconds 114 | c.KubeClusterConfig.cluster_max_cores = 400 # 50 nodes @ 8 workers / node, 1 core / worker 115 | c.KubeClusterConfig.cluster_max_memory = "3200 G" # 8 GiB / core 116 | c.KubeClusterConfig.cluster_max_workers = 400 # 1 core, 8 GiB / worker 117 | 118 | 01-optionHandler: | 119 | # Configure options to 120 | # 1. Have the default worker image match the singleuser image 121 | # 2. Place bounds on worker CPU and Memory requests 122 | # 3. Accept a mapping of environment variables to pass to workers. 123 | from dask_gateway_server.options import Options, Float, String, Mapping 124 | def cluster_options(user): 125 | def option_handler(options): 126 | if ":" not in options.image: 127 | raise ValueError("When specifying an image you must also provide a tag") 128 | 129 | return { 130 | "worker_cores": 0.88 * min(options.worker_cores / 2, 1), 131 | "worker_cores_limit": options.worker_cores, 132 | "worker_memory": "%fG" % (0.9 * options.worker_memory), 133 | "worker_memory_limit": "%fG" % options.worker_memory, 134 | "image": options.image, 135 | "environment": options.environment, 136 | } 137 | return Options( 138 | Float("worker_cores", 1, min=1, max=16, label="Worker Cores"), 139 | Float("worker_memory", 8, min=8, max=128, label="Worker Memory (GiB)"), 140 | String("image", default="pangeo/pangeo-notebook:latest", label="Image"), 141 | Mapping("environment", {}, label="Environment Variables"), 142 | handler=option_handler, 143 | ) 144 | c.Backend.cluster_options = cluster_options 145 | 
-------------------------------------------------------------------------------- /deploy/manual.yaml: -------------------------------------------------------------------------------- 1 | Name: aks-users-14533555-vmss000006 2 | Roles: agent 3 | Labels: agentpool=users 4 | beta.kubernetes.io/arch=amd64 5 | beta.kubernetes.io/instance-type=Standard_D8s_v3 6 | beta.kubernetes.io/os=linux 7 | failure-domain.beta.kubernetes.io/region=westeurope 8 | failure-domain.beta.kubernetes.io/zone=0 9 | hub.jupyter.org/node-purpose=user 10 | kubernetes.azure.com/agentpool=users 11 | kubernetes.azure.com/cluster=MC_rg-pc-cng_cngcluster_westeurope 12 | kubernetes.azure.com/mode=user 13 | kubernetes.azure.com/node-image-version=AKSUbuntu-1804gen2containerd-2022.03.29 14 | kubernetes.azure.com/os-sku=Ubuntu 15 | kubernetes.azure.com/role=agent 16 | kubernetes.io/arch=amd64 17 | kubernetes.io/hostname=aks-users-14533555-vmss000006 18 | kubernetes.io/os=linux 19 | kubernetes.io/role=agent 20 | node-role.kubernetes.io/agent= 21 | node.kubernetes.io/instance-type=Standard_D8s_v3 22 | topology.disk.csi.azure.com/zone= 23 | topology.kubernetes.io/region=westeurope 24 | topology.kubernetes.io/zone=0 25 | Annotations: csi.volume.kubernetes.io/nodeid: {"disk.csi.azure.com":"aks-users-14533555-vmss000006","file.csi.azure.com":"aks-users-14533555-vmss000006"} 26 | node.alpha.kubernetes.io/ttl: 0 27 | volumes.kubernetes.io/controller-managed-attach-detach: true 28 | CreationTimestamp: Tue, 19 Apr 2022 10:11:02 -0700 29 | Taints: 30 | Unschedulable: false 31 | Lease: 32 | HolderIdentity: aks-users-14533555-vmss000006 33 | AcquireTime: 34 | RenewTime: Tue, 19 Apr 2022 10:17:51 -0700 35 | Conditions: 36 | Type Status LastHeartbeatTime LastTransitionTime Reason Message 37 | ---- ------ ----------------- ------------------ ------ ------- 38 | NetworkUnavailable False Tue, 19 Apr 2022 10:11:38 -0700 Tue, 19 Apr 2022 10:11:38 -0700 RouteCreated RouteController created a route 39 | MemoryPressure False 
Tue, 19 Apr 2022 10:15:42 -0700 Tue, 19 Apr 2022 10:11:02 -0700 KubeletHasSufficientMemory kubelet has sufficient memory available 40 | DiskPressure False Tue, 19 Apr 2022 10:15:42 -0700 Tue, 19 Apr 2022 10:11:02 -0700 KubeletHasNoDiskPressure kubelet has no disk pressure 41 | PIDPressure False Tue, 19 Apr 2022 10:15:42 -0700 Tue, 19 Apr 2022 10:11:02 -0700 KubeletHasSufficientPID kubelet has sufficient PID available 42 | Ready True Tue, 19 Apr 2022 10:15:42 -0700 Tue, 19 Apr 2022 10:11:12 -0700 KubeletReady kubelet is posting ready status. AppArmor enabled 43 | Addresses: 44 | Hostname: aks-users-14533555-vmss000006 45 | InternalIP: 10.240.0.7 46 | Capacity: 47 | attachable-volumes-azure-disk: 16 48 | cpu: 8 49 | ephemeral-storage: 129900528Ki 50 | hugepages-1Gi: 0 51 | hugepages-2Mi: 0 52 | memory: 32882856Ki 53 | pods: 110 54 | Allocatable: 55 | attachable-volumes-azure-disk: 16 56 | cpu: 7820m 57 | ephemeral-storage: 119716326407 58 | hugepages-1Gi: 0 59 | hugepages-2Mi: 0 60 | memory: 28382376Ki 61 | pods: 110 62 | System Info: 63 | Machine ID: f0491616424f42eab7691122b59729b5 64 | System UUID: 6b280cc2-54b8-4059-aae0-70bc7c4f68b5 65 | Boot ID: 46e1b8b0-6cd2-46b9-b83a-d859ae571d18 66 | Kernel Version: 5.4.0-1073-azure 67 | OS Image: Ubuntu 18.04.6 LTS 68 | Operating System: linux 69 | Architecture: amd64 70 | Container Runtime Version: containerd://1.4.12+azure-3 71 | Kubelet Version: v1.21.9 72 | Kube-Proxy Version: v1.21.9 73 | PodCIDR: 10.244.4.0/24 74 | PodCIDRs: 10.244.4.0/24 75 | ProviderID: azure:///subscriptions/9da7523a-cb61-4c3e-b1d4-afa5fc6d2da9/resourceGroups/mc_rg-pc-cng_cngcluster_westeurope/providers/Microsoft.Compute/virtualMachineScaleSets/aks-users-14533555-vmss/virtualMachines/6 76 | Non-terminated Pods: (7 in total) 77 | Namespace Name CPU Requests CPU Limits Memory Requests Memory Limits AGE 78 | --------- ---- ------------ ---------- --------------- ------------- --- 79 | dhub continuous-image-puller-sjgbk 0 (0%) 0 (0%) 0 (0%) 0 (0%) 
6m46s 80 | dhub jupyter-taugspurger 1650m (21%) 4 (51%) 7301444403 (25%) 8053063680 (27%) 2m21s 81 | dhub user-placeholder-0 1650m (21%) 4 (51%) 6800M (23%) 7500M (25%) 6m17s 82 | kube-system azure-ip-masq-agent-lw66b 100m (1%) 500m (6%) 50Mi (0%) 250Mi (0%) 6m56s 83 | kube-system csi-azuredisk-node-wzkmr 30m (0%) 0 (0%) 60Mi (0%) 400Mi (1%) 6m56s 84 | kube-system csi-azurefile-node-wzrqc 30m (0%) 0 (0%) 60Mi (0%) 500Mi (1%) 6m56s 85 | kube-system kube-proxy-tfdgv 100m (1%) 0 (0%) 0 (0%) 0 (0%) 6m56s 86 | Allocated resources: 87 | (Total limits may be over 100 percent, i.e., overcommitted.) 88 | Resource Requests Limits 89 | -------- -------- ------ 90 | cpu 3560m (45%) 8500m (108%) 91 | memory 14279702323 (49%) 16758926080 (57%) 92 | ephemeral-storage 0 (0%) 0 (0%) 93 | hugepages-1Gi 0 (0%) 0 (0%) 94 | hugepages-2Mi 0 (0%) 0 (0%) 95 | attachable-volumes-azure-disk 0 0 96 | Events: 97 | Type Reason Age From Message 98 | ---- ------ ---- ---- ------- 99 | Normal Starting 6m56s kubelet Starting kubelet. 100 | Warning InvalidDiskCapacity 6m56s kubelet invalid capacity 0 on image filesystem 101 | Normal NodeHasSufficientMemory 6m56s (x2 over 6m56s) kubelet Node aks-users-14533555-vmss000006 status is now: NodeHasSufficientMemory 102 | Normal NodeHasNoDiskPressure 6m56s (x2 over 6m56s) kubelet Node aks-users-14533555-vmss000006 status is now: NodeHasNoDiskPressure 103 | Normal NodeHasSufficientPID 6m56s (x2 over 6m56s) kubelet Node aks-users-14533555-vmss000006 status is now: NodeHasSufficientPID 104 | Normal NodeAllocatableEnforced 6m56s kubelet Updated Node Allocatable limit across pods 105 | Normal Starting 6m50s kube-proxy Starting kube-proxy. 
106 | Normal NodeReady 6m46s kubelet Node aks-users-14533555-vmss000006 status is now: NodeReady 107 | -------------------------------------------------------------------------------- /deploy/node.yaml: -------------------------------------------------------------------------------- 1 | Name: aks-users-14533555-vmss000005 2 | Roles: agent 3 | Labels: agentpool=users 4 | beta.kubernetes.io/arch=amd64 5 | beta.kubernetes.io/instance-type=Standard_D8s_v3 6 | beta.kubernetes.io/os=linux 7 | failure-domain.beta.kubernetes.io/region=westeurope 8 | failure-domain.beta.kubernetes.io/zone=0 9 | hub.jupyter.org/node-purpose=user 10 | kubernetes.azure.com/agentpool=users 11 | kubernetes.azure.com/cluster=MC_rg-pc-cng_cngcluster_westeurope 12 | kubernetes.azure.com/mode=user 13 | kubernetes.azure.com/node-image-version=AKSUbuntu-1804gen2containerd-2022.03.29 14 | kubernetes.azure.com/os-sku=Ubuntu 15 | kubernetes.azure.com/role=agent 16 | kubernetes.io/arch=amd64 17 | kubernetes.io/hostname=aks-users-14533555-vmss000005 18 | kubernetes.io/os=linux 19 | kubernetes.io/role=agent 20 | node-role.kubernetes.io/agent= 21 | node.kubernetes.io/instance-type=Standard_D8s_v3 22 | topology.disk.csi.azure.com/zone= 23 | topology.kubernetes.io/region=westeurope 24 | topology.kubernetes.io/zone=0 25 | Annotations: csi.volume.kubernetes.io/nodeid: {"disk.csi.azure.com":"aks-users-14533555-vmss000005","file.csi.azure.com":"aks-users-14533555-vmss000005"} 26 | node.alpha.kubernetes.io/ttl: 0 27 | volumes.kubernetes.io/controller-managed-attach-detach: true 28 | CreationTimestamp: Tue, 19 Apr 2022 08:57:15 -0700 29 | Taints: 30 | Unschedulable: false 31 | Lease: 32 | HolderIdentity: aks-users-14533555-vmss000005 33 | AcquireTime: 34 | RenewTime: Tue, 19 Apr 2022 09:12:43 -0700 35 | Conditions: 36 | Type Status LastHeartbeatTime LastTransitionTime Reason Message 37 | ---- ------ ----------------- ------------------ ------ ------- 38 | NetworkUnavailable False Tue, 19 Apr 2022 08:57:38 -0700 Tue, 
19 Apr 2022 08:57:38 -0700 RouteCreated RouteController created a route 39 | MemoryPressure False Tue, 19 Apr 2022 09:08:45 -0700 Tue, 19 Apr 2022 08:57:15 -0700 KubeletHasSufficientMemory kubelet has sufficient memory available 40 | DiskPressure False Tue, 19 Apr 2022 09:08:45 -0700 Tue, 19 Apr 2022 08:57:15 -0700 KubeletHasNoDiskPressure kubelet has no disk pressure 41 | PIDPressure False Tue, 19 Apr 2022 09:08:45 -0700 Tue, 19 Apr 2022 08:57:15 -0700 KubeletHasSufficientPID kubelet has sufficient PID available 42 | Ready True Tue, 19 Apr 2022 09:08:45 -0700 Tue, 19 Apr 2022 08:57:25 -0700 KubeletReady kubelet is posting ready status. AppArmor enabled 43 | Addresses: 44 | Hostname: aks-users-14533555-vmss000005 45 | InternalIP: 10.240.0.6 46 | Capacity: 47 | attachable-volumes-azure-disk: 16 48 | cpu: 8 49 | ephemeral-storage: 129900528Ki 50 | hugepages-1Gi: 0 51 | hugepages-2Mi: 0 52 | memory: 32882856Ki 53 | pods: 110 54 | Allocatable: 55 | attachable-volumes-azure-disk: 16 56 | cpu: 7820m 57 | ephemeral-storage: 119716326407 58 | hugepages-1Gi: 0 59 | hugepages-2Mi: 0 60 | memory: 28382376Ki 61 | pods: 110 62 | System Info: 63 | Machine ID: 5ecbcce2460741b1a64f722b13b184fe 64 | System UUID: f297e343-c2c9-4ae6-add8-41c295c3ff95 65 | Boot ID: 20b9e004-70b0-42a4-a8a9-efe1ec208cc3 66 | Kernel Version: 5.4.0-1073-azure 67 | OS Image: Ubuntu 18.04.6 LTS 68 | Operating System: linux 69 | Architecture: amd64 70 | Container Runtime Version: containerd://1.4.12+azure-3 71 | Kubelet Version: v1.21.9 72 | Kube-Proxy Version: v1.21.9 73 | PodCIDR: 10.244.2.0/24 74 | PodCIDRs: 10.244.2.0/24 75 | ProviderID: azure:///subscriptions/9da7523a-cb61-4c3e-b1d4-afa5fc6d2da9/resourceGroups/mc_rg-pc-cng_cngcluster_westeurope/providers/Microsoft.Compute/virtualMachineScaleSets/aks-users-14533555-vmss/virtualMachines/5 76 | Non-terminated Pods: (9 in total) 77 | Namespace Name CPU Requests CPU Limits Memory Requests Memory Limits AGE 78 | --------- ---- ------------ ---------- 
--------------- ------------- --- 79 | dhub continuous-image-puller-z4fcr 0 (0%) 0 (0%) 0 (0%) 0 (0%) 15m 80 | dhub user-placeholder-0 1650m (21%) 4 (51%) 7050000Ki (24%) 14104148Ki (49%) 92s 81 | dhub user-placeholder-1 1650m (21%) 4 (51%) 7050000Ki (24%) 14104148Ki (49%) 86s 82 | dhub user-placeholder-2 1650m (21%) 4 (51%) 7050000Ki (24%) 14104148Ki (49%) 100s 83 | dhub user-placeholder-3 1650m (21%) 4 (51%) 7050000Ki (24%) 14104148Ki (49%) 115s 84 | kube-system azure-ip-masq-agent-5rr5w 100m (1%) 500m (6%) 50Mi (0%) 250Mi (0%) 15m 85 | kube-system csi-azuredisk-node-lx9sr 30m (0%) 0 (0%) 60Mi (0%) 400Mi (1%) 15m 86 | kube-system csi-azurefile-node-w468x 30m (0%) 0 (0%) 60Mi (0%) 500Mi (1%) 15m 87 | kube-system kube-proxy-xv8fp 100m (1%) 0 (0%) 0 (0%) 0 (0%) 15m 88 | Allocated resources: 89 | (Total limits may be over 100 percent, i.e., overcommitted.) 90 | Resource Requests Limits 91 | -------- -------- ------ 92 | cpu 6860m (87%) 16500m (210%) 93 | memory 28374080Ki (99%) 57594192Ki (202%) 94 | ephemeral-storage 0 (0%) 0 (0%) 95 | hugepages-1Gi 0 (0%) 0 (0%) 96 | hugepages-2Mi 0 (0%) 0 (0%) 97 | attachable-volumes-azure-disk 0 0 98 | Events: 99 | Type Reason Age From Message 100 | ---- ------ ---- ---- ------- 101 | Normal Starting 15m kubelet Starting kubelet. 102 | Warning InvalidDiskCapacity 15m kubelet invalid capacity 0 on image filesystem 103 | Normal NodeHasSufficientMemory 15m (x2 over 15m) kubelet Node aks-users-14533555-vmss000005 status is now: NodeHasSufficientMemory 104 | Normal NodeHasNoDiskPressure 15m (x2 over 15m) kubelet Node aks-users-14533555-vmss000005 status is now: NodeHasNoDiskPressure 105 | Normal NodeHasSufficientPID 15m (x2 over 15m) kubelet Node aks-users-14533555-vmss000005 status is now: NodeHasSufficientPID 106 | Normal NodeAllocatableEnforced 15m kubelet Updated Node Allocatable limit across pods 107 | Normal Starting 15m kube-proxy Starting kube-proxy. 
108 | Normal NodeReady 15m kubelet Node aks-users-14533555-vmss000005 status is now: NodeReady 109 | -------------------------------------------------------------------------------- /hurricane-florence-animation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "d9ff4103", 6 | "metadata": {}, 7 | "source": [ 8 | "## Visualizing Hurricane Florence\n", 9 | "\n", 10 | "This example makes a true-color animation of Hurricane Florence by stitching together images from GOES. It builds off this example from [pytroll-examples](https://github.com/pytroll/pytroll-examples/blob/main/satpy/GOES-16%20ABI%20-%20True%20Color%20Animation%20-%20Hurricane%20Florence.ipynb). You can see the output of that example [here](https://twitter.com/PyTrollOrg/status/1039555399433834497).\n", 11 | "\n", 12 | "Here's what our final animation will look like:\n", 13 | "\n", 14 | "" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "id": "d48a3b9f", 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import urllib.request\n", 26 | "\n", 27 | "import contextily as ctx\n", 28 | "import geopandas\n", 29 | "import matplotlib.animation as animation\n", 30 | "import matplotlib.pyplot as plt\n", 31 | "import numpy as np\n", 32 | "import pandas as pd\n", 33 | "import planetary_computer\n", 34 | "import pystac_client\n", 35 | "import rioxarray\n", 36 | "import xarray as xr" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "id": "6217412c", 42 | "metadata": {}, 43 | "source": [ 44 | "### Find the storm\n", 45 | "\n", 46 | "First, we need to find where and when on earth the storm was. The NCEI [International Best Track Archive for Climate Stewardship](https://www.ncei.noaa.gov/access/metadata/landing-page/bin/iso?id=gov.noaa.ncdc:C00834) provides files with all the information we need." 
47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "id": "a9601ba0", 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "file, _ = urllib.request.urlretrieve(\n", 57 | " \"https://www.ncei.noaa.gov/data/international-best-track-archive-for-\"\n", 58 | " \"climate-stewardship-ibtracs/v04r00/access/netcdf/IBTrACS.NA.v04r00.nc\"\n", 59 | ")\n", 60 | "# The storm id comes from the text file in\n", 61 | "# https://www.ncei.noaa.gov/data/international-best-track-archive-for-climate-stewardship-ibtracs\n", 62 | "# /v04r00/access/netcdf/\n", 63 | "# The name of this file changes with the update date, so we can't access it programmatically.\n", 64 | "STORM_ID = b\"2018242N13343\"\n", 65 | "ds = xr.open_dataset(file)\n", 66 | "storm_loc = (ds.sid == STORM_ID).argmax().item()\n", 67 | "\n", 68 | "data = ds.sel(storm=storm_loc)\n", 69 | "geometry = geopandas.points_from_xy(data.lon, data.lat)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "id": "e5b46235", 75 | "metadata": {}, 76 | "source": [ 77 | "`geometry` is a geopandas GeometryArray with points tracking the location of the storm over time. We'll match those up with the timestamps to plot the storm's trajectory. We'll also overlay the time period covered by our animation in red." 
78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "id": "3b5ada5a", 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "df = (\n", 88 | " geopandas.GeoDataFrame(\n", 89 | " dict(\n", 90 | " time=pd.to_datetime(data.time).tz_localize(\"UTC\"),\n", 91 | " geometry=geopandas.points_from_xy(data.lon, data.lat),\n", 92 | " )\n", 93 | " )\n", 94 | " .set_crs(4326)\n", 95 | " .dropna()\n", 96 | ")\n", 97 | "\n", 98 | "start = pd.Timestamp(\"2018-09-11T13:00:00Z\")\n", 99 | "stop = pd.Timestamp(\"2018-09-11T15:40:00Z\")" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "id": "cbd23142", 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "ax = df.to_crs(epsg=3857).plot(figsize=(12, 12))\n", 110 | "subset = df[df.time.dt.date == start.date()]\n", 111 | "subset.to_crs(epsg=3857).plot(ax=ax, color=\"r\")\n", 112 | "\n", 113 | "ctx.add_basemap(ax)\n", 114 | "ax.set_axis_off()\n", 115 | "ax.set(title=\"Path of Hurricane Florence (animation period in red)\");" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "id": "0ca49d78", 121 | "metadata": {}, 122 | "source": [ 123 | "Let's save the bounding box for the subset of points we're animating. We'll use it in our query later on." 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "id": "fc950ea3", 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "bbox = list(subset.total_bounds)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "id": "1330e644", 139 | "metadata": {}, 140 | "source": [ 141 | "### Get the imagery\n", 142 | "\n", 143 | "Now we'll get the GOES imagery using the Planetary Computer's STAC API. We'll use the `goes-cmi` collection. We'll also have the API filter down the images to just the \"mesoscale\" images (GOES takes images with various fields of view)." 
144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "id": "4df7dad0", 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "catalog = pystac_client.Client.open(\n", 154 | " \"https://planetarycomputer.microsoft.com/api/stac/v1/\"\n", 155 | ")\n", 156 | "search = catalog.search(\n", 157 | " collections=[\"goes-cmi\"],\n", 158 | " bbox=bbox,\n", 159 | " datetime=[start, stop],\n", 160 | " limit=500,\n", 161 | " query={\"goes:image-type\": {\"eq\": \"MESOSCALE\"}},\n", 162 | ")\n", 163 | "items = search.get_all_items()\n", 164 | "signed_items = sorted(\n", 165 | " [planetary_computer.sign(item) for item in items], key=lambda x: x.datetime\n", 166 | ")" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "id": "333e4d59", 172 | "metadata": {}, 173 | "source": [ 174 | "Let's load and plot the first item, just to make sure we're on the right track." 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "id": "92855bfe", 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "ds = rioxarray.open_rasterio(signed_items[0].assets[\"C01_2km\"].href).load()\n", 185 | "ds[0].plot.imshow(size=9, cmap=\"Blues\");" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "id": "377a0ab0", 191 | "metadata": {}, 192 | "source": [ 193 | "Great. 
Now we can load all the data for the first three bands (blue, red, and near-infrared)." 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "id": "82b0addf", 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "bands = [\"C01_2km\", \"C02_2km\", \"C03_2km\"]\n", 204 | "common_names = [\n", 205 | " items[0].assets[band].extra_fields[\"eo:bands\"][0][\"common_name\"] for band in bands\n", 206 | "]\n", 207 | "time = xr.DataArray(\n", 208 | " pd.to_datetime([x.datetime for x in signed_items]).tz_localize(None),\n", 209 | " name=\"time\",\n", 210 | " dims=[\"time\"],\n", 211 | ")\n", 212 | "arrays = [\n", 213 | " xr.concat(\n", 214 | " [rioxarray.open_rasterio(item.assets[band].href) for band in bands], dim=\"band\"\n", 215 | " ).assign_coords(band=common_names)\n", 216 | " for item in signed_items\n", 217 | "]\n", 218 | "data = xr.concat(arrays, dim=time).rename(\"goes\")\n", 219 | "data" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "id": "e3f1a578", 225 | "metadata": {}, 226 | "source": [ 227 | "GOES doesn't have a true green band, which we need for our true color animation. We'll simulate it with a linear combination of the other bands (see [Bah et al. (2018)](https://agupubs.onlinelibrary.wiley.com/doi/10.1029/2018EA000379) for more on this technique)." 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "id": "733c325f", 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "green = (\n", 238 | " 0.45 * data.sel(band=\"red\")\n", 239 | " + 0.1 * data.sel(band=\"nir09\")\n", 240 | " + 0.45 * data.sel(band=\"blue\")\n", 241 | ").assign_coords(band=\"green\")\n", 242 | "green" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "id": "107c4dba", 248 | "metadata": {}, 249 | "source": [ 250 | "Now we'll normalize the data and apply a gamma correction for plotting." 
251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "id": "d2a0a541", 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "γ = 2.2\n", 261 | "\n", 262 | "rgb = xr.concat([data, green], dim=\"band\").sel(band=[\"red\", \"green\", \"blue\"])\n", 263 | "rgb = rgb / rgb.max(dim=[\"band\", \"y\", \"x\"])\n", 264 | "rgb = np.clip(rgb ** (1 / γ), 0, 1)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "id": "bbb5b41b", 270 | "metadata": {}, 271 | "source": [ 272 | "Let's check out the first image." 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "id": "a4b9fee9", 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "fig, ax = plt.subplots(figsize=(16, 16))\n", 283 | "rgb.isel(time=0).plot.imshow(rgb=\"band\", add_labels=False)\n", 284 | "ax.set_axis_off()" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "id": "354e41ba", 290 | "metadata": {}, 291 | "source": [ 292 | "### Create the animation\n", 293 | "\n", 294 | "We'll use matplotlib's [FuncAnimation](https://matplotlib.org/stable/api/_as_gen/matplotlib.animation.FuncAnimation.html) to create the animation." 
295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "id": "67caf4c8-f802-4f21-afec-72a7a0d3de0a", 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "fig, ax = plt.subplots(figsize=(16, 16))\n", 305 | "fig.subplots_adjust(left=0, bottom=0, right=1, top=1, wspace=None, hspace=None)\n", 306 | "ax.set_axis_off()\n", 307 | "\n", 308 | "img = rgb[0].plot.imshow(ax=ax, add_colorbar=False, rgb=\"band\", add_labels=False)\n", 309 | "label = ax.text(\n", 310 | " 0.4,\n", 311 | " 0.03,\n", 312 | " pd.Timestamp(rgb.time.data[0]).isoformat(),\n", 313 | " transform=ax.transAxes,\n", 314 | " color=\"k\",\n", 315 | " size=20,\n", 316 | ")\n", 317 | "\n", 318 | "\n", 319 | "def animate(i):\n", 320 | " img.set_data(rgb[i].transpose(\"y\", \"x\", \"band\"))\n", 321 | " label.set_text(pd.Timestamp(rgb.time.data[i]).isoformat())\n", 322 | " return img, label\n", 323 | "\n", 324 | "\n", 325 | "ani = animation.FuncAnimation(fig, animate, frames=len(rgb), interval=120)" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "id": "2c8ec661-5533-4137-9ec5-98afcbc82311", 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "ani.save(\n", 336 | " \"goes.mp4\",\n", 337 | " fps=15,\n", 338 | " extra_args=[\"-vcodec\", \"libx264\"],\n", 339 | " savefig_kwargs=dict(pad_inches=0, transparent=True),\n", 340 | " progress_callback=lambda i, n: print(f\"Saving frame {i+1} of {n}\", end=\"\\r\"),\n", 341 | ")" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "id": "f38385c1", 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "from IPython.display import Video\n", 352 | "\n", 353 | "Video(\"goes.mp4\")" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "id": "593fad1a", 359 | "metadata": {}, 360 | "source": [ 361 | "### Next steps\n", 362 | "\n", 363 | "Learn more about GOES and using the Planetary Computer\n", 364 | "\n", 365 | "* 
[GOES quickstart](../datasets/goes/goes-example.ipynb)\n", 366 | "* [Reading from the STAC API](https://planetarycomputer.microsoft.com/docs/quickstarts/reading-stac/)" 367 | ] 368 | } 369 | ], 370 | "metadata": { 371 | "kernelspec": { 372 | "display_name": "Python 3 (ipykernel)", 373 | "language": "python", 374 | "name": "python3" 375 | }, 376 | "language_info": { 377 | "codemirror_mode": { 378 | "name": "ipython", 379 | "version": 3 380 | }, 381 | "file_extension": ".py", 382 | "mimetype": "text/x-python", 383 | "name": "python", 384 | "nbconvert_exporter": "python", 385 | "pygments_lexer": "ipython3", 386 | "version": "3.8.13" 387 | } 388 | }, 389 | "nbformat": 4, 390 | "nbformat_minor": 5 391 | } 392 | -------------------------------------------------------------------------------- /introduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "7825713c", 6 | "metadata": {}, 7 | "source": [ 8 | "# [aka.ms/pc-cng](https://aka.ms/pc-cng)" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "8072d2d0", 14 | "metadata": {}, 15 | "source": [ 16 | "# Introduction\n", 17 | "\n", 18 | "A *very brief* introduction to the Planetary Computer." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "id": "8d0931cf", 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "import adlfs\n", 29 | "import planetary_computer\n", 30 | "import ipyleaflet\n", 31 | "import requests\n", 32 | "import shapely\n", 33 | "from IPython.display import IFrame" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "id": "be119a93-a557-41dd-a697-bfdea16f4680", 39 | "metadata": {}, 40 | "source": [ 41 | "The Planetary Computer hosts lots of geospatial data. Anyone can use it (ideally from Azure!).\n", 42 | "\n", 43 | "\n", 44 | "\n", 45 | "Check out https://planetarycomputer.microsoft.com/catalog for the full catalog. 
We'll talk about how to access it in a bit.\n", 46 | "\n", 47 | "We also provide APIs for searching and working with that data. For example, our STAC API:" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "id": "c71fce63-3f4f-49b5-8fb6-52ba92953f44", 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "IFrame(\n", 58 | " \"https://planetarycomputer-staging.microsoft.com/api/stac/v1/docs\",\n", 59 | " width=800,\n", 60 | " height=400,\n", 61 | ")" 62 | ] 63 | }, 64 | { 65 | "cell_type": "markdown", 66 | "id": "881124fd-9da6-4a64-ae74-bd358a86ca5e", 67 | "metadata": {}, 68 | "source": [ 69 | "Finally, we offer some compute next to the data. Azure has *many* ways of doing compute (plain VMs, Azure Functions, Kubernetes, Azure Batch, ...). We host a very convenient [Dask-enabled JupyterHub](https://planetarycomputer.microsoft.com/docs/overview/environment/). Right now we're on a similar JupyterHub deployment.\n", 70 | "\n", 71 | "So that's the Planetary Computer: Geospatial Data + APIs + Compute, all on Azure.\n", 72 | "\n", 73 | "## Cloud-native Principles\n", 74 | "\n", 75 | "The Planetary Computer implements many cloud-native concepts. Here, we'll just list some of them.\n", 76 | "\n", 77 | "\n", 78 | "1. You have *direct* access to *all* of the data\n", 79 | " - You have access to PBs of data\n", 80 | " - Data assets are hosted in the highly scalable Azure Blob Storage\n", 81 | " - You have direct access to the files, using plain HTTPS or Azure Blob Storage APIs. This means you can open the files using any tool that can speak HTTP\n", 82 | "2. Cloud-native formats\n", 83 | " - Wherever possible, we use cloud-native / friendly file formats. We'll see examples using COG, Zarr, (geo)parquet, and COPC\n", 84 | "3. Compute is next to the data\n", 85 | " - All of our files are in the West Europe Azure data region. For best performance, compute should be in that same data center.\n", 86 | "4. 
Ability to scale\n", 87 | " - Azure makes it easy to get lots of compute\n", 88 | "\n", 89 | "\n", 90 | "### Compute → Data\n", 91 | "\n", 92 | "Putting the compute next to the data can be crucial for performance. Let's consider the simple task of reading the metadata from a COG file with `gdalinfo`.\n", 93 | "\n", 94 | "Running this command from my laptop in Des Moines, IA, we spend a *lot* of time waiting:\n", 95 | "\n", 96 | "```console\n", 97 | "$ time gdalinfo /vsicurl/https://naipeuwest.blob.core.windows.net/naip/v002/ia/2019/ia_60cm_2019/42091/m_4209150_sw_15_060_20190828.tif > /dev/null\n", 98 | "real 0m7.158s\n", 99 | "user 0m0.195s\n", 100 | "sys 0m0.032s\n", 101 | "```" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "id": "fafa6826-f0a1-4ce4-b93e-b684b9d72696", 107 | "metadata": {}, 108 | "source": [ 109 | "Running that from this Jupyter kernel, which is in the same Azure data center as the dataset, things look different." 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "id": "8efe036d-89d9-4717-82f7-b64310121765", 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "!time gdalinfo /vsicurl/https://naipeuwest.blob.core.windows.net/naip/v002/ia/2019/ia_60cm_2019/42091/m_4209150_sw_15_060_20190828.tif > /dev/null" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "id": "510c1a54-9c8e-4566-8970-22e310a32dbb", 125 | "metadata": {}, 126 | "source": [ 127 | "So a nice 35x speedup!" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "id": "2a97a5e6-a7e8-4c4b-adb3-d83d208275d1", 133 | "metadata": {}, 134 | "source": [ 135 | "## STAC\n", 136 | "\n", 137 | "Having access to the data is great, but it's not enough. For example, how would you find all the Sentinel-2 images over Wyoming for July 2021? Consider what we'd do if we just had files in blob storage. 
We'll use `adlfs` to list some folders, to try to figure out the naming convention (we could also read the docs, but where's the fun in that?)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "id": "90dedf8e-76a4-45c8-ab80-a98890e9b9ce", 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "token = planetary_computer.sas.get_token(\"sentinel2l2a01\", \"sentinel2-l2\").token\n", 148 | "\n", 149 | "fs = adlfs.AzureBlobFileSystem(\"sentinel2l2a01\", credential=token)\n", 150 | "fs.ls(\"sentinel2-l2/01/C/CV/2021\") # ...?" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "id": "bc7d692e-410c-445d-9c02-d5f58dc951a1", 156 | "metadata": {}, 157 | "source": [ 158 | "Some of those kinda look like dates. I don't know what the \"C\" and \"CV\" mean.\n", 159 | "\n", 160 | "But I don't need to figure that out! STAC makes this kind of spatio-temporal filtering straightforward." 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "id": "384110d0-694f-4cfb-affe-fd301ea42d65", 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "import pystac_client\n", 171 | "\n", 172 | "catalog = pystac_client.Client.open(\n", 173 | " \"https://planetarycomputer.microsoft.com/api/stac/v1\"\n", 174 | ")" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "id": "7c04472f-f054-4460-b342-159363621978", 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "%%time\n", 185 | "\n", 186 | "wyoming_bbox = [-111.0717, 41.0296, -103.9965, 45.02695]\n", 187 | "search = catalog.search(\n", 188 | " collections=[\"sentinel-2-l2a\"], bbox=wyoming_bbox, datetime=\"2021-07-01/2021-07-31\"\n", 189 | ")\n", 190 | "items = search.get_all_items()" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "id": "13343f76-fa84-424f-8a99-ab8ecbcbfed9", 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | 
"len(items)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "id": "5b8fe313-be32-4fa8-8483-1e1883db1b3e", 206 | "metadata": {}, 207 | "source": [ 208 | "Even better: STAC is a standard. It isn't specific to Sentinel-2, or even remote sensing data. Landsat Collection 2 Level-2, which uses a completely different folder structure in blob storage, can be searched by just changing the collection ID." 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "id": "182640ee-2453-4507-92f7-3bee685fb58a", 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "%%time\n", 219 | "\n", 220 | "search = catalog.search(\n", 221 | " collections=[\"landsat-8-c2-l2\"], bbox=wyoming_bbox, datetime=\"2021-07-01/2021-07-31\"\n", 222 | ")\n", 223 | "landsat_items = search.get_all_items()\n", 224 | "len(landsat_items)" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "id": "d9265298-aecf-48b3-883a-dcaf94cd1ed8", 230 | "metadata": {}, 231 | "source": [ 232 | "STAC items are GeoJSON Features. So even though they're just metadata, we can treat them a bit like data. We can do things like plot the various tiles on a map." 
233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "id": "06321d68-44b3-41d7-a759-103c3d4f8b1a", 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "import geopandas\n", 243 | "\n", 244 | "df = geopandas.GeoDataFrame.from_features(items.to_dict()).set_crs(4326)\n", 245 | "\n", 246 | "df[[\"geometry\", \"s2:mgrs_tile\", \"datetime\"]].explore(\n", 247 | " column=\"datetime\", style_kwds={\"fillOpacity\": 0.1}, cmap=\"viridis\"\n", 248 | ")" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "id": "e77eefd7-ab77-491b-bdbb-ab75ef3d969b", 254 | "metadata": {}, 255 | "source": [ 256 | "## Data APIs\n", 257 | "\n", 258 | "The Planetary Computer also provides a data API, based on [TiTiler](https://developmentseed.org/titiler/), which provides endpoints for some common geospatial analysis routines. This can be a nice alternative to setting up your own compute in Azure if you're doing something basic, like putting an image on a Map (or even more advanced things like mosaicing many images)." 
259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "id": "76e3d926-5bf8-484a-bdf6-72778f2c2bf2", 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "def plot(item, map_kwargs={}):\n", 269 | " (tiles_url,) = requests.get(item.assets[\"tilejson\"].href).json()[\"tiles\"]\n", 270 | " center = shapely.geometry.shape(item.geometry).centroid.bounds[1::-1]\n", 271 | "\n", 272 | " m = ipyleaflet.Map(\n", 273 | " center=center, controls=[ipyleaflet.FullScreenControl()], **map_kwargs\n", 274 | " )\n", 275 | " m.add_layer(ipyleaflet.TileLayer(url=tiles_url))\n", 276 | " m.scroll_wheel_zoom = True\n", 277 | " return m" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "id": "2d0047fb-d95d-47f0-811b-cfcbbbac0be4", 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "plot(items[1])" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "id": "8160011f-40dc-4816-945f-b342e132f298", 293 | "metadata": {}, 294 | "source": [ 295 | "Whoops, that's a pretty cloudy image. But STAC makes it easy to filter those out." 
296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "id": "9c6d55a5-d34e-4ec8-b623-6b59744480de", 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "wyoming_bbox = [-111.0717, 41.0296, -103.9965, 45.02695]\n", 306 | "search = catalog.search(\n", 307 | " collections=[\"sentinel-2-l2a\"],\n", 308 | " bbox=wyoming_bbox,\n", 309 | " datetime=\"2021-07-01/2021-07-31\",\n", 310 | " query={\"eo:cloud_cover\": {\"lt\": 10}},\n", 311 | ")\n", 312 | "%time items = search.get_all_items()\n", 313 | "len(items)" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "id": "c19f1a54-f94c-4d1d-86f1-8c156d25997c", 320 | "metadata": { 321 | "tags": [] 322 | }, 323 | "outputs": [], 324 | "source": [ 325 | "plot(items[1], map_kwargs=dict(zoom=9))" 326 | ] 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "id": "24f03968-cb4f-4ce3-9dd3-5218f4cca2d9", 331 | "metadata": {}, 332 | "source": [ 333 | "Fun fact: the STAC and Data APIs power our [explorer](https://planetarycomputer.microsoft.com/explore?c=118.8189%2C37.4070&z=11.00)." 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "id": "037a5a78-2232-4ed5-bf87-c03570a00c88", 339 | "metadata": {}, 340 | "source": [ 341 | "### Scaling\n", 342 | "\n", 343 | "We aren't going to see it today, but our Hub includes a Dask Gateway. This lets you create Dask clusters to distribute your computation, without having to worry about infrastructure things like Kubernetes.\n", 344 | "\n", 345 | "```python\n", 346 | ">>> from dask_gateway import GatewayCluster\n", 347 | ">>> cluster = GatewayCluster()\n", 348 | ">>> cluster.scale(40)\n", 349 | ">>> client = cluster.get_client()\n", 350 | "```\n", 351 | "\n", 352 | "We'll be using Dask later to parallelize some computations, but just on a single machine instead of a cluster." 
353 | ] 354 | } 355 | ], 356 | "metadata": { 357 | "kernelspec": { 358 | "display_name": "Python 3 (ipykernel)", 359 | "language": "python", 360 | "name": "python3" 361 | }, 362 | "language_info": { 363 | "codemirror_mode": { 364 | "name": "ipython", 365 | "version": 3 366 | }, 367 | "file_extension": ".py", 368 | "mimetype": "text/x-python", 369 | "name": "python", 370 | "nbconvert_exporter": "python", 371 | "pygments_lexer": "ipython3", 372 | "version": "3.8.10" 373 | } 374 | }, 375 | "nbformat": 4, 376 | "nbformat_minor": 5 377 | } 378 | -------------------------------------------------------------------------------- /crop-prediction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "3c61d41b", 6 | "metadata": {}, 7 | "source": [ 8 | "# Crop Type Prediction\n", 9 | "\n", 10 | "This notebook trains a model to predict crop types from Sentinel 2 Level 2-A imagery.\n", 11 | "\n", 12 | "Our training labels come from the Radiant Earth [South Africa Crop Type Competition](https://registry.mlhub.earth/10.34911/rdnt.j0co8q/). They're a collection of scenes, with integers indicating the crop type at each pixel in the scene.\n", 13 | "\n", 14 | "Our training data comes from Microsoft's Planetary Computer. The [Sentinel 2 Level 2-A](https://planetarycomputer.microsoft.com/dataset/sentinel-2-l2a) page describes what all is available.\n", 15 | "\n", 16 | "## Data access\n", 17 | "\n", 18 | "We'll use STAC for data access. Specifically, we'll interact with two STAC catalogs\n", 19 | "\n", 20 | "1. A static catalog for the labels, hosted in a Blob Storage container\n", 21 | "2. The Planetary Computer's STAC API, to query for scenes matching some condition\n", 22 | "\n", 23 | "The overall workflow will be\n", 24 | "\n", 25 | "1. Load a \"chip\" with the label data (a 256x256 array of integer codes indicating the crop type)\n", 26 | "2. 
Search for and load a scene with Sentinel 2 imagery covering the `labels` chip\n", 27 | "3. Transform and crop the (very large) Sentinel 2 scene to match the 256x256 label scene\n", 28 | "4. Stack and reshape the data for the machine learning model" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "id": "19f14c2a-7fb3-4791-958e-40d76b3bca44", 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "import pystac\n", 39 | "import pystac_client\n", 40 | "import requests\n", 41 | "import shapely.geometry\n", 42 | "import shapely.ops\n", 43 | "import warnings\n", 44 | "\n", 45 | "warnings.filterwarnings(\"ignore\", \"Creating an ndarray from ragged\")" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "id": "b7f87bc8", 51 | "metadata": {}, 52 | "source": [ 53 | "### Load labels\n", 54 | "\n", 55 | "We have a STAC catalog of labels for the training data, which is based off the collection used in the Radiant Earth competition." 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "id": "d2951813", 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "training_catalog = pystac.read_file(\n", 66 | " \"https://esip2021.blob.core.windows.net/esip2021/train/collection.json\"\n", 67 | ")\n", 68 | "training_catalog" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "id": "ea99d2b3-ba8c-49c8-86ce-2fe6cde0bbd1", 74 | "metadata": {}, 75 | "source": [ 76 | "Each of these Items contains a few things. They all share the same set of labels: integer codes indicating a particular crop type." 
77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "id": "d2c359e4-1f9a-4d44-94ab-69a4cbf2d15b", 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "N_SCENES = 25\n", 87 | "links = training_catalog.get_item_links()[:N_SCENES]\n", 88 | "label_items = [link.resolve_stac_object().target for link in links]\n", 89 | "\n", 90 | "labels = requests.get(label_items[0].assets[\"raster_values\"].href).json()\n", 91 | "\n", 92 | "labels" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "id": "424f58f1-8791-48e7-aa9f-e52f3766383b", 98 | "metadata": {}, 99 | "source": [ 100 | "And like any STAC item, they have a specific footprint. Let's plot them on the map." 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "id": "7b94b06e-4cc4-450a-8f5d-e68dae7cd3cd", 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "import geopandas\n", 111 | "\n", 112 | "df = geopandas.GeoDataFrame.from_features([x.to_dict() for x in label_items]).set_crs(\n", 113 | " 4326\n", 114 | ")\n", 115 | "m = df.explore()\n", 116 | "m" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "id": "f0a676b5-ccaf-450e-a290-c9e31c5a51f2", 122 | "metadata": {}, 123 | "source": [ 124 | "Each one of these plots is a (256 x 256) \"chip\"." 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "id": "91765be9-eecd-4dfd-b1ac-639fa10fd604", 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "import rioxarray\n", 135 | "\n", 136 | "rioxarray.open_rasterio(label_items[9].assets[\"labels\"].href).squeeze().plot.imshow(\n", 137 | " cmap=\"tab10\"\n", 138 | ");" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "id": "2b9ba998-4ce7-40b9-a0cf-972d27e6757a", 144 | "metadata": {}, 145 | "source": [ 146 | "We need to associate the label items with a Sentinel-2 Level 2-A item. 
We need to find an item that (mostly) covers the field and isn't too cloudy.\n", 147 | "\n", 148 | "We could make one STAC query per label item, but that would be a bit slow and inefficient. Instead, we'll do one search to get all the items covering the bounding box of *all* of our fields. So we need the union of all the bounding boxes." 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "id": "e054aff6-971e-4bef-80d3-bf19bf6fe519", 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "bbox = shapely.ops.unary_union(\n", 159 | " [shapely.geometry.box(*item.bbox) for item in label_items]\n", 160 | ").bounds\n", 161 | "bbox" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "id": "cafbfb43-83eb-4b81-a5a9-7f43231fa983", 167 | "metadata": {}, 168 | "source": [ 169 | "Now we'll make a search for all the items matching our requirements, similar to the previous notebook." 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "id": "af6b9073-cc64-4a70-90d8-0fa3eedafe42", 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "stac_client = pystac_client.Client.open(\n", 180 | " \"https://planetarycomputer.microsoft.com/api/stac/v1/\"\n", 181 | ")\n", 182 | "\n", 183 | "date_range = \"2017-06-01/2017-09-01\"\n", 184 | "\n", 185 | "search = stac_client.search(\n", 186 | " collections=[\"sentinel-2-l2a\"],\n", 187 | " bbox=bbox,\n", 188 | " datetime=date_range,\n", 189 | " query={\"eo:cloud_cover\": {\"lt\": 25}},\n", 190 | ")\n", 191 | "sentinel_items = list(search.get_all_items())\n", 192 | "len(sentinel_items)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "id": "336cbc89-dc45-4cdd-9e7f-dbd74681bf1e", 198 | "metadata": {}, 199 | "source": [ 200 | "So we have a bunch of Sentinel 2 items that together cover all of our fields. 
But these Sentinel scenes are much larger than our fields:" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "id": "499af7a6-0d41-4d0d-beb7-3aa792a673e0", 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "import folium\n", 211 | "\n", 212 | "sentinel_item = sentinel_items[1]\n", 213 | "\n", 214 | "layer = folium.TileLayer(\n", 215 | " requests.get(sentinel_item.assets[\"tilejson\"].href).json()[\"tiles\"][0],\n", 216 | " attr=\"Sentinel-2 L2A\",\n", 217 | ")\n", 218 | "\n", 219 | "layer.add_to(m)\n", 220 | "m" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "id": "b17c86e4-ad3e-4ece-a084-9a178a6f61c4", 226 | "metadata": {}, 227 | "source": [ 228 | "How do we know which (part of a) Sentinel-2 scene goes with each field? That's what we do in the next section. It's a bit complicated, but the basic idea is to pick the least-cloudy Sentinel-2 scene that (mostly) covers our field (at least 90% of it anyway)." 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "id": "952b8402-f169-4c51-8e6b-a33e5dfd56c0", 235 | "metadata": {}, 236 | "outputs": [], 237 | "source": [ 238 | "def find_match(label_item, sentinel_items):\n", 239 | " # make sure we pick a sentinel scene that overlaps substantially with the label\n", 240 | " label_shape = shapely.geometry.shape(label_item.geometry)\n", 241 | " items2 = [\n", 242 | " item\n", 243 | " for item in sentinel_items\n", 244 | " if (\n", 245 | " shapely.geometry.shape(item.geometry).intersection(label_shape).area\n", 246 | " / label_shape.area\n", 247 | " )\n", 248 | " > 0.90\n", 249 | " ]\n", 250 | " sentinel_item = min(\n", 251 | " items2, key=lambda item: pystac.extensions.eo.EOExtension.ext(item).cloud_cover\n", 252 | " )\n", 253 | " return sentinel_item" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "id": "6dc03fdf-6ff9-4244-9cdf-d0e37066a15b", 260 | "metadata": {}, 261 | "outputs": 
[], 262 | "source": [ 263 | "import planetary_computer\n", 264 | "\n", 265 | "matched = [\n", 266 | " planetary_computer.sign(find_match(label_item, sentinel_items))\n", 267 | " for label_item in label_items\n", 268 | "]" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "id": "4f92abcb-97d1-4075-9687-3b43065a1769", 274 | "metadata": {}, 275 | "source": [ 276 | "Given the matched `(label_item, sentinel_item)` pairs, we can load in the actual data. Like in the last notebook, we'll use `stackstac` to load a bunch of bands for the training data. We'll also load the label data at the same time.\n", 277 | "\n", 278 | "Finally, there's a slight pixel alignment issue, where the coordinates on the `label` data are shifted by a half-pixel from the coordinates for the training data. We'll shift the training data to match the label data." 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "id": "8c02ff8d-d114-4303-9f96-4a6674418cd9", 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "import rioxarray\n", 289 | "import stackstac\n", 290 | "\n", 291 | "\n", 292 | "def load(label_item, sentinel_item):\n", 293 | " label_data = rioxarray.open_rasterio(label_item.assets[\"labels\"].href).squeeze()\n", 294 | "\n", 295 | " assets = [\"B02\", \"B03\", \"B04\", \"B05\", \"B06\", \"B07\", \"B09\"]\n", 296 | " data = (\n", 297 | " stackstac.stack(\n", 298 | " sentinel_item.to_dict(),\n", 299 | " assets=assets,\n", 300 | " epsg=label_data.rio.crs.to_epsg(), # reproject to the labels' CRS\n", 301 | " bounds=label_data.rio.bounds(), # crop to the labels' bounds\n", 302 | " resolution=10, # resample all assets to the highest resolution\n", 303 | " dtype=\"float32\",\n", 304 | " fill_value=0,\n", 305 | " )\n", 306 | " .squeeze()\n", 307 | " .assign_coords(\n", 308 | " y=lambda ds: (ds.y - 5).round(), # fix half-pixel label issue\n", 309 | " x=lambda ds: (ds.x + 5).round(),\n", 310 | " )\n", 311 | " .compute()\n", 312 | " )\n", 
313 | "\n", 314 | " assert data.shape[1:] == label_data.shape\n", 315 | "\n", 316 | " # Add a label_id dimension, to track which training data goes with\n", 317 | " # which pixels. This will be helpful later on in evaluation.\n", 318 | " data = data.expand_dims({\"label_id\": [label_item.id]})\n", 319 | " label_data = label_data.expand_dims({\"label_id\": [label_item.id]})\n", 320 | "\n", 321 | " return data, label_data" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "id": "d8ed08d1-3d32-4ca5-8f00-fc95ac7020ce", 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "import warnings\n", 332 | "\n", 333 | "warnings.filterwarnings(\"ignore\", message=\"pandas.Float64\")" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "id": "e7c198e3-5b4e-4133-8537-44ba0ef7c60d", 339 | "metadata": {}, 340 | "source": [ 341 | "We're actually loading data now. This will take a bit of time." 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "id": "89a76edb-1d41-422b-ab91-97dbe785c4a9", 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "%%time\n", 352 | "Xs, ys = zip(\n", 353 | " *[\n", 354 | " load(label_item, sentinel_item)\n", 355 | " for label_item, sentinel_item in zip(label_items, matched)\n", 356 | " ]\n", 357 | ")" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "id": "4dd21b7b-7b75-48b8-bcf3-6b77fb609a09", 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "Xs[0].shape" 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "id": "f35a4ef6-bbf6-45f9-894f-48a49a45b220", 373 | "metadata": {}, 374 | "source": [ 375 | "Now we have a list of DataArrays, each with the dimensions `(label_id, band, y, x)`. We'll use Scikit-Learn to train the model, which expects a 2-D array with dimensions `(observations, features)`. 
In this case, an \"observation\" is a single pixel (the pixel at coordinate `(-3717125, 274725)` for example), and the features are the 7 bands.\n", 376 | "\n", 377 | "So we need to reshape each DataArray from size `(1, 7, 256, 256)` to `(65536, 7)` and then concatenate them all vertically." 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "id": "57608124-e807-4786-9c41-f4d9837cebda", 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "import xarray as xr\n", 388 | "\n", 389 | "X = xr.concat([x.stack(pixel=(\"label_id\", \"y\", \"x\")).T for x in Xs], dim=\"pixel\")\n", 390 | "y = xr.concat([y.stack(pixel=(\"label_id\", \"y\", \"x\")) for y in ys], dim=\"pixel\")\n", 391 | "assert X.indexes[\"pixel\"].equals(y.indexes[\"pixel\"])" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": null, 397 | "id": "c5978086-477a-4458-ac4f-81f22b7f9419", 398 | "metadata": {}, 399 | "outputs": [], 400 | "source": [ 401 | "X.shape" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "id": "9ec4db34-5969-4236-884a-8119cc124b6b", 408 | "metadata": {}, 409 | "outputs": [], 410 | "source": [ 411 | "y.shape" 412 | ] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "id": "068fb84f-23ed-4571-817e-739f667ca1a5", 417 | "metadata": {}, 418 | "source": [ 419 | "Thanks to xarray's indexing, we can easily go from these stacked DataArrays back to a plot." 
420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "id": "fb7cf3e5-5252-4dd1-96a1-51f57c76cfbf", 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [ 429 | "label_id = label_items[0].id\n", 430 | "X.sel(label_id=label_id).unstack().sel(band=\"B04\").plot(cmap=\"Reds\", figsize=(12, 9));" 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "id": "ccc15f80-1b0f-4890-83a8-ce2ed18d7a61", 436 | "metadata": {}, 437 | "source": [ 438 | "## Train the model\n", 439 | "\n", 440 | "Now that we've done all the pre-processing, we can train the actual model.\n", 441 | "\n", 442 | "We'll start with a scikit-learn KNeighborsClassfier ([User Guide](https://scikit-learn.org/stable/modules/neighbors.html#classification), [API Reference](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)) to establish a baseline model for this dataset." 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": null, 448 | "id": "ab5f8139", 449 | "metadata": {}, 450 | "outputs": [], 451 | "source": [ 452 | "import sklearn.neighbors\n", 453 | "import sklearn.model_selection" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": null, 459 | "id": "1024bdac", 460 | "metadata": {}, 461 | "outputs": [], 462 | "source": [ 463 | "X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y)" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "id": "47015616-9d34-4b0b-8e13-670b0cbfb59f", 470 | "metadata": {}, 471 | "outputs": [], 472 | "source": [ 473 | "clf = sklearn.neighbors.KNeighborsClassifier()\n", 474 | "clf.fit(X_train, y_train)" 475 | ] 476 | }, 477 | { 478 | "cell_type": "markdown", 479 | "id": "a4a85f52-8d01-4ab4-aa18-5782208531be", 480 | "metadata": {}, 481 | "source": [ 482 | "Training score:" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": null, 488 | "id": 
"35b112c6-1342-4e91-92a4-3909d467b6f0", 489 | "metadata": {}, 490 | "outputs": [], 491 | "source": [ 492 | "clf.score(X_train[::100], y_train[::100])" 493 | ] 494 | }, 495 | { 496 | "cell_type": "markdown", 497 | "id": "0be8bca2-b417-41af-b548-d612aa095781", 498 | "metadata": {}, 499 | "source": [ 500 | "Test score:" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "id": "c414d3de-1b51-4e3c-b506-c6896c2611e1", 507 | "metadata": {}, 508 | "outputs": [], 509 | "source": [ 510 | "clf.score(X_test[::100], y_test[::100])" 511 | ] 512 | }, 513 | { 514 | "cell_type": "markdown", 515 | "id": "273eb561-5249-4a36-aa49-a31508ccd688", 516 | "metadata": {}, 517 | "source": [ 518 | "Plot the first field." 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": null, 524 | "id": "0c27fa38-683f-48ee-8238-ac50807fa8e0", 525 | "metadata": {}, 526 | "outputs": [], 527 | "source": [ 528 | "x = X.sel(label_id=label_id)\n", 529 | "yhat = clf.predict(x)" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": null, 535 | "id": "613035de-ebe2-44d8-bb6a-eda1827337ac", 536 | "metadata": {}, 537 | "outputs": [], 538 | "source": [ 539 | "import matplotlib.pyplot as plt\n", 540 | "\n", 541 | "fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(8, 4))\n", 542 | "\n", 543 | "ys[0].plot(x=\"x\", y=\"y\", cmap=\"tab10\", ax=ax1, add_colorbar=False)\n", 544 | "ax2.imshow(yhat.reshape(256, 256), cmap=\"tab10\")\n", 545 | "plt.tight_layout()\n", 546 | "\n", 547 | "ax1.set_axis_off()\n", 548 | "ax2.set_axis_off()\n", 549 | "\n", 550 | "ax1.set(title=\"Actual\")\n", 551 | "ax2.set(title=\"Predicted\");" 552 | ] 553 | }, 554 | { 555 | "cell_type": "markdown", 556 | "id": "d4c89261-f766-4fcb-a735-4a130e00837a", 557 | "metadata": {}, 558 | "source": [ 559 | "So we seem to be able to differentiate \"field\" from \"not a field\", but do a bad job of predicting the actual crop type.
Plenty of room for improvement.\n", 560 | "\n", 561 | "## Recap\n", 562 | "\n", 563 | "We were able to train a basic ML model to predict crop types from Sentinel-2 satellite imagery. We used STAC to find and load our data, xarray to reshape the data into an appropriate form for the model, and scikit-learn to train the model." 564 | ] 565 | } 566 | ], 567 | "metadata": { 568 | "kernelspec": { 569 | "display_name": "Python 3 (ipykernel)", 570 | "language": "python", 571 | "name": "python3" 572 | }, 573 | "language_info": { 574 | "codemirror_mode": { 575 | "name": "ipython", 576 | "version": 3 577 | }, 578 | "file_extension": ".py", 579 | "mimetype": "text/x-python", 580 | "name": "python", 581 | "nbconvert_exporter": "python", 582 | "pygments_lexer": "ipython3", 583 | "version": "3.8.10" 584 | }, 585 | "widgets": { 586 | "application/vnd.jupyter.widget-state+json": { 587 | "state": { 588 | "1792dda3292141c6b54a76ec61e737eb": { 589 | "model_module": "@jupyter-widgets/controls", 590 | "model_module_version": "1.5.0", 591 | "model_name": "VBoxModel", 592 | "state": { 593 | "layout": "IPY_MODEL_f741701f6a0b4b46836126f9bf10ca7e" 594 | } 595 | }, 596 | "f741701f6a0b4b46836126f9bf10ca7e": { 597 | "model_module": "@jupyter-widgets/base", 598 | "model_module_version": "1.2.0", 599 | "model_name": "LayoutModel", 600 | "state": {} 601 | } 602 | }, 603 | "version_major": 2, 604 | "version_minor": 0 605 | } 606 | } 607 | }, 608 | "nbformat": 4, 609 | "nbformat_minor": 5 610 | } 611 | --------------------------------------------------------------------------------