├── .gitignore ├── .spyproject ├── codestyle.ini ├── encoding.ini ├── vcs.ini └── workspace.ini ├── LICENSE ├── README.md ├── images ├── compVizApp.png └── new ├── notebooks ├── UNET_G4G_2019_Parking.ipynb └── UNET_G4G_2019_solar.ipynb └── utils ├── array_tools.py ├── calibration.py ├── ee_tools.py ├── model_tools.py ├── pc_tools.py ├── prediction_tools.py ├── processing.py ├── raster_tools.py └── stats.py /.gitignore: -------------------------------------------------------------------------------- 1 | .httr-oauth 2 | 3 | # Byte-compiled / optimized / DLL files 4 | *__pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | *.pyc 8 | 9 | # Data directories 10 | data/ 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Azure stuff 16 | *.amlignore 17 | *.amltmp 18 | .ipynb_aml_checkpoints 19 | 20 | # Distribution / packaging 21 | .Python 22 | build/ 23 | develop-eggs/ 24 | dist/ 25 | downloads/ 26 | eggs/ 27 | .eggs/ 28 | lib/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | var/ 33 | wheels/ 34 | pip-wheel-metadata/ 35 | share/python-wheels/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | MANIFEST 40 | 41 | # PyInstaller 42 | # Usually these files are written by a python script from a template 43 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 44 | *.manifest 45 | *.spec 46 | 47 | # Installer logs 48 | pip-log.txt 49 | pip-delete-this-directory.txt 50 | 51 | # Unit test / coverage reports 52 | htmlcov/ 53 | .tox/ 54 | .nox/ 55 | .coverage 56 | .coverage.* 57 | .cache 58 | nosetests.xml 59 | coverage.xml 60 | *.cover 61 | *.py,cover 62 | .hypothesis/ 63 | .pytest_cache/ 64 | 65 | # Translations 66 | *.mo 67 | *.pot 68 | 69 | # Django stuff: 70 | *.log 71 | local_settings.py 72 | db.sqlite3 73 | db.sqlite3-journal 74 | 75 | # Flask stuff: 76 | instance/ 77 | .webassets-cache 78 | 79 | # Scrapy stuff: 80 | .scrapy 81 | 82 | # Sphinx documentation 83 | docs/_build/ 84 | 85 | # PyBuilder 86 | target/ 87 | 88 | # Jupyter Notebook 89 | .ipynb_checkpoints 90 | 91 | # IPython 92 | profile_default/ 93 | ipython_config.py 94 | 95 | # pyenv 96 | .python-version 97 | 98 | # pipenv 99 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 100 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 101 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 102 | # install all needed dependencies. 103 | #Pipfile.lock 104 | 105 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 106 | __pypackages__/ 107 | 108 | # Celery stuff 109 | celerybeat-schedule 110 | celerybeat.pid 111 | 112 | # SageMath parsed files 113 | *.sage.py 114 | 115 | # Environments 116 | .env 117 | .venv 118 | env/ 119 | venv/ 120 | ENV/ 121 | env.bak/ 122 | venv.bak/ 123 | 124 | # Spyder project settings 125 | .spyderproject 126 | .spyproject 127 | 128 | # Rope project settings 129 | .ropeproject 130 | 131 | # mkdocs documentation 132 | /site 133 | 134 | # mypy 135 | .mypy_cache/ 136 | .dmypy.json 137 | dmypy.json 138 | 139 | # Pyre type checker 140 | .pyre 141 | -------------------------------------------------------------------------------- /.spyproject/codestyle.ini: -------------------------------------------------------------------------------- 1 | [codestyle] 2 | indentation = True 3 | 4 | [main] 5 | version = 0.1.0 6 | 7 | -------------------------------------------------------------------------------- /.spyproject/encoding.ini: -------------------------------------------------------------------------------- 1 | [encoding] 2 | text_encoding = utf-8 3 | 4 | [main] 5 | version = 0.1.0 6 | 7 | -------------------------------------------------------------------------------- /.spyproject/vcs.ini: -------------------------------------------------------------------------------- 1 | [vcs] 2 | use_version_control = False 3 | version_control_system = 4 | 5 | [main] 6 | version = 0.1.0 7 | 8 | -------------------------------------------------------------------------------- /.spyproject/workspace.ini: -------------------------------------------------------------------------------- 1 | [workspace] 2 | restore_data_on_startup = True 3 | save_data_on_exit = True 4 | save_history = True 5 | save_non_project_files = False 6 | 7 | [main] 8 | version = 0.1.0 9 | recent_files = ['C:\\Users\\mevans\\OneDrive - Defenders of Wildlife\\repos\\Satellite_ComputerVision\\utils\\prediction_tools.py', 'C:\\Users\\mevans\\OneDrive - Defenders of Wildlife\\repos\\Satellite_ComputerVision\\utils\\model_tools.py', 'C:\\Users\\mevans\\OneDrive - Defenders of Wildlife\\repos\\Satellite_ComputerVision\\utils\\processing.py', 'C:\\Users\\mevans\\OneDrive - Defenders of Wildlife\\repos\\Satellite_ComputerVision\\azure\\train_wetland.py', 'C:\\Users\\mevans\\OneDrive - Defenders of Wildlife\\repos\\Satellite_ComputerVision\\azure\\train_landcover.py'] 10 | 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2019, mjevans26 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Computer Vision with Free Satellite Data 2 | This repository contains code used to produce computer vision models that can identify infrastructure in publicly available satellite imagery. 3 | 4 | ## Organization 5 | The bulk of the useful code in this repository is contained in the 'utils' directory. These Python files are importable modules, organized generally by the packages they rely on and the kinds of functions they contain. For instance, utils/pc_tools.py imports the Planetary Computer ecosystem of packages and contains functions and classes for working with data from the Microsoft Planetary Computer (MPC). Similarly, model_tools.py imports TensorFlow and Keras libraries and contains functions and classes for constructing and training deep learning models with those libraries. 6 | 7 | ## Parking lots 8 | As part of the [Long Island Solar Roadmap](https://solarroadmap.org), we are testing the ability of computer vision models to automate the detection and delineation of parking lots in NAIP aerial imagery. This analysis uses the Deeplab v3 model with a pre-trained ResNet backbone. 9 | 10 | ## Solar arrays 11 | Ground-mounted solar arrays are prominent features on the landscape, and their rapid proliferation can be difficult to track. The Chesapeake Conservancy trained a computer vision model to detect and delineate solar arrays from Sentinel-2 data. This UNET model can be used to rapidly update maps of solar energy development in DE, MD, PA, NY, VA, WV, and other eastern states. These outputs were recently published in a [Biological Conservation](https://www.sciencedirect.com/science/article/pii/S0006320723001751) paper.
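The array utilities in utils/array_tools.py (reproduced later in this repository) include functions for standardizing and augmenting image patches of the kind used to train both models. The sketch below is a minimal, hypothetical example of how `normalize_array` and `aug_array_morph` might be applied to a single patch; the patch itself is random placeholder data, and the import assumes the repository root is on your `PYTHONPATH`.

```python
import numpy as np

# assumes the repository root is on PYTHONPATH so `utils` is importable
from utils.array_tools import normalize_array, aug_array_morph

# placeholder 256x256, 4-band (H, W, C) patch standing in for a NAIP chip
patch = np.random.rand(256, 256, 4).astype('float32')

# standardize each channel by its own mean and standard deviation (axes = [0, 1])
normed = normalize_array(patch, axes=[0, 1])

# apply random vertical/horizontal flips and 90-degree rotations
augmented = aug_array_morph(normed)
print(augmented.shape)  # (256, 256, 4)
```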
12 | 13 | ### App 14 | The outputs are available for interactive inspection through a [Google Earth Engine App](https://mevans-cic.users.earthengine.app/view/cpksolar). 15 | ![App image](/images/compVizApp.png) 16 | -------------------------------------------------------------------------------- /images/compVizApp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mjevans26/Satellite_ComputerVision/9753cedf4403a529503e4bfea3f6f3b9ee68f740/images/compVizApp.png -------------------------------------------------------------------------------- /images/new: -------------------------------------------------------------------------------- 1 | k 2 | -------------------------------------------------------------------------------- /notebooks/UNET_G4G_2019_Parking.ipynb: -------------------------------------------------------------------------------- 1 | {"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"UNET_G4G_2019_Parking.ipynb","provenance":[],"private_outputs":true,"collapsed_sections":[],"toc_visible":true,"machine_shape":"hm"},"kernelspec":{"name":"python3","display_name":"Python 3"},"accelerator":"GPU"},"cells":[{"cell_type":"markdown","metadata":{"id":"view-in-github","colab_type":"text"},"source":["\"Open"]},{"cell_type":"code","metadata":{"id":"esIMGVxhDI0f","colab_type":"code","colab":{}},"source":["#@title Copyright 2019 Google LLC. { display-mode: \"form\" }\n","# Licensed under the Apache License, Version 2.0 (the \"License\");\n","# you may not use this file except in compliance with the License.\n","# You may obtain a copy of the License at\n","#\n","# https://www.apache.org/licenses/LICENSE-2.0\n","#\n","# Unless required by applicable law or agreed to in writing, software\n","# distributed under the License is distributed on an \"AS IS\" BASIS,\n","# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n","# See the License for the specific language governing permissions and\n","# limitations under the License."],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"_SHAc5qbiR8l","colab_type":"text"},"source":["# Introduction\n","\n","This is a Google Colab notebook demonstrating the process used to export training, evaluation, and prediction data from Google Earth Engine for a [Deeplab V3](https://arxiv.org/abs/1706.05587) convolutional neural network that delineates parking lots in [NAIP](https://www.fsa.usda.gov/programs-and-services/aerial-photography/imagery-programs/naip-imagery/) imagery. Model training and prediction are also demonstrated below."]},{"cell_type":"markdown","metadata":{"id":"_MJ4kW1pEhwP","colab_type":"text"},"source":["# Setup software libraries\n","\n","Install needed libraries to the notebook VM.
Authenticate as necessary."]},{"cell_type":"code","metadata":{"id":"neIa46CpciXq","colab_type":"code","colab":{}},"source":["# Cloud authentication.\n","from google.colab import auth\n","auth.authenticate_user()"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"4D6ArFWrckmS","colab_type":"code","colab":{}},"source":["# Earth Engine install to notebook VM.\n","!pip install earthengine-api"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"jat01FEoUMqg","colab_type":"code","colab":{}},"source":["# Import, authenticate and initialize the Earth Engine library.\n","import ee\n","ee.Authenticate()\n","ee.Initialize()"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"8RnZzcYhcpsQ","colab_type":"code","colab":{}},"source":["# Tensorflow setup.\n","import tensorflow as tf\n","\n","tf.enable_eager_execution()\n","print(tf.__version__)\n","\n","%load_ext tensorboard"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"n1hFdpBQfyhN","colab_type":"code","colab":{}},"source":["# Folium setup.\n","import folium\n","print(folium.__version__)\n","\n","# Define the URL format used for Earth Engine generated map tiles.\n","EE_TILES = 'https://earthengine.googleapis.com/map/{mapid}/{{z}}/{{x}}/{{y}}?token={token}'"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"WjUgYcsAs9Ed","colab_type":"text"},"source":["##Mount Google Drive"]},{"cell_type":"code","metadata":{"id":"JKDKpX4FtQA1","colab_type":"code","colab":{}},"source":["# Attach specified google drive directory to this notebook\n","from google.colab import drive\n","drive.mount('/content/drive')"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"M6pVAfdDIJ-a","colab_type":"code","colab":{}},"source":["%cd '/content/drive/My Drive/repos/Satellite_ComputerVision'"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"iT8ycmzClYwf","colab_type":"text"},"source":["# Variables\n","\n","Declare the variables that will be in use throughout the notebook."]},{"cell_type":"markdown","metadata":{"id":"qKs6HuxOzjMl","colab_type":"text"},"source":["## Specify your Cloud Storage Bucket\n","You must have write access to a bucket to run this demo! To run it read-only, use the demo bucket below, but note that writes to this bucket will not work."]},{"cell_type":"code","metadata":{"id":"obDDH1eDzsch","colab_type":"code","colab":{}},"source":["# This is read-only:\n","BUCKET = 'cvod-203614-mlengine'"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"wmfKLl9XcnGJ","colab_type":"text"},"source":["## Set other global variables"]},{"cell_type":"code","metadata":{"id":"psz7wJKalaoj","colab_type":"code","colab":{}},"source":["# Specify names locations for outputs in Cloud Storage. 
\n","FOLDER = 'LI_parking'\n","PRED_BASE = 'data/predict'\n","TRAINING_BASE = 'data/training'\n","EVAL_BASE = 'data/eval'\n","MODEL_BASE = 'models'\n","log_dir = 'drive/My Drive/Tensorflow/models/UNET256'\n","\n","# Specify inputs (Landsat bands) to the model and the response variable.\n","opticalBands = ['R', 'G', 'B']\n","thermalBands = ['B8', 'B11', 'B12']\n","pcaBands = ['pc1', 'pc2', 'pc3']\n","BANDS = opticalBands# + thermalBands# + pcaBands\n","RESPONSE = 'landcover'\n","FEATURES = BANDS + [RESPONSE]\n","\n","# Specify the size and shape of patches expected by the model.\n","KERNEL_SIZE = 512\n","KERNEL_SHAPE = [KERNEL_SIZE, KERNEL_SIZE]\n","COLUMNS = [\n"," tf.io.FixedLenFeature(shape=KERNEL_SHAPE, dtype=tf.float32) for k in FEATURES\n","]\n","FEATURES_DICT = dict(zip(FEATURES, COLUMNS))\n","\n","# Sizes of the training and evaluation datasets.\n","TRAIN_SIZE = 8000\n","EVAL_SIZE = 5000\n","\n","# Specify model training parameters.\n","BATCH_SIZE = 16\n","EPOCHS = 20\n","BUFFER_SIZE = 8000\n","OPTIMIZER = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)\n","LOSS = 'binary_crossentropy'\n","METRICS = [tf.keras.metrics.categorical_accuracy, tf.keras.metrics.MeanIoU(num_classes=2)]"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"hgoDc7Hilfc4","colab_type":"text"},"source":["# Imagery\n","\n","Gather and setup the imagery to use for inputs (predictors). This is a three-year, cloud-free, Landsat 8 composite. Display it in the notebook for a sanity check."]},{"cell_type":"code","metadata":{"id":"-IlgXu-vcUEY","colab_type":"code","colab":{}},"source":["# Use Landsat 8 surface reflectance data.\n","NAIP = ee.ImageCollection(\"USDA/NAIP/DOQQ\")\n","towns = ee.FeatureCollection(\"users/defendersofwildlifeGIS/LongIsland/towns\")\n","\n","begin = '2017-01-01'\n","end = '2017-12-30'\n","\n","# The image input data is a cloud-masked median composite.\n","image = NAIP.filterDate(begin, end)\\\n",".filterBounds(towns)\\\n",".filterDate(begin, end)\\\n",".median()\\\n",".select(BANDS)\\\n",".clip(towns)\n","\n","# Use folium to visualize the imagery.\n","mapid = image.getMapId({'bands': ['R', 'G', 'B'], 'min': 0, 'max': 256})\n","map = folium.Map(location=[40.8175, -73.195])\n","folium.TileLayer(\n"," tiles=EE_TILES.format(**mapid),\n"," attr='Google Earth Engine',\n"," overlay=True,\n"," name='median composite',\n"," ).add_to(map)\n","\n","map.add_child(folium.LayerControl())\n","map"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"gHznnctkJsZJ","colab_type":"text"},"source":["Prepare the response (what we want to predict). This is impervious surface area (in fraction of a pixel) from the 2016 NLCD dataset. 
Display to check."]},{"cell_type":"code","metadata":{"id":"5Wxz9BPYHBwh","colab_type":"code","colab":{}},"source":["def set_landcover(ft):\n"," return ft.set('label', 1)\n","\n","nassauParkingFootprints = ee.FeatureCollection(\"users/defendersofwildlifeGIS/LongIsland/NassauParking\")\n","suffolkParkingFootprints = ee.FeatureCollection('users/defendersofwildlifeGIS/LongIsland/SuffolkParking')\n","parkingFootprints = nassauParkingFootprints.merge(suffolkParkingFootprints)\n","parking = parkingFootprints.map(set_landcover)\n","blankimg = ee.Image.constant(0)\n","parking_footprint = parking.reduceToImage(['label'], ee.Reducer.first())\n","labelimg = blankimg.where(parking_footprint, parking_footprint).rename('landcover')\n","\n","mapid = labelimg.getMapId({'bands': 'landcover', 'min':0, 'max': 1})\n","print(mapid)\n","map = folium.Map(location = [40.8175, -73.195])\n","folium.TileLayer(\n"," tiles = EE_TILES.format(**mapid),\n"," attr='Google Earth Engine',\n"," overlay = True,\n"," name = 'parking lots',\n",").add_to(map)\n","map.add_child(folium.LayerControl())\n","map"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"CTS7_ZzPDhhg","colab_type":"text"},"source":["Stack the 2D images (Landsat composite and NLCD impervious surface) to create a single image from which samples can be taken. Convert the image into an array image in which each pixel stores 256x256 patches of pixels for each band. This is a key step that bears emphasis: to export training patches, convert a multi-band image to [an array image](https://developers.google.com/earth-engine/arrays_array_images#array-images) using [`neighborhoodToArray()`](https://developers.google.com/earth-engine/api_docs#eeimageneighborhoodtoarray), then sample the image at points."]},{"cell_type":"code","metadata":{"id":"eGHYsdAOipa4","colab_type":"code","colab":{}},"source":["featureStack = ee.Image.cat([\n"," image.select(BANDS),\n"," labelimg.select(RESPONSE)\n","]).float()\n","\n","print(featureStack.bandNames().getInfo())\n","\n","list = ee.List.repeat(1, KERNEL_SIZE)\n","lists = ee.List.repeat(list, KERNEL_SIZE)\n","kernel = ee.Kernel.fixed(KERNEL_SIZE, KERNEL_SIZE, lists)\n","\n","arrays = featureStack.neighborhoodToArray(kernel)"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"F4djSxBRG2el","colab_type":"text"},"source":["Use some pre-made geometries to sample the stack in strategic locations. Specifically, these are hand-made polygons in which to take the 256x256 samples. 
Display the sampling polygons on a map, red for training polygons, blue for evaluation."]},{"cell_type":"code","metadata":{"id":"ure_WaD0itQY","colab_type":"code","colab":{}},"source":["import re\n","towns = towns.randomColumn('random', 52.0)\n","townList = ee.List(towns.aggregate_array('TOWN')).distinct()\n","townList = [townList.get(town).getInfo() for town in range(townList.size().getInfo())]\n","townList = [town for town in townList if not re.search(r\"City|Water|Indian\", town)]\n","trainList = townList[0:(len(townList)//10) * 8]\n","evalList = townList[(len(townList)//10) * 8:]\n","\n","trainFilter = ee.Filter.inList('TOWN', ee.List(trainList))\n","evalFilter = ee.Filter.inList(\"TOWN\", ee.List(evalList))\n","\n","trainingPolys = towns.filter(trainFilter)\n","print('training size', len(trainList))\n","\n","evalPolys = towns.filter(evalFilter)\n","print('eval size', len(evalList))\n","\n","polyImage = ee.Image(0).byte().paint(trainingPolys, 1).paint(evalPolys, 2)\n","polyImage = polyImage.updateMask(polyImage)\n","\n","mapid = polyImage.getMapId({'min': 1, 'max': 2, 'palette': ['red', 'blue']})\n","map = folium.Map(location=[40.8175, -73.195], zoom_start=8)\n","folium.TileLayer(\n"," tiles=EE_TILES.format(**mapid),\n"," attr='Google Earth Engine',\n"," overlay=True,\n"," name='training polygons',\n"," ).add_to(map)\n","map.add_child(folium.LayerControl())\n","map"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"ZV890gPHeZqz","colab_type":"text"},"source":["# Sampling\n","\n","The mapped data look reasonable so take a sample from each polygon and merge the results into a single export. The key step is sampling the array image at points, to get all the pixels in a 256x256 neighborhood at each point. It's worth noting that to build the training and testing data for the FCNN, you export a single TFRecord file that contains patches of pixel values in each record. You do NOT need to export each training/testing patch to a different image. Since each record potentially contains a lot of data (especially with big patches or many input bands), some manual sharding of the computation is necessary to avoid the `computed value too large` error. 
Specifically, the following code takes multiple (smaller) samples within each geometry, merging the results to get a single export."]},{"cell_type":"code","metadata":{"id":"FyRpvwENxE-A","colab_type":"code","cellView":"code","colab":{}},"source":["#@title Don't run\n","# Convert the feature collections to lists for iteration.\n","#trainingPolysList = trainingPolys.toList(trainingPolys.size())\n","#evalPolysList = trainingPolys.toList(trainingPolys.size())\n","\n","# These numbers determined experimentally.\n","n = 100 # Number of shards in each town.\n","N = 1000 # Total sample size in each town.\n","\n","for town in trainList:\n"," geomSample = ee.FeatureCollection([])\n"," for i in range (n):\n"," sample = arrays.sample(\n"," region = trainingPolys.filterMetadata('TOWN', 'equals', town),\n"," scale = 1,\n"," numPixels = N/n,\n"," seed = i,\n"," tileScale = 8\n"," )\n"," geomSample = geomSample.merge(sample)\n"," \n"," desc = 'DeepLab_' + str(KERNEL_SIZE) + '_NAIP_' + town\n"," task = ee.batch.Export.table.toCloudStorage(\n"," collection = geomSample,\n"," description = desc,\n"," bucket = BUCKET,\n"," fileNamePrefix = FOLDER + '/' + TRAINING_BASE + '/' + desc,\n"," fileFormat = 'TFRecord',\n"," selectors = BANDS + [RESPONSE]\n"," )\n"," task.start()\n"," \n","for town in evalList:\n"," geomSample = ee.FeatureCollection([])\n"," for i in range(n):\n"," sample = arrays.sample(\n"," region = evalPolys.filterMetadata('TOWN', 'equals', town),\n"," scale = 1,\n"," numPixels = N/n,\n"," seed = i,\n"," tileScale = 8\n"," )\n"," geomSample = geomSample.merge(sample)\n"," \n"," desc = 'DeepLab_' + str(KERNEL_SIZE) + 'NAIP_' + town\n"," task = ee.batch.Export.table.toCloudStorage(\n"," collection = geomSample,\n"," description = desc,\n"," bucket = BUCKET,\n"," fileNamePrefix = FOLDER + '/' + EVAL_BASE + '/' + desc,\n"," fileFormat = 'TFRecord',\n"," selectors = BANDS + [RESPONSE]\n"," )\n"," task.start()\n"," \n","#Export all the training data (in many pieces), with one task \n","#per geometry.\n","# for g in range(trainingPolys.size().getInfo()):\n","# geomSample = ee.FeatureCollection([])\n","# for i in range(n):\n","# sample = arrays.sample(\n","# region = ee.Feature(trainingPolysList.get(g)).geometry(), \n","# scale = 30, \n","# numPixels = N / n, # Size of the shard.\n","# seed = i,\n","# tileScale = 8\n","# )\n","# geomSample = geomSample.merge(sample)\n"," \n","# desc = TRAINING_BASE + '_g' + str(g)\n","# task = ee.batch.Export.table.toCloudStorage(\n","# collection = geomSample,\n","# description = desc, \n","# bucket = BUCKET, \n","# fileNamePrefix = FOLDER + '/' + desc,\n","# fileFormat = 'TFRecord',\n","# selectors = BANDS + [RESPONSE]\n","# )\n","# task.start()\n","\n","# # Export all the evaluation data.\n","# for g in range(evalPolys.size().getInfo()):\n","# geomSample = ee.FeatureCollection([])\n","# for i in range(n):\n","# sample = arrays.sample(\n","# region = ee.Feature(evalPolysList.get(g)).geometry(), \n","# scale = 30, \n","# numPixels = N / n,\n","# seed = i,\n","# tileScale = 8\n","# )\n","# geomSample = geomSample.merge(sample)\n"," \n","# desc = EVAL_BASE + '_g' + str(g)\n","# task = ee.batch.Export.table.toCloudStorage(\n","# collection = geomSample,\n","# description = desc, \n","# bucket = BUCKET, \n","# fileNamePrefix = FOLDER + '/' + desc,\n","# fileFormat = 'TFRecord',\n","# selectors = BANDS + [RESPONSE]\n","# )\n","# 
task.start()"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"dk51-l7MH2Sa","colab_type":"text"},"source":["##Preprocessing\n","Define functions that apply random manipulations to our training data"]},{"cell_type":"code","metadata":{"id":"ajyp48-vINuy","colab_type":"code","colab":{}},"source":["def augColor(x):\n"," \"\"\"Color augmentation\n","\n"," Args:\n"," x: Image\n","\n"," Returns:\n"," Augmented image\n"," \"\"\"\n"," x = tf.image.random_hue(x, 0.08)\n"," x = tf.image.random_saturation(x, 0.6, 1.6)\n"," x = tf.image.random_brightness(x, 0.05)\n"," x = tf.image.random_contrast(x, 0.7, 1.3)\n"," return x\n"," \n"," \n","def augImg(img):\n"," outDims = tf.shape(img)[0:1]\n"," x = tf.image.random_flip_left_right(img)\n"," x = tf.image.random_flip_up_down(x)\n"," x = rotated = tf.image.rot90(x, tf.random_uniform(shape=[], minval=0, maxval=4, dtype=tf.int32))\n"," #x = zoom(x, outDims)\n"," #since were gonna map_fn this on a 4d image, output must be 3d, so squeeze the artificial 'sample' dimension\n"," return tf.squeeze(x)\n","\n","def preprocess(img, labels):\n"," dims = tf.shape(img)\n"," #need to combine labels and bands for morphological transformations\n"," comb = tf.concat([img, tf.expand_dims(labels, axis = 2)], axis = 2)\n"," aug = aug_img(comb)\n"," #aug = tf.map_fn(fn = aug_img, elems = comb)\n"," labels = tf.squeeze(aug[:, :, -1])\n"," band_stack = color(aug[:, :, 0:dims[2]])\n"," return band_stack, labels"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"rWXrvBE4607G","colab_type":"text"},"source":["# Training data\n","\n","Load the data exported from Earth Engine into a `tf.data.Dataset`. The following are helper functions for that."]},{"cell_type":"code","metadata":{"id":"WWZ0UXCVMyJP","colab_type":"code","colab":{}},"source":["def parse_tfrecord(example_proto):\n"," \"\"\"The parsing function.\n"," Read a serialized example into the structure defined by FEATURES_DICT.\n"," Args:\n"," example_proto: a serialized Example.\n"," Returns: \n"," A dictionary of tensors, keyed by feature name.\n"," \"\"\"\n"," return tf.io.parse_single_example(example_proto, FEATURES_DICT)\n","\n","\n","def to_tuple(inputs):\n"," \"\"\"Function to convert a dictionary of tensors to a tuple of (inputs, outputs).\n"," Turn the tensors returned by parse_tfrecord into a stack in HWC shape.\n"," Args:\n"," inputs: A dictionary of tensors, keyed by feature name.\n"," Returns: \n"," A dtuple of (inputs, outputs).\n"," \"\"\"\n"," inputsList = [inputs.get(key) for key in FEATURES]\n"," stacked = tf.stack(inputsList, axis=0)\n"," # Convert from CHW to HWC\n"," stacked = tf.transpose(stacked, [1, 2, 0])\n"," stacked = augImg(stacked)\n"," return stacked[:,:,:len(BANDS)], stacked[:,:,len(BANDS):]\n","\n","\n","def get_dataset(pattern):\n"," \"\"\"Function to read, parse and format to tuple a set of input tfrecord files.\n"," Get all the files matching the pattern, parse and convert to tuple.\n"," Args:\n"," pattern: A file pattern to match in a Cloud Storage bucket.\n"," Returns: \n"," A tf.data.Dataset\n"," \"\"\"\n"," glob = tf.gfile.Glob(pattern)\n"," dataset = tf.data.TFRecordDataset(glob, compression_type='GZIP')\n"," dataset = dataset.map(parse_tfrecord, num_parallel_calls=5)\n"," dataset = dataset.map(to_tuple, num_parallel_calls=5)\n"," return dataset"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"Xg1fa18336D2","colab_type":"text"},"source":["Use the helpers to read in the training dataset. 
Print the first record to check."]},{"cell_type":"code","metadata":{"id":"rm0qRF0fAYcC","colab_type":"code","colab":{}},"source":["def get_training_dataset():\n","\t\"\"\"Get the preprocessed training dataset\n"," Returns: \n"," A tf.data.Dataset of training data.\n"," \"\"\"\n","\tglob = 'gs://' + BUCKET + '/' + FOLDER + '/' + TRAINING_BASE + '/*'\n","\tprint(glob)\n","\tdataset = get_dataset(glob)\n","\tdataset = dataset.shuffle(8000).batch(BATCH_SIZE).repeat()\n","\treturn dataset\n","\n","training = get_training_dataset()\n","\n","print(iter(training.take(1)).next())"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"7CRGG26bYWQZ","colab_type":"code","colab":{}},"source":["print(iter(training.take(1)).next())"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"j-cQO5RL6vob","colab_type":"text"},"source":["# Evaluation data\n","\n","Now do the same thing to get an evaluation dataset. Note that unlike the training dataset, the evaluation dataset has a batch size of 1, is not repeated and is not shuffled."]},{"cell_type":"code","metadata":{"id":"ieKTCGiJ6xzo","colab_type":"code","colab":{}},"source":["def get_eval_dataset():\n","\t\"\"\"Get the preprocessed evaluation dataset\n"," Returns: \n"," A tf.data.Dataset of evaluation data.\n"," \"\"\"\n","\tglob = 'gs://' + BUCKET + '/' + FOLDER + '/' + EVAL_BASE + '/*'\n","\tprint(glob)\n","\tdataset = get_dataset(glob)\n","\tdataset = dataset.batch(1).repeat()\n","\treturn dataset\n","\n","evaluation = get_eval_dataset()"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"keoalUvBbSkh","colab_type":"code","colab":{}},"source":["print(iter(evaluation.take(1)).next())"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"9JIE7Yl87lgU","colab_type":"text"},"source":["# Model\n","\n","Here we use the Keras implementation of the U-Net model as found [in the TensorFlow examples](https://github.com/tensorflow/models/blob/master/samples/outreach/blogs/segmentation_blogpost/image_segmentation.ipynb). The U-Net model takes 256x256 pixel patches as input and outputs per-pixel class probability, label or a continuous output. We can implement the model essentially unmodified, but will use mean squared error loss on the sigmoidal output since we are treating this as a regression problem, rather than a classification problem. 
Since impervious surface fraction is constrained to [0,1], with many values close to zero or one, a saturating activation function is suitable here."]},{"cell_type":"markdown","metadata":{"id":"Xh2EZyyPu84H","colab_type":"text"},"source":["##Metrics"]},{"cell_type":"code","metadata":{"id":"mISCOXUHu7G_","colab_type":"code","colab":{}},"source":["def weighted_bce(y_true, y_pred):\n"," bce = tf.nn.weighted_cross_entropy_with_logits(labels = y_true, logits = y_pred, pos_weight = 20)\n"," return tf.reduce_mean(bce)\n","\n","def iou(true, pred):\n","\n"," intersection = true * pred\n","\n"," notTrue = 1 - true\n"," union = true + (notTrue * pred)\n","\n"," return tf.reduce_sum(intersection)/tf.reduce_sum(union)\n","\n"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"wsnnnz56yS3l","colab_type":"code","colab":{}},"source":["from tensorflow.python.keras import layers\n","from tensorflow.python.keras import losses\n","from tensorflow.python.keras import models\n","from tensorflow.python.keras import metrics\n","from tensorflow.python.keras import optimizers\n","\n","def conv_block(input_tensor, num_filters):\n","\tencoder = layers.Conv2D(num_filters, (3, 3), padding='same')(input_tensor)\n","\tencoder = layers.BatchNormalization()(encoder)\n","\tencoder = layers.Activation('relu')(encoder)\n","\tencoder = layers.Conv2D(num_filters, (3, 3), padding='same')(encoder)\n","\tencoder = layers.BatchNormalization()(encoder)\n","\tencoder = layers.Activation('relu')(encoder)\n","\treturn encoder\n","\n","def encoder_block(input_tensor, num_filters):\n","\tencoder = conv_block(input_tensor, num_filters)\n","\tencoder_pool = layers.MaxPooling2D((2, 2), strides=(2, 2))(encoder)\n","\treturn encoder_pool, encoder\n","\n","def decoder_block(input_tensor, concat_tensor, num_filters):\n","\tdecoder = layers.Conv2DTranspose(num_filters, (2, 2), strides=(2, 2), padding='same')(input_tensor)\n","\tdecoder = layers.concatenate([concat_tensor, decoder], axis=-1)\n","\tdecoder = layers.BatchNormalization()(decoder)\n","\tdecoder = layers.Activation('relu')(decoder)\n","\tdecoder = layers.Conv2D(num_filters, (3, 3), padding='same')(decoder)\n","\tdecoder = layers.BatchNormalization()(decoder)\n","\tdecoder = layers.Activation('relu')(decoder)\n","\tdecoder = layers.Conv2D(num_filters, (3, 3), padding='same')(decoder)\n","\tdecoder = layers.BatchNormalization()(decoder)\n","\tdecoder = layers.Activation('relu')(decoder)\n","\treturn decoder\n","\n","def get_model():\n","\tinputs = layers.Input(shape=[None, None, len(BANDS)]) # 256\n","\tencoder0_pool, encoder0 = encoder_block(inputs, 32) # 128\n","\tencoder1_pool, encoder1 = encoder_block(encoder0_pool, 64) # 64\n","\tencoder2_pool, encoder2 = encoder_block(encoder1_pool, 128) # 32\n","\tencoder3_pool, encoder3 = encoder_block(encoder2_pool, 256) # 16\n","\tencoder4_pool, encoder4 = encoder_block(encoder3_pool, 512) # 8\n","\tcenter = conv_block(encoder4_pool, 1024) # center\n","\tdecoder4 = decoder_block(center, encoder4, 512) # 16\n","\tdecoder3 = decoder_block(decoder4, encoder3, 256) # 32\n","\tdecoder2 = decoder_block(decoder3, encoder2, 128) # 64\n","\tdecoder1 = decoder_block(decoder2, encoder1, 64) # 128\n","\tdecoder0 = decoder_block(decoder1, encoder0, 32) # 256\n","\toutputs = layers.Conv2D(1, (1, 1), activation='sigmoid')(decoder0)\n","\n","\tmodel = models.Model(inputs=[inputs], outputs=[outputs])\n","\n","\tmodel.compile(\n","\t\toptimizer=OPTIMIZER, \n"," loss = 
weighted_bce,\n","\t\t#loss=losses.get(LOSS),\n","\t\tmetrics=[metrics.get(metric) for metric in METRICS])\n","\n","\treturn model\n","\n","\n","log_dir = 'drive/My Drive/Tensorflow/models/UNET256'\n","\n","checkpoint = tf.keras.callbacks.ModelCheckpoint(\n"," log_dir+'best_weights.hdf5',\n"," monitor='val_mean_io_u',\n"," verbose=1,\n"," save_best_only=True,\n"," mode='max'\n"," )"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"uu_E7OTDBCoS","colab_type":"text"},"source":["# Training the model\n","\n","You train a Keras model by calling `.fit()` on it. Here we're going to train for 10 epochs, which is suitable for demonstration purposes. For production use, you probably want to optimize this parameter, for example through [hyperparamter tuning](https://cloud.google.com/ml-engine/docs/tensorflow/using-hyperparameter-tuning)."]},{"cell_type":"code","metadata":{"id":"NzzaWxOhSxBy","colab_type":"code","colab":{}},"source":["m = get_model()\n","\n","\n","m.fit(\n"," x=training, \n"," epochs=EPOCHS, \n"," steps_per_epoch=int(TRAIN_SIZE / BATCH_SIZE), \n"," validation_data=evaluation,\n"," validation_steps=EVAL_SIZE/BATCH_SIZE,\n"," callbacks = [checkpoint]\n"," )\n","\n","m.save('drive/My Drive/Tensorflow/models/UNET256/UNET256.h5')\n","\n","#!gsutil cp best_weights.hdf5 gs://cvod-203614-mlengine/NC_solar/models/UNET256/best_weights.hdf5\n","#!gsutil cp UNET256.h5 gs://cvod-203614-mlengine/NC_solar/models/UNET256/UNET256.h5"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"U2XrwZHp66j4","colab_type":"text"},"source":["Note that the notebook VM is sometimes not heavy-duty enough to get through a whole training job, especially if you have a large buffer size or a large number of epochs. You can still use this notebook for training, but may need to set up an alternative VM ([learn more](https://research.google.com/colaboratory/local-runtimes.html)) for production use. Alternatively, you can package your code for running large training jobs on Google's AI Platform [as described here](https://cloud.google.com/ml-engine/docs/tensorflow/trainer-considerations). 
The following code loads a pre-trained model, which you can use for predictions right away."]},{"cell_type":"markdown","metadata":{"id":"zvIqqpNXqJSE","colab_type":"text"},"source":["##Load model and resume training"]},{"cell_type":"code","metadata":{"id":"q0xgBhsaqInV","colab_type":"code","colab":{}},"source":["#bring in the architecture and best weights from GCS\n","m = models.load_model('drive/My Drive/Tensorflow/models/UNET256/UNET256.h5', custom_objects={'weighted_bce': weighted_bce})\n","m.load_weights('drive/My Drive/Tensorflow/models/UNET256/best_weights.hdf5') \n","\n","#lets see where were at\n","evalMetrics = m.evaluate(x=evaluation, steps = EVAL_SIZE, verbose = 1)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"xlsFciElxOUA","colab_type":"code","colab":{}},"source":["#set the monitored value (val_mean_io_u) to current evaluation output\n","checkpoint = tf.keras.callbacks.ModelCheckpoint(\n"," log_dir+'best_weights.hdf5',\n"," monitor='val_mean_io_u',\n"," verbose=1,\n"," save_best_only=True,\n"," mode='max'\n"," )\n","\n","checkpoint.best = evalMetrics[2]\n","print(checkpoint.__dict__)\n","print(checkpoint.best)"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"7eq0aLlw864A","colab_type":"text"},"source":["## Set up tensorboard"]},{"cell_type":"code","metadata":{"id":"PA2gJENE8-J1","colab_type":"code","colab":{}},"source":["tensorboard = tf.keras.callbacks.TensorBoard(log_dir= 'drive/My Drive/Tensorflow/models/UNET256')"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"Ty8wCxDtqWBM","colab_type":"code","colab":{}},"source":["#Now keep training!\n","m.fit(\n"," x=training, \n"," epochs= 10, \n"," steps_per_epoch=int(TRAIN_SIZE / BATCH_SIZE), \n"," validation_data=evaluation,\n"," validation_steps=EVAL_SIZE/BATCH_SIZE,\n"," callbacks = [checkpoint, tensorboard]\n"," )\n","#m.save('drive/My Drive/Tensorflow/models/UNET256/UNET256.h5')"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"tyhWcGHJ82e8","colab_type":"code","colab":{}},"source":["m.save('drive/My Drive/Tensorflow/models/UNET256/UNET256.h5')"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"i9OM5BiS1xYQ","colab_type":"code","colab":{}},"source":["%tensorboard --logdir 'drive/My Drive/Tensorflow/models/UNET256'"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"-RJpNfEUS1qp","colab_type":"code","colab":{}},"source":["# Load a trained model. 50 epochs. 25 hours. Final RMSE ~0.08.\n","MODEL_DIR = BUCKET + '/' + FOLDER + '/' + 'models/UNET256'\n","m = tf.contrib.saved_model.load_keras_model(MODEL_DIR)\n","m.summary()"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"J1ySNup0xCqN","colab_type":"text"},"source":["# Prediction\n","\n","The prediction pipeline is:\n","\n","1. Export imagery on which to do predictions from Earth Engine in TFRecord format to a Cloud Storge bucket.\n","2. Use the trained model to make the predictions.\n","3. Write the predictions to a TFRecord file in a Cloud Storage.\n","4. Upload the predictions TFRecord file to Earth Engine.\n","\n","The following functions handle this process. 
It's useful to separate the export from the predictions so that you can experiment with different models without running the export every time."]},{"cell_type":"code","metadata":{"id":"lv6nb0ShH4_T","colab_type":"code","colab":{}},"source":["#Inspect the prediction outputs\n","predictions = m.predict(evaluation, steps=1, verbose=1)\n","for prediction in predictions:\n"," print(predictions)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"M3WDAa-RUpXP","colab_type":"code","colab":{}},"source":["def doExport(image, out_image_base, kernel_buffer, region):\n"," \"\"\"Run the image export task. Block until complete.\n"," \"\"\"\n"," task = ee.batch.Export.image.toCloudStorage(\n"," image = image.select(BANDS+[RESPONSE]), \n"," description = out_image_base, \n"," bucket = BUCKET, \n"," fileNamePrefix = FOLDER + '/' + PRED_BASE + '/' + out_image_base, \n"," region = region.getInfo()['coordinates'], \n"," scale = 1, \n"," fileFormat = 'TFRecord', \n"," maxPixels = 1e10,\n"," formatOptions = { \n"," 'patchDimensions': KERNEL_SHAPE,\n"," 'kernelSize': kernel_buffer,\n"," 'compressed': True,\n"," 'maxFileSize': 104857600\n"," }\n"," )\n"," task.start()\n","\n"," # Block until the task completes.\n"," print('Running image export to Cloud Storage...')\n"," import time\n"," while task.active():\n"," time.sleep(30)\n","\n"," # Error condition\n"," if task.status()['state'] != 'COMPLETED':\n"," print('Error with image export.')\n"," else:\n"," print('Image export completed.')"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"zb_9_FflygVw","colab_type":"code","colab":{}},"source":["def doPrediction(out_image_base, user_folder, kernel_buffer, region):\n"," \"\"\"Perform inference on exported imagery, upload to Earth Engine.\n"," \"\"\"\n","\n"," print('Looking for TFRecord files...')\n"," \n"," # Get a list of all the files in the output bucket.\n"," filesList = !gsutil ls 'gs://'{BUCKET}'/'{FOLDER}'/'{PRED_BASE}\n"," # Get only the files generated by the image export.\n"," exportFilesList = [s for s in filesList if out_image_base in s]\n","\n"," # Get the list of image files and the JSON mixer file.\n"," imageFilesList = []\n"," jsonFile = None\n"," for f in exportFilesList:\n"," if f.endswith('.tfrecord.gz'):\n"," imageFilesList.append(f)\n"," elif f.endswith('.json'):\n"," jsonFile = f\n","\n"," # Make sure the files are in the right order.\n"," imageFilesList.sort()\n","\n"," from pprint import pprint\n"," pprint(imageFilesList)\n"," print(jsonFile)\n"," \n"," import json\n"," # Load the contents of the mixer file to a JSON object.\n"," jsonText = !gsutil cat {jsonFile}\n"," # Get a single string w/ newlines from the IPython.utils.text.SList\n"," mixer = json.loads(jsonText.nlstr)\n"," pprint(mixer)\n"," patches = mixer['totalPatches']\n"," \n"," # Get set up for prediction.\n"," x_buffer = int(kernel_buffer[0] / 2)\n"," y_buffer = int(kernel_buffer[1] / 2)\n","\n"," buffered_shape = [\n"," KERNEL_SHAPE[0] + kernel_buffer[0],\n"," KERNEL_SHAPE[1] + kernel_buffer[1]]\n","\n"," imageColumns = [\n"," tf.FixedLenFeature(shape=buffered_shape, dtype=tf.float32) \n"," for k in BANDS\n"," ]\n","\n"," imageFeaturesDict = dict(zip(BANDS, imageColumns))\n","\n"," def parse_image(example_proto):\n"," return tf.parse_single_example(example_proto, imageFeaturesDict)\n","\n"," def toTupleImage(dict):\n"," inputsList = [dict.get(key) for key in BANDS]\n"," stacked = tf.stack(inputsList, axis=0)\n"," stacked = tf.transpose(stacked, [1, 2, 0])\n"," return stacked\n"," 
\n"," # Create a dataset from the TFRecord file(s) in Cloud Storage.\n"," imageDataset = tf.data.TFRecordDataset(imageFilesList, compression_type='GZIP')\n"," imageDataset = imageDataset.map(parse_image, num_parallel_calls=5)\n"," imageDataset = imageDataset.map(toTupleImage).batch(1)\n"," \n"," # Perform inference.\n"," print('Running predictions...')\n"," predictions = m.predict(imageDataset, steps=patches, verbose=1)\n"," # print(predictions[0])\n","\n"," print('Writing predictions...')\n"," out_image_file = 'gs://' + BUCKET + '/' + FOLDER + '/' + PRED_BASE + '/outputs/' + out_image_base + '.TFRecord'\n"," writer = tf.python_io.TFRecordWriter(out_image_file)\n"," patches = 0\n"," for predictionPatch in predictions:\n"," print('Writing patch ' + str(patches) + '...')\n"," predictionPatch = predictionPatch[\n"," x_buffer:x_buffer+KERNEL_SIZE, y_buffer:y_buffer+KERNEL_SIZE]\n","\n"," # Create an example.\n"," example = tf.train.Example(\n"," features=tf.train.Features(\n"," feature={\n"," 'probability': tf.train.Feature(\n"," float_list=tf.train.FloatList(\n"," value=predictionPatch.flatten()))\n"," }\n"," )\n"," )\n"," # Write the example.\n"," writer.write(example.SerializeToString())\n"," patches += 1\n","\n"," writer.close()\n","\n"," # Start the upload.\n"," out_image_asset = user_folder + '/' + out_image_base\n"," !earthengine upload image --asset_id={out_image_asset} {out_image_file} {jsonFile}"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"LZqlymOehnQO","colab_type":"text"},"source":["Now there's all the code needed to run the prediction pipeline, all that remains is to specify the output region in which to do the prediction, the names of the output files, where to put them, and the shape of the outputs. In terms of the shape, the model is trained on 256x256 patches, but can work (in theory) on any patch that's big enough with even dimensions ([reference](https://www.cv-foundation.org/openaccess/content_cvpr_2015/papers/Long_Fully_Convolutional_Networks_2015_CVPR_paper.pdf)). Because of tile boundary artifacts, give the model slightly larger patches for prediction, then clip out the middle 256x256 patch. This is controlled with a kernel buffer, half the size of which will extend beyond the kernel buffer. For example, specifying a 128x128 kernel will append 64 pixels on each side of the patch, to ensure that the pixels in the output are taken from inputs completely covered by the kernel. 
"]},{"cell_type":"code","metadata":{"id":"FPANwc7B1-TS","colab_type":"code","colab":{}},"source":["# This has a read-only asset in it:\n","user_folder = 'users/defendersofwildlifeGIS'\n","\n","# Base file name to use for TFRecord files and assets.\n","li_image_base = 'li_parking_deeplab512Pred'\n","# Half this will extend on the sides of each patch.\n","li_kernel_buffer = [256, 256]\n","# Huntington\n","li_region = ee.Feature(towns.filterMetadata(\"TOWN\", 'equals', 'Huntington').first()).geometry()\n","print(li_region.area().getInfo())"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"lLNEOLkXWvSi","colab_type":"code","cellView":"both","colab":{}},"source":["#@title Don't run\n","# Run the export.\n","doExport(featureStack, li_image_base, li_kernel_buffer, li_region)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"KxACnxKFrQ_J","colab_type":"code","cellView":"both","colab":{}},"source":["#@title Don't run\n","# Run the prediction.\n","doPrediction(nc_image_base, user_folder, nc_kernel_buffer, nc_region)"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"uj_G9OZ1xH6K","colab_type":"text"},"source":["# Display the output\n","\n","One the data has been exported, the model has made predictions and the predictions have been written to a file, and the image imported to Earth Engine, it's possible to display the resultant Earth Engine asset. Here, display the impervious area predictions over Beijing, China."]},{"cell_type":"code","metadata":{"id":"Jgco6HJ4R5p2","colab_type":"code","colab":{}},"source":["out_image = ee.Image(user_folder + '/' + bj_image_base)\n","mapid = out_image.getMapId({'min': 0, 'max': 1})\n","map = folium.Map(location=[39.898, 116.5097])\n","folium.TileLayer(\n"," tiles=EE_TILES.format(**mapid),\n"," attr='Google Earth Engine',\n"," overlay=True,\n"," name='predicted impervious',\n"," ).add_to(map)\n","map.add_child(folium.LayerControl())\n","map"],"execution_count":0,"outputs":[]}]} 2 | -------------------------------------------------------------------------------- /utils/array_tools.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Mar 226 10:50:44 2023 4 | 5 | @author: MEvans 6 | """ 7 | 8 | import numpy as np 9 | import math 10 | from random import shuffle, randint, uniform 11 | 12 | def make_harmonics(times: np.ndarray, timesteps, dims): 13 | """Create arrays of sin and cos representations of time 14 | Parameters: 15 | times (np.ndarray): 1D array of start times 16 | timesteps (int): number of annual timesteps 17 | dims (tpl): H, W dimensions of output data 18 | Returns: 19 | np.ndarray: 4D array (B, (dims), 2) with 20 | """ 21 | xys = [sin_cos(time, timesteps) for time in times] # use the T dimension to get number of intervals 22 | # r = deg_to_radians(lat) # convert latitude to radians 23 | out = np.stack([np.stack([np.full(dims, x), np.full(dims, y)], axis = -1) for x,y in xys], axis = 0) 24 | return out 25 | 26 | def merge_classes(cond_array, trans, out_array): 27 | """Reclassify categorical array values 28 | Parameters 29 | --- 30 | cond_array: np.ndarray 31 | array with values to be evaluated by conditional expression 32 | trans: list[tpl] 33 | tuples containing condition and value to return where true 34 | array: np.ndarray 35 | array to be returned where condition false 36 | Returns 37 | --- 38 | np.darray 39 | reclassified array same shape and size as input 40 | """ 41 | output = 
np.copy(out_array) 42 | for x,y in trans: 43 | output[cond_array == x] = y 44 | return output 45 | 46 | 47 | def normalize_array(img, axes=[2], epsilon=1e-8, moments = None, splits = None): 48 | """ 49 | Standardize incoming image patches by mean and variance. 50 | 51 | Moments can be calculated based on patch data by providing axes: 52 | To standardize each pixel use axes = [2] 53 | To standardize each channel use axes = [0, 1] 54 | To standardize globally use axes = [0, 1, 2] 55 | 56 | To standardize by global, or per-channel moments supply a list of [mean, variance] tuples. 57 | To standardize groups of channels separately, identify the size of each group. Groups of 58 | channels must be stacked contiguously and group sizes must sum to the total # of channels 59 | 60 | Parameters 61 | --- 62 | img: np.ndarray 63 | nD image (usually 3d) to be normalized 64 | axes: list: int 65 | Array of ints. Axes along which to compute mean and variance, usually length n-1 66 | epsilon: float 67 | small number to avoid dividing by zero 68 | moments: list:tpl:int 69 | list of global mean, std tuples for standardization 70 | splits: list:int 71 | size(s) of groups of features to be kept together 72 | Return: 73 | tensor: nD image tensor normalized by channels 74 | """ 75 | 76 | # define a basic function to normalize a 3d tensor 77 | def normalize(img): 78 | # shape = tf.shape(x).numpy() 79 | # if we've defined global or per-channel moments... 80 | if moments: 81 | # cast moments to arrays for mean and variance 82 | mean = np.array([tpl[0] for tpl in moments], dtype = 'float32') 83 | std = np.array([tpl[1] for tpl in moments], dtype = 'float32') 84 | # otherwise, calculate moments along provided axes 85 | else: 86 | mean = np.nanmean(img, axes, keepdims = True) 87 | std = np.nanstd(img, axes, keepdims = True) 88 | # keepdims = True to ensure compatibility with input tensor 89 | 90 | # normalize the input tensor 91 | normed = (img - mean)/(std + epsilon) 92 | return normed 93 | 94 | # if splits are given, apply tensor normalization to each split 95 | if splits: 96 | splitLen = sum(splits) 97 | toNorm = img[:,:,0:splitLen] 98 | dontNorm = img[:,:,splitLen:] 99 | arrays = np.split(toNorm, splits, axis = -1) 100 | normed = [normalize(array) for array in arrays] 101 | normed.append(dontNorm) 102 | # gather normalized splits into single tensor 103 | img_normed = np.concatenate(normed, axis = -1) 104 | else: 105 | img_normed = normalize(img) 106 | 107 | return img_normed 108 | 109 | def rescale_array(img, axes = -1, epsilon=1e-8, moments = None, splits = None): 110 | """ 111 | Rescale incoming image patch to [0,1] based on min and max values 112 | 113 | Min, max can be calculated based on patch data by providing axes: 114 | To rescale each pixel use axes = [2] 115 | To rescale each channel use axes = [0, 1] 116 | To rescale globally use axes = [0, 1, 2] 117 | 118 | To rescale by global, or per-channel moments supply a list of [mean, variance] tuples. 119 | To rescale groups of channels separately, identify the size of each group. Groups of 120 | channels must be stacked contiguously and group sizes must sum to the total # of channels 121 | 122 | Parameters 123 | --- 124 | img: np.ndarray 125 | array to be rescaled, usually 3D (H,W,C) 126 | axes: list: int 127 | Array of ints. 
Axes along which to compute mean and variance, usually length n-1 128 | epsilon: float 129 | small number to avoid dividing by zero 130 | moments: list:tpl:int 131 | optional, list of global mean, std tuples for standardization 132 | splits: list:int 133 | optional, size(s) of groups of features to be kept together 134 | Return: 135 | tensor: 3D tensor of same shape as input, with values [0,1] 136 | """ 137 | def rescale(img): 138 | if moments: 139 | minimum = np.array([tpl[0] for tpl in moments], dtype = 'float32') 140 | maximum = np.array([tpl[1] for tpl in moments], dtype = 'float32') 141 | else: 142 | minimum = np.nanmin(img, axis = axes, keepdims = True) 143 | maximum = np.nanmax(img, axis = axes, keepdims = True) 144 | scaled = (img - minimum)/((maximum - minimum) + epsilon) 145 | # scaled = tf.divide(tf.subtract(img, minimum), tf.add(tf.subtract(maximum, minimum)) 146 | return scaled 147 | 148 | # if splits are given, apply tensor normalization to each split 149 | if splits: 150 | arrays = np.split(img, splits, axis = -1) 151 | rescaled = [rescale(array) for array in arrays] 152 | # gather normalized splits into single tensor 153 | img_rescaled = np.concat(rescaled, axis = -1) 154 | else: 155 | img_rescaled = rescale(img) 156 | 157 | return img_rescaled 158 | 159 | def aug_array_color(img: np.ndarray) -> np.ndarray: 160 | """Randomly change the brightness and contrast of an image 161 | Parameters 162 | --- 163 | img: np.ndarray 164 | image to be adjusted 165 | 166 | Return 167 | --- 168 | np.ndarray: input array with brightness and contrast adjusted 169 | """ 170 | dims = len(img.shape) 171 | n_ch = img.shape[-1] 172 | axes = (0,1) if dims == 3 else (1,2) 173 | 174 | contra_adj = 0.05 175 | bright_adj = 0.05 176 | 177 | ch_mean = np.nanmean(img, axis = axes, keepdims = True) 178 | # print('channel means', ch_mean) 179 | contra_mul = uniform(a = 1-contra_adj, b = 1+contra_adj) 180 | 181 | bright_mul = uniform(a = 1 - bright_adj, b = 1+bright_adj) 182 | 183 | recolored = (img - ch_mean) * contra_mul + (ch_mean * bright_mul) 184 | return recolored 185 | 186 | def aug_array_morph(img: np.ndarray, v_rand:bool = None, h_rand:bool = None, r_rand:int = None, return_tuple:bool = False) -> np.ndarray: 187 | """ 188 | Perform morphological image augmentation on image array 189 | Parameters: 190 | img (np.ndarray): 4D or 3D channels last image array 191 | Returns: 192 | np.ndarray: 3D channels last image array 193 | """ 194 | dims = list(range(len(img.shape))) 195 | v_axis = dims[-3] # channels last, vertical axis is always third last 196 | h_axis = dims[-2] # channels last, horizontal axis is always second last 197 | 198 | if v_rand is None: 199 | v_rand = uniform(0,1) < 0.5 200 | if h_rand is None: 201 | h_rand = uniform(0,1) < 0.5 202 | if r_rand is None: 203 | r_rand = randint(0,3) 204 | 205 | # flip array up/down 206 | x = np.flip(img, axis = v_axis) if v_rand else img 207 | x = np.flip(x, axis = h_axis) if h_rand else x 208 | x = np.rot90(x, r_rand, axes = (v_axis, h_axis)) 209 | 210 | if return_tuple: 211 | return x, v_rand, h_rand, r_rand 212 | else: 213 | return x 214 | 215 | def normalize_timeseries(arr, maxval = 10000, minval = 0, axis = -1, e = 0.00001): 216 | # normalize band values across timesteps 217 | normalized = (arr-minval)/(maxval-minval+e) 218 | # mn = np.nanmean(arr, axis = axis, keepdims = True) 219 | # std = np.nanstd(arr, axis = axis, keepdims = True) 220 | # normalized = (arr - mn)/(std+e) 221 | # replace nans with zeros? 
222 | finite = np.where(np.isnan(normalized), 0.0, normalized) 223 | return finite 224 | 225 | def rearrange_timeseries(arr: np.ndarray, nbands: int) -> np.ndarray: 226 | """ Randomly rearrange 3D images in a timeseries 227 | 228 | Changes the startpoint of a temporal sequence of 3D images stored in a 5D array 229 | while maintaining relative order. 230 | 231 | Parameters 232 | --- 233 | arr: np.ndarray 234 | 5D (B, T, H, W, C) array to be rearranged 235 | nbands: int 236 | size of the last array dimension corresponding to image bands/channels 237 | 238 | Returns 239 | --- 240 | np.ndarray 241 | 5D array of same size/shape as input 242 | """ 243 | 244 | # the number of time steps is in the 1st dimension if our data is (B, T, H, W, C) 245 | timesteps = arr.shape[1] 246 | # randomly pick one of the timesteps as the starting time 247 | starttime = randint(0, timesteps-1) 248 | # print('start', starttime) 249 | # grab all timesteps leading up to the timestep corresponding to our random first 250 | last = arr[:,0:starttime,:,:,:] 251 | print('last shape', last.shape) 252 | first = arr[:,starttime:timesteps,:,:,:] 253 | print('start shape', first.shape) 254 | rearranged = np.concatenate([first, last], axis = 1) 255 | assert rearranged.shape == arr.shape 256 | return(rearranged) 257 | 258 | def split_timeseries(arr: np.ndarray, nbands: int) -> tuple: 259 | """Divide a timeseries of 3D images into a series of images and labels 260 | 261 | Parameters 262 | --- 263 | arr: np.ndarray 264 | 5D (B, T, H, W, C) array to be split 265 | nbands: int, number of bands/channels retained in the label image 266 | Returns 267 | --- 268 | tuple 269 | a 5D array of image features and a 4D array of labels 270 | """ 271 | 272 | feats = arr[:,0:-1,:,:,:] 273 | labels = arr[:,-1,:,:,0:nbands] 274 | 275 | # confirm there are no all-nan images in labels 276 | batch_sums = np.sum(labels, axis = (1,2,3)) 277 | if 0.0 in batch_sums: 278 | print('all nan labels, reshuffling') 279 | feats, labels = split_timeseries(rearrange_timeseries(arr, nbands), nbands) 280 | 281 | return(feats, labels) 282 | 283 | def sin_cos(t:int, freq:int = 6) -> tuple: 284 | x = t/freq 285 | theta = 2*math.pi * x 286 | return (math.sin(theta), math.cos(theta)) 287 | 288 | def add_harmonic(timeseries: np.ndarray): 289 | """ add harmonic variables to an imagery timeseries. currently assumes first image is start of year 290 | B, T, H, W, C 291 | """ 292 | in_shape = timeseries.shape 293 | timesteps = in_shape[1] 294 | tpls = [sin_cos(t, timesteps) for t in range(timesteps)] 295 | xys = [np.stack([np.full((in_shape[0], in_shape[2], in_shape[3]), x), np.full((in_shape[0], in_shape[2], in_shape[3]), y)], axis = -1) for x,y in tpls] 296 | harmonics = np.stack(xys, axis = 1) 297 | harmonic_timeseries = np.concatenate([timeseries, harmonics], axis = -1) 298 | return harmonic_timeseries 299 | -------------------------------------------------------------------------------- /utils/calibration.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Mar 16 17:44:19 2020 4 | 5 | @author: MEvans 6 | """ 7 | 8 | import math 9 | import ee 10 | from stats import normalize 11 | 12 | def clamp_and_scale(img, bands, p, AOI): 13 | """ 14 | clip the upper range of an image based on percentile 15 | 16 | This function is similar to ee.Image().clip() and ee.Image().unitScale(), 17 | but operates on multiple bands with potentially different upper limits.
18 | 19 | Parameters: 20 | img (ee.Image): the image to modify 21 | bands (ee.List): 22 | p (int): upper percentile above which to truncate values 23 | AOI (ee.Geometry): area within which to calculate percentile 24 | 25 | Returns: 26 | ee.Image: rescaled image with band values [0, 1] 27 | """ 28 | #create a list of the 99th percentile value for all bands 29 | percentiles = img.select(bands).reduceRegion( 30 | reducer = ee.Reducer.percentile([99]).repeat(ee.List(bands).size()), 31 | geometry = AOI, 32 | scale = 100, 33 | maxPixels = 1e13, 34 | tileScale = 12 35 | ).get('p99') 36 | 37 | #turn list of 99th percentiles into constant image 38 | upperImg = ee.Image.constant(percentiles).rename(bands) 39 | 40 | #clip the upper range of extreme values where sensors get washed out 41 | normImage = img.where(img.gte(upperImg), upperImg) 42 | 43 | # rescale the truncated image to [0, 1] 44 | rescaled = normalize(normImage, upperImg, ee.Image.constant(0)) 45 | return ee.Image(rescaled) 46 | 47 | def scene_median(imgCol, bands, sceneID): 48 | """ 49 | Create median images for each unique scene in an image collection 50 | Parameters: 51 | imgCol (ee.ImageCollection): 52 | bands (list): image bands on which to calculate medians 53 | sceneID (str): metadata field storing unique scene ID values 54 | Returns: 55 | ee.ImageCollection: composed of median images per scene 56 | """ 57 | # first get list of all scene IDs 58 | scenes = ee.List(imgCol.aggregate_array(sceneID)).distinct() 59 | # define function to filter by scene id and take median 60 | 61 | medians = scenes.map(lambda str: imgCol.filter(ee.Filter.eq(sceneID, str)).median().set(sceneID, str)) 62 | return ee.ImageCollection(medians).select(bands) 63 | 64 | def get_overlap(imgCol1, imgCol2): 65 | """ 66 | Calculate the area of overlap between two image collections 67 | Parameters: 68 | imgCol1 (ee.ImageCollection): first image collection 69 | imgCol2 (ee.ImageCollection): second image collection 70 | Returns: 71 | ee.Geometry: area of overlap 72 | """ 73 | geom1 = imgCol1.geometry(5).dissolve() 74 | geom2 = imgCol2.geometry(5).dissolve() 75 | intersect = geom1.intersection(geom2, 5) 76 | return intersect 77 | 78 | def hist_to_FC(hist, band): 79 | """ 80 | convert a histogram of band values to a feature collection 81 | 82 | Args: 83 | hist (ee.Dictionary): output of histogram reducer on an image 84 | band (str): band name 85 | 86 | Return: 87 | ee.FeatureCollection: one feature for each histogram bin with 88 | """ 89 | # properties 'bucketMeans' and 'probability' (normalized cummulative probability). 90 | valsList = ee.List(ee.Dictionary(ee.Dictionary(hist).get(band)).get('bucketMeans')) 91 | freqsList = ee.List(ee.Dictionary(ee.Dictionary(hist).get(band)).get('histogram')) 92 | cdfArray = ee.Array(freqsList).accum(0) 93 | total = cdfArray.get([-1]) 94 | normalizedCdf = cdfArray.divide(total) 95 | 96 | # create 2D array with histogram bucket means and normalized cdf values 97 | array = ee.Array.cat([valsList, normalizedCdf], 1) 98 | 99 | # define function to create a feature colleciton with properties determined by list 100 | def fxn(ls): 101 | return ee.Feature(None, {'dn': ee.List(ls).get(0), 'probability': ee.List(ls).get(1)}) 102 | 103 | output = ee.FeatureCollection(array.toList().map(fxn)) 104 | return output 105 | 106 | def make_FC(image, AOI): 107 | """ 108 | create a feature colleciton from the histograms of an images bands 109 | 110 | Parameters: 111 | image (ee.Image): input image 112 | AOI (ee.Feaure): area within which to... 
113 | Returns: 114 | ee.List: list of feature collections returned by hist_to_FC 115 | """ 116 | # Histogram equalization start: 117 | bands = image.bandNames() 118 | histo = image.reduceRegion( 119 | reducer = ee.Reducer.histogram( 120 | maxBuckets = math.pow(2, 12) 121 | ), 122 | geometry = AOI, 123 | scale = 100, 124 | maxPixels = 1e13, 125 | tileScale = 12 126 | ) 127 | 128 | def fxn(band): 129 | return hist_to_FC(histo, band) 130 | 131 | # map hist -> FC conversion fxn across bands 132 | output = bands.map(fxn) 133 | 134 | return output 135 | 136 | def equalize(image1, image2, AOI): 137 | """ 138 | use histogram matching to calibrate two images 139 | 140 | Parameters: 141 | image1 (ee.Image): reference image 142 | image2 (ee.Image): image to be calibrated 143 | AOI (ee.Geometry): area of overlap between the two images 144 | 145 | Returns: 146 | ee.Image: image2 with bands calibrated to the histogram(s) of image1 bands 147 | """ 148 | bands = image1.bandNames() 149 | nBands = bands.size().subtract(1) 150 | 151 | # These are lists of feature collections 152 | fc1 = make_FC(image1, AOI) 153 | fc2 = make_FC(image2, AOI) 154 | 155 | def fxn(i): 156 | band = bands.get(i) 157 | classifier1 = ee.Classifier.randomForest(100)\ 158 | .setOutputMode('REGRESSION')\ 159 | .train( 160 | features = ee.FeatureCollection(ee.List(fc1).get(i)), 161 | classProperty = 'dn', 162 | inputProperties = ['probability'] 163 | ) 164 | 165 | classifier2 = ee.Classifier.randomForest(100)\ 166 | .setOutputMode('REGRESSION')\ 167 | .train( 168 | features = ee.FeatureCollection(ee.List(fc2).get(i)), 169 | classProperty = 'probability', 170 | inputProperties = ['dn'] 171 | ) 172 | 173 | # Do the shuffle: DN -> probability -> DN. Return the result. 174 | b = image2.select([band]).rename('dn'); 175 | # DN -> probability -> DN 176 | output = b.classify(classifier2, 'probability')\ 177 | .classify(classifier1, band) 178 | 179 | return output 180 | 181 | imgList = ee.List.sequence(0, nBands).map(fxn) 182 | return ee.ImageCollection(imgList).toBands().rename(bands) 183 | 184 | def equalize_collection(imgCol, bands, sceneID): 185 | """ 186 | histogram equalize images in a collection by unique orbit path 187 | 188 | Parameters: 189 | imgCol (ee.ImageCollection): collection storing images to equalize 190 | bands (list): list of band names to be calibrated 191 | sceneID (str): property by which images will be grouped 192 | 193 | Returns: 194 | ee.ImageCollection: median images per scene equalized to the westernmost path 195 | """ 196 | # first get list of all scene IDs 197 | scenes = ee.List(imgCol.aggregate_array(sceneID)).distinct() 198 | # create an image collection of scene medians 199 | medians = scene_median(imgCol, bands, sceneID) 200 | # define a function to return the centroid longitude of each scene 201 | def get_coord_min(str): 202 | centroids = imgCol.filter(ee.Filter.eq(sceneID, str)).geometry(1).centroid(1) 203 | longs = centroids.coordinates().get(0) 204 | return longs 205 | # create a list of centroid longitudes 206 | coords = scenes.map(get_coord_min) 207 | # sort the scenes by increasing longitude 208 | scenes = scenes.sort(coords) 209 | # define a function that will equalize the list of scenes in succession 210 | def iterate_equalize(scene, prev): 211 | # take the previous median image 212 | prev = ee.List(prev) 213 | img1 = ee.Image(prev.get(-1)) 214 | # take the next median image 215 | img2 = ee.Image(medians.filter(ee.Filter.eq(sceneID, scene)).first()) 216 | # filter image collection to the previous scene 
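        # Aside (added note): ee.List.iterate() passes each scene plus the running
        # accumulator (here, the list of already-equalized images) to this function,
        # so prev.get(-1) above is always the most recently equalized scene. A
        # minimal, self-contained analogue of the accumulator pattern:
        #   running = ee.List.sequence(1, 5).iterate(
        #       lambda x, prev: ee.Number(prev).add(x), 0)
        #   # ee.Number(running).getInfo() == 15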
217 | index = scenes.indexOf(scene).subtract(1) 218 | imgCol1 = imgCol.filter(ee.Filter.eq(sceneID, scenes.get(index))) 219 | #imgCol1 = imgCol.filter(ee.Filter.eq(sceneID, prev)) 220 | # filter image collection to the next scene 221 | imgCol2 = imgCol.filter(ee.Filter.eq(sceneID, scene)) 222 | overlap = get_overlap(imgCol1, imgCol2) 223 | # if there is overlap between collections, equalize (returns image) 224 | # otherwise return the current image 225 | equalized = ee.Algorithms.If(overlap.area(5).gt(0), equalize(img1, img2, overlap), img2) 226 | update = ee.List(prev).add(equalized) 227 | return update 228 | # create a list of successively equalized scenes 229 | # initial value for iterate is the first median scene 230 | first = ee.Image(medians.filter(ee.Filter.eq(sceneID, scenes.get(0))).first()) 231 | # take all but the first scene median and iteratively equalize 232 | output = scenes.slice(1).iterate(iterate_equalize, ee.List([first])) 233 | return ee.ImageCollection.fromImages(output) -------------------------------------------------------------------------------- /utils/ee_tools.py: -------------------------------------------------------------------------------- 1 | import ee 2 | 3 | # Initialize Earth Engine 4 | ee.Initialize() 5 | 6 | # Initialize Earth Engine 7 | JRC = ee.ImageCollection("JRC/GSW1_1/YearlyHistory") 8 | 9 | def norm_p(z): 10 | """ 11 | Caclulate (approx) the p-value for a standard normal distribution 12 | 13 | Parameters: 14 | z (ee.Image): image containing z-scores 15 | 16 | Returns: 17 | ee.Image: image containing p-values 18 | """ 19 | return ee.Image.constant(1).subtract(z.multiply(-1.65451).exp().add(1).pow(-1)) 20 | 21 | def chi_p(chi, df): 22 | """ Caclulate the CDF probability of a chi-square statistic 23 | Parameters: 24 | chi (ee.Image): single band image with observations from a chi-squared dist 25 | df (int): degrees of freedom 26 | Returns: 27 | ee.Image: single band image of probabilities 28 | """ 29 | cdf = ee.Image(chi.divide(2)).gammainc(ee.Number(df).divide(2)) 30 | return cdf.rename(['p']) 31 | 32 | def gamma_p(stat, df): 33 | shape = ee.Image(1) 34 | scale = ee.Image(df) 35 | denom = shape.gamma() 36 | num = shape.gammainc(stat.divide(scale)) 37 | return num.divide(denom).rename(['p']) 38 | 39 | def normalize(img, maxImg, minImg): 40 | """ 41 | Scale an image from 0 to 1 42 | 43 | Parameters: 44 | img (ee.Image): image to be rescaled 45 | maxImg (ee.Image): image storing the maximum value of the image 46 | minImg (ee.Image): image storing the minimum value of the image 47 | Returns: 48 | ee.Image: 49 | """ 50 | return img.subtract(minImg).divide(maxImg.subtract(minImg)) 51 | 52 | def standardize(img): 53 | """ 54 | Standardize an image to z-scores using mean and sd 55 | 56 | Parameters: 57 | img (ee.Image): image to be rescaled standardized 58 | 59 | Returns: 60 | ee.Image: image containing z-scores per band 61 | """ 62 | bands = img.bandNames() 63 | mean = img.reduceRegion( 64 | reducer= ee.Reducer.mean(), 65 | scale= 300).toImage() 66 | sd = img.reduceRegion( 67 | reducer= ee.Reducer.stdDev(), 68 | scale= 300 69 | ).toImage(bands) 70 | return img.subtract(mean).divide(sd) 71 | 72 | 73 | def ldaScore(img, inter, xbands, coefficients): 74 | """ 75 | Function converting multiband image into single band image of LDA scores 76 | 77 | Parameters: 78 | img (ee.Image): multiband image 79 | int (float): intercept parameter from LDA analysis 80 | xbands (ee.List): string list of n band names 81 | coefficients (ee.List): numeric list of length n 
containing LDA coefficients 82 | Returns: 83 | ee.Image: image with one band containing LDA scores based on provided coefficients 84 | """ 85 | bands = img.select(xbands) 86 | coeffs = ee.Dictionary.fromLists(xbands, coefficients).toImage(xbands) 87 | score = bands.multiply(coeffs).addBands(ee.Image(inter)).reduce(ee.Reducer.sum()) 88 | return score 89 | 90 | def sentinel2toa(img): 91 | """ 92 | Convert processed sentinel toa reflectance to raw values, and extract azimuth / zenith metadata 93 | 94 | Parameters: 95 | img (ee.Image): Sentinel-2 image to convert 96 | 97 | Returns: 98 | ee.Image: 99 | """ 100 | toa = img.select(['B1', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B8A', 'B9', 'B10', 'B11', 'B12']) \ 101 | .divide(10000)\ 102 | .set('solar_azimuth', img.get('MEAN_SOLAR_AZIMUTH_ANGLE')) \ 103 | .set('solar_zenith', img.get('MEAN_SOLAR_ZENITH_ANGLE')) \ 104 | .set('viewing_azimuth', img.get('MEAN_INCIDENCE_AZIMUTH_ANGLE_B8')) \ 105 | .set('viewing_zenith', img.get('MEAN_INCIDENCE_ZENITH_ANGLE_B8')) \ 106 | .set('CLOUDY_PIXEL_PERCENTAGE', img.get('CLOUDY_PIXEL_PERCENTAGE')) \ 107 | #.set('system:time_start', img.get('system:time_start')); 108 | return img.select(['QA60']).addBands(toa); 109 | 110 | def rescale(img, exp, thresholds): 111 | #print('rescale:', img, exp, thresholds) 112 | #return img.subtract(thresholds[0]).divide(thresholds[1]-thresholds[0]) 113 | return img.expression(exp, {'img': img}).subtract(thresholds[0]).divide(thresholds[1] - thresholds[0]) 114 | 115 | def waterScore(img): 116 | """ 117 | Calculate a water likelihood score [0, 1] 118 | 119 | Parameters: 120 | img (ee.Image): Sentinel-2 image 121 | 122 | Returns: 123 | ee.Image: image with single ['waterscore'] band 124 | """ 125 | img = sentinel2toa(img) 126 | # Compute several indicators of water and take the minimum of them. 127 | score = ee.Image(1.0) 128 | 129 | # Set up some params 130 | darkBands = ['B3', 'B4', 'B8', 'B11', 'B12'] 131 | brightBand = 'B2' 132 | shadowSumBands = ['B8', 'B11', 'B12'] 133 | # Water tends to be dark 134 | sum = img.select(shadowSumBands).reduce(ee.Reducer.sum()) 135 | #sum = rescale(sum, [0.35, 0.2]).clamp(0, 1) 136 | sum = rescale(sum, 'img', [0.35, 0.2]).clamp(0, 1) 137 | score = score.min(sum) 138 | 139 | # It also tends to be relatively bright in the blue band 140 | mean = img.select(darkBands).reduce(ee.Reducer.mean()) 141 | std = img.select(darkBands).reduce(ee.Reducer.stdDev()) 142 | z = (img.select([brightBand]).subtract(std)).divide(mean) 143 | z = rescale(z, 'img', [0, 1]).clamp(0, 1) 144 | #z = rescale(z, [0,1]).clamp(0,1) 145 | score = score.min(z) 146 | 147 | # Water is at or above freezing 148 | # score = score.min(rescale(img, 'img.temp', [273, 275])); 149 | 150 | # Water is nigh in ndsi(aka mndwi) 151 | ndsi = img.normalizedDifference(['B3', 'B11']) 152 | ndsi = rescale(ndsi, 'img', [0.3, 0.8]) 153 | #ndsi = rescale(ndsi, [0.3, 0.8]) 154 | 155 | score = score.min(ndsi) 156 | 157 | return score.clamp(0, 1).rename(['waterScore']) 158 | 159 | def basicQA(img): 160 | """ 161 | Mask clouds in a Sentinel-2 image using builg in quality assurance band 162 | Parameters: 163 | img (ee.Image): Sentinel-2 image with QA band 164 | Returns: 165 | ee.Image: original image masked for clouds and cirrus 166 | """ 167 | #print('basicQA:', img) 168 | qa = img.select('QA60').int16() 169 | # print('qa:', type(qa)) 170 | # qa = img.select(['QA60']).int16() 171 | #print('qa:', qa.getInfo()) 172 | # Bits 10 and 11 are clouds and cirrus, respectively. 
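    # Added aside: the constants below are simply bit flags, 1 << 10 == 1024 and
    # 1 << 11 == 2048. bitwiseAnd() isolates each flag, and both must equal zero
    # for a clear pixel. Plain-Python check of the same logic (illustrative only):
    #   qa_clear, qa_cloudy = 0b0000000000000000, 0b0000010000000000
    #   assert qa_clear & (1 << 10) == 0
    #   assert qa_cloudy & (1 << 10) != 0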
173 | cloudBitMask = 1024 # math.pow(2, 10) 174 | cirrusBitMask = 2048 #math.pow(2, 11) 175 | # Both flags should be set to zero, indicating clear conditions. 176 | #mask = qa.bitwiseAnd(cloudBitMask).eq(0).And(qa.bitwiseAnd(cirrusBitMask).eq(0)) 177 | mask = qa.bitwiseAnd(cloudBitMask).eq(0).And(qa.bitwiseAnd(cirrusBitMask).eq(0)) 178 | dated = img.updateMask(mask) 179 | #dated = img.addBands(img.metadata('system:time_start', 'date')).updateMask(mask) 180 | return dated 181 | 182 | # Function to cloud mask from the Fmask band of Landsat 8 SR data. 183 | def maskL8sr(image): 184 | # Bits 3 and 5 are cloud shadow and cloud, respectively. 185 | cloudShadowBitMask = ee.Number(2).pow(3).int() 186 | cloudsBitMask = ee.Number(2).pow(5).int() 187 | 188 | # Get the pixel QA band. 189 | qa = image.select('pixel_qa') 190 | 191 | # Both flags should be set to zero, indicating clear conditions. 192 | mask = qa.bitwiseAnd(cloudShadowBitMask).eq(0).And(qa.bitwiseAnd(cloudsBitMask).eq(0)) 193 | 194 | # Return the masked image, scaled to [0, 1]. 195 | return image.updateMask(mask) 196 | 197 | 198 | def cloudBands(img): 199 | ndmi = img.normalizedDifference(['B8', 'B11']).rename(['ndmi']) 200 | ndsi = img.normalizedDifference(['B3', 'B11']).rename(['ndsi']) 201 | cirrus = img.select(['B1', 'B10']).reduce(ee.Reducer.sum()).rename(['cirrus']) 202 | vis = img.select(['B4', 'B3', 'B2']).reduce(ee.Reducer.sum()).rename(['vis']) 203 | return img.addBands(ndmi).addBands(ndsi).addBands(cirrus).addBands(vis) 204 | 205 | 206 | def darkC (img, R, G, B): 207 | R = img.select(R) 208 | G = img.select(G) 209 | B = img.select(B) 210 | maxRB = R.max(B) 211 | maxGB = G.max(B) 212 | maxRG = R.max(G) 213 | C1 = G.divide(maxRB).atan().rename(['C1']) 214 | C2 = R.divide(maxGB).atan().rename(['C2']) 215 | C3 = B.divide(maxRG).atan().rename(['C3']) 216 | return img.addBands(C1).addBands(C2).addBands(C3) 217 | 218 | def sentinelCloudScore(img): 219 | """ 220 | Compute a custom cloud likelihood score for Sentinel-2 imagery 221 | Parameters: 222 | img (ee.Image): Sentinel-2 image 223 | Returns: 224 | ee.Image: original image with added ['cloudScore'] band 225 | """ 226 | im = sentinel2toa(img) 227 | # Compute several indicators of cloudyness and take the minimum of them. 228 | score = ee.Image(1) 229 | 230 | # Clouds are reasonably bright in the blue and cirrus bands. 231 | #score = score.min(rescale(im.select(['B2']), [0.1, 0.5])) 232 | score = score.min(rescale(im, 'img.B2', [0.1, 0.5])) 233 | #score = score.min(rescale(im.select(['B1']), [0.1, 0.3])) 234 | score = score.min(rescale(im, 'img.B1', [0.1, 0.3])) 235 | #score = score.min(rescale(im.select(['B1']).add(im.select(['B10'])), [0.15, 0.2])) 236 | score = score.min(rescale(im, 'img.B1 + img.B10', [0.15, 0.2])) 237 | 238 | # Clouds are reasonably bright in all visible bands. 239 | #score = score.min(rescale(im.select('B4').add(im.select('B3')).add(im.select('B2')), [0.2, 0.8])) 240 | score = score.min(rescale(im, 'img.B4 + img.B3 + img.B2', [0.2, 0.8])) 241 | 242 | # Clouds are moist 243 | ndmi = im.normalizedDifference(['B8','B11']) 244 | #score=score.min(rescale(ndmi, [-0.1, 0.1])) 245 | score=score.min(rescale(ndmi, 'img', [-0.1, 0.1])) 246 | 247 | # However, clouds are not snow. 
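    # Added note: the descending thresholds below are intentional. rescale(x, 'img',
    # [0.8, 0.6]) computes (x - 0.8) / (0.6 - 0.8), so a snow-like NDSI of 0.8 maps
    # to 0 while 0.6 maps to 1, letting snowy pixels pull the cloud score down.
    # Quick arithmetic check: (0.7 - 0.8) / (0.6 - 0.8) == 0.5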
248 | ndsi = im.normalizedDifference(['B3', 'B11']) 249 | #score=score.min(rescale(ndsi, [0.8, 0.6])) 250 | score=score.min(rescale(ndsi, 'img', [0.8, 0.6])) 251 | 252 | score = score.multiply(100).byte() 253 | #print('score:', type(score)) 254 | 255 | return img.addBands(score.rename(['cloudScore'])) 256 | 257 | def mask(img): 258 | date = img.date() 259 | year = date.get('year') 260 | month = date.get('month') 261 | cdi = ee.Algorithms.Sentinel2.CDI(img) 262 | scored = basicQA(img) 263 | clouds = sentinelCloudScore(scored).lte(15).Or(cdi.gte(-0.2)) 264 | water = waterScore(img).select('waterScore').lte(0.25) 265 | jrc = ee.Image(JRC.filterMetadata('month', 'equals', month).filterMetadata('year', 'equals', year).first()) 266 | waterMask = jrc.focal_max(1, 'square', 'pixels').neq(2).And(water) 267 | shadowMask = img.select('B11').gt(900) 268 | return scored.updateMask(clouds.And(shadowMask).And(waterMask)) 269 | 270 | def maskSR(img): 271 | """ 272 | Apply built in masks to Sentinel-2 surface reflectance imagery 273 | Parameters: 274 | img (ee.Image): Sentinel-2 level 2A surface reflectange image 275 | Returns: 276 | ee.Image: masked image 277 | """ 278 | # jrc = ee.Image('JRC/GSW1_1/YearlyHistory/2018') 279 | scored = basicQA(img); 280 | maskBand = img.select('SCL') 281 | cloudMask = maskBand.neq(8).And(maskBand.neq(9)) 282 | # waterMask = maskBand.neq(6).where(jrc.gte(2), 0) 283 | cirrusMask = maskBand.neq(10) 284 | snowMask = maskBand.neq(11) 285 | darkMask = maskBand.neq(2).And(maskBand.neq(3)) 286 | return scored.updateMask(cloudMask.And(cirrusMask).And(snowMask).And(darkMask)) 287 | 288 | def maskTOA(img): 289 | """ 290 | Mask Sentinel-2 1C top of atmosphere imagery for clouds, water, shadow 291 | Parameters: 292 | img (ee.Image): Sentinel-2 level 1C image 293 | Returns: 294 | ee.Image: masked image 295 | """ 296 | # date = img.date() 297 | # year = date.get('year') 298 | #month = date.get('month') 299 | #cdi = ee.Algorithms.Sentinel2.CDI(img) 300 | scored = basicQA(img) 301 | cloudMask = sentinelCloudScore(scored).select('cloudScore').lte(15)#.Or(cdi.gte(-0.2)) 302 | # water = waterScore(img).select('waterScore').lte(0.25) 303 | # jrc = ee.Image(JRC.filterMetadata('year', 'equals', year).first()) 304 | # watermask = water.where(jrc.gte(2), 0) 305 | # shadowMask = img.select('B11').gt(900) 306 | return scored.updateMask(cloudMask)#.And(shadowMask))#.And(watermask)) 307 | 308 | -------------------------------------------------------------------------------- /utils/pc_tools.py: -------------------------------------------------------------------------------- 1 | 2 | import json 3 | from pathlib import Path 4 | from importlib import reload 5 | import numpy as np 6 | import os 7 | import sys 8 | from os.path import join 9 | from glob import glob 10 | import io 11 | from datetime import datetime 12 | import xml 13 | 14 | from osgeo import gdal 15 | import xarray as xr 16 | import rasterio as rio 17 | from rasterio.vrt import WarpedVRT 18 | from rioxarray.merge import merge_arrays 19 | import rioxarray 20 | from rioxarray.merge import merge_arrays 21 | from pyproj import CRS 22 | 23 | import planetary_computer 24 | from dask_gateway import GatewayCluster 25 | from dask.distributed import wait, Client 26 | import pystac_client 27 | import pystac 28 | import stackstac 29 | import stac_vrt 30 | 31 | FILE = Path(__file__).resolve() # full path to the current file, including extension 32 | print('filepath', FILE) 33 | ROOT = FILE.parents[0] # list of upstream directories containing file 34 | 
print('root', ROOT) 35 | REL = Path(os.path.relpath(ROOT, Path.cwd())) 36 | print('rel', REL) 37 | if str(ROOT) not in sys.path: 38 | sys.path.append(str(ROOT)) 39 | if str(REL) not in sys.path: 40 | sys.path.append(str(REL)) # add REL to PATH 41 | 42 | from azure.storage.blob import ContainerClient, BlobClient 43 | 44 | def recursive_api_try(search): 45 | try: 46 | signed = planetary_computer.sign(search.get_all_items()) 47 | # collection = search.item_collection() 48 | # print(len(collection), 'assets') 49 | # signed = [planetary_computer.sign(item).to_dict() for item in collection] 50 | except pystac_client.exceptions.APIError as error: 51 | print('APIError, trying again') 52 | signed = recursive_api_try(search) 53 | return signed 54 | 55 | def resign_vrt(filename, element_tag): 56 | """Update the authentication token on previously created VRT items 57 | Params 58 | --- 59 | filename: str 60 | element_tag: str 61 | xml tag containing asset url to be signed 62 | """ 63 | tree = xml.etree.ElementTree.parse(filename) 64 | root = tree._root 65 | p = Path(filename) 66 | sub_vrt_list = [] 67 | for item in root.iter(element_tag): 68 | text = item.text 69 | # if item.attrib['relativeToVRT'] == '0': 70 | if text.startswith('http'): 71 | newtext = planetary_computer.sign(text.split('?')[0]) 72 | item.text = newtext 73 | elif '.vrt' in text: 74 | sub_vrt_list.append(text) 75 | newtext = text[:-4]+'_resigned.vrt' 76 | item.text = newtext 77 | for file in sub_vrt_list: 78 | etag = 'SourceDataset' if 'warped' in file else element_tag 79 | resign_vrt(file, etag) 80 | tree.write(str(p.parent)+'/'+str(p.stem)+'_resigned.vrt') 81 | 82 | def export_blob(data: np.ndarray, container_client: ContainerClient, blobUrl: str) -> None: 83 | with io.BytesIO() as buffer: 84 | np.save(buffer, data) 85 | buffer.seek(0) 86 | blob_client = container_client.get_blob_client(blobUrl) 87 | blob_client.upload_blob(buffer, overwrite=True) 88 | 89 | def normalize_dataArray(da: xr.DataArray, dim: str) -> xr.DataArray: 90 | """Normalize (mean = 0, sd = 1) values in a xarray DataArray along given axis 91 | 92 | Parameters 93 | --- 94 | da: xarray.DataArray 95 | array to be normalized 96 | dim: str 97 | name of dimension along which to calculate mean and standard deviation (e.g. 'band') 98 | 99 | Return 100 | --- 101 | xarray.DataArray: input array with values scaled to mean = 0 and sd = 1 102 | """ 103 | mean = da.mean(dim = dim, skipna = True) 104 | sd = da.std(dim = dim, skipna = True) 105 | normalized = (da - mean)/(sd+0.000001) 106 | return normalized 107 | 108 | def trim_dataArray(da: xr.DataArray, size: int) -> xr.DataArray: 109 | """Trim the remainder from x and y dimensions of a DataArray 110 | 111 | Parameters 112 | --- 113 | da: xarray:DataArray 114 | input array to be trimmed 115 | size: int 116 | size of chunks in x and y dimension. 
remaining array x&y size will be evenly divisible by this value 117 | 118 | Return: 119 | xarray:DataArray: resized input array with x & y dimensions evenly divisible by 'size' 120 | """ 121 | slices = {} 122 | for coord in ["y", "x"]: 123 | remainder = len(da.coords[coord]) % size 124 | slice_ = slice(-remainder) if remainder else slice(None) 125 | slices[coord] = slice_ 126 | 127 | trimmed = da.isel(**slices) 128 | return trimmed 129 | 130 | def get_naip_stac(aoi, dates): 131 | 132 | catalog = pystac_client.Client.open("https://planetarycomputer.microsoft.com/api/stac/v1") 133 | collections = ['naip'] 134 | 135 | search = catalog.search( 136 | intersects = aoi, 137 | datetime = dates, 138 | collections = collections, 139 | limit = 500 140 | ) 141 | 142 | items = planetary_computer.sign(search.item_collection_as_dict()) 143 | # items is a pystac ItemCollection 144 | # items2 = items.to_dict() 145 | features = items['features'] 146 | dates = [x['properties']['datetime'] for x in features] 147 | years = [date[0:4] for date in dates] 148 | years.sort() 149 | filtered = [x for x in features if x['properties']['datetime'][0:4] == years[-1]] 150 | urls = [item['assets']['image']['href'] for item in filtered] 151 | # organize all naip images overlapping box into a vrt stac 152 | crs_list = np.array([item['properties']['proj:epsg'] for item in filtered]) 153 | crss = np.unique(crs_list) 154 | crs_counts = [len(crs_list[crs_list == crs]) for crs in crss] 155 | print('naip crss', crss) 156 | if len(crss) > 1: 157 | # rioxrs = [] 158 | minority_idx = np.argmin(crs_counts) 159 | majority_idx = np.argmax(crs_counts) 160 | majority_urls = [url for i, url in enumerate(urls) if crs_list[i] == crss[majority_idx]] 161 | minority_urls = [url for i, url in enumerate(urls) if crs_list[i] == crss[minority_idx]] 162 | print('minority urls', minority_urls) 163 | minority_vrt = gdal.BuildVRT("./minority.vrt", minority_urls) 164 | majority_vrt = gdal.BuildVRT("./majority.vrt", majority_urls) 165 | warped_vrt = gdal.Warp("./warped.vrt", minority_vrt, format = 'vrt', dstSRS = f'EPSG:{crss[majority_idx]}') 166 | naipVRT = gdal.BuildVRT('./naiptmp.vrt', [warped_vrt, majority_vrt]) 167 | # naipVRT = None 168 | # for i, url in enumerate(urls): 169 | # rioxr = rioxarray.open_rasterio(url) 170 | # if crs_list[i] == crss[minority_idx]: 171 | # reprojected = rioxr.rio.reproject(f'EPSG:{crss[majority_idx]}') 172 | # rioxrs.append(reprojected) 173 | # else: 174 | # rioxrs.append(rioxr) 175 | # merged = merge_arrays(rioxrs) 176 | # return merged 177 | else: 178 | # rioxrs = [rioxarray.open_rasterio(url, lock = False) for url in urls] 179 | # merged = merge_arrays(rioxrs) 180 | # vrt = stac_vrt.build_vrt(filtered, block_width=512, block_height=512, data_type="Byte") 181 | # naipImg = rioxarray.open_rasterio(vrt, lock = False) 182 | naipVRT = gdal.BuildVRT('./naiptmp.vrt', urls) 183 | naipVRT = None 184 | naipImg = rioxarray.open_rasterio('./naiptmp.vrt', lock = False) 185 | return naipImg 186 | 187 | def get_dem_stac(aoi, dates, crs = None, resolution = None): 188 | catalog = pystac_client.Client.open("https://planetarycomputer.microsoft.com/api/stac/v1") 189 | search = catalog.search( 190 | intersects = aoi, 191 | collections = ["3dep-seamless"] 192 | ) 193 | 194 | # items is a pystac ItemCollection 195 | items = list(planetary_computer.sign(search.item_collection())) 196 | dems = [item for item in items if item.properties['gsd'] == 10] # we only want 10 m data 197 | return dems 198 | # # hagUrl = 
hag[0]['assets']['data']['href'] 199 | # demProperties = dems[0].properties 200 | # if crs: 201 | # demCrs = crs 202 | # else: 203 | # demCrs = demProperties['proj:epsg'] 204 | # # demTransform = demProperties['proj:transform'] 205 | # # if resolution: 206 | # # demRes = resolution 207 | # # else: 208 | # # demRes = demProperties['gsd'] 209 | 210 | # demStac = stackstac.stack( 211 | # dems, 212 | # epsg = demCrs, 213 | # resolution = 10) 214 | # # sortby_date = False, 215 | # # assets = ['data']) 216 | # print('3dep transform', demStac.rio.transform()) 217 | # demMedian = demStac.median(dim = 'time') 218 | # projected = demMedian.rio.set_crs(demCrs) 219 | # # reprojected = projected.rio.reproject(hagCrs) 220 | 221 | # return projected 222 | 223 | def get_hag_stac(aoi, dates, crs = None, resolution = None): 224 | catalog = pystac_client.Client.open("https://planetarycomputer.microsoft.com/api/stac/v1") 225 | search = catalog.search( 226 | intersects = aoi, 227 | datetime = dates, 228 | collections = ['3dep-lidar-hag'] 229 | ) 230 | 231 | items = recursive_api_try(search) 232 | # items is a pystac ItemCollection 233 | items2 = items.to_dict() 234 | hag = items2['features'] 235 | 236 | # hagUrl = hag[0]['assets']['data']['href'] 237 | hagProperties = hag[0]['properties'] 238 | hagCrs = hagProperties['proj:projjson']['components'][0]['id']['code'] 239 | hagTransform = hagProperties['proj:transform'] 240 | if resolution: 241 | hagRes = resolution 242 | else: 243 | hagRes = hagTransform[0] 244 | 245 | # hagSide = 360//hagRes 246 | # hagZoom = round(600/hagSide, 4) 247 | 248 | # hagCrs = [asset['properties']['proj:projjson']['components'][0]['id']['code'] for asset in hag] 249 | # print('hag CRS', hagCrs[0]) 250 | hagStac = stackstac.stack( 251 | hag, 252 | epsg = hagCrs, 253 | resolution = hagRes, 254 | sortby_date = False, 255 | assets = ['data']) 256 | 257 | hagMedian = hagStac.median(dim = 'time') 258 | projected = hagMedian.rio.set_crs(hagCrs) 259 | # reprojected = projected.rio.reproject(hagCrs) 260 | 261 | return projected 262 | 263 | def naip_mosaic(naips: list, crs: int): 264 | """ mosaic a list of naip stac items into a single xarray DataArray 265 | Parameters 266 | -------- 267 | naips: list: 268 | list of naip image items in stac format 269 | crs: int 270 | epsg code specifying the common crs to project naip images 271 | Return 272 | --- 273 | xr.DataArray: single array of mosaicd naip images 274 | """ 275 | data = [item for item in naips if item['properties']['proj:epsg'] == crs] 276 | crs = CRS.from_user_input(26918) 277 | naipStac = stac_vrt.build_vrt( 278 | data, block_width=512, block_height=512, data_type="Byte", crs = crs) 279 | naipImage = rioxarray.open_rasterio(naipStac, chunks = (4, 8192, 8192), lock = False) 280 | # reprojected = naipImage.rio.reproject('EPSG:4326') 281 | return(naipImage) 282 | 283 | def harmonize_to_old(data): 284 | """ 285 | Harmonize new Sentinel-2 data to the old baseline. 286 | 287 | Parameters 288 | ---------- 289 | data: xarray.DataArray 290 | A DataArray with four dimensions: time, band, y, x 291 | 292 | Returns 293 | ------- 294 | harmonized: xarray.DataArray 295 | A DataArray with all values harmonized to the old 296 | processing baseline. 
297 | """ 298 | cutoff = datetime(2022, 1, 25) 299 | offset = 1000 300 | bands = [ 301 | "B01", 302 | "B02", 303 | "B03", 304 | "B04", 305 | "B05", 306 | "B06", 307 | "B07", 308 | "B08", 309 | "B8A", 310 | "B09", 311 | "B10", 312 | "B11", 313 | "B12", 314 | ] 315 | 316 | old = data.sel(time=slice(cutoff)) 317 | 318 | to_process = list(set(bands) & set(data.band.data.tolist())) 319 | new = data.sel(time=slice(cutoff, None)).drop_sel(band=to_process) 320 | 321 | new_harmonized = data.sel(time=slice(cutoff, None), band=to_process).clip(offset) 322 | new_harmonized -= offset 323 | 324 | new = xr.concat([new, new_harmonized], "band").sel(band=data.band.data.tolist()) 325 | return xr.concat([old, new], dim="time") 326 | 327 | def get_s2_stac(dates, aoi, cloud_thresh = 10, bands = ["B02", "B03", "B04", "B08"], epsg = None): 328 | """from a pystac client return a stac of s2 imagery 329 | 330 | Parameters 331 | ---- 332 | dates: str 333 | start/end dates 334 | aoi: shapely.geometry.Polygon 335 | polygon defining area of search 336 | cloud_thresh: int 337 | maximum cloudy pixel percentage of s2 images to return 338 | bands: list 339 | asset (band) names to return and stack 340 | epsg: int 341 | epsg coordinate system to reproject s2 data to 342 | 343 | Return 344 | --- 345 | stackstac.stac() 346 | """ 347 | # connect to the planetary computer catalog 348 | catalog = pystac_client.Client.open( 349 | "https://planetarycomputer.microsoft.com/api/stac/v1", 350 | modifier = planetary_computer.sign_inplace) 351 | 352 | search = catalog.search( 353 | collections = ['sentinel-2-l2a'], 354 | datetime = dates, 355 | intersects = aoi, 356 | query={"eo:cloud_cover": {"lt": cloud_thresh}} 357 | ) 358 | 359 | s2items = [item.to_dict() for item in list(search.get_items())] 360 | if len(s2items) > 0: 361 | s2 = s2items[0] 362 | if epsg: 363 | s2epsg = epsg 364 | else: 365 | s2epsg = s2['properties']['proj:epsg'] 366 | 367 | s2Stac = ( 368 | stackstac.stack( 369 | s2items, 370 | epsg = s2epsg, 371 | assets=bands, # red, green, blue, nir 372 | # chunksize=4096, 373 | resolution=10, 374 | ) 375 | .where(lambda x: x > 0, other=np.nan) # sentinel-2 uses 0 as nodata 376 | ) 377 | 378 | harmonized = harmonize_to_old(s2Stac) 379 | 380 | s2crs = s2Stac.attrs['crs'] 381 | s2projected = harmonized.rio.set_crs(s2crs) 382 | else: 383 | # clipped = s2projected.rio.clip(geometries = [aoi], crs = epsg) 384 | harmonized = None 385 | return harmonized 386 | 387 | def get_s1_stac(dates, aoi, epsg = None, bands = ["vv", "vh"]): 388 | """from a pystac client return a stac of s2 imagery 389 | 390 | Parameters 391 | ---- 392 | client: pystac_client.Client() 393 | pystac catalog from which to retrieve assets 394 | dates: str 395 | start/end dates 396 | bbox: tpl 397 | [xmin, ymin, xmax, ymax] 398 | 399 | Return 400 | --- 401 | stackstac.stac() 402 | """ 403 | # connect to the planetary computer catalog 404 | catalog = pystac_client.Client.open( 405 | "https://planetarycomputer.microsoft.com/api/stac/v1", 406 | modifier = planetary_computer.sign_inplace) 407 | 408 | search = catalog.search( 409 | datetime = dates, 410 | intersects = aoi, 411 | collections=["sentinel-1-rtc"], 412 | query={"sar:polarizations": {"eq": ['VV', 'VH']}, 413 | 'sar:instrument_mode': {"eq": 'IW'}, 414 | 'sat:orbit_state': {"eq": 'ascending'} 415 | } 416 | ) 417 | 418 | s1items = search.item_collection() 419 | if not epsg: 420 | s1 = s1items[0] 421 | epsg = s1.properties['proj:epsg'] 422 | s1Stac = stackstac.stack( 423 | s1items, 424 | epsg = epsg, 425 | 
assets=bands, 426 | resolution=10, 427 | gdal_env=stackstac.DEFAULT_GDAL_ENV.updated( 428 | always=dict(GDAL_HTTP_MAX_RETRY=5, GDAL_HTTP_RETRY_DELAY=1) 429 | ) 430 | ) 431 | 432 | # # get spatial reference info 433 | # s1crs = s1Stac.attrs['crs'] 434 | # s1transform = s1Stac.attrs['transform'] 435 | # s1res = s1transform[0] 436 | 437 | # s1projected = s1Stac.rio.set_crs(s1crs) 438 | # clipped = s1projected.rio.clip(geometries = [aoi], crs = 4326) 439 | return s1Stac 440 | 441 | def get_s1_stac(dates, aoi, epsg = None, bands = ["vv", "vh"]): 442 | """from a pystac client return a stac of s2 imagery 443 | 444 | Parameters 445 | ---- 446 | client: pystac_client.Client() 447 | pystac catalog from which to retrieve assets 448 | dates: str 449 | start/end dates 450 | bbox: tpl 451 | [xmin, ymin, xmax, ymax] 452 | 453 | Return 454 | --- 455 | stackstac.stac() 456 | """ 457 | # connect to the planetary computer catalog 458 | catalog = pystac_client.Client.open( 459 | "https://planetarycomputer.microsoft.com/api/stac/v1", 460 | modifier = planetary_computer.sign_inplace) 461 | 462 | search = catalog.search( 463 | datetime = dates, 464 | intersects = aoi, 465 | collections=["sentinel-1-rtc"], 466 | query={"sar:polarizations": {"eq": ['VV', 'VH']}, 467 | 'sar:instrument_mode': {"eq": 'IW'}, 468 | 'sat:orbit_state': {"eq": 'ascending'} 469 | } 470 | ) 471 | 472 | s1items = search.item_collection() 473 | if not epsg: 474 | s1 = s1items[0] 475 | epsg = s1.properties['proj:epsg'] 476 | s1Stac = stackstac.stack( 477 | s1items, 478 | epsg = epsg, 479 | assets=bands, 480 | resolution=10, 481 | gdal_env=stackstac.DEFAULT_GDAL_ENV.updated( 482 | always=dict(GDAL_HTTP_MAX_RETRY=5, GDAL_HTTP_RETRY_DELAY=1) 483 | ) 484 | ) 485 | 486 | # # get spatial reference info 487 | # s1crs = s1Stac.attrs['crs'] 488 | # s1transform = s1Stac.attrs['transform'] 489 | # s1res = s1transform[0] 490 | 491 | # s1projected = s1Stac.rio.set_crs(s1crs) 492 | # clipped = s1projected.rio.clip(geometries = [aoi], crs = 4326) 493 | return s1Stac 494 | 495 | def get_ssurgo_stac(aoi, epsg)-> np.ndarray: 496 | """Sample ssurgo data in raster format 497 | 498 | Parameters 499 | --- 500 | aoi: shapely.geometry.Polygon 501 | polygon coordinates defining search aoi 502 | epsg: int 503 | cooridnate reference system epsg code to reproject ssurgo data to 504 | 505 | Returns 506 | --- 507 | np.ndarray: 3-dimensional raster (window_size, window_size, 4) containing ssurgo data 508 | """ 509 | # connect to the PC STAC catalog 510 | catalog = pystac_client.Client.open("https://planetarycomputer.microsoft.com/api/stac/v1") 511 | 512 | # get the gnatsco raster, which has 'mukey' values per pixel 513 | search = catalog.search( 514 | collections=["gnatsgo-rasters"], 515 | intersects=aoi 516 | ) 517 | surgoitems = planetary_computer.sign(search.get_all_items()) 518 | return surgoitems 519 | # surgoitems = [planetary_computer.sign(item).to_dict() for item in list(search.items())] 520 | # surgo = surgoitems[0] 521 | 522 | # surgowkt = surgo['properties']['proj:wkt2'] 523 | # if epsg: 524 | # surgoEPSG = epsg #surgoCrs.to_epsg() 525 | # else: 526 | # surgoEPSG = CRS.from_wkt(surgowkt).to_epsg() 527 | 528 | # print(surgoEPSG) 529 | # # surgoepsg = surgo['properties']['proj:epsg'] 530 | # surgoStac = stackstac.stack( 531 | # surgoitems, 532 | # # epsg = surgoEPSG, 533 | # epsg = surgoEPSG, 534 | # assets=['mukey']) 535 | 536 | # surgoTransform = surgoStac.attrs['transform'] 537 | # # surgores = 10 #surgoTransform[0] TODO: COnfirm ssurgo is always 10 m resolution 
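    # Added aside: join_ssurgo() below joins the gNATSGO 'mukey' raster to tabular
    # soil attributes via np.unique(..., return_inverse=True). A toy illustration
    # (names and values are made up for the example):
    #   import numpy as np, pandas as pd
    #   mukeys = np.array([[3, 3], [7, 3]])                 # 2x2 raster of map-unit keys
    #   table = pd.DataFrame({'mukey': [3, 7], 'hydclprs': [10., 90.]})
    #   uniq, inv = np.unique(mukeys, return_inverse=True)  # uniq=[3, 7], inv=[0, 0, 1, 0]
    #   joined = table.set_index('mukey').reindex(uniq).to_numpy()[inv].reshape(2, 2, 1)
    #   # joined[:, :, 0] -> [[10., 10.], [90., 10.]]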
538 | # # print('resolution', surgores) 539 | 540 | # temporal = surgoStac.median(dim = 'time') 541 | # return temporal, surgoTransform, surgoEPSG 542 | 543 | def join_ssurgo(ssurgo_table, ssurgo_raster:np.ndarray): 544 | C,H,W = ssurgo_raster.shape 545 | # get the unique values and their indices from the raster so we can join to table data 546 | unique_mukeys, inverse = np.unique(ssurgo_raster, return_inverse=True) 547 | # print('\t\tJoining SSURGO Arrays. Unique mukeys', unique_mukeys) 548 | rearranged = ssurgo_table[['mukey', 'hydclprs', 'drclassdcd', 'flodfreqdcd', 'wtdepannmin']].groupby('mukey').first().reindex(unique_mukeys, fill_value=np.nan).astype(np.float64) 549 | rearranged.loc[rearranged['wtdepannmin'] > 200.0, 'wtdepannmin'] = 200.0 # anything above 200 should be clipped to 200 550 | rearranged['wtdepannmin'] = rearranged['wtdepannmin'].fillna(200.0) # missing values are above 200 cm deep 551 | rearranged['wtdepannmin'] = rearranged['wtdepannmin']/200.0 # 200 cm is the max measured value 552 | 553 | rearranged['flodfreqdcd'] = rearranged['flodfreqdcd'].fillna(0.0) # missing values mean no flooding 554 | 555 | rearranged['drclassdcd'] = rearranged['drclassdcd'].fillna(0.0) # missing values mean no soil e.g. excessively drained 556 | 557 | rearranged['hydclprs'] = rearranged['hydclprs'].fillna(0.0) # missing values mean no soil e.g. not hydric 558 | rearranged['hydclprs'] = rearranged['hydclprs']/100.0 # 100 percent hydric is max 559 | # join tabluar data to ssurgo raster based on mukey 560 | ssurgo_hwc = rearranged.to_numpy()[inverse].reshape((H, W, 4)) # HWC 561 | return ssurgo_hwc 562 | 563 | def get_pc_imagery(aoi, dates, crs): 564 | """Get S2 imagery from Planetary Computer. REQUIRES a valid API token be added to the os environment 565 | Args: 566 | aoi: POLYGON geometry json 567 | dates (tpl): four YYYY-MM-DD date strings defining before and after 568 | crs (int): 4-digit epsg code representing coordinate reference system 569 | """ 570 | # Creates the Dask Scheduler. Might take a minute. 
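    # Added note: the gateway addresses below are specific to the Planetary Computer
    # Hub and generally will not resolve elsewhere. For local experimentation, a
    # plain Dask client is a reasonable stand-in (illustrative, not original code):
    #   from dask.distributed import Client
    #   client = Client(processes = False)   # local threads, no gateway required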
571 | cluster = GatewayCluster( 572 | address = "https://pccompute.westeurope.cloudapp.azure.com/compute/services/dask-gateway", 573 | proxy_address = "gateway://pccompute-dask.westeurope.cloudapp.azure.com:80", 574 | auth = 'jupyterhub', 575 | worker_cores = 4 576 | ) 577 | 578 | client = cluster.get_client() 579 | 580 | # allow our dask cluster to adaptively scale from 2 to 24 nodes 581 | cluster.adapt(minimum=2, maximum=24) 582 | 583 | # extract before and after dates from input in format required by PC 584 | before_dates = f'{dates[0]}/{dates[1]}' 585 | after_dates = f'{dates[2]}/{dates[3]}' 586 | 587 | # connect to the planetary computer catalog 588 | catalog = pystac_client.Client.open("https://planetarycomputer.microsoft.com/api/stac/v1") 589 | # sentinel = catalog.get_child('sentinel-2-l2a') 590 | 591 | before_data = get_s2_stack(catalog, before_dates, aoi) 592 | after_data = get_s2_stack(catalog, after_dates, aoi) 593 | 594 | # convert provided coordinates into appropriate format for clipping xarray imagery 595 | xs = [x for x,y in aoi['coordinates'][0]] 596 | ys = [y for x,y in aoi['coordinates'][0]] 597 | bounds = [min(xs), min(ys), max(xs), max(ys)] 598 | 599 | # reduce the before and after image collections to a single image using median value per pixel 600 | before = before_data.median(dim="time") 601 | after = after_data.median(dim="time") 602 | 603 | # compute the result and load to local machine 604 | bef_clip = bef.rio.clip([aoi], crs).compute() 605 | aft_clip = aft.rio.clip([aoi], crs).compute() 606 | 607 | # This non-distributed method seems to be working but timing out 608 | # TODO: try changing chunk dimensions, try increasing timeout time of Webservice 609 | # bd, ad = dask.compute(bef_clip, aft_clip) 610 | 611 | # result_dict = wait([bef_clip, aft_clip], return_when = 'ALL_COMPLETED') 612 | 613 | # close our cluster 614 | client.close() 615 | cluster.shutdown() 616 | # return the before and after images as numpy arrays 617 | return bef_clip.data, aft_clip.data 618 | 619 | def run_local(aoi, dates, m, buff = 128, kernel = 256): 620 | """Retrieve Sentinel-2 imagery from Microsoft Planetary Computer and run change detection 621 | Arguments: 622 | aoi (dict): GeoJson like dictionary defining area of interest 623 | crs (int): 4-digit epsg code representing coordinate reference system of the aoi 624 | dates (tpl): Four YYYY-MM-DD strings defining the before and after periods 625 | m (keras.Model): model to be used to make predictions 626 | buff (int): buffer to strip from prediction patches 627 | kernel (int): size of side of prediction patches 628 | Return: 629 | numpy.ndarray: 3D array with per-pixel change probabilities 630 | """ 631 | # extract before and after dates from input in format required by PC 632 | before_dates = f'{dates[0]}/{dates[1]}' 633 | after_dates = f'{dates[2]}/{dates[3]}' 634 | 635 | # get our before and after stacs 636 | print('retrieving s2 data') 637 | bef_stac, bef_transform = get_s2_stac(before_dates, aoi) 638 | aft_stac, aft_transform = get_s2_stac(after_dates, aoi) # these are projected rioxarrays 639 | 640 | # create median composites 641 | bef_median = bef_stac.median(dim="time") 642 | aft_median = aft_stac.median(dim="time") 643 | 644 | #normalize 645 | bef_norm = normalize_dataArray(bef_median, 'band') 646 | aft_norm = normalize_dataArray(aft_median, 'band') 647 | 648 | # concatenate 649 | ds = xr.concat([bef_norm, aft_norm], dim="band").assign_coords({'band':['B2', 'B3', 'B4', 'B8','B2_1', 'B3_1', 'B4_1', 'B8_1']}) 650 | 651 | C,H,W = 
ds.shape 652 | print('data shape:', ds.shape) # from planetary computer this is C, H, W 653 | rearranged = ds.transpose('y','x','band') 654 | print('rearranged shape', rearranged.shape) 655 | indices = prediction_tools.generate_chip_indices(rearranged, buff, kernel) 656 | print(len(indices), 'indices generated') 657 | template = np.zeros((H, W)) 658 | print('template shape:', template.shape) 659 | # print('generating chips') 660 | # chips, chip_indices = extract_chips(ds) 661 | # print(len(chip_indices), 'chips generated') 662 | dat = rearranged.values 663 | print('running predictions') 664 | output = predict_chips(dat, indices, template, m, kernel = kernel, buff = buff) 665 | 666 | # print(f'returning array of {output.shape}') 667 | return output, bef_median, aft_median, aft_transform 668 | 669 | def run_dask(model_blob_url, weights_blob_url, custom_objects, dates, aoi): 670 | # # create a dask cluster 671 | # print('spinning up Dask Cluster') 672 | # cluster = GatewayCluster( 673 | # address = "https://pccompute.westeurope.cloudapp.azure.com/compute/services/dask-gateway", 674 | # proxy_address = "gateway://pccompute-dask.westeurope.cloudapp.azure.com:80", 675 | # auth = 'jupyterhub', 676 | # worker_cores = 4 677 | # ) 678 | 679 | # client = cluster.get_client() 680 | # client.upload_file(f'{str(ROOT)}/model_tools.py', load = True) 681 | 682 | # # allow our dask cluster to adaptively scale from 2 to 24 nodes 683 | # cluster.adapt(minimum=4, maximum=24) 684 | # print('cluster created', cluster.dashboard_link) 685 | 686 | # extract before and after dates from input in format required by PC 687 | before_dates = f'{dates[0]}/{dates[1]}' 688 | after_dates = f'{dates[2]}/{dates[3]}' 689 | 690 | # get our before and after stacs 691 | print('retrieving s2 data') 692 | bef_stac = get_s2_stac(before_dates, aoi) 693 | aft_stac = get_s2_stac(after_dates, aoi) # these are projected rioxarrays 694 | 695 | # create median composites 696 | bef_median = bef_stac.median(dim="time") 697 | aft_median = aft_stac.median(dim="time") 698 | 699 | #normalize 700 | bef_norm = normalize_dataArray(bef_median, 'band') 701 | aft_norm = normalize_dataArray(aft_median, 'band') 702 | 703 | # concatenate 704 | ds = xr.concat([bef_norm, aft_norm], dim="band").assign_coords({'band':['B2', 'B3', 'B4', 'B8','B2_1', 'B3_1', 'B4_1', 'B8_1']}) 705 | 706 | trimmed = trim_dataArray(ds, 256) 707 | chunked = trimmed.chunk({'x':256, 'y':256}) 708 | 709 | print('running chunked predictions') 710 | meta = np.array([[]], dtype="float32") 711 | predictions_array = chunked.data.map_overlap( 712 | lambda x: predict_chunk(x, model_blob_url, weights_blob_url, custom_objects), 713 | depth = (0, 64, 64), 714 | boundary = 0, 715 | meta=meta, 716 | drop_axis=0 717 | ) 718 | 719 | # predictions = predictions_array 720 | 721 | # # to restore spatial reference, cast back to Xarray 722 | # out = xr.DataArray( 723 | # predictions, 724 | # coords=trimmed.drop_vars("band").coords, 725 | # dims=("y", "x"), 726 | # ) 727 | 728 | return(predictions_array) 729 | 730 | 731 | # def test_PC_connection(): 732 | # """Test our ability to retrieve satellite imagery from Planetary Computer 733 | 734 | # Without any processing, return the first Sentinel-2 image from a date range at 735 | # a known location 736 | # """ 737 | # # Creates the Dask Scheduler. Might take a minute. 
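# Added aside: run_dask() above uses dask's map_overlap so each 256x256 chunk is
# processed with a 64-pixel halo on the y and x axes, avoiding seams at chip edges.
# A minimal, self-contained analogue (array sizes are arbitrary for the example):
#   import dask.array as da
#   arr = da.ones((8, 512, 512), chunks = (8, 256, 256), dtype = 'float32')
#   out = arr.map_overlap(lambda block: block * 2, depth = (0, 64, 64), boundary = 0)
#   out.compute().shape  # (8, 512, 512); run_dask additionally drops the band axis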
738 | # cluster = GatewayCluster( 739 | # address = "https://pccompute.westeurope.cloudapp.azure.com/compute/services/dask-gateway", 740 | # proxy_address = "gateway://pccompute-dask.westeurope.cloudapp.azure.com:80", 741 | # auth = 'jupyterhub', 742 | # worker_cores = 4 743 | # ) 744 | 745 | # client = cluster.get_client() 746 | 747 | # # allow our dask cluster to adaptively scale from 2 to 24 nodes 748 | # cluster.adapt(minimum=2, maximum=24) 749 | 750 | # # define fixed start and end date for summer 2021 751 | # before_dates = '2021-05-01/2021-08-01' 752 | 753 | # # connect to the planetary computer catalog 754 | # catalog = pcClient.open("https://planetarycomputer.microsoft.com/api/stac/v1") 755 | # sentinel = catalog.get_child('sentinel-2-l2a') 756 | 757 | # search = catalog.search( 758 | # collections = ['sentinel-2-l2a'], 759 | # datetime=before_dates, 760 | # intersects=aoi 761 | # ) 762 | 763 | # search_list = list(search_before.get_items()) 764 | 765 | # least_cloudy = [item for item in search_list if item.properties['eo:cloud_cover'] <= 10] 766 | 767 | # items = [pc.sign_item(i).to_dict() for i in least_cloudy] 768 | 769 | # # sanity check to make sure we have retrieved and authenticated items fro planetary computer 770 | # ilen = len(items) 771 | # print(f'{ilen} images in collection') 772 | 773 | # # convert provided coordinates into appropriate format for clipping xarray imagery 774 | # bounds = [-76.503778, 38.988321, -76.530776, 38.988322] 775 | 776 | # # create an 777 | # data = ( 778 | # stackstac.stack( 779 | # items[0], 780 | # epsg = 32617, 781 | # bounds_latlon = bounds, 782 | # sortby_date = 'desc', 783 | # # resolution=10, 784 | # assets=['B02', 'B03', 'B04', 'B08'], # blue, green, red, nir 785 | # # chunks is for parallel computing on Dask cluster, only refers to spatial dimension 786 | # chunksize= 'auto' # don't make smaller than native S2 tiles (100x100km) 787 | # ) 788 | # .where(lambda x: x > 0, other=np.nan) # sentinel-2 uses 0 as nodata 789 | # .assign_coords(band = lambda x: x.common_name.rename("band")) # use common names 790 | # ) 791 | 792 | # # reduce the before and after image collections to a single image using first valid pixel 793 | # before = data.mosaic(dim="time") 794 | 795 | # # assign the native sentinel-2 crs the resulting xarrays 796 | # bef = before.rio.set_crs(32617) 797 | 798 | # # compute the result and load to local machine 799 | # bef_local = bef.compute() 800 | 801 | # # This non-distributed method seems to be working but timing out 802 | # # TODO: try changing chunk dimensions, try increasing timeout time of Webservice 803 | # # bd, ad = dask.compute(bef_clip, aft_clip) 804 | 805 | # # result_dict = wait([bef_clip, aft_clip], return_when = 'ALL_COMPLETED') 806 | 807 | # # close our cluster 808 | # client.close() 809 | # cluster.shutdown() 810 | # # return the image as numpy arrays 811 | # return bef_local.data 812 | 813 | -------------------------------------------------------------------------------- /utils/prediction_tools.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Dec 4 19:24:42 2020 4 | 5 | @author: MEvans 6 | """ 7 | import os 8 | from os.path import join 9 | import sys 10 | from sys import path 11 | from pathlib import Path 12 | 13 | # import ee 14 | import json 15 | import numpy as np 16 | import tensorflow as tf 17 | from matplotlib import pyplot as plt 18 | #import gsutil 19 | import rasterio as rio 20 | from rasterio.crs import 
CRS 21 | from rasterio.warp import transform_bounds 22 | from rasterio.transform import array_bounds 23 | 24 | FILE = Path(__file__).resolve() # full path to the current file, including extension 25 | print('filepath', FILE) 26 | ROOT = FILE.parents[0] # list of upstream directories containing file 27 | print('root', ROOT) 28 | REL = Path(os.path.relpath(ROOT, Path.cwd())) 29 | print('rel', REL) 30 | if str(ROOT) not in sys.path: 31 | path.append(str(ROOT)) 32 | if str(REL) not in sys.path: 33 | path.append(str(REL)) # add REL to PATH 34 | 35 | from processing import normalize_tensor, rescale_tensor 36 | # TODO: automate spliting of full GEE path 37 | # def doExport(image, features, scale, bucket, pred_base, pred_path, region, kernel_shape = [256, 256], kernel_buffer = [128,128]): 38 | # """ 39 | # Run an image export task on which to run predictions. Block until complete. 40 | # Parameters: 41 | # image (ee.Image): image to be exported for prediction 42 | # features (list): list of band names to include in export 43 | # scale (int): pixel scale 44 | # bucket (str): name of GCS bucket to write files 45 | # pred_path (str): relative google cloud directory path for export 46 | # pred_base (str): base filename of exported image 47 | # kernel_shape (array): size of image patch in pixels 48 | # kernel_buffer (array): pixels to buffer the prediction patch. half added to each side 49 | # region (ee.Geometry): 50 | # """ 51 | # task = ee.batch.Export.image.toCloudStorage( 52 | # image = image.select(features), 53 | # description = pred_base, 54 | # bucket = bucket, 55 | # fileNamePrefix = join(pred_path, pred_base), 56 | # region = region,#.getInfo()['coordinates'], 57 | # scale = scale, 58 | # fileFormat = 'TFRecord', 59 | # maxPixels = 1e13, 60 | # formatOptions = { 61 | # 'patchDimensions': kernel_shape, 62 | # 'kernelSize': kernel_buffer, 63 | # 'compressed': True, 64 | # 'maxFileSize': 104857600 65 | # } 66 | # ) 67 | # task.start() 68 | 69 | # # Block until the task completes. 
70 | # print('Running image export to Cloud Storage...') 71 | # import time 72 | # while task.active(): 73 | # time.sleep(30) 74 | 75 | # # Error condition 76 | # if task.status()['state'] != 'COMPLETED': 77 | # print('Error with image export.') 78 | # else: 79 | # print('Image export completed.') 80 | 81 | # # Error condition 82 | # if task.status()['state'] != 'COMPLETED': 83 | # print('Error with image export.') 84 | # else: 85 | # print('Image export completed.') 86 | 87 | def generate_chip_indices(arr, buff = 128, kernel = 256): 88 | """ 89 | Parameters 90 | --- 91 | arr: np.ndarray 92 | 3D array (H, W, C) for which indices should be generated 93 | buff: int 94 | size of pixels to be trimmed from chips 95 | kernel: int 96 | size of contiguous image chips 97 | Return 98 | --- 99 | list::np.ndarray: list containing (y,x) index of chips upper left corner 100 | """ 101 | H, W, C = arr.shape 102 | side = buff + kernel 103 | x_buff = y_buff = buff//2 104 | 105 | y_indices = list(range(y_buff, H - side, kernel)) 106 | x_indices = list(range(x_buff, W - side, kernel)) 107 | 108 | indices = [(y_index, x_index) for y_index in y_indices for x_index in x_indices] 109 | return indices 110 | 111 | def extract_chips(arr, buff = 128, kernel = 256): 112 | """Break an array into (potentially) overlapping chips for analysis 113 | Arguments: 114 | arr (ndarray): 3D array to run predictions on 115 | buff (int): size of pixels to be trimmed from chips 116 | kernel (int): size of contiguous image chips 117 | Return: 118 | list::np.ndarray: list containing image chips of size (kernel+buff, kernel+buff) 119 | """ 120 | H, W, C = arr.shape 121 | side = buff + kernel 122 | x_buff = y_buff = buff//2 123 | chips = [] 124 | 125 | chip_indices = generate_chip_indices(arr, buff, kernel) 126 | 127 | for x, y in chip_indices: 128 | chip = arr[y-y_buff:y+kernel+y_buff, x-x_buff:x+kernel+x_buff, :] 129 | chips.append(chip) 130 | 131 | return chips 132 | 133 | def predict_chips(arr, chip_indices, template, m, kernel = 256, buff = 128): 134 | """Predict changes in image chips 135 | Arguments: 136 | chips (list): kernel+buff x kernel+buff pixel chips to be fed to U-Net model 137 | chip_indices (list): list of (y,x) tuples marking position of chip upper-left corner in output array 138 | m (keras.Model): model to be used to make predictions 139 | template (ndarray): 2D all-zero array to which predictions will be written 140 | buff (int): total number of pixels to be trimmed from output chips in x and y direction 141 | kernel (int): number of pixels in x and y retained in prediction chips 142 | Return: 143 | ndarray: 3D array of size output.shape containing change probabilities 144 | """ 145 | y_buff = x_buff = buff//2 146 | if len(chip_indices) >= 1: 147 | for y, x in chip_indices: 148 | print(y,x) 149 | chip = arr[y - y_buff:y+kernel+y_buff, x - x_buff:x + kernel + x_buff, :] 150 | print(chip.shape) 151 | # preds = m.predict(np.array([chips[i]]), verbose = 0) 152 | preds = m.predict(np.array([chip]), verbose = 0) 153 | print(preds.shape) 154 | template[y:y+kernel, x:x+kernel] += preds[0, y_buff:(kernel + y_buff), x_buff:(kernel+x_buff), 0] 155 | 156 | return template 157 | 158 | #def makePredDataset(bucket, pred_path, pred_image_base, kernel_buffer, features, raw = None): 159 | def make_pred_dataset(file_list, features, kernel_shape = [256, 256], kernel_buffer = [128, 128], axes = [2], splits = None, moments = None, one_hot = None, **kwargs): 160 | """ Make a TFRecord Dataset that can be used for predictions 161 | 
Parameters: 162 | file_list: list of complete pathnames for prediction data files 163 | pred_path (str): path to .tfrecord files 164 | pred_image_base (str): pattern matching basename of file(s) 165 | kernel_shape (tpl): size of image patch in pixels 166 | kernel_buffer (tpl): pixels to trim from H, W dimensions of prediction 167 | features (list): names of features in incoming data 168 | axes (list): axes for normalization 169 | one_hot (dict): key:value pairs for name of one-hot variable and desired one-hot depth 170 | Return: 171 | TFRecord Dataset 172 | """ 173 | 174 | # Make sure the files are in the right order. 175 | file_list.sort() 176 | 177 | # Get set up for prediction. 178 | x_buffer = int(kernel_buffer[0] / 2) 179 | y_buffer = int(kernel_buffer[1] / 2) 180 | 181 | buffered_shape = [ 182 | kernel_shape[0] + kernel_buffer[0], 183 | kernel_shape[1] + kernel_buffer[1]] 184 | 185 | imageColumns = [ 186 | tf.io.FixedLenFeature(shape=buffered_shape, dtype=tf.float32) 187 | for k in features 188 | ] 189 | 190 | imageFeaturesDict = dict(zip(features, imageColumns)) 191 | 192 | def parse_image(example_proto): 193 | return tf.io.parse_single_example(example_proto, imageFeaturesDict) 194 | 195 | def toTupleImage(dic): 196 | 197 | # stack the augmented bands, optional one-hot tensors, and response variable 198 | if one_hot: 199 | featList = [dic.get(key) for key in features if key not in one_hot.keys()] 200 | hotList = [tf.one_hot(tf.cast(dic.get(key), tf.uint8), val, axis = 2) for key, val in one_hot.items()] 201 | else: 202 | featList = [dic.get(key) for key in features] 203 | 204 | bands = tf.transpose(tf.stack(featList, axis = 0), [1,2,0]) 205 | bands = rescale_tensor(bands, axes = axes, moments = moments, splits = splits) 206 | # If custom preprocessing functions are specified add respective bands 207 | 208 | for fxn in kwargs.values(): 209 | der = fxn(dic) 210 | der = tf.expand_dims(der, axis = 2) 211 | bands = tf.concat([bands, der], axis = 2) 212 | 213 | if one_hot: 214 | hotStack = tf.concat(hotList, axis = 2) 215 | stacked = tf.concat([bands, hotStack], axis =2) 216 | else: 217 | stacked = tf.concat([bands], axis = 2) 218 | 219 | return stacked 220 | 221 | # Create a dataset(s) from the TFRecord file(s) in Cloud Storage. 222 | 223 | imageDataset = tf.data.TFRecordDataset(file_list, compression_type='GZIP') 224 | imageDataset = imageDataset.map(parse_image, num_parallel_calls=5) 225 | imageDataset = imageDataset.map(toTupleImage).batch(1) 226 | return imageDataset 227 | 228 | def plot_to_image(figure): 229 | """Converts the matplotlib plot specified by 'figure' to a PNG image and 230 | returns it. The supplied figure is closed and inaccessible after this call.""" 231 | # Save the plot to a PNG in memory. 232 | import io 233 | buf = io.BytesIO() 234 | plt.savefig(buf, format='png') 235 | # Closing the figure prevents it from being displayed directly inside 236 | # the notebook. 237 | plt.close(figure) 238 | buf.seek(0) 239 | # Convert PNG buffer to TF image 240 | image = tf.image.decode_png(buf.getvalue(), channels=4) 241 | # Add the batch dimension 242 | image = tf.expand_dims(image, 0) 243 | return image 244 | 245 | def callback_predictions(imageDataset, model, mixer, kernel_shape = [256, 256], kernel_buffer = [128, 128]): 246 | patches = mixer['totalPatches'] 247 | cols = mixer['patchesPerRow'] 248 | rows = patches//cols 249 | 250 | # Perform inference. 
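    # Added note: the mixer metadata drives how the flat prediction patches are
    # tiled back into a 2D array below. For example, with totalPatches = 6 and
    # patchesPerRow = 3, patches 1-3 are appended into the first row and 4-6 into
    # the second, so after the buffers are trimmed the output shape is
    # (2 * kernel, 3 * kernel) = (512, 768) for the default 256-pixel kernel.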
251 | predictions = model.predict(imageDataset, steps=patches, verbose=1) 252 | 253 | # some models will outputs probs (B, H, W, NCLASSES) and classes (B, H, W) as a list 254 | if type(predictions) == list: 255 | # in this case lets just grab the probabilities 256 | predictions = predictions[0] 257 | 258 | x_buffer = int(kernel_buffer[0] / 2) 259 | y_buffer = int(kernel_buffer[1] / 2) 260 | x_size = kernel_shape[0]+y_buffer 261 | y_size = kernel_shape[1]+x_buffer 262 | 263 | x = 1 264 | for prediction in predictions: 265 | print('Writing patch ' + str(x) + '...') 266 | # write probability of target class (i.e. 1), classes can be calculated post processing if not present already 267 | patch = prediction[y_buffer:y_size, x_buffer:x_size, 1] 268 | # predPatch = np.add(np.argmax(prediction, axis = 2), 1) 269 | # probPatch = np.max(prediction, axis = 2) 270 | # predPatch = predPatch[x_buffer:x_buffer+KERNEL_SIZE, y_buffer:y_buffer+KERNEL_SIZE] 271 | # probPatch = probPatch[x_buffer:x_buffer+KERNEL_SIZE, y_buffer:y_buffer+KERNEL_SIZE] 272 | # # stack probabilities and classes along channel dimension 273 | # patch = np.stack([predPatch, probPatch], axis = 2) 274 | 275 | ## NOTE: Predictions come out with y as 0 dimension (ie. rows), x as 1 dimension (ie. columns) 276 | # if we're at the beginning of a row 277 | if x%cols == 1: 278 | row = patch 279 | else: 280 | row = np.append(row, patch, axis = 1) 281 | # if we reached the end of a row start a new one 282 | if x%cols == 0: 283 | # for the first row, create single row rows object 284 | if x <= cols: 285 | rows = row 286 | else: 287 | # add current row to previous rows along y axis 288 | rows = np.append(rows, row, axis = 0) 289 | x += 1 290 | 291 | return rows 292 | 293 | def make_array_predictions(imageDataset, model, jsonFile, kernel_shape = [256, 256], kernel_buffer = [128,128]): 294 | """Create a 3D array of prediction outputs from TFRecord dataset 295 | 296 | Given a set of TFRecords representing image patches on which to run model predictions, 297 | and a json file specifying the spatial reference system and arrangement of patches, 298 | this function writes predictions to a single, reconstructed numpy array of shape 299 | (?,?,2). Dimension 2 holds class probabilities and most likely class. 300 | 301 | Parameters: 302 | imageDataset (tf.Dataset): image patch tensors on which to run predictions 303 | model (keras Model): model used to make predictions 304 | jsonFile (str): complete GCS filepath to json file 305 | kernel_size(tpl): size of image patch in pixels 306 | kernel_buffer (tpl): pixels to trim from H, W, dimensions of each output patch 307 | Return: 308 | ndarray: 3D array of prediction outputs. 309 | """ 310 | # we need metadata from the json file to reconstruct prediction patches 311 | # Load the contents of the mixer file to a JSON object. 312 | # jsonFile = '/'.join(jsonFile.split(sep = '/')[3:]) 313 | # blob = bucket.get_blob(jsonFile) #23Mar21 update to use google-cloud-storage library 314 | # jsonText = blob.download_as_string().decode('utf-8') 315 | # mixer = json.loads(jsonText) 316 | 317 | with open(jsonFile,) as file: 318 | mixer = json.load(file) 319 | 320 | # # Load the contents of the mixer file to a JSON object. 
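# Usage sketch (editorial, hedged): reassembling per-patch predictions into a single mosaic
# with callback_predictions() above. The mixer filename, the prediction dataset, and the
# trained model are illustrative assumptions.
def _example_callback_predictions(pred_dataset, model):
    with open('unet256_pred-mixer.json') as f:  # hypothetical mixer file
        mixer = json.load(f)
    # returns a 2D mosaic of class-1 probabilities, rebuilt row by row
    return callback_predictions(pred_dataset, model, mixer,
                                kernel_shape=[256, 256],
                                kernel_buffer=[128, 128])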
321 | # jsonText = !gsutil cat {jsonFile} 322 | # 323 | # # Get a single string w/ newlines from the IPython.utils.text.SList 324 | # mixer = json.loads(jsonText.nlstr) 325 | 326 | print(mixer) 327 | patches = mixer['totalPatches'] 328 | cols = mixer['patchesPerRow'] 329 | rows = patches//cols 330 | 331 | # Perform inference. 332 | print('Running predictions...') 333 | predictions = model.predict(imageDataset, steps=patches, verbose=1) 334 | 335 | # some models will outputs probs and classes as a list 336 | if type(predictions) == list: 337 | # in this case, concatenate list elments into a single 4d array along last dimension 338 | predictions = np.concatenate(predictions, axis = 3) 339 | 340 | x_buffer = int(kernel_buffer[0] / 2) 341 | y_buffer = int(kernel_buffer[1] / 2) 342 | x_size = kernel_shape[0]+y_buffer 343 | y_size = kernel_shape[1]+x_buffer 344 | 345 | x = 1 346 | for prediction in predictions: 347 | print('Writing patch ' + str(x) + '...') 348 | # lets just write probabilities, classes can be calculated post processing if not present already 349 | patch = prediction[y_buffer:y_size, x_buffer:x_size, :] 350 | # predPatch = np.add(np.argmax(prediction, axis = 2), 1) 351 | # probPatch = np.max(prediction, axis = 2) 352 | # predPatch = predPatch[x_buffer:x_buffer+KERNEL_SIZE, y_buffer:y_buffer+KERNEL_SIZE] 353 | # probPatch = probPatch[x_buffer:x_buffer+KERNEL_SIZE, y_buffer:y_buffer+KERNEL_SIZE] 354 | # # stack probabilities and classes along channel dimension 355 | # patch = np.stack([predPatch, probPatch], axis = 2) 356 | 357 | ## NOTE: Predictions come out with y as 0 dimension (ie. rows), x as 1 dimension (ie. columns) 358 | # if we're at the beginning of a row 359 | if x%cols == 1: 360 | row = patch 361 | else: 362 | row = np.append(row, patch, axis = 1) 363 | # if we reached the end of a row start a new one 364 | if x%cols == 0: 365 | # for the first row, create single row rows object 366 | if x <= cols: 367 | rows = row 368 | else: 369 | # add current row to previous rows along y axis 370 | rows = np.append(rows, row, axis = 0) 371 | x += 1 372 | 373 | return rows 374 | 375 | def write_tfrecord_predictions(predictions, pred_path, out_image_base, kernel_shape = [256, 256], kernel_buffer = [128,128]): 376 | """Generate predictions and save as TFRecords to Cloud Storage 377 | Parameters: 378 | imageDataset (tf.Dataset): data on which to run predictions 379 | pred_path (str): full path to output directory 380 | out_image_base (str): file basename for input and output files 381 | kernel_shape (tpl): [y, x] size of image patch in pixels 382 | kernel_buffer (tpl): [y, x] size of buffer to be trimmed from predictions 383 | 384 | Return: 385 | empty: Writes TFRecord files to specified destination 386 | """ 387 | # Perform inference. 
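# Usage sketch (editorial, hedged): reconstructing a full prediction array with
# make_array_predictions() above and saving it locally. The model file and output names are
# illustrative assumptions (this repo's models may also require custom_objects to load).
def _example_make_array_predictions(pred_dataset):
    model = tf.keras.models.load_model('unet256.h5')  # hypothetical model file
    preds = make_array_predictions(pred_dataset, model, 'unet256_pred-mixer.json')
    np.save('unet256_predictions.npy', preds)
    return preds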
388 | # print('Running predictions...') 389 | # predictions = model.predict(imageDataset, steps=None, verbose=1) 390 | # print(predictions[0]) 391 | 392 | # some models will outputs probs and classes as a list 393 | if type(predictions) == list: 394 | # in this case, concatenate list elments into a single 4d array along last dimension 395 | predictions = np.concatenate(predictions, axis = 3) 396 | 397 | # get the number of bands (should usually be one or two) 398 | C = predictions.shape[-1] 399 | 400 | out_image_file = join(pred_path, f'{out_image_base}.tfrecords') 401 | 402 | print('Writing predictions to ' + out_image_file + '...') 403 | writer = tf.io.TFRecordWriter(out_image_file) 404 | 405 | patches = 1 406 | 407 | x_buffer = int(kernel_buffer[0] / 2) 408 | y_buffer = int(kernel_buffer[1] / 2) 409 | x_size = x_buffer + kernel_shape[1] 410 | y_size = y_buffer + kernel_shape[0] 411 | 412 | for prediction in predictions: 413 | print('Writing patch ' + str(patches) + '...') 414 | # lets just write probabilities, classes can be calculated post processing if not present already 415 | patch = prediction[y_buffer:y_size, x_buffer:x_size, :] 416 | # predPatch = np.add(np.argmax(prediction, axis = 2), 1) 417 | # probPatch = np.max(prediction, axis = 2) 418 | # predPatch = predPatch[x_buffer:x_buffer+KERNEL_SIZE, y_buffer:y_buffer+KERNEL_SIZE] 419 | # probPatch = probPatch[x_buffer:x_buffer+KERNEL_SIZE, y_buffer:y_buffer+KERNEL_SIZE] 420 | 421 | # for each band in prediction, create a tf train feature 422 | feature = {} 423 | for i in range(C): 424 | feat = tf.train.Feature(float_list = tf.train.FloatList(value = np.ndarray.flatten(patch[:,:,i]))) 425 | feature['b{}'.format(i+1)] = feat 426 | 427 | # Create an example. 428 | example = tf.train.Example( 429 | features=tf.train.Features( 430 | feature = feature 431 | # feature={ 432 | # 'class': tf.train.Feature( 433 | # int64_list=tf.train.Int64List( 434 | # value = np.ndarray.flatten(predPatch))), 435 | # 'prob': tf.train.Feature( 436 | # float_list = tf.train.FloatList( 437 | # value = np.ndarray.flatten(probPatch))) 438 | # } 439 | ) 440 | ) 441 | # Write the example. 
442 | writer.write(example.SerializeToString()) 443 | patches += 1 444 | 445 | writer.close() 446 | 447 | def write_geotiff_prediction(image, jsonFile, aoi): 448 | with open(jsonFile,) as file: 449 | mixer = json.load(file) 450 | 451 | transform = mixer['projection']['affine']['doubleMatrix'] 452 | crs = mixer['projection']['crs'] 453 | ppr = mixer['patchesPerRow'] 454 | tp = mixer['totalPatches'] 455 | rows = int(tp/ppr) 456 | 457 | if image.ndim < 3: 458 | image = np.expand_dims(image, axis = -1) 459 | 460 | affine = rio.Affine(transform[0], transform[1], transform[2], transform[3], transform[4], transform[5]) 461 | 462 | with rio.open( 463 | f'{aoi}.tif', 464 | 'w', 465 | driver = 'GTiff', 466 | width = image.shape[1], 467 | height = image.shape[0], 468 | count = image.shape[2], 469 | dtype = image.dtype, 470 | crs = crs, 471 | transform = affine) as dst: 472 | dst.write(np.transpose(image, (2,0,1))) 473 | 474 | # TODO: re-calculate n and write files not strictly based on rows 475 | def write_geotiff_predictions(imageDataset, model, jsonFile, outImgBase, outImgPath, kernel_buffer = [128,128]): 476 | """Run predictions on a TFRecord dataset and save as a GeoTIFF 477 | Parameters: 478 | imageDataset (tf.Dataset): data on which to run predictions 479 | model (tf.keras.Model): trained model 480 | jsonFile (str): filename of json mixer file 481 | outImgPath (str): directory in which to write predictions 482 | outImgBase (str): file basename 483 | kernel_buffer (tpl): x and y padding around patches 484 | Return: 485 | empty: writes geotiff records temporarily to working directory 486 | """ 487 | with open(jsonFile, ) as file: 488 | mixer = json.load(file) 489 | transform = mixer['projection']['affine']['doubleMatrix'] 490 | crs = mixer['projection']['crs'] 491 | ppr = mixer['patchesPerRow'] 492 | tp = mixer['totalPatches'] 493 | rows = int(tp/ppr) 494 | kernel_shape = mixer['patchDimensions'] 495 | 496 | H = rows*kernel_shape[0] 497 | W = ppr*kernel_shape[1] 498 | y_indices = list(range(0, H, kernel_shape[0])) 499 | x_indices = list(range(0, W, kernel_shape[1])) 500 | indices = [(y,x) for y in y_indices for x in x_indices] 501 | out_array = np.zeros((H, W, 1), dtype = np.float32) 502 | print('out array', out_array.shape) 503 | x_buffer = int(kernel_buffer[0]/2) 504 | y_buffer = int(kernel_buffer[1]/2) 505 | x_size = x_buffer + kernel_shape[1] 506 | y_size = y_buffer + kernel_shape[0] 507 | 508 | # prediction = model.predict(imageDataset, steps = tp, verbose = 1) 509 | # if type(predictions) == list: 510 | # predictions = np.concatenate(predictions, axis = 3) 511 | 512 | iterator = iter(imageDataset) 513 | 514 | for i, (y,x) in enumerate (indices): 515 | prediction = model.predict(iterator.next(), steps = 1, verbose = 1) 516 | if type(prediction) == list: 517 | prediction = np.concatenate(prediction, axis = 3) 518 | # prediction = predictions[i] 519 | print('prediction', prediction.shape) 520 | out_array[y:y+kernel_shape[0], x:x+kernel_shape[1], 0] += prediction[0, y_buffer:y_size, x_buffer:x_size, 0] 521 | 522 | affine = rio.Affine(transform[0], transform[1], transform[2], transform[3], transform[4], transform[5]) 523 | 524 | out_image_file = join(outImgPath, f'{outImgBase}.tif') 525 | print(f'writing image to {out_image_file}') 526 | with rio.open( 527 | out_image_file, 528 | 'w', 529 | driver = 'GTiff', 530 | width = W, 531 | height = H, 532 | count = 1, 533 | dtype = out_array.dtype, 534 | crs = crs, 535 | transform = affine) as dst: 536 | dst.write(np.transpose(out_array, (2,0,1))) 537 | 
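# Usage sketch (editorial, hedged): exporting a prediction mosaic as a GeoTIFF with
# write_geotiff_prediction() above, then retrieving display bounds with get_img_bounds()
# (defined below). The array, file names, and the folium use case are illustrative
# assumptions.
def _example_geotiff_export(pred_array):
    # writes 'example_aoi.tif' to the working directory using the mixer's CRS and transform
    write_geotiff_prediction(pred_array, 'unet256_pred-mixer.json', 'example_aoi')
    # bounds come back as [[lat_min, lon_min], [lat_max, lon_max]], the order expected by
    # folium.Map.fit_bounds()
    bounds = get_img_bounds('example_aoi.tif', 'unet256_pred-mixer.json',
                            dst_crs='EPSG:4326')
    return bounds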
538 | #def ingest_predictions(pred_path, out_image_base, user_folder): 539 | # """ 540 | # Upload prediction image(s) to Earth Engine. 541 | # Parameters: 542 | # pred_path (str): Google cloud (or Drive) path storing prediction image files 543 | # pred_image_base (str): 544 | # user_folder (str): GEE directory to store asset 545 | # out_image_base (str): base filename for GEE asset 546 | # """ 547 | # blob = bucket.get_blob(join(pred_path, out_image_base + '_mixer.json')) 548 | # jsonFile = blob.name 549 | # 550 | ## jsonFile = !gsutil ls {join('gs://', pred_path, out_image_base + '*.json')} 551 | # print(jsonFile) 552 | # blobs = bucket.list_blobs(join(pred_path, 'outputs', out_image_base + )) 553 | # predFiles = !gsutil ls {join('gs://', pred_path, 'outputs', out_image_base + '*TFRecord')} 554 | # print(predFiles) 555 | # out_image_files = ' '.join(predFiles) 556 | # # Start the upload. 557 | # out_image_asset = join(user_folder, out_image_base) 558 | # !earthengine upload image --asset_id={out_image_asset} {out_image_files} {jsonFile[0]} 559 | 560 | def get_img_bounds(img, jsonFile, dst_crs = None): 561 | """Get the projected top left and bottom right coordinates of an image 562 | Parameters: 563 | img (ndarray): image to generate bounding coordinates for 564 | jsonFile (str): path to json file defining crs and image size 565 | dst_crs (str): epsg code for output crs 566 | Return: 567 | tpl: [[lat min, lon min],[lat max, lon max]] 568 | """ 569 | # Get a single string w/ newlines from the IPython.utils.text.SList 570 | with open(jsonFile,) as f: 571 | mixer = json.load(f) 572 | # mixer = json.loads(jsonText.nlstr) 573 | transform = mixer['projection']['affine']['doubleMatrix'] 574 | print(transform) 575 | src_crs = CRS.from_string(mixer['projection']['crs']) 576 | print(src_crs) 577 | affine = rio.Affine(transform[0], transform[1], transform[2], transform[3], transform[4], transform[5]) 578 | H,W = [0,0] 579 | 580 | if type(img) == np.ndarray: 581 | print('input image is numpy') 582 | H,W = img.shape 583 | print('image shape is ', H, W) 584 | bounds = array_bounds(H, W, affine) 585 | 586 | elif type(img) == str: 587 | print('input image is geotiff') 588 | with rio.open(img) as src: 589 | bounds = src.bounds 590 | # H, W = src.shape 591 | 592 | print(bounds) 593 | lon_min, lat_min, lon_max, lat_max = bounds 594 | # if we need to transform the bounds, such as for folium ('EPSG:3857') 595 | if dst_crs: 596 | dst_crs = CRS.from_string(dst_crs) 597 | out_bounds = transform_bounds(src_crs, dst_crs, left = lon_min, bottom = lat_min, right = lon_max, top = lat_max, densify_pts=21) 598 | lon_min, lat_min, lon_max, lat_max = out_bounds 599 | print(out_bounds) 600 | return [[lat_min, lon_min], [lat_max, lon_max]] 601 | 602 | def doPrediction(bucket, pred_path, pred_image_base, features, one_hot, out_image_base, kernel_shape, kernel_buffer): 603 | """ 604 | Given a bucket and path to prediction images, create a prediction dataset, make predictions 605 | and write tfrecords to GCS 606 | Parameters: 607 | bucket: (Bucket): google-cloud-storage bucket object 608 | pred_path (str): relative GCS path storing prediction image files 609 | pred_image_base (str): base filename of prediction files 610 | user_folder (str): GEE directory to store asset 611 | out_image_base (str): base filename for GEE asset 612 | kernel_buffer (Array): length 2 array 613 | Return: 614 | list: list of written image filenames to be used in earthengine upload 615 | """ 616 | 617 | print('Looking for TFRecord files...') 618 | 619 
| # Get a list of all the files in the output bucket. 620 | blobs = bucket.list_blobs(prefix = join(pred_path, pred_image_base)) 621 | filesList = [file.name for file in blobs if pred_image_base in file.name] 622 | # filesList = !gsutil ls {pred_path} 623 | # Get only the files generated by the image export. 624 | # exportFilesList = [s for s in filesList if pred_image_base in s] 625 | 626 | # Get the list of image files and the JSON mixer file. 627 | imageFilesList = [] 628 | jsonFile = None 629 | for f in filesList: 630 | if f.endswith('.tfrecord.gz'): 631 | imageFilesList.append(f) 632 | elif f.endswith('.json'): 633 | jsonFile = f 634 | 635 | # Make sure the files are in the right order. 636 | imageFilesList.sort() 637 | 638 | from pprint import pprint 639 | pprint('image files:', imageFilesList) 640 | print('json file:', jsonFile) 641 | 642 | # make a prediction dataset from the given files 643 | 644 | # Load the contents of the mixer file to a JSON object. 645 | blob = bucket.get_blob(jsonFile) 646 | jsonText = blob.download_as_string().decode('utf-8') 647 | mixer = json.loads(jsonText) 648 | # jsonText = !gsutil cat {jsonFile} 649 | # Get a single string w/ newlines from the IPython.utils.text.SList 650 | # mixer = json.loads(jsonText.nlstr) 651 | pprint(mixer) 652 | patches = mixer['totalPatches'] 653 | 654 | # # Get set up for prediction. 655 | # x_buffer = int(kernel_buffer[0] / 2) 656 | # y_buffer = int(kernel_buffer[1] / 2) 657 | # 658 | # buffered_shape = [ 659 | # KERNEL_SHAPE[0] + kernel_buffer[0], 660 | # KERNEL_SHAPE[1] + kernel_buffer[1]] 661 | # 662 | # imageColumns = [ 663 | # tf.io.FixedLenFeature(shape=buffered_shape, dtype=tf.float32) 664 | # for k in BANDS 665 | # ] 666 | # 667 | # imageFeaturesDict = dict(zip(BANDS, imageColumns)) 668 | # 669 | # def parse_image(example_proto): 670 | # return tf.io.parse_single_example(example_proto, imageFeaturesDict) 671 | # 672 | # def toTupleImage(dic): 673 | # inputsList = [dic.get(key) for key in BANDS] 674 | # stacked = tf.stack(inputsList, axis=0) 675 | # stacked = tf.transpose(stacked, [1, 2, 0]) 676 | # stacked = normalize(stacked, [0, 1]) 677 | # return stacked 678 | 679 | # Create a dataset(s) from the TFRecord file(s) in Cloud Storage. 680 | i = 0 681 | patches = 0 682 | written_files = [] 683 | while i < len(imageFilesList): 684 | imageDataset = make_pred_dataset(file_list = imageFilesList[i:i+100], kernel_shape = kernel_shape, kernel_buffer = kernel_buffer, features = features, one_hot = one_hot) 685 | # imageDataset = tf.data.TFRecordDataset(imageFilesList[i:i+100], compression_type='GZIP') 686 | # imageDataset = imageDataset.map(parse_image, num_parallel_calls=5) 687 | # imageDataset = imageDataset.map(toTupleImage).batch(1) 688 | 689 | out_image_base = out_image_base + '{:04d}'.format(i) 690 | out_image_file = join('gs://', bucket.name, pred_path, 'outputs/tfrecord', out_image_base + '.TFRecord') 691 | write_tfrecord_predictions(imageDataset, pred_path = pred_path, out_image_base = out_image_base, kernel_buffer = kernel_buffer) 692 | # # Perform inference. 
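# Usage sketch (editorial, hedged): the inference-and-write flow, similar to the commented-out
# block that follows. A trained model is assumed here because write_tfrecord_predictions()
# expects precomputed predictions rather than a dataset; paths and names are illustrative
# assumptions.
def _example_write_tfrecord_predictions(pred_dataset, total_patches):
    model = tf.keras.models.load_model('unet256.h5')  # hypothetical model file
    predictions = model.predict(pred_dataset, steps=total_patches, verbose=1)
    write_tfrecord_predictions(predictions,
                               pred_path='outputs/tfrecord',  # hypothetical directory
                               out_image_base='unet256_pred0000',
                               kernel_shape=[256, 256],
                               kernel_buffer=[128, 128])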
693 | # print('Running predictions...') 694 | # predictions = m.predict(imageDataset, steps=None, verbose=1) 695 | # # print(predictions[0]) 696 | # 697 | # 698 | # 699 | # print('Writing predictions to ' + out_image_file + '...') 700 | # writer = tf.io.TFRecordWriter(out_image_file) 701 | # for predictionPatch in predictions: 702 | # print('Writing patch ' + str(patches) + '...') 703 | # predictionPatch = predictionPatch[ 704 | # x_buffer:x_buffer+KERNEL_SIZE, y_buffer:y_buffer+KERNEL_SIZE] 705 | # 706 | # # Create an example. 707 | # example = tf.train.Example( 708 | # features=tf.train.Features( 709 | # feature={ 710 | # 'probability': tf.train.Feature( 711 | # float_list=tf.train.FloatList( 712 | # value=predictionPatch.flatten())) 713 | # } 714 | # ) 715 | # ) 716 | # # Write the example. 717 | # writer.write(example.SerializeToString()) 718 | # patches += 1 719 | # 720 | # writer.close() 721 | i += 100 722 | written_files.append(out_image_file) 723 | 724 | out_image_files = ' '.join(written_files) 725 | # Start the upload. 726 | # out_image_asset = join(user_folder, out_image_base) 727 | # !earthengine upload image --asset_id={out_image_asset} {out_image_files} {jsonFile} 728 | # return list of written image files for use in gee upload 729 | return out_image_files 730 | 731 | def predict_pc_local(aoi, dates, m, buff = 128, kernel = 256): 732 | """Retrieve Sentinel-2 imagery from Microsoft Planetary Computer and run change detection 733 | Arguments: 734 | aoi (dict): GeoJson like dictionary defining area of interest 735 | crs (int): 4-digit epsg code representing coordinate reference system of the aoi 736 | dates (tpl): Four YYYY-MM-DD strings defining the before and after periods 737 | m (keras.Model): model to be used to make predictions 738 | buff (int): buffer to strip from prediction patches 739 | kernel (int): size of side of prediction patches 740 | Return: 741 | numpy.ndarray: 3D array with per-pixel change probabilities 742 | """ 743 | # extract before and after dates from input in format required by PC 744 | before_dates = f'{dates[0]}/{dates[1]}' 745 | after_dates = f'{dates[2]}/{dates[3]}' 746 | 747 | # get our before and after stacs 748 | print('retrieving s2 data') 749 | bef_stac, bef_transform = get_s2_stac(before_dates, aoi) 750 | aft_stac, aft_transform = get_s2_stac(after_dates, aoi) # these are projected rioxarrays 751 | 752 | # create median composites 753 | bef_median = bef_stac.median(dim="time") 754 | aft_median = aft_stac.median(dim="time") 755 | 756 | #normalize 757 | bef_norm = normalize_dataArray(bef_median, 'band') 758 | aft_norm = normalize_dataArray(aft_median, 'band') 759 | 760 | # concatenate 761 | ds = xr.concat([bef_norm, aft_norm], dim="band").assign_coords({'band':['B2', 'B3', 'B4', 'B8','B2_1', 'B3_1', 'B4_1', 'B8_1']}) 762 | 763 | C,H,W = ds.shape 764 | print('data shape:', ds.shape) # from planetary computer this is C, H, W 765 | rearranged = ds.transpose('y','x','band') 766 | print('rearranged shape', rearranged.shape) 767 | indices = prediction_tools.generate_chip_indices(rearranged, buff, kernel) 768 | print(len(indices), 'indices generated') 769 | template = np.zeros((H, W)) 770 | print('template shape:', template.shape) 771 | # print('generating chips') 772 | # chips, chip_indices = extract_chips(ds) 773 | # print(len(chip_indices), 'chips generated') 774 | dat = rearranged.values 775 | print('running predictions') 776 | output = predict_chips(dat, indices, template, m, kernel = kernel, buff = buff) 777 | 778 | # print(f'returning array of 
{output.shape}') 779 | return output, bef_median, aft_median, aft_transform 780 | 781 | def predict_pc_dask(model_blob_url, weights_blob_url, custom_objects, dates, aoi): 782 | # # create a dask cluster 783 | # print('spinning up Dask Cluster') 784 | # cluster = GatewayCluster( 785 | # address = "https://pccompute.westeurope.cloudapp.azure.com/compute/services/dask-gateway", 786 | # proxy_address = "gateway://pccompute-dask.westeurope.cloudapp.azure.com:80", 787 | # auth = 'jupyterhub', 788 | # worker_cores = 4 789 | # ) 790 | 791 | # client = cluster.get_client() 792 | # client.upload_file(f'{str(ROOT)}/model_tools.py', load = True) 793 | 794 | # # allow our dask cluster to adaptively scale from 2 to 24 nodes 795 | # cluster.adapt(minimum=4, maximum=24) 796 | # print('cluster created', cluster.dashboard_link) 797 | 798 | # extract before and after dates from input in format required by PC 799 | before_dates = f'{dates[0]}/{dates[1]}' 800 | after_dates = f'{dates[2]}/{dates[3]}' 801 | 802 | # get our before and after stacs 803 | print('retrieving s2 data') 804 | bef_stac = get_s2_stac(before_dates, aoi) 805 | aft_stac = get_s2_stac(after_dates, aoi) # these are projected rioxarrays 806 | 807 | # create median composites 808 | bef_median = bef_stac.median(dim="time") 809 | aft_median = aft_stac.median(dim="time") 810 | 811 | #normalize 812 | bef_norm = normalize_dataArray(bef_median, 'band') 813 | aft_norm = normalize_dataArray(aft_median, 'band') 814 | 815 | # concatenate 816 | ds = xr.concat([bef_norm, aft_norm], dim="band").assign_coords({'band':['B2', 'B3', 'B4', 'B8','B2_1', 'B3_1', 'B4_1', 'B8_1']}) 817 | 818 | trimmed = trim_dataArray(ds, 256) 819 | chunked = trimmed.chunk({'x':256, 'y':256}) 820 | 821 | print('running chunked predictions') 822 | meta = np.array([[]], dtype="float32") 823 | predictions_array = chunked.data.map_overlap( 824 | lambda x: predict_chunk(x, model_blob_url, weights_blob_url, custom_objects), 825 | depth = (0, 64, 64), 826 | boundary = 0, 827 | meta=meta, 828 | drop_axis=0 829 | ) 830 | 831 | # predictions = predictions_array 832 | 833 | # # to restore spatial reference, cast back to Xarray 834 | # out = xr.DataArray( 835 | # predictions, 836 | # coords=trimmed.drop_vars("band").coords, 837 | # dims=("y", "x"), 838 | # ) 839 | 840 | return(predictions_array) 841 | -------------------------------------------------------------------------------- /utils/processing.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Fri Mar 20 10:50:44 2020 3 | 4 | @author: MEvans 5 | """ 6 | import tensorflow as tf 7 | import numpy as np 8 | import math 9 | import os 10 | import copy 11 | import sys 12 | import requests 13 | import io 14 | from random import shuffle, randint, uniform 15 | from pathlib import Path 16 | 17 | FILE = Path(__file__).resolve() 18 | ROOT = FILE.parents[0] 19 | DIR = Path(os.path.relpath(ROOT, Path.cwd())) 20 | 21 | if str(DIR) not in sys.path: 22 | sys.path.append(str(DIR)) 23 | 24 | import array_tools 25 | 26 | def get_file_id(f:str, delim:str = '_', parts:slice = slice(3,5), flag=False): 27 | """Return a unique identifyier from a file name 28 | 29 | Params 30 | --- 31 | f: str 32 | file basename 33 | delim: str 34 | delimiter optionally splitting filename into parts 35 | parts: slice 36 | slice identifying the parts to return 37 | 38 | Returns 39 | --- 40 | tuple: tuple of filename pieces 41 | """ 42 | stem = str(Path(f).stem) 43 | splits = stem.split(delim) 44 | ids = splits[parts] 45 | return 
tuple(ids) 46 | 47 | def match_files(urls, vars, delim:str = '_', parts:slice = slice(3,5), subset: set = None, flatdirectory:bool = False): 48 | """Align files by unique id among variables 49 | Params 50 | --- 51 | urls: list:str 52 | unordered list of all filepaths to be sorted and aligned by variable 53 | vars: dict 54 | key, value pairs with variable names as keys (e.g., 'naip'). value = None will skip that variable 55 | delim: str 56 | delimiter optionally splitting filename into parts 57 | parts: slice 58 | slice identifying the parts to return 59 | subset: set 60 | optional. unique ids with which to further subset the returned files 61 | 62 | Returns 63 | --- 64 | dict: key, value pairs for each valid key in vars. variable names are key (e.g. 'naip') and values are corresponding list of files 65 | """ 66 | 67 | #print(len(subset)) 68 | vars_copy = copy.deepcopy(vars) 69 | 70 | if flatdirectory: 71 | files_dic = {key:[url for url in urls if f'_{key}_' in url] for key in vars_copy.keys() if vars_copy[key]['files'] is not None} 72 | else: 73 | files_dic = {key:[url for url in urls if f'/{key}/' in url] for key in vars_copy.keys() if vars_copy[key]['files'] is not None} 74 | 75 | ids = [set([get_file_id(f, delim, parts) for f in files]) for files in files_dic.values()] # list of sets per var 76 | 77 | intersection = set.intersection(*ids) 78 | 79 | if subset: 80 | intx = intersection.intersection(subset) 81 | else: 82 | intx = intersection 83 | 84 | for var, ls in files_dic.items(): 85 | subset = [f for f in ls if get_file_id(f, delim, parts) in intx] 86 | subset.sort() 87 | vars_copy[var].update({"files": subset}) 88 | 89 | return vars_copy 90 | 91 | def split_files(files, labels = ['label', 'lu', 'naip', 'lidar', 's2'], delim = '_', parts = slice(3,5)): 92 | """Divide list of .npy arrays into separate lists by source data (e.g. NAIP, S2, etc.) 
93 | 94 | Params 95 | --- 96 | files: list(str) 97 | list of files to be split 98 | labels: list(str) 99 | list of prefixes identifying subsets of files to return 100 | 101 | Return 102 | --- 103 | list, list, list: tuple of lists per file subset 104 | """ 105 | def get_file_id(f, parts): 106 | stem = str(Path(f).stem) 107 | splits = stem.split(delim) 108 | ids = splits[parts] 109 | return tuple(ids) 110 | 111 | indices = [set([get_file_id(f, parts) for f in files if label in Path(f).parts]) for label in labels] 112 | intersection = set.intersection(*indices) 113 | out_files = [[f for f in files if label in Path(f).parts and get_file_id(f, parts) in intersection] for label in labels] 114 | return out_files 115 | 116 | def calc_ndvi(input): 117 | """Caclulate NDVI from Sentinel-2 data 118 | Parameters: 119 | input (dict): dictionary of incoming tensors 120 | Returns: 121 | tensor 122 | """ 123 | epsilon = 1e-8 124 | nir = input.get('B8') 125 | red = input.get('B4') 126 | ndvi = tf.divide(tf.subtract(nir, red), tf.add(epsilon, tf.add(nir,red))) 127 | return ndvi 128 | 129 | def aug_tensor_color(img): 130 | n_ch = tf.shape(img)[-1] 131 | contra_adj = 0.05 132 | bright_adj = 0.05 133 | 134 | ch_mean = tf.math.reduce_mean(img, axis = (0,1), keepdims = True) 135 | #ch_mean = np.mean(img, axis=(0, 1), keepdims=True).astype(np.float32) 136 | 137 | contra_mul = tf.random.uniform(shape = (1, 1, n_ch), 138 | minval = 1-contra_adj, 139 | maxval = 1+contra_adj) 140 | # contra_mul = np.random.uniform(1 - contra_adj, 1 + contra_adj, (1, 1, n_ch)).astype( 141 | # np.float32 142 | # ) 143 | 144 | bright_mul = tf.random.uniform(shape = (1, 1, n_ch), 145 | minval = 1 - bright_adj, 146 | maxval = 1+bright_adj) 147 | # bright_mul = np.random.uniform(1 - bright_adj, 1 + bright_adj, (1, 1, n_ch)).astype( 148 | # np.float32 149 | # ) 150 | 151 | recolored = (img - ch_mean) * contra_mul + ch_mean * bright_mul 152 | return recolored 153 | 154 | def augColor(x, contra_adj = 0.05, bright_adj = 0.05): 155 | """Color augmentation 156 | 157 | Args: 158 | x: Image 159 | 160 | Returns: 161 | Augmented image 162 | """ 163 | x = tf.image.random_hue(x, 0.05) 164 | x = tf.image.random_saturation(x, 0.6, 1.6) 165 | x = tf.image.random_brightness(x, 0.05) 166 | x = tf.image.random_contrast(x, 0.7, 1.3) 167 | return x 168 | 169 | def aug_tensor_morph(img): 170 | """ 171 | Perform image augmentation on tfRecords 172 | Parameters: 173 | img (TFRecord): 4D tensor 174 | Returns: 175 | 3D tensor: 176 | """ 177 | outDims = tf.shape(img)[0:1] 178 | x = tf.image.random_flip_left_right(img) 179 | x = tf.image.random_flip_up_down(x) 180 | x = tf.image.rot90(x, tf.random.uniform(shape=[], minval=0, maxval=4, dtype=tf.int32)) 181 | #x = zoom(x, outDims) 182 | #since were gonna map_fn this on a 4d image, output must be 3d, so squeeze the artificial 'sample' dimension 183 | return tf.squeeze(x) 184 | 185 | def normalize_timeseries(arr, maxval = 10000, axis = -1, e = 0.00001): 186 | # normalize band values across timesteps 187 | normalized = arr/maxval 188 | # mn = np.nanmean(arr, axis = axis, keepdims = True) 189 | # std = np.nanstd(arr, axis = axis, keepdims = True) 190 | # normalized = (arr - mn)/(std+e) 191 | # replace nans with zeros? 
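# Usage sketch (editorial, hedged): pairing chips across sources with get_file_id() and
# match_files() above. The URLs and variable dictionary are illustrative assumptions; with
# the default delim='_' and parts=slice(3, 5), a name like
# 'chips/naip/naip_chip_md_0012_2018.npy' yields the id ('0012', '2018').
def _example_match_files():
    urls = ['chips/naip/naip_chip_md_0012_2018.npy',
            'chips/label/label_chip_md_0012_2018.npy']
    variables = {'naip': {'files': []}, 'label': {'files': []}}
    matched = match_files(urls, variables)
    # each variable now holds a sorted list of files sharing the same chip ids
    return matched['naip']['files'], matched['label']['files']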
192 | finite = np.where(np.isnan(normalized), 0.0, normalized) 193 | return finite 194 | 195 | def rearrange_timeseries(arr, nbands, time_dim = 1): 196 | # the number of time steps is in the 1st dimension if our data is (B, T, H, W, C) 197 | timesteps = arr.shape[time_dim] 198 | # randomly pick one of the timesteps as the starting time 199 | starttime = randint(0, timesteps-1) 200 | # print('start', starttime) 201 | # grab all timesteps leading up to the timestep corresponding to our random first 202 | last = arr[:,0:starttime,:,:,:] 203 | # print('last shape', last.shape) 204 | first = arr[:,starttime:timesteps,:,:,:] 205 | # print('start shape', first.shape) 206 | rearranged = np.concatenate([first, last], axis = 1) 207 | rearranged.shape == arr.shape 208 | 209 | feats = rearranged[:,0:-1,:,:,:] 210 | labels = rearranged[:,-1,:,:,0:nbands] 211 | 212 | # confirm there are no all-nan images in labels 213 | batch_sums = np.sum(labels, axis = (1,2,3)) 214 | if 0.0 in batch_sums: 215 | print('all nan labels, reshuffling') 216 | feats, labels, starttime = rearrange_timeseries(arr, nbands) 217 | 218 | return(feats, labels, starttime) 219 | 220 | def sin_cos(t:int, freq:int = 6) -> tuple: 221 | x = t/freq 222 | theta = 2*math.pi * x 223 | return (math.sin(theta), math.cos(theta)) 224 | 225 | def normalize_tensor(x, axes=[2], epsilon=1e-8, moments = None, splits = None): 226 | """ 227 | Standardize incoming image patches by mean and variance. 228 | 229 | Moments can be calculated based on patch data by providing axes: 230 | To standardize each pixel use axes = [2] 231 | To standardize each channel use axes = [0, 1] 232 | To standardize globally use axes = [0, 1, 2] 233 | 234 | To standardize by global, or per-channel moments supply a list of [mean, variance] tuples. 235 | To standardize groups of channels separately, identify the size of each group. Groups of 236 | channels must be stacked contiguously and group sizes must sum to the total # of channels 237 | 238 | Parameters: 239 | x (tensor): nD image tensor 240 | axes (array): Array of ints. Axes along which to compute mean and variance, usually length n-1 241 | epsilon (float): small number to avoid dividing by zero 242 | moments (list): list of global mean, variance tuples for standardization 243 | splits (list): size(s) of groups of features to be kept together 244 | Return: 245 | tensor: nD image tensor normalized by channels 246 | """ 247 | 248 | # define a basic function to normalize a 3d tensor 249 | def normalize(x): 250 | # shape = tf.shape(x).numpy() 251 | # if we've defined global or per-channel moments... 
252 | if moments: 253 | # cast moments to arrays for mean and variance 254 | mean = np.array([tpl[0] for tpl in moments], dtype = 'float32') 255 | variance = np.array([tpl[1] for tpl in moments], dtype = 'float32') 256 | # otherwise, calculate moments along provided axes 257 | else: 258 | mean, variance = tf.nn.moments(x, axes, keepdims = True) 259 | # keepdims = True to ensure compatibility with input tensor 260 | 261 | # normalize the input tensor 262 | normed = (x - mean)/tf.sqrt(variance + epsilon) 263 | return normed 264 | 265 | 266 | # if splits are given, apply tensor normalization to each split 267 | if splits: 268 | splitLen = sum(splits) 269 | toNorm = x[:,:,0:splitLen] 270 | dontNorm = x[:,:,splitLen:] 271 | tensors = tf.split(toNorm, splits, axis = 2) 272 | normed = [normalize(tensor) for tensor in tensors] 273 | normed.append(dontNorm) 274 | # gather normalized splits into single tensor 275 | x_normed = tf.concat(normed, axis = 2) 276 | else: 277 | x_normed = normalize(x) 278 | 279 | return x_normed 280 | 281 | def rescale_tensor(img, axes = [2], epsilon=1e-8, moments = None, splits = None): 282 | """ 283 | Rescale incoming image patch to [0,1] based on min and max values 284 | 285 | Min, max can be calculated based on patch data by providing axes: 286 | To rescale each pixel use axes = [2] 287 | To rescale each channel use axes = [0, 1] 288 | To rescale globally use axes = [0, 1, 2] 289 | 290 | To rescale by global, or per-channel moments supply a list of [mean, variance] tuples. 291 | To rescale groups of channels separately, identify the size of each group. Groups of 292 | channels must be stacked contiguously and group sizes must sum to the total # of channels 293 | 294 | Args: 295 | img (tensor): 3D (H,W,C) image tensor 296 | axes (list): axes along which to calculate min/max for rescaling 297 | moments (list): list of [min, max] tuples for standardization 298 | splits (list): size(s) of groups of features to be kept together 299 | Return: 300 | tensor: 3D tensor of same shape as input, with values [0,1] 301 | """ 302 | def rescale(img): 303 | if moments: 304 | minimum = np.array([tpl[0] for tpl in moments], dtype = 'float32') 305 | maximum = np.array([tpl[1] for tpl in moments], dtype = 'float32') 306 | else: 307 | minimum = tf.math.reduce_min(img, axis = axes, keepdims = True) 308 | maximum = tf.math.reduce_max(img, axis = axes, keepdims = True) 309 | scaled = (img - minimum)/((maximum - minimum) + epsilon) 310 | # scaled = tf.divide(tf.subtract(img, minimum), tf.add(tf.subtract(maximum, minimum)) 311 | return scaled 312 | 313 | # if splits are given, apply tensor normalization to each split 314 | if splits: 315 | tensors = tf.split(img, splits, axis = 2) 316 | rescaled = [rescale(tensor) for tensor in tensors] 317 | # gather normalized splits into single tensor 318 | img_rescaled = tf.concat(rescaled, axis = 2) 319 | else: 320 | img_rescaled = rescale(img) 321 | 322 | return img_rescaled 323 | 324 | #def parse_tfrecord(example_proto, ftDict): 325 | # """The parsing function. 326 | # Read a serialized example into the structure defined by FEATURES_DICT. 327 | # Args: 328 | # example_proto: a serialized Example. 329 | # Returns: 330 | # A dictionary of tensors, keyed by feature name. 331 | # """ 332 | # return tf.io.parse_single_example(example_proto, ftDict) 333 | 334 | 335 | def to_tuple(inputs, features, response, axes = [2], splits = None, one_hot = None, moments = None, **kwargs): 336 | """Function to convert a dictionary of tensors to a tuple of (inputs, outputs). 
337 | Turn the tensors returned by parse_tfrecord into a stack in HWC shape. 338 | Args: 339 | inputs (dict): A dictionary of tensors, keyed by feature name. Response 340 | variable must be the last item. 341 | features (list): List of input feature names 342 | respones (str): response name(s) 343 | axes (list): axes along which to calculate moments for rescaling 344 | one_hot (dict): key:value pairs for name of one-hot variable and desired one-hot depth 345 | splits (list): size(s) of groups of features to be kept together 346 | moments (list): list of [mean, var] tuples for standardization 347 | Returns: 348 | A dtuple of (inputs, outputs). 349 | """ 350 | # one_hot = kwargs.get('one_hot') 351 | # splits = kwargs.get('splits') 352 | # moments = kwargs.get('moments') 353 | 354 | # If custom preprocessing functions are specified add respective bands 355 | for fxn in kwargs.values(): 356 | der = fxn(inputs) 357 | inputs = der 358 | 359 | # inputsList = [inputs.get(key) for key in features + [response]] 360 | if type(response) == dict: 361 | depth = list(response.values())[0] 362 | key = list(response.keys())[0] 363 | res = tf.squeeze(tf.one_hot(tf.cast(inputs.get(key), tf.uint8), depth = depth)) 364 | else: 365 | res = tf.expand_dims(inputs.get(response), axis = 2) 366 | 367 | # stack the augmented bands, optional one-hot tensors, and response variable 368 | if one_hot: 369 | featList = [inputs.get(key) for key in features if key not in one_hot.keys()] 370 | hotList= [tf.one_hot(tf.cast(inputs.get(key), tf.uint8), val, axis = 2) for key, val in one_hot.items() if key in features] 371 | # hotList = [tf.one_hot(tf.cast(inputs.get(key), tf.uint8), val, axis = 2) for key, val in one_hot.items()] 372 | else: 373 | featList = [inputs.get(key) for key in features] 374 | 375 | # stack, transpose, augment, and normalize continuous bands 376 | bands = tf.transpose(tf.stack(featList, axis = 0), [1,2,0]) 377 | bands = aug_tensor_color(bands) 378 | bands = rescale_tensor(bands, axes = axes, moments = moments, splits = splits) 379 | 380 | if one_hot: 381 | hotStack = tf.concat(hotList, axis = 2) 382 | stacked = tf.concat([bands, hotStack, res], axis =2) 383 | else: 384 | stacked = tf.concat([bands, res], axis = 2) 385 | 386 | # perform morphological augmentation 387 | stacked = aug_tensor_morph(stacked) 388 | 389 | feats = stacked[:, :, :-res.shape[2]] 390 | labels = stacked[:, :, -res.shape[2]:] 391 | labels = tf.where(tf.greater(labels, 1.0), 1.0, labels) 392 | return feats, labels 393 | 394 | def get_dataset(files, ftDict, features, response, axes = [2], splits = None, one_hot = None, moments = None, **kwargs): 395 | """Function to read, parse and format to tuple a set of input tfrecord files. 396 | Get all the files matching the pattern, parse and convert to tuple. 
397 | Args: 398 | files (list): A list of filenames storing tfrecords 399 | FtDict (dic): Dictionary of input features in tfrecords 400 | features (list): List of input feature names 401 | respones (str): response name(s) 402 | axes (list): axes along which to calculate moments for rescaling 403 | one_hot (dict): key:value pairs for name of one-hot variable and desired one-hot depth 404 | splits (list): size(s) of groups of features to be kept together 405 | moments (list): list of [mean, var] tuples for standardization 406 | Returns: 407 | A tf.data.Dataset 408 | """ 409 | 410 | def parse_tfrecord(example_proto): 411 | return tf.io.parse_single_example(example_proto, ftDict) 412 | 413 | def tupelize(ftDict): 414 | return to_tuple(ftDict, features, response, axes, splits, one_hot, moments, **kwargs) 415 | 416 | dataset = tf.data.TFRecordDataset(files, compression_type='GZIP') 417 | dataset = dataset.map(parse_tfrecord, num_parallel_calls=5) 418 | dataset = dataset.map(tupelize, num_parallel_calls=5) 419 | return dataset 420 | 421 | def get_training_dataset(files, ftDict, features, response, buff, batch = 16, repeat = True, axes = [2], splits = None, one_hot = None, moments = None, **kwargs): 422 | """ 423 | Get the preprocessed training dataset 424 | Args: 425 | files (list): list of tfrecord files to be used for training 426 | FtDict (dic): Dictionary of input features in tfrecords 427 | features (list): List of input feature names 428 | respones (str): response name(s) 429 | axes (list): axes along which to calculate moments for rescaling 430 | buffer (int): buffer size for shuffle 431 | batch (int): batch size for training 432 | repeat (bool): should the dataset be repeated 433 | Returns: 434 | A tf.data.Dataset of training data. 435 | """ 436 | dataset = get_dataset(files, ftDict, features, response, axes, splits, one_hot, moments, **kwargs) 437 | if repeat: 438 | dataset = dataset.shuffle(buff).batch(batch).repeat() 439 | else: 440 | dataset = dataset.shuffle(buff).batch(batch) 441 | return dataset 442 | 443 | def get_eval_dataset(files, ftDict, features, response, axes = [2], splits = None, one_hot = None, moments = None, **kwargs): 444 | """ 445 | Get the preprocessed evaluation dataset 446 | Args: 447 | files (list): list of tfrecords to be used for evaluation 448 | Returns: 449 | A tf.data.Dataset of evaluation data. 450 | """ 451 | 452 | dataset = get_dataset(files, ftDict, features, response, axes, splits, one_hot, moments, **kwargs) 453 | dataset = dataset.batch(1) 454 | return dataset 455 | 456 | class UNETDataGenerator(tf.keras.utils.Sequence): 457 | """Generates data for Keras 458 | Sequence based data generator. Suitable for building data generator for training and prediction. 
459 | """ 460 | def __init__(self, labelfiles = None, s2files = None, naipfiles = None, 461 | hagfiles = None, lidarfiles = None, lufiles = None, 462 | demfiles = None, ssurgofiles = None, 463 | to_fit=True, batch_size=32, unet_dim=(256, 256), 464 | n_channels=4, n_classes = 8, shuffle=True, 465 | splits = None, moments = None, 466 | lc_transitions = [(12,3), (11,3), (10,3), (9,8), (255, 0)], 467 | lu_transitions = [(82,9), (84,10)]): 468 | """Initialization 469 | 470 | :param files: list of all files to use in the generator 471 | :param to_fit: True to return X and y, False to return X only 472 | :param batch_size: batch size at each iteration 473 | :param dim: tuple indicating image dimension 474 | :param n_channels: number of image channels 475 | :param n_classes: number of output masks 476 | :param n_timesteps: number of multi-channel images 477 | :param shuffle: True to shuffle label indexes after every epoch 478 | """ 479 | self.s2files = s2files 480 | self.naipfiles = naipfiles 481 | self.hagfiles = hagfiles 482 | self.demfiles = demfiles 483 | self.ssurgofiles = ssurgofiles 484 | self.lidarfiles = lidarfiles 485 | self.labelfiles = labelfiles 486 | self.lufiles = lufiles 487 | self.to_fit = to_fit 488 | self.batch_size = batch_size 489 | self.unet_dim = unet_dim 490 | self.n_channels = n_channels 491 | self.n_classes = n_classes 492 | self.shuffle = shuffle 493 | self.splits = splits 494 | self.moments = moments 495 | self.lc_trans = lc_transitions 496 | self.lu_trans = lu_transitions 497 | self.indexes = np.arange(len(self.labelfiles)) 498 | self.mask = False 499 | self.on_epoch_end() 500 | 501 | # do an initial shuffle for cases where the generator is called fresh at the start of each epoch 502 | if self.shuffle == True: 503 | print('shuffling') 504 | np.random.shuffle(self.indexes) 505 | 506 | if self.to_fit == True: 507 | print('masking on') 508 | self.mask = True 509 | 510 | def __len__(self): 511 | """Denotes the number of batches per epoch 512 | 513 | :return: number of batches per epoch 514 | """ 515 | return int(np.floor(len(self.indexes) / self.batch_size)) 516 | 517 | def on_epoch_end(self): 518 | """Updates indexes after each epoch 519 | 520 | """ 521 | print('the generator knows the epoch ended') 522 | self.indexes = np.arange(len(self.indexes)) 523 | if self.shuffle == True: 524 | print('shuffling') 525 | np.random.shuffle(self.indexes) 526 | 527 | @staticmethod 528 | def load_numpy_url(url): 529 | 530 | if os.path.exists(url): 531 | data = np.load(url) 532 | else: 533 | response = requests.get(url) 534 | response.raise_for_status() 535 | data = np.load(io.BytesIO(response.content)) 536 | 537 | return(data) 538 | 539 | def _load_numpy_data(self, files_temp): 540 | arrays = [UNETDataGenerator.load_numpy_url(f) for f in files_temp] 541 | return(arrays) 542 | 543 | def _get_unet_data(self, files_temp, add_nan_mask = False,rescale_val=False): 544 | # arrays come from PC in (C, H, W) format 545 | arrays = self._load_numpy_data(files_temp) 546 | try: 547 | assert len(arrays) > 0 548 | assert all([len(x.shape) == 3 for x in arrays]), 'all arrays not 3D' 549 | # ensure all arrays are C, H, W to start 550 | chw = [np.moveaxis(x, source = -1, destination = 0) if x.shape[-1] < x.shape[0] else x for x in arrays] 551 | if rescale_val is not False: 552 | chw = [x/rescale_val for x in chw] 553 | if add_nan_mask == True: 554 | chw_new = [] 555 | for cur_array in chw: 556 | 557 | mask_channel = np.zeros([cur_array.shape[1], cur_array.shape[2]]) 558 | # Create a random array to be used 
to replace the original data 559 | if self.to_fit: 560 | for arr_2d in cur_array: 561 | nans = np.isnan(arr_2d) 562 | bads = arr_2d < -5000 563 | mask_channel[nans==True] = 1 564 | mask_channel[bads==True] = 1 565 | arr_2d[mask_channel==1] = np.random.randn((mask_channel==1).sum()) 566 | # arr_2d[nans==True] = np.random.uniform() 567 | #arr_2d[np.isnan(arr_2d)] = np.random.randn(len(arr_2d[np.isnan(arr_2d)])) 568 | #print("AFTER FIX:",np.isnan(cur_array).sum()) 569 | #cur_array = np.vstack((cur_array, mask[None,:,:])) 570 | 571 | 572 | """randarr = np.random.uniform(size=cur_array.shape)*cur_array.max() 573 | # Build a mask layer to use in the replacement 574 | n_cols = cur_array.shape[2] 575 | n_rows = cur_array.shape[1] 576 | mask_channel = np.ones((n_rows, n_cols), dtype=np.int8) 577 | np.any(cur_array == np.nan, axis=0, out=mask_channel) 578 | # Replace the values in any of the channels where the mask_channel is 0 with the values from the random array 579 | cur_array[:, mask_channel == 1] = randarr[:, mask_channel == 1] 580 | cur_array[:, mask_channel == 1] = randarr[:, mask_channel == 1] """ 581 | cur_array = np.append(cur_array, mask_channel[np.newaxis, :, :], axis=0) 582 | #print("AFTER:",np.isnan(cur_array).sum()) 583 | chw_new.append(cur_array) 584 | chw = chw_new 585 | batch = np.stack(chw, axis = 0) 586 | assert np.isnan(batch).sum() < 1, 'nans in batch, skipping' 587 | in_shape = batch.shape 588 | # in case our incoming data is of different size than we want, define a trim amount 589 | trim = ((in_shape[2] - self.unet_dim[0])//2, (in_shape[3] - self.unet_dim[1])//2) 590 | # If necessary, trim data to (-1, dims[0], dims[1]) 591 | array = batch[:,:,trim[0]:self.unet_dim[0]+trim[0], trim[1]:self.unet_dim[1]+trim[1]] 592 | # rearrange arrays from (B, C, H, W) -> (B, H, W, C) expected by model 593 | 594 | reshaped = np.moveaxis(array, source = 1, destination = 3) 595 | return reshaped 596 | except AssertionError as msg: 597 | print(msg) 598 | return None 599 | def _get_naip_data(self, indexes): 600 | files_temp = [self.naipfiles[k] for k in indexes] 601 | naip = self._get_unet_data(files_temp,rescale_val=255.0) 602 | if type(naip) == np.ndarray: 603 | 604 | if self.to_fit: 605 | recolored = array_tools.aug_array_color(naip) 606 | return recolored 607 | return naip 608 | #else: 609 | #return naip 610 | 611 | def _get_s2_data(self, indexes): 612 | files_temp = [self.s2files[k] for k in indexes] 613 | s2 = self._get_unet_data(files_temp,rescale_val=10000.0) 614 | if type(s2) == np.ndarray: 615 | if self.to_fit: 616 | recolored = array_tools.aug_array_color(s2) 617 | return recolored 618 | else: 619 | return s2 620 | #else: 621 | #return s2 622 | 623 | def _get_lidar_data(self, indexes): 624 | files_temp = [self.lidarfiles[k] for k in indexes] 625 | lidar = self._get_unet_data(files_temp,self.mask,rescale_val=100) 626 | if type(lidar) == np.ndarray: 627 | return lidar 628 | 629 | def _get_hag_data(self, indexes): 630 | files_temp = [self.hagfiles[k] for k in indexes] 631 | hag = self._get_unet_data(files_temp, self.mask, rescale_val=100) 632 | if type(hag) == np.ndarray: 633 | return hag 634 | #else: 635 | # return hag 636 | 637 | def _get_dem_data(self, indexes): 638 | files_temp = [self.demfiles[k] for k in indexes] 639 | dem = self._get_unet_data(files_temp,self.mask,rescale_val=2000.0) 640 | if type(dem) == np.ndarray: 641 | # we are going to use the min and max elevations across the chesapeake 642 | return dem 643 | #else: 644 | # return dem 645 | 646 | def _get_ssurgo_data(self, 
indexes): 647 | files_temp = [self.ssurgofiles[k] for k in indexes] 648 | ssurgo = self._get_unet_data(files_temp) 649 | if type(ssurgo) == np.ndarray: 650 | return ssurgo 651 | 652 | def _process_y(self, indexes): 653 | # get label files for current batch 654 | lc_files = [self.labelfiles[k] for k in indexes] 655 | # lc_arrays = [np.load(file) for file in lc_files] 656 | lc_arrays = self._load_numpy_data(lc_files) 657 | 658 | try: 659 | assert len(lc_arrays) == self.batch_size 660 | assert all([x.shape == (1, self.unet_dim[0], self.unet_dim[1]) for x in lc_arrays]) 661 | lc = np.stack(lc_arrays, axis = 0) #(B, C, H, W) 662 | int_labels = lc.astype(int) 663 | 664 | # optionally reduce the number of classes 665 | if self.lc_trans: 666 | merged_labels = array_tools.merge_classes(cond_array = int_labels, trans = self.lc_trans, out_array = int_labels) 667 | else: 668 | merged_labels = int_labels 669 | 670 | if self.lufiles: 671 | lu_files = [self.lufiles[k] for k in indexes] 672 | # lu_arrays = [np.load(file) for file in lu_files] 673 | lu_arrays = self._load_numpy_data(lu_files) 674 | try: 675 | assert len(lu_arrays) == self.batch_size 676 | assert all([x.shape == (1, self.unet_dim[0], self.unet_dim[1]) for x in lu_arrays]) 677 | lu = np.stack(lu_arrays, axis = 0) #(B, C, H, W) 678 | y = array_tools.merge_classes(cond_array = lu, trans = self.lu_trans, out_array = merged_labels) 679 | except AssertionError: 680 | return None 681 | else: 682 | y = merged_labels 683 | 684 | # If necessary, trim data to (-1, dims[0], dims[1]) 685 | in_shape = y.shape 686 | trim = ((in_shape[2] - self.unet_dim[0])//2, (in_shape[3] - self.unet_dim[1])//2) 687 | array = y[:,:,trim[0]:self.unet_dim[0]+trim[0], trim[1]:self.unet_dim[1]+trim[1]] 688 | 689 | # shift range of categorical labels from [1, n_classes] to [0, n_classes] 690 | zeroed = array 691 | # create one-hot representation of classes 692 | one_hot = tf.one_hot(zeroed, self.n_classes) 693 | # one_hot = to_one_hot(zeroed, self.n_classes) 694 | return tf.squeeze(one_hot) 695 | 696 | except AssertionError: 697 | return None 698 | 699 | def __getitem__(self, index): 700 | """Generate one batch of data 701 | 702 | :param index: index of the batch 703 | :return: X and y when fitting. 
X only when predicting 704 | """ 705 | # Generate indexes of the batch 706 | indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size] 707 | 708 | datasets = [] 709 | 710 | if self.s2files: 711 | s2Data = self._get_s2_data(indexes) 712 | datasets.append(s2Data) 713 | 714 | if self.naipfiles: 715 | naipData = self._get_naip_data(indexes) 716 | #print("appending Naip data",type(naipData)) 717 | datasets.append(naipData) 718 | 719 | if self.hagfiles: 720 | hagData = self._get_hag_data(indexes) 721 | datasets.append(hagData) 722 | 723 | if self.demfiles: 724 | demData = self._get_dem_data(indexes) 725 | # print('dem', demData.shape) 726 | #print("appening DEM data",type(demData)) 727 | datasets.append(demData) 728 | 729 | if self.ssurgofiles: 730 | ssurgoData = self._get_ssurgo_data(indexes) 731 | # print('ssurgo', ssurgoData.shape 732 | #print("appending ssurgoData",type(ssurgoData)) 733 | datasets.append(ssurgoData) 734 | 735 | if self.lidarfiles: 736 | lidarData = self._get_lidar_data(indexes) 737 | datasets.append(lidarData) 738 | 739 | if any([type(dat) != np.ndarray for dat in datasets]): 740 | pass 741 | else: 742 | xData = np.concatenate(datasets, axis = -1) 743 | 744 | if self.to_fit: 745 | labels = self._process_y(indexes) 746 | # perform morphological augmentation - expects a 3D (H, W, C) image array 747 | stacked = np.concatenate([xData, labels], axis = -1) 748 | morphed = array_tools.aug_array_morph(stacked) 749 | # print('augmented max', np.nanmax(augmented, axis = (0,1,2))) 750 | 751 | feats = morphed[:,:,:,0:self.n_channels] 752 | labels = morphed[:,:,:,self.n_channels:] 753 | return feats, labels 754 | else: 755 | return xData 756 | 757 | class SiameseDataGenerator(UNETDataGenerator): 758 | def __init__(self, beforefiles, afterfiles, add_nan_mask: bool, *args, **kwargs): 759 | super().__init__(*args, **kwargs) 760 | self.beforefiles = beforefiles 761 | self.afterfiles = afterfiles 762 | self.mask = add_nan_mask 763 | 764 | # do an initial shuffle for cases where the generator is called fresh at the start of each epoch 765 | if self.shuffle == True: 766 | print('shuffling') 767 | np.random.shuffle(self.indexes) 768 | print(self.batch_size) 769 | def __len__(self): 770 | """Denotes the number of batches per epoch 771 | 772 | :return: number of batches per epoch 773 | """ 774 | return UNETDataGenerator.__len__(self) 775 | 776 | def on_epoch_end(self): 777 | """Updates indexes after each epoch 778 | 779 | """ 780 | UNETDataGenerator.on_epoch_end(self) 781 | 782 | def _get_unet_data(self, files_temp, add_nan_mask = False,rescale_val=None): 783 | # arrays come from PC in (C, H, W) format 784 | arrays = self._load_numpy_data(files_temp) 785 | try: 786 | assert len(arrays) > 0 787 | assert all([len(x.shape) == 3 for x in arrays]), 'all arrays not 3D' 788 | # ensure all arrays are C, H, W to start 789 | chw = [np.moveaxis(x, source = -1, destination = 0) if x.shape[-1] < x.shape[0] else x for x in arrays] 790 | if rescale_val is not None: 791 | chw = [x/rescale_val for x in chw] 792 | batch = np.stack(chw, axis = 0) 793 | # assert np.isnan(batch).sum() < 1, 'nans in batch, skipping' 794 | in_shape = batch.shape 795 | # in case our incoming data is of different size than we want, define a trim amount 796 | trim = ((in_shape[2] - self.unet_dim[0])//2, (in_shape[3] - self.unet_dim[1])//2) 797 | # If necessary, trim data to (-1, dims[0], dims[1]) 798 | array = batch[:,:,trim[0]:self.unet_dim[0]+trim[0], trim[1]:self.unet_dim[1]+trim[1]] 799 | # rearrange arrays from (B, 
C, H, W) -> (B, H, W, C) expected by model 800 | 801 | reshaped = np.moveaxis(array, source = 1, destination = 3) 802 | nans = np.isnan(reshaped) 803 | if add_nan_mask: 804 | mask = np.ones(shape = reshaped.shape) # create a mask with all valid pixels by default 805 | mask[nans] = 0 # inject zeros into mask at invalid pixels 806 | mask[reshaped < -1] = 0 807 | reduced_mask = mask.min(axis = -1, keepdims = True) # reduce mask along channels -> (B, C, H, 1) 808 | reshaped[nans] = np.random.random(nans.sum()) # replace nan values with random val from [0,1) 809 | masked = np.concatenate([reshaped, reduced_mask], axis = -1) # add mask to batch as additional channel 810 | return reshaped, reduced_mask 811 | else: 812 | assert np.isnan(reshaped).sum() < 1, 'nans in batch, skipping' 813 | return reshaped, None 814 | 815 | except AssertionError as msg: 816 | print(msg) 817 | return None, None 818 | 819 | def _process_y(self, indexes): 820 | # get label files for current batch 821 | files_temp = [self.labelfiles[k] for k in indexes] 822 | lc_files = self.load_numpy_data(files_temp) 823 | lc_arrays = [np.squeeze(f) for f in lc_files] # make all labels 2D to start 824 | try: 825 | assert len(lc_arrays) == self.batch_size 826 | lc = np.stack(lc_arrays, axis = 0) #(B, H, W) 827 | int_labels = lc.astype(int) 828 | binary = np.where(int_labels > 1, 1, int_labels) 829 | # If necessary, trim data to (-1, dims[0], dims[1]) 830 | in_shape = binary.shape # -> (B, H, W) 831 | trim = ((in_shape[1] - self.unet_dim[0])//2, (in_shape[2] - self.unet_dim[1])//2) 832 | array = binary[:,trim[0]:self.unet_dim[0]+trim[0], trim[1]:self.unet_dim[1]+trim[1]] 833 | 834 | # add channel dimension (B, H, W) -> (B, H, W, C) expected by model 835 | reshaped = np.expand_dims(array, -1) 836 | return reshaped 837 | except AssertionError: 838 | return None 839 | 840 | def _get_before_data(self, indexes, rescale_val): 841 | files_temp = [self.beforefiles[k] for k in indexes] 842 | s2, bef_mask = self._get_unet_data(files_temp, add_nan_mask = self.mask, rescale_val=rescale_val) 843 | if type(s2) == np.ndarray: 844 | if self.to_fit: 845 | recolored = array_tools.aug_array_color(s2) 846 | return recolored, bef_mask 847 | else: 848 | return s2, bef_mask 849 | 850 | def _get_after_data(self, indexes, rescale_val): 851 | files_temp = [self.afterfiles[k] for k in indexes] 852 | s2, aft_mask = self._get_unet_data(files_temp, add_nan_mask = self.mask, rescale_val=rescale_val) 853 | if type(s2) == np.ndarray: 854 | if self.to_fit: 855 | recolored = array_tools.aug_array_color(s2) 856 | return recolored, aft_mask 857 | else: 858 | return s2, aft_mask 859 | 860 | def __getitem__(self, index): 861 | """Generate one batch of data 862 | 863 | :param index: index of the batch 864 | :return: X and y when fitting. 
X only when predicting 865 | """ 866 | # Generate indexes of the batch 867 | indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size] 868 | 869 | befData, befMask = self._get_before_data(indexes, rescale_val = 10000.0) 870 | 871 | aftData, aftMask = self._get_after_data(indexes, rescale_val = 10000.0) 872 | 873 | labels = self._process_y(indexes) 874 | 875 | # perform morphological augmentation - expects a 3D (H, W, C) image array 876 | # if all([befData is not None, aftData is not None, labels is not None]): 877 | if self.mask: 878 | mask = np.concatenate([befMask, aftMask], axis = -1).min(axis = -1, keepdims= True) 879 | labels = labels * mask 880 | 881 | stacked = np.concatenate([befData, aftData, labels], axis = -1) 882 | 883 | # print('augmented max', np.nanmax(augmented, axis = (0,1,2))) 884 | 885 | if self.to_fit: 886 | morphed = array_tools.aug_array_morph(stacked) 887 | feats_b = morphed[:,:,:,0:self.n_channels] 888 | feats_a = morphed[:,:,:,self.n_channels:2*(self.n_channels)] 889 | labels = morphed[:,:,:,-1:] 890 | return [feats_b, feats_a], labels 891 | else: 892 | return [befData, aftData] 893 | 894 | 895 | class LSTMDataGenerator(tf.keras.utils.Sequence): 896 | """Generates data for Keras 897 | Sequence based data generator. Suitable for building data generator for training and prediction. 898 | """ 899 | def __init__(self, files = None, 900 | to_fit=True, batch_size=32, dim=(256, 256), 901 | n_channels=4, n_timesteps = 6, shuffle=True): 902 | """Initialization 903 | 904 | :param files: list of all files to use in the generator 905 | :param to_fit: True to return X and y, False to return X only 906 | :param batch_size: batch size at each iteration 907 | :param dim: tuple indicating image dimension 908 | :param n_channels: number of image channels 909 | :param n_classes: number of output masks 910 | :param n_timesteps: number of multi-channel images 911 | :param shuffle: True to shuffle label indexes after every epoch 912 | """ 913 | self.files = files 914 | self.to_fit = to_fit 915 | self.batch_size = batch_size 916 | self.dim = dim 917 | self.n_channels = n_channels 918 | self.n_timesteps = n_timesteps 919 | self.shuffle = shuffle 920 | self.on_epoch_end() 921 | 922 | def __len__(self): 923 | """Denotes the number of batches per epoch 924 | 925 | :return: number of batches per epoch 926 | """ 927 | return int(np.floor(len(self.files) / self.batch_size)) 928 | 929 | def on_epoch_end(self): 930 | """Updates indexes after each epoch 931 | 932 | """ 933 | self.indexes = np.arange(len(self.files)) 934 | if self.shuffle == True: 935 | np.random.shuffle(self.indexes) 936 | 937 | def _load_numpy_data(self, files_temp): 938 | arrays = [UNETDataGenerator.load_numpy_url(f) for f in files_temp] 939 | return(arrays) 940 | 941 | def __getitem__(self, index): 942 | """Generate one batch of data 943 | 944 | :param index: index of the batch 945 | :return: X and y when fitting. 
X only when predicting
946 |         """
947 |         # Generate indexes of the batch
948 |         indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size]
949 | 
950 |         # Find list of IDs
951 |         files_temp = [self.files[k] for k in indexes]
952 |         # arrays come from PC in (T, C, H, W) format
953 |         arrays = self._load_numpy_data(files_temp)
954 | 
955 |         trim = ((arrays[0].shape[2] - self.dim[0])//2, (arrays[0].shape[3] - self.dim[1])//2)
956 |         # TEMPORARY FIX: drop the last image to give us a series of 5
957 |         array = [arr[0:self.n_timesteps,:,trim[0]:self.dim[0]+trim[0],trim[1]:self.dim[1]+trim[1]] for arr in arrays]
958 | 
959 |         # create a single (B, T, C, H, W) array
960 |         batch = np.stack(array, axis = 0)
961 |         # rearrange arrays from (B, T, C, H, W) -> (B, T, H, W, C) expected by model
962 |         reshaped = np.moveaxis(batch, source = 2, destination = 4)
963 |         normalized = normalize_timeseries(reshaped, axis = 1)
964 |         # harmonized = add_harmonic(normalized)
965 |         if self.to_fit:
966 |             rearranged = rearrange_timeseries(normalized, self.n_channels)
967 |             feats, labels = array_tools.split_timeseries(rearranged)
968 |             # we can't have nans in label
969 |             return feats, labels
970 |         else:
971 |             print('normalized dims', normalized.shape)
972 |             return normalized
973 | 
974 | class LSTMAutoencoderGenerator(LSTMDataGenerator):
975 |     """Generates data for Keras
976 |     Sequence based data generator. Suitable for building data generator for training and prediction.
977 |     """
978 |     def __init__(
979 |             self, harmonics = True, sample_weights = False, *args, **kwargs):
980 |         """Initialization
981 | 
982 |         :param harmonics: True to generate harmonic time features for each batch
983 |         :param sample_weights: True to return per-pixel sample weights alongside X and y
984 |         :param batch_size: batch size at each iteration
985 |         :param dim: tuple indicating image dimension
986 |         :param n_channels: number of image channels
987 |         :param n_classes: number of output masks
988 |         :param n_timesteps: number of multi-channel images
989 |         :param shuffle: True to shuffle label indexes after every epoch
990 |         """
991 |         super().__init__(*args, **kwargs)
992 |         self.add_harmonics = harmonics
993 |         self.sample_weights = sample_weights
994 |         self.on_epoch_end()
995 | 
996 |     def __len__(self):
997 |         return LSTMDataGenerator.__len__(self)
998 | 
999 |     def on_epoch_end(self):
1000 |         LSTMDataGenerator.on_epoch_end(self)
1001 | 
1002 |     def __getitem__(self, index):
1003 |         """Generate one batch of data
1004 | 
1005 |         :param index: index of the batch
1006 |         :return: X and y when fitting. 
X only when predicting 1007 | """ 1008 | # Generate indexes of the batch 1009 | indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size] 1010 | 1011 | # Find list of IDs 1012 | files_temp = [self.files[k] for k in indexes] 1013 | 1014 | # arrays come from PC in (T, C, H, W) format 1015 | arrays = self._load_numpy_data(files_temp) 1016 | 1017 | # creat a single (B, T, C, H, W) array 1018 | batch = np.stack(arrays, axis = 0) 1019 | 1020 | # in case our incoming data is of different size than we want, define a trim amount 1021 | trim = ((batch.shape[3] - self.dim[0])//2, (batch.shape[4] - self.dim[1])//2) 1022 | 1023 | # n_timesteps + 1 to account for the fact that the sequence includes the next image as target 1024 | array = batch[:, 0:self.n_timesteps+1,:,trim[0]:self.dim[0]+trim[0],trim[1]:self.dim[1]+trim[1]] 1025 | 1026 | # rearrange arrays from (B, T, C, H, W) -> (B, T, H, W, C) expected by model 1027 | reshaped = np.moveaxis(array, source = 2, destination = 4) 1028 | 1029 | normalized = normalize_timeseries(reshaped, axis = 1) 1030 | 1031 | # harmonized = add_harmonic(normalized) 1032 | if self.add_harmonics: 1033 | # get start dates for each file 1034 | starts = [int(Path(f).stem.split('_')[2]) for f in files_temp] 1035 | else: 1036 | harmonics = None 1037 | 1038 | if self.to_fit: 1039 | feats, y, start = rearrange_timeseries(normalized, self.n_channels) 1040 | temporal_y = np.flip(feats, axis = 1) # reverse images along time dimension 1041 | weights = [None, abs(feats[:,-1,:,:,:] - y)/(feats[:,-1,:,:,:] + y)] if self.sample_weights else None 1042 | if self.add_harmonics: 1043 | starts = [x + start - self.n_timesteps for x in starts] 1044 | harmonics = array_tools.make_harmonics(starts, self.n_timesteps, self.dim) 1045 | return [feats, harmonics], [temporal_y, y], weights 1046 | else: 1047 | if self.add_harmonics: 1048 | harmonics = array_tools.make_harmonics(starts, self.n_timesteps, self.dim) 1049 | return [normalized, harmonics] 1050 | 1051 | class HybridDataGenerator(UNETDataGenerator): 1052 | """Generates data for Keras model with U-Net and LSTM branches 1053 | Sequence based data generator. Suitable for building data generator for training and prediction. 
1054 | """ 1055 | 1056 | def __init__(self, s1files, 1057 | lstm_dim = (6, 32, 32, 6), 1058 | lc_transitions = [(12,3), (11,3), (10,3), (9,8), (255, 0)], 1059 | lu_transitions = [(82,9), (84,10)], 1060 | unet_dim = (600,600), 1061 | *args, **kwargs): 1062 | """Class Initialization 1063 | 1064 | Params 1065 | --- 1066 | unet_dim: tuple 1067 | desired unet image H, W dimensions 1068 | lstm_dim: tuple 1069 | desired lstm image T, H, W, C dimensions 1070 | lc_transitions: list 1071 | list of ('from', to') tuples defining optional categorical reclassifications for lc data 1072 | lu_transitions: list 1073 | list of ('from', 'to') tuples defining optional categorical reclassificaitons for lu data 1074 | 1075 | Return 1076 | --- 1077 | tuple: three arrays containing batch of corresponding sentinel-2, naip, and label data 1078 | """ 1079 | super().__init__(*args, **kwargs) 1080 | self.s1files = s1files 1081 | self.lc_trans = lc_transitions 1082 | self.lu_trans = lu_transitions 1083 | self.lstm_dim = lstm_dim 1084 | self.unet_dim = unet_dim 1085 | self.n_timesteps = lstm_dim[0] 1086 | self.on_epoch_end() 1087 | 1088 | def _get_lstm_data(self, files_temp, rescale_val = 1.0, mask = False): 1089 | arrays = self._load_numpy_data(files_temp) 1090 | try: 1091 | assert len(arrays) > 0, "No Array Found" 1092 | assert all([x.shape == (self.lstm_dim[0], self.lstm_dim[3], self.lstm_dim[1], self.lstm_dim[2]) for x in arrays]), [x.shape for x in arrays] 1093 | 1094 | # creat a single (B, T, C, H, W) array 1095 | batch = np.stack(arrays, axis = 0) 1096 | # in case our incoming data is of different size than we want, define a trim amount 1097 | trim = ((batch.shape[3] - self.lstm_dim[1])//2, (batch.shape[4] - self.lstm_dim[2])//2) 1098 | 1099 | array = batch[:, 0:self.n_timesteps,:,trim[0]:self.lstm_dim[1]+trim[0],trim[1]:self.lstm_dim[2]+trim[1]] 1100 | 1101 | # rearrange arrays from (B, T, C, H, W) -> (B, T, H, W, C) expected by model 1102 | reshaped = np.moveaxis(array, source = 2, destination = 4) 1103 | normalized = normalize_timeseries(reshaped, maxval = rescale_val, axis = 1) 1104 | return normalized 1105 | except AssertionError as msg: 1106 | print(msg) 1107 | sys.exit() 1108 | return None 1109 | 1110 | def _get_s2_data(self, indexes): 1111 | files_temp = [self.s2files[k] for k in indexes] 1112 | normalized = self._get_lstm_data(files_temp, rescale_val = 10000.0) 1113 | if type(normalized) == np.ndarray: 1114 | if self.to_fit: 1115 | recolored = array_tools.aug_array_color(normalized) 1116 | return recolored 1117 | else: 1118 | return normalized 1119 | 1120 | def _get_s1_data(self, indexes): 1121 | files_temp = [self.s1files[k] for k in indexes] 1122 | normalized = self._get_lstm_data(files_temp, rescale_val = -50.0) 1123 | if type(normalized) == np.ndarray: 1124 | return normalized 1125 | 1126 | def __getitem__(self, index): 1127 | """Generate one batch of data 1128 | 1129 | :param index: index of the batch 1130 | :return: X and y when fitting. 
X only when predicting 1131 | """ 1132 | # Generate indexes of the batch 1133 | 1134 | indexes = self.indexes[index * self.batch_size:(index + 1) * self.batch_size] 1135 | 1136 | unetDatasets = [] 1137 | lstmDatasets = [] 1138 | if self.s2files: 1139 | s2Data = self._get_s2_data(indexes) 1140 | lstmDatasets.append(s2Data) 1141 | if self.s1files: 1142 | s1Data = self._get_s1_data(indexes) 1143 | lstmDatasets.append(s1Data) 1144 | if self.naipfiles: 1145 | naipData = self._get_naip_data(indexes) 1146 | unetDatasets.append(naipData) 1147 | if self.demfiles: 1148 | demData = self._get_dem_data(indexes) 1149 | unetDatasets.append(demData) 1150 | if self.hagfiles: 1151 | hagData = self._get_hag_data(indexes) 1152 | unetDatasets.append(hagData) 1153 | if self.lidarfiles: 1154 | lidarData = self._get_lidar_data(indexes) 1155 | unetDatasets.append(lidarData) 1156 | if self.ssurgofiles: 1157 | ssurgoData = self._get_ssurgo_data(indexes) 1158 | unetDatasets.append(ssurgoData) 1159 | 1160 | if any([type(dat) != np.ndarray for dat in unetDatasets + lstmDatasets]): 1161 | pass 1162 | else: 1163 | unetData = np.concatenate(unetDatasets, axis = -1) 1164 | lstmData = np.concatenate(lstmDatasets, axis = -1) 1165 | feats = [unetData, lstmData] 1166 | # if type(lidarData) == np.ndarray: 1167 | # unetData = np.concatenate([naipData, lidarData], axis = -1) 1168 | # else: 1169 | # unetData = naipData 1170 | 1171 | # feats = [unetData, s2Data] 1172 | # if any([type(dat) == type(None) for dat in feats]): 1173 | # return self.__getitem__(randint(0, len(self.indexes) - self.batch_size)) 1174 | 1175 | if self.to_fit: 1176 | labels = self._process_y(indexes) 1177 | if type(labels) == type(None): 1178 | pass 1179 | # feats, labels = split_timeseries(rearranged) 1180 | # we can't have nans in label 1181 | else: 1182 | return feats, labels 1183 | else: 1184 | return feats 1185 | -------------------------------------------------------------------------------- /utils/raster_tools.py: -------------------------------------------------------------------------------- 1 | """ 2 | Created on Wed Jun 29 15:07:52 2022 3 | 4 | @author: mevans 5 | """ 6 | 7 | import os 8 | from os.path import join 9 | import rasterio as rio 10 | from rasterio.windows import Window 11 | from rasterio.transform import Affine 12 | from rasterio.merge import merge 13 | import shapely 14 | from shapely.geometry import box 15 | import geopandas as gpd 16 | import numpy as np 17 | from matplotlib.pyplot import imsave 18 | import warnings 19 | import random 20 | from osgeo import gdal 21 | rio.Env(CHECK_DISK_FREE_SPACE=False) 22 | 23 | def generate_chip_indices(H, W, buff = 128, kernel = 256): 24 | """ 25 | Parameters 26 | --- 27 | H: int 28 | height dimension in pixels over which indices should be generated 29 | W: int 30 | width dimension in pixels over which indices should be generated 31 | buff: int 32 | size of pixels to be trimmed from each side of chip 33 | kernel: int 34 | size of contiguous image chips 35 | Return 36 | --- 37 | list::np.ndarray: list containing (y,x) index of chips upper left corner 38 | """ 39 | side = (2*buff) + kernel 40 | x_buff = y_buff = buff 41 | 42 | y_indices = list(range(y_buff, H - (kernel+buff) +1, kernel)) 43 | x_indices = list(range(x_buff, W - (kernel+buff) +1, kernel)) 44 | 45 | indices = [(y_index, x_index) for y_index in y_indices for x_index in x_indices] 46 | return indices 47 | 48 | def extract_chips(arr, buff = 128, kernel = 256): 49 | """Break an array into (potentially) overlapping chips for analysis 50 | 
Arguments:
51 |         arr (ndarray): 3D array to run predictions on
52 |         buff (int): size of pixels to be trimmed from chips
53 |         kernel (int): size of contiguous image chips
54 |     Return:
55 |         list::np.ndarray: list containing image chips of size (kernel+buff, kernel+buff)
56 |     """
57 |     H, W, C = arr.shape
58 |     side = buff + kernel
59 |     x_buff = y_buff = buff//2
60 |     chips = []
61 | 
62 |     chip_indices = generate_chip_indices(H, W, buff, kernel)
63 | 
64 |     for y, x in chip_indices: # indices are (y, x) upper-left corners
65 |         chip = arr[y-y_buff:y+kernel+y_buff, x-x_buff:x+kernel+x_buff, :]
66 |         chips.append(chip)
67 | 
68 |     return chips
69 | 
70 | def convert(size, box):
71 |     """
72 |     Convert coordinates of a bounding box given in image pixels to
73 |     normalized [0,1] yolo coordinates
74 | 
75 |     Parameters
76 |     ---
77 |     size: tpl
78 |         height, width of image in pixels
79 |     box: list[x0, y0, x1, y1]
80 |         corners of box in pixels
81 | 
82 |     Return
83 |     ---
84 |     tpl(float, float, float, float): normalized x,y centroid and width, height of box
85 |     """
86 |     dw = 1./size[1]
87 |     dh = 1./size[0]
88 |     xmid = (box[0] + box[2])/2.0
89 |     ymid = (box[1] + box[3])/2.0
90 |     w0 = box[2] - box[0]
91 |     h0 = box[3] - box[1]
92 |     x = xmid*dw
93 |     y = ymid*dh
94 |     w = w0*dw
95 |     h = h0*dh
96 |     return (x,y,w,h)
97 | 
98 | def make_window(cx: int, cy:int, window_size: int) -> tuple:
99 |     """Create an array window around a centroid
100 | 
101 |     Parameters
102 |     ---
103 |     cx: int
104 |         centroid x-coord
105 |     cy: int
106 |         centroid y-coord
107 |     window_size: int
108 |         size of window in pixels
109 | 
110 |     Return
111 |     ---
112 |     tpl: coordinates of top left (x0, y0) and bottom right (x1, y1) window points
113 |     """
114 |     x0 = round(cx - window_size//2)
115 |     y0 = round(cy - window_size//2)
116 |     x1 = round(cx + window_size//2)
117 |     y1 = round(cy + window_size//2)
118 |     return (x0, y0, x1, y1)
119 | 
120 | def get_geo_transform(raster_src):
121 |     """Get the geotransform for a raster image source.
122 |     Arguments
123 |     ---------
124 |     raster_src : str, :class:`rasterio.DatasetReader`, or `osgeo.gdal.Dataset`
125 |         Path to a raster image with georeferencing data to apply to `geom`.
126 |         Alternatively, an opened :class:`rasterio.Band` object or
127 |         :class:`osgeo.gdal.Dataset` object can be provided. Required if not
128 |         using `affine_obj`.
129 |     Returns
130 |     -------
131 |     transform : :class:`affine.Affine`
132 |         An affine transformation object to the image's location in its CRS.
133 |     """
134 | 
135 |     if isinstance(raster_src, str):
136 |         with rio.Env(CHECK_DISK_FREE_SPACE=False):
137 |             with rio.open(raster_src) as src:
138 |                 affine_obj = src.transform
139 |     elif isinstance(raster_src, rio.DatasetReader):
140 |         affine_obj = raster_src.transform
141 | 
142 |     return affine_obj
143 | 
144 | def convert_poly_coords(geom, raster_src=None, affine_obj=None, inverse=False,
145 |                         precision=None):
146 |     """Georegister geometry objects currently in pixel coords or vice versa.
147 |     Params
148 |     ---------
149 |     geom : :class:`shapely.geometry.shape` or str
150 |         A :class:`shapely.geometry.shape`, or WKT string-formatted geometry
151 |         object currently in pixel coordinates.
152 |     raster_src : str, optional
153 |         Path to a raster image with georeferencing data to apply to `geom`.
154 |         Alternatively, an opened :class:`rasterio.Band` object or
155 |         :class:`osgeo.gdal.Dataset` object can be provided. Required if not
156 |         using `affine_obj`.
157 | affine_obj: list or :class:`affine.Affine` 158 | An affine transformation to apply to `geom` in the form of an 159 | ``[a, b, d, e, xoff, yoff]`` list or an :class:`affine.Affine` object. 160 | Required if not using `raster_src`. 161 | inverse : bool, optional 162 | If true, will perform the inverse affine transformation, going from 163 | geospatial coordinates to pixel coordinates. 164 | precision : int, optional 165 | Decimal precision for the polygon output. If not provided, rounding 166 | is skipped. 167 | Returns 168 | ------- 169 | out_geom 170 | A geometry in the same format as the input with its coordinate system 171 | transformed to match the destination object. 172 | """ 173 | 174 | if not raster_src and not affine_obj: 175 | raise ValueError("Either raster_src or affine_obj must be provided.") 176 | 177 | if raster_src is not None: 178 | affine_xform = get_geo_transform(raster_src) 179 | else: 180 | if isinstance(affine_obj, Affine): 181 | affine_xform = affine_obj 182 | else: 183 | # assume it's a list in either gdal or "standard" order 184 | # (list_to_affine checks which it is) 185 | if len(affine_obj) == 9: # if it's straight from rasterio 186 | affine_obj = affine_obj[0:6] 187 | affine_xform = Affine(*affine_obj) 188 | 189 | if inverse: # geo->px transform 190 | affine_xform = ~affine_xform 191 | 192 | if isinstance(geom, str): 193 | # get the polygon out of the wkt string 194 | g = shapely.wkt.loads(geom) 195 | elif isinstance(geom, shapely.geometry.base.BaseGeometry): 196 | g = geom 197 | else: 198 | raise TypeError('The provided geometry is not an accepted format. ' 199 | 'This function can only accept WKT strings and ' 200 | 'shapely geometries.') 201 | 202 | xformed_g = shapely.affinity.affine_transform(g, [affine_xform.a, 203 | affine_xform.b, 204 | affine_xform.d, 205 | affine_xform.e, 206 | affine_xform.xoff, 207 | affine_xform.yoff]) 208 | if isinstance(geom, str): 209 | # restore to wkt string format 210 | xformed_g = shapely.wkt.dumps(xformed_g) 211 | if precision is not None: 212 | xformed_g = _reduce_geom_precision(xformed_g, precision=precision) 213 | 214 | return xformed_g 215 | 216 | def convert_pt(geometry: gpd.GeoSeries, out_crs: int, src_transform: list) -> tuple: 217 | """ Change a point to another crs 218 | 219 | Parameters 220 | --- 221 | geomegry: gpd.GeoSeries 222 | geoseries of points 223 | out_crs: int 224 | epsg for the desired crs 225 | 226 | Return 227 | --- 228 | tpl: (x,y) coordinates of point in new crs 229 | """ 230 | pt = geometry.to_crs(out_crs) 231 | coords = convert_poly_coords(pt.iloc[0], affine_obj = src_transform, inverse = True, precision = None) 232 | x, y = np.rint(coords.x), np.rint(coords.y) 233 | return (x,y) 234 | 235 | def win_jitter(window_size, jitter_frac=0.1): 236 | '''get x and y jitter 237 | Parameters 238 | --------- 239 | window_size (tpl: dx, dy in pixels 244 | ''' 245 | val = np.rint(jitter_frac * window_size) 246 | dx = np.random.randint(-val, val) 247 | dy = np.random.randint(-val, val) 248 | 249 | return dx, dy 250 | 251 | def get_centroid(geom_pix, verbose = True): 252 | """ 253 | Get the centroid of a polygon 254 | 255 | Parameters 256 | ---------- 257 | geom_pix : shapely POLYGON 258 | verbose : bool, optional 259 | Return print statements? The default is True. 260 | 261 | Returns 262 | ------- 263 | cx : float 264 | centroid x coordinate in input crs. 265 | cy : float 266 | centroid y coordinate in input crs. 
267 | 268 | """ 269 | bounds = geom_pix.bounds 270 | area = geom_pix.area 271 | (minx, miny, maxx, maxy) = bounds 272 | dx, dy = maxx-minx, maxy-miny 273 | 274 | # get centroid 275 | centroid = geom_pix.centroid 276 | 277 | cx_tmp, cy_tmp = list(centroid.coords)[0] 278 | cx, cy = np.rint(cx_tmp), np.rint(cy_tmp) 279 | if verbose: 280 | print (" bounds:", bounds ) 281 | print (" dx, dy:", dx, dy ) 282 | print (" area:", area ) 283 | print("centroid:", centroid) 284 | 285 | return cx, cy 286 | 287 | def make_jittered_window(cx, cy, image_h, image_w, window_size = 1280, jitter_frac = 0.1): 288 | """ 289 | Create a jittered image window from and input image and geometry centroid 290 | 291 | Parameters 292 | ---------- 293 | cx : float 294 | x-coordinate of centroid around which to jitter window. 295 | cy : float 296 | y-coordinate of centroid around which to jitter window. 297 | image_h : int 298 | height in pixels of input image. 299 | image_w : int 300 | width in pixels of input image. 301 | window_size : int, optional 302 | desired dimension of output window. The default is 1280. 303 | jitter_frac : float, optional 304 | proportion of window size to move window. The default is 0.2. 305 | 306 | Returns 307 | ------- 308 | x0 : int 309 | minx coordinate of jittered window 310 | y0 : int 311 | miny coordinate of jittered window. 312 | x1 : int 313 | maxx coordinate of jittered window. 314 | y1 : int 315 | maxy coordinate of jittered window. 316 | 317 | """ 318 | # number of pixels in x and y directions to shift window 319 | jx, jy = win_jitter(window_size, jitter_frac=jitter_frac) 320 | x0 = cx - window_size/2 + jx 321 | y0 = cy - window_size/2 + jy 322 | # ensure window does not extend outside larger image 323 | x0 = max(x0, 0) 324 | x0 = int(min(x0, image_w - window_size)) 325 | y0 = max(y0, 0) 326 | y0 = int(min(y0, image_h - window_size)) 327 | # set other side of square 328 | x1 = x0 + window_size 329 | y1 = y0 + window_size 330 | print('x0', x0, 'y0', y0, 'x1', x1, 'y1', y1) 331 | return x0, y0, x1, y1 332 | 333 | def rasterio_to_img(array, out_path, nbands = 3, ext = None): 334 | """ 335 | Write an array read by rasterio to an 8-bit integer image file 336 | 337 | Parameters 338 | ---------- 339 | array : numpy.ndarray 340 | image array read by rasterio. 341 | out_path : str 342 | out image file path. 343 | nbands : int, optional 344 | number of image bands to write. The default is 3. 345 | ext : str, optional 346 | image file format extension. The default is 'png'. 347 | 348 | Returns 349 | ------- 350 | None. 
351 | 
352 |     """
353 |     # convert from CHW to HWC and cast as unsigned 8-bit int for saving
354 |     t = array.transpose((1,2,0)).astype('uint8')
355 |     print('array shape', t.shape)
356 |     print('array min', t.min())
357 |     print('array max', t.max())
358 |     print('array type', t.dtype)
359 |     # to use pre-trained YOLO weights, only grab RGB bands
360 |     if ext:
361 |         out_file = f"{out_path}.{ext}"
362 |     else:
363 |         out_file = out_path
364 |     print('writing image to', out_file)
365 |     imsave(out_file, t[:,:,:nbands], vmin = 0, vmax = 255)
366 | 
367 | def numpy_to_raster(arr: np.ndarray, mixer: dict, out_file: str, dtype:str):
368 |     """
369 |     Params
370 |     ---
371 |     arr: np.ndarray
372 |         input (C, H, W) array to be converted to raster
373 |     mixer: dict
374 |         dictionary containing image dimension and spatial reference metadata required by rasterio.write
375 |     out_file: str
376 |         file path to destination raster file
377 |     dtype: str
378 |         output dtype accepted by rasterio.write (e.g., 'uint16', 'int32', 'float32', 'float64')
379 | 
380 |     Return
381 |     ---
382 |     None: writes raster data to destination file
383 |     """
384 |     C = arr.shape[0]
385 |     meta = {
386 |         'driver':'GTiff',
387 |         'width':mixer['cols'],
388 |         'height':mixer['rows'],
389 |         'count':C,
390 |         'dtype':dtype,
391 |         'transform':rio.Affine(*mixer['transform'][0:6]),
392 |         'crs':mixer['crs'],
393 |         'nodata':255
394 |     }
395 |     band_list = list(range(1,C+1))
396 |     temp_file = out_file.replace(".tif","_temp.tif")
397 |     with rio.Env(CHECK_DISK_FREE_SPACE=False):
398 |         with rio.open(temp_file, mode = 'w', **meta) as dst:
399 |             dst.write(arr, band_list)
400 |             # dst.write(arr, 1)
401 |             # the dataset is closed automatically when the with block exits
402 | 
403 |     ds = gdal.Open(temp_file)
404 | 
405 |     options = gdal.TranslateOptions(format = 'COG',creationOptions = ["COMPRESS=LZW"])
406 |     ds = gdal.Translate(destName=out_file, srcDS=ds, options=options)
407 |     ds = None
408 | 
409 |     os.remove(temp_file)
410 | 
411 | def arrays_to_cog(arrs: list, coords: list, mixer: dict, out_file: str, dtype:str):
412 |     """
413 |     Params
414 |     ---
415 |     arrs: list
416 |         list of .npy file paths, each an (H, W, C) chip named '<col>_<row>...' by its upper-left pixel offsets
417 |     mixer: dict
418 |         dictionary containing image dimension and spatial reference metadata required by rasterio.write
419 |     out_file: str
420 |         file path to destination raster file
421 |     dtype: str
422 |         output dtype accepted by rasterio.write (e.g., 'uint16', 'int32', 'float32', 'float64')
423 | 
424 |     Return
425 |     ---
426 |     None: writes raster data to destination file
427 |     """
428 |     C = np.load(arrs[0]).shape[-1]
429 |     meta = {
430 |         'driver':'GTiff',
431 |         'width':round(mixer['cols']),
432 |         'height':round(mixer['rows']),
433 |         'count':C,
434 |         'dtype':dtype,
435 |         'transform':rio.Affine(*mixer['transform'][0:6]),
436 |         'crs':mixer['crs'],
437 |         'nodata':255
438 |     }
439 |     band_list = list(range(1,C+1))
440 |     temp_file = out_file.replace(".tif","_temp.tif")
441 |     with rio.Env(CHECK_DISK_FREE_SPACE=False):
442 |         with rio.open(temp_file, mode = 'w', **meta) as dst:
443 |             for f in arrs: # write every chip into its window of the output raster
444 |                 arr = np.moveaxis(np.load(f), -1, 0)
445 |                 indices = os.path.splitext(os.path.basename(f))[0].split('_') # (X, Y) offsets from the file name
446 |                 window = Window(
447 |                     row_off = int(indices[1]), #Y
448 |                     col_off = int(indices[0]), #X
449 |                     width = mixer['size'],
450 |                     height = mixer['size'])
451 |                 dst.write(arr, window = window, indexes = band_list)
452 | 
453 |     ds = gdal.Open(temp_file)
454 | 
455 |     options = gdal.TranslateOptions(format = 'COG',creationOptions = ["COMPRESS=LZW"])
456 |     # if we want to write straight to blob, use /vsiaz/container/path
457 |     # after
setting the AZURE_STORAGE_CONNECTION_STRING environment variable
458 |     ds = gdal.Translate(destName=out_file, srcDS=ds, options=options)
459 |     ds = None
460 | 
461 |     os.remove(temp_file)
462 | 
463 | 
-------------------------------------------------------------------------------- /utils/stats.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | from scipy.special import gamma
3 | 
4 | def gamma_pdf(x, a, b):
5 |     """calculate the pdf of a gamma distribution defined by shape a and scale b
6 |     Params
7 |     ---
8 |     x: float or array
9 |         values at which to evaluate the gamma pdf
10 |     a: float or array
11 |         shape parameter of the gamma distribution
12 |     b: float or array
13 |         scale parameter of the gamma distribution
14 | 
15 |     Return
16 |     ---
17 |     float or array:
18 |         probability of x under the gamma distribution with shape a and scale b
19 |     """
20 |     denom = gamma(a)*(b**a)
21 |     num = (x**(a-1))*(np.exp(-1*x/b))
22 |     pd = num/denom
23 |     return pd
24 | 
25 | def lognormal_pdf(x, u, v):
26 |     """calculate the pdf of a lognormal distribution defined by the mean u and variance v of log(x)
27 |     Params
28 |     ---
29 |     x: float or array
30 |         values at which to evaluate the lognormal pdf
31 |     u: float or array
32 |         mean of the underlying normal distribution (i.e., of log(x))
33 |     v: float or array
34 |         variance of the underlying normal distribution (i.e., of log(x))
35 | 
36 |     Return
37 |     ---
38 |     float or array:
39 |         probability of x under the lognormal distribution with parameters u and v
40 |     """
41 |     sd = np.sqrt(v)
42 |     const = (np.pi*2)**0.5
43 |     first = 1/(sd*const)
44 |     edenom = v*2
45 |     enum = ((np.log(x) - u)**2)*-1
46 |     second = np.exp(enum/edenom)/x
47 |     pd = first*second
48 |     return pd
49 | 
50 | 
--------------------------------------------------------------------------------
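A quick sanity check on utils/stats.py (a hypothetical snippet, not part of the repository; it assumes the repo root is on PYTHONPATH so `utils.stats` is importable): gamma_pdf uses the same shape/scale parameterization as scipy.stats.gamma, and lognormal_pdf takes the mean and variance of log(x), which maps to scipy's lognorm via s = sqrt(v) and scale = exp(u).

import numpy as np
from scipy.stats import gamma as gamma_dist, lognorm
from utils.stats import gamma_pdf, lognormal_pdf

x = np.linspace(0.1, 10.0, 50)

# gamma_pdf(x, a, b): shape a, scale b -> same parameterization as scipy.stats.gamma
assert np.allclose(gamma_pdf(x, a=2.0, b=1.5), gamma_dist.pdf(x, a=2.0, scale=1.5))

# lognormal_pdf(x, u, v): u, v are the mean and variance of log(x)
assert np.allclose(lognormal_pdf(x, u=0.2, v=0.5), lognorm.pdf(x, s=np.sqrt(0.5), scale=np.exp(0.2)))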