├── .gitignore ├── README.md ├── naip-utils ├── label_rasterize.ipynb ├── naip-label-align.py └── naip_download_pc.ipynb ├── pytorch-env.yml ├── src ├── calculate_image_stats_dir.py ├── chips.py ├── cls_distribution.py ├── dataloaders │ ├── StreamingDatasets.py │ ├── TileDatasets.py │ └── __init__.py ├── embeddings.py ├── eval.py ├── models.py ├── seed_data_creation.py ├── train.py ├── transforms_utils.py └── utils.py └── train_azure ├── create_compute-cpu.py ├── create_compute-gpu.py ├── create_workspace.py ├── requirements.txt ├── run_cls_distrib.py ├── run_eval.py ├── run_model.py └── run_seeddata_creation.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow
98 | __pypackages__/
99 |
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 |
104 | # SageMath parsed files
105 | *.sage.py
106 |
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 |
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 |
120 | # Rope project settings
121 | .ropeproject
122 |
123 | # mkdocs documentation
124 | /site
125 |
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 |
131 | # Pyre type checker
132 | .pyre/
133 |
134 | # pytype static type analyzer
135 | .pytype/
136 |
137 | # Cython debug symbols
138 | cython_debug/
139 |
140 | # Editors
141 | .vscode/
142 |
143 | # Mac/OSX
144 | .DS_Store
145 |
146 | # Windows
147 | Thumbs.db
148 |
149 | # Data
150 | data/
151 | tmp/
152 | model/
153 |
154 | # Input imagery
155 | *.tif
156 |
157 | # Azure
158 | .azureml
159 |
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # PEARL ML Training Pipeline
2 |
3 | This repo contains scripts to manage training data, create the Azure ML stack, and train new models that are compatible with the PEARL Platform. It is based on the work of Caleb Robinson of Microsoft.
4 |
5 | ## Training
6 |
7 | - Monitor experiments and training runs on Azure ML
8 | - Training Repo
9 | - [Training code](https://github.com/developmentseed/pearl-ml-pipeline/blob/main/src/train.py)
10 | - [Evaluation code](https://github.com/developmentseed/pearl-ml-pipeline/blob/main/src/eval.py)
11 |
12 | - [DeepLabv3Plus Architecture](https://github.com/qubvel/segmentation_models.pytorch/blob/master/segmentation_models_pytorch/decoders/deeplabv3/model.py) + [focal loss](https://github.com/qubvel/segmentation_models.pytorch/blob/master/segmentation_models_pytorch/losses/focal.py) seems to be the most promising approach
13 |
14 | ## Evaluation
15 | - We run the model over the test data set and report the global and per-class F1 scores
16 |
17 | ### SEED Data
18 |
19 | **How/Why we create Seed Data**
20 |
21 | - We have seed data for each model so that during retraining the user doesn't have to add samples for each class; we can use the weights/biases from the retraining logistic regression (sklearn) model to update the weights/biases of the deep learning model and then run inference on the GPU
22 | - The retraining seed data should have the same class distribution ratios as the original training data (e.g. 10% water, 50% trees, etc.)
23 | - I've been generating retraining data using the GPU-enabled Azure notebooks (these should ideally be converted into scripts)
24 | - [Seed Data Creation Script](https://github.com/developmentseed/pearl-ml-pipeline/blob/main/src/seed_data_creation.py)
25 |
26 |
27 |
28 | ## Training Dataset Creation
29 |
30 | There are two options to create the training dataset.
31 |
32 | **Option 1**. Feed LULC label data in GeoTIFF format.
33 |
34 | [naip-label-align.py](naip-utils/naip-label-align.py) and [NAIPTileIndex.py](naip-utils/NAIPTileIndex.py) provide functions to:
35 |
36 | _Notes_:
37 | - Install libspatialindex (a dependency of `rtree` that is not installed automatically)
38 | - `brew install spatialindex`
39 | - align given LULC labels to available NAIP imagery tiles on the Azure public Blob;
40 | - filter out nodata tiles;
41 | - create name conventions;
42 | - write them to CSVs for the train, validation and test datasets, split 70:20:10.
43 | - The script will write the tiled label GeoTIFFs into `out_dir`. These files can then be uploaded to Azure Blob Storage.
44 |
45 | These CSVs can then be used directly for model training on AML. Instructions are given in the following sections.
46 |
47 | ```bash
48 | python naip-label-align.py \
49 | --label_tif_path sample.tif \
50 | --out_dir / \
51 | --threshold [0.0 to 1.0] \
52 | --aoi \
53 | --group
54 | ```
55 |
56 | **Option 2**. LULC labels are available as GeoJSON (vector) files, and rasterization is required.
57 |
58 | - First, the NAIP imagery that overlaps with the LULC label data needs to be downloaded before the rasterization task.
59 | [naip_download_pc.ipynb](naip-utils/naip_download_pc.ipynb) provides the script and documentation for downloading NAIP imagery over your AOI from the [MS Planetary Computer](https://planetarycomputer.microsoft.com/dataset/naip).
60 |
61 | - Second, the LULC label rasterization functions and steps are provided in [label_rasterize.ipynb](naip-utils/label_rasterize.ipynb).
62 | The classes are rasterized in priority order (tree_canopy sits on top of the LULC layer, i.e. it is burned last; other_impervious sits at the bottom, i.e. it is rasterized first):
63 | ```
64 | tree_canopy
65 | building
66 | water
67 | bare_soil
68 | roads_railroads
69 | grass_shrub
70 | other_impervious
71 | ```
72 | See the [notebook](naip-utils/label_rasterize.ipynb) for details.
73 |
74 |
75 | ## Model Training on Azure ML (AML)
76 | If you are going to use AML to train LULC models for the first time, please go through these steps.
77 |
78 | Screen Shot 2021-11-08 at 8 20 04 AM
79 |
80 | ### Configure environment
81 |
82 | This code was tested using `python 3.6.5`.
83 |
84 | [Create a conda environment](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#creating-an-environment-from-an-environment-yml-file) using the `pytorch-env.yml` file and execute the scripts from the created environment.
85 |
86 |
87 | You will need to set the following variables in your `.env`:
88 |
89 |
90 | ```bash
91 | AZ_TENANT_ID=XXX #az account show --output table
92 | AZ_SUB_ID=XXX #az account list --output table
93 |
94 | AZ_WORKSPACE_NAME=XXX #User set
95 | AZ_RESOURCE_GROUP=XXX #User set
96 | AZ_REGION=XXX #User set
97 |
98 | AZ_GPU_CLUSTER_NAME=XXX #User set
99 | AZ_CPU_CLUSTER_NAME=XXX #User set
100 | ```
101 |
102 | Then export all variables to your environment:
103 |
104 | ```
105 | export $(cat .env);
106 | ```
107 |
108 |
109 | ### Create Your Workspace on AML
110 | After exporting your Azure credentials, [train_azure/create_workspace.py](train_azure/create_workspace.py) will create the AML workspace.
111 |
112 | ### Create GPU Compute
113 |
114 | [This script](train_azure/create_compute-gpu.py) will create GPU compute resources in your workspace on AML.
115 |
116 |
117 | ### (Optional) Create CPU Compute
118 |
119 | [This script](train_azure/create_compute-cpu.py) will create CPU compute resources in your workspace on AML.
120 |
121 |
122 | ### Train LULC Model on AML
123 | We have three PyTorch-based semantic segmentation models ready for LULC model training: FCN, UNet and DeepLabV3+.
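As a rough illustration of how one of these models can be put together with `segmentation_models_pytorch` (the library linked above), the sketch below builds a DeepLabV3+ model with a multiclass focal loss. It is a minimal example only, not the exact configuration used in [src/models.py](src/models.py); the encoder name, the 4-band NAIP input and the 7 LULC classes are assumptions made for illustration.

```python
# Minimal sketch only -- the encoder and loss settings in src/models.py may differ.
# Assumes 4-band NAIP chips (R, G, B, NIR) and 7 LULC classes.
import torch
import segmentation_models_pytorch as smp

model = smp.DeepLabV3Plus(
    encoder_name="resnet18",   # assumed encoder, chosen only for illustration
    encoder_weights=None,      # ImageNet weights expect 3 bands; NAIP has 4
    in_channels=4,
    classes=7,
)
criterion = smp.losses.FocalLoss(mode="multiclass")

x = torch.randn(2, 4, 256, 256)           # a batch of two 256x256 chips
y = torch.randint(0, 7, (2, 256, 256))    # per-pixel integer class labels
logits = model(x)                         # shape: (2, 7, 256, 256)
loss = criterion(logits, y)
loss.backward()
```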
124 |
125 | To train a model on AML, you will need to define or pass a few crucial parameters to the [script](train_azure/run_model.py), for instance:
126 |
127 | TODO: Will we be providing sample CSVs?
128 | ```python
129 | config = ScriptRunConfig(
130 |     source_directory="./src",
131 |     script="train.py",
132 |     compute_target=AZ_GPU_CLUSTER_NAME,
133 |     arguments=[
134 |         "--input_fn",
135 |         "sample_data/indianapolis_train.csv",
136 |         "--input_fn_val",
137 |         "sample_data/indianapolis_val.csv",
138 |         "--output_dir",
139 |         "./outputs",
140 |         "--save_most_recent",
141 |         "--num_epochs",
142 |         20,
143 |         "--num_chips",
144 |         200,
145 |         "--num_classes",
146 |         7,
147 |         "--label_transform",
148 |         "uvm",
149 |         "--model",
150 |         "deeplabv3plus",
151 |     ],
152 | )
153 | ```
154 |
155 | These parameters are to be configured by the user. The `input_fn_X` paths should be provided by the user and are the outputs of the data generation step (NAIP label align) described above.
156 |
157 | `python train_azure/run_model.py`
158 |
159 |
160 | ### Evaluate the Trained Model
161 |
162 | To compute the global F1 and per-class F1 scores (written to CSV) from a trained model over the latest dataset, you can use this [eval script](train_azure/run_eval.py) as an example.
163 |
164 | `python train_azure/run_eval.py`
165 |
166 |
167 | ### Seed Data Creation for PEARL
168 | After the best performing model is selected, a seed dataset needs to be created to serve PEARL. Seed data consists of the model embeddings from the trained model, which are used together with the user's input training data in a PEARL retraining session.
169 |
170 | [run_seeddata_creation.py](train_azure/run_seeddata_creation.py) will configure AML and use the [main seed data creation script](src/seed_data_creation.py) to create seed data for the best performing trained model.
171 |
172 | `python train_azure/run_seeddata_creation.py`
173 |
174 | ### (Optional) Class Distribution
175 |
176 | The LULC class distribution is a graph showing the proportion of pixels in each LULC class for a trained model on PEARL. See the bar chart below.
177 |
178 | [train_azure/run_cls_distrib.py](train_azure/run_cls_distrib.py) shows how to compute the class distribution from the training dataset for the model; a rough sketch of the underlying computation follows.
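Conceptually the computation is a per-class pixel count over the label rasters. The snippet below is only an illustration, assuming single-band label GeoTIFFs with integer class IDs (0 = nodata, 1-7 = LULC classes) in a hypothetical local directory; for real runs, use the repository's own implementation ([src/cls_distribution.py](src/cls_distribution.py), driven by the script above on AML).

```python
# Rough sketch of a class-distribution computation; the path and class count are illustrative.
import glob
import numpy as np
import rasterio

counts = np.zeros(8, dtype=np.int64)          # bins for class IDs 0..7
for fn in glob.glob("data/labels/*.tif"):     # hypothetical local label directory
    with rasterio.open(fn) as src:
        labels = src.read(1)
    ids, n = np.unique(labels, return_counts=True)
    counts[ids] += n

valid = counts[1:]                            # drop nodata (class 0)
proportions = valid / valid.sum()
for class_id, p in enumerate(proportions, start=1):
    print(f"class {class_id}: {p:.2%}")
```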
179 | 180 | `python train_azure/run_cls_distrib.py` 181 | 182 | Screen Shot 2021-11-08 at 8 07 49 AM 183 | -------------------------------------------------------------------------------- /naip-utils/label_rasterize.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Workflow\n", 8 | "- Get NAIP imagery from public Azure Blob (see notebook naip_download_pc.ipynb)\n", 9 | "- Rasterize label data based on the NAIP image tile\n", 10 | "- Create Image and Label fame name and Index match\n", 11 | "- Store info in CSV" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import os\n", 21 | "import glob\n", 22 | "from os import makedirs, path as op \n", 23 | "import geopandas as gpd\n", 24 | "from subprocess import call\n", 25 | "from rasterio.features import geometry_mask\n", 26 | "import rasterio\n", 27 | "import numpy as np\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "def fix_id(geojson, class_id, keyword, rankid):\n", 37 | " \"\"\"adding class id to the label geojson and update output directory for new label\"\"\"\n", 38 | " gdf = gpd.read_file(geojson)\n", 39 | " gdf['class_id'] = int(class_id)\n", 40 | " outdir = f\"../label_af_download/updated_labels_{keyword}\"\n", 41 | " if not op.exists(outdir):\n", 42 | " makedirs(outdir)\n", 43 | " basename = op.basename(geojson)\n", 44 | " gdf.to_file(f\"{outdir}/{rankid}_{basename}\", driver=\"GeoJSON\")\n", 45 | " return outdir" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "def get_key(val, main_dict):\n", 55 | " \"\"\"to fetch key of label order based on the label class name\"\"\"\n", 56 | " for key, value in main_dict.items():\n", 57 | " if val == value:\n", 58 | " return key\n", 59 | " \n", 60 | " return \"key doesn't exist\"" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "def burn_base(raster, aoi, outfile):\n", 70 | " \"\"\"rasterize aoi bounds\"\"\"\n", 71 | " gdf_aoi = gpd.read_file(aoi)\n", 72 | " with rasterio.open(raster, 'r') as src:\n", 73 | " profile = src.profile\n", 74 | " profile.update(\n", 75 | " dtype=rasterio.uint8, \n", 76 | " count=1,\n", 77 | " compress='lzw'\n", 78 | " )\n", 79 | " fsrc = src.read()\n", 80 | " outshape = (fsrc.shape[1], fsrc.shape[2])\n", 81 | " transform_out = src.transform\n", 82 | " out_arr = np.zeros(outshape)\n", 83 | " \n", 84 | " out_label=geometry_mask(gdf_aoi.geometry,\n", 85 | " out_shape=outshape, \n", 86 | " transform=transform_out, \n", 87 | " all_touched=True,\n", 88 | " invert=True\n", 89 | " )\n", 90 | " with rasterio.open(outfile, \"w\", **profile) as dst:\n", 91 | " dst.write(out_label,1)\n", 92 | " return outfile" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "def burn_labels(base_mask, label_ls, outfile):\n", 102 | " \"\"\"burn labels based on the priority order\"\"\"\n", 103 | " with rasterio.open(base_mask, 'r') as src:\n", 104 | " profile = src.profile\n", 105 | " profile.update(\n", 106 | " dtype=rasterio.uint8, \n", 107 | " count=1,\n", 108 | " compress='lzw'\n", 109 | " )\n", 110 | 
" fsrc = src.read_masks(1)\n", 111 | " outshape = fsrc.shape\n", 112 | " print(outshape)\n", 113 | " transform_out = src.transform\n", 114 | " labels_arr=np.zeros(outshape)\n", 115 | " for geojson in label_ls:\n", 116 | " print(geojson)\n", 117 | " gdf= gpd.read_file(geojson)\n", 118 | " print(gdf.crs)\n", 119 | " print(len(gdf.geometry))\n", 120 | " print(gdf[\"class_id\"].unique()[0])\n", 121 | " mask = geometry_mask(gdf.geometry, out_shape=outshape, transform=transform_out, all_touched=True, invert=True)\n", 122 | " print(np.unique(mask))\n", 123 | " update_mask = np.where(mask==True)\n", 124 | " labels_arr[update_mask] = gdf[\"class_id\"].unique()[0]\n", 125 | " print(np.unique(labels_arr))\n", 126 | " with rasterio.open(outfile, \"w\", **profile) as dst:\n", 127 | " dst.write(labels_arr,1)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "aoi0 = \"../label_af_download/aoi_detroit_labeled/aoi0_bounds.geojson\"\n", 137 | "aoi1 = \"../label_af_download/aoi_detroit_labeled/aoi1_bounds.geojson\"\n", 138 | "aoi2 = \"../label_af_download/aoi_detroit_labeled/aoi2_bounds.geojson\"\n", 139 | "label_path = \"../label_af_download/aoi_detroit_labeled\"\n", 140 | "aoi0_naip = \"../label_af_download/downloaded_naip_aois/2018-07-06_naip_aoi0_bounds.tif\"\n", 141 | "aoi1_naip = \"../label_af_download/downloaded_naip_aois/2012-06-29_naip_aoi1_bounds.tif\"\n", 142 | "aoi2_naip = \"../label_af_download/downloaded_naip_aois/2016-08-03_naip_aoi2_bounds.tif\"\n" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "### Raterize LULC Labels\n", 150 | "The land classes should be burn in this order (1-7)\n", 151 | "1 on the bottom and 7 burn the last, so it's on the top\n", 152 | "\n", 153 | "7. tree_canopy\n", 154 | "6. building\n", 155 | "5. water\n", 156 | "4. bare_soil\n", 157 | "3. roads_railroads\n", 158 | "2. grass_shrub\n", 159 | "1. 
other_impervious\n", 160 | "\n", 161 | "Though the real class IDs are:\n", 162 | "\n", 163 | "- 0: Nodata (use the aoi)\n", 164 | "- 1: Tree Canopy, \n", 165 | "- 2: Grass/Shrub, \n", 166 | "- 3: bare soil, \n", 167 | "- 4: water, \n", 168 | "- 5: buildings, \n", 169 | "- 6:roads/railroads, \n", 170 | "- 7:other impervious" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "labels_classes = {\n", 180 | " \"impervious\": 7,\n", 181 | " \"building\": 5, \n", 182 | " \"shrub\":2, \n", 183 | " \"canopy\":1,\n", 184 | " \"railroads\": 6, \n", 185 | " \"soil\": 3, \n", 186 | " \"water\": 4\n", 187 | "}" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "burn_order ={\n", 197 | "1: \"impervious\", \n", 198 | "2: \"shrub\", \n", 199 | "3: \"railroads\",\n", 200 | "4: \"soil\",\n", 201 | "5: \"water\",\n", 202 | "6: \"building\",\n", 203 | "7: \"canopy\",\n", 204 | "}" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "# aoi0_labels = glob.glob(label_path+\"/aoi_0/*.geojson\")\n", 214 | "aoi0_labels = glob.glob(label_path+\"/aoi_0/*.geojson\")\n", 215 | "for label in aoi0_labels:\n", 216 | " basename=op.basename(label)\n", 217 | " filezero = op.splitext(basename)[0]\n", 218 | " keyword = filezero.split(\"_\")[-1]\n", 219 | " class_id = labels_classes[keyword]\n", 220 | " rankid = get_key(keyword, burn_order)\n", 221 | " print(keyword, class_id)\n", 222 | " out_dir_or = fix_id(label, class_id, \"aoi0\", rankid)\n", 223 | " print(out_dir_or)\n", 224 | "# out= '/'.join(subdir for subdir in out_dir_or.split(\"/\")[:-1])\n", 225 | "# print(out)\n", 226 | "sorted_labels = sorted(glob.glob(out_dir_or + \"/*.geojson\"))\n", 227 | "print(sorted_labels)\n", 228 | "mask_path = burn_base(aoi0_naip, aoi0, \"../label_af_download/aoi_detroit_labeled/mark0.tif\")\n", 229 | "burn_labels(mask_path, sorted_labels, \"../label_af_download/aoi_detroit_labeled/aoi0_labels.tif\")\n", 230 | "# label_array(aoi0_labels, aoi0_naip, labels_classes, burn_order, \"aoi0\", aoi0, \"../label_af_download/aoi_detroit_labeled/mark0.tif\", \"../label_af_download/aoi_detroit_labeled/aoi0_labels.tif\")" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "aoi1_labels = glob.glob(label_path+\"/aoi_1/*.geojson\")\n", 240 | "for label in aoi1_labels:\n", 241 | " basename=op.basename(label)\n", 242 | " filezero = op.splitext(basename)[0]\n", 243 | " keyword = filezero.split(\"_\")[-1]\n", 244 | " class_id = labels_classes[keyword]\n", 245 | " rankid = get_key(keyword, burn_order)\n", 246 | " print(keyword, class_id)\n", 247 | " out_dir_or = fix_id(label, class_id, \"aoi1\", rankid)\n", 248 | " print(out_dir_or)\n", 249 | "# out= '/'.join(subdir for subdir in out_dir_or.split(\"/\")[:-1])\n", 250 | "# print(out)\n", 251 | "sorted_labels = sorted(glob.glob(out_dir_or + \"/*.geojson\"))\n", 252 | "print(sorted_labels)\n", 253 | "mask_path = burn_base(aoi1_naip, aoi1, \"../label_af_download/aoi_detroit_labeled/mark1.tif\")\n", 254 | "burn_labels(mask_path, sorted_labels, \"../label_af_download/aoi_detroit_labeled/aoi1_labels.tif\")" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": 
[], 262 | "source": [ 263 | "aoi2_labels = glob.glob(label_path+\"/aoi_2/*.geojson\")\n", 264 | "for label in aoi2_labels:\n", 265 | " basename=op.basename(label)\n", 266 | " filezero = op.splitext(basename)[0]\n", 267 | " keyword = filezero.split(\"_\")[-1]\n", 268 | " class_id = labels_classes[keyword]\n", 269 | " rankid = get_key(keyword, burn_order)\n", 270 | " print(keyword, class_id)\n", 271 | " out_dir_or = fix_id(label, class_id, \"aoi2\", rankid)\n", 272 | " print(out_dir_or)\n", 273 | "# out= '/'.join(subdir for subdir in out_dir_or.split(\"/\")[:-1])\n", 274 | "# print(out)\n", 275 | "sorted_labels = sorted(glob.glob(out_dir_or + \"/*.geojson\"))\n", 276 | "print(sorted_labels)\n", 277 | "mask_path = burn_base(aoi2_naip, aoi2, \"../label_af_download/aoi_detroit_labeled/mark2.tif\")\n", 278 | "burn_labels(mask_path, sorted_labels, \"../label_af_download/aoi_detroit_labeled/aoi2_labels.tif\")" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 42, 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "import pandas as pd" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 53, 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [ 296 | "df_train = pd.DataFrame()\n", 297 | "df_val = pd.DataFrame()\n", 298 | "df_test = pd.DataFrame()\n", 299 | "label_path = \"../label_af_download/trainingdataset-data-team_aois/labels\"\n", 300 | "image_path = \"../label_af_download/trainingdataset-data-team_aois/naips\"" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 45, 306 | "metadata": {}, 307 | "outputs": [ 308 | { 309 | "data": { 310 | "text/plain": [ 311 | "['../label_af_download/trainingdataset-data-team_aois/naips/2012-06-29_naip_aoi0_bounds.tif',\n", 312 | " '../label_af_download/trainingdataset-data-team_aois/naips/2012-06-29_naip_aoi1_bounds.tif',\n", 313 | " '../label_af_download/trainingdataset-data-team_aois/naips/2012-06-29_naip_aoi2_bounds.tif',\n", 314 | " '../label_af_download/trainingdataset-data-team_aois/naips/2014-06-28_naip_aoi0_bounds.tif',\n", 315 | " '../label_af_download/trainingdataset-data-team_aois/naips/2014-06-28_naip_aoi2_bounds.tif',\n", 316 | " '../label_af_download/trainingdataset-data-team_aois/naips/2016-08-03_naip_aoi0_bounds.tif',\n", 317 | " '../label_af_download/trainingdataset-data-team_aois/naips/2016-08-03_naip_aoi2_bounds.tif',\n", 318 | " '../label_af_download/trainingdataset-data-team_aois/naips/2018-07-06_naip_aoi0_bounds.tif',\n", 319 | " '../label_af_download/trainingdataset-data-team_aois/naips/2018-07-07_naip_aoi2_bounds.tif']" 320 | ] 321 | }, 322 | "execution_count": 45, 323 | "metadata": {}, 324 | "output_type": "execute_result" 325 | } 326 | ], 327 | "source": [ 328 | "images = sorted(glob.glob(image_path +\"/*.tif\"))\n", 329 | "images" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 46, 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "base_url = \"https://uvmlabels.blob.core.windows.net/\"\n", 339 | "label_key = \"labels4-data-team-aois\"\n", 340 | "image_key = \"naip4-data-team-aois\"" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 54, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "image_url = base_url + image_key\n", 350 | "label_url = base_url + label_key\n", 351 | "train_img = []\n", 352 | "train_label = []\n", 353 | "for img in images[:-2]:\n", 354 | " basename = op.basename(img)\n", 355 | " filezeor = op.splitext(basename)[0]\n", 356 | " 
img_url = image_url + \"/\" + basename\n", 357 | " lab_url = label_url + \"/\" + filezeor + \"_labels.tif\"\n", 358 | " train_img.append(img_url)\n", 359 | " train_label.append(lab_url)\n" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 55, 365 | "metadata": {}, 366 | "outputs": [ 367 | { 368 | "data": { 369 | "text/plain": [ 370 | "['https://uvmlabels.blob.core.windows.net/naip4-data-team-aois/2012-06-29_naip_aoi0_bounds.tif',\n", 371 | " 'https://uvmlabels.blob.core.windows.net/naip4-data-team-aois/2012-06-29_naip_aoi1_bounds.tif',\n", 372 | " 'https://uvmlabels.blob.core.windows.net/naip4-data-team-aois/2012-06-29_naip_aoi2_bounds.tif',\n", 373 | " 'https://uvmlabels.blob.core.windows.net/naip4-data-team-aois/2014-06-28_naip_aoi0_bounds.tif',\n", 374 | " 'https://uvmlabels.blob.core.windows.net/naip4-data-team-aois/2014-06-28_naip_aoi2_bounds.tif',\n", 375 | " 'https://uvmlabels.blob.core.windows.net/naip4-data-team-aois/2016-08-03_naip_aoi0_bounds.tif',\n", 376 | " 'https://uvmlabels.blob.core.windows.net/naip4-data-team-aois/2016-08-03_naip_aoi2_bounds.tif']" 377 | ] 378 | }, 379 | "execution_count": 55, 380 | "metadata": {}, 381 | "output_type": "execute_result" 382 | } 383 | ], 384 | "source": [ 385 | "train_img" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 56, 391 | "metadata": {}, 392 | "outputs": [ 393 | { 394 | "data": { 395 | "text/plain": [ 396 | "['https://uvmlabels.blob.core.windows.net/labels4-data-team-aois/2012-06-29_naip_aoi0_bounds_labels.tif',\n", 397 | " 'https://uvmlabels.blob.core.windows.net/labels4-data-team-aois/2012-06-29_naip_aoi1_bounds_labels.tif',\n", 398 | " 'https://uvmlabels.blob.core.windows.net/labels4-data-team-aois/2012-06-29_naip_aoi2_bounds_labels.tif',\n", 399 | " 'https://uvmlabels.blob.core.windows.net/labels4-data-team-aois/2014-06-28_naip_aoi0_bounds_labels.tif',\n", 400 | " 'https://uvmlabels.blob.core.windows.net/labels4-data-team-aois/2014-06-28_naip_aoi2_bounds_labels.tif',\n", 401 | " 'https://uvmlabels.blob.core.windows.net/labels4-data-team-aois/2016-08-03_naip_aoi0_bounds_labels.tif',\n", 402 | " 'https://uvmlabels.blob.core.windows.net/labels4-data-team-aois/2016-08-03_naip_aoi2_bounds_labels.tif']" 403 | ] 404 | }, 405 | "execution_count": 56, 406 | "metadata": {}, 407 | "output_type": "execute_result" 408 | } 409 | ], 410 | "source": [ 411 | "train_label" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": 57, 417 | "metadata": {}, 418 | "outputs": [ 419 | { 420 | "data": { 421 | "text/html": [ 422 | "
\n", 423 | "\n", 436 | "\n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | "
image_fnlabel_fngroup
0https://uvmlabels.blob.core.windows.net/naip4-...https://uvmlabels.blob.core.windows.net/labels...uvm
1https://uvmlabels.blob.core.windows.net/naip4-...https://uvmlabels.blob.core.windows.net/labels...uvm
2https://uvmlabels.blob.core.windows.net/naip4-...https://uvmlabels.blob.core.windows.net/labels...uvm
3https://uvmlabels.blob.core.windows.net/naip4-...https://uvmlabels.blob.core.windows.net/labels...uvm
4https://uvmlabels.blob.core.windows.net/naip4-...https://uvmlabels.blob.core.windows.net/labels...uvm
5https://uvmlabels.blob.core.windows.net/naip4-...https://uvmlabels.blob.core.windows.net/labels...uvm
6https://uvmlabels.blob.core.windows.net/naip4-...https://uvmlabels.blob.core.windows.net/labels...uvm
\n", 490 | "
" 491 | ], 492 | "text/plain": [ 493 | " image_fn \\\n", 494 | "0 https://uvmlabels.blob.core.windows.net/naip4-... \n", 495 | "1 https://uvmlabels.blob.core.windows.net/naip4-... \n", 496 | "2 https://uvmlabels.blob.core.windows.net/naip4-... \n", 497 | "3 https://uvmlabels.blob.core.windows.net/naip4-... \n", 498 | "4 https://uvmlabels.blob.core.windows.net/naip4-... \n", 499 | "5 https://uvmlabels.blob.core.windows.net/naip4-... \n", 500 | "6 https://uvmlabels.blob.core.windows.net/naip4-... \n", 501 | "\n", 502 | " label_fn group \n", 503 | "0 https://uvmlabels.blob.core.windows.net/labels... uvm \n", 504 | "1 https://uvmlabels.blob.core.windows.net/labels... uvm \n", 505 | "2 https://uvmlabels.blob.core.windows.net/labels... uvm \n", 506 | "3 https://uvmlabels.blob.core.windows.net/labels... uvm \n", 507 | "4 https://uvmlabels.blob.core.windows.net/labels... uvm \n", 508 | "5 https://uvmlabels.blob.core.windows.net/labels... uvm \n", 509 | "6 https://uvmlabels.blob.core.windows.net/labels... uvm " 510 | ] 511 | }, 512 | "execution_count": 57, 513 | "metadata": {}, 514 | "output_type": "execute_result" 515 | } 516 | ], 517 | "source": [ 518 | "df_train['image_fn'] = train_img\n", 519 | "df_train[\"label_fn\"] = train_label\n", 520 | "df_train[\"group\"] = \"uvm\"\n", 521 | "df_train" 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": 58, 527 | "metadata": {}, 528 | "outputs": [], 529 | "source": [ 530 | "df_val['image_fn'] = \"https://uvmlabels.blob.core.windows.net/naip4-data-team-aois/2018-07-06_naip_aoi0_bounds.tif\"\n", 531 | "df_val[\"label_fn\"] = \"https://uvmlabels.blob.core.windows.net/labels4-data-team-aois/2018-07-06_naip_aoi0_bounds.tif\"\n", 532 | "df_val[\"group\"] = \"uvm\"" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": 59, 538 | "metadata": {}, 539 | "outputs": [], 540 | "source": [ 541 | "df_test['image_fn'] = \"https://uvmlabels.blob.core.windows.net/naip4-data-team-aois/2018-07-07_naip_aoi2_bounds.tif\"\n", 542 | "df_test[\"label_fn\"] = \"https://uvmlabels.blob.core.windows.net/labels4-data-team-aois/2018-07-07_naip_aoi2_bounds_labels.tif\"\n", 543 | "df_test[\"group\"] = \"uvm\"" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": 60, 549 | "metadata": {}, 550 | "outputs": [], 551 | "source": [ 552 | "df_train.to_csv(\"DevSeed_Data_created_train.csv\")\n", 553 | "df_val.to_csv(\"DevSeed_Data_created_val.csv\")\n", 554 | "df_test.to_csv(\"DevSeed_Data_created_test.csv\")" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": 65, 560 | "metadata": {}, 561 | "outputs": [], 562 | "source": [ 563 | "midwest_train = \"../src/data/midwest_train_multi_year.csv\"\n", 564 | "midwest_val = \"../src/data/midwest_val_multi_year.csv\"\n", 565 | "midwest_test = \"../src/data/midwest_test_multi_year.csv\"\n", 566 | "midw_train = pd.read_csv(midwest_train)\n", 567 | "midw_val = pd.read_csv(midwest_val)\n", 568 | "midw_test = pd.read_csv(midwest_test)\n" 569 | ] 570 | }, 571 | { 572 | "cell_type": "code", 573 | "execution_count": 62, 574 | "metadata": {}, 575 | "outputs": [ 576 | { 577 | "data": { 578 | "text/html": [ 579 | "
\n", 580 | "\n", 593 | "\n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | "
Unnamed: 0Unnamed: 0.1Unnamed: 0.1.1Unnamed: 0.1.1.1image_fnlabel_fngroup
00002https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...umv_label
111126https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...umv_label
222213https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...umv_label
333316https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...umv_label
44444https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...umv_label
\n", 659 | "
" 660 | ], 661 | "text/plain": [ 662 | " Unnamed: 0 Unnamed: 0.1 Unnamed: 0.1.1 Unnamed: 0.1.1.1 \\\n", 663 | "0 0 0 0 2 \n", 664 | "1 1 1 1 26 \n", 665 | "2 2 2 2 13 \n", 666 | "3 3 3 3 16 \n", 667 | "4 4 4 4 4 \n", 668 | "\n", 669 | " image_fn \\\n", 670 | "0 https://naipblobs.blob.core.windows.net/naip/v... \n", 671 | "1 https://naipblobs.blob.core.windows.net/naip/v... \n", 672 | "2 https://naipblobs.blob.core.windows.net/naip/v... \n", 673 | "3 https://naipblobs.blob.core.windows.net/naip/v... \n", 674 | "4 https://naipblobs.blob.core.windows.net/naip/v... \n", 675 | "\n", 676 | " label_fn group \n", 677 | "0 https://uvmlabels.blob.core.windows.net/detroi... umv_label \n", 678 | "1 https://uvmlabels.blob.core.windows.net/detroi... umv_label \n", 679 | "2 https://uvmlabels.blob.core.windows.net/detroi... umv_label \n", 680 | "3 https://uvmlabels.blob.core.windows.net/detroi... umv_label \n", 681 | "4 https://uvmlabels.blob.core.windows.net/detroi... umv_label " 682 | ] 683 | }, 684 | "execution_count": 62, 685 | "metadata": {}, 686 | "output_type": "execute_result" 687 | } 688 | ], 689 | "source": [ 690 | "midw_train.head()" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": 66, 696 | "metadata": {}, 697 | "outputs": [ 698 | { 699 | "data": { 700 | "text/html": [ 701 | "
\n", 702 | "\n", 715 | "\n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | "
image_fnlabel_fngroup
0https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...umv_label
1https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...umv_label
2https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...umv_label
3https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...umv_label
4https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...umv_label
\n", 757 | "
" 758 | ], 759 | "text/plain": [ 760 | " image_fn \\\n", 761 | "0 https://naipblobs.blob.core.windows.net/naip/v... \n", 762 | "1 https://naipblobs.blob.core.windows.net/naip/v... \n", 763 | "2 https://naipblobs.blob.core.windows.net/naip/v... \n", 764 | "3 https://naipblobs.blob.core.windows.net/naip/v... \n", 765 | "4 https://naipblobs.blob.core.windows.net/naip/v... \n", 766 | "\n", 767 | " label_fn group \n", 768 | "0 https://uvmlabels.blob.core.windows.net/detroi... umv_label \n", 769 | "1 https://uvmlabels.blob.core.windows.net/detroi... umv_label \n", 770 | "2 https://uvmlabels.blob.core.windows.net/detroi... umv_label \n", 771 | "3 https://uvmlabels.blob.core.windows.net/detroi... umv_label \n", 772 | "4 https://uvmlabels.blob.core.windows.net/detroi... umv_label " 773 | ] 774 | }, 775 | "execution_count": 66, 776 | "metadata": {}, 777 | "output_type": "execute_result" 778 | } 779 | ], 780 | "source": [ 781 | "midw_train = midw_train[[\"image_fn\", \"label_fn\", \"group\"]]\n", 782 | "midw_val = midw_val[[\"image_fn\", \"label_fn\", \"group\"]]\n", 783 | "midw_test = midw_test[[\"image_fn\", \"label_fn\", \"group\"]]\n", 784 | "midw_train.head()" 785 | ] 786 | }, 787 | { 788 | "cell_type": "code", 789 | "execution_count": 68, 790 | "metadata": {}, 791 | "outputs": [ 792 | { 793 | "data": { 794 | "text/html": [ 795 | "
\n", 796 | "\n", 809 | "\n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | "
image_fnlabel_fngroup
0https://uvmlabels.blob.core.windows.net/naip4-...https://uvmlabels.blob.core.windows.net/labels...uvm
1https://uvmlabels.blob.core.windows.net/naip4-...https://uvmlabels.blob.core.windows.net/labels...uvm
2https://uvmlabels.blob.core.windows.net/naip4-...https://uvmlabels.blob.core.windows.net/labels...uvm
3https://uvmlabels.blob.core.windows.net/naip4-...https://uvmlabels.blob.core.windows.net/labels...uvm
4https://uvmlabels.blob.core.windows.net/naip4-...https://uvmlabels.blob.core.windows.net/labels...uvm
\n", 851 | "
" 852 | ], 853 | "text/plain": [ 854 | " image_fn \\\n", 855 | "0 https://uvmlabels.blob.core.windows.net/naip4-... \n", 856 | "1 https://uvmlabels.blob.core.windows.net/naip4-... \n", 857 | "2 https://uvmlabels.blob.core.windows.net/naip4-... \n", 858 | "3 https://uvmlabels.blob.core.windows.net/naip4-... \n", 859 | "4 https://uvmlabels.blob.core.windows.net/naip4-... \n", 860 | "\n", 861 | " label_fn group \n", 862 | "0 https://uvmlabels.blob.core.windows.net/labels... uvm \n", 863 | "1 https://uvmlabels.blob.core.windows.net/labels... uvm \n", 864 | "2 https://uvmlabels.blob.core.windows.net/labels... uvm \n", 865 | "3 https://uvmlabels.blob.core.windows.net/labels... uvm \n", 866 | "4 https://uvmlabels.blob.core.windows.net/labels... uvm " 867 | ] 868 | }, 869 | "execution_count": 68, 870 | "metadata": {}, 871 | "output_type": "execute_result" 872 | } 873 | ], 874 | "source": [ 875 | "midwest_data_train = pd.concat([df_train, midw_train])\n", 876 | "midwest_data_train['group'] = \"uvm\"\n", 877 | "\n", 878 | "midwest_data_val = pd.concat([df_val, midw_val])\n", 879 | "midwest_data_val['group'] = \"uvm\" \n", 880 | "\n", 881 | "midwest_data_test = pd.concat([df_test, midw_test])\n", 882 | "midwest_data_test['group'] = \"uvm\" \n", 883 | "midwest_data_train.head()\n" 884 | ] 885 | }, 886 | { 887 | "cell_type": "code", 888 | "execution_count": 69, 889 | "metadata": {}, 890 | "outputs": [ 891 | { 892 | "data": { 893 | "text/html": [ 894 | "
\n", 895 | "\n", 908 | "\n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | "
image_fnlabel_fngroup
0https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...uvm
1https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...uvm
2https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...uvm
3https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...uvm
4https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...uvm
\n", 950 | "
" 951 | ], 952 | "text/plain": [ 953 | " image_fn \\\n", 954 | "0 https://naipblobs.blob.core.windows.net/naip/v... \n", 955 | "1 https://naipblobs.blob.core.windows.net/naip/v... \n", 956 | "2 https://naipblobs.blob.core.windows.net/naip/v... \n", 957 | "3 https://naipblobs.blob.core.windows.net/naip/v... \n", 958 | "4 https://naipblobs.blob.core.windows.net/naip/v... \n", 959 | "\n", 960 | " label_fn group \n", 961 | "0 https://uvmlabels.blob.core.windows.net/detroi... uvm \n", 962 | "1 https://uvmlabels.blob.core.windows.net/detroi... uvm \n", 963 | "2 https://uvmlabels.blob.core.windows.net/detroi... uvm \n", 964 | "3 https://uvmlabels.blob.core.windows.net/detroi... uvm \n", 965 | "4 https://uvmlabels.blob.core.windows.net/detroi... uvm " 966 | ] 967 | }, 968 | "execution_count": 69, 969 | "metadata": {}, 970 | "output_type": "execute_result" 971 | } 972 | ], 973 | "source": [ 974 | "midwest_data_val.head()" 975 | ] 976 | }, 977 | { 978 | "cell_type": "code", 979 | "execution_count": 70, 980 | "metadata": {}, 981 | "outputs": [ 982 | { 983 | "data": { 984 | "text/html": [ 985 | "
\n", 986 | "\n", 999 | "\n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | "
image_fnlabel_fngroup
0https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...uvm
1https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...uvm
2https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...uvm
3https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...uvm
4https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/cuyaho...uvm
\n", 1041 | "
" 1042 | ], 1043 | "text/plain": [ 1044 | " image_fn \\\n", 1045 | "0 https://naipblobs.blob.core.windows.net/naip/v... \n", 1046 | "1 https://naipblobs.blob.core.windows.net/naip/v... \n", 1047 | "2 https://naipblobs.blob.core.windows.net/naip/v... \n", 1048 | "3 https://naipblobs.blob.core.windows.net/naip/v... \n", 1049 | "4 https://naipblobs.blob.core.windows.net/naip/v... \n", 1050 | "\n", 1051 | " label_fn group \n", 1052 | "0 https://uvmlabels.blob.core.windows.net/detroi... uvm \n", 1053 | "1 https://uvmlabels.blob.core.windows.net/detroi... uvm \n", 1054 | "2 https://uvmlabels.blob.core.windows.net/detroi... uvm \n", 1055 | "3 https://uvmlabels.blob.core.windows.net/detroi... uvm \n", 1056 | "4 https://uvmlabels.blob.core.windows.net/cuyaho... uvm " 1057 | ] 1058 | }, 1059 | "execution_count": 70, 1060 | "metadata": {}, 1061 | "output_type": "execute_result" 1062 | } 1063 | ], 1064 | "source": [ 1065 | "midwest_data_test.head()" 1066 | ] 1067 | }, 1068 | { 1069 | "cell_type": "code", 1070 | "execution_count": 71, 1071 | "metadata": {}, 1072 | "outputs": [ 1073 | { 1074 | "data": { 1075 | "text/plain": [ 1076 | "(24, 46, 167)" 1077 | ] 1078 | }, 1079 | "execution_count": 71, 1080 | "metadata": {}, 1081 | "output_type": "execute_result" 1082 | } 1083 | ], 1084 | "source": [ 1085 | "len(midwest_data_test), len(midwest_data_val), len(midwest_data_train)" 1086 | ] 1087 | }, 1088 | { 1089 | "cell_type": "code", 1090 | "execution_count": 72, 1091 | "metadata": {}, 1092 | "outputs": [], 1093 | "source": [ 1094 | "midwest_data_train.to_csv(\"../src/data/midwest_n_devseed_train_multiple_years.csv\")\n", 1095 | "midwest_data_val.to_csv(\"../src/data/midwest_n_devseed_val_multiple_years.csv\")\n", 1096 | "midwest_data_test.to_csv(\"../src/data/midwest_n_devseed_test_multiple_years.csv\")" 1097 | ] 1098 | }, 1099 | { 1100 | "cell_type": "code", 1101 | "execution_count": null, 1102 | "metadata": {}, 1103 | "outputs": [], 1104 | "source": [] 1105 | } 1106 | ], 1107 | "metadata": { 1108 | "interpreter": { 1109 | "hash": "d98913f21c46af07a5e0f9f95dad536eb8e0fcf0c29a960fff7f9f173650b3e5" 1110 | }, 1111 | "kernelspec": { 1112 | "display_name": "Python 3 (ipykernel)", 1113 | "language": "python", 1114 | "name": "python3" 1115 | }, 1116 | "language_info": { 1117 | "codemirror_mode": { 1118 | "name": "ipython", 1119 | "version": 3 1120 | }, 1121 | "file_extension": ".py", 1122 | "mimetype": "text/x-python", 1123 | "name": "python", 1124 | "nbconvert_exporter": "python", 1125 | "pygments_lexer": "ipython3", 1126 | "version": "3.9.10" 1127 | } 1128 | }, 1129 | "nbformat": 4, 1130 | "nbformat_minor": 4 1131 | } 1132 | -------------------------------------------------------------------------------- /naip-utils/naip-label-align.py: -------------------------------------------------------------------------------- 1 | import shapely.geometry 2 | import rasterio 3 | import fiona.transform 4 | import os.path as op 5 | from pathlib import Path 6 | import os 7 | import pandas as pd 8 | import numpy as np 9 | import subprocess 10 | import rtree 11 | import shapely 12 | import click 13 | import urllib.request 14 | import pickle 15 | 16 | 17 | class NAIPTileIndex: 18 | """Utility class for performing NAIP tile lookups by location""" 19 | 20 | NAIP_BLOB_ROOT = "https://naipblobs.blob.core.windows.net/naip/" 21 | NAIP_INDEX_BLOB_ROOT = "https://naipblobs.blob.core.windows.net/naip-index/rtree/" 22 | INDEX_FNS = ["tile_index.dat", "tile_index.idx", "tiles.p"] 23 | 24 | def __init__(self, base_path, verbose=False): 25 | 
"""Loads the tile index into memory (~400 MB) for use by `self.lookup()`. Downloads the index files from the blob container if they do not exist in the `base_path/` directory. 26 | Args: 27 | base_path (str): The path on the local system to look for/store the three files that make up the tile index. This path will be created if it doesn't exist. 28 | verbose (bool): Whether to be verbose when downloading the tile index files 29 | """ 30 | 31 | # Download the index files if it doens't exist 32 | if not os.path.exists(base_path): 33 | os.makedirs(base_path) 34 | for fn in NAIPTileIndex.INDEX_FNS: 35 | if not os.path.exists(os.path.join(base_path, fn)): 36 | download_url( 37 | NAIPTileIndex.NAIP_INDEX_BLOB_ROOT + fn, 38 | os.path.join(base_path, fn), 39 | verbose, 40 | ) 41 | 42 | self.base_path = base_path 43 | self.tile_rtree = rtree.index.Index(base_path + "/tile_index") 44 | self.tile_index = pickle.load(open(base_path + "/tiles.p", "rb")) 45 | 46 | def lookup_point(self, lat, lon): 47 | """Given a lat/lon coordinate pair, return the list of NAIP tiles that *contain* that point. 48 | Args: 49 | lat (float): Latitude in EPSG:4326 50 | lon (float): Longitude in EPSG:4326 51 | Returns: 52 | intersected_files (list): A list of URLs of NAIP tiles that *contain* the given (`lat`, `lon`) point 53 | Raises: 54 | IndexError: Raised if no tile within the index contains the given (`lat`, `lon`) point 55 | """ 56 | 57 | point = shapely.geometry.Point(float(lon), float(lat)) 58 | geom = shapely.geometry.mapping(point) 59 | 60 | return self.lookup_geom(geom) 61 | 62 | def lookup_geom(self, geom): 63 | """Given a GeoJSON geometry, return the list of NAIP tiles that *contain* that feature. 64 | Args: 65 | geom (dict): A GeoJSON geometry in EPSG:4326 66 | Returns: 67 | intersected_files (list): A list of URLs of NAIP tiles that *contain* the given `geom` 68 | Raises: 69 | IndexError: Raised if no tile within the index fully contains the given `geom` 70 | """ 71 | shape = shapely.geometry.shape(geom) 72 | intersected_indices = list(self.tile_rtree.intersection(shape.bounds)) 73 | print(intersected_indices) 74 | 75 | intersected_files = [] 76 | naip_geom = [] 77 | 78 | for idx in intersected_indices: 79 | print(idx) 80 | intersected_file = self.tile_index[idx][0] 81 | print(intersected_file) 82 | intersected_geom = self.tile_index[idx][1] 83 | print(intersected_geom) 84 | if intersected_geom.intersects(shape): 85 | tile_intersection = True 86 | f = NAIPTileIndex.NAIP_BLOB_ROOT + intersected_file 87 | naip_geom.append(intersected_geom) 88 | intersected_files.append( 89 | NAIPTileIndex.NAIP_BLOB_ROOT + intersected_file 90 | ) 91 | 92 | if len(intersected_files) <= 0: 93 | raise IndexError("No tile intersections") 94 | else: 95 | return intersected_files, naip_geom 96 | 97 | 98 | def download_url(url, output_fn, verbose=False): 99 | """Download a URL to file. 
100 | Args: 101 | url (str): URL of file to download 102 | output_fn (str): Filename to save (importantly -- not the directory to save the file to) 103 | verbose (bool): Whether to print how the download is going 104 | Returns: 105 | output_fn (str): Return `output_fn` as is 106 | """ 107 | 108 | if verbose: 109 | print("Downloading file {} to {}".format(os.path.basename(url), output_fn)) 110 | 111 | urllib.request.urlretrieve(url, output_fn) 112 | assert os.path.isfile(output_fn) 113 | 114 | if verbose: 115 | nBytes = os.path.getsize(output_fn) 116 | print("...done, {} bytes.".format(nBytes)) 117 | 118 | return output_fn 119 | 120 | 121 | def get_naip_tiles(label_tif_path): 122 | index = NAIPTileIndex("./tmp/") 123 | 124 | print(label_tif_path) 125 | with rasterio.open(label_tif_path) as f: 126 | geom = shapely.geometry.mapping(shapely.geometry.box(*f.bounds)) 127 | geom = fiona.transform.transform_geom(f.crs.to_string(), "epsg:4326", geom) 128 | 129 | naip_azure_path, naip_lst = index.lookup_geom(geom) 130 | return naip_azure_path 131 | 132 | 133 | def wrap_labels_to_naip(naip_tile_lst, out_dir, large_label_tif): 134 | for tile in naip_tile_lst: 135 | print(tile) 136 | with rasterio.open(tile, "r") as f: 137 | left, bottom, right, top = f.bounds 138 | crs = f.crs.to_string() 139 | height, width = f.height, f.width 140 | out_file = out_dir + tile.split("/")[-1] 141 | print(out_file) 142 | 143 | command = [ 144 | "gdalwarp", 145 | "-overwrite", 146 | "-ot", 147 | "Byte", 148 | "-t_srs", 149 | crs, 150 | "-r", 151 | "near", 152 | "-of", 153 | "GTiff", 154 | "-te", 155 | str(left), 156 | str(bottom), 157 | str(right), 158 | str(top), 159 | "-ts", 160 | str(width), 161 | str(height), 162 | "-co", 163 | "COMPRESS=LZW", 164 | "-co", 165 | "BIGTIFF=YES", 166 | "-dstnodata", 167 | str(0), 168 | large_label_tif, 169 | out_file, 170 | ] 171 | 172 | subprocess.call(command) 173 | print("written") 174 | 175 | 176 | def remove_notdata(in_dir, threshold): 177 | t_lst = [in_dir + t for t in os.listdir(in_dir) if t.endswith(".tif")] 178 | count = 0 179 | for t in t_lst: 180 | with rasterio.open(t) as src: 181 | a = src.read() 182 | if 0 in np.unique(a, return_counts=True)[0]: 183 | black_prop = (np.unique(a, return_counts=True)[1][0]) / ( 184 | a.shape[1] * a.shape[2] 185 | ) 186 | if black_prop < threshold: 187 | print(f"keeping {t}") 188 | count += 1 189 | else: 190 | print(f"too many no data pixels, removing {t}") 191 | os.remove(t) 192 | 193 | 194 | def azure_urls_df(label_dir, naip_lst, label_prefix_azure, group): 195 | l = [f for f in os.listdir(label_dir) if f.endswith(".tif")] 196 | tiles_lst = [] 197 | 198 | for n in l: 199 | tiles_lst.append([f for f in naip_lst if f.endswith(n)][0]) 200 | 201 | labels_azure = [label_prefix_azure + n for n in l] 202 | 203 | df = pd.DataFrame( 204 | list(zip(tiles_lst, labels_azure)), columns=["image_fn", "label_fn"] 205 | ) 206 | df["group"] = group 207 | return df 208 | 209 | 210 | @click.command() 211 | @click.option("--label_tif_path", help="path of input label tif", required=True) 212 | @click.option( 213 | "--out_dir", 214 | help="path for label tifs that align with naip tifs to be written", 215 | required=True, 216 | ) 217 | @click.option( 218 | "--threshold", 219 | help="threshold value for percentage of no data pixels", 220 | type=float, 221 | required=True, 222 | ) 223 | @click.option("--aoi", help="aoi name", type=str, required=True) 224 | @click.option("--group", help="label group name", type=str, required=True) 225 | def main(label_tif_path, 
out_dir, threshold, aoi, group): 226 | # create out_dir if it doesn't exist 227 | Path(out_dir).mkdir(exist_ok=True) 228 | 229 | # get naip tiles that intersect with label tif 230 | naip_azure_paths = get_naip_tiles(label_tif_path) 231 | 232 | # wrap label tiles to naip tiles 233 | wrap_labels_to_naip(naip_azure_paths, out_dir, label_tif_path) 234 | 235 | # remove tiles that have a >= threshold percentage of no data tiles 236 | remove_notdata(out_dir, threshold) 237 | 238 | azure_df = azure_urls_df( 239 | out_dir, 240 | naip_azure_paths, 241 | "https://uvmlabels.blob.core.windows.net/" + aoi + "/", 242 | group, 243 | ) 244 | 245 | train, validate, test = np.split( 246 | azure_df.sample(frac=1, random_state=40), 247 | [int(0.7 * len(azure_df)), int(0.9 * len(azure_df))], 248 | ) 249 | 250 | train.to_csv(out_dir + aoi + "_train" + ".csv") 251 | validate.to_csv(out_dir + aoi + "_val" ".csv") 252 | test.to_csv(out_dir + aoi + "_test" + ".csv") 253 | 254 | 255 | if __name__ == "__main__": 256 | main() 257 | -------------------------------------------------------------------------------- /naip-utils/naip_download_pc.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "256da363-a549-4c9d-84c4-3829fe7fd80d", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import os\n", 11 | "from os import makedirs, path as op\n", 12 | "import json\n", 13 | "from typing import Collection, Tuple \n", 14 | "from pystac_client import Client\n", 15 | "import planetary_computer as pc\n", 16 | "from rio_tiler.io import COGReader\n", 17 | "from shapely.geometry import shape" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "id": "e3860bbd-3b5b-4aab-bced-c9e5fb4536e0", 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "# !pip install rio_tiler -U" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "id": "a665d23a-1ba3-4a4a-860d-955830756b8d", 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "def download_NAIP(item, fn, area_of_interest):\n", 38 | " \"\"\"\n", 39 | " Download NAIP imagery from Planetary Computer\n", 40 | " \n", 41 | " Parameters:\n", 42 | " ___\n", 43 | "\n", 44 | " inputs:\n", 45 | " item: specific item in the STAC collection,\n", 46 | " fn: given file name\n", 47 | " area_of_interest: geometry of the AOI\n", 48 | " \n", 49 | " Returns:\n", 50 | " (None): writen COG of NAIP imagery that intersect with the given AOI\n", 51 | " \"\"\"\n", 52 | " print(item.datetime)\n", 53 | " href = pc.sign(item.assets[\"image\"].href)\n", 54 | " with COGReader(href) as cog:\n", 55 | " data = cog.feature(area_of_interest, max_size=None, indexes=(1, 2, 3, 4))\n", 56 | " \n", 57 | " with open(fn, \"wb\") as f:\n", 58 | " img = data.render(img_format=\"GTiff\", add_mask=False)\n", 59 | " f.write(img)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 4, 65 | "id": "9cefa6db-a896-4ad7-bfb1-46ea1c9d606d", 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "def main(aoi, date_range, out_dir):\n", 70 | " \n", 71 | " \"\"\"\n", 72 | " Download NAIP imagery from Planetary Computer\n", 73 | " \n", 74 | " Parameters:\n", 75 | " ___\n", 76 | "\n", 77 | " inputs:\n", 78 | " aoi: the path to the aoi,\n", 79 | " date_range: given date range to download images, e.g. 
2010-01-01/2021-12-01\n", 80 | " out_dir: given output direct to save imagery\n", 81 | " \n", 82 | " Returns:\n", 83 | " (None): all writen COG of NAIP imagery that intersect with the given AOIs\n", 84 | " \"\"\"\n", 85 | " \n", 86 | " catelog = Client.open(\"https://planetarycomputer.microsoft.com/api/stac/v1\")\n", 87 | " #read in aoi\n", 88 | " with open(aoi) as f:\n", 89 | " feature = json.load(f)[\"features\"]\n", 90 | " # assuming this is only one geomery feature of an bounding box\n", 91 | " area_of_interest = feature[0][\"geometry\"]\n", 92 | " search_imagery = catelog.search(\n", 93 | " collections=[\"naip\"], intersects=area_of_interest, datetime=date_range\n", 94 | " )\n", 95 | " items = list(search_imagery.get_items())\n", 96 | " print(f\"{len(items)} items found in the {date_range} range for {aoi}!\")\n", 97 | " for item in items:\n", 98 | " if not op.exists(out_dir):\n", 99 | " makedirs(out_dir)\n", 100 | " fn = f\"{out_dir}/{str(item.datetime)[:10]}_naip_{aoi}.tif\"\n", 101 | " download_NAIP(item, fn, area_of_interest)\n", 102 | " " 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 5, 108 | "id": "2ca669bd-3d5b-496b-812f-74b07f6aa723", 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "aois = [\"aoi0_bounds.geojson\", \"aoi1_bounds.geojson\", \"aoi2_bounds.geojson\"]" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 21, 118 | "id": "d61764c7-26fd-4564-b8db-5cd6d1168d80", 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "date_range=\"2010-01-01/2021-12-01\"" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 22, 128 | "id": "bae88e20-21ff-47ef-82a2-4342cd903800", 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | "name": "stdout", 133 | "output_type": "stream", 134 | "text": [ 135 | "4 items found in the 2010-01-01/2021-12-01 range for aoi0_bounds.geojson!\n", 136 | "2018-07-06 00:00:00+00:00\n", 137 | "2016-08-03 00:00:00+00:00\n", 138 | "2014-06-28 00:00:00+00:00\n", 139 | "2012-06-29 00:00:00+00:00\n", 140 | "8 items found in the 2010-01-01/2021-12-01 range for aoi1_bounds.geojson!\n", 141 | "2018-07-07 00:00:00+00:00\n", 142 | "2018-07-07 00:00:00+00:00\n", 143 | "2016-08-03 00:00:00+00:00\n", 144 | "2016-08-03 00:00:00+00:00\n", 145 | "2014-06-28 00:00:00+00:00\n", 146 | "2014-06-28 00:00:00+00:00\n", 147 | "2012-07-02 00:00:00+00:00\n", 148 | "2012-06-29 00:00:00+00:00\n", 149 | "4 items found in the 2010-01-01/2021-12-01 range for aoi2_bounds.geojson!\n", 150 | "2018-07-07 00:00:00+00:00\n", 151 | "2016-08-03 00:00:00+00:00\n", 152 | "2014-06-28 00:00:00+00:00\n", 153 | "2012-06-29 00:00:00+00:00\n" 154 | ] 155 | } 156 | ], 157 | "source": [ 158 | "out_dir=\"naip_downloaded_20211020\"\n", 159 | "for aoi in aois:\n", 160 | " main(aoi, date_range, out_dir)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "id": "71612f0f-5600-4c6e-9256-7c9f57e9a7ea", 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [] 170 | } 171 | ], 172 | "metadata": { 173 | "kernelspec": { 174 | "display_name": "Python 3 (ipykernel)", 175 | "language": "python", 176 | "name": "python3" 177 | }, 178 | "language_info": { 179 | "codemirror_mode": { 180 | "name": "ipython", 181 | "version": 3 182 | }, 183 | "file_extension": ".py", 184 | "mimetype": "text/x-python", 185 | "name": "python", 186 | "nbconvert_exporter": "python", 187 | "pygments_lexer": "ipython3", 188 | "version": "3.9.10" 189 | } 190 | }, 191 | "nbformat": 4, 192 | 
"nbformat_minor": 5 193 | } 194 | -------------------------------------------------------------------------------- /pytorch-env.yml: -------------------------------------------------------------------------------- 1 | name: pytorch-env 2 | channels: 3 | - defaults 4 | - pytorch 5 | dependencies: 6 | - python=3.8 7 | - pytorch==1.4.0 8 | - torchvision 9 | - numpy 10 | - pandas 11 | - tifffile 12 | - matplotlib 13 | - pip 14 | - pip: 15 | - azureml-sdk 16 | - rasterio 17 | - fiona 18 | - segmentation-models-pytorch 19 | - scikit-learn 20 | - rio-tiler 21 | - mercantile 22 | - matplotlib 23 | - seaborn 24 | - tqdm 25 | -------------------------------------------------------------------------------- /src/calculate_image_stats_dir.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2020 Caleb Robinson 6 | # 7 | """Script for calculating per channel means and stdevs from a list of COGs 8 | """ 9 | import sys 10 | import os 11 | 12 | env = dict( 13 | GDAL_DISABLE_READDIR_ON_OPEN="EMPTY_DIR", 14 | AWS_NO_SIGN_REQUEST="YES", 15 | GDAL_MAX_RAW_BLOCK_CACHE_SIZE="200000000", 16 | GDAL_SWATH_SIZE="200000000", 17 | VSI_CURL_CACHE_SIZE="200000000", 18 | ) 19 | os.environ.update(env) 20 | import time 21 | 22 | import argparse 23 | import numpy as np 24 | import rasterio 25 | import pandas as pd 26 | import glob 27 | 28 | 29 | def stats(verbose, input_fn, output_dir, num_samples_per_file, num_files, nodata): 30 | 31 | # ----------------------------------- 32 | with open(input_fn, "r") as f: 33 | fns = f.read().strip().split("\n") 34 | if verbose: 35 | print("Found %d files" % (len(fns))) 36 | 37 | if num_files is not None: 38 | assert num_files <= len( 39 | fns 40 | ), "If you are going to sub-sample from the filelist, then you must specify a number of files less than the total number of files." 
41 | np.random.shuffle(fns) 42 | fns = fns[:num_files] 43 | if verbose: 44 | print("...but only using %d of them" % (len(fns))) 45 | 46 | # ----------------------------------- 47 | sampled_pixels = [] 48 | 49 | if verbose: 50 | print("Sampling %d pixels per tile" % (num_samples_per_file)) 51 | 52 | with rasterio.open(fns[0]) as f: 53 | num_channels = f.count 54 | 55 | tic = time.time() 56 | for i, fn in enumerate(fns): 57 | if i % 10 == 0 and verbose: 58 | print("%d/%d\t%0.2f seconds" % (i + 1, len(fns), time.time() - tic)) 59 | tic = time.time() 60 | 61 | with rasterio.open(fn) as f: 62 | data = f.read().reshape(num_channels, -1) 63 | 64 | mask = np.sum(data == nodata, axis=0) == num_channels 65 | data = data[:, ~mask] 66 | num_samples = min(num_samples_per_file, data.shape[1]) 67 | idxs = np.random.choice(data.shape[1], size=num_samples) 68 | 69 | pixels = data[:, idxs] 70 | sampled_pixels.append(pixels) 71 | 72 | sampled_pixels = np.concatenate(sampled_pixels, axis=1) 73 | means = sampled_pixels.mean(axis=1, dtype=np.float64) 74 | stdevs = sampled_pixels.std(axis=1, dtype=np.float64) 75 | 76 | # ----------------------------------- 77 | 78 | print(type(means)) 79 | print(type(means[0])) 80 | if output_dir is not None: 81 | # with open(args.output_fn, "w") as f: 82 | # f.write("%s\n" % (means)) 83 | # f.write("%s\n" % (stdevs)) 84 | df = pd.DataFrame.from_dict({"means": means, "stdevs": stdevs}) 85 | df.to_csv( 86 | output_dir + "/" + os.path.splitext(os.path.basename(input_fn))[0] + ".csv" 87 | ) 88 | 89 | df2 = pd.DataFrame.from_dict( 90 | { 91 | "name": os.path.splitext(os.path.basename(input_fn))[0], 92 | "means": [means], 93 | "stdevs": [stdevs], 94 | } 95 | ) 96 | df2.to_csv( 97 | output_dir 98 | + "/" 99 | + os.path.splitext(os.path.basename(input_fn))[0] 100 | + "_2.csv" 101 | ) 102 | 103 | means = ",".join(["%0.4f" % (val) for val in means]) 104 | stdevs = ",".join(["%0.4f" % (val) for val in stdevs]) 105 | 106 | if verbose: 107 | print("Means:", means) 108 | print("Stdevs:", stdevs) 109 | 110 | elif not verbose: 111 | print(means) 112 | print(stdevs) 113 | 114 | 115 | def main(): 116 | parser = argparse.ArgumentParser(description="Image statistic calculation script") 117 | 118 | parser.add_argument( 119 | "-v", 120 | "--verbose", 121 | action="store_true", 122 | help="Enable verbose debugging", 123 | default=False, 124 | ) 125 | parser.add_argument( 126 | "--input_dir", 127 | action="store", 128 | type=str, 129 | help="Path to filelist. 
Filenames should be readable by rasterio.", 130 | required=True, 131 | ) 132 | parser.add_argument( 133 | "--output_dir", 134 | action="store", 135 | type=str, 136 | help="Filename to write (if this is not set, then we print the results to stdout)", 137 | default=None, 138 | ) 139 | parser.add_argument( 140 | "--num_samples_per_file", 141 | action="store", 142 | type=int, 143 | help="Filename to write", 144 | default=10000, 145 | ) 146 | parser.add_argument( 147 | "--num_files", 148 | action="store", 149 | type=int, 150 | help="Number of files to subsample", 151 | default=None, 152 | ) 153 | parser.add_argument( 154 | "--nodata", 155 | action="store", 156 | type=int, 157 | help="The nodata value to check (we assume that if each band in the data equals this value, then the position is nodata)", 158 | default=0, 159 | ) 160 | 161 | args = parser.parse_args(sys.argv[1:]) 162 | 163 | f_lst = [ 164 | args.input_dir + x for x in os.listdir(args.input_dir) if not x.startswith(".") 165 | ] 166 | print("f_list : ", f_lst) 167 | print("number of files to process: ", len(f_lst)) 168 | 169 | for i, f in enumerate(f_lst): 170 | print(i) 171 | stats( 172 | args.verbose, 173 | f, 174 | args.output_dir, 175 | args.num_samples_per_file, 176 | args.num_files, 177 | args.nodata, 178 | ) 179 | 180 | # Combine all dataframe together into master dataframe 181 | df_all = pd.concat( 182 | map(pd.read_csv, glob.glob(os.path.join(args.output_dir, "*_2.csv"))) 183 | ) 184 | df_all.to_csv(args.output_dir + "/" "all_stats.csv") 185 | 186 | 187 | if __name__ == "__main__": 188 | main() 189 | -------------------------------------------------------------------------------- /src/chips.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import mercantile 4 | import rio_tiler 5 | import tqdm 6 | from rio_tiler.io import COGReader 7 | from rio_tiler.models import ImageData 8 | import argparse 9 | 10 | parser = argparse.ArgumentParser(description="train/test/val csv creation script") 11 | parser.add_argument("--input_csv", type=str, required=True, help="") 12 | parser.add_argument("--output_dir", type=str, required=True, help="") 13 | args = parser.parse_args() 14 | 15 | 16 | def main(): 17 | df = pd.read_csv(args.input_csv) 18 | print(df.shape) 19 | img_path_lst = [] 20 | label_path_lst = [] 21 | for i, img in enumerate(tqdm.tqdm(df["image_fn"])): 22 | with COGReader(img) as cog: 23 | t_lst = [t for t in mercantile.tiles(*cog.bounds, 17)] 24 | print(len(t_lst)) 25 | 26 | with COGReader(df["label_fn"][i]) as cog_label: 27 | # chip NAIP image 28 | for t in t_lst: # fix 29 | img = cog.tile(t.x, t.y, t.z, tilesize=256) 30 | img_r = img.render(img_format="GTiff") 31 | 32 | img_arr = np.moveaxis(img.data, 0, -1) 33 | img_arr = img_arr - np.min(img_arr, (0, 1)) 34 | data_max_val = np.percentile(img_arr, 0.98, axis=(0, 1)) 35 | img_arr = img_arr / data_max_val * 255.0 36 | np.clip(img_arr, None, 255.0, img_arr) 37 | non_nodata_prop = np.sum(np.mean(img_arr, -1) > 0.0) / (256 * 256) 38 | 39 | if non_nodata_prop >= 0.95: 40 | # Can be replaced with Azure blob path 41 | path = f"{args.output_dir}/{t.x}-{t.y}-{t.z}-img.tif" 42 | path_label = f"{args.output_dir}/{t.x}-{t.y}-{t.z}-label.tif" 43 | with open(path, "wb") as f: 44 | f.write(img_r) 45 | img_label = cog_label.tile(t.x, t.y, t.z, tilesize=256) 46 | buff = img_label.render(img_format="GTiff") 47 | with open(path_label, "wb") as f: 48 | f.write(buff) 49 | img_path_lst.append(path) 50 | 
label_path_lst.append(path_label) 51 | else: 52 | print("removed too many no-data pixels") 53 | df_chips = pd.DataFrame( 54 | list(zip(img_path_lst, label_path_lst)), columns=["image_fn", "label_fn"] 55 | ) 56 | df_chips["group"] = df["group"][0] 57 | 58 | out_csv = args.output_dir + "/" + df["group"][0] + "_256chips" + ".csv" 59 | df_chips.to_csv(out_csv) 60 | 61 | 62 | if __name__ == "__main__": 63 | main() 64 | -------------------------------------------------------------------------------- /src/cls_distribution.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | os.environ[ 5 | "CURL_CA_BUNDLE" 6 | ] = "/etc/ssl/certs/ca-certificates.crt" # A workaround in case this happens: https://github.com/mapbox/rasterio/issues/1289 7 | import time 8 | import datetime 9 | import argparse 10 | import copy 11 | 12 | import numpy as np 13 | import pandas as pd 14 | import json 15 | import utils 16 | 17 | import rasterio 18 | from rasterio.windows import Window 19 | from rasterio.errors import RasterioError, RasterioIOError 20 | 21 | from transforms_utils import ( 22 | labels_transform_uvm_8cls, 23 | ) 24 | 25 | import matplotlib 26 | import matplotlib.pyplot as plt 27 | 28 | matplotlib.use("Agg") 29 | 30 | import seaborn as sns 31 | 32 | sns.set() 33 | 34 | import torch 35 | 36 | NUM_WORKERS = 6 37 | CHIP_SIZE = 256 38 | 39 | parser = argparse.ArgumentParser( 40 | description="Minic streaming label data to create class distribution" 41 | ) 42 | parser.add_argument( 43 | "--input_fn", 44 | type=str, 45 | required=True, 46 | help='The path to a CSV file containing three columns -- "image_fn", "label_fn", and "group" -- that point to tiles of imagery and labels as well as which "group" each tile is in.', 47 | ) 48 | parser.add_argument( 49 | "--label_transform", 50 | default="naip", 51 | required=True, 52 | help="str either naip or epa to indicate how to transform labels", 53 | ) 54 | parser.add_argument( 55 | "--output_dir", 56 | type=str, 57 | required=True, 58 | help="The path to store class distribution.", 59 | ) 60 | parser.add_argument( 61 | "--overwrite", 62 | action="store_true", 63 | help="Flag for overwriting `output_dir` if that directory already exists.", 64 | ) 65 | ## Training arguments to generate class distribution 66 | parser.add_argument( 67 | "--batch_size", type=int, default=16, help="Batch size to use for training" 68 | ) 69 | parser.add_argument( 70 | "--num_epochs", type=int, default=1, help="Number of epochs to train for" 71 | ) 72 | parser.add_argument( 73 | "--seed", type=int, default=0, help="Random seed to pass to numpy and torch" 74 | ) 75 | parser.add_argument( 76 | "--num_classes", type=int, default=10, help="number of classes in dataset" 77 | ) 78 | parser.add_argument( 79 | "--num_chips", 80 | type=int, 81 | default=40, 82 | help="number of chips to randomly sample from data", 83 | ) 84 | args = parser.parse_args() 85 | 86 | 87 | def stream_tile_fns(NUM_WORKERS, label_fns, groups): 88 | worker_info = torch.utils.data.get_worker_info() 89 | if ( 90 | worker_info is None 91 | ): # In this case we are not loading through a DataLoader with multiple workers 92 | worker_id = 0 93 | num_workers = 1 94 | else: 95 | worker_id = worker_info.id 96 | num_workers = worker_info.NUM_WORKERS 97 | 98 | # We only want to shuffle the order we traverse the files if we are the first worker (else, every worker will shuffle the files...) 
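    # (The chunking below gives each worker a disjoint slice of the filenames;
    # e.g. with 10 label files and 4 workers, ceil(10 / 4) = 3 files per worker:
    # workers 0-2 read files [0:3], [3:6], [6:9] and worker 3 reads the last file.)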
99 | if worker_id == 0: 100 | np.random.shuffle(label_fns) # in place 101 | # This logic splits up the list of filenames into `num_workers` chunks. Each worker will recieve ceil(num_filenames / num_workers) filenames to generate chips from. If the number of workers doesn't divide the number of filenames evenly then the last worker will have fewer filenames. 102 | N = len(label_fns) 103 | num_files_per_worker = int(np.ceil(N / num_workers)) 104 | lower_idx = worker_id * num_files_per_worker 105 | upper_idx = min(N, (worker_id + 1) * num_files_per_worker) 106 | for idx in range(lower_idx, upper_idx): 107 | 108 | label_fn = None 109 | # if self.use_labels: 110 | label_fn = label_fns[idx] 111 | group = groups[idx] 112 | print(label_fn) 113 | 114 | yield label_fn, group 115 | 116 | 117 | def stream_chips( 118 | num_workers, 119 | label_fns, 120 | num_chips_per_tile, 121 | groups, 122 | CHIP_SIZE, 123 | windowed_sampling, 124 | nodata_check, 125 | label_transform, 126 | verbose, 127 | ): 128 | for label_fn, group in stream_tile_fns(num_workers, label_fns, groups): 129 | num_skipped_chips = 0 130 | # Open file pointers 131 | label_fp = rasterio.open(label_fn, "r") 132 | 133 | # if use_labels: # garuntee that our label mask has the same dimensions as our imagery 134 | t_height, t_width = label_fp.shape 135 | print("Height and width of the label are:") 136 | print(t_height, t_width) 137 | 138 | # If we aren't in windowed sampling mode then we should read the entire tile up front 139 | label_data = None 140 | try: 141 | if not windowed_sampling: 142 | label_data = ( 143 | label_fp.read().squeeze() 144 | ) # assume the label geotiff has a single channel 145 | except RasterioError as e: 146 | print("WARNING: Error reading in entire file, skipping to the next file") 147 | continue 148 | 149 | for i in range(num_chips_per_tile): 150 | # Select the top left pixel of our chip randomly 151 | x = np.random.randint(0, t_width - CHIP_SIZE) 152 | y = np.random.randint(0, t_height - CHIP_SIZE) 153 | 154 | # Read labels 155 | labels = None 156 | if windowed_sampling: 157 | try: 158 | labels = label_fp.read( 159 | window=Window(x, y, CHIP_SIZE, CHIP_SIZE) 160 | ).squeeze() 161 | except RasterioError: 162 | print( 163 | "WARNING: Error reading chip from file, skipping to the next chip" 164 | ) 165 | continue 166 | else: 167 | labels = label_data[y : y + CHIP_SIZE, x : x + CHIP_SIZE] 168 | 169 | # # Check for no data 170 | if nodata_check is not None: 171 | skip_chip = nodata_check(labels) 172 | 173 | if ( 174 | skip_chip 175 | ): # The current chip has been identified as invalid by the `nodata_check(...)` method 176 | num_skipped_chips += 1 177 | continue 178 | if label_transform is not None: 179 | labels = label_transform(labels, group) 180 | else: 181 | labels = torch.from_numpy(labels).squeeze() 182 | print(labels) 183 | return labels 184 | label_fp.close() 185 | # 186 | if num_skipped_chips > 0 and verbose: 187 | print("We skipped %d chips on %s" % (label_fn)) 188 | 189 | 190 | def label_transforms_naip(labels, group): 191 | labels = np.array(labels).astype(np.int64) 192 | labels = np.where(labels == 14, 0, labels) # to no data 193 | labels = np.where(labels == 15, 0, labels) # to no data 194 | labels = np.where(labels == 13, 0, labels) # to no data 195 | labels = np.where(labels == 10, 3, labels) # to tree canopy 196 | labels = np.where(labels == 11, 3, labels) # to tree canopy 197 | labels = np.where(labels == 12, 3, labels) # to tree canopy 198 | return labels 199 | 200 | 201 | def 
label_transforms_epa(labels, group): 202 | labels = np.array(labels).astype(np.int64) 203 | labels_new = np.copy(labels) 204 | for k, v in utils.epa_label_dict.items(): 205 | labels_new[labels == k] = v 206 | labels_new = torch.from_numpy(labels_new) 207 | return labels_new 208 | 209 | 210 | def label_transforms_uvm(labels, group): 211 | labels = np.array(labels).astype(np.int64) 212 | labels_new = np.copy(labels) 213 | for k, v in utils.uvm_7cls.items(): 214 | labels_new[labels == k] = v 215 | labels_new = torch.from_numpy(labels_new) 216 | return labels_new 217 | 218 | 219 | def nodata_check(labels): 220 | return np.any(labels == 0) 221 | 222 | 223 | def class_distribute(): 224 | print( 225 | "Starting DFC2021 baseline training script at %s" 226 | % (str(datetime.datetime.now())) 227 | ) 228 | num_chips_per_tile = args.num_chips 229 | windowed_sampling = False 230 | label_transform = args.label_transform 231 | nodata_check = None 232 | verbose = True 233 | all_labels = [] 234 | 235 | if args.label_transform == "naip": 236 | label_transform = label_transforms_naip 237 | class_names = [ 238 | "no_data", 239 | "water", 240 | "emergent_wetlands", 241 | "tree_canopy", 242 | "shrubland", 243 | "low_vegetation", 244 | "barren", 245 | "structure", 246 | "impervious_surface", 247 | "impervious_roads", 248 | "weighted_avg", 249 | ] 250 | elif args.label_transform == "epa": 251 | label_transform = label_transforms_epa 252 | class_names = [ 253 | "no_data", 254 | "impervious", 255 | "soil_barren", 256 | "grass", 257 | "tree/forest", 258 | "water", 259 | "shrub", 260 | "woody_wetlands", 261 | "emergent_wetlands", 262 | "agriculture", 263 | "orchard", 264 | "weighted_avg", 265 | ] 266 | elif args.label_transform == "uvm": 267 | label_transform = label_transforms_uvm 268 | class_names = [ 269 | "tree", 270 | "grass", 271 | "bare soil", 272 | "water", 273 | "buildings", 274 | "roads", 275 | "other impervious", 276 | ] 277 | elif args.label_transform == "uvm8cls": 278 | label_transform = labels_transform_uvm_8cls 279 | class_names = [ 280 | "tree", 281 | "grass", 282 | "bare soil", 283 | "water", 284 | "buildings", 285 | "roads", 286 | "other impervious", 287 | ] 288 | else: 289 | raise ValueError("Invalid label transform") 290 | # ------------------- 291 | # Setup 292 | # ------------------- 293 | assert os.path.exists(args.input_fn) 294 | 295 | if os.path.isfile(args.output_dir): 296 | print("A file was passed as `--output_dir`, please pass a directory!") 297 | return 298 | 299 | if os.path.exists(args.output_dir) and len(os.listdir(args.output_dir)): 300 | if args.overwrite: 301 | print( 302 | "WARNING! The output directory, %s, already exists, we might overwrite data in it!" 303 | % (args.output_dir) 304 | ) 305 | else: 306 | print( 307 | "The output directory, %s, already exists and isn't empty. We don't want to overwrite and existing results, exiting..." 308 | % (args.output_dir) 309 | ) 310 | return 311 | else: 312 | print("The output directory doesn't exist or is empty.") 313 | os.makedirs(args.output_dir, exist_ok=True) 314 | 315 | if torch.cuda.is_available(): 316 | device = torch.device("cuda:%d" % args.gpu) 317 | else: 318 | print("WARNING! 
Torch is reporting that CUDA isn't available, using cpu") 319 | device = "cpu" 320 | 321 | np.random.seed(args.seed) 322 | torch.manual_seed(args.seed) 323 | 324 | # ------------------- 325 | # Load input data 326 | # ------------------- 327 | input_dataframe = pd.read_csv(args.input_fn) 328 | print(input_dataframe.head()) 329 | label_fns = input_dataframe["label_fn"].values 330 | groups = input_dataframe["group"].values 331 | print(label_fns) 332 | 333 | print(args.label_transform) 334 | if args.label_transform == "naip": 335 | label_transform = label_transforms_naip 336 | elif args.label_transform == "epa": 337 | label_transform = label_transforms_epa 338 | elif args.label_transform == "uvm": 339 | label_transform = label_transforms_uvm 340 | else: 341 | raise ValueError("Invalid label transform") 342 | 343 | num_training_batches_per_epoch = int( 344 | len(label_fns) * args.num_chips / args.batch_size 345 | ) 346 | 347 | # getting label chips stac by given model epochs 348 | # is num_chips_per_tile the num_training_batches_per_epoch 349 | for epoch in range(args.num_epochs): 350 | for num_batches in range(num_training_batches_per_epoch): 351 | try: 352 | labels = stream_chips( 353 | NUM_WORKERS, 354 | label_fns, 355 | num_chips_per_tile, 356 | groups, 357 | CHIP_SIZE, 358 | windowed_sampling, 359 | nodata_check, 360 | label_transform, 361 | verbose, 362 | ) 363 | all_labels.append(labels) 364 | except Exception as ex: 365 | print(ex) 366 | pass 367 | label_arr = np.array([t.numpy() for t in all_labels]) 368 | 369 | # ------------------- 370 | # Plot classes distribution 371 | # ------------------- 372 | fig, ax = plt.subplots(figsize=(30, 10)) 373 | fig.tight_layout() 374 | unique, counts = np.unique(label_arr, return_counts=True) 375 | cls_dict = dict(zip(range(len(class_names)), class_names)) 376 | vc_out = dict(zip(unique, counts / args.num_epochs)) # 10 epoaches 377 | vc2df = dict(zip(cls_dict.values(), vc_out.values())) 378 | df = pd.DataFrame.from_dict(vc2df, orient="index", columns=["count"]) 379 | 380 | ax = sns.barplot(x=df.index, y="count", data=df) 381 | ax.set_ylabel(f"Class count") 382 | ax.set_xlabel(f"Classe name") 383 | ax.set_xticklabels(labels=ax.get_xticklabels(), rotation=45, ha="right") 384 | plt.tight_layout() 385 | fig.savefig(os.path.join(args.output_dir, "cls_distribution.png")) 386 | csv_fn = "output_cls_counts_and_values.csv" 387 | df.to_csv(os.path.join(args.output_dir, csv_fn)) 388 | 389 | 390 | if __name__ == "__main__": 391 | class_distribute() 392 | -------------------------------------------------------------------------------- /src/dataloaders/StreamingDatasets.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import numpy as np 4 | 5 | import rasterio 6 | from rasterio.windows import Window 7 | from rasterio.errors import RasterioError, RasterioIOError 8 | 9 | import torch 10 | from torchvision import transforms 11 | from torch.utils.data.dataset import IterableDataset 12 | 13 | 14 | class StreamingGeospatialDataset(IterableDataset): 15 | def __init__( 16 | self, 17 | imagery_fns, 18 | label_fns=None, 19 | groups=None, 20 | chip_size=256, 21 | num_chips_per_tile=200, 22 | windowed_sampling=False, 23 | image_transform=None, 24 | label_transform=None, 25 | nodata_check=None, 26 | verbose=True, 27 | ): 28 | """A torch Dataset for randomly sampling chips from a list of tiles. 
When used in conjunction with a DataLoader that has `num_workers>1` this Dataset will assign each worker to sample chips from disjoint sets of tiles. 29 | Args: 30 | imagery_fns: A list of filenames (or URLS -- anything that `rasterio.open()` can read) pointing to imagery tiles. 31 | label_fns: A list of filenames of the same size as `imagery_fns` pointing to label mask tiles or `None` if the Dataset should operate in "imagery only mode". Note that we expect `imagery_fns[i]` and `label_fns[i]` to have the same dimension and coordinate system. 32 | groups: Optional: A list of integers of the same size as `imagery_fns` that gives the "group" membership of each tile. This can be used to normalize imagery from different groups differently. 33 | chip_size: Desired size of chips (in pixels). 34 | num_chips_per_tile: Desired number of chips to sample for each tile. 35 | windowed_sampling: Flag indicating whether we should sample each chip with a read using `rasterio.windows.Window` or whether we should read the whole tile into memory, then sample chips. 36 | image_transform: A function to apply to each image chip object. If this is `None`, then the only transformation applied to the loaded imagery will be to convert it to a `torch.Tensor`. If this is not `None`, then the function should return a `Torch.tensor`. Further, if `groups` is not `None` then the transform function should expect the imagery as the first argument and the group as the second argument. 37 | label_transform: Similar to image_transform, but applied to label chips. 38 | nodata_check: A method that will check an `(image_chip)` or `(image_chip, label_chip)` (if `label_fns` are provided) and return whether or not the chip should be skipped. This can be used, for example, to skip chips that contain nodata values. 39 | verbose: If `False` we will be quiet. 40 | """ 41 | 42 | if label_fns is None: 43 | self.fns = imagery_fns 44 | self.use_labels = False 45 | else: 46 | self.fns = list(zip(imagery_fns, label_fns)) 47 | self.use_labels = True 48 | 49 | self.groups = groups 50 | 51 | self.chip_size = chip_size 52 | self.num_chips_per_tile = num_chips_per_tile 53 | self.windowed_sampling = windowed_sampling 54 | 55 | self.image_transform = image_transform 56 | self.label_transform = label_transform 57 | self.nodata_check = nodata_check 58 | 59 | self.verbose = verbose 60 | 61 | if self.verbose: 62 | print("Constructed StreamingGeospatialDataset") 63 | 64 | def stream_tile_fns(self): 65 | worker_info = torch.utils.data.get_worker_info() 66 | if ( 67 | worker_info is None 68 | ): # In this case we are not loading through a DataLoader with multiple workers 69 | worker_id = 0 70 | num_workers = 1 71 | else: 72 | worker_id = worker_info.id 73 | num_workers = worker_info.num_workers 74 | 75 | # We only want to shuffle the order we traverse the files if we are the first worker (else, every worker will shuffle the files...) 76 | if worker_id == 0: 77 | np.random.shuffle(self.fns) # in place 78 | # NOTE: A warning, when different workers are created they will all have the same numpy random seed, however will have different torch random seeds. If you want to use numpy random functions, seed appropriately. 79 | # seed = torch.randint(low=0,high=2**32-1,size=(1,)).item() 80 | # np.random.seed(seed) # when different workers spawn, they have the same numpy random seed... 81 | 82 | if self.verbose: 83 | print("Creating a filename stream for worker %d" % (worker_id)) 84 | 85 | # This logic splits up the list of filenames into `num_workers` chunks. 
Each worker will recieve ceil(num_filenames / num_workers) filenames to generate chips from. If the number of workers doesn't divide the number of filenames evenly then the last worker will have fewer filenames. 86 | N = len(self.fns) 87 | num_files_per_worker = int(np.ceil(N / num_workers)) 88 | lower_idx = worker_id * num_files_per_worker 89 | upper_idx = min(N, (worker_id + 1) * num_files_per_worker) 90 | for idx in range(lower_idx, upper_idx): 91 | 92 | label_fn = None 93 | if self.use_labels: 94 | img_fn, label_fn = self.fns[idx] 95 | else: 96 | img_fn = self.fns[idx] 97 | 98 | if self.groups is not None: 99 | group = self.groups[idx] 100 | else: 101 | group = None 102 | 103 | if self.verbose: 104 | print("Worker %d, yielding file %d" % (worker_id, idx)) 105 | 106 | yield (img_fn, label_fn, group) 107 | 108 | def stream_chips(self): 109 | for img_fn, label_fn, group in self.stream_tile_fns(): 110 | num_skipped_chips = 0 111 | 112 | # Open file pointers 113 | img_fp = rasterio.open(img_fn, "r") 114 | print(img_fn) 115 | label_fp = rasterio.open(label_fn, "r") if self.use_labels else None 116 | print(label_fn) 117 | 118 | print("label shape: ", label_fp.shape) 119 | print("label height: ", label_fp.shape[0]) 120 | print("label width: ", label_fp.shape[1]) 121 | 122 | height, width = img_fp.shape 123 | print("image height: ", height) 124 | print("image width: ", width) 125 | 126 | if ( 127 | self.use_labels 128 | ): # garuntee that our label mask has the same dimensions as our imagery 129 | t_height, t_width = label_fp.shape 130 | assert height == t_height and width == t_width 131 | 132 | # If we aren't in windowed sampling mode then we should read the entire tile up front 133 | img_data = None 134 | label_data = None 135 | try: 136 | if not self.windowed_sampling: 137 | img_data = np.rollaxis(img_fp.read(), 0, 3) 138 | if self.use_labels: 139 | label_data = ( 140 | label_fp.read().squeeze() 141 | ) # assume the label geotiff has a single channel 142 | except RasterioError as e: 143 | print( 144 | "WARNING: Error reading in entire file, skipping to the next file" 145 | ) 146 | continue 147 | 148 | for i in range(self.num_chips_per_tile): 149 | # Select the top left pixel of our chip randomly 150 | x = np.random.randint(0, width - self.chip_size) 151 | y = np.random.randint(0, height - self.chip_size) 152 | 153 | # Read imagery / labels 154 | img = None 155 | labels = None 156 | if self.windowed_sampling: 157 | try: 158 | img = np.rollaxis( 159 | img_fp.read( 160 | window=Window(x, y, self.chip_size, self.chip_size) 161 | ), 162 | 0, 163 | 3, 164 | ) 165 | # print(img.shape) 166 | if self.use_labels: 167 | labels = label_fp.read( 168 | window=Window(x, y, self.chip_size, self.chip_size) 169 | ).squeeze() 170 | except RasterioError: 171 | print( 172 | "WARNING: Error reading chip from file, skipping to the next chip" 173 | ) 174 | continue 175 | else: 176 | img = img_data[y : y + self.chip_size, x : x + self.chip_size, :] 177 | if self.use_labels: 178 | labels = label_data[ 179 | y : y + self.chip_size, x : x + self.chip_size 180 | ] 181 | 182 | # Check for no data 183 | if self.nodata_check is not None: 184 | if self.use_labels: 185 | skip_chip = self.nodata_check(img, labels) 186 | else: 187 | skip_chip = self.nodata_check(img) 188 | 189 | if ( 190 | skip_chip 191 | ): # The current chip has been identified as invalid by the `nodata_check(...)` method 192 | num_skipped_chips += 1 193 | continue 194 | 195 | # Transform the imagery 196 | if self.image_transform is not None: 197 | if 
self.groups is None: 198 | img = self.image_transform(img) 199 | else: 200 | img = self.image_transform(img, group) 201 | else: 202 | img = torch.from_numpy(img).squeeze() 203 | 204 | # Transform the labels 205 | if self.use_labels: 206 | if self.label_transform is not None: 207 | if self.groups is None: 208 | labels = self.label_transform(labels) 209 | else: 210 | labels = self.label_transform(labels, group) 211 | else: 212 | labels = torch.from_numpy(labels).squeeze() 213 | 214 | # Note, that img should be a torch "Double" type (i.e. a np.float32) and labels should be a torch "Long" type (i.e. np.int64) 215 | if self.use_labels: 216 | yield img, labels 217 | else: 218 | yield img 219 | 220 | # Close file pointers 221 | img_fp.close() 222 | if self.use_labels: 223 | label_fp.close() 224 | 225 | if num_skipped_chips > 0 and self.verbose: 226 | print("We skipped %d chips on %s" % (num_skipped_chips, img_fn)) 227 | 228 | def __iter__(self): 229 | if self.verbose: 230 | print("Creating a new StreamingGeospatialDataset iterator") 231 | return iter(self.stream_chips()) 232 | -------------------------------------------------------------------------------- /src/dataloaders/TileDatasets.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import rasterio 4 | from rasterio.windows import Window 5 | from rasterio.errors import RasterioIOError 6 | 7 | import torch 8 | from torch.utils.data.dataset import Dataset 9 | 10 | 11 | class TileInferenceDataset(Dataset): 12 | def __init__( 13 | self, 14 | fn, 15 | chip_size, 16 | stride, 17 | transform=None, 18 | windowed_sampling=False, 19 | verbose=False, 20 | ): 21 | """A torch Dataset for sampling a grid of chips that covers an input tile. 22 | If `chip_size` doesn't divide the height of the tile evenly (which is what is likely to happen) then we will sample an additional row of chips that are aligned to the bottom of the file. 23 | We do a similar operation if `chip_size` doesn't divide the width of the tile evenly -- by appending an additional column. 24 | Note: without a `transform` we will return chips in (height, width, channels) format in whatever the tile's dtype is. 25 | Args: 26 | fn: The path to the file to sample from (this can be anything that rasterio.open(...) knows how to read). 27 | chip_size: The size of chips to return (chips will be squares). 28 | stride: How much we move the sliding window to sample the next chip. If this is is less than `chip_size` then we will get overlapping windows, if it is > `chip_size` then some parts of the tile will not be sampled. 29 | transform: A torchvision Transform to apply on each chip. 30 | windowed_sample: If `True` we will use rasterio.windows.Window to sample chips without every loading the entire file into memory, else, we will load the entire tile up-front and index into it to sample chips. 31 | verbose: Flag to control printing stuff. 
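        Example (an illustrative sketch; the file path and parameter values are placeholders):
            >>> ds = TileInferenceDataset("naip_tile.tif", chip_size=256, stride=128)
            >>> chip, (y, x) = ds[0]   # `chip` is (chip_size, chip_size, channels) in the tile's dtype
            >>> len(ds)                # number of grid chips covering the tile
        In practice the dataset is wrapped in a `torch.utils.data.DataLoader`, as in eval.py.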
32 | """ 33 | self.fn = fn 34 | self.chip_size = chip_size 35 | self.transform = transform 36 | self.windowed_sampling = windowed_sampling 37 | self.verbose = verbose 38 | with rasterio.open(self.fn) as f: 39 | height, width = f.height, f.width 40 | self.num_channels = f.count 41 | self.dtype = f.profile["dtype"] 42 | if ( 43 | not windowed_sampling 44 | ): # if we aren't using windowed sampling, then go ahead and read in all of the data 45 | self.data = np.rollaxis(f.read(), 0, 3) 46 | self.chip_coordinates = ( 47 | [] 48 | ) # upper left coordinate (y,x), of each chip that this Dataset will return 49 | for y in list(range(0, height - self.chip_size, stride)) + [ 50 | height - self.chip_size 51 | ]: 52 | for x in list(range(0, width - self.chip_size, stride)) + [ 53 | width - self.chip_size 54 | ]: 55 | self.chip_coordinates.append((y, x)) 56 | self.num_chips = len(self.chip_coordinates) 57 | 58 | if self.verbose: 59 | print( 60 | "Constructed TileInferenceDataset -- we have %d by %d file with %d channels with a dtype of %s. We are sampling %d chips from it." 61 | % (height, width, self.num_channels, self.dtype, self.num_chips) 62 | ) 63 | 64 | def __getitem__(self, idx): 65 | """ 66 | Returns: 67 | A tuple (chip, (y,x)): `chip` is the chip that we sampled from the larger tile. (y,x) are the indices of the upper left corner of the chip. 68 | """ 69 | y, x = self.chip_coordinates[idx] 70 | if self.windowed_sampling: 71 | try: 72 | with rasterio.Env(): 73 | with rasterio.open(self.fn) as f: 74 | img = np.rollaxis( 75 | f.read( 76 | window=rasterio.windows.Window( 77 | x, y, self.chip_size, self.chip_size 78 | ) 79 | ), 80 | 0, 81 | 3, 82 | ) 83 | except RasterioIOError as e: # NOTE(caleb): I put this here to catch weird errors that I was seeing occasionally when trying to read from COGS - I don't remember the details though 84 | print("Reading %d failed, returning 0's" % (idx)) 85 | img = np.zeros( 86 | (self.chip_size, self.chip_size, self.num_channels), dtype=np.uint8 87 | ) 88 | else: 89 | img = self.data[y : y + self.chip_size, x : x + self.chip_size] 90 | 91 | if self.transform is not None: 92 | img = self.transform(img) 93 | 94 | return img, np.array((y, x)) 95 | 96 | def __len__(self): 97 | return self.num_chips 98 | -------------------------------------------------------------------------------- /src/dataloaders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/developmentseed/pearl-ml-pipeline/bf8b857b5939f5e614bc81e7eb156246eea987ad/src/dataloaders/__init__.py -------------------------------------------------------------------------------- /src/embeddings.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import time 4 | import datetime 5 | import argparse 6 | 7 | import numpy as np 8 | import pandas as pd 9 | 10 | import rasterio 11 | from rasterio.windows import Window 12 | 13 | import torch 14 | import torch.nn.functional as F 15 | 16 | import models 17 | from dataloaders.TileDatasets import TileInferenceDataset 18 | import utils 19 | from sklearn.metrics import confusion_matrix, f1_score 20 | 21 | os.environ[ 22 | "CURL_CA_BUNDLE" 23 | ] = "/etc/ssl/certs/ca-certificates.crt" # A workaround in case this happens: https://github.com/mapbox/rasterio/issues/1289 24 | 25 | NUM_WORKERS = 4 26 | CHIP_SIZE = 256 27 | PADDING = 128 28 | assert PADDING % 2 == 0 29 | HALF_PADDING = PADDING // 2 30 | CHIP_STRIDE = CHIP_SIZE - PADDING 31 | 32 | 
from azureml.core import Run 33 | 34 | run = Run.get_context() 35 | 36 | parser = argparse.ArgumentParser(description="DFC2021 model inference script") 37 | parser.add_argument( 38 | "--input_fn", 39 | type=str, 40 | required=True, 41 | help='The path to a CSV file containing three columns -- "image_fn", "label_fn", and "group" -- that point to tiles of imagery and labels as well as which "group" each tile is in.', 42 | ) 43 | parser.add_argument( 44 | "--model_fn", type=str, required=True, help="Path to the model file to use." 45 | ) 46 | parser.add_argument( 47 | "--output_dir", 48 | type=str, 49 | required=True, 50 | help="The path to output the model predictions as a GeoTIFF. Will fail if this file already exists.", 51 | ) 52 | parser.add_argument( 53 | "--overwrite", 54 | action="store_true", 55 | help="Flag for overwriting `--output_dir` if that directory already exists.", 56 | ) 57 | parser.add_argument("--gpu", type=int, default=0, help="The ID of the GPU to use") 58 | parser.add_argument( 59 | "--batch_size", type=int, default=2, help="Batch size to use during inference." 60 | ) 61 | parser.add_argument( 62 | "--model", default="fcn", choices=("unet", "fcn"), help="Model to use" 63 | ) 64 | 65 | parser.add_argument( 66 | "--num_classes", 67 | type=int, 68 | default=11, 69 | help="number of classes model was trained with", 70 | ), 71 | 72 | parser.add_argument( 73 | "--label_transform", 74 | default="naip", 75 | help="str either naip, epa or cic to indicate how to transform labels", 76 | ) 77 | 78 | 79 | args = parser.parse_args() 80 | 81 | 82 | def label_transforms_naip(labels): 83 | labels = np.array(labels).astype(np.int64) 84 | labels = np.where(labels == 14, 0, labels) # to no data 85 | labels = np.where(labels == 15, 0, labels) # to no data 86 | labels = np.where(labels == 13, 0, labels) # to no data 87 | labels = np.where(labels == 10, 3, labels) # to tree canopy 88 | labels = np.where(labels == 11, 3, labels) # to tree canopy 89 | labels = np.where(labels == 12, 3, labels) # to tree canopy 90 | return labels 91 | 92 | 93 | def label_transforms_epa(labels): 94 | labels = np.array(labels).astype(np.int64) 95 | labels_new = np.copy(labels) 96 | for k, v in utils.epa_label_dict.items(): 97 | labels_new[labels == k] = v 98 | return labels_new 99 | 100 | 101 | def label_transform_cic(labels): 102 | labels = np.array(labels).astype(np.int64) 103 | labels_new = np.copy(labels) 104 | for k, v in utils.cic_label_dict.items(): 105 | labels_new[labels == k] = v 106 | return labels_new 107 | 108 | 109 | def random_pixel_values(src_path: str, number_of_point: int, excludes={10, 11, 12}): 110 | with rasterio.open(src_path) as src_dst: 111 | output = {} 112 | cr_lst = [] 113 | arr = src_dst.read(indexes=1) 114 | value, count = np.unique(arr, return_counts=True) 115 | for (i, c) in enumerate(count): 116 | if value[i] in excludes: 117 | continue 118 | point_y, point_x = np.where(arr == value[i]) 119 | n_points = ( 120 | number_of_point if len(point_x) > number_of_point else len(point_x) 121 | ) 122 | indexes = np.random.choice(len(point_x), n_points).tolist() 123 | # y,x in row/col indexes 124 | cr = [(point_x[idx], point_y[idx]) for idx in indexes] 125 | output[value[i]] = cr 126 | cr_lst.append(cr) 127 | # TODO 128 | # yield pix, coordinates 129 | cr_f = [list(item) for sublist in cr_lst for item in sublist] 130 | return output, cr_f 131 | 132 | 133 | def main(): 134 | print("Starting model eval script at %s" % (str(datetime.datetime.now()))) 135 | 136 | # ------------------- 137 | # Setup 
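    # (validate the input CSV and model paths, prepare --output_dir, and pick the CUDA device)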
138 | # ------------------- 139 | assert os.path.exists(args.input_fn) 140 | assert os.path.exists(args.model_fn) 141 | 142 | if os.path.isfile(args.output_dir): 143 | print("A file was passed as `--output_dir`, please pass a directory!") 144 | return 145 | 146 | if os.path.exists(args.output_dir) and len(os.listdir(args.output_dir)) > 0: 147 | if args.overwrite: 148 | print( 149 | "WARNING! The output directory, %s, already exists, we might overwrite data in it!" 150 | % (args.output_dir) 151 | ) 152 | else: 153 | print( 154 | "The output directory, %s, already exists and isn't empty. We don't want to overwrite and existing results, exiting..." 155 | % (args.output_dir) 156 | ) 157 | return 158 | else: 159 | print("The output directory doesn't exist or is empty.") 160 | os.makedirs(args.output_dir, exist_ok=True) 161 | 162 | if torch.cuda.is_available(): 163 | device = torch.device("cuda:%d" % args.gpu) 164 | else: 165 | print("WARNING! Torch is reporting that CUDA isn't available, exiting...") 166 | return 167 | 168 | # ------------------- 169 | # Load model 170 | # ------------------- 171 | if args.model == "unet": 172 | model = models.get_unet(classes=args.num_classes) 173 | elif args.model == "fcn": 174 | model = models.get_fcn(num_output_classes=args.num_classes) 175 | else: 176 | raise ValueError("Invalid model") 177 | model.load_state_dict(torch.load(args.model_fn)) 178 | model = model.to(device) 179 | 180 | # determine which label transform to use 181 | if args.label_transform == "naip": 182 | label_transform = label_transforms_naip 183 | class_names = [ 184 | "no_data", 185 | "water", 186 | "emergent_wetlands", 187 | "tree_canopy", 188 | "shrubland", 189 | "low_vegetation", 190 | "barren", 191 | "structure", 192 | "impervious_surface", 193 | "impervious_roads", 194 | "weighted_avg", 195 | ] 196 | elif args.label_transform == "epa": 197 | label_transform = label_transforms_epa 198 | class_names = [ 199 | "no_data", 200 | "impervious", 201 | "soil_barren", 202 | "grass", 203 | "tree/forest", 204 | "water", 205 | "shrub", 206 | "woody_wetlands", 207 | "emergent_wetlands", 208 | "agriculture", 209 | "orchard", 210 | "weighted_avg", 211 | ] 212 | elif args.label_transform == "cic": 213 | label_transform = label_transform_cic 214 | class_names = [ 215 | "Structures", 216 | "Impervious Surface", 217 | "Water", 218 | "Grassland/Pairie", 219 | "Tree Canopy", 220 | "Turff", 221 | "Barren/Rock", 222 | "Irregated", 223 | ] 224 | else: 225 | raise ValueError("Invalid label transform") 226 | 227 | # ------------------- 228 | # Run on each line in the input 229 | # ------------------- 230 | input_dataframe = pd.read_csv(args.input_fn) 231 | image_fns = input_dataframe["image_fn"].values 232 | label_fns = input_dataframe["label_fn"].values 233 | groups = input_dataframe["group"].values 234 | 235 | # Get Row,Column for unique lables 236 | for i, gt_img in enumerate(label_fns): 237 | x_embedding = [] 238 | output, cr_f = random_pixel_values(gt_img, 10, {10, 11, 12}) 239 | labels = list(output.keys()) * 10 240 | labels.sort() 241 | # print(output) 242 | print("flattened coords:") 243 | print(cr_f) 244 | print("labels") 245 | print(labels) 246 | 247 | # run inference on window that contains each row,column val 248 | for rc in cr_f: 249 | gt_2 = rasterio.open(image_fns[i]) 250 | w = gt_2.read(window=Window(rc[0], rc[1], 256, 256)) 251 | data = w / 255.0 252 | data = data.astype(np.float32) 253 | data = torch.from_numpy(data) 254 | data = data.to(device) 255 | 256 | label_img = rasterio.open(gt_img) 
257 | w_label = label_img.read(1, window=Window(rc[0], rc[1], 256, 256)) 258 | print(w_label[0, 0]) 259 | 260 | with torch.no_grad(): 261 | 262 | embedding = model.forward_features( 263 | data[None, ...] 264 | ) # insert singleton "batch" dimension to input data for pytorch to be happy 265 | embedding = embedding.cpu().numpy() 266 | embedding = np.moveaxis(embedding[0], 0, -1) 267 | x_embedding.append(embedding[0, 0]) 268 | 269 | output_fn = gt_img[0][:-4].split("/")[-1] # something like "546_naip-2013.tif" 270 | output_fn_e = output_fn + "_embedding.npz" 271 | output_fn_l = output_fn + "_label.npz" 272 | 273 | output_path_e = os.path.join(args.output_dir, output_fn_e) 274 | output_path_label = os.path.join(args.output_dir, output_fn_l) 275 | 276 | np.savez(output_path_e, np.array(x_embedding)) 277 | np.savez(output_path_label, np.array(labels)) 278 | print("saved") 279 | 280 | 281 | if __name__ == "__main__": 282 | main() 283 | -------------------------------------------------------------------------------- /src/eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import time 4 | import datetime 5 | import argparse 6 | 7 | import numpy as np 8 | import pandas as pd 9 | 10 | import rasterio 11 | 12 | import torch 13 | import torch.nn.functional as F 14 | 15 | import models 16 | from dataloaders.TileDatasets import TileInferenceDataset 17 | import utils 18 | from transforms_utils import ( 19 | label_transforms_naip, 20 | label_transform_cic, 21 | label_transforms_epa, 22 | label_transform_naip5cls, 23 | labels_transform_uvm, 24 | labels_transform_uvm_8cls, 25 | image_transforms, 26 | ) 27 | from sklearn.metrics import f1_score 28 | from azureml.core import Run 29 | 30 | # A workaround in case this happens: https://github.com/mapbox/rasterio/issues/1289 31 | os.environ["CURL_CA_BUNDLE"] = "/etc/ssl/certs/ca-certificates.crt" 32 | 33 | NUM_WORKERS = 4 34 | CHIP_SIZE = 256 35 | PADDING = 128 36 | assert PADDING % 2 == 0 37 | HALF_PADDING = PADDING // 2 38 | CHIP_STRIDE = CHIP_SIZE - PADDING 39 | 40 | 41 | run = Run.get_context() 42 | 43 | parser = argparse.ArgumentParser(description="DFC2021 model inference script") 44 | parser.add_argument( 45 | "--input_fn", 46 | type=str, 47 | required=True, 48 | help='The path to a CSV file containing three columns -- "image_fn", "label_fn", and "group" -- that point to tiles of imagery and labels as well as which "group" each tile is in.', 49 | ) 50 | parser.add_argument( 51 | "--model_fn", type=str, required=True, help="Path to the model file to use." 52 | ) 53 | parser.add_argument( 54 | "--output_dir", 55 | type=str, 56 | required=True, 57 | help="The path to output the model predictions as a GeoTIFF. Will fail if this file already exists.", 58 | ) 59 | parser.add_argument( 60 | "--overwrite", 61 | action="store_true", 62 | help="Flag for overwriting `--output_dir` if that directory already exists.", 63 | ) 64 | parser.add_argument("--gpu", type=int, default=0, help="The ID of the GPU to use") 65 | parser.add_argument( 66 | "--batch_size", type=int, default=2, help="Batch size to use during inference." 
67 | ) 68 | parser.add_argument( 69 | "--save_soft", 70 | action="store_true", 71 | help='Flag that enables saving the predicted per class probabilities in addition to the "hard" class predictions.', 72 | ) 73 | parser.add_argument( 74 | "--model", 75 | default="fcn", 76 | choices=("unet", "fcn", "unet2", "deeplabv3plus"), 77 | help="Model to use", 78 | ) 79 | 80 | parser.add_argument( 81 | "--num_classes", 82 | type=int, 83 | default=10, 84 | help="number of classes model was trained with", 85 | ), 86 | 87 | parser.add_argument( 88 | "--label_transform", 89 | default="naip", 90 | help="str either naip, epa or cic to indicate how to transform labels", 91 | ) 92 | 93 | 94 | args = parser.parse_args() 95 | 96 | 97 | def main(): 98 | print("Starting model eval script at %s" % (str(datetime.datetime.now()))) 99 | 100 | # ------------------- 101 | # Setup 102 | # ------------------- 103 | assert os.path.exists(args.input_fn) 104 | assert os.path.exists(args.model_fn) 105 | 106 | if os.path.isfile(args.output_dir): 107 | print("A file was passed as `--output_dir`, please pass a directory!") 108 | return 109 | 110 | if os.path.exists(args.output_dir) and len(os.listdir(args.output_dir)) > 0: 111 | if args.overwrite: 112 | print( 113 | "WARNING! The output directory, %s, already exists, we might overwrite data in it!" 114 | % (args.output_dir) 115 | ) 116 | else: 117 | print( 118 | "The output directory, %s, already exists and isn't empty. We don't want to overwrite and existing results, exiting..." 119 | % (args.output_dir) 120 | ) 121 | return 122 | else: 123 | print("The output directory doesn't exist or is empty.") 124 | os.makedirs(args.output_dir, exist_ok=True) 125 | 126 | if torch.cuda.is_available(): 127 | device = torch.device("cuda:%d" % args.gpu) 128 | else: 129 | print("WARNING! 
Torch is reporting that CUDA isn't available, exiting...") 130 | return 131 | 132 | # ------------------- 133 | # Load model 134 | # ------------------- 135 | if args.model == "unet": 136 | model = models.get_unet(classes=args.num_classes) 137 | elif args.model == "fcn": 138 | model = models.get_fcn(num_output_classes=args.num_classes) 139 | elif args.model == "unet2": 140 | model = models.get_unet2(n_classes=args.num_classes) 141 | elif args.model == "deeplabv3plus": 142 | model = models.get_deeplabv3plus(n_classes=args.num_classes) 143 | else: 144 | raise ValueError("Invalid model") 145 | model.load_state_dict(torch.load(args.model_fn)) 146 | model = model.to(device) 147 | 148 | # determine which label transform to use 149 | if args.label_transform == "naip": 150 | label_transform = label_transforms_naip 151 | class_names = [ 152 | "no_data", 153 | "water", 154 | "emergent_wetlands", 155 | "tree_canopy", 156 | "shrubland", 157 | "low_vegetation", 158 | "barren", 159 | "structure", 160 | "impervious_surface", 161 | "impervious_roads", 162 | "weighted_avg", 163 | ] 164 | elif args.label_transform == "epa": 165 | label_transform = label_transforms_epa 166 | class_names = [ 167 | "no_data", 168 | "impervious", 169 | "soil_barren", 170 | "grass", 171 | "tree/forest", 172 | "water", 173 | "shrub", 174 | "woody_wetlands", 175 | "emergent_wetlands", 176 | "agriculture", 177 | "orchard", 178 | "weighted_avg", 179 | ] 180 | elif args.label_transform == "cic": 181 | label_transform = label_transform_cic 182 | class_names = [ 183 | "Structures", 184 | "Impervious Surface", 185 | "Water", 186 | "Grassland/Pairie", 187 | "Tree Canopy", 188 | "Turff", 189 | "Barren/Rock", 190 | "Irregated", 191 | ] 192 | 193 | elif args.label_transform == "naip_5cls": 194 | label_transform = label_transform_naip5cls 195 | class_names = [ 196 | "water/wetland", 197 | "tree", 198 | "barren", 199 | "low veg", 200 | "built enviornment", 201 | ] 202 | 203 | elif args.label_transform == "naip_4cls": 204 | label_transform = label_transform_naip5cls 205 | class_names = ["water/wetland", "tree", "low veg", "built enviornment"] 206 | elif args.label_transform == "uvm": 207 | label_transform = labels_transform_uvm 208 | class_names = [ 209 | "tree", 210 | "grass", 211 | "bare soil", 212 | "water", 213 | "buildings", 214 | "roads", 215 | "other impervious", 216 | ] 217 | elif args.label_transform == "uvm8cls": 218 | label_transform = labels_transform_uvm_8cls 219 | class_names = [ 220 | "tree", 221 | "grass", 222 | "bare soil", 223 | "water", 224 | "buildings", 225 | "roads", 226 | "other impervious", 227 | "shrubs", 228 | ] 229 | 230 | else: 231 | raise ValueError("Invalid label transform") 232 | 233 | # ------------------- 234 | # Run on each line in the input 235 | # ------------------- 236 | input_dataframe = pd.read_csv(args.input_fn) 237 | image_fns = input_dataframe["image_fn"].values 238 | label_fns = input_dataframe["label_fn"].values 239 | 240 | df_lst = [] 241 | for image_idx in range(len(image_fns)): 242 | pred_masks = [] 243 | tic = time.time() 244 | image_fn = image_fns[image_idx] 245 | gt_label_fn = label_fns[image_idx] 246 | 247 | print( 248 | "(%d/%d) Processing %s" % (image_idx, len(image_fns), image_fn), end=" ... 
" 249 | ) 250 | 251 | # ------------------- 252 | # Load input and create dataloader 253 | # ------------------- 254 | 255 | with rasterio.open(image_fn) as f: 256 | input_width, input_height = f.width, f.height 257 | input_profile = f.profile.copy() 258 | 259 | dataset = TileInferenceDataset( 260 | image_fn, 261 | chip_size=CHIP_SIZE, 262 | stride=CHIP_STRIDE, 263 | transform=image_transforms, 264 | verbose=False, 265 | ) 266 | dataloader = torch.utils.data.DataLoader( 267 | dataset, 268 | batch_size=args.batch_size, 269 | num_workers=NUM_WORKERS, 270 | pin_memory=True, 271 | ) 272 | 273 | # ------------------- 274 | # Run model and organize output 275 | # ------------------- 276 | 277 | output = np.zeros( 278 | (args.num_classes, input_height, input_width), dtype=np.float32 279 | ) 280 | kernel = np.ones((CHIP_SIZE, CHIP_SIZE), dtype=np.float32) 281 | kernel[HALF_PADDING:-HALF_PADDING, HALF_PADDING:-HALF_PADDING] = 5 282 | counts = np.zeros((input_height, input_width), dtype=np.float32) 283 | 284 | for i, (data, coords) in enumerate(dataloader): 285 | data = data.to(device) 286 | with torch.no_grad(): 287 | # https://discuss.pytorch.org/t/error-expected-more-than-1-value-per-channel-when-training/26274 288 | model.eval() 289 | t_output = model(data) 290 | t_output = F.softmax(t_output, dim=1).cpu().numpy() 291 | 292 | for j in range(t_output.shape[0]): 293 | y, x = coords[j] 294 | 295 | output[:, y : y + CHIP_SIZE, x : x + CHIP_SIZE] += t_output[j] * kernel 296 | counts[y : y + CHIP_SIZE, x : x + CHIP_SIZE] += kernel 297 | 298 | output = output / counts 299 | output_hard = output.argmax(axis=0).astype(np.uint8) 300 | 301 | # append to list of preds 302 | pred_masks.append(output_hard) 303 | 304 | # ------------------- 305 | # Save output 306 | # ------------------- 307 | output_profile = input_profile.copy() 308 | output_profile["driver"] = "GTiff" 309 | output_profile["dtype"] = "uint8" 310 | output_profile["count"] = 1 311 | output_profile["nodata"] = 90 312 | 313 | output_fn = image_fn.split("/")[-1] # something like "546_naip-2013.tif" 314 | output_fn = output_fn.replace("naip", "predictions") 315 | output_fn = os.path.join(args.output_dir, output_fn) 316 | 317 | with rasterio.open(output_fn, "w", **output_profile) as f: 318 | f.write(output_hard, 1) 319 | f.write_colormap(1, utils.LC_TREE_COLORMAP) # fix 320 | 321 | if args.save_soft: 322 | 323 | output = output / output.sum(axis=0, keepdims=True) 324 | output = (output * 255).astype(np.uint8) 325 | 326 | output_profile = input_profile.copy() 327 | output_profile["driver"] = "GTiff" 328 | output_profile["dtype"] = "uint8" 329 | output_profile["count"] = 13 330 | # output_profile["count"] = 13 331 | del output_profile["nodata"] 332 | 333 | output_fn = image_fn.split("/")[-1] # something like "546_naip-2013.tif" 334 | output_fn = output_fn.replace("naip", "predictions-soft") 335 | output_fn = os.path.join(args.output_dir, output_fn) 336 | 337 | with rasterio.open(output_fn, "w", **output_profile) as f: 338 | f.write(output) 339 | 340 | print("finished in %0.4f seconds" % (time.time() - tic)) 341 | 342 | # load in ground truth 343 | gt = rasterio.open(gt_label_fn).read() 344 | gt = gt[0] 345 | gt_f = np.reshape(gt, [-1]) 346 | 347 | # remove no data vals 348 | gt_cleaned = np.delete( 349 | gt_f, np.where((gt_f == 15) | (gt_f == 14) | (gt_f == 13) | (gt_f == 0)) 350 | ) 351 | 352 | print(gt_cleaned.shape) 353 | print(np.unique(gt_cleaned)) 354 | 355 | gt_t = label_transform(gt_cleaned) 356 | print("label transformed unique") 357 | 
print(np.unique(gt_t)) 358 | 359 | # f-score calculation 360 | pred_masks = np.array(pred_masks) 361 | pred_masks_f = np.reshape(pred_masks, [-1]) 362 | 363 | pred_masks_cleaned = np.delete( 364 | pred_masks_f, 365 | np.where((gt_f == 15) | (gt_f == 14) | (gt_f == 13) | (gt_f == 0)), 366 | ) 367 | print(pred_masks_cleaned.shape) 368 | 369 | uniq_tm = np.unique(gt_t) # unique true mask 370 | print(uniq_tm) 371 | uniq_pm = np.unique(pred_masks_cleaned) # unique pred mask 372 | 373 | # f1 score is computed toward common classes between gt and pred 374 | uniq_v = np.unique(np.concatenate((uniq_tm, uniq_pm))) 375 | print(uniq_v) 376 | 377 | # determine missing labels 378 | missing_labels = np.setdiff1d(list(np.arange(args.num_classes)), uniq_v) 379 | 380 | f1_score_weighted = f1_score(gt_t, pred_masks_cleaned, average="weighted") 381 | 382 | f1_score_per_class = f1_score(gt_t, pred_masks_cleaned, average=None) 383 | print(f"Length of f1 for classes {len(f1_score_per_class)}, they are: \n") 384 | print(f1_score_per_class) 385 | 386 | per_class_f1_final = np.zeros(len(class_names)) 387 | # where the unique cls id exist, fill in f1 per calss 388 | per_class_f1_final[uniq_v] = f1_score_per_class 389 | # where is the missing id, fill in np.nan 390 | per_class_f1_final[missing_labels] = np.nan 391 | per_class_f1_final[-1] = f1_score_weighted 392 | 393 | d = { 394 | "class": class_names, 395 | image_fn.split("/")[-1] + "_f1_score": per_class_f1_final, 396 | } 397 | 398 | df = pd.DataFrame.from_dict(d) 399 | df_t = df.T 400 | df_t.columns = df["class"] 401 | df_t = df_t.drop("class") 402 | 403 | df_lst.append(df_t) 404 | 405 | df_combine = pd.concat(df_lst) 406 | df_combine.loc["mean"] = df_combine.mean(axis=0) 407 | csv_fn = os.path.join(args.output_dir, "f1_score_stats.csv") 408 | df_combine.to_csv(csv_fn) 409 | 410 | 411 | if __name__ == "__main__": 412 | main() 413 | -------------------------------------------------------------------------------- /src/models.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import logging 3 | 4 | import numpy as np 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | import segmentation_models_pytorch as smp 11 | 12 | import utils 13 | 14 | from typing import Optional, Union, List 15 | 16 | 17 | class FCN(nn.Module): 18 | def __init__(self, num_input_channels, num_output_classes, num_filters=64): 19 | super(FCN, self).__init__() 20 | 21 | self.conv1 = nn.Conv2d( 22 | num_input_channels, num_filters, kernel_size=3, stride=1, padding=1 23 | ) 24 | self.conv2 = nn.Conv2d( 25 | num_filters, num_filters, kernel_size=3, stride=1, padding=1 26 | ) 27 | self.conv3 = nn.Conv2d( 28 | num_filters, num_filters, kernel_size=3, stride=1, padding=1 29 | ) 30 | self.conv4 = nn.Conv2d( 31 | num_filters, num_filters, kernel_size=3, stride=1, padding=1 32 | ) 33 | self.conv5 = nn.Conv2d( 34 | num_filters, num_filters, kernel_size=3, stride=1, padding=1 35 | ) 36 | self.last = nn.Conv2d( 37 | num_filters, num_output_classes, kernel_size=1, stride=1, padding=0 38 | ) 39 | 40 | def forward(self, inputs): 41 | x = F.relu(self.conv1(inputs)) 42 | x = F.relu(self.conv2(x)) 43 | x = F.relu(self.conv3(x)) 44 | x = F.relu(self.conv4(x)) 45 | x = F.relu(self.conv5(x)) 46 | x = self.last(x) 47 | return x 48 | 49 | def forward_features(self, inputs): 50 | x = F.relu(self.conv1(inputs)) 51 | x = F.relu(self.conv2(x)) 52 | x = F.relu(self.conv3(x)) 53 | x = F.relu(self.conv4(x)) 54 | z = 
F.relu(self.conv5(x)) 55 | # y = self.last(z) 56 | return z 57 | 58 | 59 | class Unet(smp.base.SegmentationModel): 60 | """Unet_ is a fully convolution neural network for image semantic segmentation. Consist of *encoder* 61 | and *decoder* parts connected with *skip connections*. Encoder extract features of different spatial 62 | resolution (skip connections) which are used by decoder to define accurate segmentation mask. Use *concatenation* 63 | for fusing decoder blocks with skip connections. 64 | Args: 65 | encoder_name: Name of the classification model that will be used as an encoder (a.k.a backbone) 66 | to extract features of different spatial resolution 67 | encoder_depth: A number of stages used in encoder in range [3, 5]. Each stage generate features 68 | two times smaller in spatial dimensions than previous one (e.g. for depth 0 we will have features 69 | with shapes [(N, C, H, W),], for depth 1 - [(N, C, H, W), (N, C, H // 2, W // 2)] and so on). 70 | Default is 5 71 | encoder_weights: One of **None** (random initialization), **"imagenet"** (pre-training on ImageNet) and 72 | other pretrained weights (see table with available weights for each encoder_name) 73 | decoder_channels: List of integers which specify **in_channels** parameter for convolutions used in decoder. 74 | Length of the list should be the same as **encoder_depth** 75 | decoder_use_batchnorm: If **True**, BatchNorm2d layer between Conv2D and Activation layers 76 | is used. If **"inplace"** InplaceABN will be used, allows to decrease memory consumption. 77 | Available options are **True, False, "inplace"** 78 | decoder_attention_type: Attention module used in decoder of the model. Available options are **None** and **scse**. 79 | SCSE paper - https://arxiv.org/abs/1808.08127 80 | in_channels: A number of input channels for the model, default is 3 (RGB images) 81 | classes: A number of classes for output mask (or you can think as a number of channels of output mask) 82 | activation: An activation function to apply after the final convolution layer. 83 | Available options are **"sigmoid"**, **"softmax"**, **"logsoftmax"**, **"tanh"**, **"identity"**, **callable** and **None**. 84 | Default is **None** 85 | aux_params: Dictionary with parameters of the auxiliary output (classification head). Auxiliary output is build 86 | on top of encoder if **aux_params** is not **None** (default). Supported params: 87 | - classes (int): A number of classes 88 | - pooling (str): One of "max", "avg". Default is "avg" 89 | - dropout (float): Dropout factor in [0, 1) 90 | - activation (str): An activation function to apply "sigmoid"/"softmax" (could be **None** to return logits) 91 | Returns: 92 | ``torch.nn.Module``: Unet 93 | .. 
_Unet: 94 | https://arxiv.org/abs/1505.04597 95 | """ 96 | 97 | def __init__( 98 | self, 99 | encoder_name: str = "resnet34", 100 | encoder_depth: int = 5, 101 | encoder_weights: Optional[str] = "imagenet", 102 | decoder_use_batchnorm: bool = True, 103 | decoder_channels: List[int] = (256, 128, 64, 32, 16), 104 | decoder_attention_type: Optional[str] = None, 105 | in_channels: int = 3, 106 | classes: int = 1, 107 | activation: Optional[Union[str, callable]] = None, 108 | aux_params: Optional[dict] = None, 109 | ): 110 | super().__init__() 111 | 112 | self.encoder = smp.encoders.get_encoder( 113 | encoder_name, 114 | in_channels=in_channels, 115 | depth=encoder_depth, 116 | weights=encoder_weights, 117 | ) 118 | 119 | self.decoder = smp.unet.decoder.UnetDecoder( 120 | encoder_channels=self.encoder.out_channels, 121 | decoder_channels=decoder_channels, 122 | n_blocks=encoder_depth, 123 | use_batchnorm=decoder_use_batchnorm, 124 | center=True if encoder_name.startswith("vgg") else False, 125 | attention_type=decoder_attention_type, 126 | ) 127 | 128 | self.segmentation_head = smp.base.SegmentationHead( 129 | in_channels=decoder_channels[-1], 130 | out_channels=classes, 131 | activation=activation, 132 | kernel_size=1, 133 | ) 134 | 135 | if aux_params is not None: 136 | self.classification_head = smp.base.ClassificationHead( 137 | in_channels=self.encoder.out_channels[-1], **aux_params 138 | ) 139 | else: 140 | self.classification_head = None 141 | 142 | self.name = "u-{}".format(encoder_name) 143 | self.initialize() 144 | 145 | 146 | class Unet2(nn.Module): 147 | def __init__( 148 | self, 149 | feature_scale=1, 150 | n_classes=3, 151 | in_channels=3, 152 | is_deconv=True, 153 | is_batchnorm=False, 154 | ): 155 | """ 156 | Args: 157 | feature_scale: the smallest number of filters (depth c) is 64 when feature_scale is 1, 158 | and it is 32 when feature_scale is 2 159 | n_classes: number of output classes 160 | in_channels: number of channels in input 161 | is_deconv: 162 | is_batchnorm: 163 | """ 164 | 165 | super(Unet2, self).__init__() 166 | 167 | self.is_deconv = is_deconv 168 | self.in_channels = in_channels 169 | self.is_batchnorm = is_batchnorm 170 | self.feature_scale = feature_scale 171 | 172 | assert ( 173 | 64 % self.feature_scale == 0 174 | ), f"feature_scale {self.feature_scale} does not work with this UNet" 175 | 176 | filters = [ 177 | 64, 178 | 128, 179 | 256, 180 | 512, 181 | 1024, 182 | ] # this is `c` in the diagram, [c, 2c, 4c, 8c, 16c] 183 | filters = [int(x / self.feature_scale) for x in filters] 184 | logging.info("filters used are: {}".format(filters)) 185 | 186 | # downsampling 187 | self.conv1 = UnetConv2(self.in_channels, filters[0], self.is_batchnorm) 188 | self.maxpool1 = nn.MaxPool2d(kernel_size=2) 189 | 190 | self.conv2 = UnetConv2(filters[0], filters[1], self.is_batchnorm) 191 | self.maxpool2 = nn.MaxPool2d(kernel_size=2) 192 | 193 | self.conv3 = UnetConv2(filters[1], filters[2], self.is_batchnorm) 194 | self.maxpool3 = nn.MaxPool2d(kernel_size=2) 195 | 196 | self.conv4 = UnetConv2(filters[2], filters[3], self.is_batchnorm) 197 | self.maxpool4 = nn.MaxPool2d(kernel_size=2) 198 | 199 | self.center = UnetConv2(filters[3], filters[4], self.is_batchnorm) 200 | 201 | # upsampling 202 | self.up_concat4 = UnetUp(filters[4], filters[3], self.is_deconv) 203 | self.up_concat3 = UnetUp(filters[3], filters[2], self.is_deconv) 204 | self.up_concat2 = UnetUp(filters[2], filters[1], self.is_deconv) 205 | self.up_concat1 = UnetUp(filters[1], filters[0], self.is_deconv) 206 | 207 
| # final conv (without any concat) 208 | self.final = nn.Conv2d(filters[0], n_classes, kernel_size=1) 209 | 210 | def forward(self, inputs): 211 | conv1 = self.conv1(inputs) 212 | maxpool1 = self.maxpool1(conv1) 213 | 214 | conv2 = self.conv2(maxpool1) 215 | maxpool2 = self.maxpool2(conv2) 216 | 217 | conv3 = self.conv3(maxpool2) 218 | maxpool3 = self.maxpool3(conv3) 219 | 220 | conv4 = self.conv4(maxpool3) 221 | maxpool4 = self.maxpool4(conv4) 222 | 223 | center = self.center(maxpool4) 224 | up4 = self.up_concat4(conv4, center) 225 | up3 = self.up_concat3(conv3, up4) 226 | up2 = self.up_concat2(conv2, up3) 227 | up1 = self.up_concat1(conv1, up2) 228 | 229 | final = self.final(up1) 230 | 231 | return final 232 | 233 | def forward_features(self, inputs): 234 | conv1 = self.conv1(inputs) 235 | maxpool1 = self.maxpool1(conv1) 236 | 237 | conv2 = self.conv2(maxpool1) 238 | maxpool2 = self.maxpool2(conv2) 239 | 240 | conv3 = self.conv3(maxpool2) 241 | maxpool3 = self.maxpool3(conv3) 242 | 243 | conv4 = self.conv4(maxpool3) 244 | maxpool4 = self.maxpool4(conv4) 245 | 246 | center = self.center(maxpool4) 247 | up4 = self.up_concat4(conv4, center) 248 | up3 = self.up_concat3(conv3, up4) 249 | up2 = self.up_concat2(conv2, up3) 250 | up1 = self.up_concat1(conv1, up2) 251 | 252 | final = self.final(up1) 253 | 254 | return final, up1 255 | 256 | 257 | class UnetConv2(nn.Module): 258 | def __init__(self, in_channels, out_channels, is_batchnorm): 259 | super(UnetConv2, self).__init__() 260 | 261 | if is_batchnorm: 262 | self.conv1 = nn.Sequential( 263 | # this amount of padding/stride/kernel_size preserves width/height 264 | nn.Conv2d( 265 | in_channels, out_channels, kernel_size=3, stride=1, padding=1 266 | ), 267 | nn.BatchNorm2d(out_channels), 268 | nn.ReLU(), 269 | ) 270 | self.conv2 = nn.Sequential( 271 | nn.Conv2d( 272 | out_channels, out_channels, kernel_size=3, stride=1, padding=1 273 | ), 274 | nn.BatchNorm2d(out_channels), 275 | nn.ReLU(), 276 | ) 277 | else: 278 | self.conv1 = nn.Sequential( 279 | nn.Conv2d( 280 | in_channels, out_channels, kernel_size=3, stride=1, padding=1 281 | ), 282 | nn.ReLU(), 283 | ) 284 | self.conv2 = nn.Sequential( 285 | nn.Conv2d( 286 | out_channels, out_channels, kernel_size=3, stride=1, padding=1 287 | ), 288 | nn.ReLU(), 289 | ) 290 | 291 | def forward(self, inputs): 292 | outputs = self.conv1(inputs) 293 | outputs = self.conv2(outputs) 294 | return outputs 295 | 296 | 297 | class UnetUp(nn.Module): 298 | def __init__(self, in_channels, out_channels, is_deconv): 299 | """ 300 | is_deconv: use transposed conv layer to upsample - parameters are learnt; otherwise use 301 | bilinear interpolation to upsample. 302 | """ 303 | super(UnetUp, self).__init__() 304 | 305 | self.conv = UnetConv2(in_channels, out_channels, False) 306 | 307 | self.is_deconv = is_deconv 308 | if is_deconv: 309 | self.up = nn.ConvTranspose2d( 310 | in_channels, out_channels, kernel_size=2, stride=2 311 | ) 312 | # UpsamplingBilinear2d is deprecated in favor of interpolate() 313 | # else: 314 | # self.up = nn.UpsamplingBilinear2d(scale_factor=2) 315 | 316 | def forward(self, inputs1, inputs2): 317 | """ 318 | inputs1 is from the downward path, of higher resolution 319 | inputs2 is from the 'lower' layer. It gets upsampled (spatial size increases) and its depth (channels) halves 320 | to match the depth of inputs1, before being concatenated in the depth dimension. 
321 | """ 322 | if self.is_deconv: 323 | outputs2 = self.up(inputs2) 324 | else: 325 | # scale_factor is the multiplier for spatial size 326 | outputs2 = F.interpolate( 327 | inputs2, scale_factor=2, mode="bilinear", align_corners=True 328 | ) 329 | 330 | offset = outputs2.size()[2] - inputs1.size()[2] 331 | padding = 2 * [offset // 2, offset // 2] 332 | outputs1 = F.pad(inputs1, padding) 333 | 334 | return self.conv(torch.cat([outputs1, outputs2], dim=1)) 335 | 336 | 337 | def get_unet(classes=11): 338 | return Unet( 339 | encoder_name="resnet18", 340 | encoder_depth=3, 341 | encoder_weights=None, 342 | decoder_channels=(128, 64, 64), 343 | in_channels=4, 344 | classes=classes, 345 | ) 346 | 347 | 348 | def get_unet2(n_classes): 349 | return Unet2( 350 | feature_scale=1, 351 | n_classes=n_classes, 352 | in_channels=4, 353 | is_deconv=True, 354 | is_batchnorm=False, 355 | ) 356 | 357 | 358 | def get_fcn(num_output_classes=11): 359 | return FCN( 360 | num_input_channels=4, num_output_classes=num_output_classes, num_filters=64 361 | ) 362 | 363 | 364 | def get_deeplabv3plus(n_classes): 365 | return smp.DeepLabV3Plus( 366 | encoder_name="resnet18", 367 | in_channels=4, 368 | classes=n_classes, 369 | encoder_weights=None, 370 | ) 371 | -------------------------------------------------------------------------------- /src/seed_data_creation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Script to create seed data from trained model for PEARL MVP model retraining session 3 | """ 4 | import logging 5 | import os 6 | import sys 7 | 8 | import joblib 9 | import numpy as np 10 | import sklearn.base 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | import segmentation_models_pytorch as smp 15 | from sklearn.linear_model import SGDClassifier 16 | from sklearn.metrics import f1_score 17 | from sklearn.model_selection import train_test_split 18 | import pandas as pd 19 | import rasterio 20 | import models 21 | 22 | import argparse 23 | 24 | sys.path.append("..") 25 | LOGGER = logging.getLogger("server") 26 | 27 | from typing import Optional, Union, List 28 | 29 | parser = argparse.ArgumentParser(description="Create seed data from trained model") 30 | parser.add_argument( 31 | "--input_csv", 32 | type=str, 33 | required=True, 34 | help='The path to a CSV file containing three columns -- "image_fn", "label_fn", and "group" -- that point to tiles of imagery and labels as well as which "group" each tile is in.', 35 | ) 36 | parser.add_argument( 37 | "--ckpt_file", type=str, required=True, help="A trained model file in pt format" 38 | ) 39 | parser.add_argument( 40 | "--n_classes", type=int, required=True, help="The number of calsses" 41 | ) 42 | parser.add_argument( 43 | "--out_npz", 44 | type=str, 45 | required=True, 46 | help="The path to a directory to output model seed data in npz", 47 | ) 48 | parser.add_argument( 49 | "--model", 50 | default="fcn", 51 | choices=("unet", "fcn", "unet2", "deeplabv3plus"), 52 | help="Model to use", 53 | ) 54 | 55 | args = parser.parse_args() 56 | 57 | if torch.cuda.is_available(): 58 | device = torch.device("cuda") 59 | else: 60 | print("WARNING! 
Torch is reporting that CUDA isn't available, using cpu") 61 | device = torch.device("cpu") 62 | 63 | 64 | def label_transforms_naip(labels): 65 | labels = np.array(labels).astype(np.int64) 66 | labels = np.where(labels == 14, 0, labels) # to no data 67 | labels = np.where(labels == 15, 0, labels) # to no data 68 | labels = np.where(labels == 13, 0, labels) # to no data 69 | labels = np.where(labels == 10, 3, labels) # to tree canopy 70 | labels = np.where(labels == 11, 3, labels) # to tree canopy 71 | labels = np.where(labels == 12, 3, labels) # to tree canopy 72 | return labels 73 | 74 | 75 | def label_transforms_uvm(labels): 76 | naip_7cls = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6} 77 | labels = np.array(labels).astype(np.int64) 78 | labels_new = np.copy(labels) 79 | for k, v in naip_7cls.items(): 80 | labels_new[labels == k] = v 81 | return labels_new 82 | 83 | 84 | def load_model(n_classes, chkpt_file, model_nm): 85 | if model_nm == "unet2": 86 | model = models.Unet2( 87 | feature_scale=1, 88 | n_classes=n_classes, 89 | in_channels=4, 90 | is_deconv=True, 91 | is_batchnorm=False, 92 | ) 93 | elif model_nm == "unet": 94 | model = models.Unet( 95 | feature_scale=1, 96 | n_classes=n_classes, 97 | in_channels=4, 98 | is_deconv=True, 99 | is_batchnorm=False, 100 | ) 101 | elif model_nm == "fcn": 102 | model = models.FCN( 103 | num_input_channels=4, 104 | num_output_classes=n_classes, 105 | num_filters=64, 106 | padding=1, 107 | ) 108 | elif model_nm == "deeplabv3plus": 109 | model = smp.DeepLabV3Plus( 110 | encoder_name="resnet18", 111 | encoder_weights=None, 112 | in_channels=4, 113 | classes=n_classes, 114 | ) 115 | checkpoint = torch.load(chkpt_file, map_location=device) 116 | model.load_state_dict(checkpoint) 117 | model = model.to(device) 118 | model.eval() 119 | 120 | return model 121 | 122 | 123 | def sample_data(df_path, n_samples): 124 | 125 | df = pd.read_csv(df_path) 126 | image_fns, label_fns = df[["image_fn", "label_fn"]].values.T 127 | idxs = np.random.choice(image_fns.shape[0], replace=False, size=n_samples) 128 | image_fns = image_fns[idxs] 129 | label_fns = label_fns[idxs] 130 | return image_fns, label_fns 131 | 132 | 133 | def deeplabv3plus_forward_features(model, x): 134 | features = model.encoder(x) 135 | decoder_output = model.decoder(*features) 136 | return F.interpolate( 137 | decoder_output, 138 | scale_factor=4, 139 | ) 140 | 141 | 142 | def get_seed_data_deeplabv3plus( 143 | model, device, img_fn, label_fn, n_patches, n_points, verbose=True 144 | ): 145 | 146 | with rasterio.open(img_fn) as f: 147 | data = f.read() 148 | data = data / 255.0 149 | 150 | with rasterio.open(label_fn) as f: 151 | labels = f.read().squeeze() 152 | 153 | height, width = labels.shape 154 | labels.shape 155 | labels = label_transforms_uvm(labels) 156 | 157 | ## Sample n_patches from the tile 158 | patch_size = 128 159 | x_imgs = np.zeros((n_patches, 4, patch_size, patch_size), dtype=np.float32) 160 | y_imgs = np.zeros((n_patches, patch_size, patch_size), dtype=np.uint8) 161 | for i in range(n_patches): 162 | 163 | x = np.random.randint(0, width - patch_size) 164 | y = np.random.randint(0, height - patch_size) 165 | 166 | x_img = data[:, y : y + patch_size, x : x + patch_size].copy() 167 | y_img = labels[y : y + patch_size, x : x + patch_size].copy() 168 | 169 | x_imgs[i] = x_img 170 | y_imgs[i] = y_img 171 | 172 | x_imgs = torch.from_numpy(x_imgs).to(device) 173 | print("x_imgs") 174 | print(x_imgs.shape) 175 | 176 | ## Run the model on the patches 177 | with torch.no_grad(): 178 | 
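# Descriptive note (added): for DeepLabV3+ the "seed embedding" is the decoder
# output taken just before the segmentation head; deeplabv3plus_forward_features
# upsamples it by 4x so it lines up pixel-for-pixel with the 128x128 patches.
# Each sampled pixel therefore yields a 256-dimensional feature vector, which is
# why x_seed below is allocated as (n_points, 256). These embeddings, paired
# with their labels, are what the PEARL retraining step consumes.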
x_img_features = deeplabv3plus_forward_features(model, x_imgs) 179 | x_img_features = x_img_features.cpu().numpy() 180 | print("x_img_feature_shape") 181 | print(x_img_features.shape) 182 | 183 | ## Evaluate the model on all the patches 184 | if verbose: 185 | print( 186 | "Base model acc on sampled patches", 187 | accuracy_score(y_imgs.ravel(), y_imgs_pred.ravel()), 188 | ) 189 | print( 190 | "Base model f1 on sampled patches", 191 | f1_score(y_imgs.ravel(), y_imgs_pred.ravel(), average="macro"), 192 | ) 193 | 194 | ## Subsample n_points from the patches 195 | x_seed = np.zeros((n_points, 256), dtype=np.float32) 196 | y_seed = np.zeros((n_points,), dtype=np.uint8) 197 | 198 | for j in range(n_points): 199 | i = np.random.randint(n_patches) 200 | x = np.random.randint(32, patch_size - 32) 201 | y = np.random.randint(32, patch_size - 32) 202 | 203 | x_seed[j] = x_img_features[i, :, y, x] 204 | y_seed[j] = y_imgs[i, y, x] 205 | 206 | ## Evaluate the model on the seed points 207 | if verbose: 208 | ## Use the last layer of the model to make predictions from the seed embeddings 209 | fcn_weights = model.last.weight.cpu().detach().numpy().squeeze() 210 | fcn_bias = model.last.bias.cpu().detach().numpy() 211 | y_seed_pred = (x_seed @ fcn_weights.T + fcn_bias).argmax(axis=1) 212 | print("Base model acc on subset of points", accuracy_score(y_seed, y_seed_pred)) 213 | print( 214 | "Base model f1 on subset of points", 215 | f1_score(y_seed, y_seed_pred, average="macro"), 216 | ) 217 | 218 | print("y_seed dataset for deeplabv3+ are:") 219 | print(y_seed.shape, np.unique(y_seed)) 220 | return x_seed, y_seed 221 | 222 | 223 | def get_seed_data_fcn( 224 | model, device, label_transform_function, img_fn, label_fn, n_patches, n_points 225 | ): 226 | ## Load data 227 | with rasterio.open(img_fn) as f: 228 | data = f.read() 229 | data = data / 255.0 230 | 231 | with rasterio.open(label_fn) as f: 232 | labels = f.read().squeeze() 233 | height, width = labels.shape 234 | labels.shape 235 | labels = label_transforms_function(labels) 236 | 237 | ## Sample n_patches from the tile 238 | patch_size = 256 239 | x_imgs = np.zeros((n_patches, 4, patch_size, patch_size), dtype=np.float32) 240 | y_imgs = np.zeros((n_patches, patch_size, patch_size), dtype=np.uint8) 241 | for i in range(n_patches): 242 | 243 | x = np.random.randint(0, width - patch_size) 244 | y = np.random.randint(0, height - patch_size) 245 | 246 | x_img = data[:, y : y + patch_size, x : x + patch_size].copy() 247 | y_img = labels[y : y + patch_size, x : x + patch_size].copy() 248 | 249 | x_imgs[i] = x_img 250 | y_imgs[i] = y_img 251 | 252 | x_imgs = torch.from_numpy(x_imgs).to(device) 253 | 254 | ## Run the model on the patches 255 | with torch.no_grad(): 256 | y_imgs_pred, x_img_features = model.forward_features(x_imgs) 257 | y_imgs_pred = y_imgs_pred.argmax(axis=1).cpu().numpy() 258 | x_img_features = x_img_features.cpu().numpy() 259 | 260 | ## Subsample n_points from the patches 261 | x_seed = np.zeros((n_points, 64), dtype=np.float32) 262 | y_seed = np.zeros((n_points,), dtype=np.uint8) 263 | 264 | for j in range(n_points): 265 | i = np.random.randint(n_patches) 266 | x = np.random.randint(32, patch_size - 32) 267 | y = np.random.randint(32, patch_size - 32) 268 | 269 | x_seed[j] = x_img_features[i, :, y, x] 270 | y_seed[j] = y_imgs[i, y, x] 271 | 272 | return x_seed, y_seed 273 | 274 | 275 | def get_seed_data_unet( 276 | model, device, img_fn, label_fn, n_patches, n_points, verbose=True 277 | ): 278 | 279 | with rasterio.open(img_fn) as f: 280 | 
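# Descriptive note (added): this function builds seed data in two sampling
# stages: (1) read the 4-band tile, scale it to [0, 1], and cut n_patches
# random 128x128 patches; (2) run forward_features over the patches and then
# pick n_points random pixels (kept 32 px away from patch borders), storing
# each pixel's 64-dimensional feature vector in x_seed and its label in y_seed.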
data = f.read() 281 | data = data / 255.0 282 | 283 | with rasterio.open(label_fn) as f: 284 | labels = f.read().squeeze() 285 | 286 | height, width = labels.shape 287 | labels.shape 288 | labels = label_transforms_uvm(labels) 289 | 290 | ## Sample n_patches from the tile 291 | patch_size = 128 292 | x_imgs = np.zeros((n_patches, 4, patch_size, patch_size), dtype=np.float32) 293 | y_imgs = np.zeros((n_patches, patch_size, patch_size), dtype=np.uint8) 294 | for i in range(n_patches): 295 | 296 | x = np.random.randint(0, width - patch_size) 297 | y = np.random.randint(0, height - patch_size) 298 | 299 | x_img = data[:, y : y + patch_size, x : x + patch_size].copy() 300 | y_img = labels[y : y + patch_size, x : x + patch_size].copy() 301 | 302 | x_imgs[i] = x_img 303 | y_imgs[i] = y_img 304 | 305 | x_imgs = torch.from_numpy(x_imgs).to(device) 306 | print("x_imgs") 307 | print(x_imgs.shape) 308 | 309 | ## Run the model on the patches 310 | with torch.no_grad(): 311 | x_img_features = model.forward_features(x_imgs) 312 | # y_imgs_pred = y_imgs_pred.argmax(axis=1).cpu().numpy() 313 | x_img_features = x_img_features.cpu().numpy() 314 | print("x_img_features shape") 315 | print(x_img_features.shape) 316 | 317 | ## Evaluate the model on all the patches 318 | if verbose: 319 | print( 320 | "Base model acc on sampled patches", 321 | accuracy_score(y_imgs.ravel(), y_imgs_pred.ravel()), 322 | ) 323 | print( 324 | "Base model f1 on sampled patches", 325 | f1_score(y_imgs.ravel(), y_imgs_pred.ravel(), average="macro"), 326 | ) 327 | 328 | ## Subsample n_points from the patches 329 | x_seed = np.zeros((n_points, 64), dtype=np.float32) 330 | y_seed = np.zeros((n_points,), dtype=np.uint8) 331 | 332 | for j in range(n_points): 333 | i = np.random.randint(n_patches) 334 | x = np.random.randint(32, patch_size - 32) 335 | y = np.random.randint(32, patch_size - 32) 336 | 337 | x_seed[j] = x_img_features[i, :, y, x] 338 | y_seed[j] = y_imgs[i, y, x] 339 | 340 | ## Evaluate the model on the seed points 341 | if verbose: 342 | ## Use the last layer of the model to make predictions from the seed embeddings 343 | fcn_weights = model.last.weight.cpu().detach().numpy().squeeze() 344 | fcn_bias = model.last.bias.cpu().detach().numpy() 345 | y_seed_pred = (x_seed @ fcn_weights.T + fcn_bias).argmax(axis=1) 346 | print("Base model acc on subset of points", accuracy_score(y_seed, y_seed_pred)) 347 | print( 348 | "Base model f1 on subset of points", 349 | f1_score(y_seed, y_seed_pred, average="macro"), 350 | ) 351 | 352 | return x_seed, y_seed 353 | 354 | 355 | def calculate_seed_data(): 356 | device = torch.device("cuda") 357 | x_test = [] 358 | y_test = [] 359 | 360 | df = pd.read_csv(args.input_csv) 361 | image_fns, label_fns = df[["image_fn", "label_fn"]].values.T 362 | for i in range(len(image_fns)): 363 | if i % 5 == 0: 364 | print(i, len(image_fns)) 365 | 366 | # ------------------- 367 | # Setup model 368 | # ------------------- 369 | if args.model == "unet2": 370 | model = load_model(args.n_classes, args.ckpt_file, args.model) 371 | x_test_sample, y_test_sample = get_seed_data_unet( 372 | model, 373 | device, 374 | image_fns[i], 375 | label_fns[i], 376 | n_patches=128, 377 | n_points=1000, 378 | verbose=False, 379 | ) 380 | x_test.append(x_test_sample) 381 | y_test.append(y_test_sample) 382 | elif args.model == "unet": 383 | model = model = load_model(args.n_classes, args.ckpt_file, args.model) 384 | x_test_sample, y_test_sample = get_seed_data_unet( 385 | model, 386 | device, 387 | image_fns[i], 388 | label_fns[i], 
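# (added note) 128 patches and 1000 pixels per tile are sampled for the UNet seed embeddings below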
389 | n_patches=128, 390 | n_points=1000, 391 | verbose=False, 392 | ) 393 | x_test.append(x_test_sample) 394 | y_test.append(y_test_sample) 395 | elif args.model == "fcn": 396 | model = model = load_model(args.n_classes, args.ckpt_file, args.model) 397 | x_test_sample, y_test_sample = get_seed_data_fcn( 398 | model, 399 | device, 400 | image_fns[i], 401 | label_fns[i], 402 | n_patches=64, 403 | n_points=100, 404 | verbose=False, 405 | ) 406 | x_test.append(x_test_sample) 407 | y_test.append(y_test_sample) 408 | elif args.model == "deeplabv3plus": 409 | model = model = load_model(args.n_classes, args.ckpt_file, args.model) 410 | x_test_sample, y_test_sample = get_seed_data_deeplabv3plus( 411 | model, 412 | device, 413 | image_fns[i], 414 | label_fns[i], 415 | n_patches=64, 416 | n_points=100, 417 | verbose=False, 418 | ) 419 | x_test.append(x_test_sample) 420 | y_test.append(y_test_sample) 421 | else: 422 | raise ValueError("Invalid model") 423 | 424 | x_test = np.concatenate(x_test, axis=0) 425 | y_test = np.concatenate(y_test, axis=0) 426 | 427 | print(np.unique(y_test)) 428 | 429 | np.savez(args.out_npz, embeddings=x_test, labels=y_test) 430 | 431 | 432 | if __name__ == "__main__": 433 | calculate_seed_data() 434 | -------------------------------------------------------------------------------- /src/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import datetime 3 | import argparse 4 | import copy 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from dataloaders.StreamingDatasets import StreamingGeospatialDataset 10 | 11 | import torch 12 | import torch.optim as optim 13 | import segmentation_models_pytorch as smp 14 | 15 | import models 16 | import utils 17 | from transforms_utils import ( 18 | label_transforms_naip, 19 | label_transforms_epa, 20 | label_transform_cic, 21 | label_transform_naip5cls, 22 | label_transform_4cls, 23 | labels_transform_uvm, 24 | labels_transform_uvm_8cls, 25 | image_transforms, 26 | ) 27 | 28 | from azureml.core import Run 29 | 30 | torch.backends.cudnn.deterministic = False 31 | torch.backends.cudnn.benchmark = True 32 | 33 | # Some tricks to make rasterio faster when using vsicurl 34 | # -- see https://github.com/pangeo-data/cog-best-practices 35 | RASTERIO_BEST_PRACTICES = dict( 36 | CURL_CA_BUNDLE="/etc/ssl/certs/ca-certificates.crt", 37 | GDAL_DISABLE_READDIR_ON_OPEN="EMPTY_DIR", 38 | AWS_NO_SIGN_REQUEST="YES", 39 | GDAL_MAX_RAW_BLOCK_CACHE_SIZE="200000000", 40 | GDAL_SWATH_SIZE="200000000", 41 | VSI_CURL_CACHE_SIZE="200000000", 42 | ) 43 | os.environ.update(RASTERIO_BEST_PRACTICES) 44 | 45 | 46 | run = Run.get_context() 47 | 48 | NUM_WORKERS = 8 49 | CHIP_SIZE = 256 50 | 51 | parser = argparse.ArgumentParser(description="DFC2021 baseline training script") 52 | parser.add_argument( 53 | "--input_fn", 54 | type=str, 55 | required=True, 56 | help='The path to a CSV file containing three columns -- "image_fn", "label_fn", and "group" -- that point to tiles of imagery and labels as well as which "group" each tile is in.', 57 | ) 58 | parser.add_argument( 59 | "--input_fn_val", 60 | type=str, 61 | required=True, 62 | help='The path to a CSV file containing three columns -- "image_fn", "label_fn", and "group" -- that point to tiles of imagery and labels as well as which "group" each tile is in.', 63 | ) 64 | parser.add_argument( 65 | "--output_dir", 66 | type=str, 67 | required=True, 68 | help="The path to a directory to store model checkpoints.", 69 | ) 70 | parser.add_argument( 71 | 
"--overwrite", 72 | action="store_true", 73 | help="Flag for overwriting `output_dir` if that directory already exists.", 74 | ) 75 | parser.add_argument( 76 | "--save_most_recent", 77 | action="store_true", 78 | help="Flag for saving the most recent version of the model during training.", 79 | ) 80 | parser.add_argument( 81 | "--model", 82 | default="fcn", 83 | choices=("unet", "fcn", "unet2", "deeplabv3plus"), 84 | help="Model to use", 85 | ) 86 | 87 | # Training arguments 88 | parser.add_argument("--gpu", type=int, default=0, help="The ID of the GPU to use") 89 | parser.add_argument( 90 | "--batch_size", type=int, default=32, help="Batch size to use for training" 91 | ) 92 | parser.add_argument( 93 | "--num_epochs", type=int, default=50, help="Number of epochs to train for" 94 | ) 95 | parser.add_argument( 96 | "--seed", type=int, default=0, help="Random seed to pass to numpy and torch" 97 | ) 98 | parser.add_argument( 99 | "--num_classes", type=int, default=10, help="number of classes in dataset" 100 | ) 101 | parser.add_argument( 102 | "--num_chips", 103 | type=int, 104 | default=100, 105 | help="number of chips to randomly sample from data", 106 | ) 107 | parser.add_argument( 108 | "--label_transform", 109 | default="uvm", 110 | help="str either naip, epa or cic, naip_5cls, uvm to indicate how to transform labels", 111 | ) 112 | args = parser.parse_args() 113 | 114 | 115 | def nodata_check(img, labels): 116 | return np.any(labels == 0) 117 | 118 | 119 | def main(): 120 | print( 121 | "Starting DFC2021 baseline training script at %s" 122 | % (str(datetime.datetime.now())) 123 | ) 124 | 125 | # ------------------- 126 | # Setup 127 | # ------------------- 128 | assert os.path.exists(args.input_fn) 129 | 130 | if os.path.isfile(args.output_dir): 131 | print("A file was passed as `--output_dir`, please pass a directory!") 132 | return 133 | 134 | if os.path.exists(args.output_dir) and len(os.listdir(args.output_dir)): 135 | if args.overwrite: 136 | print( 137 | "WARNING! The output directory, %s, already exists, we might overwrite data in it!" 138 | % (args.output_dir) 139 | ) 140 | else: 141 | print( 142 | "The output directory, %s, already exists and isn't empty. We don't want to overwrite and existing results, exiting..." 143 | % (args.output_dir) 144 | ) 145 | return 146 | else: 147 | print("The output directory doesn't exist or is empty.") 148 | os.makedirs(args.output_dir, exist_ok=True) 149 | 150 | if torch.cuda.is_available(): 151 | device = torch.device("cuda:%d" % args.gpu) 152 | else: 153 | print("WARNING! 
Torch is reporting that CUDA isn't available, using cpu") 154 | device = "cpu" 155 | 156 | np.random.seed(args.seed) 157 | torch.manual_seed(args.seed) 158 | 159 | # ------------------- 160 | # Load input data 161 | # ------------------- 162 | input_dataframe = pd.read_csv(args.input_fn) 163 | image_fns = input_dataframe["image_fn"].values 164 | label_fns = input_dataframe["label_fn"].values 165 | groups = input_dataframe["group"].values 166 | 167 | input_dataframe_val = pd.read_csv(args.input_fn_val) 168 | image_fns_val = input_dataframe_val["image_fn"].values 169 | label_fns_val = input_dataframe_val["label_fn"].values 170 | 171 | if args.label_transform == "naip": 172 | label_transform = label_transforms_naip 173 | elif args.label_transform == "epa": 174 | label_transform = label_transforms_epa 175 | elif args.label_transform == "cic": 176 | label_transform = label_transform_cic 177 | elif args.label_transform == "naip_5cls": 178 | label_transform = label_transform_naip5cls 179 | elif args.label_transform == "naip_4cls": 180 | label_transform = label_transform_4cls 181 | elif args.label_transform == "uvm": 182 | label_transform = labels_transform_uvm 183 | elif args.label_transform == "uvm8cls": 184 | label_transform = labels_transform_uvm_8cls 185 | else: 186 | raise ValueError("Invalid label transform") 187 | 188 | dataset = StreamingGeospatialDataset( 189 | imagery_fns=image_fns, 190 | label_fns=label_fns, 191 | groups=groups, 192 | chip_size=CHIP_SIZE, 193 | num_chips_per_tile=args.num_chips, 194 | windowed_sampling=False, 195 | verbose=True, 196 | image_transform=image_transforms, 197 | label_transform=label_transform, 198 | nodata_check=nodata_check, 199 | ) 200 | 201 | dataloader = torch.utils.data.DataLoader( 202 | dataset, 203 | batch_size=args.batch_size, 204 | num_workers=NUM_WORKERS, 205 | pin_memory=True, 206 | ) 207 | 208 | dataset_val = StreamingGeospatialDataset( 209 | imagery_fns=image_fns_val, 210 | label_fns=label_fns_val, 211 | groups=groups, 212 | chip_size=CHIP_SIZE, 213 | num_chips_per_tile=args.num_chips, 214 | windowed_sampling=False, 215 | verbose=True, 216 | image_transform=image_transforms, 217 | label_transform=label_transform, 218 | nodata_check=nodata_check, 219 | ) 220 | 221 | dataloader_val = torch.utils.data.DataLoader( 222 | dataset_val, 223 | batch_size=args.batch_size, 224 | num_workers=NUM_WORKERS, 225 | pin_memory=True, 226 | ) 227 | 228 | num_training_batches_per_epoch = int( 229 | len(image_fns) * args.num_chips / args.batch_size 230 | ) 231 | print( 232 | "We will be training with %d batches per epoch" 233 | % (num_training_batches_per_epoch) 234 | ) 235 | 236 | num_val_batches_per_epoch = int( 237 | len(image_fns_val) * args.num_chips / args.batch_size 238 | ) 239 | print( 240 | "We will be validating with %d batches per epoch" % (num_val_batches_per_epoch) 241 | ) 242 | 243 | # ------------------- 244 | # Setup training 245 | # ------------------- 246 | if args.model == "unet": 247 | model = models.get_unet(classes=args.num_classes) 248 | elif args.model == "unet2": 249 | model = models.get_unet2(n_classes=args.num_classes) 250 | elif args.model == "fcn": 251 | model = models.get_fcn(num_output_classes=args.num_classes) 252 | elif args.model == "deeplabv3plus": 253 | model = models.get_deeplabv3plus(n_classes=args.num_classes) 254 | else: 255 | raise ValueError("Invalid model") 256 | 257 | model = model.to(device) 258 | optimizer = optim.AdamW(model.parameters(), lr=0.001, amsgrad=True) 259 | # criterion = nn.CrossEntropyLoss() 260 | criterion 
= smp.losses.FocalLoss(mode="multiclass") 261 | scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, "min") 262 | 263 | print("Model has %d parameters" % (utils.count_parameters(model))) 264 | 265 | # ------------------- 266 | # Model training 267 | # ------------------- 268 | training_task_losses = [] 269 | num_times_lr_dropped = 0 270 | model_checkpoints = [] 271 | val_task_losses = [] 272 | temp_model_fn = os.path.join(args.output_dir, "most_recent_model.pt") 273 | 274 | for epoch in range(args.num_epochs): 275 | print("on epoch number: ", epoch) 276 | lr = utils.get_lr(optimizer) 277 | 278 | training_losses = utils.fit( 279 | model, 280 | device, 281 | dataloader, 282 | num_training_batches_per_epoch, 283 | optimizer, 284 | criterion, 285 | epoch, 286 | ) 287 | scheduler.step(training_losses[0]) 288 | 289 | model_checkpoints.append(copy.deepcopy(model.state_dict())) 290 | if args.save_most_recent: 291 | torch.save(model.state_dict(), temp_model_fn) 292 | 293 | if utils.get_lr(optimizer) < lr: 294 | num_times_lr_dropped += 1 295 | print("") 296 | print("Learning rate dropped") 297 | print("") 298 | training_task_losses.append(training_losses[0]) 299 | run.log("loss", training_losses[0]) 300 | if num_times_lr_dropped == 4: 301 | break 302 | 303 | # Run Validation 304 | validation_losses = utils.evaluate( 305 | model, 306 | device, 307 | dataloader_val, 308 | num_val_batches_per_epoch, 309 | criterion, 310 | epoch, 311 | ) 312 | val_task_losses.append(validation_losses[0]) 313 | run.log("loss", validation_losses[0]) 314 | 315 | num_classes = args.num_classes # to-do fix 316 | per_class_f1, global_f1 = utils.score_batch( 317 | model, device, dataloader_val, num_val_batches_per_epoch, num_classes 318 | ) 319 | run.log("per_class_f1_val", per_class_f1) 320 | run.log("global_f1_val", global_f1) 321 | 322 | # ------------------- 323 | # Save everything 324 | # ------------------- 325 | save_obj = { 326 | "args": args, 327 | "training_task_losses": training_task_losses, 328 | "checkpoints": model_checkpoints, 329 | } 330 | 331 | save_obj_fn = "results.pt" 332 | out_path = os.path.join(args.output_dir, save_obj_fn) 333 | run.log("out_path", out_path) 334 | with open(os.path.join(args.output_dir, save_obj_fn), "wb") as f: 335 | torch.save(save_obj, f) 336 | 337 | 338 | if __name__ == "__main__": 339 | main() 340 | -------------------------------------------------------------------------------- /src/transforms_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import utils 4 | 5 | 6 | def label_transforms_naip(labels, group=None): 7 | labels = np.array(labels).astype(np.int64) 8 | labels = np.where(labels == 14, 0, labels) # to no data 9 | labels = np.where(labels == 15, 0, labels) # to no data 10 | labels = np.where(labels == 13, 0, labels) # to no data 11 | labels = np.where(labels == 10, 3, labels) # to tree canopy 12 | labels = np.where(labels == 11, 3, labels) # to tree canopy 13 | labels = np.where(labels == 12, 3, labels) # to tree canopy 14 | labels = torch.from_numpy(labels) 15 | return labels 16 | 17 | 18 | def label_transforms_epa(labels, group=None): 19 | labels = np.array(labels).astype(np.int64) 20 | labels_new = np.copy(labels) 21 | for k, v in utils.epa_label_dict.items(): 22 | labels_new[labels == k] = v 23 | labels_new = torch.from_numpy(labels_new) 24 | return labels_new 25 | 26 | 27 | def label_transform_cic(labels, group=None): 28 | labels = np.array(labels).astype(np.int64) 29 | 
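# Descriptive note (added): every transform in this module follows the same
# dictionary-remap pattern: copy the raw label raster, then map each source
# class id to a contiguous 0..N-1 training index using a lookup dict from
# utils (here utils.cic_label_dict). Working on a copy prevents a value that
# was already remapped from being remapped again by a later key.
# Illustrative example (not part of the pipeline): with cic_label_dict
# {1: 0, ..., 8: 7}, an input np.array([1, 3, 8]) becomes array([0, 2, 7]).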
labels_new = np.copy(labels) 30 | for k, v in utils.cic_label_dict.items(): 31 | labels_new[labels == k] = v 32 | labels_new = torch.from_numpy(labels_new) 33 | return labels_new 34 | 35 | 36 | def label_transform_naip5cls(labels, group=None): 37 | labels = np.array(labels).astype(np.int64) 38 | labels_new = np.copy(labels) 39 | for k, v in utils.naip_5cls.items(): 40 | labels_new[labels == k] = v 41 | labels_new = torch.from_numpy(labels_new) 42 | return labels_new 43 | 44 | 45 | def label_transform_4cls(labels, group=None): 46 | labels = np.array(labels).astype(np.int64) 47 | labels_new = np.copy(labels) 48 | for k, v in utils.naip_4cls.items(): 49 | labels_new[labels == k] = v 50 | labels_new = torch.from_numpy(labels_new) 51 | return labels_new 52 | 53 | 54 | def labels_transform_uvm(labels, group=None): 55 | labels = np.array(labels).astype(np.int64) 56 | labels_new = np.copy(labels) 57 | for k, v in utils.uvm_7cls.items(): 58 | labels_new[labels == k] = v 59 | labels_new = torch.from_numpy(labels_new) 60 | return labels_new 61 | 62 | 63 | def labels_transform_uvm_8cls(labels, group=None): 64 | labels = np.array(labels).astype(np.int64) 65 | labels_new = np.copy(labels) 66 | for k, v in utils.uvm_8cls.items(): 67 | labels_new[labels == k] = v 68 | labels_new = torch.from_numpy(labels_new) 69 | return labels_new 70 | 71 | 72 | def image_transforms(img, group=None): 73 | img = img / 255.0 74 | img = np.rollaxis(img, 2, 0).astype(np.float32) 75 | img = torch.from_numpy(img) 76 | return img 77 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import numpy as np 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | from tqdm import tqdm 10 | 11 | from sklearn.metrics import f1_score 12 | 13 | NAIP_2013_MEANS = np.array([117.00, 130.75, 122.50, 159.30]) 14 | NAIP_2013_STDS = np.array([38.16, 36.68, 24.30, 66.22]) 15 | NAIP_2017_MEANS = np.array([72.84, 86.83, 76.78, 130.82]) 16 | NAIP_2017_STDS = np.array([41.78, 34.66, 28.76, 58.95]) 17 | NAIP_NY_2017_MEANS = np.array([95.31, 129.95, 127.77, 184.45]) 18 | NAIP_NY_2017_STDS = np.array([40.95, 34.71, 21.07, 51.11]) 19 | NAIP_PA_2017_MEANS = np.array([114.38, 140.45, 110.13, 177.38]) 20 | NAIP_PA_2017_STDS = np.array([37.401, 34.29, 23.77, 45.98]) 21 | NAIP_DE_2013_MEANS = np.array([116.74, 132.48, 127.61, 175.80]) 22 | NAIP_DE_2013_STDS = np.array([40.22, 34.22, 23.86, 60.58]) 23 | NAIP_VA_2018_MEANS = np.array([92.70, 104.10, 75.43, 118.62]) 24 | NAIP_VA_2018_STDS = np.array([42.56, 42.91, 31.27, 59.34]) 25 | NAIP_WV_2018_MEANS = np.array([109.36, 123.73, 105.63, 117.47]) 26 | NAIP_WV_2018_STDS = np.array([44.79, 36.71, 31.72, 38.93]) 27 | NAIP_MD_2018_MEANS = np.array([103.84, 108.31, 88.07, 113.36]) 28 | NAIP_MD_2018_STDS = np.array([46.19, 44.21, 37.07, 60.50]) 29 | NAIP_MD_2017_MEANS = np.array([73.31, 86.94, 77.38, 126.26]) 30 | NAIP_MD_2017_STDS = np.array([42.22, 35.90, 30.23, 60.49]) 31 | NAIP_MD_2015_MEANS = np.array([116.05, 126.48, 117.93, 158.21]) 32 | NAIP_MD_2015_STDS = np.array([38.08, 32.87, 27.07, 83.22]) 33 | NAIP_MD_2011_MEANS = np.array([108.35, 126.13, 121.83, 176.64]) 34 | NAIP_MD_2011_STDS = np.array([39.94, 30.60, 25.39, 45.69]) 35 | NAIP_MD_Merged_MEANS = np.array([100.3875, 111.965, 101.3025, 143.6175]) 36 | NAIP_MD_Merged_STDS = np.array([58.71, 59.7775, 54.05, 95.2125]) 37 | NAIP_VA_Merged_MEANS = 
np.array([102.6326, 118.5233, 104.2282, 145.7618]) 38 | NAIP_VA_Merged_STDS = np.array([39.6812, 37.2886, 32.8185, 50.1053]) 39 | NAIP_NY_Merged_MEANS = np.array([97.2829, 122.9519, 106.3612, 169.0045]) 40 | NAIP_NY_Merged_STDS = np.array([39.7267, 36.6849, 25.8357, 52.4304]) 41 | fresno_ca_means = np.array([132.70, 127.63, 109.55, 147.25]) 42 | fresno_ca_stds = np.array([45.21, 38.478, 34.65, 35.70]) 43 | la_ca_means = np.array([115.06, 114.08, 105.04, 123.96]) 44 | la_ca_stds = np.array([56.31, 49.89, 44.26, 48.78]) 45 | sanoma_ca_means = np.array([93.69, 101.96, 90.17, 126.93]) 46 | sanoma_ca_stds = np.array([49.12, 39.83, 33.27, 54.14]) 47 | 48 | 49 | NLCD_CLASSES = [ 50 | 0, 51 | 11, 52 | 12, 53 | 21, 54 | 22, 55 | 23, 56 | 24, 57 | 31, 58 | 41, 59 | 42, 60 | 43, 61 | 52, 62 | 71, 63 | 81, 64 | 82, 65 | 90, 66 | 95, 67 | ] # 16 classes + 1 nodata class ("0"). Note that "12" is "Perennial Ice/Snow" and is not present in Maryland. 68 | 69 | NLCD_CLASS_COLORMAP = { # Copied from the emebedded color table in the NLCD data files 70 | 0: (0, 0, 0, 255), 71 | 11: (70, 107, 159, 255), 72 | 12: (209, 222, 248, 255), 73 | 21: (222, 197, 197, 255), 74 | 22: (217, 146, 130, 255), 75 | 23: (235, 0, 0, 255), 76 | 24: (171, 0, 0, 255), 77 | 31: (179, 172, 159, 255), 78 | 41: (104, 171, 95, 255), 79 | 42: (28, 95, 44, 255), 80 | 43: (181, 197, 143, 255), 81 | 52: (204, 184, 121, 255), 82 | 71: (223, 223, 194, 255), 83 | 81: (220, 217, 57, 255), 84 | 82: (171, 108, 40, 255), 85 | 90: (184, 217, 235, 255), 86 | 95: (108, 159, 184, 255), 87 | } 88 | 89 | NLCD_IDX_COLORMAP = {idx: NLCD_CLASS_COLORMAP[c] for idx, c in enumerate(NLCD_CLASSES)} 90 | LC_CLASSES = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13] 91 | LC_CLASS_COLORMAP = { 92 | 0: (0, 0, 0, 0), 93 | 1: (0, 197, 255, 255), 94 | 2: (0, 168, 132, 255), 95 | 3: (38, 115, 0, 255), 96 | 4: (76, 230, 0, 255), 97 | 5: (163, 255, 115, 255), 98 | 6: (255, 170, 0, 255), 99 | 7: (255, 0, 0, 255), 100 | 8: (156, 156, 156, 255), 101 | 9: (0, 0, 0, 255), 102 | 10: (115, 115, 0, 255), 103 | 11: (230, 230, 0, 255), 104 | 12: (255, 255, 115, 255), 105 | 13: (197, 0, 255, 255), 106 | } 107 | 108 | LC_CLASSES_TREE = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] 109 | LC_CLASS_TREE_COLORMAP = { 110 | 0: (252, 232, 3), 111 | 1: (0, 197, 255, 255), 112 | 2: (0, 168, 132, 255), 113 | 3: (38, 115, 0, 255), 114 | 4: (76, 230, 0, 255), 115 | 5: (163, 255, 115, 255), 116 | 6: (255, 170, 0, 255), 117 | 7: (255, 0, 0, 255), 118 | 8: (156, 156, 156, 255), 119 | 9: (0, 0, 0, 255), 120 | } 121 | 122 | 123 | LC_COLORMAP = {idx: LC_CLASS_COLORMAP[c] for idx, c in enumerate(LC_CLASSES)} 124 | 125 | 126 | LC_TREE_COLORMAP = { 127 | idx: LC_CLASS_TREE_COLORMAP[c] for idx, c in enumerate(LC_CLASSES_TREE) 128 | } 129 | 130 | EPA_CLASSES = [0, 10, 20, 30, 40, 52, 70, 80, 82, 91, 92] 131 | 132 | epa_label_dict = { 133 | 0: 0, 134 | 10: 1, 135 | 20: 2, 136 | 30: 3, 137 | 40: 4, 138 | 52: 5, 139 | 70: 6, 140 | 80: 7, 141 | 82: 8, 142 | 91: 9, 143 | 92: 10, 144 | } 145 | 146 | CIC_CLASSES = [1, 2, 3, 4, 5, 6, 7, 8] 147 | 148 | cic_label_dict = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7} 149 | 150 | naip_5cls = {1: 0, 2: 0, 3: 1, 10: 1, 11: 1, 12: 1, 4: 2, 5: 2, 6: 3, 7: 4, 8: 4, 9: 4} 151 | 152 | naip_4cls = {1: 0, 2: 0, 3: 1, 10: 1, 11: 1, 12: 1, 4: 2, 5: 2, 6: 2, 7: 3, 8: 3, 9: 3} 153 | 154 | uvm_7cls = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6} 155 | 156 | uvm_8cls = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7} 157 | 158 | 159 | def get_nlcd_class_to_idx_map(): 160 | nlcd_label_to_idx_map = 
[] 161 | idx = 0 162 | for i in range(NLCD_CLASSES[-1] + 1): 163 | if i in NLCD_CLASSES: 164 | nlcd_label_to_idx_map.append(idx) 165 | idx += 1 166 | else: 167 | nlcd_label_to_idx_map.append(0) 168 | nlcd_label_to_idx_map = np.array(nlcd_label_to_idx_map).astype(np.int64) 169 | return nlcd_label_to_idx_map 170 | 171 | 172 | NLCD_CLASS_TO_IDX_MAP = ( 173 | get_nlcd_class_to_idx_map() 174 | ) # I do this computation on import for illustration (this could instead be a length 96 vector that is hardcoded here) 175 | 176 | 177 | NLCD_IDX_TO_REDUCED_LC_MAP = np.array( 178 | [ 179 | 4, # 0 No data 0 180 | 0, # 1 Open Water 181 | 4, # 2 Ice/Snow 182 | 2, # 3 Developed Open Space 183 | 3, # 4 Developed Low Intensity 184 | 3, # 5 Developed Medium Intensity 185 | 3, # 6 Developed High Intensity 186 | 3, # 7 Barren Land 187 | 1, # 8 Deciduous Forest 188 | 1, # 9 Evergreen Forest 189 | 1, # 10 Mixed Forest 190 | 1, # 11 Shrub/Scrub 191 | 2, # 12 Grassland/Herbaceous 192 | 2, # 13 Pasture/Hay 193 | 2, # 14 Cultivated Crops 194 | 1, # 15 Woody Wetlands 195 | 1, # 16 Emergent Herbaceious Wetlands 196 | ] 197 | ) 198 | 199 | NLCD_IDX_TO_REDUCED_LC_ACCUMULATOR = np.array( 200 | [ 201 | [0, 0, 0, 0, 1], # 0 No data 0 202 | [1, 0, 0, 0, 0], # 1 Open Water 203 | [0, 0, 0, 0, 1], # 2 Ice/Snow 204 | [0, 0, 0, 0, 0], # 3 Developed Open Space 205 | [0, 0, 0, 0, 0], # 4 Developed Low Intensity 206 | [0, 0, 0, 1, 0], # 5 Developed Medium Intensity 207 | [0, 0, 0, 1, 0], # 6 Developed High Intensity 208 | [0, 0, 0, 0, 0], # 7 Barren Land 209 | [0, 1, 0, 0, 0], # 8 Deciduous Forest 210 | [0, 1, 0, 0, 0], # 9 Evergreen Forest 211 | [0, 1, 0, 0, 0], # 10 Mixed Forest 212 | [0, 1, 0, 0, 0], # 11 Shrub/Scrub 213 | [0, 0, 1, 0, 0], # 12 Grassland/Herbaceous 214 | [0, 0, 1, 0, 0], # 13 Pasture/Hay 215 | [0, 0, 1, 0, 0], # 14 Cultivated Crops 216 | [0, 1, 0, 0, 0], # 15 Woody Wetlands 217 | [0, 1, 0, 0, 0], # 16 Emergent Herbaceious Wetlands 218 | ] 219 | ) 220 | 221 | 222 | class Timer: 223 | """A wrapper class for printing out what is running and how long it took. 
224 | Use as: 225 | ``` 226 | with utils.Timer("running stuff"): 227 | # do stuff 228 | ``` 229 | This will output: 230 | ``` 231 | Starting 'running stuff' 232 | # any output from 'running stuff' 233 | Finished 'running stuff' in 12.45 seconds 234 | ``` 235 | """ 236 | 237 | def __init__(self, message): 238 | self.message = message 239 | 240 | def __enter__(self): 241 | self.tic = float(time.time()) 242 | print("Starting '%s'" % (self.message)) 243 | 244 | def __exit__(self, type, value, traceback): 245 | print("Finished '%s' in %0.4f seconds" % (self.message, time.time() - self.tic)) 246 | 247 | 248 | def fit(model, device, data_loader, num_batches, optimizer, criterion, epoch, memo=""): 249 | model.train() 250 | losses = [] 251 | tic = time.time() 252 | for batch_idx, (data, targets) in tqdm( 253 | enumerate(data_loader), total=num_batches, file=sys.stdout 254 | ): 255 | data = data.to(device) 256 | targets = targets.to(device) 257 | optimizer.zero_grad() 258 | # error Expected more than 1 value per channel when training 259 | # check https://discuss.pytorch.org/t/error-expected-more-than-1-value-per-channel-when-training/26274 260 | model.eval() 261 | outputs = model(data) 262 | loss = criterion(outputs, targets) 263 | losses.append(loss.item()) 264 | loss.backward() 265 | optimizer.step() 266 | 267 | avg_loss = np.mean(losses) 268 | print( 269 | "[{}] Training Epoch: {}\t Time elapsed: {:.2f} seconds\t Loss: {:.2f}".format( 270 | memo, epoch, time.time() - tic, avg_loss 271 | ), 272 | end="", 273 | ) 274 | print("") 275 | return [avg_loss] 276 | 277 | 278 | def evaluate(model, device, data_loader, num_batches, criterion, epoch, memo=""): 279 | model.eval() 280 | losses = [] 281 | tic = time.time() 282 | for batch_idx, (data, targets) in tqdm( 283 | enumerate(data_loader), total=num_batches, file=sys.stdout 284 | ): 285 | data = data.to(device) 286 | targets = targets.to(device) 287 | with torch.no_grad(): 288 | outputs = model(data) 289 | loss = criterion(outputs, targets) 290 | losses.append(loss.item()) 291 | avg_loss = np.mean(losses) 292 | 293 | print( 294 | "[{}] Validation Epoch: {}\t Time elapsed: {:.2f} seconds\t Loss: {:.2f}".format( 295 | memo, epoch, time.time() - tic, avg_loss 296 | ), 297 | end="", 298 | ) 299 | print("") 300 | return [avg_loss] 301 | 302 | 303 | def score(model, device, data_loader, num_batches): 304 | model.eval() 305 | 306 | num_classes = model.module.segmentation_head[0].out_channels 307 | num_samples = len(data_loader.dataset) 308 | predictions = np.zeros((num_samples, num_classes), dtype=np.float32) 309 | idx = 0 310 | for batch_idx, (data, target) in enumerate(data_loader): 311 | data = data.to(device) 312 | with torch.no_grad(): 313 | output = F.softmax(model(data)) 314 | batch_size = data.shape[0] 315 | predictions[idx : idx + batch_size] = output.cpu().numpy() 316 | idx += batch_size 317 | return predictions 318 | 319 | 320 | def score2(model, device, data_loader, num_batches, num_classes): 321 | model.eval() 322 | 323 | predictions = [] 324 | ground_truth = [] 325 | idx = 0 326 | for batch_idx, (data, target) in enumerate(data_loader): 327 | data = data.to(device) 328 | target = target.to(device) 329 | with torch.no_grad(): 330 | output = F.softmax(model(data)) 331 | output = output.cpu().numpy() # (32, 10, 256, 256) 332 | target = target.cpu().numpy() 333 | 334 | for i, x in enumerate(output): 335 | predictions.append(x.argmax(axis=0).astype(np.uint8)) 336 | ground_truth.append(target[i]) 337 | 338 | # to this per batch instead of all at once 
to fix memory errors 339 | preds_f = np.reshape(np.array(predictions), [-1]) 340 | gt_f = np.reshape(np.array(ground_truth), [-1]) 341 | per_class_f1 = f1_score(gt_f, preds_f, average=None) 342 | global_f1 = f1_score(gt_f, preds_f, average="weighted") 343 | 344 | return per_class_f1, global_f1 345 | 346 | 347 | def score_batch(model, device, data_loader, num_batches, num_classes): 348 | model.eval() 349 | 350 | batch_per_class_f1 = [] 351 | batch_global_f1 = [] 352 | idx = 0 353 | for batch_idx, (data, target) in enumerate(data_loader): 354 | predictions = [] 355 | ground_truth = [] 356 | data = data.to(device) 357 | target = target.to(device) 358 | with torch.no_grad(): 359 | output = F.softmax(model(data)) 360 | output = output.cpu().numpy() 361 | target = target.cpu().numpy() 362 | 363 | print("output") 364 | print(output) 365 | for i, x in enumerate(output): 366 | predictions.append(x.argmax(axis=0).astype(np.uint8)) 367 | ground_truth.append(target[i]) 368 | 369 | preds_f = np.reshape(np.array(predictions), [-1]) 370 | print("ground truth") 371 | print(ground_truth) 372 | gt_f = np.reshape(np.array(ground_truth), [-1]) 373 | 374 | missing_labels = np.setdiff1d(list(np.arange(num_classes)), np.unique(gt_f)) 375 | 376 | per_class_f1 = f1_score(gt_f, preds_f, average=None) 377 | 378 | per_class_f1_final = np.zeros(num_classes) 379 | # add nan for missing label classes 380 | for x in missing_labels: 381 | per_class_f1_final[x] = np.nan 382 | print("gt_f") 383 | print(gt_f) 384 | print("np unique") 385 | print(np.unique(gt_f)) 386 | for i, gt_class in enumerate(np.unique(gt_f)): 387 | per_class_f1_final[gt_class] = per_class_f1[i] 388 | 389 | global_f1 = f1_score(gt_f, preds_f, average="weighted") 390 | 391 | batch_per_class_f1.append(per_class_f1_final) 392 | batch_global_f1.append(global_f1) 393 | 394 | batch_per_class_f1_mean = np.nanmean(batch_per_class_f1, axis=0) 395 | batch_global_f1_mean = np.mean(batch_global_f1) 396 | return batch_per_class_f1_mean, batch_global_f1_mean 397 | 398 | 399 | def get_lr(optimizer): 400 | for param_group in optimizer.param_groups: 401 | return param_group["lr"] 402 | 403 | 404 | def count_parameters(model): 405 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 406 | -------------------------------------------------------------------------------- /train_azure/create_compute-cpu.py: -------------------------------------------------------------------------------- 1 | import os 2 | from azureml.core import Workspace 3 | from azureml.core.compute import ComputeTarget, AmlCompute 4 | from azureml.core.compute_target import ComputeTargetException 5 | from azureml.core.authentication import InteractiveLoginAuthentication 6 | 7 | 8 | # try: 9 | AZ_TENANT_ID = os.getenv("AZ_TENANT_ID") 10 | interactive_auth = InteractiveLoginAuthentication(tenant_id=AZ_TENANT_ID) 11 | 12 | ws = Workspace.from_config() # This automatically looks for a directory .azureml 13 | 14 | # Choose a name for your CPU cluster 15 | # memory optimized: https://docs.microsoft.com/en-us/azure/virtual-machines/dv2-dsv2-series-memory 16 | AZ_CPU_CLUSTER_NAME = os.getenv("AZ_CPU_CLUSTER_NAME") 17 | 18 | # Verify that the cluster does not exist already 19 | try: 20 | cpu_cluster = ComputeTarget(workspace=ws, name=AZ_CPU_CLUSTER_NAME) 21 | print("Found existing cluster, use it.") 22 | except ComputeTargetException: 23 | compute_config = AmlCompute.provisioning_configuration( 24 | vm_size="Standard_DS12_v2", 25 | idle_seconds_before_scaledown=1200, 26 | min_nodes=0, 27 | 
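# (added note) min_nodes=0 lets the cluster scale down to zero nodes, and zero
# cost, when idle; idle_seconds_before_scaledown=1200 keeps nodes warm for 20
# minutes between runs before releasing them.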
max_nodes=3, 28 | ) 29 | cpu_cluster = ComputeTarget.create(ws, AZ_CPU_CLUSTER_NAME, compute_config) 30 | 31 | cpu_cluster.wait_for_completion(show_output=True) 32 | -------------------------------------------------------------------------------- /train_azure/create_compute-gpu.py: -------------------------------------------------------------------------------- 1 | import os 2 | from azureml.core import Workspace 3 | from azureml.core.compute import ComputeTarget, AmlCompute 4 | from azureml.core.compute_target import ComputeTargetException 5 | from azureml.core.authentication import InteractiveLoginAuthentication 6 | 7 | AZ_TENANT_ID = os.getenv("AZ_TENANT_ID") 8 | interactive_auth = InteractiveLoginAuthentication(tenant_id=AZ_TENANT_ID) 9 | AZ_SUB_ID = os.getenv("AZ_SUB_ID") 10 | 11 | ws = Workspace.from_config()  # This automatically looks for a directory .azureml 12 | 13 | AZ_GPU_CLUSTER_NAME = os.getenv("AZ_GPU_CLUSTER_NAME") 14 | 15 | # Verify that the cluster does not exist already 16 | try: 17 | gpu_cluster = ComputeTarget(workspace=ws, name=AZ_GPU_CLUSTER_NAME) 18 | print("Found existing cluster, use it.") 19 | except ComputeTargetException: 20 | # https://docs.microsoft.com/en-us/azure/machine-learning/how-to-create-attach-compute-cluster?tabs=python#managed-identity 21 | print("Creating new gpu cluster...") 22 | compute_config = AmlCompute.provisioning_configuration( 23 | vm_size="Standard_NC6", 24 | idle_seconds_before_scaledown=1200, 25 | min_nodes=0, 26 | max_nodes=3, 27 | ) 28 | gpu_cluster = ComputeTarget.create(ws, AZ_GPU_CLUSTER_NAME, compute_config) 29 | 30 | gpu_cluster.wait_for_completion(show_output=True) 31 | -------------------------------------------------------------------------------- /train_azure/create_workspace.py: -------------------------------------------------------------------------------- 1 | """ 2 | Setting up a workspace on Azure ML studio 3 | """ 4 | import os 5 | from azureml.core import Workspace 6 | from azureml.core.authentication import InteractiveLoginAuthentication 7 | 8 | 9 | # get your TENANT_ID from "az account show --output table" 10 | # get your "subscription_id" from "az account list --output table" 11 | AZ_TENANT_ID = os.getenv("AZ_TENANT_ID") 12 | AZ_SUB_ID = os.getenv("AZ_SUB_ID") 13 | 14 | interactive_auth = InteractiveLoginAuthentication(tenant_id=AZ_TENANT_ID) 15 | 16 | 17 | AZ_WORKSPACE_NAME = os.getenv("AZ_WORKSPACE_NAME") 18 | AZ_RESOURCE_GROUP = os.getenv("AZ_RESOURCE_GROUP") 19 | AZ_REGION = os.getenv("AZ_REGION") 20 | 21 | ws = Workspace.create( 22 | name=AZ_WORKSPACE_NAME,  # provide a name for your workspace 23 | subscription_id=AZ_SUB_ID,  # provide your subscription ID 24 | resource_group=AZ_RESOURCE_GROUP,  # provide a resource group name 25 | create_resource_group=True, 26 | location=AZ_REGION, 27 | )  # For example: 'westeurope' or 'eastus2' or 'westus2' or 'southeastasia'. 
28 | 29 | # write out the workspace details to a configuration file: .azureml/config.json 30 | ws.write_config(path=".azureml") 31 | -------------------------------------------------------------------------------- /train_azure/requirements.txt: -------------------------------------------------------------------------------- 1 | azureml.core 2 | -------------------------------------------------------------------------------- /train_azure/run_cls_distrib.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | from azureml.core import Workspace 3 | from azureml.core import Experiment 4 | from azureml.core import Environment 5 | from azureml.core import ScriptRunConfig 6 | from azureml.core.authentication import InteractiveLoginAuthentication 7 | 8 | AZ_TENANT_ID = os.getenv("AZ_TENANT_ID") 9 | AZ_CPU_CLUSTER_NAME = os.getenv("AZ_CPU_CLUSTER_NAME") 10 | if __name__ == "__main__": 11 | interactive_auth = InteractiveLoginAuthentication(tenant_id=AZ_TENANT_ID) 12 | try: 13 | ws = Workspace.from_config() 14 | except: 15 | print("No config found. Please create a workspace before running") 16 | sys.exit(0) 17 | 18 | experiment = Experiment(workspace=ws, name="sample-exp-fortcollins") 19 | config = ScriptRunConfig( 20 | source_directory="./src", 21 | script="cls_distribution.py", 22 | compute_target=AZ_CPU_CLUSTER_NAME, 23 | arguments=[ 24 | "--input_fn", 25 | "data/fort-collins_test.csv", 26 | "--num_classes", 27 | 7, 28 | "--label_transform", 29 | "uvm", # either 'naip or epa' 30 | "--output_dir", 31 | "./outputs", # TBD don't actually want to use outputdir 32 | ], 33 | ) 34 | 35 | # set up pytorch environment 36 | pytorch_env = Environment.from_conda_specification( 37 | name="lulc-pytorch-env", file_path="./pytorch-env.yml" 38 | ) 39 | 40 | # This env variable needs to be set for rasterio to open remote files 41 | # https://github.com/mapbox/rasterio/issues/1289 42 | pytorch_env.environment_variables[ 43 | "CURL_CA_BUNDLE" 44 | ] = "/etc/ssl/certs/ca-certificates.crt" 45 | 46 | config.run_config.environment = pytorch_env 47 | 48 | run = experiment.submit(config) 49 | 50 | aml_url = run.get_portal_url() 51 | print(aml_url) 52 | -------------------------------------------------------------------------------- /train_azure/run_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | from azureml.core import Workspace 3 | from azureml.core import Experiment 4 | from azureml.core import Environment 5 | from azureml.core import ScriptRunConfig 6 | from azureml.core import Dataset 7 | from azureml.core.authentication import InteractiveLoginAuthentication 8 | 9 | 10 | AZ_TENANT_ID = os.getenv("AZ_TENANT_ID") 11 | AZ_GPU_CLUSTER_NAME = os.getenv("AZ_GPU_CLUSTER_NAME") 12 | 13 | if __name__ == "__main__": 14 | interactive_auth = InteractiveLoginAuthentication(tenant_id=AZ_TENANT_ID) 15 | ws = Workspace.from_config() 16 | experiment = Experiment(workspace=ws, name="sample-exp-fortcollins") 17 | 18 | # find the experiment Run ID through your Azure portal https://ml.azure.com/experiments/ 19 | 20 | config = ScriptRunConfig( 21 | source_directory="./src", 22 | script="eval.py", 23 | compute_target=AZ_GPU_CLUSTER_NAME, 24 | arguments=[ 25 | "--model_fn", 26 | "model/most_recent_model.pt", 27 | "--input_fn", 28 | "data/fort-collins_test.csv", 29 | "--output_dir", 30 | "./outputs", 31 | "--num_classes", 32 | 7, 33 | "--label_transform", 34 | "uvm", 35 | "--model", 36 | "deeplabv3plus", 37 | ], 38 | ) 39 | 40 | # 
set up pytorch environment 41 | pytorch_env = Environment.from_conda_specification( 42 | name="lulc-pytorch-env", file_path="./pytorch-env.yml" 43 | ) 44 | 45 | # Specify a GPU base image 46 | pytorch_env.docker.enabled = True 47 | pytorch_env.docker.base_image = ( 48 | "mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.1-cudnn7-ubuntu18.04" 49 | ) 50 | 51 | config.run_config.environment = pytorch_env 52 | 53 | run = experiment.submit(config) 54 | 55 | aml_url = run.get_portal_url() 56 | print(aml_url) 57 | -------------------------------------------------------------------------------- /train_azure/run_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | from azureml.core import Workspace 3 | from azureml.core import Experiment 4 | from azureml.core import Environment 5 | from azureml.core import ScriptRunConfig 6 | 7 | AZ_GPU_CLUSTER_NAME = os.getenv("AZ_GPU_CLUSTER_NAME") 8 | 9 | if __name__ == "__main__": 10 | ws = Workspace.from_config() 11 | experiment = Experiment(workspace=ws, name="sample-exp-fortcollins") 12 | config = ScriptRunConfig( 13 | source_directory="./src", 14 | script="train.py", 15 | compute_target=AZ_GPU_CLUSTER_NAME, 16 | arguments=[ 17 | "--input_fn", 18 | "data/fort-collins_train.csv", 19 | "--input_fn_val", 20 | "data/fort-collins_val.csv", 21 | "--output_dir", 22 | "./outputs", 23 | "--save_most_recent", 24 | "--num_epochs", 25 | 20, 26 | "--num_chips", 27 | 200, 28 | "--num_classes", 29 | 7, 30 | "--label_transform", 31 | "uvm", 32 | "--model", 33 | "deeplabv3plus", 34 | ], 35 | ) 36 | 37 | # set up pytorch environment 38 | pytorch_env = Environment.from_conda_specification( 39 | name="lulc-pytorch-env", file_path="./pytorch-env.yml" 40 | ) 41 | 42 | # Specify a GPU base image 43 | pytorch_env.docker.enabled = True 44 | pytorch_env.docker.base_image = ( 45 | "mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.1-cudnn7-ubuntu18.04" 46 | ) 47 | 48 | config.run_config.environment = pytorch_env 49 | 50 | run = experiment.submit(config) 51 | 52 | aml_url = run.get_portal_url() 53 | print(aml_url) 54 | -------------------------------------------------------------------------------- /train_azure/run_seeddata_creation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example test script to deploy seed data creation to Azure Machine Learning 3 | """ 4 | import os, sys 5 | from azureml.core import Workspace 6 | from azureml.core import Experiment 7 | from azureml.core import Environment 8 | from azureml.core import ScriptRunConfig 9 | from azureml.core import Dataset 10 | from azureml.core.authentication import InteractiveLoginAuthentication 11 | 12 | 13 | AZ_TENANT_ID = os.getenv("AZ_TENANT_ID") 14 | AZ_GPU_CLUSTER_NAME = os.getenv("AZ_GPU_CLUSTER_NAME") 15 | 16 | if __name__ == "__main__": 17 | interactive_auth = InteractiveLoginAuthentication(tenant_id=AZ_TENANT_ID) 18 | try: 19 | ws = Workspace.from_config() 20 | except Exception: 21 | print("No config found. 
Please create a workspace before running.") 22 | sys.exit(1) 23 | 24 | experiment = Experiment( 25 | workspace=ws, 26 | name="sample-exp-fortcollins", 27 | ) 28 | 29 | config = ScriptRunConfig( 30 | source_directory="./src", 31 | script="seed_data_creation.py", 32 | compute_target=AZ_GPU_CLUSTER_NAME, 33 | arguments=[ 34 | "--input_csv", 35 | "data/fort-collins_test.csv", 36 | "--ckpt_file", 37 | "model/most_recent_model.pt", # replace with the model weights stored on Azure 38 | "--n_classes", 39 | 7, 40 | "--out_npz", 41 | "./outputs/sample-output-fortcollins.npz", 42 | "--model", 43 | "deeplabv3plus", 44 | ], 45 | ) 46 | # set up pytorch environment 47 | pytorch_env = Environment.from_conda_specification( 48 | name="lulc-pytorch-env", file_path="./pytorch-env.yml" 49 | ) 50 | 51 | # Specify a GPU base image 52 | pytorch_env.docker.enabled = True 53 | pytorch_env.docker.base_image = ( 54 | "mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.1-cudnn7-ubuntu18.04" 55 | ) 56 | 57 | config.run_config.environment = pytorch_env 58 | 59 | run = experiment.submit(config) 60 | 61 | aml_url = run.get_portal_url() 62 | print(aml_url) 63 | --------------------------------------------------------------------------------
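The `train_azure` scripts above are driven entirely by environment variables plus the `.azureml/config.json` that `create_workspace.py` writes. Below is a minimal sketch of one way to run them end to end; the exported values are illustrative placeholders (look up your own IDs with `az account show --output table` and `az account list --output table`), and only the variable names and script paths are taken from this repo:

```bash
# Placeholder values; substitute your own tenant, subscription, and resource names
export AZ_TENANT_ID="<tenant-id>"
export AZ_SUB_ID="<subscription-id>"
export AZ_WORKSPACE_NAME="<workspace-name>"
export AZ_RESOURCE_GROUP="<resource-group>"
export AZ_REGION="eastus2"
export AZ_CPU_CLUSTER_NAME="<cpu-cluster-name>"
export AZ_GPU_CLUSTER_NAME="<gpu-cluster-name>"

# 1. Create the workspace; this writes .azureml/config.json used by the other scripts
python train_azure/create_workspace.py

# 2. Provision compute targets
python train_azure/create_compute-cpu.py
python train_azure/create_compute-gpu.py

# 3. Submit experiments (each prints the Azure ML portal URL of its run)
python train_azure/run_cls_distrib.py
python train_azure/run_model.py
python train_azure/run_eval.py
python train_azure/run_seeddata_creation.py
```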