├── .gitignore ├── README.md ├── naip-utils ├── label_rasterize.ipynb ├── naip-label-align.py └── naip_download_pc.ipynb ├── pytorch-env.yml ├── src ├── calculate_image_stats_dir.py ├── chips.py ├── cls_distribution.py ├── dataloaders │ ├── StreamingDatasets.py │ ├── TileDatasets.py │ └── __init__.py ├── embeddings.py ├── eval.py ├── models.py ├── seed_data_creation.py ├── train.py ├── transforms_utils.py └── utils.py └── train_azure ├── create_compute-cpu.py ├── create_compute-gpu.py ├── create_workspace.py ├── requirements.txt ├── run_cls_distrib.py ├── run_eval.py ├── run_model.py └── run_seeddata_creation.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow
98 | __pypackages__/
99 |
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 |
104 | # SageMath parsed files
105 | *.sage.py
106 |
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 |
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 |
120 | # Rope project settings
121 | .ropeproject
122 |
123 | # mkdocs documentation
124 | /site
125 |
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 |
131 | # Pyre type checker
132 | .pyre/
133 |
134 | # pytype static type analyzer
135 | .pytype/
136 |
137 | # Cython debug symbols
138 | cython_debug/
139 |
140 | # Editors
141 | .vscode/
142 |
143 | # Mac/OSX
144 | .DS_Store
145 |
146 | # Windows
147 | Thumbs.db
148 |
149 | # Data
150 | data/
151 | tmp/
152 | model/
153 |
154 | # Input imagery
155 | *.tif
156 |
157 | # Azure
158 | .azureml
159 |
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # PEARL ML Training Pipeline
2 |
3 | This repo contains scripts to manage training data, create the Azure ML stack, and train new models that are compatible with the PEARL Platform. It is based on the work of Caleb Robinson of Microsoft.
4 |
5 | ## Training
6 |
7 | - Monitor experiments and training runs on Azure ML
8 | - Training Repo
9 | - [Training code](https://github.com/developmentseed/pearl-ml-pipeline/blob/main/src/train.py)
10 | - [Evaluation code](https://github.com/developmentseed/pearl-ml-pipeline/blob/main/src/eval.py)
11 |
12 | - [DeepLabv3Plus Architecture](https://github.com/qubvel/segmentation_models.pytorch/blob/master/segmentation_models_pytorch/decoders/deeplabv3/model.py) + [focal loss](https://github.com/qubvel/segmentation_models.pytorch/blob/master/segmentation_models_pytorch/losses/focal.py) seems to be the most promising approach
13 |
14 | ## Evaluation
15 | - We run the model over the test data set and report the global and per-class F1 scores
16 |
17 | ### SEED Data
18 |
19 | **How/Why we create Seed Data**
20 |
21 | - We have seed data for each model so that during retraining the user doesn't have to add samples for each class; we can use the weights/biases from the retraining logistic regression (sklearn) model to update the weights/biases of the deep learning model and then run inference on the GPU
22 | - The retraining seed data should have the same class distribution ratios as the original training data (e.g. 10% water, 50% trees, etc.)
23 | - I've been generating retraining data using the GPU-enabled Azure notebooks (these should ideally be converted into scripts)
24 | - [Seed Data Creation Script](https://github.com/developmentseed/pearl-ml-pipeline/blob/main/src/seed_data_creation.py)
25 |
26 |
27 |
28 | ## Training Dataset Creation
29 |
30 | There are two options to create the training dataset.
31 |
32 | **Option 1**. Feed LULC label data in GeoTIFF format.
33 |
34 | [naip-label-align.py](naip-utils/naip-label-align.py) and [NAIPTileIndex.py](naip-utils/NAIPTileIndex.py) provide functions to:
35 |
36 | _Notes_:
37 | - Install libspatialindex (a dependency of `rtree` that is not installed automatically)
38 | - `brew install spatialindex`
39 | - align given LULC labels to available NAIP imagery tiles on the Azure public Blob;
40 | - filter out nodata tiles;
41 | - create name conventions;
42 | - write them to CSVs for the train, validation and test datasets, split 70:20:10.
43 | - The script will write the tiled label GeoTIFFs into `out_dir`. These files can then be uploaded to Azure Blob Storage.
44 |
45 | These CSVs can then be used directly for model training on AML. Instructions are given in the following sections.
46 |
47 | ```bash
48 | python naip-label-align.py \
49 | --label_tif_path sample.tif \
50 | --out_dir / \
51 | --threshold [0.0 to 1.0] \
52 | --aoi \
53 | --group
54 | ```
55 |
56 | **Option 2**. LULC labels are available as GeoJSON (vector) files, and rasterization is required.
57 |
58 | - First, the NAIP imagery that overlaps with the LULC label data needs to be downloaded before the rasterization task.
59 | [naip_download_pc.ipynb](naip-utils/naip_download_pc.ipynb) provides the script and documentation for downloading NAIP imagery over your AOI from the [MS Planetary Computer](https://planetarycomputer.microsoft.com/dataset/naip).
60 |
61 | - Second, the LULC label rasterization functions and steps are provided in [label_rasterize.ipynb](naip-utils/label_rasterize.ipynb).
62 | The classes are rasterized in priority order (tree_canopy sits on top of the LULC layer, i.e. it is burned last; other_impervious sits at the bottom, i.e. it is rasterized first):
63 | ```
64 | tree_canopy
65 | building
66 | water
67 | bare_soil
68 | roads_railroads
69 | grass_shrub
70 | other_impervious
71 | ```
72 | See the [notebook](naip-utils/label_rasterize.ipynb) for details.
73 |
74 |
75 | ## Model Training on Azure ML (AML)
76 | If you are going to use AML to train LULC models for the first time, please go through these steps.
77 |
78 | Screen Shot 2021-11-08 at 8 20 04 AM
79 |
80 | ### Configure environment
81 |
82 | This code was tested using `python 3.6.5`.
83 |
84 | [Create a conda environment](https://docs.conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html#creating-an-environment-from-an-environment-yml-file) using the `pytorch-env.yml` file and execute the scripts from the created environment.
85 |
86 |
87 | You will need to set the following variables in your `.env`:
88 |
89 |
90 | ```bash
91 | AZ_TENANT_ID=XXX #az account show --output table
92 | AZ_SUB_ID=XXX #az account list --output table
93 |
94 | AZ_WORKSPACE_NAME=XXX #User set
95 | AZ_RESOURCE_GROUP=XXX #User set
96 | AZ_REGION=XXX #User set
97 |
98 | AZ_GPU_CLUSTER_NAME=XXX #User set
99 | AZ_CPU_CLUSTER_NAME=XXX #User set
100 | ```
101 |
102 | Then export all variables to your environment:
103 |
104 | ```
105 | export $(cat .env);
106 | ```
107 |
108 |
109 | ### Create Your Workspace on AML
110 | After exporting your Azure credentials, [train_azure/create_workspace.py](train_azure/create_workspace.py) will create the AML workspace.
111 |
112 | ### Create GPU Compute
113 |
114 | [This script](train_azure/create_compute-gpu.py) will create GPU compute resources in your workspace on AML.
115 |
116 |
117 | ### (Optional) Create CPU Compute
118 |
119 | [This script](train_azure/create_compute-cpu.py) will create CPU compute resources in your workspace on AML.
120 |
121 |
122 | ### Train LULC Model on AML
123 | We have three PyTorch-based semantic segmentation models ready for LULC model training: FCN, UNet and DeepLabV3+.
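As a rough illustration of how one of these models can be put together with `segmentation_models_pytorch` (the library linked above), the sketch below builds a DeepLabV3+ model with a multiclass focal loss. It is a minimal example only, not the exact configuration used in [src/models.py](src/models.py); the encoder name, the 4-band NAIP input and the 7 LULC classes are assumptions made for illustration.

```python
# Minimal sketch only -- the encoder and loss settings in src/models.py may differ.
# Assumes 4-band NAIP chips (R, G, B, NIR) and 7 LULC classes.
import torch
import segmentation_models_pytorch as smp

model = smp.DeepLabV3Plus(
    encoder_name="resnet18",   # assumed encoder, chosen only for illustration
    encoder_weights=None,      # ImageNet weights expect 3 bands; NAIP has 4
    in_channels=4,
    classes=7,
)
criterion = smp.losses.FocalLoss(mode="multiclass")

x = torch.randn(2, 4, 256, 256)           # a batch of two 256x256 chips
y = torch.randint(0, 7, (2, 256, 256))    # per-pixel integer class labels
logits = model(x)                         # shape: (2, 7, 256, 256)
loss = criterion(logits, y)
loss.backward()
```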
124 |
125 | To train a model on AML, you will need to define or pass a few crucial parameters to the [script](train_azure/run_model.py), for instance:
126 |
127 | TODO: Will we be providing sample CSVs?
128 | ```python
129 | config = ScriptRunConfig(
130 |     source_directory="./src",
131 |     script="train.py",
132 |     compute_target=AZ_GPU_CLUSTER_NAME,
133 |     arguments=[
134 |         "--input_fn",
135 |         "sample_data/indianapolis_train.csv",
136 |         "--input_fn_val",
137 |         "sample_data/indianapolis_val.csv",
138 |         "--output_dir",
139 |         "./outputs",
140 |         "--save_most_recent",
141 |         "--num_epochs",
142 |         20,
143 |         "--num_chips",
144 |         200,
145 |         "--num_classes",
146 |         7,
147 |         "--label_transform",
148 |         "uvm",
149 |         "--model",
150 |         "deeplabv3plus",
151 |     ],
152 | )
153 | ```
154 |
155 | These parameters are to be configured by the user. The `input_fn_X` paths should be provided by the user and are the outputs of the data generation step (NAIP label align) described above.
156 |
157 | `python train_azure/run_model.py`
158 |
159 |
160 | ### Evaluate the Trained Model
161 |
162 | To compute the global F1 and per-class F1 scores (written to CSV) from a trained model over the latest dataset, you can use this [eval script](train_azure/run_eval.py) as an example.
163 |
164 | `python train_azure/run_eval.py`
165 |
166 |
167 | ### Seed Data Creation for PEARL
168 | After the best performing model is selected, a seed dataset needs to be created to serve PEARL. Seed data consists of the model embeddings from the trained model, which are used together with the user's input training data in a PEARL retraining session.
169 |
170 | [run_seeddata_creation.py](train_azure/run_seeddata_creation.py) will configure AML and use the [main seed data creation script](src/seed_data_creation.py) to create seed data for the best performing trained model.
171 |
172 | `python train_azure/run_seeddata_creation.py`
173 |
174 | ### (Optional) Class Distribution
175 |
176 | The LULC class distribution is a graph showing the proportion of pixels in each LULC class for a trained model on PEARL. See the bar chart below.
177 |
178 | [train_azure/run_cls_distrib.py](train_azure/run_cls_distrib.py) shows how to compute the class distribution from the training dataset for the model; a rough sketch of the underlying computation follows.
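Conceptually the computation is a per-class pixel count over the label rasters. The snippet below is only an illustration, assuming single-band label GeoTIFFs with integer class IDs (0 = nodata, 1-7 = LULC classes) in a hypothetical local directory; for real runs, use the repository's own implementation ([src/cls_distribution.py](src/cls_distribution.py), driven by the script above on AML).

```python
# Rough sketch of a class-distribution computation; the path and class count are illustrative.
import glob
import numpy as np
import rasterio

counts = np.zeros(8, dtype=np.int64)          # bins for class IDs 0..7
for fn in glob.glob("data/labels/*.tif"):     # hypothetical local label directory
    with rasterio.open(fn) as src:
        labels = src.read(1)
    ids, n = np.unique(labels, return_counts=True)
    counts[ids] += n

valid = counts[1:]                            # drop nodata (class 0)
proportions = valid / valid.sum()
for class_id, p in enumerate(proportions, start=1):
    print(f"class {class_id}: {p:.2%}")
```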
179 | 180 | `python train_azure/run_cls_distrib.py` 181 | 182 | Screen Shot 2021-11-08 at 8 07 49 AM 183 | -------------------------------------------------------------------------------- /naip-utils/label_rasterize.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Workflow\n", 8 | "- Get NAIP imagery from public Azure Blob (see notebook naip_download_pc.ipynb)\n", 9 | "- Rasterize label data based on the NAIP image tile\n", 10 | "- Create Image and Label fame name and Index match\n", 11 | "- Store info in CSV" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import os\n", 21 | "import glob\n", 22 | "from os import makedirs, path as op \n", 23 | "import geopandas as gpd\n", 24 | "from subprocess import call\n", 25 | "from rasterio.features import geometry_mask\n", 26 | "import rasterio\n", 27 | "import numpy as np\n" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "def fix_id(geojson, class_id, keyword, rankid):\n", 37 | " \"\"\"adding class id to the label geojson and update output directory for new label\"\"\"\n", 38 | " gdf = gpd.read_file(geojson)\n", 39 | " gdf['class_id'] = int(class_id)\n", 40 | " outdir = f\"../label_af_download/updated_labels_{keyword}\"\n", 41 | " if not op.exists(outdir):\n", 42 | " makedirs(outdir)\n", 43 | " basename = op.basename(geojson)\n", 44 | " gdf.to_file(f\"{outdir}/{rankid}_{basename}\", driver=\"GeoJSON\")\n", 45 | " return outdir" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "def get_key(val, main_dict):\n", 55 | " \"\"\"to fetch key of label order based on the label class name\"\"\"\n", 56 | " for key, value in main_dict.items():\n", 57 | " if val == value:\n", 58 | " return key\n", 59 | " \n", 60 | " return \"key doesn't exist\"" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "def burn_base(raster, aoi, outfile):\n", 70 | " \"\"\"rasterize aoi bounds\"\"\"\n", 71 | " gdf_aoi = gpd.read_file(aoi)\n", 72 | " with rasterio.open(raster, 'r') as src:\n", 73 | " profile = src.profile\n", 74 | " profile.update(\n", 75 | " dtype=rasterio.uint8, \n", 76 | " count=1,\n", 77 | " compress='lzw'\n", 78 | " )\n", 79 | " fsrc = src.read()\n", 80 | " outshape = (fsrc.shape[1], fsrc.shape[2])\n", 81 | " transform_out = src.transform\n", 82 | " out_arr = np.zeros(outshape)\n", 83 | " \n", 84 | " out_label=geometry_mask(gdf_aoi.geometry,\n", 85 | " out_shape=outshape, \n", 86 | " transform=transform_out, \n", 87 | " all_touched=True,\n", 88 | " invert=True\n", 89 | " )\n", 90 | " with rasterio.open(outfile, \"w\", **profile) as dst:\n", 91 | " dst.write(out_label,1)\n", 92 | " return outfile" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "def burn_labels(base_mask, label_ls, outfile):\n", 102 | " \"\"\"burn labels based on the priority order\"\"\"\n", 103 | " with rasterio.open(base_mask, 'r') as src:\n", 104 | " profile = src.profile\n", 105 | " profile.update(\n", 106 | " dtype=rasterio.uint8, \n", 107 | " count=1,\n", 108 | " compress='lzw'\n", 109 | " )\n", 110 | 
" fsrc = src.read_masks(1)\n", 111 | " outshape = fsrc.shape\n", 112 | " print(outshape)\n", 113 | " transform_out = src.transform\n", 114 | " labels_arr=np.zeros(outshape)\n", 115 | " for geojson in label_ls:\n", 116 | " print(geojson)\n", 117 | " gdf= gpd.read_file(geojson)\n", 118 | " print(gdf.crs)\n", 119 | " print(len(gdf.geometry))\n", 120 | " print(gdf[\"class_id\"].unique()[0])\n", 121 | " mask = geometry_mask(gdf.geometry, out_shape=outshape, transform=transform_out, all_touched=True, invert=True)\n", 122 | " print(np.unique(mask))\n", 123 | " update_mask = np.where(mask==True)\n", 124 | " labels_arr[update_mask] = gdf[\"class_id\"].unique()[0]\n", 125 | " print(np.unique(labels_arr))\n", 126 | " with rasterio.open(outfile, \"w\", **profile) as dst:\n", 127 | " dst.write(labels_arr,1)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "aoi0 = \"../label_af_download/aoi_detroit_labeled/aoi0_bounds.geojson\"\n", 137 | "aoi1 = \"../label_af_download/aoi_detroit_labeled/aoi1_bounds.geojson\"\n", 138 | "aoi2 = \"../label_af_download/aoi_detroit_labeled/aoi2_bounds.geojson\"\n", 139 | "label_path = \"../label_af_download/aoi_detroit_labeled\"\n", 140 | "aoi0_naip = \"../label_af_download/downloaded_naip_aois/2018-07-06_naip_aoi0_bounds.tif\"\n", 141 | "aoi1_naip = \"../label_af_download/downloaded_naip_aois/2012-06-29_naip_aoi1_bounds.tif\"\n", 142 | "aoi2_naip = \"../label_af_download/downloaded_naip_aois/2016-08-03_naip_aoi2_bounds.tif\"\n" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "### Raterize LULC Labels\n", 150 | "The land classes should be burn in this order (1-7)\n", 151 | "1 on the bottom and 7 burn the last, so it's on the top\n", 152 | "\n", 153 | "7. tree_canopy\n", 154 | "6. building\n", 155 | "5. water\n", 156 | "4. bare_soil\n", 157 | "3. roads_railroads\n", 158 | "2. grass_shrub\n", 159 | "1. 
other_impervious\n", 160 | "\n", 161 | "Though the real class IDs are:\n", 162 | "\n", 163 | "- 0: Nodata (use the aoi)\n", 164 | "- 1: Tree Canopy, \n", 165 | "- 2: Grass/Shrub, \n", 166 | "- 3: bare soil, \n", 167 | "- 4: water, \n", 168 | "- 5: buildings, \n", 169 | "- 6:roads/railroads, \n", 170 | "- 7:other impervious" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "metadata": {}, 177 | "outputs": [], 178 | "source": [ 179 | "labels_classes = {\n", 180 | " \"impervious\": 7,\n", 181 | " \"building\": 5, \n", 182 | " \"shrub\":2, \n", 183 | " \"canopy\":1,\n", 184 | " \"railroads\": 6, \n", 185 | " \"soil\": 3, \n", 186 | " \"water\": 4\n", 187 | "}" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "burn_order ={\n", 197 | "1: \"impervious\", \n", 198 | "2: \"shrub\", \n", 199 | "3: \"railroads\",\n", 200 | "4: \"soil\",\n", 201 | "5: \"water\",\n", 202 | "6: \"building\",\n", 203 | "7: \"canopy\",\n", 204 | "}" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "# aoi0_labels = glob.glob(label_path+\"/aoi_0/*.geojson\")\n", 214 | "aoi0_labels = glob.glob(label_path+\"/aoi_0/*.geojson\")\n", 215 | "for label in aoi0_labels:\n", 216 | " basename=op.basename(label)\n", 217 | " filezero = op.splitext(basename)[0]\n", 218 | " keyword = filezero.split(\"_\")[-1]\n", 219 | " class_id = labels_classes[keyword]\n", 220 | " rankid = get_key(keyword, burn_order)\n", 221 | " print(keyword, class_id)\n", 222 | " out_dir_or = fix_id(label, class_id, \"aoi0\", rankid)\n", 223 | " print(out_dir_or)\n", 224 | "# out= '/'.join(subdir for subdir in out_dir_or.split(\"/\")[:-1])\n", 225 | "# print(out)\n", 226 | "sorted_labels = sorted(glob.glob(out_dir_or + \"/*.geojson\"))\n", 227 | "print(sorted_labels)\n", 228 | "mask_path = burn_base(aoi0_naip, aoi0, \"../label_af_download/aoi_detroit_labeled/mark0.tif\")\n", 229 | "burn_labels(mask_path, sorted_labels, \"../label_af_download/aoi_detroit_labeled/aoi0_labels.tif\")\n", 230 | "# label_array(aoi0_labels, aoi0_naip, labels_classes, burn_order, \"aoi0\", aoi0, \"../label_af_download/aoi_detroit_labeled/mark0.tif\", \"../label_af_download/aoi_detroit_labeled/aoi0_labels.tif\")" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "aoi1_labels = glob.glob(label_path+\"/aoi_1/*.geojson\")\n", 240 | "for label in aoi1_labels:\n", 241 | " basename=op.basename(label)\n", 242 | " filezero = op.splitext(basename)[0]\n", 243 | " keyword = filezero.split(\"_\")[-1]\n", 244 | " class_id = labels_classes[keyword]\n", 245 | " rankid = get_key(keyword, burn_order)\n", 246 | " print(keyword, class_id)\n", 247 | " out_dir_or = fix_id(label, class_id, \"aoi1\", rankid)\n", 248 | " print(out_dir_or)\n", 249 | "# out= '/'.join(subdir for subdir in out_dir_or.split(\"/\")[:-1])\n", 250 | "# print(out)\n", 251 | "sorted_labels = sorted(glob.glob(out_dir_or + \"/*.geojson\"))\n", 252 | "print(sorted_labels)\n", 253 | "mask_path = burn_base(aoi1_naip, aoi1, \"../label_af_download/aoi_detroit_labeled/mark1.tif\")\n", 254 | "burn_labels(mask_path, sorted_labels, \"../label_af_download/aoi_detroit_labeled/aoi1_labels.tif\")" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": 
[], 262 | "source": [ 263 | "aoi2_labels = glob.glob(label_path+\"/aoi_2/*.geojson\")\n", 264 | "for label in aoi2_labels:\n", 265 | " basename=op.basename(label)\n", 266 | " filezero = op.splitext(basename)[0]\n", 267 | " keyword = filezero.split(\"_\")[-1]\n", 268 | " class_id = labels_classes[keyword]\n", 269 | " rankid = get_key(keyword, burn_order)\n", 270 | " print(keyword, class_id)\n", 271 | " out_dir_or = fix_id(label, class_id, \"aoi2\", rankid)\n", 272 | " print(out_dir_or)\n", 273 | "# out= '/'.join(subdir for subdir in out_dir_or.split(\"/\")[:-1])\n", 274 | "# print(out)\n", 275 | "sorted_labels = sorted(glob.glob(out_dir_or + \"/*.geojson\"))\n", 276 | "print(sorted_labels)\n", 277 | "mask_path = burn_base(aoi2_naip, aoi2, \"../label_af_download/aoi_detroit_labeled/mark2.tif\")\n", 278 | "burn_labels(mask_path, sorted_labels, \"../label_af_download/aoi_detroit_labeled/aoi2_labels.tif\")" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 42, 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "import pandas as pd" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 53, 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [ 296 | "df_train = pd.DataFrame()\n", 297 | "df_val = pd.DataFrame()\n", 298 | "df_test = pd.DataFrame()\n", 299 | "label_path = \"../label_af_download/trainingdataset-data-team_aois/labels\"\n", 300 | "image_path = \"../label_af_download/trainingdataset-data-team_aois/naips\"" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 45, 306 | "metadata": {}, 307 | "outputs": [ 308 | { 309 | "data": { 310 | "text/plain": [ 311 | "['../label_af_download/trainingdataset-data-team_aois/naips/2012-06-29_naip_aoi0_bounds.tif',\n", 312 | " '../label_af_download/trainingdataset-data-team_aois/naips/2012-06-29_naip_aoi1_bounds.tif',\n", 313 | " '../label_af_download/trainingdataset-data-team_aois/naips/2012-06-29_naip_aoi2_bounds.tif',\n", 314 | " '../label_af_download/trainingdataset-data-team_aois/naips/2014-06-28_naip_aoi0_bounds.tif',\n", 315 | " '../label_af_download/trainingdataset-data-team_aois/naips/2014-06-28_naip_aoi2_bounds.tif',\n", 316 | " '../label_af_download/trainingdataset-data-team_aois/naips/2016-08-03_naip_aoi0_bounds.tif',\n", 317 | " '../label_af_download/trainingdataset-data-team_aois/naips/2016-08-03_naip_aoi2_bounds.tif',\n", 318 | " '../label_af_download/trainingdataset-data-team_aois/naips/2018-07-06_naip_aoi0_bounds.tif',\n", 319 | " '../label_af_download/trainingdataset-data-team_aois/naips/2018-07-07_naip_aoi2_bounds.tif']" 320 | ] 321 | }, 322 | "execution_count": 45, 323 | "metadata": {}, 324 | "output_type": "execute_result" 325 | } 326 | ], 327 | "source": [ 328 | "images = sorted(glob.glob(image_path +\"/*.tif\"))\n", 329 | "images" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 46, 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "base_url = \"https://uvmlabels.blob.core.windows.net/\"\n", 339 | "label_key = \"labels4-data-team-aois\"\n", 340 | "image_key = \"naip4-data-team-aois\"" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 54, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "image_url = base_url + image_key\n", 350 | "label_url = base_url + label_key\n", 351 | "train_img = []\n", 352 | "train_label = []\n", 353 | "for img in images[:-2]:\n", 354 | " basename = op.basename(img)\n", 355 | " filezeor = op.splitext(basename)[0]\n", 356 | " 
img_url = image_url + \"/\" + basename\n", 357 | " lab_url = label_url + \"/\" + filezeor + \"_labels.tif\"\n", 358 | " train_img.append(img_url)\n", 359 | " train_label.append(lab_url)\n" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 55, 365 | "metadata": {}, 366 | "outputs": [ 367 | { 368 | "data": { 369 | "text/plain": [ 370 | "['https://uvmlabels.blob.core.windows.net/naip4-data-team-aois/2012-06-29_naip_aoi0_bounds.tif',\n", 371 | " 'https://uvmlabels.blob.core.windows.net/naip4-data-team-aois/2012-06-29_naip_aoi1_bounds.tif',\n", 372 | " 'https://uvmlabels.blob.core.windows.net/naip4-data-team-aois/2012-06-29_naip_aoi2_bounds.tif',\n", 373 | " 'https://uvmlabels.blob.core.windows.net/naip4-data-team-aois/2014-06-28_naip_aoi0_bounds.tif',\n", 374 | " 'https://uvmlabels.blob.core.windows.net/naip4-data-team-aois/2014-06-28_naip_aoi2_bounds.tif',\n", 375 | " 'https://uvmlabels.blob.core.windows.net/naip4-data-team-aois/2016-08-03_naip_aoi0_bounds.tif',\n", 376 | " 'https://uvmlabels.blob.core.windows.net/naip4-data-team-aois/2016-08-03_naip_aoi2_bounds.tif']" 377 | ] 378 | }, 379 | "execution_count": 55, 380 | "metadata": {}, 381 | "output_type": "execute_result" 382 | } 383 | ], 384 | "source": [ 385 | "train_img" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 56, 391 | "metadata": {}, 392 | "outputs": [ 393 | { 394 | "data": { 395 | "text/plain": [ 396 | "['https://uvmlabels.blob.core.windows.net/labels4-data-team-aois/2012-06-29_naip_aoi0_bounds_labels.tif',\n", 397 | " 'https://uvmlabels.blob.core.windows.net/labels4-data-team-aois/2012-06-29_naip_aoi1_bounds_labels.tif',\n", 398 | " 'https://uvmlabels.blob.core.windows.net/labels4-data-team-aois/2012-06-29_naip_aoi2_bounds_labels.tif',\n", 399 | " 'https://uvmlabels.blob.core.windows.net/labels4-data-team-aois/2014-06-28_naip_aoi0_bounds_labels.tif',\n", 400 | " 'https://uvmlabels.blob.core.windows.net/labels4-data-team-aois/2014-06-28_naip_aoi2_bounds_labels.tif',\n", 401 | " 'https://uvmlabels.blob.core.windows.net/labels4-data-team-aois/2016-08-03_naip_aoi0_bounds_labels.tif',\n", 402 | " 'https://uvmlabels.blob.core.windows.net/labels4-data-team-aois/2016-08-03_naip_aoi2_bounds_labels.tif']" 403 | ] 404 | }, 405 | "execution_count": 56, 406 | "metadata": {}, 407 | "output_type": "execute_result" 408 | } 409 | ], 410 | "source": [ 411 | "train_label" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": 57, 417 | "metadata": {}, 418 | "outputs": [ 419 | { 420 | "data": { 421 | "text/html": [ 422 | "
\n", 423 | "\n", 436 | "\n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | "
image_fnlabel_fngroup
0https://uvmlabels.blob.core.windows.net/naip4-...https://uvmlabels.blob.core.windows.net/labels...uvm
1https://uvmlabels.blob.core.windows.net/naip4-...https://uvmlabels.blob.core.windows.net/labels...uvm
2https://uvmlabels.blob.core.windows.net/naip4-...https://uvmlabels.blob.core.windows.net/labels...uvm
3https://uvmlabels.blob.core.windows.net/naip4-...https://uvmlabels.blob.core.windows.net/labels...uvm
4https://uvmlabels.blob.core.windows.net/naip4-...https://uvmlabels.blob.core.windows.net/labels...uvm
5https://uvmlabels.blob.core.windows.net/naip4-...https://uvmlabels.blob.core.windows.net/labels...uvm
6https://uvmlabels.blob.core.windows.net/naip4-...https://uvmlabels.blob.core.windows.net/labels...uvm
\n", 490 | "
" 491 | ], 492 | "text/plain": [ 493 | " image_fn \\\n", 494 | "0 https://uvmlabels.blob.core.windows.net/naip4-... \n", 495 | "1 https://uvmlabels.blob.core.windows.net/naip4-... \n", 496 | "2 https://uvmlabels.blob.core.windows.net/naip4-... \n", 497 | "3 https://uvmlabels.blob.core.windows.net/naip4-... \n", 498 | "4 https://uvmlabels.blob.core.windows.net/naip4-... \n", 499 | "5 https://uvmlabels.blob.core.windows.net/naip4-... \n", 500 | "6 https://uvmlabels.blob.core.windows.net/naip4-... \n", 501 | "\n", 502 | " label_fn group \n", 503 | "0 https://uvmlabels.blob.core.windows.net/labels... uvm \n", 504 | "1 https://uvmlabels.blob.core.windows.net/labels... uvm \n", 505 | "2 https://uvmlabels.blob.core.windows.net/labels... uvm \n", 506 | "3 https://uvmlabels.blob.core.windows.net/labels... uvm \n", 507 | "4 https://uvmlabels.blob.core.windows.net/labels... uvm \n", 508 | "5 https://uvmlabels.blob.core.windows.net/labels... uvm \n", 509 | "6 https://uvmlabels.blob.core.windows.net/labels... uvm " 510 | ] 511 | }, 512 | "execution_count": 57, 513 | "metadata": {}, 514 | "output_type": "execute_result" 515 | } 516 | ], 517 | "source": [ 518 | "df_train['image_fn'] = train_img\n", 519 | "df_train[\"label_fn\"] = train_label\n", 520 | "df_train[\"group\"] = \"uvm\"\n", 521 | "df_train" 522 | ] 523 | }, 524 | { 525 | "cell_type": "code", 526 | "execution_count": 58, 527 | "metadata": {}, 528 | "outputs": [], 529 | "source": [ 530 | "df_val['image_fn'] = \"https://uvmlabels.blob.core.windows.net/naip4-data-team-aois/2018-07-06_naip_aoi0_bounds.tif\"\n", 531 | "df_val[\"label_fn\"] = \"https://uvmlabels.blob.core.windows.net/labels4-data-team-aois/2018-07-06_naip_aoi0_bounds.tif\"\n", 532 | "df_val[\"group\"] = \"uvm\"" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": 59, 538 | "metadata": {}, 539 | "outputs": [], 540 | "source": [ 541 | "df_test['image_fn'] = \"https://uvmlabels.blob.core.windows.net/naip4-data-team-aois/2018-07-07_naip_aoi2_bounds.tif\"\n", 542 | "df_test[\"label_fn\"] = \"https://uvmlabels.blob.core.windows.net/labels4-data-team-aois/2018-07-07_naip_aoi2_bounds_labels.tif\"\n", 543 | "df_test[\"group\"] = \"uvm\"" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": 60, 549 | "metadata": {}, 550 | "outputs": [], 551 | "source": [ 552 | "df_train.to_csv(\"DevSeed_Data_created_train.csv\")\n", 553 | "df_val.to_csv(\"DevSeed_Data_created_val.csv\")\n", 554 | "df_test.to_csv(\"DevSeed_Data_created_test.csv\")" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": 65, 560 | "metadata": {}, 561 | "outputs": [], 562 | "source": [ 563 | "midwest_train = \"../src/data/midwest_train_multi_year.csv\"\n", 564 | "midwest_val = \"../src/data/midwest_val_multi_year.csv\"\n", 565 | "midwest_test = \"../src/data/midwest_test_multi_year.csv\"\n", 566 | "midw_train = pd.read_csv(midwest_train)\n", 567 | "midw_val = pd.read_csv(midwest_val)\n", 568 | "midw_test = pd.read_csv(midwest_test)\n" 569 | ] 570 | }, 571 | { 572 | "cell_type": "code", 573 | "execution_count": 62, 574 | "metadata": {}, 575 | "outputs": [ 576 | { 577 | "data": { 578 | "text/html": [ 579 | "
\n", 580 | "\n", 593 | "\n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | "
Unnamed: 0Unnamed: 0.1Unnamed: 0.1.1Unnamed: 0.1.1.1image_fnlabel_fngroup
00002https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...umv_label
111126https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...umv_label
222213https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...umv_label
333316https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...umv_label
44444https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...umv_label
\n", 659 | "
" 660 | ], 661 | "text/plain": [ 662 | " Unnamed: 0 Unnamed: 0.1 Unnamed: 0.1.1 Unnamed: 0.1.1.1 \\\n", 663 | "0 0 0 0 2 \n", 664 | "1 1 1 1 26 \n", 665 | "2 2 2 2 13 \n", 666 | "3 3 3 3 16 \n", 667 | "4 4 4 4 4 \n", 668 | "\n", 669 | " image_fn \\\n", 670 | "0 https://naipblobs.blob.core.windows.net/naip/v... \n", 671 | "1 https://naipblobs.blob.core.windows.net/naip/v... \n", 672 | "2 https://naipblobs.blob.core.windows.net/naip/v... \n", 673 | "3 https://naipblobs.blob.core.windows.net/naip/v... \n", 674 | "4 https://naipblobs.blob.core.windows.net/naip/v... \n", 675 | "\n", 676 | " label_fn group \n", 677 | "0 https://uvmlabels.blob.core.windows.net/detroi... umv_label \n", 678 | "1 https://uvmlabels.blob.core.windows.net/detroi... umv_label \n", 679 | "2 https://uvmlabels.blob.core.windows.net/detroi... umv_label \n", 680 | "3 https://uvmlabels.blob.core.windows.net/detroi... umv_label \n", 681 | "4 https://uvmlabels.blob.core.windows.net/detroi... umv_label " 682 | ] 683 | }, 684 | "execution_count": 62, 685 | "metadata": {}, 686 | "output_type": "execute_result" 687 | } 688 | ], 689 | "source": [ 690 | "midw_train.head()" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": 66, 696 | "metadata": {}, 697 | "outputs": [ 698 | { 699 | "data": { 700 | "text/html": [ 701 | "
\n", 702 | "\n", 715 | "\n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | "
image_fnlabel_fngroup
0https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...umv_label
1https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...umv_label
2https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...umv_label
3https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...umv_label
4https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...umv_label
\n", 757 | "
" 758 | ], 759 | "text/plain": [ 760 | " image_fn \\\n", 761 | "0 https://naipblobs.blob.core.windows.net/naip/v... \n", 762 | "1 https://naipblobs.blob.core.windows.net/naip/v... \n", 763 | "2 https://naipblobs.blob.core.windows.net/naip/v... \n", 764 | "3 https://naipblobs.blob.core.windows.net/naip/v... \n", 765 | "4 https://naipblobs.blob.core.windows.net/naip/v... \n", 766 | "\n", 767 | " label_fn group \n", 768 | "0 https://uvmlabels.blob.core.windows.net/detroi... umv_label \n", 769 | "1 https://uvmlabels.blob.core.windows.net/detroi... umv_label \n", 770 | "2 https://uvmlabels.blob.core.windows.net/detroi... umv_label \n", 771 | "3 https://uvmlabels.blob.core.windows.net/detroi... umv_label \n", 772 | "4 https://uvmlabels.blob.core.windows.net/detroi... umv_label " 773 | ] 774 | }, 775 | "execution_count": 66, 776 | "metadata": {}, 777 | "output_type": "execute_result" 778 | } 779 | ], 780 | "source": [ 781 | "midw_train = midw_train[[\"image_fn\", \"label_fn\", \"group\"]]\n", 782 | "midw_val = midw_val[[\"image_fn\", \"label_fn\", \"group\"]]\n", 783 | "midw_test = midw_test[[\"image_fn\", \"label_fn\", \"group\"]]\n", 784 | "midw_train.head()" 785 | ] 786 | }, 787 | { 788 | "cell_type": "code", 789 | "execution_count": 68, 790 | "metadata": {}, 791 | "outputs": [ 792 | { 793 | "data": { 794 | "text/html": [ 795 | "
\n", 796 | "\n", 809 | "\n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | "
image_fnlabel_fngroup
0https://uvmlabels.blob.core.windows.net/naip4-...https://uvmlabels.blob.core.windows.net/labels...uvm
1https://uvmlabels.blob.core.windows.net/naip4-...https://uvmlabels.blob.core.windows.net/labels...uvm
2https://uvmlabels.blob.core.windows.net/naip4-...https://uvmlabels.blob.core.windows.net/labels...uvm
3https://uvmlabels.blob.core.windows.net/naip4-...https://uvmlabels.blob.core.windows.net/labels...uvm
4https://uvmlabels.blob.core.windows.net/naip4-...https://uvmlabels.blob.core.windows.net/labels...uvm
\n", 851 | "
" 852 | ], 853 | "text/plain": [ 854 | " image_fn \\\n", 855 | "0 https://uvmlabels.blob.core.windows.net/naip4-... \n", 856 | "1 https://uvmlabels.blob.core.windows.net/naip4-... \n", 857 | "2 https://uvmlabels.blob.core.windows.net/naip4-... \n", 858 | "3 https://uvmlabels.blob.core.windows.net/naip4-... \n", 859 | "4 https://uvmlabels.blob.core.windows.net/naip4-... \n", 860 | "\n", 861 | " label_fn group \n", 862 | "0 https://uvmlabels.blob.core.windows.net/labels... uvm \n", 863 | "1 https://uvmlabels.blob.core.windows.net/labels... uvm \n", 864 | "2 https://uvmlabels.blob.core.windows.net/labels... uvm \n", 865 | "3 https://uvmlabels.blob.core.windows.net/labels... uvm \n", 866 | "4 https://uvmlabels.blob.core.windows.net/labels... uvm " 867 | ] 868 | }, 869 | "execution_count": 68, 870 | "metadata": {}, 871 | "output_type": "execute_result" 872 | } 873 | ], 874 | "source": [ 875 | "midwest_data_train = pd.concat([df_train, midw_train])\n", 876 | "midwest_data_train['group'] = \"uvm\"\n", 877 | "\n", 878 | "midwest_data_val = pd.concat([df_val, midw_val])\n", 879 | "midwest_data_val['group'] = \"uvm\" \n", 880 | "\n", 881 | "midwest_data_test = pd.concat([df_test, midw_test])\n", 882 | "midwest_data_test['group'] = \"uvm\" \n", 883 | "midwest_data_train.head()\n" 884 | ] 885 | }, 886 | { 887 | "cell_type": "code", 888 | "execution_count": 69, 889 | "metadata": {}, 890 | "outputs": [ 891 | { 892 | "data": { 893 | "text/html": [ 894 | "
\n", 895 | "\n", 908 | "\n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | "
image_fnlabel_fngroup
0https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...uvm
1https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...uvm
2https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...uvm
3https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...uvm
4https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...uvm
\n", 950 | "
" 951 | ], 952 | "text/plain": [ 953 | " image_fn \\\n", 954 | "0 https://naipblobs.blob.core.windows.net/naip/v... \n", 955 | "1 https://naipblobs.blob.core.windows.net/naip/v... \n", 956 | "2 https://naipblobs.blob.core.windows.net/naip/v... \n", 957 | "3 https://naipblobs.blob.core.windows.net/naip/v... \n", 958 | "4 https://naipblobs.blob.core.windows.net/naip/v... \n", 959 | "\n", 960 | " label_fn group \n", 961 | "0 https://uvmlabels.blob.core.windows.net/detroi... uvm \n", 962 | "1 https://uvmlabels.blob.core.windows.net/detroi... uvm \n", 963 | "2 https://uvmlabels.blob.core.windows.net/detroi... uvm \n", 964 | "3 https://uvmlabels.blob.core.windows.net/detroi... uvm \n", 965 | "4 https://uvmlabels.blob.core.windows.net/detroi... uvm " 966 | ] 967 | }, 968 | "execution_count": 69, 969 | "metadata": {}, 970 | "output_type": "execute_result" 971 | } 972 | ], 973 | "source": [ 974 | "midwest_data_val.head()" 975 | ] 976 | }, 977 | { 978 | "cell_type": "code", 979 | "execution_count": 70, 980 | "metadata": {}, 981 | "outputs": [ 982 | { 983 | "data": { 984 | "text/html": [ 985 | "
\n", 986 | "\n", 999 | "\n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | "
image_fnlabel_fngroup
0https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...uvm
1https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...uvm
2https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...uvm
3https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/detroi...uvm
4https://naipblobs.blob.core.windows.net/naip/v...https://uvmlabels.blob.core.windows.net/cuyaho...uvm
\n", 1041 | "
" 1042 | ], 1043 | "text/plain": [ 1044 | " image_fn \\\n", 1045 | "0 https://naipblobs.blob.core.windows.net/naip/v... \n", 1046 | "1 https://naipblobs.blob.core.windows.net/naip/v... \n", 1047 | "2 https://naipblobs.blob.core.windows.net/naip/v... \n", 1048 | "3 https://naipblobs.blob.core.windows.net/naip/v... \n", 1049 | "4 https://naipblobs.blob.core.windows.net/naip/v... \n", 1050 | "\n", 1051 | " label_fn group \n", 1052 | "0 https://uvmlabels.blob.core.windows.net/detroi... uvm \n", 1053 | "1 https://uvmlabels.blob.core.windows.net/detroi... uvm \n", 1054 | "2 https://uvmlabels.blob.core.windows.net/detroi... uvm \n", 1055 | "3 https://uvmlabels.blob.core.windows.net/detroi... uvm \n", 1056 | "4 https://uvmlabels.blob.core.windows.net/cuyaho... uvm " 1057 | ] 1058 | }, 1059 | "execution_count": 70, 1060 | "metadata": {}, 1061 | "output_type": "execute_result" 1062 | } 1063 | ], 1064 | "source": [ 1065 | "midwest_data_test.head()" 1066 | ] 1067 | }, 1068 | { 1069 | "cell_type": "code", 1070 | "execution_count": 71, 1071 | "metadata": {}, 1072 | "outputs": [ 1073 | { 1074 | "data": { 1075 | "text/plain": [ 1076 | "(24, 46, 167)" 1077 | ] 1078 | }, 1079 | "execution_count": 71, 1080 | "metadata": {}, 1081 | "output_type": "execute_result" 1082 | } 1083 | ], 1084 | "source": [ 1085 | "len(midwest_data_test), len(midwest_data_val), len(midwest_data_train)" 1086 | ] 1087 | }, 1088 | { 1089 | "cell_type": "code", 1090 | "execution_count": 72, 1091 | "metadata": {}, 1092 | "outputs": [], 1093 | "source": [ 1094 | "midwest_data_train.to_csv(\"../src/data/midwest_n_devseed_train_multiple_years.csv\")\n", 1095 | "midwest_data_val.to_csv(\"../src/data/midwest_n_devseed_val_multiple_years.csv\")\n", 1096 | "midwest_data_test.to_csv(\"../src/data/midwest_n_devseed_test_multiple_years.csv\")" 1097 | ] 1098 | }, 1099 | { 1100 | "cell_type": "code", 1101 | "execution_count": null, 1102 | "metadata": {}, 1103 | "outputs": [], 1104 | "source": [] 1105 | } 1106 | ], 1107 | "metadata": { 1108 | "interpreter": { 1109 | "hash": "d98913f21c46af07a5e0f9f95dad536eb8e0fcf0c29a960fff7f9f173650b3e5" 1110 | }, 1111 | "kernelspec": { 1112 | "display_name": "Python 3 (ipykernel)", 1113 | "language": "python", 1114 | "name": "python3" 1115 | }, 1116 | "language_info": { 1117 | "codemirror_mode": { 1118 | "name": "ipython", 1119 | "version": 3 1120 | }, 1121 | "file_extension": ".py", 1122 | "mimetype": "text/x-python", 1123 | "name": "python", 1124 | "nbconvert_exporter": "python", 1125 | "pygments_lexer": "ipython3", 1126 | "version": "3.9.10" 1127 | } 1128 | }, 1129 | "nbformat": 4, 1130 | "nbformat_minor": 4 1131 | } 1132 | -------------------------------------------------------------------------------- /naip-utils/naip-label-align.py: -------------------------------------------------------------------------------- 1 | import shapely.geometry 2 | import rasterio 3 | import fiona.transform 4 | import os.path as op 5 | from pathlib import Path 6 | import os 7 | import pandas as pd 8 | import numpy as np 9 | import subprocess 10 | import rtree 11 | import shapely 12 | import click 13 | import urllib.request 14 | import pickle 15 | 16 | 17 | class NAIPTileIndex: 18 | """Utility class for performing NAIP tile lookups by location""" 19 | 20 | NAIP_BLOB_ROOT = "https://naipblobs.blob.core.windows.net/naip/" 21 | NAIP_INDEX_BLOB_ROOT = "https://naipblobs.blob.core.windows.net/naip-index/rtree/" 22 | INDEX_FNS = ["tile_index.dat", "tile_index.idx", "tiles.p"] 23 | 24 | def __init__(self, base_path, verbose=False): 25 | 
"""Loads the tile index into memory (~400 MB) for use by `self.lookup()`. Downloads the index files from the blob container if they do not exist in the `base_path/` directory. 26 | Args: 27 | base_path (str): The path on the local system to look for/store the three files that make up the tile index. This path will be created if it doesn't exist. 28 | verbose (bool): Whether to be verbose when downloading the tile index files 29 | """ 30 | 31 | # Download the index files if it doens't exist 32 | if not os.path.exists(base_path): 33 | os.makedirs(base_path) 34 | for fn in NAIPTileIndex.INDEX_FNS: 35 | if not os.path.exists(os.path.join(base_path, fn)): 36 | download_url( 37 | NAIPTileIndex.NAIP_INDEX_BLOB_ROOT + fn, 38 | os.path.join(base_path, fn), 39 | verbose, 40 | ) 41 | 42 | self.base_path = base_path 43 | self.tile_rtree = rtree.index.Index(base_path + "/tile_index") 44 | self.tile_index = pickle.load(open(base_path + "/tiles.p", "rb")) 45 | 46 | def lookup_point(self, lat, lon): 47 | """Given a lat/lon coordinate pair, return the list of NAIP tiles that *contain* that point. 48 | Args: 49 | lat (float): Latitude in EPSG:4326 50 | lon (float): Longitude in EPSG:4326 51 | Returns: 52 | intersected_files (list): A list of URLs of NAIP tiles that *contain* the given (`lat`, `lon`) point 53 | Raises: 54 | IndexError: Raised if no tile within the index contains the given (`lat`, `lon`) point 55 | """ 56 | 57 | point = shapely.geometry.Point(float(lon), float(lat)) 58 | geom = shapely.geometry.mapping(point) 59 | 60 | return self.lookup_geom(geom) 61 | 62 | def lookup_geom(self, geom): 63 | """Given a GeoJSON geometry, return the list of NAIP tiles that *contain* that feature. 64 | Args: 65 | geom (dict): A GeoJSON geometry in EPSG:4326 66 | Returns: 67 | intersected_files (list): A list of URLs of NAIP tiles that *contain* the given `geom` 68 | Raises: 69 | IndexError: Raised if no tile within the index fully contains the given `geom` 70 | """ 71 | shape = shapely.geometry.shape(geom) 72 | intersected_indices = list(self.tile_rtree.intersection(shape.bounds)) 73 | print(intersected_indices) 74 | 75 | intersected_files = [] 76 | naip_geom = [] 77 | 78 | for idx in intersected_indices: 79 | print(idx) 80 | intersected_file = self.tile_index[idx][0] 81 | print(intersected_file) 82 | intersected_geom = self.tile_index[idx][1] 83 | print(intersected_geom) 84 | if intersected_geom.intersects(shape): 85 | tile_intersection = True 86 | f = NAIPTileIndex.NAIP_BLOB_ROOT + intersected_file 87 | naip_geom.append(intersected_geom) 88 | intersected_files.append( 89 | NAIPTileIndex.NAIP_BLOB_ROOT + intersected_file 90 | ) 91 | 92 | if len(intersected_files) <= 0: 93 | raise IndexError("No tile intersections") 94 | else: 95 | return intersected_files, naip_geom 96 | 97 | 98 | def download_url(url, output_fn, verbose=False): 99 | """Download a URL to file. 
100 | Args: 101 | url (str): URL of file to download 102 | output_fn (str): Filename to save (importantly -- not the directory to save the file to) 103 | verbose (bool): Whether to print how the download is going 104 | Returns: 105 | output_fn (str): Return `output_fn` as is 106 | """ 107 | 108 | if verbose: 109 | print("Downloading file {} to {}".format(os.path.basename(url), output_fn)) 110 | 111 | urllib.request.urlretrieve(url, output_fn) 112 | assert os.path.isfile(output_fn) 113 | 114 | if verbose: 115 | nBytes = os.path.getsize(output_fn) 116 | print("...done, {} bytes.".format(nBytes)) 117 | 118 | return output_fn 119 | 120 | 121 | def get_naip_tiles(label_tif_path): 122 | index = NAIPTileIndex("./tmp/") 123 | 124 | print(label_tif_path) 125 | with rasterio.open(label_tif_path) as f: 126 | geom = shapely.geometry.mapping(shapely.geometry.box(*f.bounds)) 127 | geom = fiona.transform.transform_geom(f.crs.to_string(), "epsg:4326", geom) 128 | 129 | naip_azure_path, naip_lst = index.lookup_geom(geom) 130 | return naip_azure_path 131 | 132 | 133 | def wrap_labels_to_naip(naip_tile_lst, out_dir, large_label_tif): 134 | for tile in naip_tile_lst: 135 | print(tile) 136 | with rasterio.open(tile, "r") as f: 137 | left, bottom, right, top = f.bounds 138 | crs = f.crs.to_string() 139 | height, width = f.height, f.width 140 | out_file = out_dir + tile.split("/")[-1] 141 | print(out_file) 142 | 143 | command = [ 144 | "gdalwarp", 145 | "-overwrite", 146 | "-ot", 147 | "Byte", 148 | "-t_srs", 149 | crs, 150 | "-r", 151 | "near", 152 | "-of", 153 | "GTiff", 154 | "-te", 155 | str(left), 156 | str(bottom), 157 | str(right), 158 | str(top), 159 | "-ts", 160 | str(width), 161 | str(height), 162 | "-co", 163 | "COMPRESS=LZW", 164 | "-co", 165 | "BIGTIFF=YES", 166 | "-dstnodata", 167 | str(0), 168 | large_label_tif, 169 | out_file, 170 | ] 171 | 172 | subprocess.call(command) 173 | print("written") 174 | 175 | 176 | def remove_notdata(in_dir, threshold): 177 | t_lst = [in_dir + t for t in os.listdir(in_dir) if t.endswith(".tif")] 178 | count = 0 179 | for t in t_lst: 180 | with rasterio.open(t) as src: 181 | a = src.read() 182 | if 0 in np.unique(a, return_counts=True)[0]: 183 | black_prop = (np.unique(a, return_counts=True)[1][0]) / ( 184 | a.shape[1] * a.shape[2] 185 | ) 186 | if black_prop < threshold: 187 | print(f"keeping {t}") 188 | count += 1 189 | else: 190 | print(f"too many no data pixels, removing {t}") 191 | os.remove(t) 192 | 193 | 194 | def azure_urls_df(label_dir, naip_lst, label_prefix_azure, group): 195 | l = [f for f in os.listdir(label_dir) if f.endswith(".tif")] 196 | tiles_lst = [] 197 | 198 | for n in l: 199 | tiles_lst.append([f for f in naip_lst if f.endswith(n)][0]) 200 | 201 | labels_azure = [label_prefix_azure + n for n in l] 202 | 203 | df = pd.DataFrame( 204 | list(zip(tiles_lst, labels_azure)), columns=["image_fn", "label_fn"] 205 | ) 206 | df["group"] = group 207 | return df 208 | 209 | 210 | @click.command() 211 | @click.option("--label_tif_path", help="path of input label tif", required=True) 212 | @click.option( 213 | "--out_dir", 214 | help="path for label tifs that align with naip tifs to be written", 215 | required=True, 216 | ) 217 | @click.option( 218 | "--threshold", 219 | help="threshold value for percentage of no data pixels", 220 | type=float, 221 | required=True, 222 | ) 223 | @click.option("--aoi", help="aoi name", type=str, required=True) 224 | @click.option("--group", help="label group name", type=str, required=True) 225 | def main(label_tif_path, 
out_dir, threshold, aoi, group): 226 | # create out_dir if it doesn't exist 227 | Path(out_dir).mkdir(exist_ok=True) 228 | 229 | # get naip tiles that intersect with label tif 230 | naip_azure_paths = get_naip_tiles(label_tif_path) 231 | 232 | # wrap label tiles to naip tiles 233 | wrap_labels_to_naip(naip_azure_paths, out_dir, label_tif_path) 234 | 235 | # remove tiles that have a >= threshold percentage of no data tiles 236 | remove_notdata(out_dir, threshold) 237 | 238 | azure_df = azure_urls_df( 239 | out_dir, 240 | naip_azure_paths, 241 | "https://uvmlabels.blob.core.windows.net/" + aoi + "/", 242 | group, 243 | ) 244 | 245 | train, validate, test = np.split( 246 | azure_df.sample(frac=1, random_state=40), 247 | [int(0.7 * len(azure_df)), int(0.9 * len(azure_df))], 248 | ) 249 | 250 | train.to_csv(out_dir + aoi + "_train" + ".csv") 251 | validate.to_csv(out_dir + aoi + "_val" ".csv") 252 | test.to_csv(out_dir + aoi + "_test" + ".csv") 253 | 254 | 255 | if __name__ == "__main__": 256 | main() 257 | -------------------------------------------------------------------------------- /naip-utils/naip_download_pc.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "256da363-a549-4c9d-84c4-3829fe7fd80d", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import os\n", 11 | "from os import makedirs, path as op\n", 12 | "import json\n", 13 | "from typing import Collection, Tuple \n", 14 | "from pystac_client import Client\n", 15 | "import planetary_computer as pc\n", 16 | "from rio_tiler.io import COGReader\n", 17 | "from shapely.geometry import shape" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 2, 23 | "id": "e3860bbd-3b5b-4aab-bced-c9e5fb4536e0", 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "# !pip install rio_tiler -U" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "id": "a665d23a-1ba3-4a4a-860d-955830756b8d", 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "def download_NAIP(item, fn, area_of_interest):\n", 38 | " \"\"\"\n", 39 | " Download NAIP imagery from Planetary Computer\n", 40 | " \n", 41 | " Parameters:\n", 42 | " ___\n", 43 | "\n", 44 | " inputs:\n", 45 | " item: specific item in the STAC collection,\n", 46 | " fn: given file name\n", 47 | " area_of_interest: geometry of the AOI\n", 48 | " \n", 49 | " Returns:\n", 50 | " (None): writen COG of NAIP imagery that intersect with the given AOI\n", 51 | " \"\"\"\n", 52 | " print(item.datetime)\n", 53 | " href = pc.sign(item.assets[\"image\"].href)\n", 54 | " with COGReader(href) as cog:\n", 55 | " data = cog.feature(area_of_interest, max_size=None, indexes=(1, 2, 3, 4))\n", 56 | " \n", 57 | " with open(fn, \"wb\") as f:\n", 58 | " img = data.render(img_format=\"GTiff\", add_mask=False)\n", 59 | " f.write(img)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 4, 65 | "id": "9cefa6db-a896-4ad7-bfb1-46ea1c9d606d", 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "def main(aoi, date_range, out_dir):\n", 70 | " \n", 71 | " \"\"\"\n", 72 | " Download NAIP imagery from Planetary Computer\n", 73 | " \n", 74 | " Parameters:\n", 75 | " ___\n", 76 | "\n", 77 | " inputs:\n", 78 | " aoi: the path to the aoi,\n", 79 | " date_range: given date range to download images, e.g. 
2010-01-01/2021-12-01\n", 80 | " out_dir: given output direct to save imagery\n", 81 | " \n", 82 | " Returns:\n", 83 | " (None): all writen COG of NAIP imagery that intersect with the given AOIs\n", 84 | " \"\"\"\n", 85 | " \n", 86 | " catelog = Client.open(\"https://planetarycomputer.microsoft.com/api/stac/v1\")\n", 87 | " #read in aoi\n", 88 | " with open(aoi) as f:\n", 89 | " feature = json.load(f)[\"features\"]\n", 90 | " # assuming this is only one geomery feature of an bounding box\n", 91 | " area_of_interest = feature[0][\"geometry\"]\n", 92 | " search_imagery = catelog.search(\n", 93 | " collections=[\"naip\"], intersects=area_of_interest, datetime=date_range\n", 94 | " )\n", 95 | " items = list(search_imagery.get_items())\n", 96 | " print(f\"{len(items)} items found in the {date_range} range for {aoi}!\")\n", 97 | " for item in items:\n", 98 | " if not op.exists(out_dir):\n", 99 | " makedirs(out_dir)\n", 100 | " fn = f\"{out_dir}/{str(item.datetime)[:10]}_naip_{aoi}.tif\"\n", 101 | " download_NAIP(item, fn, area_of_interest)\n", 102 | " " 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 5, 108 | "id": "2ca669bd-3d5b-496b-812f-74b07f6aa723", 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "aois = [\"aoi0_bounds.geojson\", \"aoi1_bounds.geojson\", \"aoi2_bounds.geojson\"]" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 21, 118 | "id": "d61764c7-26fd-4564-b8db-5cd6d1168d80", 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "date_range=\"2010-01-01/2021-12-01\"" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 22, 128 | "id": "bae88e20-21ff-47ef-82a2-4342cd903800", 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | "name": "stdout", 133 | "output_type": "stream", 134 | "text": [ 135 | "4 items found in the 2010-01-01/2021-12-01 range for aoi0_bounds.geojson!\n", 136 | "2018-07-06 00:00:00+00:00\n", 137 | "2016-08-03 00:00:00+00:00\n", 138 | "2014-06-28 00:00:00+00:00\n", 139 | "2012-06-29 00:00:00+00:00\n", 140 | "8 items found in the 2010-01-01/2021-12-01 range for aoi1_bounds.geojson!\n", 141 | "2018-07-07 00:00:00+00:00\n", 142 | "2018-07-07 00:00:00+00:00\n", 143 | "2016-08-03 00:00:00+00:00\n", 144 | "2016-08-03 00:00:00+00:00\n", 145 | "2014-06-28 00:00:00+00:00\n", 146 | "2014-06-28 00:00:00+00:00\n", 147 | "2012-07-02 00:00:00+00:00\n", 148 | "2012-06-29 00:00:00+00:00\n", 149 | "4 items found in the 2010-01-01/2021-12-01 range for aoi2_bounds.geojson!\n", 150 | "2018-07-07 00:00:00+00:00\n", 151 | "2016-08-03 00:00:00+00:00\n", 152 | "2014-06-28 00:00:00+00:00\n", 153 | "2012-06-29 00:00:00+00:00\n" 154 | ] 155 | } 156 | ], 157 | "source": [ 158 | "out_dir=\"naip_downloaded_20211020\"\n", 159 | "for aoi in aois:\n", 160 | " main(aoi, date_range, out_dir)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "id": "71612f0f-5600-4c6e-9256-7c9f57e9a7ea", 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [] 170 | } 171 | ], 172 | "metadata": { 173 | "kernelspec": { 174 | "display_name": "Python 3 (ipykernel)", 175 | "language": "python", 176 | "name": "python3" 177 | }, 178 | "language_info": { 179 | "codemirror_mode": { 180 | "name": "ipython", 181 | "version": 3 182 | }, 183 | "file_extension": ".py", 184 | "mimetype": "text/x-python", 185 | "name": "python", 186 | "nbconvert_exporter": "python", 187 | "pygments_lexer": "ipython3", 188 | "version": "3.9.10" 189 | } 190 | }, 191 | "nbformat": 4, 192 | 
"nbformat_minor": 5 193 | } 194 | -------------------------------------------------------------------------------- /pytorch-env.yml: -------------------------------------------------------------------------------- 1 | name: pytorch-env 2 | channels: 3 | - defaults 4 | - pytorch 5 | dependencies: 6 | - python=3.8 7 | - pytorch==1.4.0 8 | - torchvision 9 | - numpy 10 | - pandas 11 | - tifffile 12 | - matplotlib 13 | - pip 14 | - pip: 15 | - azureml-sdk 16 | - rasterio 17 | - fiona 18 | - segmentation-models-pytorch 19 | - scikit-learn 20 | - rio-tiler 21 | - mercantile 22 | - matplotlib 23 | - seaborn 24 | - tqdm 25 | -------------------------------------------------------------------------------- /src/calculate_image_stats_dir.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # vim:fenc=utf-8 4 | # 5 | # Copyright © 2020 Caleb Robinson 6 | # 7 | """Script for calculating per channel means and stdevs from a list of COGs 8 | """ 9 | import sys 10 | import os 11 | 12 | env = dict( 13 | GDAL_DISABLE_READDIR_ON_OPEN="EMPTY_DIR", 14 | AWS_NO_SIGN_REQUEST="YES", 15 | GDAL_MAX_RAW_BLOCK_CACHE_SIZE="200000000", 16 | GDAL_SWATH_SIZE="200000000", 17 | VSI_CURL_CACHE_SIZE="200000000", 18 | ) 19 | os.environ.update(env) 20 | import time 21 | 22 | import argparse 23 | import numpy as np 24 | import rasterio 25 | import pandas as pd 26 | import glob 27 | 28 | 29 | def stats(verbose, input_fn, output_dir, num_samples_per_file, num_files, nodata): 30 | 31 | # ----------------------------------- 32 | with open(input_fn, "r") as f: 33 | fns = f.read().strip().split("\n") 34 | if verbose: 35 | print("Found %d files" % (len(fns))) 36 | 37 | if num_files is not None: 38 | assert num_files <= len( 39 | fns 40 | ), "If you are going to sub-sample from the filelist, then you must specify a number of files less than the total number of files." 
41 | np.random.shuffle(fns) 42 | fns = fns[:num_files] 43 | if verbose: 44 | print("...but only using %d of them" % (len(fns))) 45 | 46 | # ----------------------------------- 47 | sampled_pixels = [] 48 | 49 | if verbose: 50 | print("Sampling %d pixels per tile" % (num_samples_per_file)) 51 | 52 | with rasterio.open(fns[0]) as f: 53 | num_channels = f.count 54 | 55 | tic = time.time() 56 | for i, fn in enumerate(fns): 57 | if i % 10 == 0 and verbose: 58 | print("%d/%d\t%0.2f seconds" % (i + 1, len(fns), time.time() - tic)) 59 | tic = time.time() 60 | 61 | with rasterio.open(fn) as f: 62 | data = f.read().reshape(num_channels, -1) 63 | 64 | mask = np.sum(data == nodata, axis=0) == num_channels 65 | data = data[:, ~mask] 66 | num_samples = min(num_samples_per_file, data.shape[1]) 67 | idxs = np.random.choice(data.shape[1], size=num_samples) 68 | 69 | pixels = data[:, idxs] 70 | sampled_pixels.append(pixels) 71 | 72 | sampled_pixels = np.concatenate(sampled_pixels, axis=1) 73 | means = sampled_pixels.mean(axis=1, dtype=np.float64) 74 | stdevs = sampled_pixels.std(axis=1, dtype=np.float64) 75 | 76 | # ----------------------------------- 77 | 78 | print(type(means)) 79 | print(type(means[0])) 80 | if output_dir is not None: 81 | # with open(args.output_fn, "w") as f: 82 | # f.write("%s\n" % (means)) 83 | # f.write("%s\n" % (stdevs)) 84 | df = pd.DataFrame.from_dict({"means": means, "stdevs": stdevs}) 85 | df.to_csv( 86 | output_dir + "/" + os.path.splitext(os.path.basename(input_fn))[0] + ".csv" 87 | ) 88 | 89 | df2 = pd.DataFrame.from_dict( 90 | { 91 | "name": os.path.splitext(os.path.basename(input_fn))[0], 92 | "means": [means], 93 | "stdevs": [stdevs], 94 | } 95 | ) 96 | df2.to_csv( 97 | output_dir 98 | + "/" 99 | + os.path.splitext(os.path.basename(input_fn))[0] 100 | + "_2.csv" 101 | ) 102 | 103 | means = ",".join(["%0.4f" % (val) for val in means]) 104 | stdevs = ",".join(["%0.4f" % (val) for val in stdevs]) 105 | 106 | if verbose: 107 | print("Means:", means) 108 | print("Stdevs:", stdevs) 109 | 110 | elif not verbose: 111 | print(means) 112 | print(stdevs) 113 | 114 | 115 | def main(): 116 | parser = argparse.ArgumentParser(description="Image statistic calculation script") 117 | 118 | parser.add_argument( 119 | "-v", 120 | "--verbose", 121 | action="store_true", 122 | help="Enable verbose debugging", 123 | default=False, 124 | ) 125 | parser.add_argument( 126 | "--input_dir", 127 | action="store", 128 | type=str, 129 | help="Path to filelist. 
Filenames should be readable by rasterio.", 130 | required=True, 131 | ) 132 | parser.add_argument( 133 | "--output_dir", 134 | action="store", 135 | type=str, 136 | help="Filename to write (if this is not set, then we print the results to stdout)", 137 | default=None, 138 | ) 139 | parser.add_argument( 140 | "--num_samples_per_file", 141 | action="store", 142 | type=int, 143 | help="Filename to write", 144 | default=10000, 145 | ) 146 | parser.add_argument( 147 | "--num_files", 148 | action="store", 149 | type=int, 150 | help="Number of files to subsample", 151 | default=None, 152 | ) 153 | parser.add_argument( 154 | "--nodata", 155 | action="store", 156 | type=int, 157 | help="The nodata value to check (we assume that if each band in the data equals this value, then the position is nodata)", 158 | default=0, 159 | ) 160 | 161 | args = parser.parse_args(sys.argv[1:]) 162 | 163 | f_lst = [ 164 | args.input_dir + x for x in os.listdir(args.input_dir) if not x.startswith(".") 165 | ] 166 | print("f_list : ", f_lst) 167 | print("number of files to process: ", len(f_lst)) 168 | 169 | for i, f in enumerate(f_lst): 170 | print(i) 171 | stats( 172 | args.verbose, 173 | f, 174 | args.output_dir, 175 | args.num_samples_per_file, 176 | args.num_files, 177 | args.nodata, 178 | ) 179 | 180 | # Combine all dataframe together into master dataframe 181 | df_all = pd.concat( 182 | map(pd.read_csv, glob.glob(os.path.join(args.output_dir, "*_2.csv"))) 183 | ) 184 | df_all.to_csv(args.output_dir + "/" "all_stats.csv") 185 | 186 | 187 | if __name__ == "__main__": 188 | main() 189 | -------------------------------------------------------------------------------- /src/chips.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import mercantile 4 | import rio_tiler 5 | import tqdm 6 | from rio_tiler.io import COGReader 7 | from rio_tiler.models import ImageData 8 | import argparse 9 | 10 | parser = argparse.ArgumentParser(description="train/test/val csv creation script") 11 | parser.add_argument("--input_csv", type=str, required=True, help="") 12 | parser.add_argument("--output_dir", type=str, required=True, help="") 13 | args = parser.parse_args() 14 | 15 | 16 | def main(): 17 | df = pd.read_csv(args.input_csv) 18 | print(df.shape) 19 | img_path_lst = [] 20 | label_path_lst = [] 21 | for i, img in enumerate(tqdm.tqdm(df["image_fn"])): 22 | with COGReader(img) as cog: 23 | t_lst = [t for t in mercantile.tiles(*cog.bounds, 17)] 24 | print(len(t_lst)) 25 | 26 | with COGReader(df["label_fn"][i]) as cog_label: 27 | # chip NAIP image 28 | for t in t_lst: # fix 29 | img = cog.tile(t.x, t.y, t.z, tilesize=256) 30 | img_r = img.render(img_format="GTiff") 31 | 32 | img_arr = np.moveaxis(img.data, 0, -1) 33 | img_arr = img_arr - np.min(img_arr, (0, 1)) 34 | data_max_val = np.percentile(img_arr, 0.98, axis=(0, 1)) 35 | img_arr = img_arr / data_max_val * 255.0 36 | np.clip(img_arr, None, 255.0, img_arr) 37 | non_nodata_prop = np.sum(np.mean(img_arr, -1) > 0.0) / (256 * 256) 38 | 39 | if non_nodata_prop >= 0.95: 40 | # Can be replaced with Azure blob path 41 | path = f"{args.output_dir}/{t.x}-{t.y}-{t.z}-img.tif" 42 | path_label = f"{args.output_dir}/{t.x}-{t.y}-{t.z}-label.tif" 43 | with open(path, "wb") as f: 44 | f.write(img_r) 45 | img_label = cog_label.tile(t.x, t.y, t.z, tilesize=256) 46 | buff = img_label.render(img_format="GTiff") 47 | with open(path_label, "wb") as f: 48 | f.write(buff) 49 | img_path_lst.append(path) 50 | 
label_path_lst.append(path_label) 51 | else: 52 | print("removed too many no-data pixels") 53 | df_chips = pd.DataFrame( 54 | list(zip(img_path_lst, label_path_lst)), columns=["image_fn", "label_fn"] 55 | ) 56 | df_chips["group"] = df["group"][0] 57 | 58 | out_csv = args.output_dir + "/" + df["group"][0] + "_256chips" + ".csv" 59 | df_chips.to_csv(out_csv) 60 | 61 | 62 | if __name__ == "__main__": 63 | main() 64 | -------------------------------------------------------------------------------- /src/cls_distribution.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | os.environ[ 5 | "CURL_CA_BUNDLE" 6 | ] = "/etc/ssl/certs/ca-certificates.crt" # A workaround in case this happens: https://github.com/mapbox/rasterio/issues/1289 7 | import time 8 | import datetime 9 | import argparse 10 | import copy 11 | 12 | import numpy as np 13 | import pandas as pd 14 | import json 15 | import utils 16 | 17 | import rasterio 18 | from rasterio.windows import Window 19 | from rasterio.errors import RasterioError, RasterioIOError 20 | 21 | from transforms_utils import ( 22 | labels_transform_uvm_8cls, 23 | ) 24 | 25 | import matplotlib 26 | import matplotlib.pyplot as plt 27 | 28 | matplotlib.use("Agg") 29 | 30 | import seaborn as sns 31 | 32 | sns.set() 33 | 34 | import torch 35 | 36 | NUM_WORKERS = 6 37 | CHIP_SIZE = 256 38 | 39 | parser = argparse.ArgumentParser( 40 | description="Minic streaming label data to create class distribution" 41 | ) 42 | parser.add_argument( 43 | "--input_fn", 44 | type=str, 45 | required=True, 46 | help='The path to a CSV file containing three columns -- "image_fn", "label_fn", and "group" -- that point to tiles of imagery and labels as well as which "group" each tile is in.', 47 | ) 48 | parser.add_argument( 49 | "--label_transform", 50 | default="naip", 51 | required=True, 52 | help="str either naip or epa to indicate how to transform labels", 53 | ) 54 | parser.add_argument( 55 | "--output_dir", 56 | type=str, 57 | required=True, 58 | help="The path to store class distribution.", 59 | ) 60 | parser.add_argument( 61 | "--overwrite", 62 | action="store_true", 63 | help="Flag for overwriting `output_dir` if that directory already exists.", 64 | ) 65 | ## Training arguments to generate class distribution 66 | parser.add_argument( 67 | "--batch_size", type=int, default=16, help="Batch size to use for training" 68 | ) 69 | parser.add_argument( 70 | "--num_epochs", type=int, default=1, help="Number of epochs to train for" 71 | ) 72 | parser.add_argument( 73 | "--seed", type=int, default=0, help="Random seed to pass to numpy and torch" 74 | ) 75 | parser.add_argument( 76 | "--num_classes", type=int, default=10, help="number of classes in dataset" 77 | ) 78 | parser.add_argument( 79 | "--num_chips", 80 | type=int, 81 | default=40, 82 | help="number of chips to randomly sample from data", 83 | ) 84 | args = parser.parse_args() 85 | 86 | 87 | def stream_tile_fns(NUM_WORKERS, label_fns, groups): 88 | worker_info = torch.utils.data.get_worker_info() 89 | if ( 90 | worker_info is None 91 | ): # In this case we are not loading through a DataLoader with multiple workers 92 | worker_id = 0 93 | num_workers = 1 94 | else: 95 | worker_id = worker_info.id 96 | num_workers = worker_info.NUM_WORKERS 97 | 98 | # We only want to shuffle the order we traverse the files if we are the first worker (else, every worker will shuffle the files...) 
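    # (The chunking below gives each worker a disjoint slice of the filenames;
    # e.g. with 10 label files and 4 workers, ceil(10 / 4) = 3 files per worker:
    # workers 0-2 read files [0:3], [3:6], [6:9] and worker 3 reads the last file.)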
99 | if worker_id == 0: 100 | np.random.shuffle(label_fns) # in place 101 | # This logic splits up the list of filenames into `num_workers` chunks. Each worker will recieve ceil(num_filenames / num_workers) filenames to generate chips from. If the number of workers doesn't divide the number of filenames evenly then the last worker will have fewer filenames. 102 | N = len(label_fns) 103 | num_files_per_worker = int(np.ceil(N / num_workers)) 104 | lower_idx = worker_id * num_files_per_worker 105 | upper_idx = min(N, (worker_id + 1) * num_files_per_worker) 106 | for idx in range(lower_idx, upper_idx): 107 | 108 | label_fn = None 109 | # if self.use_labels: 110 | label_fn = label_fns[idx] 111 | group = groups[idx] 112 | print(label_fn) 113 | 114 | yield label_fn, group 115 | 116 | 117 | def stream_chips( 118 | num_workers, 119 | label_fns, 120 | num_chips_per_tile, 121 | groups, 122 | CHIP_SIZE, 123 | windowed_sampling, 124 | nodata_check, 125 | label_transform, 126 | verbose, 127 | ): 128 | for label_fn, group in stream_tile_fns(num_workers, label_fns, groups): 129 | num_skipped_chips = 0 130 | # Open file pointers 131 | label_fp = rasterio.open(label_fn, "r") 132 | 133 | # if use_labels: # garuntee that our label mask has the same dimensions as our imagery 134 | t_height, t_width = label_fp.shape 135 | print("Height and width of the label are:") 136 | print(t_height, t_width) 137 | 138 | # If we aren't in windowed sampling mode then we should read the entire tile up front 139 | label_data = None 140 | try: 141 | if not windowed_sampling: 142 | label_data = ( 143 | label_fp.read().squeeze() 144 | ) # assume the label geotiff has a single channel 145 | except RasterioError as e: 146 | print("WARNING: Error reading in entire file, skipping to the next file") 147 | continue 148 | 149 | for i in range(num_chips_per_tile): 150 | # Select the top left pixel of our chip randomly 151 | x = np.random.randint(0, t_width - CHIP_SIZE) 152 | y = np.random.randint(0, t_height - CHIP_SIZE) 153 | 154 | # Read labels 155 | labels = None 156 | if windowed_sampling: 157 | try: 158 | labels = label_fp.read( 159 | window=Window(x, y, CHIP_SIZE, CHIP_SIZE) 160 | ).squeeze() 161 | except RasterioError: 162 | print( 163 | "WARNING: Error reading chip from file, skipping to the next chip" 164 | ) 165 | continue 166 | else: 167 | labels = label_data[y : y + CHIP_SIZE, x : x + CHIP_SIZE] 168 | 169 | # # Check for no data 170 | if nodata_check is not None: 171 | skip_chip = nodata_check(labels) 172 | 173 | if ( 174 | skip_chip 175 | ): # The current chip has been identified as invalid by the `nodata_check(...)` method 176 | num_skipped_chips += 1 177 | continue 178 | if label_transform is not None: 179 | labels = label_transform(labels, group) 180 | else: 181 | labels = torch.from_numpy(labels).squeeze() 182 | print(labels) 183 | return labels 184 | label_fp.close() 185 | # 186 | if num_skipped_chips > 0 and verbose: 187 | print("We skipped %d chips on %s" % (label_fn)) 188 | 189 | 190 | def label_transforms_naip(labels, group): 191 | labels = np.array(labels).astype(np.int64) 192 | labels = np.where(labels == 14, 0, labels) # to no data 193 | labels = np.where(labels == 15, 0, labels) # to no data 194 | labels = np.where(labels == 13, 0, labels) # to no data 195 | labels = np.where(labels == 10, 3, labels) # to tree canopy 196 | labels = np.where(labels == 11, 3, labels) # to tree canopy 197 | labels = np.where(labels == 12, 3, labels) # to tree canopy 198 | return labels 199 | 200 | 201 | def 
label_transforms_epa(labels, group): 202 | labels = np.array(labels).astype(np.int64) 203 | labels_new = np.copy(labels) 204 | for k, v in utils.epa_label_dict.items(): 205 | labels_new[labels == k] = v 206 | labels_new = torch.from_numpy(labels_new) 207 | return labels_new 208 | 209 | 210 | def label_transforms_uvm(labels, group): 211 | labels = np.array(labels).astype(np.int64) 212 | labels_new = np.copy(labels) 213 | for k, v in utils.uvm_7cls.items(): 214 | labels_new[labels == k] = v 215 | labels_new = torch.from_numpy(labels_new) 216 | return labels_new 217 | 218 | 219 | def nodata_check(labels): 220 | return np.any(labels == 0) 221 | 222 | 223 | def class_distribute(): 224 | print( 225 | "Starting DFC2021 baseline training script at %s" 226 | % (str(datetime.datetime.now())) 227 | ) 228 | num_chips_per_tile = args.num_chips 229 | windowed_sampling = False 230 | label_transform = args.label_transform 231 | nodata_check = None 232 | verbose = True 233 | all_labels = [] 234 | 235 | if args.label_transform == "naip": 236 | label_transform = label_transforms_naip 237 | class_names = [ 238 | "no_data", 239 | "water", 240 | "emergent_wetlands", 241 | "tree_canopy", 242 | "shrubland", 243 | "low_vegetation", 244 | "barren", 245 | "structure", 246 | "impervious_surface", 247 | "impervious_roads", 248 | "weighted_avg", 249 | ] 250 | elif args.label_transform == "epa": 251 | label_transform = label_transforms_epa 252 | class_names = [ 253 | "no_data", 254 | "impervious", 255 | "soil_barren", 256 | "grass", 257 | "tree/forest", 258 | "water", 259 | "shrub", 260 | "woody_wetlands", 261 | "emergent_wetlands", 262 | "agriculture", 263 | "orchard", 264 | "weighted_avg", 265 | ] 266 | elif args.label_transform == "uvm": 267 | label_transform = label_transforms_uvm 268 | class_names = [ 269 | "tree", 270 | "grass", 271 | "bare soil", 272 | "water", 273 | "buildings", 274 | "roads", 275 | "other impervious", 276 | ] 277 | elif args.label_transform == "uvm8cls": 278 | label_transform = labels_transform_uvm_8cls 279 | class_names = [ 280 | "tree", 281 | "grass", 282 | "bare soil", 283 | "water", 284 | "buildings", 285 | "roads", 286 | "other impervious", 287 | ] 288 | else: 289 | raise ValueError("Invalid label transform") 290 | # ------------------- 291 | # Setup 292 | # ------------------- 293 | assert os.path.exists(args.input_fn) 294 | 295 | if os.path.isfile(args.output_dir): 296 | print("A file was passed as `--output_dir`, please pass a directory!") 297 | return 298 | 299 | if os.path.exists(args.output_dir) and len(os.listdir(args.output_dir)): 300 | if args.overwrite: 301 | print( 302 | "WARNING! The output directory, %s, already exists, we might overwrite data in it!" 303 | % (args.output_dir) 304 | ) 305 | else: 306 | print( 307 | "The output directory, %s, already exists and isn't empty. We don't want to overwrite and existing results, exiting..." 308 | % (args.output_dir) 309 | ) 310 | return 311 | else: 312 | print("The output directory doesn't exist or is empty.") 313 | os.makedirs(args.output_dir, exist_ok=True) 314 | 315 | if torch.cuda.is_available(): 316 | device = torch.device("cuda:%d" % args.gpu) 317 | else: 318 | print("WARNING! 
Torch is reporting that CUDA isn't available, using cpu") 319 | device = "cpu" 320 | 321 | np.random.seed(args.seed) 322 | torch.manual_seed(args.seed) 323 | 324 | # ------------------- 325 | # Load input data 326 | # ------------------- 327 | input_dataframe = pd.read_csv(args.input_fn) 328 | print(input_dataframe.head()) 329 | label_fns = input_dataframe["label_fn"].values 330 | groups = input_dataframe["group"].values 331 | print(label_fns) 332 | 333 | print(args.label_transform) 334 | if args.label_transform == "naip": 335 | label_transform = label_transforms_naip 336 | elif args.label_transform == "epa": 337 | label_transform = label_transforms_epa 338 | elif args.label_transform == "uvm": 339 | label_transform = label_transforms_uvm 340 | else: 341 | raise ValueError("Invalid label transform") 342 | 343 | num_training_batches_per_epoch = int( 344 | len(label_fns) * args.num_chips / args.batch_size 345 | ) 346 | 347 | # getting label chips stac by given model epochs 348 | # is num_chips_per_tile the num_training_batches_per_epoch 349 | for epoch in range(args.num_epochs): 350 | for num_batches in range(num_training_batches_per_epoch): 351 | try: 352 | labels = stream_chips( 353 | NUM_WORKERS, 354 | label_fns, 355 | num_chips_per_tile, 356 | groups, 357 | CHIP_SIZE, 358 | windowed_sampling, 359 | nodata_check, 360 | label_transform, 361 | verbose, 362 | ) 363 | all_labels.append(labels) 364 | except Exception as ex: 365 | print(ex) 366 | pass 367 | label_arr = np.array([t.numpy() for t in all_labels]) 368 | 369 | # ------------------- 370 | # Plot classes distribution 371 | # ------------------- 372 | fig, ax = plt.subplots(figsize=(30, 10)) 373 | fig.tight_layout() 374 | unique, counts = np.unique(label_arr, return_counts=True) 375 | cls_dict = dict(zip(range(len(class_names)), class_names)) 376 | vc_out = dict(zip(unique, counts / args.num_epochs)) # 10 epoaches 377 | vc2df = dict(zip(cls_dict.values(), vc_out.values())) 378 | df = pd.DataFrame.from_dict(vc2df, orient="index", columns=["count"]) 379 | 380 | ax = sns.barplot(x=df.index, y="count", data=df) 381 | ax.set_ylabel(f"Class count") 382 | ax.set_xlabel(f"Classe name") 383 | ax.set_xticklabels(labels=ax.get_xticklabels(), rotation=45, ha="right") 384 | plt.tight_layout() 385 | fig.savefig(os.path.join(args.output_dir, "cls_distribution.png")) 386 | csv_fn = "output_cls_counts_and_values.csv" 387 | df.to_csv(os.path.join(args.output_dir, csv_fn)) 388 | 389 | 390 | if __name__ == "__main__": 391 | class_distribute() 392 | -------------------------------------------------------------------------------- /src/dataloaders/StreamingDatasets.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import numpy as np 4 | 5 | import rasterio 6 | from rasterio.windows import Window 7 | from rasterio.errors import RasterioError, RasterioIOError 8 | 9 | import torch 10 | from torchvision import transforms 11 | from torch.utils.data.dataset import IterableDataset 12 | 13 | 14 | class StreamingGeospatialDataset(IterableDataset): 15 | def __init__( 16 | self, 17 | imagery_fns, 18 | label_fns=None, 19 | groups=None, 20 | chip_size=256, 21 | num_chips_per_tile=200, 22 | windowed_sampling=False, 23 | image_transform=None, 24 | label_transform=None, 25 | nodata_check=None, 26 | verbose=True, 27 | ): 28 | """A torch Dataset for randomly sampling chips from a list of tiles. 
When used in conjunction with a DataLoader that has `num_workers>1` this Dataset will assign each worker to sample chips from disjoint sets of tiles. 29 | Args: 30 | imagery_fns: A list of filenames (or URLS -- anything that `rasterio.open()` can read) pointing to imagery tiles. 31 | label_fns: A list of filenames of the same size as `imagery_fns` pointing to label mask tiles or `None` if the Dataset should operate in "imagery only mode". Note that we expect `imagery_fns[i]` and `label_fns[i]` to have the same dimension and coordinate system. 32 | groups: Optional: A list of integers of the same size as `imagery_fns` that gives the "group" membership of each tile. This can be used to normalize imagery from different groups differently. 33 | chip_size: Desired size of chips (in pixels). 34 | num_chips_per_tile: Desired number of chips to sample for each tile. 35 | windowed_sampling: Flag indicating whether we should sample each chip with a read using `rasterio.windows.Window` or whether we should read the whole tile into memory, then sample chips. 36 | image_transform: A function to apply to each image chip object. If this is `None`, then the only transformation applied to the loaded imagery will be to convert it to a `torch.Tensor`. If this is not `None`, then the function should return a `Torch.tensor`. Further, if `groups` is not `None` then the transform function should expect the imagery as the first argument and the group as the second argument. 37 | label_transform: Similar to image_transform, but applied to label chips. 38 | nodata_check: A method that will check an `(image_chip)` or `(image_chip, label_chip)` (if `label_fns` are provided) and return whether or not the chip should be skipped. This can be used, for example, to skip chips that contain nodata values. 39 | verbose: If `False` we will be quiet. 40 | """ 41 | 42 | if label_fns is None: 43 | self.fns = imagery_fns 44 | self.use_labels = False 45 | else: 46 | self.fns = list(zip(imagery_fns, label_fns)) 47 | self.use_labels = True 48 | 49 | self.groups = groups 50 | 51 | self.chip_size = chip_size 52 | self.num_chips_per_tile = num_chips_per_tile 53 | self.windowed_sampling = windowed_sampling 54 | 55 | self.image_transform = image_transform 56 | self.label_transform = label_transform 57 | self.nodata_check = nodata_check 58 | 59 | self.verbose = verbose 60 | 61 | if self.verbose: 62 | print("Constructed StreamingGeospatialDataset") 63 | 64 | def stream_tile_fns(self): 65 | worker_info = torch.utils.data.get_worker_info() 66 | if ( 67 | worker_info is None 68 | ): # In this case we are not loading through a DataLoader with multiple workers 69 | worker_id = 0 70 | num_workers = 1 71 | else: 72 | worker_id = worker_info.id 73 | num_workers = worker_info.num_workers 74 | 75 | # We only want to shuffle the order we traverse the files if we are the first worker (else, every worker will shuffle the files...) 76 | if worker_id == 0: 77 | np.random.shuffle(self.fns) # in place 78 | # NOTE: A warning, when different workers are created they will all have the same numpy random seed, however will have different torch random seeds. If you want to use numpy random functions, seed appropriately. 79 | # seed = torch.randint(low=0,high=2**32-1,size=(1,)).item() 80 | # np.random.seed(seed) # when different workers spawn, they have the same numpy random seed... 81 | 82 | if self.verbose: 83 | print("Creating a filename stream for worker %d" % (worker_id)) 84 | 85 | # This logic splits up the list of filenames into `num_workers` chunks. 
Each worker will recieve ceil(num_filenames / num_workers) filenames to generate chips from. If the number of workers doesn't divide the number of filenames evenly then the last worker will have fewer filenames. 86 | N = len(self.fns) 87 | num_files_per_worker = int(np.ceil(N / num_workers)) 88 | lower_idx = worker_id * num_files_per_worker 89 | upper_idx = min(N, (worker_id + 1) * num_files_per_worker) 90 | for idx in range(lower_idx, upper_idx): 91 | 92 | label_fn = None 93 | if self.use_labels: 94 | img_fn, label_fn = self.fns[idx] 95 | else: 96 | img_fn = self.fns[idx] 97 | 98 | if self.groups is not None: 99 | group = self.groups[idx] 100 | else: 101 | group = None 102 | 103 | if self.verbose: 104 | print("Worker %d, yielding file %d" % (worker_id, idx)) 105 | 106 | yield (img_fn, label_fn, group) 107 | 108 | def stream_chips(self): 109 | for img_fn, label_fn, group in self.stream_tile_fns(): 110 | num_skipped_chips = 0 111 | 112 | # Open file pointers 113 | img_fp = rasterio.open(img_fn, "r") 114 | print(img_fn) 115 | label_fp = rasterio.open(label_fn, "r") if self.use_labels else None 116 | print(label_fn) 117 | 118 | print("label shape: ", label_fp.shape) 119 | print("label height: ", label_fp.shape[0]) 120 | print("label width: ", label_fp.shape[1]) 121 | 122 | height, width = img_fp.shape 123 | print("image height: ", height) 124 | print("image width: ", width) 125 | 126 | if ( 127 | self.use_labels 128 | ): # garuntee that our label mask has the same dimensions as our imagery 129 | t_height, t_width = label_fp.shape 130 | assert height == t_height and width == t_width 131 | 132 | # If we aren't in windowed sampling mode then we should read the entire tile up front 133 | img_data = None 134 | label_data = None 135 | try: 136 | if not self.windowed_sampling: 137 | img_data = np.rollaxis(img_fp.read(), 0, 3) 138 | if self.use_labels: 139 | label_data = ( 140 | label_fp.read().squeeze() 141 | ) # assume the label geotiff has a single channel 142 | except RasterioError as e: 143 | print( 144 | "WARNING: Error reading in entire file, skipping to the next file" 145 | ) 146 | continue 147 | 148 | for i in range(self.num_chips_per_tile): 149 | # Select the top left pixel of our chip randomly 150 | x = np.random.randint(0, width - self.chip_size) 151 | y = np.random.randint(0, height - self.chip_size) 152 | 153 | # Read imagery / labels 154 | img = None 155 | labels = None 156 | if self.windowed_sampling: 157 | try: 158 | img = np.rollaxis( 159 | img_fp.read( 160 | window=Window(x, y, self.chip_size, self.chip_size) 161 | ), 162 | 0, 163 | 3, 164 | ) 165 | # print(img.shape) 166 | if self.use_labels: 167 | labels = label_fp.read( 168 | window=Window(x, y, self.chip_size, self.chip_size) 169 | ).squeeze() 170 | except RasterioError: 171 | print( 172 | "WARNING: Error reading chip from file, skipping to the next chip" 173 | ) 174 | continue 175 | else: 176 | img = img_data[y : y + self.chip_size, x : x + self.chip_size, :] 177 | if self.use_labels: 178 | labels = label_data[ 179 | y : y + self.chip_size, x : x + self.chip_size 180 | ] 181 | 182 | # Check for no data 183 | if self.nodata_check is not None: 184 | if self.use_labels: 185 | skip_chip = self.nodata_check(img, labels) 186 | else: 187 | skip_chip = self.nodata_check(img) 188 | 189 | if ( 190 | skip_chip 191 | ): # The current chip has been identified as invalid by the `nodata_check(...)` method 192 | num_skipped_chips += 1 193 | continue 194 | 195 | # Transform the imagery 196 | if self.image_transform is not None: 197 | if 
self.groups is None: 198 | img = self.image_transform(img) 199 | else: 200 | img = self.image_transform(img, group) 201 | else: 202 | img = torch.from_numpy(img).squeeze() 203 | 204 | # Transform the labels 205 | if self.use_labels: 206 | if self.label_transform is not None: 207 | if self.groups is None: 208 | labels = self.label_transform(labels) 209 | else: 210 | labels = self.label_transform(labels, group) 211 | else: 212 | labels = torch.from_numpy(labels).squeeze() 213 | 214 | # Note, that img should be a torch "Double" type (i.e. a np.float32) and labels should be a torch "Long" type (i.e. np.int64) 215 | if self.use_labels: 216 | yield img, labels 217 | else: 218 | yield img 219 | 220 | # Close file pointers 221 | img_fp.close() 222 | if self.use_labels: 223 | label_fp.close() 224 | 225 | if num_skipped_chips > 0 and self.verbose: 226 | print("We skipped %d chips on %s" % (num_skipped_chips, img_fn)) 227 | 228 | def __iter__(self): 229 | if self.verbose: 230 | print("Creating a new StreamingGeospatialDataset iterator") 231 | return iter(self.stream_chips()) 232 | -------------------------------------------------------------------------------- /src/dataloaders/TileDatasets.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import rasterio 4 | from rasterio.windows import Window 5 | from rasterio.errors import RasterioIOError 6 | 7 | import torch 8 | from torch.utils.data.dataset import Dataset 9 | 10 | 11 | class TileInferenceDataset(Dataset): 12 | def __init__( 13 | self, 14 | fn, 15 | chip_size, 16 | stride, 17 | transform=None, 18 | windowed_sampling=False, 19 | verbose=False, 20 | ): 21 | """A torch Dataset for sampling a grid of chips that covers an input tile. 22 | If `chip_size` doesn't divide the height of the tile evenly (which is what is likely to happen) then we will sample an additional row of chips that are aligned to the bottom of the file. 23 | We do a similar operation if `chip_size` doesn't divide the width of the tile evenly -- by appending an additional column. 24 | Note: without a `transform` we will return chips in (height, width, channels) format in whatever the tile's dtype is. 25 | Args: 26 | fn: The path to the file to sample from (this can be anything that rasterio.open(...) knows how to read). 27 | chip_size: The size of chips to return (chips will be squares). 28 | stride: How much we move the sliding window to sample the next chip. If this is is less than `chip_size` then we will get overlapping windows, if it is > `chip_size` then some parts of the tile will not be sampled. 29 | transform: A torchvision Transform to apply on each chip. 30 | windowed_sample: If `True` we will use rasterio.windows.Window to sample chips without every loading the entire file into memory, else, we will load the entire tile up-front and index into it to sample chips. 31 | verbose: Flag to control printing stuff. 
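        Example (an illustrative sketch; the file path and parameter values are placeholders):
            >>> ds = TileInferenceDataset("naip_tile.tif", chip_size=256, stride=128)
            >>> chip, (y, x) = ds[0]   # `chip` is (chip_size, chip_size, channels) in the tile's dtype
            >>> len(ds)                # number of grid chips covering the tile
        In practice the dataset is wrapped in a `torch.utils.data.DataLoader`, as in eval.py.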
32 | """ 33 | self.fn = fn 34 | self.chip_size = chip_size 35 | self.transform = transform 36 | self.windowed_sampling = windowed_sampling 37 | self.verbose = verbose 38 | with rasterio.open(self.fn) as f: 39 | height, width = f.height, f.width 40 | self.num_channels = f.count 41 | self.dtype = f.profile["dtype"] 42 | if ( 43 | not windowed_sampling 44 | ): # if we aren't using windowed sampling, then go ahead and read in all of the data 45 | self.data = np.rollaxis(f.read(), 0, 3) 46 | self.chip_coordinates = ( 47 | [] 48 | ) # upper left coordinate (y,x), of each chip that this Dataset will return 49 | for y in list(range(0, height - self.chip_size, stride)) + [ 50 | height - self.chip_size 51 | ]: 52 | for x in list(range(0, width - self.chip_size, stride)) + [ 53 | width - self.chip_size 54 | ]: 55 | self.chip_coordinates.append((y, x)) 56 | self.num_chips = len(self.chip_coordinates) 57 | 58 | if self.verbose: 59 | print( 60 | "Constructed TileInferenceDataset -- we have %d by %d file with %d channels with a dtype of %s. We are sampling %d chips from it." 61 | % (height, width, self.num_channels, self.dtype, self.num_chips) 62 | ) 63 | 64 | def __getitem__(self, idx): 65 | """ 66 | Returns: 67 | A tuple (chip, (y,x)): `chip` is the chip that we sampled from the larger tile. (y,x) are the indices of the upper left corner of the chip. 68 | """ 69 | y, x = self.chip_coordinates[idx] 70 | if self.windowed_sampling: 71 | try: 72 | with rasterio.Env(): 73 | with rasterio.open(self.fn) as f: 74 | img = np.rollaxis( 75 | f.read( 76 | window=rasterio.windows.Window( 77 | x, y, self.chip_size, self.chip_size 78 | ) 79 | ), 80 | 0, 81 | 3, 82 | ) 83 | except RasterioIOError as e: # NOTE(caleb): I put this here to catch weird errors that I was seeing occasionally when trying to read from COGS - I don't remember the details though 84 | print("Reading %d failed, returning 0's" % (idx)) 85 | img = np.zeros( 86 | (self.chip_size, self.chip_size, self.num_channels), dtype=np.uint8 87 | ) 88 | else: 89 | img = self.data[y : y + self.chip_size, x : x + self.chip_size] 90 | 91 | if self.transform is not None: 92 | img = self.transform(img) 93 | 94 | return img, np.array((y, x)) 95 | 96 | def __len__(self): 97 | return self.num_chips 98 | -------------------------------------------------------------------------------- /src/dataloaders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/developmentseed/pearl-ml-pipeline/bf8b857b5939f5e614bc81e7eb156246eea987ad/src/dataloaders/__init__.py -------------------------------------------------------------------------------- /src/embeddings.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import time 4 | import datetime 5 | import argparse 6 | 7 | import numpy as np 8 | import pandas as pd 9 | 10 | import rasterio 11 | from rasterio.windows import Window 12 | 13 | import torch 14 | import torch.nn.functional as F 15 | 16 | import models 17 | from dataloaders.TileDatasets import TileInferenceDataset 18 | import utils 19 | from sklearn.metrics import confusion_matrix, f1_score 20 | 21 | os.environ[ 22 | "CURL_CA_BUNDLE" 23 | ] = "/etc/ssl/certs/ca-certificates.crt" # A workaround in case this happens: https://github.com/mapbox/rasterio/issues/1289 24 | 25 | NUM_WORKERS = 4 26 | CHIP_SIZE = 256 27 | PADDING = 128 28 | assert PADDING % 2 == 0 29 | HALF_PADDING = PADDING // 2 30 | CHIP_STRIDE = CHIP_SIZE - PADDING 31 | 32 | 
from azureml.core import Run 33 | 34 | run = Run.get_context() 35 | 36 | parser = argparse.ArgumentParser(description="DFC2021 model inference script") 37 | parser.add_argument( 38 | "--input_fn", 39 | type=str, 40 | required=True, 41 | help='The path to a CSV file containing three columns -- "image_fn", "label_fn", and "group" -- that point to tiles of imagery and labels as well as which "group" each tile is in.', 42 | ) 43 | parser.add_argument( 44 | "--model_fn", type=str, required=True, help="Path to the model file to use." 45 | ) 46 | parser.add_argument( 47 | "--output_dir", 48 | type=str, 49 | required=True, 50 | help="The path to output the model predictions as a GeoTIFF. Will fail if this file already exists.", 51 | ) 52 | parser.add_argument( 53 | "--overwrite", 54 | action="store_true", 55 | help="Flag for overwriting `--output_dir` if that directory already exists.", 56 | ) 57 | parser.add_argument("--gpu", type=int, default=0, help="The ID of the GPU to use") 58 | parser.add_argument( 59 | "--batch_size", type=int, default=2, help="Batch size to use during inference." 60 | ) 61 | parser.add_argument( 62 | "--model", default="fcn", choices=("unet", "fcn"), help="Model to use" 63 | ) 64 | 65 | parser.add_argument( 66 | "--num_classes", 67 | type=int, 68 | default=11, 69 | help="number of classes model was trained with", 70 | ), 71 | 72 | parser.add_argument( 73 | "--label_transform", 74 | default="naip", 75 | help="str either naip, epa or cic to indicate how to transform labels", 76 | ) 77 | 78 | 79 | args = parser.parse_args() 80 | 81 | 82 | def label_transforms_naip(labels): 83 | labels = np.array(labels).astype(np.int64) 84 | labels = np.where(labels == 14, 0, labels) # to no data 85 | labels = np.where(labels == 15, 0, labels) # to no data 86 | labels = np.where(labels == 13, 0, labels) # to no data 87 | labels = np.where(labels == 10, 3, labels) # to tree canopy 88 | labels = np.where(labels == 11, 3, labels) # to tree canopy 89 | labels = np.where(labels == 12, 3, labels) # to tree canopy 90 | return labels 91 | 92 | 93 | def label_transforms_epa(labels): 94 | labels = np.array(labels).astype(np.int64) 95 | labels_new = np.copy(labels) 96 | for k, v in utils.epa_label_dict.items(): 97 | labels_new[labels == k] = v 98 | return labels_new 99 | 100 | 101 | def label_transform_cic(labels): 102 | labels = np.array(labels).astype(np.int64) 103 | labels_new = np.copy(labels) 104 | for k, v in utils.cic_label_dict.items(): 105 | labels_new[labels == k] = v 106 | return labels_new 107 | 108 | 109 | def random_pixel_values(src_path: str, number_of_point: int, excludes={10, 11, 12}): 110 | with rasterio.open(src_path) as src_dst: 111 | output = {} 112 | cr_lst = [] 113 | arr = src_dst.read(indexes=1) 114 | value, count = np.unique(arr, return_counts=True) 115 | for (i, c) in enumerate(count): 116 | if value[i] in excludes: 117 | continue 118 | point_y, point_x = np.where(arr == value[i]) 119 | n_points = ( 120 | number_of_point if len(point_x) > number_of_point else len(point_x) 121 | ) 122 | indexes = np.random.choice(len(point_x), n_points).tolist() 123 | # y,x in row/col indexes 124 | cr = [(point_x[idx], point_y[idx]) for idx in indexes] 125 | output[value[i]] = cr 126 | cr_lst.append(cr) 127 | # TODO 128 | # yield pix, coordinates 129 | cr_f = [list(item) for sublist in cr_lst for item in sublist] 130 | return output, cr_f 131 | 132 | 133 | def main(): 134 | print("Starting model eval script at %s" % (str(datetime.datetime.now()))) 135 | 136 | # ------------------- 137 | # Setup 
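    # (validate the input CSV and model paths, prepare --output_dir, and pick the CUDA device)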
138 | # ------------------- 139 | assert os.path.exists(args.input_fn) 140 | assert os.path.exists(args.model_fn) 141 | 142 | if os.path.isfile(args.output_dir): 143 | print("A file was passed as `--output_dir`, please pass a directory!") 144 | return 145 | 146 | if os.path.exists(args.output_dir) and len(os.listdir(args.output_dir)) > 0: 147 | if args.overwrite: 148 | print( 149 | "WARNING! The output directory, %s, already exists, we might overwrite data in it!" 150 | % (args.output_dir) 151 | ) 152 | else: 153 | print( 154 | "The output directory, %s, already exists and isn't empty. We don't want to overwrite and existing results, exiting..." 155 | % (args.output_dir) 156 | ) 157 | return 158 | else: 159 | print("The output directory doesn't exist or is empty.") 160 | os.makedirs(args.output_dir, exist_ok=True) 161 | 162 | if torch.cuda.is_available(): 163 | device = torch.device("cuda:%d" % args.gpu) 164 | else: 165 | print("WARNING! Torch is reporting that CUDA isn't available, exiting...") 166 | return 167 | 168 | # ------------------- 169 | # Load model 170 | # ------------------- 171 | if args.model == "unet": 172 | model = models.get_unet(classes=args.num_classes) 173 | elif args.model == "fcn": 174 | model = models.get_fcn(num_output_classes=args.num_classes) 175 | else: 176 | raise ValueError("Invalid model") 177 | model.load_state_dict(torch.load(args.model_fn)) 178 | model = model.to(device) 179 | 180 | # determine which label transform to use 181 | if args.label_transform == "naip": 182 | label_transform = label_transforms_naip 183 | class_names = [ 184 | "no_data", 185 | "water", 186 | "emergent_wetlands", 187 | "tree_canopy", 188 | "shrubland", 189 | "low_vegetation", 190 | "barren", 191 | "structure", 192 | "impervious_surface", 193 | "impervious_roads", 194 | "weighted_avg", 195 | ] 196 | elif args.label_transform == "epa": 197 | label_transform = label_transforms_epa 198 | class_names = [ 199 | "no_data", 200 | "impervious", 201 | "soil_barren", 202 | "grass", 203 | "tree/forest", 204 | "water", 205 | "shrub", 206 | "woody_wetlands", 207 | "emergent_wetlands", 208 | "agriculture", 209 | "orchard", 210 | "weighted_avg", 211 | ] 212 | elif args.label_transform == "cic": 213 | label_transform = label_transform_cic 214 | class_names = [ 215 | "Structures", 216 | "Impervious Surface", 217 | "Water", 218 | "Grassland/Pairie", 219 | "Tree Canopy", 220 | "Turff", 221 | "Barren/Rock", 222 | "Irregated", 223 | ] 224 | else: 225 | raise ValueError("Invalid label transform") 226 | 227 | # ------------------- 228 | # Run on each line in the input 229 | # ------------------- 230 | input_dataframe = pd.read_csv(args.input_fn) 231 | image_fns = input_dataframe["image_fn"].values 232 | label_fns = input_dataframe["label_fn"].values 233 | groups = input_dataframe["group"].values 234 | 235 | # Get Row,Column for unique lables 236 | for i, gt_img in enumerate(label_fns): 237 | x_embedding = [] 238 | output, cr_f = random_pixel_values(gt_img, 10, {10, 11, 12}) 239 | labels = list(output.keys()) * 10 240 | labels.sort() 241 | # print(output) 242 | print("flattened coords:") 243 | print(cr_f) 244 | print("labels") 245 | print(labels) 246 | 247 | # run inference on window that contains each row,column val 248 | for rc in cr_f: 249 | gt_2 = rasterio.open(image_fns[i]) 250 | w = gt_2.read(window=Window(rc[0], rc[1], 256, 256)) 251 | data = w / 255.0 252 | data = data.astype(np.float32) 253 | data = torch.from_numpy(data) 254 | data = data.to(device) 255 | 256 | label_img = rasterio.open(gt_img) 
257 | w_label = label_img.read(1, window=Window(rc[0], rc[1], 256, 256)) 258 | print(w_label[0, 0]) 259 | 260 | with torch.no_grad(): 261 | 262 | embedding = model.forward_features( 263 | data[None, ...] 264 | ) # insert singleton "batch" dimension to input data for pytorch to be happy 265 | embedding = embedding.cpu().numpy() 266 | embedding = np.moveaxis(embedding[0], 0, -1) 267 | x_embedding.append(embedding[0, 0]) 268 | 269 | output_fn = gt_img[0][:-4].split("/")[-1] # something like "546_naip-2013.tif" 270 | output_fn_e = output_fn + "_embedding.npz" 271 | output_fn_l = output_fn + "_label.npz" 272 | 273 | output_path_e = os.path.join(args.output_dir, output_fn_e) 274 | output_path_label = os.path.join(args.output_dir, output_fn_l) 275 | 276 | np.savez(output_path_e, np.array(x_embedding)) 277 | np.savez(output_path_label, np.array(labels)) 278 | print("saved") 279 | 280 | 281 | if __name__ == "__main__": 282 | main() 283 | -------------------------------------------------------------------------------- /src/eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import time 4 | import datetime 5 | import argparse 6 | 7 | import numpy as np 8 | import pandas as pd 9 | 10 | import rasterio 11 | 12 | import torch 13 | import torch.nn.functional as F 14 | 15 | import models 16 | from dataloaders.TileDatasets import TileInferenceDataset 17 | import utils 18 | from transforms_utils import ( 19 | label_transforms_naip, 20 | label_transform_cic, 21 | label_transforms_epa, 22 | label_transform_naip5cls, 23 | labels_transform_uvm, 24 | labels_transform_uvm_8cls, 25 | image_transforms, 26 | ) 27 | from sklearn.metrics import f1_score 28 | from azureml.core import Run 29 | 30 | # A workaround in case this happens: https://github.com/mapbox/rasterio/issues/1289 31 | os.environ["CURL_CA_BUNDLE"] = "/etc/ssl/certs/ca-certificates.crt" 32 | 33 | NUM_WORKERS = 4 34 | CHIP_SIZE = 256 35 | PADDING = 128 36 | assert PADDING % 2 == 0 37 | HALF_PADDING = PADDING // 2 38 | CHIP_STRIDE = CHIP_SIZE - PADDING 39 | 40 | 41 | run = Run.get_context() 42 | 43 | parser = argparse.ArgumentParser(description="DFC2021 model inference script") 44 | parser.add_argument( 45 | "--input_fn", 46 | type=str, 47 | required=True, 48 | help='The path to a CSV file containing three columns -- "image_fn", "label_fn", and "group" -- that point to tiles of imagery and labels as well as which "group" each tile is in.', 49 | ) 50 | parser.add_argument( 51 | "--model_fn", type=str, required=True, help="Path to the model file to use." 52 | ) 53 | parser.add_argument( 54 | "--output_dir", 55 | type=str, 56 | required=True, 57 | help="The path to output the model predictions as a GeoTIFF. Will fail if this file already exists.", 58 | ) 59 | parser.add_argument( 60 | "--overwrite", 61 | action="store_true", 62 | help="Flag for overwriting `--output_dir` if that directory already exists.", 63 | ) 64 | parser.add_argument("--gpu", type=int, default=0, help="The ID of the GPU to use") 65 | parser.add_argument( 66 | "--batch_size", type=int, default=2, help="Batch size to use during inference." 
67 | ) 68 | parser.add_argument( 69 | "--save_soft", 70 | action="store_true", 71 | help='Flag that enables saving the predicted per class probabilities in addition to the "hard" class predictions.', 72 | ) 73 | parser.add_argument( 74 | "--model", 75 | default="fcn", 76 | choices=("unet", "fcn", "unet2", "deeplabv3plus"), 77 | help="Model to use", 78 | ) 79 | 80 | parser.add_argument( 81 | "--num_classes", 82 | type=int, 83 | default=10, 84 | help="number of classes model was trained with", 85 | ), 86 | 87 | parser.add_argument( 88 | "--label_transform", 89 | default="naip", 90 | help="str either naip, epa or cic to indicate how to transform labels", 91 | ) 92 | 93 | 94 | args = parser.parse_args() 95 | 96 | 97 | def main(): 98 | print("Starting model eval script at %s" % (str(datetime.datetime.now()))) 99 | 100 | # ------------------- 101 | # Setup 102 | # ------------------- 103 | assert os.path.exists(args.input_fn) 104 | assert os.path.exists(args.model_fn) 105 | 106 | if os.path.isfile(args.output_dir): 107 | print("A file was passed as `--output_dir`, please pass a directory!") 108 | return 109 | 110 | if os.path.exists(args.output_dir) and len(os.listdir(args.output_dir)) > 0: 111 | if args.overwrite: 112 | print( 113 | "WARNING! The output directory, %s, already exists, we might overwrite data in it!" 114 | % (args.output_dir) 115 | ) 116 | else: 117 | print( 118 | "The output directory, %s, already exists and isn't empty. We don't want to overwrite and existing results, exiting..." 119 | % (args.output_dir) 120 | ) 121 | return 122 | else: 123 | print("The output directory doesn't exist or is empty.") 124 | os.makedirs(args.output_dir, exist_ok=True) 125 | 126 | if torch.cuda.is_available(): 127 | device = torch.device("cuda:%d" % args.gpu) 128 | else: 129 | print("WARNING! 
Torch is reporting that CUDA isn't available, exiting...") 130 | return 131 | 132 | # ------------------- 133 | # Load model 134 | # ------------------- 135 | if args.model == "unet": 136 | model = models.get_unet(classes=args.num_classes) 137 | elif args.model == "fcn": 138 | model = models.get_fcn(num_output_classes=args.num_classes) 139 | elif args.model == "unet2": 140 | model = models.get_unet2(n_classes=args.num_classes) 141 | elif args.model == "deeplabv3plus": 142 | model = models.get_deeplabv3plus(n_classes=args.num_classes) 143 | else: 144 | raise ValueError("Invalid model") 145 | model.load_state_dict(torch.load(args.model_fn)) 146 | model = model.to(device) 147 | 148 | # determine which label transform to use 149 | if args.label_transform == "naip": 150 | label_transform = label_transforms_naip 151 | class_names = [ 152 | "no_data", 153 | "water", 154 | "emergent_wetlands", 155 | "tree_canopy", 156 | "shrubland", 157 | "low_vegetation", 158 | "barren", 159 | "structure", 160 | "impervious_surface", 161 | "impervious_roads", 162 | "weighted_avg", 163 | ] 164 | elif args.label_transform == "epa": 165 | label_transform = label_transforms_epa 166 | class_names = [ 167 | "no_data", 168 | "impervious", 169 | "soil_barren", 170 | "grass", 171 | "tree/forest", 172 | "water", 173 | "shrub", 174 | "woody_wetlands", 175 | "emergent_wetlands", 176 | "agriculture", 177 | "orchard", 178 | "weighted_avg", 179 | ] 180 | elif args.label_transform == "cic": 181 | label_transform = label_transform_cic 182 | class_names = [ 183 | "Structures", 184 | "Impervious Surface", 185 | "Water", 186 | "Grassland/Pairie", 187 | "Tree Canopy", 188 | "Turff", 189 | "Barren/Rock", 190 | "Irregated", 191 | ] 192 | 193 | elif args.label_transform == "naip_5cls": 194 | label_transform = label_transform_naip5cls 195 | class_names = [ 196 | "water/wetland", 197 | "tree", 198 | "barren", 199 | "low veg", 200 | "built enviornment", 201 | ] 202 | 203 | elif args.label_transform == "naip_4cls": 204 | label_transform = label_transform_naip5cls 205 | class_names = ["water/wetland", "tree", "low veg", "built enviornment"] 206 | elif args.label_transform == "uvm": 207 | label_transform = labels_transform_uvm 208 | class_names = [ 209 | "tree", 210 | "grass", 211 | "bare soil", 212 | "water", 213 | "buildings", 214 | "roads", 215 | "other impervious", 216 | ] 217 | elif args.label_transform == "uvm8cls": 218 | label_transform = labels_transform_uvm_8cls 219 | class_names = [ 220 | "tree", 221 | "grass", 222 | "bare soil", 223 | "water", 224 | "buildings", 225 | "roads", 226 | "other impervious", 227 | "shrubs", 228 | ] 229 | 230 | else: 231 | raise ValueError("Invalid label transform") 232 | 233 | # ------------------- 234 | # Run on each line in the input 235 | # ------------------- 236 | input_dataframe = pd.read_csv(args.input_fn) 237 | image_fns = input_dataframe["image_fn"].values 238 | label_fns = input_dataframe["label_fn"].values 239 | 240 | df_lst = [] 241 | for image_idx in range(len(image_fns)): 242 | pred_masks = [] 243 | tic = time.time() 244 | image_fn = image_fns[image_idx] 245 | gt_label_fn = label_fns[image_idx] 246 | 247 | print( 248 | "(%d/%d) Processing %s" % (image_idx, len(image_fns), image_fn), end=" ... 
" 249 | ) 250 | 251 | # ------------------- 252 | # Load input and create dataloader 253 | # ------------------- 254 | 255 | with rasterio.open(image_fn) as f: 256 | input_width, input_height = f.width, f.height 257 | input_profile = f.profile.copy() 258 | 259 | dataset = TileInferenceDataset( 260 | image_fn, 261 | chip_size=CHIP_SIZE, 262 | stride=CHIP_STRIDE, 263 | transform=image_transforms, 264 | verbose=False, 265 | ) 266 | dataloader = torch.utils.data.DataLoader( 267 | dataset, 268 | batch_size=args.batch_size, 269 | num_workers=NUM_WORKERS, 270 | pin_memory=True, 271 | ) 272 | 273 | # ------------------- 274 | # Run model and organize output 275 | # ------------------- 276 | 277 | output = np.zeros( 278 | (args.num_classes, input_height, input_width), dtype=np.float32 279 | ) 280 | kernel = np.ones((CHIP_SIZE, CHIP_SIZE), dtype=np.float32) 281 | kernel[HALF_PADDING:-HALF_PADDING, HALF_PADDING:-HALF_PADDING] = 5 282 | counts = np.zeros((input_height, input_width), dtype=np.float32) 283 | 284 | for i, (data, coords) in enumerate(dataloader): 285 | data = data.to(device) 286 | with torch.no_grad(): 287 | # https://discuss.pytorch.org/t/error-expected-more-than-1-value-per-channel-when-training/26274 288 | model.eval() 289 | t_output = model(data) 290 | t_output = F.softmax(t_output, dim=1).cpu().numpy() 291 | 292 | for j in range(t_output.shape[0]): 293 | y, x = coords[j] 294 | 295 | output[:, y : y + CHIP_SIZE, x : x + CHIP_SIZE] += t_output[j] * kernel 296 | counts[y : y + CHIP_SIZE, x : x + CHIP_SIZE] += kernel 297 | 298 | output = output / counts 299 | output_hard = output.argmax(axis=0).astype(np.uint8) 300 | 301 | # append to list of preds 302 | pred_masks.append(output_hard) 303 | 304 | # ------------------- 305 | # Save output 306 | # ------------------- 307 | output_profile = input_profile.copy() 308 | output_profile["driver"] = "GTiff" 309 | output_profile["dtype"] = "uint8" 310 | output_profile["count"] = 1 311 | output_profile["nodata"] = 90 312 | 313 | output_fn = image_fn.split("/")[-1] # something like "546_naip-2013.tif" 314 | output_fn = output_fn.replace("naip", "predictions") 315 | output_fn = os.path.join(args.output_dir, output_fn) 316 | 317 | with rasterio.open(output_fn, "w", **output_profile) as f: 318 | f.write(output_hard, 1) 319 | f.write_colormap(1, utils.LC_TREE_COLORMAP) # fix 320 | 321 | if args.save_soft: 322 | 323 | output = output / output.sum(axis=0, keepdims=True) 324 | output = (output * 255).astype(np.uint8) 325 | 326 | output_profile = input_profile.copy() 327 | output_profile["driver"] = "GTiff" 328 | output_profile["dtype"] = "uint8" 329 | output_profile["count"] = 13 330 | # output_profile["count"] = 13 331 | del output_profile["nodata"] 332 | 333 | output_fn = image_fn.split("/")[-1] # something like "546_naip-2013.tif" 334 | output_fn = output_fn.replace("naip", "predictions-soft") 335 | output_fn = os.path.join(args.output_dir, output_fn) 336 | 337 | with rasterio.open(output_fn, "w", **output_profile) as f: 338 | f.write(output) 339 | 340 | print("finished in %0.4f seconds" % (time.time() - tic)) 341 | 342 | # load in ground truth 343 | gt = rasterio.open(gt_label_fn).read() 344 | gt = gt[0] 345 | gt_f = np.reshape(gt, [-1]) 346 | 347 | # remove no data vals 348 | gt_cleaned = np.delete( 349 | gt_f, np.where((gt_f == 15) | (gt_f == 14) | (gt_f == 13) | (gt_f == 0)) 350 | ) 351 | 352 | print(gt_cleaned.shape) 353 | print(np.unique(gt_cleaned)) 354 | 355 | gt_t = label_transform(gt_cleaned) 356 | print("label transformed unique") 357 | 
print(np.unique(gt_t)) 358 | 359 | # f-score calculation 360 | pred_masks = np.array(pred_masks) 361 | pred_masks_f = np.reshape(pred_masks, [-1]) 362 | 363 | pred_masks_cleaned = np.delete( 364 | pred_masks_f, 365 | np.where((gt_f == 15) | (gt_f == 14) | (gt_f == 13) | (gt_f == 0)), 366 | ) 367 | print(pred_masks_cleaned.shape) 368 | 369 | uniq_tm = np.unique(gt_t) # unique true mask 370 | print(uniq_tm) 371 | uniq_pm = np.unique(pred_masks_cleaned) # unique pred mask 372 | 373 | # f1 score is computed toward common classes between gt and pred 374 | uniq_v = np.unique(np.concatenate((uniq_tm, uniq_pm))) 375 | print(uniq_v) 376 | 377 | # determine missing labels 378 | missing_labels = np.setdiff1d(list(np.arange(args.num_classes)), uniq_v) 379 | 380 | f1_score_weighted = f1_score(gt_t, pred_masks_cleaned, average="weighted") 381 | 382 | f1_score_per_class = f1_score(gt_t, pred_masks_cleaned, average=None) 383 | print(f"Length of f1 for classes {len(f1_score_per_class)}, they are: \n") 384 | print(f1_score_per_class) 385 | 386 | per_class_f1_final = np.zeros(len(class_names)) 387 | # where the unique cls id exist, fill in f1 per calss 388 | per_class_f1_final[uniq_v] = f1_score_per_class 389 | # where is the missing id, fill in np.nan 390 | per_class_f1_final[missing_labels] = np.nan 391 | per_class_f1_final[-1] = f1_score_weighted 392 | 393 | d = { 394 | "class": class_names, 395 | image_fn.split("/")[-1] + "_f1_score": per_class_f1_final, 396 | } 397 | 398 | df = pd.DataFrame.from_dict(d) 399 | df_t = df.T 400 | df_t.columns = df["class"] 401 | df_t = df_t.drop("class") 402 | 403 | df_lst.append(df_t) 404 | 405 | df_combine = pd.concat(df_lst) 406 | df_combine.loc["mean"] = df_combine.mean(axis=0) 407 | csv_fn = os.path.join(args.output_dir, "f1_score_stats.csv") 408 | df_combine.to_csv(csv_fn) 409 | 410 | 411 | if __name__ == "__main__": 412 | main() 413 | -------------------------------------------------------------------------------- /src/models.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import logging 3 | 4 | import numpy as np 5 | 6 | import torch 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | import segmentation_models_pytorch as smp 11 | 12 | import utils 13 | 14 | from typing import Optional, Union, List 15 | 16 | 17 | class FCN(nn.Module): 18 | def __init__(self, num_input_channels, num_output_classes, num_filters=64): 19 | super(FCN, self).__init__() 20 | 21 | self.conv1 = nn.Conv2d( 22 | num_input_channels, num_filters, kernel_size=3, stride=1, padding=1 23 | ) 24 | self.conv2 = nn.Conv2d( 25 | num_filters, num_filters, kernel_size=3, stride=1, padding=1 26 | ) 27 | self.conv3 = nn.Conv2d( 28 | num_filters, num_filters, kernel_size=3, stride=1, padding=1 29 | ) 30 | self.conv4 = nn.Conv2d( 31 | num_filters, num_filters, kernel_size=3, stride=1, padding=1 32 | ) 33 | self.conv5 = nn.Conv2d( 34 | num_filters, num_filters, kernel_size=3, stride=1, padding=1 35 | ) 36 | self.last = nn.Conv2d( 37 | num_filters, num_output_classes, kernel_size=1, stride=1, padding=0 38 | ) 39 | 40 | def forward(self, inputs): 41 | x = F.relu(self.conv1(inputs)) 42 | x = F.relu(self.conv2(x)) 43 | x = F.relu(self.conv3(x)) 44 | x = F.relu(self.conv4(x)) 45 | x = F.relu(self.conv5(x)) 46 | x = self.last(x) 47 | return x 48 | 49 | def forward_features(self, inputs): 50 | x = F.relu(self.conv1(inputs)) 51 | x = F.relu(self.conv2(x)) 52 | x = F.relu(self.conv3(x)) 53 | x = F.relu(self.conv4(x)) 54 | z = 
F.relu(self.conv5(x)) 55 | # y = self.last(z) 56 | return z 57 | 58 | 59 | class Unet(smp.base.SegmentationModel): 60 | """Unet_ is a fully convolution neural network for image semantic segmentation. Consist of *encoder* 61 | and *decoder* parts connected with *skip connections*. Encoder extract features of different spatial 62 | resolution (skip connections) which are used by decoder to define accurate segmentation mask. Use *concatenation* 63 | for fusing decoder blocks with skip connections. 64 | Args: 65 | encoder_name: Name of the classification model that will be used as an encoder (a.k.a backbone) 66 | to extract features of different spatial resolution 67 | encoder_depth: A number of stages used in encoder in range [3, 5]. Each stage generate features 68 | two times smaller in spatial dimensions than previous one (e.g. for depth 0 we will have features 69 | with shapes [(N, C, H, W),], for depth 1 - [(N, C, H, W), (N, C, H // 2, W // 2)] and so on). 70 | Default is 5 71 | encoder_weights: One of **None** (random initialization), **"imagenet"** (pre-training on ImageNet) and 72 | other pretrained weights (see table with available weights for each encoder_name) 73 | decoder_channels: List of integers which specify **in_channels** parameter for convolutions used in decoder. 74 | Length of the list should be the same as **encoder_depth** 75 | decoder_use_batchnorm: If **True**, BatchNorm2d layer between Conv2D and Activation layers 76 | is used. If **"inplace"** InplaceABN will be used, allows to decrease memory consumption. 77 | Available options are **True, False, "inplace"** 78 | decoder_attention_type: Attention module used in decoder of the model. Available options are **None** and **scse**. 79 | SCSE paper - https://arxiv.org/abs/1808.08127 80 | in_channels: A number of input channels for the model, default is 3 (RGB images) 81 | classes: A number of classes for output mask (or you can think as a number of channels of output mask) 82 | activation: An activation function to apply after the final convolution layer. 83 | Available options are **"sigmoid"**, **"softmax"**, **"logsoftmax"**, **"tanh"**, **"identity"**, **callable** and **None**. 84 | Default is **None** 85 | aux_params: Dictionary with parameters of the auxiliary output (classification head). Auxiliary output is build 86 | on top of encoder if **aux_params** is not **None** (default). Supported params: 87 | - classes (int): A number of classes 88 | - pooling (str): One of "max", "avg". Default is "avg" 89 | - dropout (float): Dropout factor in [0, 1) 90 | - activation (str): An activation function to apply "sigmoid"/"softmax" (could be **None** to return logits) 91 | Returns: 92 | ``torch.nn.Module``: Unet 93 | .. 
_Unet: 94 | https://arxiv.org/abs/1505.04597 95 | """ 96 | 97 | def __init__( 98 | self, 99 | encoder_name: str = "resnet34", 100 | encoder_depth: int = 5, 101 | encoder_weights: Optional[str] = "imagenet", 102 | decoder_use_batchnorm: bool = True, 103 | decoder_channels: List[int] = (256, 128, 64, 32, 16), 104 | decoder_attention_type: Optional[str] = None, 105 | in_channels: int = 3, 106 | classes: int = 1, 107 | activation: Optional[Union[str, callable]] = None, 108 | aux_params: Optional[dict] = None, 109 | ): 110 | super().__init__() 111 | 112 | self.encoder = smp.encoders.get_encoder( 113 | encoder_name, 114 | in_channels=in_channels, 115 | depth=encoder_depth, 116 | weights=encoder_weights, 117 | ) 118 | 119 | self.decoder = smp.unet.decoder.UnetDecoder( 120 | encoder_channels=self.encoder.out_channels, 121 | decoder_channels=decoder_channels, 122 | n_blocks=encoder_depth, 123 | use_batchnorm=decoder_use_batchnorm, 124 | center=True if encoder_name.startswith("vgg") else False, 125 | attention_type=decoder_attention_type, 126 | ) 127 | 128 | self.segmentation_head = smp.base.SegmentationHead( 129 | in_channels=decoder_channels[-1], 130 | out_channels=classes, 131 | activation=activation, 132 | kernel_size=1, 133 | ) 134 | 135 | if aux_params is not None: 136 | self.classification_head = smp.base.ClassificationHead( 137 | in_channels=self.encoder.out_channels[-1], **aux_params 138 | ) 139 | else: 140 | self.classification_head = None 141 | 142 | self.name = "u-{}".format(encoder_name) 143 | self.initialize() 144 | 145 | 146 | class Unet2(nn.Module): 147 | def __init__( 148 | self, 149 | feature_scale=1, 150 | n_classes=3, 151 | in_channels=3, 152 | is_deconv=True, 153 | is_batchnorm=False, 154 | ): 155 | """ 156 | Args: 157 | feature_scale: the smallest number of filters (depth c) is 64 when feature_scale is 1, 158 | and it is 32 when feature_scale is 2 159 | n_classes: number of output classes 160 | in_channels: number of channels in input 161 | is_deconv: 162 | is_batchnorm: 163 | """ 164 | 165 | super(Unet2, self).__init__() 166 | 167 | self.is_deconv = is_deconv 168 | self.in_channels = in_channels 169 | self.is_batchnorm = is_batchnorm 170 | self.feature_scale = feature_scale 171 | 172 | assert ( 173 | 64 % self.feature_scale == 0 174 | ), f"feature_scale {self.feature_scale} does not work with this UNet" 175 | 176 | filters = [ 177 | 64, 178 | 128, 179 | 256, 180 | 512, 181 | 1024, 182 | ] # this is `c` in the diagram, [c, 2c, 4c, 8c, 16c] 183 | filters = [int(x / self.feature_scale) for x in filters] 184 | logging.info("filters used are: {}".format(filters)) 185 | 186 | # downsampling 187 | self.conv1 = UnetConv2(self.in_channels, filters[0], self.is_batchnorm) 188 | self.maxpool1 = nn.MaxPool2d(kernel_size=2) 189 | 190 | self.conv2 = UnetConv2(filters[0], filters[1], self.is_batchnorm) 191 | self.maxpool2 = nn.MaxPool2d(kernel_size=2) 192 | 193 | self.conv3 = UnetConv2(filters[1], filters[2], self.is_batchnorm) 194 | self.maxpool3 = nn.MaxPool2d(kernel_size=2) 195 | 196 | self.conv4 = UnetConv2(filters[2], filters[3], self.is_batchnorm) 197 | self.maxpool4 = nn.MaxPool2d(kernel_size=2) 198 | 199 | self.center = UnetConv2(filters[3], filters[4], self.is_batchnorm) 200 | 201 | # upsampling 202 | self.up_concat4 = UnetUp(filters[4], filters[3], self.is_deconv) 203 | self.up_concat3 = UnetUp(filters[3], filters[2], self.is_deconv) 204 | self.up_concat2 = UnetUp(filters[2], filters[1], self.is_deconv) 205 | self.up_concat1 = UnetUp(filters[1], filters[0], self.is_deconv) 206 | 207 
| # final conv (without any concat) 208 | self.final = nn.Conv2d(filters[0], n_classes, kernel_size=1) 209 | 210 | def forward(self, inputs): 211 | conv1 = self.conv1(inputs) 212 | maxpool1 = self.maxpool1(conv1) 213 | 214 | conv2 = self.conv2(maxpool1) 215 | maxpool2 = self.maxpool2(conv2) 216 | 217 | conv3 = self.conv3(maxpool2) 218 | maxpool3 = self.maxpool3(conv3) 219 | 220 | conv4 = self.conv4(maxpool3) 221 | maxpool4 = self.maxpool4(conv4) 222 | 223 | center = self.center(maxpool4) 224 | up4 = self.up_concat4(conv4, center) 225 | up3 = self.up_concat3(conv3, up4) 226 | up2 = self.up_concat2(conv2, up3) 227 | up1 = self.up_concat1(conv1, up2) 228 | 229 | final = self.final(up1) 230 | 231 | return final 232 | 233 | def forward_features(self, inputs): 234 | conv1 = self.conv1(inputs) 235 | maxpool1 = self.maxpool1(conv1) 236 | 237 | conv2 = self.conv2(maxpool1) 238 | maxpool2 = self.maxpool2(conv2) 239 | 240 | conv3 = self.conv3(maxpool2) 241 | maxpool3 = self.maxpool3(conv3) 242 | 243 | conv4 = self.conv4(maxpool3) 244 | maxpool4 = self.maxpool4(conv4) 245 | 246 | center = self.center(maxpool4) 247 | up4 = self.up_concat4(conv4, center) 248 | up3 = self.up_concat3(conv3, up4) 249 | up2 = self.up_concat2(conv2, up3) 250 | up1 = self.up_concat1(conv1, up2) 251 | 252 | final = self.final(up1) 253 | 254 | return final, up1 255 | 256 | 257 | class UnetConv2(nn.Module): 258 | def __init__(self, in_channels, out_channels, is_batchnorm): 259 | super(UnetConv2, self).__init__() 260 | 261 | if is_batchnorm: 262 | self.conv1 = nn.Sequential( 263 | # this amount of padding/stride/kernel_size preserves width/height 264 | nn.Conv2d( 265 | in_channels, out_channels, kernel_size=3, stride=1, padding=1 266 | ), 267 | nn.BatchNorm2d(out_channels), 268 | nn.ReLU(), 269 | ) 270 | self.conv2 = nn.Sequential( 271 | nn.Conv2d( 272 | out_channels, out_channels, kernel_size=3, stride=1, padding=1 273 | ), 274 | nn.BatchNorm2d(out_channels), 275 | nn.ReLU(), 276 | ) 277 | else: 278 | self.conv1 = nn.Sequential( 279 | nn.Conv2d( 280 | in_channels, out_channels, kernel_size=3, stride=1, padding=1 281 | ), 282 | nn.ReLU(), 283 | ) 284 | self.conv2 = nn.Sequential( 285 | nn.Conv2d( 286 | out_channels, out_channels, kernel_size=3, stride=1, padding=1 287 | ), 288 | nn.ReLU(), 289 | ) 290 | 291 | def forward(self, inputs): 292 | outputs = self.conv1(inputs) 293 | outputs = self.conv2(outputs) 294 | return outputs 295 | 296 | 297 | class UnetUp(nn.Module): 298 | def __init__(self, in_channels, out_channels, is_deconv): 299 | """ 300 | is_deconv: use transposed conv layer to upsample - parameters are learnt; otherwise use 301 | bilinear interpolation to upsample. 302 | """ 303 | super(UnetUp, self).__init__() 304 | 305 | self.conv = UnetConv2(in_channels, out_channels, False) 306 | 307 | self.is_deconv = is_deconv 308 | if is_deconv: 309 | self.up = nn.ConvTranspose2d( 310 | in_channels, out_channels, kernel_size=2, stride=2 311 | ) 312 | # UpsamplingBilinear2d is deprecated in favor of interpolate() 313 | # else: 314 | # self.up = nn.UpsamplingBilinear2d(scale_factor=2) 315 | 316 | def forward(self, inputs1, inputs2): 317 | """ 318 | inputs1 is from the downward path, of higher resolution 319 | inputs2 is from the 'lower' layer. It gets upsampled (spatial size increases) and its depth (channels) halves 320 | to match the depth of inputs1, before being concatenated in the depth dimension. 
321 | """ 322 | if self.is_deconv: 323 | outputs2 = self.up(inputs2) 324 | else: 325 | # scale_factor is the multiplier for spatial size 326 | outputs2 = F.interpolate( 327 | inputs2, scale_factor=2, mode="bilinear", align_corners=True 328 | ) 329 | 330 | offset = outputs2.size()[2] - inputs1.size()[2] 331 | padding = 2 * [offset // 2, offset // 2] 332 | outputs1 = F.pad(inputs1, padding) 333 | 334 | return self.conv(torch.cat([outputs1, outputs2], dim=1)) 335 | 336 | 337 | def get_unet(classes=11): 338 | return Unet( 339 | encoder_name="resnet18", 340 | encoder_depth=3, 341 | encoder_weights=None, 342 | decoder_channels=(128, 64, 64), 343 | in_channels=4, 344 | classes=classes, 345 | ) 346 | 347 | 348 | def get_unet2(n_classes): 349 | return Unet2( 350 | feature_scale=1, 351 | n_classes=n_classes, 352 | in_channels=4, 353 | is_deconv=True, 354 | is_batchnorm=False, 355 | ) 356 | 357 | 358 | def get_fcn(num_output_classes=11): 359 | return FCN( 360 | num_input_channels=4, num_output_classes=num_output_classes, num_filters=64 361 | ) 362 | 363 | 364 | def get_deeplabv3plus(n_classes): 365 | return smp.DeepLabV3Plus( 366 | encoder_name="resnet18", 367 | in_channels=4, 368 | classes=n_classes, 369 | encoder_weights=None, 370 | ) 371 | -------------------------------------------------------------------------------- /src/seed_data_creation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Script to create seed data from trained model for PEARL MVP model retraining session 3 | """ 4 | import logging 5 | import os 6 | import sys 7 | 8 | import joblib 9 | import numpy as np 10 | import sklearn.base 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | import segmentation_models_pytorch as smp 15 | from sklearn.linear_model import SGDClassifier 16 | from sklearn.metrics import f1_score 17 | from sklearn.model_selection import train_test_split 18 | import pandas as pd 19 | import rasterio 20 | import models 21 | 22 | import argparse 23 | 24 | sys.path.append("..") 25 | LOGGER = logging.getLogger("server") 26 | 27 | from typing import Optional, Union, List 28 | 29 | parser = argparse.ArgumentParser(description="Create seed data from trained model") 30 | parser.add_argument( 31 | "--input_csv", 32 | type=str, 33 | required=True, 34 | help='The path to a CSV file containing three columns -- "image_fn", "label_fn", and "group" -- that point to tiles of imagery and labels as well as which "group" each tile is in.', 35 | ) 36 | parser.add_argument( 37 | "--ckpt_file", type=str, required=True, help="A trained model file in pt format" 38 | ) 39 | parser.add_argument( 40 | "--n_classes", type=int, required=True, help="The number of calsses" 41 | ) 42 | parser.add_argument( 43 | "--out_npz", 44 | type=str, 45 | required=True, 46 | help="The path to a directory to output model seed data in npz", 47 | ) 48 | parser.add_argument( 49 | "--model", 50 | default="fcn", 51 | choices=("unet", "fcn", "unet2", "deeplabv3plus"), 52 | help="Model to use", 53 | ) 54 | 55 | args = parser.parse_args() 56 | 57 | if torch.cuda.is_available(): 58 | device = torch.device("cuda") 59 | else: 60 | print("WARNING! 
Torch is reporting that CUDA isn't available, using cpu") 61 | device = torch.device("cpu") 62 | 63 | 64 | def label_transforms_naip(labels): 65 | labels = np.array(labels).astype(np.int64) 66 | labels = np.where(labels == 14, 0, labels) # to no data 67 | labels = np.where(labels == 15, 0, labels) # to no data 68 | labels = np.where(labels == 13, 0, labels) # to no data 69 | labels = np.where(labels == 10, 3, labels) # to tree canopy 70 | labels = np.where(labels == 11, 3, labels) # to tree canopy 71 | labels = np.where(labels == 12, 3, labels) # to tree canopy 72 | return labels 73 | 74 | 75 | def label_transforms_uvm(labels): 76 | naip_7cls = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6} 77 | labels = np.array(labels).astype(np.int64) 78 | labels_new = np.copy(labels) 79 | for k, v in naip_7cls.items(): 80 | labels_new[labels == k] = v 81 | return labels_new 82 | 83 | 84 | def load_model(n_classes, chkpt_file, model_nm): 85 | if model_nm == "unet2": 86 | model = models.Unet2( 87 | feature_scale=1, 88 | n_classes=n_classes, 89 | in_channels=4, 90 | is_deconv=True, 91 | is_batchnorm=False, 92 | ) 93 | elif model_nm == "unet": 94 | model = models.Unet( 95 | feature_scale=1, 96 | n_classes=n_classes, 97 | in_channels=4, 98 | is_deconv=True, 99 | is_batchnorm=False, 100 | ) 101 | elif model_nm == "fcn": 102 | model = models.FCN( 103 | num_input_channels=4, 104 | num_output_classes=n_classes, 105 | num_filters=64, 106 | padding=1, 107 | ) 108 | elif model_nm == "deeplabv3plus": 109 | model = smp.DeepLabV3Plus( 110 | encoder_name="resnet18", 111 | encoder_weights=None, 112 | in_channels=4, 113 | classes=n_classes, 114 | ) 115 | checkpoint = torch.load(chkpt_file, map_location=device) 116 | model.load_state_dict(checkpoint) 117 | model = model.to(device) 118 | model.eval() 119 | 120 | return model 121 | 122 | 123 | def sample_data(df_path, n_samples): 124 | 125 | df = pd.read_csv(df_path) 126 | image_fns, label_fns = df[["image_fn", "label_fn"]].values.T 127 | idxs = np.random.choice(image_fns.shape[0], replace=False, size=n_samples) 128 | image_fns = image_fns[idxs] 129 | label_fns = label_fns[idxs] 130 | return image_fns, label_fns 131 | 132 | 133 | def deeplabv3plus_forward_features(model, x): 134 | features = model.encoder(x) 135 | decoder_output = model.decoder(*features) 136 | return F.interpolate( 137 | decoder_output, 138 | scale_factor=4, 139 | ) 140 | 141 | 142 | def get_seed_data_deeplabv3plus( 143 | model, device, img_fn, label_fn, n_patches, n_points, verbose=True 144 | ): 145 | 146 | with rasterio.open(img_fn) as f: 147 | data = f.read() 148 | data = data / 255.0 149 | 150 | with rasterio.open(label_fn) as f: 151 | labels = f.read().squeeze() 152 | 153 | height, width = labels.shape 154 | labels.shape 155 | labels = label_transforms_uvm(labels) 156 | 157 | ## Sample n_patches from the tile 158 | patch_size = 128 159 | x_imgs = np.zeros((n_patches, 4, patch_size, patch_size), dtype=np.float32) 160 | y_imgs = np.zeros((n_patches, patch_size, patch_size), dtype=np.uint8) 161 | for i in range(n_patches): 162 | 163 | x = np.random.randint(0, width - patch_size) 164 | y = np.random.randint(0, height - patch_size) 165 | 166 | x_img = data[:, y : y + patch_size, x : x + patch_size].copy() 167 | y_img = labels[y : y + patch_size, x : x + patch_size].copy() 168 | 169 | x_imgs[i] = x_img 170 | y_imgs[i] = y_img 171 | 172 | x_imgs = torch.from_numpy(x_imgs).to(device) 173 | print("x_imgs") 174 | print(x_imgs.shape) 175 | 176 | ## Run the model on the patches 177 | with torch.no_grad(): 178 | 
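# Descriptive note (added): for DeepLabV3+ the "seed embedding" is the decoder
# output taken just before the segmentation head; deeplabv3plus_forward_features
# upsamples it by 4x so it lines up pixel-for-pixel with the 128x128 patches.
# Each sampled pixel therefore yields a 256-dimensional feature vector, which is
# why x_seed below is allocated as (n_points, 256). These embeddings, paired
# with their labels, are what the PEARL retraining step consumes.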
x_img_features = deeplabv3plus_forward_features(model, x_imgs) 179 | x_img_features = x_img_features.cpu().numpy() 180 | print("x_img_feature_shape") 181 | print(x_img_features.shape) 182 | 183 | ## Evaluate the model on all the patches 184 | if verbose: 185 | print( 186 | "Base model acc on sampled patches", 187 | accuracy_score(y_imgs.ravel(), y_imgs_pred.ravel()), 188 | ) 189 | print( 190 | "Base model f1 on sampled patches", 191 | f1_score(y_imgs.ravel(), y_imgs_pred.ravel(), average="macro"), 192 | ) 193 | 194 | ## Subsample n_points from the patches 195 | x_seed = np.zeros((n_points, 256), dtype=np.float32) 196 | y_seed = np.zeros((n_points,), dtype=np.uint8) 197 | 198 | for j in range(n_points): 199 | i = np.random.randint(n_patches) 200 | x = np.random.randint(32, patch_size - 32) 201 | y = np.random.randint(32, patch_size - 32) 202 | 203 | x_seed[j] = x_img_features[i, :, y, x] 204 | y_seed[j] = y_imgs[i, y, x] 205 | 206 | ## Evaluate the model on the seed points 207 | if verbose: 208 | ## Use the last layer of the model to make predictions from the seed embeddings 209 | fcn_weights = model.last.weight.cpu().detach().numpy().squeeze() 210 | fcn_bias = model.last.bias.cpu().detach().numpy() 211 | y_seed_pred = (x_seed @ fcn_weights.T + fcn_bias).argmax(axis=1) 212 | print("Base model acc on subset of points", accuracy_score(y_seed, y_seed_pred)) 213 | print( 214 | "Base model f1 on subset of points", 215 | f1_score(y_seed, y_seed_pred, average="macro"), 216 | ) 217 | 218 | print("y_seed dataset for deeplabv3+ are:") 219 | print(y_seed.shape, np.unique(y_seed)) 220 | return x_seed, y_seed 221 | 222 | 223 | def get_seed_data_fcn( 224 | model, device, label_transform_function, img_fn, label_fn, n_patches, n_points 225 | ): 226 | ## Load data 227 | with rasterio.open(img_fn) as f: 228 | data = f.read() 229 | data = data / 255.0 230 | 231 | with rasterio.open(label_fn) as f: 232 | labels = f.read().squeeze() 233 | height, width = labels.shape 234 | labels.shape 235 | labels = label_transforms_function(labels) 236 | 237 | ## Sample n_patches from the tile 238 | patch_size = 256 239 | x_imgs = np.zeros((n_patches, 4, patch_size, patch_size), dtype=np.float32) 240 | y_imgs = np.zeros((n_patches, patch_size, patch_size), dtype=np.uint8) 241 | for i in range(n_patches): 242 | 243 | x = np.random.randint(0, width - patch_size) 244 | y = np.random.randint(0, height - patch_size) 245 | 246 | x_img = data[:, y : y + patch_size, x : x + patch_size].copy() 247 | y_img = labels[y : y + patch_size, x : x + patch_size].copy() 248 | 249 | x_imgs[i] = x_img 250 | y_imgs[i] = y_img 251 | 252 | x_imgs = torch.from_numpy(x_imgs).to(device) 253 | 254 | ## Run the model on the patches 255 | with torch.no_grad(): 256 | y_imgs_pred, x_img_features = model.forward_features(x_imgs) 257 | y_imgs_pred = y_imgs_pred.argmax(axis=1).cpu().numpy() 258 | x_img_features = x_img_features.cpu().numpy() 259 | 260 | ## Subsample n_points from the patches 261 | x_seed = np.zeros((n_points, 64), dtype=np.float32) 262 | y_seed = np.zeros((n_points,), dtype=np.uint8) 263 | 264 | for j in range(n_points): 265 | i = np.random.randint(n_patches) 266 | x = np.random.randint(32, patch_size - 32) 267 | y = np.random.randint(32, patch_size - 32) 268 | 269 | x_seed[j] = x_img_features[i, :, y, x] 270 | y_seed[j] = y_imgs[i, y, x] 271 | 272 | return x_seed, y_seed 273 | 274 | 275 | def get_seed_data_unet( 276 | model, device, img_fn, label_fn, n_patches, n_points, verbose=True 277 | ): 278 | 279 | with rasterio.open(img_fn) as f: 280 | 
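# Descriptive note (added): this function builds seed data in two sampling
# stages: (1) read the 4-band tile, scale it to [0, 1], and cut n_patches
# random 128x128 patches; (2) run forward_features over the patches and then
# pick n_points random pixels (kept 32 px away from patch borders), storing
# each pixel's 64-dimensional feature vector in x_seed and its label in y_seed.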
data = f.read() 281 | data = data / 255.0 282 | 283 | with rasterio.open(label_fn) as f: 284 | labels = f.read().squeeze() 285 | 286 | height, width = labels.shape 287 | labels.shape 288 | labels = label_transforms_uvm(labels) 289 | 290 | ## Sample n_patches from the tile 291 | patch_size = 128 292 | x_imgs = np.zeros((n_patches, 4, patch_size, patch_size), dtype=np.float32) 293 | y_imgs = np.zeros((n_patches, patch_size, patch_size), dtype=np.uint8) 294 | for i in range(n_patches): 295 | 296 | x = np.random.randint(0, width - patch_size) 297 | y = np.random.randint(0, height - patch_size) 298 | 299 | x_img = data[:, y : y + patch_size, x : x + patch_size].copy() 300 | y_img = labels[y : y + patch_size, x : x + patch_size].copy() 301 | 302 | x_imgs[i] = x_img 303 | y_imgs[i] = y_img 304 | 305 | x_imgs = torch.from_numpy(x_imgs).to(device) 306 | print("x_imgs") 307 | print(x_imgs.shape) 308 | 309 | ## Run the model on the patches 310 | with torch.no_grad(): 311 | x_img_features = model.forward_features(x_imgs) 312 | # y_imgs_pred = y_imgs_pred.argmax(axis=1).cpu().numpy() 313 | x_img_features = x_img_features.cpu().numpy() 314 | print("x_img_features shape") 315 | print(x_img_features.shape) 316 | 317 | ## Evaluate the model on all the patches 318 | if verbose: 319 | print( 320 | "Base model acc on sampled patches", 321 | accuracy_score(y_imgs.ravel(), y_imgs_pred.ravel()), 322 | ) 323 | print( 324 | "Base model f1 on sampled patches", 325 | f1_score(y_imgs.ravel(), y_imgs_pred.ravel(), average="macro"), 326 | ) 327 | 328 | ## Subsample n_points from the patches 329 | x_seed = np.zeros((n_points, 64), dtype=np.float32) 330 | y_seed = np.zeros((n_points,), dtype=np.uint8) 331 | 332 | for j in range(n_points): 333 | i = np.random.randint(n_patches) 334 | x = np.random.randint(32, patch_size - 32) 335 | y = np.random.randint(32, patch_size - 32) 336 | 337 | x_seed[j] = x_img_features[i, :, y, x] 338 | y_seed[j] = y_imgs[i, y, x] 339 | 340 | ## Evaluate the model on the seed points 341 | if verbose: 342 | ## Use the last layer of the model to make predictions from the seed embeddings 343 | fcn_weights = model.last.weight.cpu().detach().numpy().squeeze() 344 | fcn_bias = model.last.bias.cpu().detach().numpy() 345 | y_seed_pred = (x_seed @ fcn_weights.T + fcn_bias).argmax(axis=1) 346 | print("Base model acc on subset of points", accuracy_score(y_seed, y_seed_pred)) 347 | print( 348 | "Base model f1 on subset of points", 349 | f1_score(y_seed, y_seed_pred, average="macro"), 350 | ) 351 | 352 | return x_seed, y_seed 353 | 354 | 355 | def calculate_seed_data(): 356 | device = torch.device("cuda") 357 | x_test = [] 358 | y_test = [] 359 | 360 | df = pd.read_csv(args.input_csv) 361 | image_fns, label_fns = df[["image_fn", "label_fn"]].values.T 362 | for i in range(len(image_fns)): 363 | if i % 5 == 0: 364 | print(i, len(image_fns)) 365 | 366 | # ------------------- 367 | # Setup model 368 | # ------------------- 369 | if args.model == "unet2": 370 | model = load_model(args.n_classes, args.ckpt_file, args.model) 371 | x_test_sample, y_test_sample = get_seed_data_unet( 372 | model, 373 | device, 374 | image_fns[i], 375 | label_fns[i], 376 | n_patches=128, 377 | n_points=1000, 378 | verbose=False, 379 | ) 380 | x_test.append(x_test_sample) 381 | y_test.append(y_test_sample) 382 | elif args.model == "unet": 383 | model = model = load_model(args.n_classes, args.ckpt_file, args.model) 384 | x_test_sample, y_test_sample = get_seed_data_unet( 385 | model, 386 | device, 387 | image_fns[i], 388 | label_fns[i], 
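# (added note) 128 patches and 1000 pixels per tile are sampled for the UNet seed embeddings below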
389 | n_patches=128, 390 | n_points=1000, 391 | verbose=False, 392 | ) 393 | x_test.append(x_test_sample) 394 | y_test.append(y_test_sample) 395 | elif args.model == "fcn": 396 | model = model = load_model(args.n_classes, args.ckpt_file, args.model) 397 | x_test_sample, y_test_sample = get_seed_data_fcn( 398 | model, 399 | device, 400 | image_fns[i], 401 | label_fns[i], 402 | n_patches=64, 403 | n_points=100, 404 | verbose=False, 405 | ) 406 | x_test.append(x_test_sample) 407 | y_test.append(y_test_sample) 408 | elif args.model == "deeplabv3plus": 409 | model = model = load_model(args.n_classes, args.ckpt_file, args.model) 410 | x_test_sample, y_test_sample = get_seed_data_deeplabv3plus( 411 | model, 412 | device, 413 | image_fns[i], 414 | label_fns[i], 415 | n_patches=64, 416 | n_points=100, 417 | verbose=False, 418 | ) 419 | x_test.append(x_test_sample) 420 | y_test.append(y_test_sample) 421 | else: 422 | raise ValueError("Invalid model") 423 | 424 | x_test = np.concatenate(x_test, axis=0) 425 | y_test = np.concatenate(y_test, axis=0) 426 | 427 | print(np.unique(y_test)) 428 | 429 | np.savez(args.out_npz, embeddings=x_test, labels=y_test) 430 | 431 | 432 | if __name__ == "__main__": 433 | calculate_seed_data() 434 | -------------------------------------------------------------------------------- /src/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import datetime 3 | import argparse 4 | import copy 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from dataloaders.StreamingDatasets import StreamingGeospatialDataset 10 | 11 | import torch 12 | import torch.optim as optim 13 | import segmentation_models_pytorch as smp 14 | 15 | import models 16 | import utils 17 | from transforms_utils import ( 18 | label_transforms_naip, 19 | label_transforms_epa, 20 | label_transform_cic, 21 | label_transform_naip5cls, 22 | label_transform_4cls, 23 | labels_transform_uvm, 24 | labels_transform_uvm_8cls, 25 | image_transforms, 26 | ) 27 | 28 | from azureml.core import Run 29 | 30 | torch.backends.cudnn.deterministic = False 31 | torch.backends.cudnn.benchmark = True 32 | 33 | # Some tricks to make rasterio faster when using vsicurl 34 | # -- see https://github.com/pangeo-data/cog-best-practices 35 | RASTERIO_BEST_PRACTICES = dict( 36 | CURL_CA_BUNDLE="/etc/ssl/certs/ca-certificates.crt", 37 | GDAL_DISABLE_READDIR_ON_OPEN="EMPTY_DIR", 38 | AWS_NO_SIGN_REQUEST="YES", 39 | GDAL_MAX_RAW_BLOCK_CACHE_SIZE="200000000", 40 | GDAL_SWATH_SIZE="200000000", 41 | VSI_CURL_CACHE_SIZE="200000000", 42 | ) 43 | os.environ.update(RASTERIO_BEST_PRACTICES) 44 | 45 | 46 | run = Run.get_context() 47 | 48 | NUM_WORKERS = 8 49 | CHIP_SIZE = 256 50 | 51 | parser = argparse.ArgumentParser(description="DFC2021 baseline training script") 52 | parser.add_argument( 53 | "--input_fn", 54 | type=str, 55 | required=True, 56 | help='The path to a CSV file containing three columns -- "image_fn", "label_fn", and "group" -- that point to tiles of imagery and labels as well as which "group" each tile is in.', 57 | ) 58 | parser.add_argument( 59 | "--input_fn_val", 60 | type=str, 61 | required=True, 62 | help='The path to a CSV file containing three columns -- "image_fn", "label_fn", and "group" -- that point to tiles of imagery and labels as well as which "group" each tile is in.', 63 | ) 64 | parser.add_argument( 65 | "--output_dir", 66 | type=str, 67 | required=True, 68 | help="The path to a directory to store model checkpoints.", 69 | ) 70 | parser.add_argument( 71 | 
"--overwrite", 72 | action="store_true", 73 | help="Flag for overwriting `output_dir` if that directory already exists.", 74 | ) 75 | parser.add_argument( 76 | "--save_most_recent", 77 | action="store_true", 78 | help="Flag for saving the most recent version of the model during training.", 79 | ) 80 | parser.add_argument( 81 | "--model", 82 | default="fcn", 83 | choices=("unet", "fcn", "unet2", "deeplabv3plus"), 84 | help="Model to use", 85 | ) 86 | 87 | # Training arguments 88 | parser.add_argument("--gpu", type=int, default=0, help="The ID of the GPU to use") 89 | parser.add_argument( 90 | "--batch_size", type=int, default=32, help="Batch size to use for training" 91 | ) 92 | parser.add_argument( 93 | "--num_epochs", type=int, default=50, help="Number of epochs to train for" 94 | ) 95 | parser.add_argument( 96 | "--seed", type=int, default=0, help="Random seed to pass to numpy and torch" 97 | ) 98 | parser.add_argument( 99 | "--num_classes", type=int, default=10, help="number of classes in dataset" 100 | ) 101 | parser.add_argument( 102 | "--num_chips", 103 | type=int, 104 | default=100, 105 | help="number of chips to randomly sample from data", 106 | ) 107 | parser.add_argument( 108 | "--label_transform", 109 | default="uvm", 110 | help="str either naip, epa or cic, naip_5cls, uvm to indicate how to transform labels", 111 | ) 112 | args = parser.parse_args() 113 | 114 | 115 | def nodata_check(img, labels): 116 | return np.any(labels == 0) 117 | 118 | 119 | def main(): 120 | print( 121 | "Starting DFC2021 baseline training script at %s" 122 | % (str(datetime.datetime.now())) 123 | ) 124 | 125 | # ------------------- 126 | # Setup 127 | # ------------------- 128 | assert os.path.exists(args.input_fn) 129 | 130 | if os.path.isfile(args.output_dir): 131 | print("A file was passed as `--output_dir`, please pass a directory!") 132 | return 133 | 134 | if os.path.exists(args.output_dir) and len(os.listdir(args.output_dir)): 135 | if args.overwrite: 136 | print( 137 | "WARNING! The output directory, %s, already exists, we might overwrite data in it!" 138 | % (args.output_dir) 139 | ) 140 | else: 141 | print( 142 | "The output directory, %s, already exists and isn't empty. We don't want to overwrite and existing results, exiting..." 143 | % (args.output_dir) 144 | ) 145 | return 146 | else: 147 | print("The output directory doesn't exist or is empty.") 148 | os.makedirs(args.output_dir, exist_ok=True) 149 | 150 | if torch.cuda.is_available(): 151 | device = torch.device("cuda:%d" % args.gpu) 152 | else: 153 | print("WARNING! 
Torch is reporting that CUDA isn't available, using cpu") 154 | device = "cpu" 155 | 156 | np.random.seed(args.seed) 157 | torch.manual_seed(args.seed) 158 | 159 | # ------------------- 160 | # Load input data 161 | # ------------------- 162 | input_dataframe = pd.read_csv(args.input_fn) 163 | image_fns = input_dataframe["image_fn"].values 164 | label_fns = input_dataframe["label_fn"].values 165 | groups = input_dataframe["group"].values 166 | 167 | input_dataframe_val = pd.read_csv(args.input_fn_val) 168 | image_fns_val = input_dataframe_val["image_fn"].values 169 | label_fns_val = input_dataframe_val["label_fn"].values 170 | 171 | if args.label_transform == "naip": 172 | label_transform = label_transforms_naip 173 | elif args.label_transform == "epa": 174 | label_transform = label_transforms_epa 175 | elif args.label_transform == "cic": 176 | label_transform = label_transform_cic 177 | elif args.label_transform == "naip_5cls": 178 | label_transform = label_transform_naip5cls 179 | elif args.label_transform == "naip_4cls": 180 | label_transform = label_transform_4cls 181 | elif args.label_transform == "uvm": 182 | label_transform = labels_transform_uvm 183 | elif args.label_transform == "uvm8cls": 184 | label_transform = labels_transform_uvm_8cls 185 | else: 186 | raise ValueError("Invalid label transform") 187 | 188 | dataset = StreamingGeospatialDataset( 189 | imagery_fns=image_fns, 190 | label_fns=label_fns, 191 | groups=groups, 192 | chip_size=CHIP_SIZE, 193 | num_chips_per_tile=args.num_chips, 194 | windowed_sampling=False, 195 | verbose=True, 196 | image_transform=image_transforms, 197 | label_transform=label_transform, 198 | nodata_check=nodata_check, 199 | ) 200 | 201 | dataloader = torch.utils.data.DataLoader( 202 | dataset, 203 | batch_size=args.batch_size, 204 | num_workers=NUM_WORKERS, 205 | pin_memory=True, 206 | ) 207 | 208 | dataset_val = StreamingGeospatialDataset( 209 | imagery_fns=image_fns_val, 210 | label_fns=label_fns_val, 211 | groups=groups, 212 | chip_size=CHIP_SIZE, 213 | num_chips_per_tile=args.num_chips, 214 | windowed_sampling=False, 215 | verbose=True, 216 | image_transform=image_transforms, 217 | label_transform=label_transform, 218 | nodata_check=nodata_check, 219 | ) 220 | 221 | dataloader_val = torch.utils.data.DataLoader( 222 | dataset_val, 223 | batch_size=args.batch_size, 224 | num_workers=NUM_WORKERS, 225 | pin_memory=True, 226 | ) 227 | 228 | num_training_batches_per_epoch = int( 229 | len(image_fns) * args.num_chips / args.batch_size 230 | ) 231 | print( 232 | "We will be training with %d batches per epoch" 233 | % (num_training_batches_per_epoch) 234 | ) 235 | 236 | num_val_batches_per_epoch = int( 237 | len(image_fns_val) * args.num_chips / args.batch_size 238 | ) 239 | print( 240 | "We will be validating with %d batches per epoch" % (num_val_batches_per_epoch) 241 | ) 242 | 243 | # ------------------- 244 | # Setup training 245 | # ------------------- 246 | if args.model == "unet": 247 | model = models.get_unet(classes=args.num_classes) 248 | elif args.model == "unet2": 249 | model = models.get_unet2(n_classes=args.num_classes) 250 | elif args.model == "fcn": 251 | model = models.get_fcn(num_output_classes=args.num_classes) 252 | elif args.model == "deeplabv3plus": 253 | model = models.get_deeplabv3plus(n_classes=args.num_classes) 254 | else: 255 | raise ValueError("Invalid model") 256 | 257 | model = model.to(device) 258 | optimizer = optim.AdamW(model.parameters(), lr=0.001, amsgrad=True) 259 | # criterion = nn.CrossEntropyLoss() 260 | criterion 
= smp.losses.FocalLoss(mode="multiclass") 261 | scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, "min") 262 | 263 | print("Model has %d parameters" % (utils.count_parameters(model))) 264 | 265 | # ------------------- 266 | # Model training 267 | # ------------------- 268 | training_task_losses = [] 269 | num_times_lr_dropped = 0 270 | model_checkpoints = [] 271 | val_task_losses = [] 272 | temp_model_fn = os.path.join(args.output_dir, "most_recent_model.pt") 273 | 274 | for epoch in range(args.num_epochs): 275 | print("on epoch number: ", epoch) 276 | lr = utils.get_lr(optimizer) 277 | 278 | training_losses = utils.fit( 279 | model, 280 | device, 281 | dataloader, 282 | num_training_batches_per_epoch, 283 | optimizer, 284 | criterion, 285 | epoch, 286 | ) 287 | scheduler.step(training_losses[0]) 288 | 289 | model_checkpoints.append(copy.deepcopy(model.state_dict())) 290 | if args.save_most_recent: 291 | torch.save(model.state_dict(), temp_model_fn) 292 | 293 | if utils.get_lr(optimizer) < lr: 294 | num_times_lr_dropped += 1 295 | print("") 296 | print("Learning rate dropped") 297 | print("") 298 | training_task_losses.append(training_losses[0]) 299 | run.log("loss", training_losses[0]) 300 | if num_times_lr_dropped == 4: 301 | break 302 | 303 | # Run Validation 304 | validation_losses = utils.evaluate( 305 | model, 306 | device, 307 | dataloader_val, 308 | num_val_batches_per_epoch, 309 | criterion, 310 | epoch, 311 | ) 312 | val_task_losses.append(validation_losses[0]) 313 | run.log("loss", validation_losses[0]) 314 | 315 | num_classes = args.num_classes # to-do fix 316 | per_class_f1, global_f1 = utils.score_batch( 317 | model, device, dataloader_val, num_val_batches_per_epoch, num_classes 318 | ) 319 | run.log("per_class_f1_val", per_class_f1) 320 | run.log("global_f1_val", global_f1) 321 | 322 | # ------------------- 323 | # Save everything 324 | # ------------------- 325 | save_obj = { 326 | "args": args, 327 | "training_task_losses": training_task_losses, 328 | "checkpoints": model_checkpoints, 329 | } 330 | 331 | save_obj_fn = "results.pt" 332 | out_path = os.path.join(args.output_dir, save_obj_fn) 333 | run.log("out_path", out_path) 334 | with open(os.path.join(args.output_dir, save_obj_fn), "wb") as f: 335 | torch.save(save_obj, f) 336 | 337 | 338 | if __name__ == "__main__": 339 | main() 340 | -------------------------------------------------------------------------------- /src/transforms_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import utils 4 | 5 | 6 | def label_transforms_naip(labels, group=None): 7 | labels = np.array(labels).astype(np.int64) 8 | labels = np.where(labels == 14, 0, labels) # to no data 9 | labels = np.where(labels == 15, 0, labels) # to no data 10 | labels = np.where(labels == 13, 0, labels) # to no data 11 | labels = np.where(labels == 10, 3, labels) # to tree canopy 12 | labels = np.where(labels == 11, 3, labels) # to tree canopy 13 | labels = np.where(labels == 12, 3, labels) # to tree canopy 14 | labels = torch.from_numpy(labels) 15 | return labels 16 | 17 | 18 | def label_transforms_epa(labels, group=None): 19 | labels = np.array(labels).astype(np.int64) 20 | labels_new = np.copy(labels) 21 | for k, v in utils.epa_label_dict.items(): 22 | labels_new[labels == k] = v 23 | labels_new = torch.from_numpy(labels_new) 24 | return labels_new 25 | 26 | 27 | def label_transform_cic(labels, group=None): 28 | labels = np.array(labels).astype(np.int64) 29 | 
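# Descriptive note (added): every transform in this module follows the same
# dictionary-remap pattern: copy the raw label raster, then map each source
# class id to a contiguous 0..N-1 training index using a lookup dict from
# utils (here utils.cic_label_dict). Working on a copy prevents a value that
# was already remapped from being remapped again by a later key.
# Illustrative example (not part of the pipeline): with cic_label_dict
# {1: 0, ..., 8: 7}, an input np.array([1, 3, 8]) becomes array([0, 2, 7]).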
labels_new = np.copy(labels) 30 | for k, v in utils.cic_label_dict.items(): 31 | labels_new[labels == k] = v 32 | labels_new = torch.from_numpy(labels_new) 33 | return labels_new 34 | 35 | 36 | def label_transform_naip5cls(labels, group=None): 37 | labels = np.array(labels).astype(np.int64) 38 | labels_new = np.copy(labels) 39 | for k, v in utils.naip_5cls.items(): 40 | labels_new[labels == k] = v 41 | labels_new = torch.from_numpy(labels_new) 42 | return labels_new 43 | 44 | 45 | def label_transform_4cls(labels, group=None): 46 | labels = np.array(labels).astype(np.int64) 47 | labels_new = np.copy(labels) 48 | for k, v in utils.naip_4cls.items(): 49 | labels_new[labels == k] = v 50 | labels_new = torch.from_numpy(labels_new) 51 | return labels_new 52 | 53 | 54 | def labels_transform_uvm(labels, group=None): 55 | labels = np.array(labels).astype(np.int64) 56 | labels_new = np.copy(labels) 57 | for k, v in utils.uvm_7cls.items(): 58 | labels_new[labels == k] = v 59 | labels_new = torch.from_numpy(labels_new) 60 | return labels_new 61 | 62 | 63 | def labels_transform_uvm_8cls(labels, group=None): 64 | labels = np.array(labels).astype(np.int64) 65 | labels_new = np.copy(labels) 66 | for k, v in utils.uvm_8cls.items(): 67 | labels_new[labels == k] = v 68 | labels_new = torch.from_numpy(labels_new) 69 | return labels_new 70 | 71 | 72 | def image_transforms(img, group=None): 73 | img = img / 255.0 74 | img = np.rollaxis(img, 2, 0).astype(np.float32) 75 | img = torch.from_numpy(img) 76 | return img 77 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import time 3 | import numpy as np 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | 9 | from tqdm import tqdm 10 | 11 | from sklearn.metrics import f1_score 12 | 13 | NAIP_2013_MEANS = np.array([117.00, 130.75, 122.50, 159.30]) 14 | NAIP_2013_STDS = np.array([38.16, 36.68, 24.30, 66.22]) 15 | NAIP_2017_MEANS = np.array([72.84, 86.83, 76.78, 130.82]) 16 | NAIP_2017_STDS = np.array([41.78, 34.66, 28.76, 58.95]) 17 | NAIP_NY_2017_MEANS = np.array([95.31, 129.95, 127.77, 184.45]) 18 | NAIP_NY_2017_STDS = np.array([40.95, 34.71, 21.07, 51.11]) 19 | NAIP_PA_2017_MEANS = np.array([114.38, 140.45, 110.13, 177.38]) 20 | NAIP_PA_2017_STDS = np.array([37.401, 34.29, 23.77, 45.98]) 21 | NAIP_DE_2013_MEANS = np.array([116.74, 132.48, 127.61, 175.80]) 22 | NAIP_DE_2013_STDS = np.array([40.22, 34.22, 23.86, 60.58]) 23 | NAIP_VA_2018_MEANS = np.array([92.70, 104.10, 75.43, 118.62]) 24 | NAIP_VA_2018_STDS = np.array([42.56, 42.91, 31.27, 59.34]) 25 | NAIP_WV_2018_MEANS = np.array([109.36, 123.73, 105.63, 117.47]) 26 | NAIP_WV_2018_STDS = np.array([44.79, 36.71, 31.72, 38.93]) 27 | NAIP_MD_2018_MEANS = np.array([103.84, 108.31, 88.07, 113.36]) 28 | NAIP_MD_2018_STDS = np.array([46.19, 44.21, 37.07, 60.50]) 29 | NAIP_MD_2017_MEANS = np.array([73.31, 86.94, 77.38, 126.26]) 30 | NAIP_MD_2017_STDS = np.array([42.22, 35.90, 30.23, 60.49]) 31 | NAIP_MD_2015_MEANS = np.array([116.05, 126.48, 117.93, 158.21]) 32 | NAIP_MD_2015_STDS = np.array([38.08, 32.87, 27.07, 83.22]) 33 | NAIP_MD_2011_MEANS = np.array([108.35, 126.13, 121.83, 176.64]) 34 | NAIP_MD_2011_STDS = np.array([39.94, 30.60, 25.39, 45.69]) 35 | NAIP_MD_Merged_MEANS = np.array([100.3875, 111.965, 101.3025, 143.6175]) 36 | NAIP_MD_Merged_STDS = np.array([58.71, 59.7775, 54.05, 95.2125]) 37 | NAIP_VA_Merged_MEANS = 
np.array([102.6326, 118.5233, 104.2282, 145.7618]) 38 | NAIP_VA_Merged_STDS = np.array([39.6812, 37.2886, 32.8185, 50.1053]) 39 | NAIP_NY_Merged_MEANS = np.array([97.2829, 122.9519, 106.3612, 169.0045]) 40 | NAIP_NY_Merged_STDS = np.array([39.7267, 36.6849, 25.8357, 52.4304]) 41 | fresno_ca_means = np.array([132.70, 127.63, 109.55, 147.25]) 42 | fresno_ca_stds = np.array([45.21, 38.478, 34.65, 35.70]) 43 | la_ca_means = np.array([115.06, 114.08, 105.04, 123.96]) 44 | la_ca_stds = np.array([56.31, 49.89, 44.26, 48.78]) 45 | sanoma_ca_means = np.array([93.69, 101.96, 90.17, 126.93]) 46 | sanoma_ca_stds = np.array([49.12, 39.83, 33.27, 54.14]) 47 | 48 | 49 | NLCD_CLASSES = [ 50 | 0, 51 | 11, 52 | 12, 53 | 21, 54 | 22, 55 | 23, 56 | 24, 57 | 31, 58 | 41, 59 | 42, 60 | 43, 61 | 52, 62 | 71, 63 | 81, 64 | 82, 65 | 90, 66 | 95, 67 | ] # 16 classes + 1 nodata class ("0"). Note that "12" is "Perennial Ice/Snow" and is not present in Maryland. 68 | 69 | NLCD_CLASS_COLORMAP = { # Copied from the emebedded color table in the NLCD data files 70 | 0: (0, 0, 0, 255), 71 | 11: (70, 107, 159, 255), 72 | 12: (209, 222, 248, 255), 73 | 21: (222, 197, 197, 255), 74 | 22: (217, 146, 130, 255), 75 | 23: (235, 0, 0, 255), 76 | 24: (171, 0, 0, 255), 77 | 31: (179, 172, 159, 255), 78 | 41: (104, 171, 95, 255), 79 | 42: (28, 95, 44, 255), 80 | 43: (181, 197, 143, 255), 81 | 52: (204, 184, 121, 255), 82 | 71: (223, 223, 194, 255), 83 | 81: (220, 217, 57, 255), 84 | 82: (171, 108, 40, 255), 85 | 90: (184, 217, 235, 255), 86 | 95: (108, 159, 184, 255), 87 | } 88 | 89 | NLCD_IDX_COLORMAP = {idx: NLCD_CLASS_COLORMAP[c] for idx, c in enumerate(NLCD_CLASSES)} 90 | LC_CLASSES = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13] 91 | LC_CLASS_COLORMAP = { 92 | 0: (0, 0, 0, 0), 93 | 1: (0, 197, 255, 255), 94 | 2: (0, 168, 132, 255), 95 | 3: (38, 115, 0, 255), 96 | 4: (76, 230, 0, 255), 97 | 5: (163, 255, 115, 255), 98 | 6: (255, 170, 0, 255), 99 | 7: (255, 0, 0, 255), 100 | 8: (156, 156, 156, 255), 101 | 9: (0, 0, 0, 255), 102 | 10: (115, 115, 0, 255), 103 | 11: (230, 230, 0, 255), 104 | 12: (255, 255, 115, 255), 105 | 13: (197, 0, 255, 255), 106 | } 107 | 108 | LC_CLASSES_TREE = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] 109 | LC_CLASS_TREE_COLORMAP = { 110 | 0: (252, 232, 3), 111 | 1: (0, 197, 255, 255), 112 | 2: (0, 168, 132, 255), 113 | 3: (38, 115, 0, 255), 114 | 4: (76, 230, 0, 255), 115 | 5: (163, 255, 115, 255), 116 | 6: (255, 170, 0, 255), 117 | 7: (255, 0, 0, 255), 118 | 8: (156, 156, 156, 255), 119 | 9: (0, 0, 0, 255), 120 | } 121 | 122 | 123 | LC_COLORMAP = {idx: LC_CLASS_COLORMAP[c] for idx, c in enumerate(LC_CLASSES)} 124 | 125 | 126 | LC_TREE_COLORMAP = { 127 | idx: LC_CLASS_TREE_COLORMAP[c] for idx, c in enumerate(LC_CLASSES_TREE) 128 | } 129 | 130 | EPA_CLASSES = [0, 10, 20, 30, 40, 52, 70, 80, 82, 91, 92] 131 | 132 | epa_label_dict = { 133 | 0: 0, 134 | 10: 1, 135 | 20: 2, 136 | 30: 3, 137 | 40: 4, 138 | 52: 5, 139 | 70: 6, 140 | 80: 7, 141 | 82: 8, 142 | 91: 9, 143 | 92: 10, 144 | } 145 | 146 | CIC_CLASSES = [1, 2, 3, 4, 5, 6, 7, 8] 147 | 148 | cic_label_dict = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7} 149 | 150 | naip_5cls = {1: 0, 2: 0, 3: 1, 10: 1, 11: 1, 12: 1, 4: 2, 5: 2, 6: 3, 7: 4, 8: 4, 9: 4} 151 | 152 | naip_4cls = {1: 0, 2: 0, 3: 1, 10: 1, 11: 1, 12: 1, 4: 2, 5: 2, 6: 2, 7: 3, 8: 3, 9: 3} 153 | 154 | uvm_7cls = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6} 155 | 156 | uvm_8cls = {1: 0, 2: 1, 3: 2, 4: 3, 5: 4, 6: 5, 7: 6, 8: 7} 157 | 158 | 159 | def get_nlcd_class_to_idx_map(): 160 | nlcd_label_to_idx_map = 
[] 161 | idx = 0 162 | for i in range(NLCD_CLASSES[-1] + 1): 163 | if i in NLCD_CLASSES: 164 | nlcd_label_to_idx_map.append(idx) 165 | idx += 1 166 | else: 167 | nlcd_label_to_idx_map.append(0) 168 | nlcd_label_to_idx_map = np.array(nlcd_label_to_idx_map).astype(np.int64) 169 | return nlcd_label_to_idx_map 170 | 171 | 172 | NLCD_CLASS_TO_IDX_MAP = ( 173 | get_nlcd_class_to_idx_map() 174 | ) # I do this computation on import for illustration (this could instead be a length 96 vector that is hardcoded here) 175 | 176 | 177 | NLCD_IDX_TO_REDUCED_LC_MAP = np.array( 178 | [ 179 | 4, # 0 No data 0 180 | 0, # 1 Open Water 181 | 4, # 2 Ice/Snow 182 | 2, # 3 Developed Open Space 183 | 3, # 4 Developed Low Intensity 184 | 3, # 5 Developed Medium Intensity 185 | 3, # 6 Developed High Intensity 186 | 3, # 7 Barren Land 187 | 1, # 8 Deciduous Forest 188 | 1, # 9 Evergreen Forest 189 | 1, # 10 Mixed Forest 190 | 1, # 11 Shrub/Scrub 191 | 2, # 12 Grassland/Herbaceous 192 | 2, # 13 Pasture/Hay 193 | 2, # 14 Cultivated Crops 194 | 1, # 15 Woody Wetlands 195 | 1, # 16 Emergent Herbaceious Wetlands 196 | ] 197 | ) 198 | 199 | NLCD_IDX_TO_REDUCED_LC_ACCUMULATOR = np.array( 200 | [ 201 | [0, 0, 0, 0, 1], # 0 No data 0 202 | [1, 0, 0, 0, 0], # 1 Open Water 203 | [0, 0, 0, 0, 1], # 2 Ice/Snow 204 | [0, 0, 0, 0, 0], # 3 Developed Open Space 205 | [0, 0, 0, 0, 0], # 4 Developed Low Intensity 206 | [0, 0, 0, 1, 0], # 5 Developed Medium Intensity 207 | [0, 0, 0, 1, 0], # 6 Developed High Intensity 208 | [0, 0, 0, 0, 0], # 7 Barren Land 209 | [0, 1, 0, 0, 0], # 8 Deciduous Forest 210 | [0, 1, 0, 0, 0], # 9 Evergreen Forest 211 | [0, 1, 0, 0, 0], # 10 Mixed Forest 212 | [0, 1, 0, 0, 0], # 11 Shrub/Scrub 213 | [0, 0, 1, 0, 0], # 12 Grassland/Herbaceous 214 | [0, 0, 1, 0, 0], # 13 Pasture/Hay 215 | [0, 0, 1, 0, 0], # 14 Cultivated Crops 216 | [0, 1, 0, 0, 0], # 15 Woody Wetlands 217 | [0, 1, 0, 0, 0], # 16 Emergent Herbaceious Wetlands 218 | ] 219 | ) 220 | 221 | 222 | class Timer: 223 | """A wrapper class for printing out what is running and how long it took. 
224 | Use as: 225 | ``` 226 | with utils.Timer("running stuff"): 227 | # do stuff 228 | ``` 229 | This will output: 230 | ``` 231 | Starting 'running stuff' 232 | # any output from 'running stuff' 233 | Finished 'running stuff' in 12.45 seconds 234 | ``` 235 | """ 236 | 237 | def __init__(self, message): 238 | self.message = message 239 | 240 | def __enter__(self): 241 | self.tic = float(time.time()) 242 | print("Starting '%s'" % (self.message)) 243 | 244 | def __exit__(self, type, value, traceback): 245 | print("Finished '%s' in %0.4f seconds" % (self.message, time.time() - self.tic)) 246 | 247 | 248 | def fit(model, device, data_loader, num_batches, optimizer, criterion, epoch, memo=""): 249 | model.train() 250 | losses = [] 251 | tic = time.time() 252 | for batch_idx, (data, targets) in tqdm( 253 | enumerate(data_loader), total=num_batches, file=sys.stdout 254 | ): 255 | data = data.to(device) 256 | targets = targets.to(device) 257 | optimizer.zero_grad() 258 | # error Expected more than 1 value per channel when training 259 | # check https://discuss.pytorch.org/t/error-expected-more-than-1-value-per-channel-when-training/26274 260 | model.eval() 261 | outputs = model(data) 262 | loss = criterion(outputs, targets) 263 | losses.append(loss.item()) 264 | loss.backward() 265 | optimizer.step() 266 | 267 | avg_loss = np.mean(losses) 268 | print( 269 | "[{}] Training Epoch: {}\t Time elapsed: {:.2f} seconds\t Loss: {:.2f}".format( 270 | memo, epoch, time.time() - tic, avg_loss 271 | ), 272 | end="", 273 | ) 274 | print("") 275 | return [avg_loss] 276 | 277 | 278 | def evaluate(model, device, data_loader, num_batches, criterion, epoch, memo=""): 279 | model.eval() 280 | losses = [] 281 | tic = time.time() 282 | for batch_idx, (data, targets) in tqdm( 283 | enumerate(data_loader), total=num_batches, file=sys.stdout 284 | ): 285 | data = data.to(device) 286 | targets = targets.to(device) 287 | with torch.no_grad(): 288 | outputs = model(data) 289 | loss = criterion(outputs, targets) 290 | losses.append(loss.item()) 291 | avg_loss = np.mean(losses) 292 | 293 | print( 294 | "[{}] Validation Epoch: {}\t Time elapsed: {:.2f} seconds\t Loss: {:.2f}".format( 295 | memo, epoch, time.time() - tic, avg_loss 296 | ), 297 | end="", 298 | ) 299 | print("") 300 | return [avg_loss] 301 | 302 | 303 | def score(model, device, data_loader, num_batches): 304 | model.eval() 305 | 306 | num_classes = model.module.segmentation_head[0].out_channels 307 | num_samples = len(data_loader.dataset) 308 | predictions = np.zeros((num_samples, num_classes), dtype=np.float32) 309 | idx = 0 310 | for batch_idx, (data, target) in enumerate(data_loader): 311 | data = data.to(device) 312 | with torch.no_grad(): 313 | output = F.softmax(model(data)) 314 | batch_size = data.shape[0] 315 | predictions[idx : idx + batch_size] = output.cpu().numpy() 316 | idx += batch_size 317 | return predictions 318 | 319 | 320 | def score2(model, device, data_loader, num_batches, num_classes): 321 | model.eval() 322 | 323 | predictions = [] 324 | ground_truth = [] 325 | idx = 0 326 | for batch_idx, (data, target) in enumerate(data_loader): 327 | data = data.to(device) 328 | target = target.to(device) 329 | with torch.no_grad(): 330 | output = F.softmax(model(data)) 331 | output = output.cpu().numpy() # (32, 10, 256, 256) 332 | target = target.cpu().numpy() 333 | 334 | for i, x in enumerate(output): 335 | predictions.append(x.argmax(axis=0).astype(np.uint8)) 336 | ground_truth.append(target[i]) 337 | 338 | # to this per batch instead of all at once 
to fix memory errors 339 | preds_f = np.reshape(np.array(predictions), [-1]) 340 | gt_f = np.reshape(np.array(ground_truth), [-1]) 341 | per_class_f1 = f1_score(gt_f, preds_f, average=None) 342 | global_f1 = f1_score(gt_f, preds_f, average="weighted") 343 | 344 | return per_class_f1, global_f1 345 | 346 | 347 | def score_batch(model, device, data_loader, num_batches, num_classes): 348 | model.eval() 349 | 350 | batch_per_class_f1 = [] 351 | batch_global_f1 = [] 352 | idx = 0 353 | for batch_idx, (data, target) in enumerate(data_loader): 354 | predictions = [] 355 | ground_truth = [] 356 | data = data.to(device) 357 | target = target.to(device) 358 | with torch.no_grad(): 359 | output = F.softmax(model(data)) 360 | output = output.cpu().numpy() 361 | target = target.cpu().numpy() 362 | 363 | print("output") 364 | print(output) 365 | for i, x in enumerate(output): 366 | predictions.append(x.argmax(axis=0).astype(np.uint8)) 367 | ground_truth.append(target[i]) 368 | 369 | preds_f = np.reshape(np.array(predictions), [-1]) 370 | print("ground truth") 371 | print(ground_truth) 372 | gt_f = np.reshape(np.array(ground_truth), [-1]) 373 | 374 | missing_labels = np.setdiff1d(list(np.arange(num_classes)), np.unique(gt_f)) 375 | 376 | per_class_f1 = f1_score(gt_f, preds_f, average=None) 377 | 378 | per_class_f1_final = np.zeros(num_classes) 379 | # add nan for missing label classes 380 | for x in missing_labels: 381 | per_class_f1_final[x] = np.nan 382 | print("gt_f") 383 | print(gt_f) 384 | print("np unique") 385 | print(np.unique(gt_f)) 386 | for i, gt_class in enumerate(np.unique(gt_f)): 387 | per_class_f1_final[gt_class] = per_class_f1[i] 388 | 389 | global_f1 = f1_score(gt_f, preds_f, average="weighted") 390 | 391 | batch_per_class_f1.append(per_class_f1_final) 392 | batch_global_f1.append(global_f1) 393 | 394 | batch_per_class_f1_mean = np.nanmean(batch_per_class_f1, axis=0) 395 | batch_global_f1_mean = np.mean(batch_global_f1) 396 | return batch_per_class_f1_mean, batch_global_f1_mean 397 | 398 | 399 | def get_lr(optimizer): 400 | for param_group in optimizer.param_groups: 401 | return param_group["lr"] 402 | 403 | 404 | def count_parameters(model): 405 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 406 | -------------------------------------------------------------------------------- /train_azure/create_compute-cpu.py: -------------------------------------------------------------------------------- 1 | import os 2 | from azureml.core import Workspace 3 | from azureml.core.compute import ComputeTarget, AmlCompute 4 | from azureml.core.compute_target import ComputeTargetException 5 | from azureml.core.authentication import InteractiveLoginAuthentication 6 | 7 | 8 | # try: 9 | AZ_TENANT_ID = os.getenv("AZ_TENANT_ID") 10 | interactive_auth = InteractiveLoginAuthentication(tenant_id=AZ_TENANT_ID) 11 | 12 | ws = Workspace.from_config() # This automatically looks for a directory .azureml 13 | 14 | # Choose a name for your CPU cluster 15 | # memory optimized: https://docs.microsoft.com/en-us/azure/virtual-machines/dv2-dsv2-series-memory 16 | AZ_CPU_CLUSTER_NAME = os.getenv("AZ_CPU_CLUSTER_NAME") 17 | 18 | # Verify that the cluster does not exist already 19 | try: 20 | cpu_cluster = ComputeTarget(workspace=ws, name=AZ_CPU_CLUSTER_NAME) 21 | print("Found existing cluster, use it.") 22 | except ComputeTargetException: 23 | compute_config = AmlCompute.provisioning_configuration( 24 | vm_size="Standard_DS12_v2", 25 | idle_seconds_before_scaledown=1200, 26 | min_nodes=0, 27 | 
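# (added note) min_nodes=0 lets the cluster scale down to zero nodes, and zero
# cost, when idle; idle_seconds_before_scaledown=1200 keeps nodes warm for 20
# minutes between runs before releasing them.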
max_nodes=3, 28 | ) 29 | cpu_cluster = ComputeTarget.create(ws, AZ_CPU_CLUSTER_NAME, compute_config) 30 | 31 | cpu_cluster.wait_for_completion(show_output=True) 32 | -------------------------------------------------------------------------------- /train_azure/create_compute-gpu.py: -------------------------------------------------------------------------------- 1 | import os 2 | from azureml.core import Workspace 3 | from azureml.core.compute import ComputeTarget, AmlCompute 4 | from azureml.core.compute_target import ComputeTargetException 5 | from azureml.core.authentication import InteractiveLoginAuthentication 6 | 7 | AZ_TENANT_ID = os.getenv("AZ_TENANT_ID") 8 | interactive_auth = InteractiveLoginAuthentication(tenant_id=AZ_TENANT_ID) 9 | AZ_SUB_ID = os.getenv("AZ_SUB_ID") 10 | 11 | ws = Workspace.from_config()  # This automatically looks for a directory .azureml 12 | 13 | AZ_GPU_CLUSTER_NAME = os.getenv("AZ_GPU_CLUSTER_NAME") 14 | 15 | # Verify that the cluster does not exist already 16 | try: 17 | gpu_cluster = ComputeTarget(workspace=ws, name=AZ_GPU_CLUSTER_NAME) 18 | print("Found existing cluster, use it.") 19 | except ComputeTargetException: 20 | # https://docs.microsoft.com/en-us/azure/machine-learning/how-to-create-attach-compute-cluster?tabs=python#managed-identity 21 | print("Creating new gpu cluster...") 22 | compute_config = AmlCompute.provisioning_configuration( 23 | vm_size="Standard_NC6", 24 | idle_seconds_before_scaledown=1200, 25 | min_nodes=0, 26 | max_nodes=3, 27 | ) 28 | gpu_cluster = ComputeTarget.create(ws, AZ_GPU_CLUSTER_NAME, compute_config) 29 | 30 | gpu_cluster.wait_for_completion(show_output=True) 31 | -------------------------------------------------------------------------------- /train_azure/create_workspace.py: -------------------------------------------------------------------------------- 1 | """ 2 | Setting up a workspace on Azure ML studio 3 | """ 4 | import os 5 | from azureml.core import Workspace 6 | from azureml.core.authentication import InteractiveLoginAuthentication 7 | 8 | 9 | # get your TENANT_ID from "az account show --output table" 10 | # get your "subscription_id" from "az account list --output table" 11 | AZ_TENANT_ID = os.getenv("AZ_TENANT_ID") 12 | AZ_SUB_ID = os.getenv("AZ_SUB_ID") 13 | 14 | interactive_auth = InteractiveLoginAuthentication(tenant_id=AZ_TENANT_ID) 15 | 16 | 17 | AZ_WORKSPACE_NAME = os.getenv("AZ_WORKSPACE_NAME") 18 | AZ_RESOURCE_GROUP = os.getenv("AZ_RESOURCE_GROUP") 19 | AZ_REGION = os.getenv("AZ_REGION") 20 | 21 | ws = Workspace.create( 22 | name=AZ_WORKSPACE_NAME,  # provide a name for your workspace 23 | subscription_id=AZ_SUB_ID,  # provide your subscription ID 24 | resource_group=AZ_RESOURCE_GROUP,  # provide a resource group name 25 | create_resource_group=True, 26 | location=AZ_REGION, 27 | )  # For example: 'westeurope' or 'eastus2' or 'westus2' or 'southeastasia'. 
28 | 29 | # write out the workspace details to a configuration file: .azureml/config.json 30 | ws.write_config(path=".azureml") 31 | -------------------------------------------------------------------------------- /train_azure/requirements.txt: -------------------------------------------------------------------------------- 1 | azureml.core 2 | -------------------------------------------------------------------------------- /train_azure/run_cls_distrib.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | from azureml.core import Workspace 3 | from azureml.core import Experiment 4 | from azureml.core import Environment 5 | from azureml.core import ScriptRunConfig 6 | from azureml.core.authentication import InteractiveLoginAuthentication 7 | 8 | AZ_TENANT_ID = os.getenv("AZ_TENANT_ID") 9 | AZ_CPU_CLUSTER_NAME = os.getenv("AZ_CPU_CLUSTER_NAME") 10 | if __name__ == "__main__": 11 | interactive_auth = InteractiveLoginAuthentication(tenant_id=AZ_TENANT_ID) 12 | try: 13 | ws = Workspace.from_config() 14 | except: 15 | print("No config found. Please create a workspace before running") 16 | sys.exit(0) 17 | 18 | experiment = Experiment(workspace=ws, name="sample-exp-fortcollins") 19 | config = ScriptRunConfig( 20 | source_directory="./src", 21 | script="cls_distribution.py", 22 | compute_target=AZ_CPU_CLUSTER_NAME, 23 | arguments=[ 24 | "--input_fn", 25 | "data/fort-collins_test.csv", 26 | "--num_classes", 27 | 7, 28 | "--label_transform", 29 | "uvm", # either 'naip or epa' 30 | "--output_dir", 31 | "./outputs", # TBD don't actually want to use outputdir 32 | ], 33 | ) 34 | 35 | # set up pytorch environment 36 | pytorch_env = Environment.from_conda_specification( 37 | name="lulc-pytorch-env", file_path="./pytorch-env.yml" 38 | ) 39 | 40 | # This env variable needs to be set for rasterio to open remote files 41 | # https://github.com/mapbox/rasterio/issues/1289 42 | pytorch_env.environment_variables[ 43 | "CURL_CA_BUNDLE" 44 | ] = "/etc/ssl/certs/ca-certificates.crt" 45 | 46 | config.run_config.environment = pytorch_env 47 | 48 | run = experiment.submit(config) 49 | 50 | aml_url = run.get_portal_url() 51 | print(aml_url) 52 | -------------------------------------------------------------------------------- /train_azure/run_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | from azureml.core import Workspace 3 | from azureml.core import Experiment 4 | from azureml.core import Environment 5 | from azureml.core import ScriptRunConfig 6 | from azureml.core import Dataset 7 | from azureml.core.authentication import InteractiveLoginAuthentication 8 | 9 | 10 | AZ_TENANT_ID = os.getenv("AZ_TENANT_ID") 11 | AZ_GPU_CLUSTER_NAME = os.getenv("AZ_GPU_CLUSTER_NAME") 12 | 13 | if __name__ == "__main__": 14 | interactive_auth = InteractiveLoginAuthentication(tenant_id=AZ_TENANT_ID) 15 | ws = Workspace.from_config() 16 | experiment = Experiment(workspace=ws, name="sample-exp-fortcollins") 17 | 18 | # find the experiment Run ID through your Azure portal https://ml.azure.com/experiments/ 19 | 20 | config = ScriptRunConfig( 21 | source_directory="./src", 22 | script="eval.py", 23 | compute_target=AZ_GPU_CLUSTER_NAME, 24 | arguments=[ 25 | "--model_fn", 26 | "model/most_recent_model.pt", 27 | "--input_fn", 28 | "data/fort-collins_test.csv", 29 | "--output_dir", 30 | "./outputs", 31 | "--num_classes", 32 | 7, 33 | "--label_transform", 34 | "uvm", 35 | "--model", 36 | "deeplabv3plus", 37 | ], 38 | ) 39 | 40 | # 
set up pytorch environment 41 | pytorch_env = Environment.from_conda_specification( 42 | name="lulc-pytorch-env", file_path="./pytorch-env.yml" 43 | ) 44 | 45 | # Specify a GPU base image 46 | pytorch_env.docker.enabled = True 47 | pytorch_env.docker.base_image = ( 48 | "mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.1-cudnn7-ubuntu18.04" 49 | ) 50 | 51 | config.run_config.environment = pytorch_env 52 | 53 | run = experiment.submit(config) 54 | 55 | aml_url = run.get_portal_url() 56 | print(aml_url) 57 | -------------------------------------------------------------------------------- /train_azure/run_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | from azureml.core import Workspace 3 | from azureml.core import Experiment 4 | from azureml.core import Environment 5 | from azureml.core import ScriptRunConfig 6 | 7 | AZ_GPU_CLUSTER_NAME = os.getenv("AZ_GPU_CLUSTER_NAME") 8 | 9 | if __name__ == "__main__": 10 | ws = Workspace.from_config() 11 | experiment = Experiment(workspace=ws, name="sample-exp-fortcollins") 12 | config = ScriptRunConfig( 13 | source_directory="./src", 14 | script="train.py", 15 | compute_target=AZ_GPU_CLUSTER_NAME, 16 | arguments=[ 17 | "--input_fn", 18 | "data/fort-collins_train.csv", 19 | "--input_fn_val", 20 | "data/fort-collins_val.csv", 21 | "--output_dir", 22 | "./outputs", 23 | "--save_most_recent", 24 | "--num_epochs", 25 | 20, 26 | "--num_chips", 27 | 200, 28 | "--num_classes", 29 | 7, 30 | "--label_transform", 31 | "uvm", 32 | "--model", 33 | "deeplabv3plus", 34 | ], 35 | ) 36 | 37 | # set up pytorch environment 38 | pytorch_env = Environment.from_conda_specification( 39 | name="lulc-pytorch-env", file_path="./pytorch-env.yml" 40 | ) 41 | 42 | # Specify a GPU base image 43 | pytorch_env.docker.enabled = True 44 | pytorch_env.docker.base_image = ( 45 | "mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.1-cudnn7-ubuntu18.04" 46 | ) 47 | 48 | config.run_config.environment = pytorch_env 49 | 50 | run = experiment.submit(config) 51 | 52 | aml_url = run.get_portal_url() 53 | print(aml_url) 54 | -------------------------------------------------------------------------------- /train_azure/run_seeddata_creation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Example test script to deploy seed data creation to Azure Machine Learning 3 | """ 4 | import os, sys 5 | from azureml.core import Workspace 6 | from azureml.core import Experiment 7 | from azureml.core import Environment 8 | from azureml.core import ScriptRunConfig 9 | from azureml.core import Dataset 10 | from azureml.core.authentication import InteractiveLoginAuthentication 11 | 12 | 13 | AZ_TENANT_ID = os.getenv("AZ_TENANT_ID") 14 | AZ_GPU_CLUSTER_NAME = os.getenv("AZ_GPU_CLUSTER_NAME") 15 | 16 | if __name__ == "__main__": 17 | interactive_auth = InteractiveLoginAuthentication(tenant_id=AZ_TENANT_ID) 18 | try: 19 | ws = Workspace.from_config() 20 | except Exception: 21 | print("No config found. 
Please create a workspace before running.") 22 | sys.exit(1) 23 | 24 | experiment = Experiment( 25 | workspace=ws, 26 | name="sample-exp-fortcollins", 27 | ) 28 | 29 | config = ScriptRunConfig( 30 | source_directory="./src", 31 | script="seed_data_creation.py", 32 | compute_target=AZ_GPU_CLUSTER_NAME, 33 | arguments=[ 34 | "--input_csv", 35 | "data/fort-collins_test.csv", 36 | "--ckpt_file", 37 | "model/most_recent_model.pt", # replace with the model weights stored on Azure 38 | "--n_classes", 39 | 7, 40 | "--out_npz", 41 | "./outputs/sample-output-fortcollins.npz", 42 | "--model", 43 | "deeplabv3plus", 44 | ], 45 | ) 46 | # set up pytorch environment 47 | pytorch_env = Environment.from_conda_specification( 48 | name="lulc-pytorch-env", file_path="./pytorch-env.yml" 49 | ) 50 | 51 | # Specify a GPU base image 52 | pytorch_env.docker.enabled = True 53 | pytorch_env.docker.base_image = ( 54 | "mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.1-cudnn7-ubuntu18.04" 55 | ) 56 | 57 | config.run_config.environment = pytorch_env 58 | 59 | run = experiment.submit(config) 60 | 61 | aml_url = run.get_portal_url() 62 | print(aml_url) 63 | --------------------------------------------------------------------------------
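The `train_azure` scripts above are driven entirely by environment variables plus the `.azureml/config.json` that `create_workspace.py` writes. Below is a minimal sketch of one way to run them end to end; the exported values are illustrative placeholders (look up your own IDs with `az account show --output table` and `az account list --output table`), and only the variable names and script paths are taken from this repo:

```bash
# Placeholder values; substitute your own tenant, subscription, and resource names
export AZ_TENANT_ID="<tenant-id>"
export AZ_SUB_ID="<subscription-id>"
export AZ_WORKSPACE_NAME="<workspace-name>"
export AZ_RESOURCE_GROUP="<resource-group>"
export AZ_REGION="eastus2"
export AZ_CPU_CLUSTER_NAME="<cpu-cluster-name>"
export AZ_GPU_CLUSTER_NAME="<gpu-cluster-name>"

# 1. Create the workspace; this writes .azureml/config.json used by the other scripts
python train_azure/create_workspace.py

# 2. Provision compute targets
python train_azure/create_compute-cpu.py
python train_azure/create_compute-gpu.py

# 3. Submit experiments (each prints the Azure ML portal URL of its run)
python train_azure/run_cls_distrib.py
python train_azure/run_model.py
python train_azure/run_eval.py
python train_azure/run_seeddata_creation.py
```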