├── docs └── images │ ├── vQ9YMn.gif │ └── tutorials │ ├── rasterize_label.py │ ├── split_raster_gdal_tiff.py │ └── mygrid.py ├── requirements.txt ├── raster_type ├── test_data │ ├── tif_f32.tif │ ├── tif_u16.tif │ └── tif_u8.tif └── raster2uint8.py ├── .gitignore ├── docker-compose.yml ├── Dockerfile ├── LICENSE ├── split_data.py ├── split_dataset_list.py ├── multi_raster_vector.py └── README.md /docs/images/vQ9YMn.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yharby/split-rs-data/HEAD/docs/images/vQ9YMn.gif -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | opencv-python 2 | numpy 3 | rasterio 4 | GDAL==3.2.2 5 | paddleseg 6 | paddlepaddle 7 | jupyter -------------------------------------------------------------------------------- /raster_type/test_data/tif_f32.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yharby/split-rs-data/HEAD/raster_type/test_data/tif_f32.tif -------------------------------------------------------------------------------- /raster_type/test_data/tif_u16.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yharby/split-rs-data/HEAD/raster_type/test_data/tif_u16.tif -------------------------------------------------------------------------------- /raster_type/test_data/tif_u8.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yharby/split-rs-data/HEAD/raster_type/test_data/tif_u8.tif -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | DataSet 2 | DataSet/image/* 3 | DataSet/label/* 4 | DataSet/raster/* 5 | 
DataSet/vector/* 6 | .ipynb_checkpoints 7 | __pycache__/ 8 | output 9 | output_inf 10 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | python: 5 | build: . 6 | volumes: 7 | - .:/usr/src/app 8 | ports: 9 | - 8888:8888 10 | command: bash -c "jupyter notebook --port=8888 --no-browser --ip='0.0.0.0' --allow-root" -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8.12-slim-bullseye 2 | LABEL maintainer="youssef_harby@yahoo.com" 3 | 4 | RUN mkdir /usr/src/app 5 | WORKDIR /usr/src/app 6 | COPY ./requirements.txt . 7 | 8 | RUN apt-get update \ 9 | && apt-get install -y libgomp1 \ 10 | ffmpeg libsm6 libxext6 \ 11 | git \ 12 | build-essential \ 13 | gdal-bin libgdal-dev 14 | RUN export CPLUS_INCLUDE_PATH=/usr/include/gdal 15 | RUN export C_INCLUDE_PATH=/usr/include/gdal 16 | RUN pip install --upgrade pip 17 | RUN pip install -r requirements.txt 18 | # RUN pip install git+https://github.com/philferriere/cocoapi.git#egg=pycocotools&subdirectory=PythonAPI -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Youssef Harby Makar 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above 
copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /docs/images/tutorials/rasterize_label.py: -------------------------------------------------------------------------------- 1 | 2 | from osgeo import gdal, gdal_array, ogr 3 | import mygrid as cropfarm 4 | 5 | fn_ras = '2019_9_4_res.tif' 6 | fn_vec = 'esribbb.shp' 7 | output = "lab_all_values.tif" 8 | 9 | ras_ds = gdal.Open(fn_ras) 10 | vec_ds = ogr.Open(fn_vec) 11 | 12 | lyr = vec_ds.GetLayer() 13 | geot = ras_ds.GetGeoTransform() 14 | proj = ras_ds.GetProjection() 15 | 16 | drv_tiff = gdal.GetDriverByName("GTiff") 17 | chn_ras_ds = drv_tiff.Create(output, ras_ds.RasterXSize, ras_ds.RasterYSize, 1, gdal.GDT_Byte) 18 | chn_ras_ds.SetGeoTransform(geot) 19 | 20 | gdal.RasterizeLayer(chn_ras_ds, [1], lyr, options=['ATTRIBUTE=fid']) 21 | chn_ras_ds.GetRasterBand(1).SetNoDataValue(0.0) # Change No Data Value to 0 22 | chn_ras_ds.SetProjection (proj) # Set Projection as the source 23 | chn_ras_ds = None 24 | 25 | #change all values >= 1 to 1 26 | final_output = "final_lab.tif" 27 | 28 | ds = gdal.Open(output) 29 | b1 = ds.GetRasterBand(1) 30 | arr = b1.ReadAsArray() 31 | 32 | data = (arr >= 1) 33 | gdal_array.SaveArray(data.astype("byte"), final_output, "GTIFF", ds) 34 | data = None 35 | 36 | cropfarm.mygridfun(fn_ras, ".jpg","DataSet\image\image" ) #Sat_Raster 37 | 
"""Tutorial: cut a GeoTIFF into fixed-size GeoTIFF tiles with ``gdal.Warp``.

The raster is covered by a grid of ``needed_out_x`` x ``needed_out_y`` pixel
tiles; tiles on the right/bottom edge extend past the raster and are filled
with the nodata value.
"""
from osgeo import gdal
import math

ds = gdal.Open("2019_9_4_res.tif")
if ds is None:
    # gdal.Open returns None (instead of raising) unless gdal.UseExceptions()
    # has been enabled, so fail loudly here rather than on the next line.
    raise RuntimeError("could not open raster '2019_9_4_res.tif'")
gt = ds.GetGeoTransform()

# tile size in pixels
needed_out_x = 512
needed_out_y = 512

# upper-left corner and pixel size (gt[5] is negative for north-up rasters)
xmin = gt[0]
ymax = gt[3]
resx = gt[1]
resy = abs(gt[5])

# number of tiles per axis, rounded up so the whole raster is covered
xround = math.ceil(ds.RasterXSize / needed_out_x)
yround = math.ceil(ds.RasterYSize / needed_out_y)

# size of a single tile in georeferenced units (pixels * pixel size)
xsize = needed_out_x * resx
ysize = needed_out_y * resy

# tile edges along each axis
xsteps = [xmin + xsize * i for i in range(xround + 1)]
ysteps = [ymax - ysize * i for i in range(yround + 1)]

for i in range(xround):
    for j in range(yround):
        # The indices are separated with "_" because plain concatenation is
        # ambiguous: (i=1, j=11) and (i=11, j=1) would both produce "ds111.tif"
        # and silently overwrite each other.
        gdal.Warp("ds_{}_{}.tif".format(i, j), ds,
                  outputBounds=(xsteps[i], ysteps[j + 1], xsteps[i + 1], ysteps[j]),
                  dstNodata=-9999)

# close the open dataset (the original assigned to an unrelated name, `dem`)
ds = None
def __full_size(ima: np.array, grid_size: Union[List, Tuple]=(512, 512)) -> np.array:
    """Zero-pad a (possibly partial) tile to the full grid size.

    Edge tiles read from a raster can be smaller than the grid; the tile is
    copied into the top-left corner of a zero canvas so every output has the
    same height and width.

    Args:
        ima (np.array): tile of shape (h, w) or (h, w, channels).
        grid_size (Union[List, Tuple], optional): target (rows, cols).
            Defaults to (512, 512).

    Returns:
        np.array: uint8 array of shape (rows, cols) or (rows, cols, channels).

    Raises:
        ValueError: if the tile is larger than the grid.
    """
    rows, cols = int(grid_size[0]), int(grid_size[1])
    h, w = ima.shape[:2]
    if h > rows or w > cols:
        raise ValueError(
            f"tile of size {(h, w)} does not fit into grid {(rows, cols)}.")
    # Any trailing (channel) axis is carried over from the input instead of
    # being hard-coded to 3, so 1-, 3- and 4-band tiles all work.
    canvas = np.zeros((rows, cols) + tuple(ima.shape[2:]), dtype=np.uint16)
    canvas[:h, :w] = ima
    # Same final cast as before: values above 255 wrap rather than saturate.
    return canvas.astype("uint8")
def split_tif(img_path: str,
              lab_path: str,
              save_folder: str,
              ssize: Union[List, Tuple]=(512, 512)) -> None:
    """ divide the large image to the specified size.

    Tiles are written to ``<save_folder>/Images`` (JPEG) and
    ``<save_folder>/Labels`` (palette PNG), named ``<raster>_<row>_<col>``.

    Args:
        img_path (str): path of image raster.
        lab_path (str): path of mask raster.
        save_folder (str): path of save result folder.
        ssize (Union[List, Tuple], optional): slice size. Defaults to (512, 512).

    Raises:
        ValueError: if image and label rasters differ in width or height.
    """
    img_save_folder = __mkdir_p(save_folder, "Images")
    lab_save_folder = __mkdir_p(save_folder, "Labels")
    print("folder created!")
    name = __get_file_name(img_path)
    # Context managers make sure both datasets are closed, also on error.
    with rasterio.open(img_path) as img, rasterio.open(lab_path) as lab:
        # A mismatch in EITHER dimension is an error (the original `and` only
        # fired when both width and height differed).
        if img.meta["width"] != lab.meta["width"] or img.meta["height"] != lab.meta["height"]:
            raise ValueError("image's size must equal label's size!")
        img_size = np.array([img.meta["height"], img.meta["width"]])
        # Plain ints: casting the counts to uint8 wrapped around for rasters
        # needing more than 255 tiles along one axis.
        n_rows, n_cols = (int(n) for n in np.ceil(img_size / np.array(ssize)))
        for r in range(n_rows):
            for c in range(n_cols):
                name_i = name + "_" + str(r) + "_" + str(c)
                # Forward ssize so a non-default slice size is honoured.
                img_i = __get_grid(img, r, c, ssize)
                img_save_path = osp.join(img_save_folder, (name_i + ".jpg"))
                cv2.imwrite(img_save_path, cv2.cvtColor(img_i, cv2.COLOR_RGB2BGR))
                lab_i = __get_grid(lab, r, c, ssize)
                lab_save_path = osp.join(lab_save_folder, (name_i + ".png"))
                __save_palette(lab_i, lab_save_path)
    print("finished!")
def raster_to_uint8(image: np.array, dtype: str="uint8") -> np.array:
    """ Convert raster to uint8.

    Args:
        image (np.array): image of shape (H, W) or (H, W, 3).
        dtype (str): declared type of image: "uint8", "uint16" or "float32".

    Returns:
        np.array: image on uint8. uint8 input is returned unchanged.

    Raises:
        ValueError: if `dtype` is not one of the supported names.
    """
    dtypes = ["uint8", "uint16", "float32"]
    if dtype not in dtypes:
        raise ValueError(f"'dtype' must be uint8/uint16/float32, not {dtype}.")
    # Compare the argument `dtype`, not the list `dtypes`: the list never
    # equals a string, so uint8 input used to be re-stretched and float32
    # input was never normalized.
    if dtype == "uint8":
        return image
    if dtype == "float32":
        image = __sample_norm(image)
    return __two_percentLinear(image)


# 2% linear stretch
def __two_percentLinear(image: np.array, max_out: int=255, min_out: int=0) -> np.array:
    """Clip each band to its 2nd..98th percentile and rescale it linearly.

    Args:
        image (np.array): image of shape (H, W) or (H, W, 3).
        max_out (int): output value of the 98th percentile. Defaults to 255.
        min_out (int): output value of the 2nd percentile. Defaults to 0.

    Returns:
        np.array: stretched image as uint8 (fractions are truncated).

    Raises:
        ValueError: if the image is neither 2-D nor 3-D with 3 bands.
    """
    def __gray_process(gray, maxout=max_out, minout=min_out):
        high_value = np.percentile(gray, 98)  # gray level at 98% of the histogram
        low_value = np.percentile(gray, 2)
        if high_value == low_value:
            # Flat band: there is nothing to stretch, and the formula below
            # would evaluate 0/0 and yield NaN.
            return np.full(gray.shape, float(minout))
        truncated_gray = np.clip(gray, a_min=low_value, a_max=high_value)
        scaled = (truncated_gray - low_value) / (high_value - low_value)
        # `+ minout` makes min_out effective; it is a no-op for the default 0.
        return scaled * (maxout - minout) + minout
    if len(image.shape) == 3 and image.shape[-1] == 3:
        # Bands are processed independently and re-stacked in their original order.
        result = np.dstack([__gray_process(image[:, :, k]) for k in range(3)])
    elif len(image.shape) == 2:
        result = __gray_process(image)
    else:
        raise ValueError(f"image.shape[-1] must be 1 or 3, but {image.shape[-1]}.")
    return np.uint8(result)
# Calculate histogram
def __histogram(ima: np.array, NUMS: int) -> np.array:
    """Count how many pixels fall on each integer level 0..NUMS-1.

    The pixels are sorted once; the first position of every level in the
    sorted data is located by binary search, and the gaps between
    consecutive positions are the per-level counts. The last level also
    collects every value greater than NUMS-1.

    Args:
        ima (np.array): image of any shape.
        NUMS (int): number of levels.

    Returns:
        np.array: integer array of length NUMS with the counts.
    """
    ordered = np.sort(ima.ravel())
    levels = np.arange(NUMS)
    first_index = np.searchsorted(ordered, levels)
    # Close the last bin with the total pixel count.
    boundaries = np.append(first_index, ordered.size)
    return np.diff(boundaries)
cv2.destroyAllWindows() -------------------------------------------------------------------------------- /split_dataset_list.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import glob 17 | import os.path 18 | import argparse 19 | import warnings 20 | import numpy as np 21 | 22 | 23 | def parse_args(): 24 | parser = argparse.ArgumentParser( 25 | description= 26 | 'A tool for proportionally randomizing dataset to produce file lists.') 27 | parser.add_argument('dataset_root', help='the dataset root path', type=str) 28 | parser.add_argument( 29 | 'images_dir_name', help='the directory name of images', type=str) 30 | parser.add_argument( 31 | 'labels_dir_name', help='the directory name of labels', type=str) 32 | parser.add_argument( 33 | '--split', help='', nargs=3, type=float, default=[0.7, 0.3, 0]) 34 | parser.add_argument( 35 | '--label_class', 36 | help='label class names', 37 | type=str, 38 | nargs='*', 39 | default=['__background__', '__foreground__']) 40 | parser.add_argument( 41 | '--separator', 42 | dest='separator', 43 | help='file list separator', 44 | default=" ", 45 | type=str) 46 | parser.add_argument( 47 | '--format', 48 | help='data format of images and labels, e.g. 
def generate_list(args):
    """Write ``labels.txt`` and shuffled ``train/val/test`` file lists.

    Images and labels are collected below ``args.dataset_root``, shuffled
    with one shared permutation (so image/label pairs stay aligned) and
    distributed over ``train.txt``, ``val.txt`` and ``test.txt`` according
    to ``args.split``. Each line holds the image path and, when labels
    exist, the label path, both relative to the dataset root and joined by
    ``args.separator``.

    Args:
        args: parsed command line (see ``parse_args``); reads ``separator``,
            ``dataset_root``, ``split``, ``label_class``, ``images_dir_name``,
            ``labels_dir_name``, ``format`` and ``postfix``.

    Raises:
        ValueError: if a split ratio is outside 0~1 or the ratios do not
            sum to 1.
        Exception: if labels exist but their count differs from the images.
    """
    separator = args.separator
    dataset_root = args.dataset_root
    dataset_name = ['train', 'val', 'test']
    splits = list(zip(dataset_name, args.split))

    # Validate everything before any file is written. The sum is compared
    # with a tolerance: 0.6 + 0.3 + 0.1 is 0.9999999999999999 in floating
    # point and was rejected by the exact `!= 1.0` test.
    # (The message reads: "the split ratios must sum to 1".)
    if not np.isclose(sum(args.split), 1.0):
        raise ValueError("划分比例之和必须为1")
    for dataset_split, ratio in splits:
        if ratio > 1.0 or ratio < 0:
            raise ValueError(
                "{} dataset percentage should be 0~1.".format(dataset_split))

    file_list = os.path.join(dataset_root, 'labels.txt')
    with open(file_list, "w") as f:
        for label_class in args.label_class:
            f.write(label_class + '\n')

    image_dir = os.path.join(dataset_root, args.images_dir_name)
    label_dir = os.path.join(dataset_root, args.labels_dir_name)
    image_files = get_files(image_dir, args.format[0], args.postfix[0])
    label_files = get_files(label_dir, args.format[1], args.postfix[1])
    if not image_files:
        warnings.warn("No files in {}".format(image_dir))
    num_images = len(image_files)

    if not label_files:
        warnings.warn("No files in {}".format(label_dir))
    num_label = len(label_files)

    if num_images != num_label and num_label > 0:
        raise Exception("Number of images = {} number of labels = {} \n"
                        "Either number of images is equal to number of labels, "
                        "or number of labels is equal to 0.\n"
                        "Please check your dataset!".format(
                            num_images, num_label))

    # One permutation applied to both lists keeps image i paired with label i.
    order = np.random.permutation(num_images)
    image_files = [image_files[k] for k in order]
    if num_label > 0:
        label_files = [label_files[k] for k in order]

    def _relative(path):
        # relpath instead of str.replace(dataset_root, ''): replace removes
        # EVERY occurrence of the root string (with root "." it even strips
        # the dot of the file extension).
        return os.path.relpath(str(path), dataset_root)

    start = 0
    last = len(splits) - 1
    for i, (dataset_split, ratio) in enumerate(splits):
        print("Creating {}.txt...".format(dataset_split))
        if i == last:
            # The last list takes whatever is left.
            end = num_images
        else:
            # Clamp: rounding each share separately can overshoot, e.g.
            # 0.7/0.3 of 5 images rounds to 4 + 2 = 6.
            end = min(start + round(ratio * num_images), num_images)

        file_list = os.path.join(dataset_root, dataset_split + '.txt')
        with open(file_list, "w") as f:
            for item in range(start, end):
                left = _relative(image_files[item])
                if num_label > 0:
                    line = left + separator + _relative(label_files[item]) + '\n'
                else:
                    # Unlabelled dataset: list the image only.
                    line = left + '\n'

                f.write(line)
                print(line)
        start = end
recursive=True)] 17 | 18 | # raster and vector relative path 19 | rasterList = [os.path.normpath(rr) for rr in glob.glob( 20 | rasdir + "**/*.tif", recursive=True) + glob.glob( 21 | rasdir + "**/*.bmp", recursive=True)] 22 | vectorList = [os.path.normpath(vv) for vv in glob.glob( 23 | vecdir + "**/*.shp", recursive=True)] 24 | 25 | 26 | def rasterize(fn_ras, fn_vec, output): 27 | driver = ogr.GetDriverByName("ESRI Shapefile") 28 | ras_ds = gdal.Open(fn_ras) 29 | vec_ds = driver.Open(fn_vec, 1) 30 | 31 | lyr = vec_ds.GetLayer() 32 | geot = ras_ds.GetGeoTransform() 33 | proj = ras_ds.GetProjection() # Get the projection from original tiff (fn_ras) 34 | 35 | layerdefinition = lyr.GetLayerDefn() 36 | feature = ogr.Feature(layerdefinition) 37 | 38 | schema = [] 39 | for n in range(layerdefinition.GetFieldCount()): 40 | fdefn = layerdefinition.GetFieldDefn(n) 41 | schema.append(fdefn.name) 42 | yy = feature.GetFieldIndex("MLDS") 43 | if yy < 0: 44 | print("MLDS field not found, we will create one for you and make all values to 1") 45 | else: 46 | lyr.DeleteField(yy) 47 | # lyr.ResetReading() 48 | new_field = ogr.FieldDefn("MLDS", ogr.OFTInteger) 49 | lyr.CreateField(new_field) 50 | for feature in lyr: 51 | feature.SetField("MLDS", 1) 52 | lyr.SetFeature(feature) 53 | feature = None 54 | 55 | # isAttributeOn = att_field_input if att_field_input != '' else first_att_field 56 | # pixelsizeX = 0.2 if ras_ds.RasterXSize < 0.2 else ras_ds.RasterXSize 57 | # pixelsizeY = -0.2 if ras_ds.RasterYSize < -0.2 else ras_ds.RasterYSize 58 | 59 | drv_tiff = gdal.GetDriverByName("GTiff") 60 | chn_ras_ds = drv_tiff.Create( 61 | output, ras_ds.RasterXSize, ras_ds.RasterYSize, 1, gdal.GDT_Byte) 62 | 63 | # Set the projection from original tiff (fn_ras) to the rasterized tiff 64 | chn_ras_ds.SetGeoTransform(geot) 65 | chn_ras_ds.SetProjection(proj) 66 | chn_ras_ds.FlushCache() 67 | 68 | gdal.RasterizeLayer(chn_ras_ds, [1], lyr, burn_values=[1], options=["ATTRIBUTE=MLDS"]) 69 | 70 | # Change 
def mygridfun(fn_ras, cdpath, frmt_ext, imgfrmat, scaleoptions, needed_out_x, needed_out_y, file_name):
    """Split a raster into equally sized 8-bit tiles with ``gdal.Translate``.

    The raster is covered by a grid of ``needed_out_x`` x ``needed_out_y``
    pixel windows (rounded up, so edge tiles extend past the raster). Each
    window is written to ``<cdpath>/<file_name>-<row>-<col>.<frmt_ext>``.

    Args:
        fn_ras: path of the raster to split.
        cdpath: output directory.
        frmt_ext: output file extension without the dot, e.g. "jpg".
        imgfrmat: GDAL driver name of the output, e.g. "JPEG" or "PNG".
        scaleoptions: value forwarded as the single entry of ``scaleParams``.
        needed_out_x: tile width in pixels.
        needed_out_y: tile height in pixels.
        file_name: prefix of the output file names.

    Raises:
        RuntimeError: if GDAL cannot open ``fn_ras``.
    """
    ds = gdal.Open(fn_ras)
    if ds is None:
        # gdal.Open returns None on failure unless gdal.UseExceptions() is on.
        raise RuntimeError("could not open raster '{}'".format(fn_ras))
    gt = ds.GetGeoTransform()

    # upper-left corner and pixel size (gt[5] is negative for north-up rasters)
    x_origin = gt[0]
    y_origin = gt[3]
    resx = gt[1]
    resy = abs(gt[5])

    # number of tiles per axis, rounded up to cover the whole raster
    n_cols = math.ceil(ds.RasterXSize / needed_out_x)
    n_rows = math.ceil(ds.RasterYSize / needed_out_y)

    # size of a single tile in georeferenced units
    tile_w = needed_out_x * resx
    tile_h = needed_out_y * resy

    for i in range(n_cols):
        xmin = x_origin + tile_w * i
        xmax = x_origin + tile_w * (i + 1)
        for j in range(n_rows):
            ymax = y_origin - tile_h * j
            ymin = y_origin - tile_h * (j + 1)
            out_path = osp.join(
                cdpath, str(file_name) + "-" + str(j) + "-" + str(i) + "." + frmt_ext)
            # projWin is (ulx, uly, lrx, lry) in georeferenced coordinates.
            # They are passed as-is: wrapping them in abs() moved the window
            # to the wrong place for any raster with negative coordinates
            # (western longitudes, southern latitudes, false origins).
            gdal.Translate(out_path,
                           ds,
                           projWin=(xmin, ymax, xmax, ymin),
                           xRes=resx,
                           yRes=-resy,
                           outputType=gdal.gdalconst.GDT_Byte,
                           format=imgfrmat,
                           scaleParams=[[scaleoptions]])

    # close the open dataset
    ds = None
data 25 | | |--xxx1.tif (xx1.png) 26 | | |--... 27 | | └--... 28 | | 29 | |--vector # All shapefiles in the same place (.shx, .shp..etc) 30 | | |--xxx1.shp 31 | | |--xxx1.shx / .prj / .cpg / .dbf .... 32 | | └--xxx2.shp 33 | ``` 34 | 35 | - Build the docker image : ```docker compose up --build ``` 36 | 37 | - Go to http://127.0.0.1:8888/ 38 | 39 | - You will find your token in the CLI output of the running container. 40 | 41 | ![](https://github.com/Youssef-Harby/Remote-sensing-building-extraction-to-3D-model-using-Paddle-and-Grasshopper/blob/main/md_images/jnb-token.png?raw=true) 42 | 43 | - Open Tutorial.ipynb to learn 44 | 45 | - Or define your vector and raster folders in the multi_raster_vector.py file and run it in docker by opening a CLI and typing : 46 | 47 | `python multi_raster_vector.py` 48 | 49 | ## TODO 50 | 51 | - [x] Creating Docker Image for development env. 52 | - [x] Splitting raster data into equal pieces with [rasterio](https://github.com/rasterio/rasterio) (512×512) thanks to [@geoyee](https://github.com/geoyee). 53 | - [x] Splitting raster data into equal pieces with [GDAL](https://github.com/OSGeo/gdal) , https://gdal.org/. 54 | - [x] Rasterize shapefile to raster in the same satellite pixel size and projection. 55 | - [x] Convert 24 or 16 bit raster to 8 bit. 56 | - [x] Export as jpg (for raster) and png (for rasterized shapefile) with GDAL. 57 | - [X] Validation of training and testing datasets for paddlepaddle. 58 | - [ ] GUI 59 | - [X] QGIS Plugin ➡️ [Deep Learning Datasets Maker](https://github.com/deepbands/deep-learning-datasets-maker/) 60 | 61 | ![](/docs/images/vQ9YMn.gif) 62 | 63 | 64 | 65 | ## Code In Detail ⬇️ 66 | 67 | 68 | 69 | ## First - Preparing Datasets 70 | 71 | # 1.Convert Vector to Raster (Rasterize) with reference coordinate system from raster tiff 72 | 73 | All these tools are made to prepare data for PaddlePaddle. 
74 | 75 | ```python 76 | from osgeo import gdal, ogr 77 | ``` 78 | 79 | - fn\_ras = Input raster data (GTiff) 80 | - fn\_vec = input vector data (Shapefile) 81 | 82 | ```python 83 | fn_ras = 'DataSet/raster/01/01.tif' 84 | fn_vec = 'DataSet/vector/01/01.shp' 85 | output = 'DataSet/results/lab_all_values.tif' 86 | ``` 87 | 88 | import the GDAL driver "ESRI Shapefile" to open the shapefile 89 | 90 | 91 | ```python 92 | driver = ogr.GetDriverByName("ESRI Shapefile") 93 | ``` 94 | 95 | open raster and shapefile datasets with (shapefile , 1) 96 | 97 | - (shapefile , 1) read and write in the shapefile 98 | - (shapefile , 0) read onle the shapefile 99 | 100 | ```python 101 | ras_ds = gdal.Open(fn_ras) 102 | vec_ds = driver.Open(fn_vec, 1) 103 | ``` 104 | 105 | Get the : 106 | 107 | - GetLayer (Only shapefiles have one lyrs other fomates maybe have 108 | multi-lyrs) \#VECTOR 109 | - GetGeoTransform \#FROM RASTER 110 | - GetProjection \#FROM RASTER 111 | 112 | ```python 113 | lyr = vec_ds.GetLayer() 114 | geot = ras_ds.GetGeoTransform() 115 | proj = ras_ds.GetProjection() # Get the projection from original tiff (fn_ras) 116 | geot 117 | ``` 118 | 119 | (342940.8074133941, 120 | 0.18114600000000536, 121 | 0.0, 122 | 3325329.401211367, 123 | 0.0, 124 | -0.1811459999999247) 125 | 126 | Open the shapefile feature to edit in it 127 | 128 | 129 | ```python 130 | layerdefinition = lyr.GetLayerDefn() 131 | feature = ogr.Feature(layerdefinition) 132 | ``` 133 | 134 | `feature.GetFieldIndex` make you to know the id of a specific field name 135 | you want to read/edit/delete 136 | 137 | - Also you can list all fields on the shapefile by : 138 | 139 | 140 | 141 | schema = [] 142 | for n in range(layerdefinition.GetFieldCount()): 143 | fdefn = layerdefinition.GetFieldDefn(n) 144 | schema.append(fdefn.name) 145 | 146 | - Then I will delete the field called "MLDS" has been assumed by me 147 | 148 | ```python 149 | yy = feature.GetFieldIndex("MLDS") 150 | if yy < 0: 151 | print("MLDS field 
not found, we will create one for you and make all values to 1") 152 | else: 153 | lyr.DeleteField(yy) 154 | ``` 155 | 156 | Add a new integer field to the shapefile, set it to `1` for every feature, and don't 157 | forget to release the feature after the edits 158 | 159 | ```python 160 | new_field = ogr.FieldDefn("MLDS", ogr.OFTInteger) 161 | lyr.CreateField(new_field) 162 | for feature in lyr: 163 | feature.SetField("MLDS", 1) 164 | lyr.SetFeature(feature) 165 | feature = None 166 | ``` 167 | 168 | Create the output GeoTIFF and copy the geotransform and projection of the original tiff (fn\_ras) to it 169 | 170 | ```python 171 | drv_tiff = gdal.GetDriverByName("GTiff") 172 | chn_ras_ds = drv_tiff.Create( 173 | output, ras_ds.RasterXSize, ras_ds.RasterYSize, 1, gdal.GDT_Byte) 174 | chn_ras_ds.SetGeoTransform(geot) 175 | chn_ras_ds.SetProjection(proj) 176 | chn_ras_ds.FlushCache() 177 | ``` 178 | 179 | ```python 180 | gdal.RasterizeLayer(chn_ras_ds, [1], lyr, burn_values=[1], options=["ATTRIBUTE=MLDS"]) 181 | chn_ras_ds = None 182 | vec_ds = None 183 | ``` 184 | 185 | DONE 186 | 187 | 188 | ## Second - Splitting raster and rasterized files to small tiles 512×512 depends on your memory 189 | 190 | 191 | ```python 192 | ds = gdal.Open(fn_ras) 193 | gt = ds.GetGeoTransform() 194 | ``` 195 | 196 | Get the coordinates of the upper-left corner and the pixel resolution from the geotransform 197 | 198 | ```python 199 | xmin = gt[0] 200 | ymax = gt[3] 201 | resx = gt[1] 202 | res_y = gt[5] 203 | resy = abs(res_y) 204 | ``` 205 | 206 | ```python 207 | import math 208 | import os.path as osp 209 | ``` 210 | 211 | The tile size I want (may be 256×256 for smaller memory size) 212 | 213 | 214 | ```python 215 | needed_out_x = 512 216 | needed_out_y = 512 217 | ``` 218 | 219 | Number of tiles needed in each direction, rounded up to the nearest int 220 | 221 | ```python 222 | xnotround = ds.RasterXSize / needed_out_x 223 | xround = math.ceil(xnotround) 224 | ynotround = ds.RasterYSize / needed_out_y 225 | yround = math.ceil(ynotround) 226 | 227 | print(xnotround) 228 | print(xround) 229 | print(ynotround) 230 | print(yround) 231 | ```
232 | 233 | 9.30078125 234 | 10 235 | 5.689453125 236 | 6 237 | 238 | pixel to meter - 512×10×0.18 239 | 240 | ```python 241 | pixtomX = needed_out_x * xround * resx 242 | pixtomy = needed_out_y * yround * resy 243 | 244 | print (pixtomX) 245 | print (pixtomy) 246 | ``` 247 | 248 | 927.4675200000274 249 | 556.4805119997686 250 | 251 | size of a single tile 252 | 253 | ```python 254 | xsize = pixtomX / xround 255 | ysize = pixtomy / yround 256 | 257 | print (xsize) 258 | print (ysize) 259 | ``` 260 | 261 | 92.74675200000274 262 | 92.74675199996143 263 | 264 | create lists of x and y coordinates 265 | 266 | 267 | ```python 268 | xsteps = [xmin + xsize * i for i in range(xround + 1)] 269 | ysteps = [ymax - ysize * i for i in range(yround + 1)] 270 | xsteps 271 | ``` 272 | 273 | [342940.8074133941, 274 | 343033.5541653941, 275 | 343126.3009173941, 276 | 343219.0476693941, 277 | 343311.7944213941, 278 | 343404.54117339413, 279 | 343497.28792539414, 280 | 343590.03467739414, 281 | 343682.78142939415, 282 | 343775.5281813941, 283 | 343868.2749333941] 284 | 285 | set the output path 286 | 287 | ```python 288 | cdpath = "DataSet/image/" 289 | ``` 290 | 291 | loop over min and max x and y coordinates 292 | 293 | ```python 294 | for i in range(xround): 295 | for j in range(yround): 296 | xmin = xsteps[i] 297 | xmax = xsteps[i + 1] 298 | ymax = ysteps[j] 299 | ymin = ysteps[j + 1] 300 | 301 | # gdal translate to subset the input raster 302 | 303 | gdal.Translate(osp.join(cdpath, \ 304 | (str("01") + "-" + str(j) + "-" + str(i) + "." 
+ "jpg")), 305 | ds, 306 | projWin=(abs(xmin), abs(ymax), abs(xmax), abs(ymin)), 307 | xRes=resx, 308 | yRes=-resy, 309 | outputType=gdal.gdalconst.GDT_Byte, 310 | format="JPEG") 311 | ds = None 312 | ``` 313 | 314 | ### Third - Split Custom Dataset and Generate File List 315 | 316 | For all data that is not yet divided into training set, validation set, and 317 | test set, PaddleSeg provides a script to split the data and 318 | generate a file list. 319 | 320 | #### Use scripts to randomly split the custom dataset proportionally and generate a file list 321 | 322 | The data file structure is as follows: 323 | 324 | ./DataSet/ # Dataset root directory 325 | |--image # Original image directory 326 | | |--xxx1.jpg (xx1.png) 327 | | |--... 328 | | └--... 329 | | 330 | |--label # Annotated image directory 331 | | |--xxx1.png 332 | | |--... 333 | | └--... 334 | 335 | The file names themselves can be defined according to your 336 | needs. 337 | 338 | The command is as follows; specific functions can be enabled 339 | through different flags.
340 | 341 | python tools/split_dataset_list.py <dataset_root> <images_dir_name> <labels_dir_name> ${FLAGS} 342 | 343 | Parameters: 344 | 345 | - dataset\_root: Dataset root directory 346 | - images\_dir\_name: Original image directory 347 | - labels\_dir\_name: Annotated image directory 348 | 349 | FLAGS: 350 | 351 | | FLAG | Meaning | Default | Parameter numbers | 352 | | --------------- | ------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------- | ----------------- | 353 | | \--split | Dataset segmentation ratio | 0.7 0.3 0 | 3 | 354 | | \--separator | File list separator | "\|" | 1 | 355 | | \--format | Data format of pictures and label sets | "jpg" "png" | 2 | 356 | | \--label\_class | Label category | '\_\_background\_\_' '\_\_foreground\_\_' | several | 357 | | \--postfix | Filter pictures and label sets according to whether the main file name (without extension) contains the specified suffix | "" ""(2 null characters) | 2 | 358 | 359 | After running, `train.txt`, `val.txt`, `test.txt` and `labels.txt` will 360 | be generated in the root directory of the dataset. 361 | 362 | **Note:** Requirements for generating the file list: either the number of 363 | original images and the number of annotated images are the same, or there are only 364 | original images without annotated images. If the dataset lacks 365 | annotated images, a file list without separators and annotated image 366 | paths will be generated. 367 | 368 | #### Example 369 | 370 | python tools/split_dataset_list.py <dataset_root> images annotations --split 0.6 0.2 0.2 --format jpg png 371 | 372 | ## Dataset file organization 373 | 374 | - If you need to use a custom dataset for training, it is recommended 375 | to organize it into the following structure: custom\_dataset | 376 | |--images | |--image1.jpg | |--image2.jpg | |--... | |--labels | 377 | |--label1.png | |--label2.png | |--...
| |--train.txt | |--val.txt | 378 | |--test.txt 379 | 380 | The contents of train.txt and val.txt are as follows: 381 | 382 | image/image1.jpg label/label1.png 383 | image/image2.jpg label/label2.png 384 | ... 385 | 386 | Full Docs : 387 | 388 | 389 | 390 | ```python 391 | import sys 392 | import subprocess 393 | ``` 394 | 395 | ```python 396 | theproc = subprocess.Popen([ 397 | "python", 398 | r"C:\Users\Youss\Documents\pp\New folder\split-rs-data\split_dataset_list.py", #Split text py script 399 | r"C:\Users\Youss\Documents\pp\New folder\split-rs-data\DataSet", # Root DataSet path 400 | r"C:\Users\Youss\Documents\pp\New folder\split-rs-data\DataSet\image", #images path 401 | r"C:\Users\Youss\Documents\pp\New folder\split-rs-data\DataSet\label", 402 | # "--split", 403 | # "0.6", # 60% training 404 | # "0.2", # 20% validating 405 | # "0.2", # 20% testing 406 | "--format", 407 | "jpg", 408 | "png"]) 409 | theproc.communicate() 410 | ``` 411 | 412 | (None, None) 413 | 414 | 415 | --------------------------------------------------------------------------------