├── docs
    └── images
    │   ├── vQ9YMn.gif
    │   └── tutorials
    │       ├── rasterize_label.py
    │       ├── split_raster_gdal_tiff.py
    │       └── mygrid.py
├── requirements.txt
├── raster_type
    ├── test_data
    │   ├── tif_f32.tif
    │   ├── tif_u16.tif
    │   └── tif_u8.tif
    └── raster2uint8.py
├── .gitignore
├── docker-compose.yml
├── Dockerfile
├── LICENSE
├── split_data.py
├── split_dataset_list.py
├── multi_raster_vector.py
└── README.md


/docs/images/vQ9YMn.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yharby/split-rs-data/HEAD/docs/images/vQ9YMn.gif


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | opencv-python
2 | numpy
3 | rasterio
4 | GDAL==3.2.2
5 | paddleseg
6 | paddlepaddle
7 | jupyter


--------------------------------------------------------------------------------
/raster_type/test_data/tif_f32.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yharby/split-rs-data/HEAD/raster_type/test_data/tif_f32.tif


--------------------------------------------------------------------------------
/raster_type/test_data/tif_u16.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yharby/split-rs-data/HEAD/raster_type/test_data/tif_u16.tif


--------------------------------------------------------------------------------
/raster_type/test_data/tif_u8.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yharby/split-rs-data/HEAD/raster_type/test_data/tif_u8.tif


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | DataSet
 2 | DataSet/image/*
 3 | DataSet/label/*
 4 | DataSet/raster/*
 5 | DataSet/vector/*
 6 | .ipynb_checkpoints
 7 | __pycache__/
 8 | output
 9 | output_inf
10 | 


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | version: "3"
 2 | 
 3 | services: 
 4 |   python:
 5 |     build: .
 6 |     volumes:
 7 |       - .:/usr/src/app
 8 |     ports:
 9 |       - 8888:8888
10 |     command: bash -c "jupyter notebook --port=8888 --no-browser --ip='0.0.0.0' --allow-root"


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.8.12-slim-bullseye
 2 | LABEL maintainer="youssef_harby@yahoo.com"
 3 | 
 4 | RUN mkdir /usr/src/app
 5 | WORKDIR /usr/src/app
 6 | COPY ./requirements.txt .
 7 | 
 8 | RUN apt-get update \
 9 |   && apt-get install -y libgomp1 \
10 |     ffmpeg libsm6 libxext6 \
11 |     git \
12 |     build-essential \
13 |     gdal-bin libgdal-dev 
14 | RUN export CPLUS_INCLUDE_PATH=/usr/include/gdal
15 | RUN export C_INCLUDE_PATH=/usr/include/gdal
16 | RUN pip install --upgrade pip
17 | RUN pip install -r requirements.txt
18 | # RUN pip install git+https://github.com/philferriere/cocoapi.git#egg=pycocotools&subdirectory=PythonAPI


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2021 Youssef Harby Makar
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/docs/images/tutorials/rasterize_label.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from osgeo import gdal, gdal_array, ogr
 3 | import mygrid as cropfarm
 4 | 
 5 | fn_ras = '2019_9_4_res.tif'
 6 | fn_vec = 'esribbb.shp'
 7 | output = "lab_all_values.tif"
 8 | 
 9 | ras_ds = gdal.Open(fn_ras)
10 | vec_ds = ogr.Open(fn_vec)
11 | 
12 | lyr = vec_ds.GetLayer()
13 | geot = ras_ds.GetGeoTransform()
14 | proj = ras_ds.GetProjection()
15 | 
16 | drv_tiff = gdal.GetDriverByName("GTiff")
17 | chn_ras_ds = drv_tiff.Create(output, ras_ds.RasterXSize, ras_ds.RasterYSize, 1, gdal.GDT_Byte)
18 | chn_ras_ds.SetGeoTransform(geot)
19 | 
20 | gdal.RasterizeLayer(chn_ras_ds, [1], lyr, options=['ATTRIBUTE=fid'])
21 | chn_ras_ds.GetRasterBand(1).SetNoDataValue(0.0) # Change No Data Value to 0
22 | chn_ras_ds.SetProjection (proj) # Set Projection as the source
23 | chn_ras_ds = None
24 | 
25 | #change all values >= 1 to 1
26 | final_output = "final_lab.tif"
27 | 
28 | ds = gdal.Open(output)
29 | b1 = ds.GetRasterBand(1)
30 | arr = b1.ReadAsArray()
31 | 
32 | data = (arr >= 1)
33 | gdal_array.SaveArray(data.astype("byte"), final_output, "GTIFF", ds)
34 | data = None
35 | 
36 | cropfarm.mygridfun(fn_ras, ".jpg","DataSet\image\image" ) #Sat_Raster
37 | cropfarm.mygridfun(final_output, ".png", "DataSet\label\label" ) #label_raster


--------------------------------------------------------------------------------
/docs/images/tutorials/split_raster_gdal_tiff.py:
--------------------------------------------------------------------------------
 1 | from osgeo import gdal
 2 | import math
 3 | 
 4 | ds = gdal.Open("2019_9_4_res.tif")
 5 | gt = ds.GetGeoTransform()
 6 | 
 7 | needed_out_x = 512
 8 | needed_out_y = 512
 9 | 
10 | # get coordinates of upper left corner
11 | xmin = gt[0]
12 | ymax = gt[3]
13 | resx = gt[1]
14 | res_y = gt[5]
15 | resy = abs(res_y)
16 | 
17 | # determine total length of raster
18 | xlen = resx * ds.RasterXSize
19 | ylen = resy * ds.RasterYSize
20 | 
21 | # overall raster dim in pixels before the edits
22 | img_width =  ds.RasterXSize
23 | img_height = ds.RasterYSize
24 | 
25 | # round up to nearst int to the 
26 | xnotround = ds.RasterXSize/needed_out_x
27 | xround = math.ceil(xnotround)
28 | ynotround = ds.RasterYSize/needed_out_y
29 | yround = math.ceil(ynotround)
30 | 
31 | # pixel to meter - 512×10×0.18
32 | pixtomX = needed_out_x*xround*resx
33 | pixtomy = needed_out_y*yround*resy
34 | 
35 | # size of a single tile
36 | xsize = pixtomX/xround
37 | ysize = pixtomy/yround
38 | 
39 | # create lists of x and y coordinates
40 | xsteps = [xmin + xsize * i for i in range(xround+1)]
41 | ysteps = [ymax - ysize * i for i in range(yround+1)]
42 | 
43 | # loop over min and max x and y coordinates
44 | for i in range(xround):
45 |     for j in range(yround):
46 |         xmin = xsteps[i]
47 |         xmax = xsteps[i+1]
48 |         ymax = ysteps[j]
49 |         ymin = ysteps[j+1]
50 |         
51 |         # print("xmin: "+str(xmin))
52 |         # print("xmax: "+str(xmax))
53 |         # print("ymin: "+str(ymin))
54 |         # print("ymax: "+str(ymax))
55 |         # print("\n")
56 |         
57 |         # use gdal warp
58 |         gdal.Warp("ds"+str(i)+str(j)+".tif", ds, 
59 |                   outputBounds = (xmin, ymin, xmax, ymax), dstNodata = -9999)
60 |         # or gdal translate to subset the input raster
61 |         # gdal.Translate("dem_translate"+str(i)+str(j)+".tif", dem, projWin = (xmin, ymax, xmax, ymin), xRes = res, yRes = -res)
62 |  
63 | # close the open dataset!!!
64 | dem = None


--------------------------------------------------------------------------------
/docs/images/tutorials/mygrid.py:
--------------------------------------------------------------------------------
 1 | from osgeo import gdal
 2 | import math
 3 | 
 4 | def mygridfun(pic, frmt, cdpath, frmtuot, scaleoptions):
 5 |     ds = gdal.Open(pic)
 6 |     gt = ds.GetGeoTransform()
 7 | 
 8 |     needed_out_x = 512
 9 |     needed_out_y = 512
10 | 
11 |     # get coordinates of upper left corner
12 |     xmin = gt[0]
13 |     ymax = gt[3]
14 |     resx = gt[1]
15 |     res_y = gt[5]
16 |     resy = abs(res_y)
17 | 
18 |     # determine total length of raster (if needed XD )
19 |     xlen = resx * ds.RasterXSize
20 |     ylen = resy * ds.RasterYSize
21 | 
22 |     # overall raster dim in pixels before the edits (if needed XD )
23 |     img_width = ds.RasterXSize
24 |     img_height = ds.RasterYSize
25 |     # round up to nearst int to the
26 |     xnotround = ds.RasterXSize/needed_out_x
27 |     xround = math.ceil(xnotround)
28 |     ynotround = ds.RasterYSize/needed_out_y
29 |     yround = math.ceil(ynotround)
30 | 
31 |     # pixel to meter - 512×10×0.18
32 |     pixtomX = needed_out_x*xround*resx
33 |     pixtomy = needed_out_y*yround*resy
34 | # size of a single tile
35 |     xsize = pixtomX/xround
36 |     ysize = pixtomy/yround
37 | # create lists of x and y coordinates
38 |     xsteps = [xmin + xsize * i for i in range(xround+1)]
39 |     ysteps = [ymax - ysize * i for i in range(yround+1)]
40 | 
41 | # loop over min and max x and y coordinates
42 |     for i in range(xround):
43 |         for j in range(yround):
44 |             xmin = xsteps[i]
45 |             xmax = xsteps[i+1]
46 |             ymax = ysteps[j]
47 |             ymin = ysteps[j+1]
48 | 
49 |             # use gdal warp
50 |             # gdal.WarpOptions(outputType=gdal.gdalconst.GDT_Byte)
51 |             # gdal.Warp("ds"+str(i)+str(j)+".tif", ds,
52 |             # outputBounds = (xmin, ymin, xmax, ymax), dstNodata = -9999)
53 |             
54 |             # or gdal translate to subset the input raster
55 |             gdal.Translate(cdpath+str(i)+str(j)+'.'+frmt, ds, projWin = (abs(xmin), abs(ymax), abs(xmax), abs(ymin)), xRes = resx, yRes = -resy, outputType=gdal.gdalconst.GDT_Byte, format = frmtuot, scaleParams = [[scaleoptions]])
56 |             # close the open dataset!!!
57 |             # ds = None


--------------------------------------------------------------------------------
/split_data.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import os.path as osp
  3 | from typing import Union, Tuple, List, Any
  4 | import numpy as np
  5 | import cv2
  6 | from PIL import Image
  7 | 
  8 | import rasterio
  9 | from rasterio.windows import Window
 10 | 
 11 | 
 12 | def __mkdir_p(path: str, sub_name: str) -> str:
 13 |     new_path = osp.join(path, sub_name)
 14 |     if not osp.exists(new_path):
 15 |         os.makedirs(new_path)
 16 |     return new_path
 17 | 
 18 | 
 19 | def __get_file_name(path: str) -> str:
 20 |     _, full_name = osp.split(path)
 21 |     name, _ = osp.splitext(full_name)
 22 |     return name
 23 | 
 24 | 
 25 | def __full_size(ima: np.array, grid_size: Union[List, Tuple]=(512, 512)) -> np.array:
 26 |     h, w = ima.shape[:2]
 27 |     if len(ima.shape) == 2:
 28 |         img = np.zeros(grid_size, dtype=np.uint16)
 29 |         img[:h, :w] = ima
 30 |     else:
 31 |         img = np.zeros((grid_size[0], grid_size[1], 3), dtype=np.uint16)
 32 |         img[:h, :w, :] = ima
 33 |     return img.astype("uint8")
 34 | 
 35 | 
 36 | def __get_grid(rasterio_data: Any, row: int, col: int, 
 37 |                grid_size: Union[List, Tuple]=(512, 512)) -> np.array:
 38 |     grid_size = np.array(grid_size)
 39 |     grid_idx = np.array([row, col])
 40 |     ul = grid_idx * grid_size
 41 |     lr = ul + grid_size
 42 |     window = Window(ul[1], ul[0], (lr[1] - ul[1]), (lr[0] - ul[0]))
 43 |     rgb = []
 44 |     count = rasterio_data.meta["count"]
 45 |     if count == 1:  # mask
 46 |         ima = rasterio_data.read(1, window=window)
 47 |         return __full_size(ima, grid_size)
 48 |     elif count == 3:  # image
 49 |         for b in range(count):
 50 |             rgb.append(rasterio_data.read((b + 1), window=window))
 51 |         ima = cv2.merge([np.uint16(c) for c in rgb])
 52 |         return __full_size(ima, grid_size)
 53 |     else:
 54 |         raise ValueError("count must be 1 or 3!")
 55 | 
 56 | 
 57 | def __save_palette(label, save_path):
 58 |     bin_colormap = np.ones((256, 3)) * 255  # color
 59 |     bin_colormap[0, :] = [0, 0, 0]
 60 |     bin_colormap = bin_colormap.astype(np.uint8)
 61 |     visualimg  = Image.fromarray(label, "P")
 62 |     palette = bin_colormap
 63 |     visualimg.putpalette(palette) 
 64 |     visualimg.save(save_path, format='PNG')
 65 | 
 66 | 
 67 | def split_tif(img_path: str, 
 68 |               lab_path: str, 
 69 |               save_folder: str,
 70 |               ssize :Union[List, Tuple]=(512, 512)) -> None:
 71 |     """ divide the large image to the specified size.
 72 | 
 73 |     Args:
 74 |         img_path (str): path of image raster.
 75 |         lab_path (str): path of mask raster.
 76 |         save_folder (str): path of save result folder.
 77 |         ssize (Union[List, Tuple], optional): slice size. Defaults to (512, 512).
 78 |     """
 79 |     img_save_folder = __mkdir_p(save_folder, "Images")
 80 |     lab_save_folder = __mkdir_p(save_folder, "Labels")
 81 |     print("folder created!")
 82 |     name = __get_file_name(img_path)
 83 |     img = rasterio.open(img_path)
 84 |     lab = rasterio.open(lab_path)
 85 |     if img.meta["width"] != lab.meta["width"] and img.meta["height"] != lab.meta["height"]:
 86 |         raise ValueError("image's size must equal label's size!")
 87 |     img_size = np.array([img.meta["height"], img.meta["width"]])
 88 |     grid_count = list(np.ceil(img_size / np.array(ssize)).astype("uint8"))
 89 |     for r in range(grid_count[0]):
 90 |         for c in range(grid_count[1]):
 91 |             name_i = name + "_" + str(r) + "_" + str(c)
 92 |             img_i = __get_grid(img, r, c)
 93 |             img_save_path = osp.join(img_save_folder, (name_i + ".jpg"))
 94 |             cv2.imwrite(img_save_path, cv2.cvtColor(img_i, cv2.COLOR_RGB2BGR))
 95 |             lab_i = __get_grid(lab, r, c)
 96 |             lab_save_path = osp.join(lab_save_folder, (name_i + ".png"))
 97 |             __save_palette(lab_i, lab_save_path)
 98 |     print("finished!")
 99 | 
100 | 
if __name__ == "__main__":
    # osp.join builds the paths with the separator of the running platform;
    # the former backslash literals only resolved on Windows.
    img_path = osp.join("Raster", "2019_9_4_res.tif")
    lab_path = osp.join("Raster", "2019_9_4_lab_2.tif")
    save_folder = r"Datasets"
    split_tif(img_path, lab_path, save_folder)


--------------------------------------------------------------------------------
/raster_type/raster2uint8.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import cv2
  3 | import operator
  4 | from functools import reduce
  5 | 
  6 | 
  7 | def raster_to_uint8(image: np.array, dtype: str="uint8") -> np.array:
  8 |     """ Convert raster to uint8.
  9 | 
 10 |     Args:
 11 |         image (np.array): image.
 12 |         dtype (str): type of image.
 13 | 
 14 |     Returns:
 15 |         np.array: image on uint8.
 16 |     """
 17 |     dtypes = ["uint8", "uint16", "float32"]
 18 |     if dtype not in dtypes:
 19 |         raise ValueError(f"'dtype' must be uint8/uint16/float32, not {dtype}.")
 20 |     if dtypes == "uint8":
 21 |         return image
 22 |     else:
 23 |         if dtypes == "float32":
 24 |             image = __sample_norm(image)
 25 |         return __two_percentLinear(image)
 26 | 
 27 | 
 28 | # 2% linear stretch
 29 | def __two_percentLinear(image: np.array, max_out: int=255, min_out: int=0) -> np.array:
 30 |     def __gray_process(gray, maxout=max_out, minout=min_out):
 31 |         high_value = np.percentile(gray, 98)  # Get the corresponding gray level at 98% histogram
 32 |         low_value = np.percentile(gray, 2)
 33 |         truncated_gray = np.clip(gray, a_min=low_value, a_max=high_value)
 34 |         processed_gray = ((truncated_gray - low_value) / (high_value - low_value)) * (maxout - minout)
 35 |         return processed_gray
 36 |     if len(image.shape) == 3 and image.shape[-1] == 3:
 37 |         b, g, r = cv2.split(image)
 38 |         r_p = __gray_process(r)
 39 |         g_p = __gray_process(g)
 40 |         b_p = __gray_process(b)
 41 |         result = cv2.merge((b_p, g_p, r_p))
 42 |     elif len(image.shape) == 2:
 43 |         result = __gray_process(image)
 44 |     else:
 45 |         raise ValueError(f"image.shape[-1] must be 1 or 3, but {image.shape[-1]}.")
 46 |     return np.uint8(result)
 47 | 
 48 | 
 49 | # Simple image standardization
 50 | def __sample_norm(image: np.array, NUMS: int=65536) -> np.array:
 51 |     if NUMS == 256:
 52 |         return np.uint8(image)
 53 |     if len(image.shape) == 3 and image.shape[-1] == 3:
 54 |         stretched_r = __stretch(image[:, :, 0], NUMS)
 55 |         stretched_g = __stretch(image[:, :, 1], NUMS)
 56 |         stretched_b = __stretch(image[:, :, 2], NUMS)
 57 |         stretched_img = cv2.merge([
 58 |                 stretched_r / float(NUMS),
 59 |                 stretched_g / float(NUMS),
 60 |                 stretched_b / float(NUMS)])
 61 |     elif len(image.shape) == 2:
 62 |         stretched_img = __stretch(image, NUMS)
 63 |     else:
 64 |         raise ValueError(f"image.shape[-1] must be 1 or 3, but {image.shape[-1]}.")
 65 |     return np.uint8(stretched_img * 255)
 66 | 
 67 | 
 68 | # Histogram equalization
 69 | def __stretch(ima: np.array, NUMS: int) -> np.array:
 70 |     hist = __histogram(ima, NUMS)
 71 |     lut = []
 72 |     for bt in range(0, len(hist), NUMS):
 73 |         # Step size
 74 |         step = reduce(operator.add, hist[bt : bt + NUMS]) / (NUMS - 1)
 75 |         # Create balanced lookup table
 76 |         n = 0
 77 |         for i in range(NUMS):
 78 |             lut.append(n / step)
 79 |             n += hist[i + bt]
 80 |         np.take(lut, ima, out=ima)
 81 |         return ima
 82 | 
 83 | 
 84 | # Calculate histogram
 85 | def __histogram(ima: np.array, NUMS: int) -> np.array:
 86 |     bins = list(range(0, NUMS))
 87 |     flat = ima.flat
 88 |     n = np.searchsorted(np.sort(flat), bins)
 89 |     n = np.concatenate([n, [len(flat)]])
 90 |     hist = n[1:] - n[:-1]
 91 |     return hist
 92 | 
 93 | 
 94 | if __name__ == "__main__":
 95 |     try:
 96 |         import gdal
 97 |     except:
 98 |         from osgeo import gdal
 99 | 
100 |     tif_u8_path = r"raster_type\test_data\tif_u8.tif"
101 |     tif_u16_path = r"raster_type\test_data\tif_u16.tif"
102 |     tif_f32_path = r"raster_type\test_data\tif_f32.tif"
103 |     for tif_path, dtype in zip([tif_u8_path, tif_u16_path, tif_f32_path], 
104 |                                ["uint8", "uint16", "float32"]):
105 |         ima = gdal.Open(tif_path).ReadAsArray()
106 |         if len(ima.shape) != 2:
107 |             ima = ima.transpose((1, 2, 0))
108 |         ima = raster_to_uint8(ima, dtype)
109 |         cv2.imshow("ima", ima)
110 |         cv2.waitKey(0)
111 |         cv2.destroyAllWindows()


--------------------------------------------------------------------------------
/split_dataset_list.py:
--------------------------------------------------------------------------------
  1 | # coding: utf8
  2 | # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #    http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | 
 16 | import glob
 17 | import os.path
 18 | import argparse
 19 | import warnings
 20 | import numpy as np
 21 | 
 22 | 
 23 | def parse_args():
 24 |     parser = argparse.ArgumentParser(
 25 |         description=
 26 |         'A tool for proportionally randomizing dataset to produce file lists.')
 27 |     parser.add_argument('dataset_root', help='the dataset root path', type=str)
 28 |     parser.add_argument(
 29 |         'images_dir_name', help='the directory name of images', type=str)
 30 |     parser.add_argument(
 31 |         'labels_dir_name', help='the directory name of labels', type=str)
 32 |     parser.add_argument(
 33 |         '--split', help='', nargs=3, type=float, default=[0.7, 0.3, 0])
 34 |     parser.add_argument(
 35 |         '--label_class',
 36 |         help='label class names',
 37 |         type=str,
 38 |         nargs='*',
 39 |         default=['__background__', '__foreground__'])
 40 |     parser.add_argument(
 41 |         '--separator',
 42 |         dest='separator',
 43 |         help='file list separator',
 44 |         default=" ",
 45 |         type=str)
 46 |     parser.add_argument(
 47 |         '--format',
 48 |         help='data format of images and labels, e.g. jpg, tif or png.',
 49 |         type=str,
 50 |         nargs=2,
 51 |         default=['jpg', 'png'])
 52 |     parser.add_argument(
 53 |         '--postfix',
 54 |         help='postfix of images or labels',
 55 |         type=str,
 56 |         nargs=2,
 57 |         default=['', ''])
 58 | 
 59 |     return parser.parse_args()
 60 | 
 61 | 
 62 | def get_files(path, format, postfix):
 63 |     pattern = '*%s.%s' % (postfix, format)
 64 | 
 65 |     search_files = os.path.join(path, pattern)
 66 |     search_files2 = os.path.join(path, "*", pattern)  # 包含子目录
 67 |     search_files3 = os.path.join(path, "*", "*", pattern)  # 包含三级目录
 68 | 
 69 |     filenames = glob.glob(search_files)
 70 |     filenames2 = glob.glob(search_files2)
 71 |     filenames3 = glob.glob(search_files3)
 72 | 
 73 |     filenames = filenames + filenames2 + filenames3
 74 | 
 75 |     return sorted(filenames)
 76 | 
 77 | 
 78 | def generate_list(args):
 79 |     separator = args.separator
 80 |     dataset_root = args.dataset_root
 81 |     if sum(args.split) != 1.0:
 82 |         raise ValueError("划分比例之和必须为1")
 83 | 
 84 |     file_list = os.path.join(dataset_root, 'labels.txt')
 85 |     with open(file_list, "w") as f:
 86 |         for label_class in args.label_class:
 87 |             f.write(label_class + '\n')
 88 | 
 89 |     image_dir = os.path.join(dataset_root, args.images_dir_name)
 90 |     label_dir = os.path.join(dataset_root, args.labels_dir_name)
 91 |     image_files = get_files(image_dir, args.format[0], args.postfix[0])
 92 |     label_files = get_files(label_dir, args.format[1], args.postfix[1])
 93 |     if not image_files:
 94 |         warnings.warn("No files in {}".format(image_dir))
 95 |     num_images = len(image_files)
 96 | 
 97 |     if not label_files:
 98 |         warnings.warn("No files in {}".format(label_dir))
 99 |     num_label = len(label_files)
100 | 
101 |     if num_images != num_label and num_label > 0:
102 |         raise Exception("Number of images = {}    number of labels = {} \n"
103 |                         "Either number of images is equal to number of labels, "
104 |                         "or number of labels is equal to 0.\n"
105 |                         "Please check your dataset!".format(
106 |                             num_images, num_label))
107 | 
108 |     image_files = np.array(image_files)
109 |     label_files = np.array(label_files)
110 |     state = np.random.get_state()
111 |     np.random.shuffle(image_files)
112 |     np.random.set_state(state)
113 |     np.random.shuffle(label_files)
114 | 
115 |     start = 0
116 |     num_split = len(args.split)
117 |     dataset_name = ['train', 'val', 'test']
118 |     for i in range(num_split):
119 |         dataset_split = dataset_name[i]
120 |         print("Creating {}.txt...".format(dataset_split))
121 |         if args.split[i] > 1.0 or args.split[i] < 0:
122 |             raise ValueError(
123 |                 "{} dataset percentage should be 0~1.".format(dataset_split))
124 | 
125 |         file_list = os.path.join(dataset_root, dataset_split + '.txt')
126 |         with open(file_list, "w") as f:
127 |             num = round(args.split[i] * num_images)
128 |             end = start + num
129 |             if i == num_split - 1:
130 |                 end = num_images
131 |             for item in range(start, end):
132 |                 left = image_files[item].replace(dataset_root, '')
133 |                 if left[0] == os.path.sep:
134 |                     left = left.lstrip(os.path.sep)
135 | 
136 |                 try:
137 |                     right = label_files[item].replace(dataset_root, '')
138 |                     if right[0] == os.path.sep:
139 |                         right = right.lstrip(os.path.sep)
140 |                     line = left + separator + right + '\n'
141 |                 except:
142 |                     line = left + '\n'
143 | 
144 |                 f.write(line)
145 |                 print(line)
146 |             start = end
147 | 
148 | 
if __name__ == '__main__':
    # Script entry point: parse the command line, then write the file lists.
    generate_list(parse_args())
152 | 


--------------------------------------------------------------------------------
/multi_raster_vector.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import os.path as osp
  3 | import glob
  4 | from osgeo import gdal, ogr
  5 | import math
  6 | 
  7 | 
  8 | rasdir = "DataSet/raster/"
  9 | vecdir = "DataSet/vector/"
 10 | 
 11 | # raster and vector names
 12 | rasterName = [os.path.basename(rr) for rr in glob.glob(
 13 |     rasdir + "**/*.tif", recursive=True) + glob.glob(
 14 |         rasdir + "**/*.bmp", recursive=True)]
 15 | vectorName = [os.path.basename(vv) for vv in glob.glob(
 16 |     vecdir + "**/*.shp", recursive=True)]
 17 | 
 18 | # raster and vector relative path
 19 | rasterList = [os.path.normpath(rr) for rr in glob.glob(
 20 |     rasdir + "**/*.tif", recursive=True) + glob.glob(
 21 |         rasdir + "**/*.bmp", recursive=True)]
 22 | vectorList = [os.path.normpath(vv) for vv in glob.glob(
 23 |     vecdir + "**/*.shp", recursive=True)]
 24 | 
 25 | 
 26 | def rasterize(fn_ras, fn_vec, output):
 27 |     driver = ogr.GetDriverByName("ESRI Shapefile")
 28 |     ras_ds = gdal.Open(fn_ras)
 29 |     vec_ds = driver.Open(fn_vec, 1)
 30 | 
 31 |     lyr = vec_ds.GetLayer()
 32 |     geot = ras_ds.GetGeoTransform()
 33 |     proj = ras_ds.GetProjection()  # Get the projection from original tiff (fn_ras)
 34 | 
 35 |     layerdefinition = lyr.GetLayerDefn()
 36 |     feature = ogr.Feature(layerdefinition)
 37 | 
 38 |     schema = []
 39 |     for n in range(layerdefinition.GetFieldCount()):
 40 |         fdefn = layerdefinition.GetFieldDefn(n)
 41 |         schema.append(fdefn.name)
 42 |     yy = feature.GetFieldIndex("MLDS")
 43 |     if yy < 0:
 44 |         print("MLDS field not found, we will create one for you and make all values to 1")
 45 |     else:
 46 |         lyr.DeleteField(yy)
 47 |         # lyr.ResetReading()
 48 |     new_field = ogr.FieldDefn("MLDS", ogr.OFTInteger)
 49 |     lyr.CreateField(new_field)
 50 |     for feature in lyr:
 51 |         feature.SetField("MLDS", 1)
 52 |         lyr.SetFeature(feature)
 53 |         feature = None
 54 | 
 55 |     # isAttributeOn = att_field_input if att_field_input != '' else first_att_field
 56 |     # pixelsizeX = 0.2 if ras_ds.RasterXSize < 0.2 else ras_ds.RasterXSize
 57 |     # pixelsizeY = -0.2 if ras_ds.RasterYSize < -0.2 else ras_ds.RasterYSize
 58 | 
 59 |     drv_tiff = gdal.GetDriverByName("GTiff")
 60 |     chn_ras_ds = drv_tiff.Create(
 61 |         output, ras_ds.RasterXSize, ras_ds.RasterYSize, 1, gdal.GDT_Byte)
 62 |     
 63 |     # Set the projection from original tiff (fn_ras) to the rasterized tiff
 64 |     chn_ras_ds.SetGeoTransform(geot)
 65 |     chn_ras_ds.SetProjection(proj)
 66 |     chn_ras_ds.FlushCache()
 67 | 
 68 |     gdal.RasterizeLayer(chn_ras_ds, [1], lyr, burn_values=[1], options=["ATTRIBUTE=MLDS"])
 69 | 
 70 |     # Change No Data Value to 0
 71 |     # chn_ras_ds.GetRasterBand(1).SetNoDataValue(0)
 72 |     chn_ras_ds = None
 73 |     # lyr.DeleteField(yy) # delete field
 74 |     vec_ds = None
 75 | 
 76 | 
 77 | def mygridfun(fn_ras, cdpath, frmt_ext, imgfrmat, scaleoptions, needed_out_x, needed_out_y, file_name):
 78 |     ds = gdal.Open(fn_ras)
 79 |     gt = ds.GetGeoTransform()
 80 | 
 81 |     # get coordinates of upper left corner
 82 |     xmin = gt[0]
 83 |     ymax = gt[3]
 84 |     resx = gt[1]
 85 |     res_y = gt[5]
 86 |     resy = abs(res_y)
 87 | 
 88 |     # round up to nearst int
 89 |     xnotround = ds.RasterXSize / needed_out_x
 90 |     xround = math.ceil(xnotround)
 91 |     ynotround = ds.RasterYSize / needed_out_y
 92 |     yround = math.ceil(ynotround)
 93 | 
 94 |     # pixel to meter - 512×10×0.18
 95 |     pixtomX = needed_out_x * xround * resx
 96 |     pixtomy = needed_out_y * yround * resy
 97 |     # size of a single tile
 98 |     xsize = pixtomX / xround
 99 |     ysize = pixtomy / yround
100 |     # create lists of x and y coordinates
101 |     xsteps = [xmin + xsize * i for i in range(xround + 1)]
102 |     ysteps = [ymax - ysize * i for i in range(yround + 1)]
103 | 
104 |     # loop over min and max x and y coordinates
105 |     for i in range(xround):
106 |         for j in range(yround):
107 |             xmin = xsteps[i]
108 |             xmax = xsteps[i + 1]
109 |             ymax = ysteps[j]
110 |             ymin = ysteps[j + 1]
111 | 
112 |             # use gdal warp
113 |             # gdal.WarpOptions(outputType=gdal.gdalconst.GDT_Byte)
114 |             # gdal.Warp("ds"+str(i)+str(j)+".tif", ds,
115 |             # outputBounds = (xmin, ymin, xmax, ymax), dstNodata = -9999)
116 | 
117 |             # or gdal translate to subset the input raster
118 |             gdal.Translate(osp.join(cdpath,  \
119 |                                     (str(file_name) + "-" + str(j) + "-" + str(i) + "." + frmt_ext)), 
120 |                            ds, 
121 |                            projWin=(abs(xmin), abs(ymax), abs(xmax), abs(ymin)),
122 |                            xRes=resx, 
123 |                            yRes=-resy, 
124 |                            outputType=gdal.gdalconst.GDT_Byte, 
125 |                            format=imgfrmat, 
126 |                            scaleParams=[[scaleoptions]])
127 | 
128 |             # close the open dataset!!!
129 |             # ds = None
130 | 
131 | 
def mkdir_p(path):
    """Create ``path`` with any missing parents unless it already exists."""
    if osp.exists(path):
        return
    os.makedirs(path)
135 | 
136 | 
# Dataset root = parent directory of the raster folder.  normpath drops any
# trailing separator first, so "DataSet/raster/" and "DataSet/raster" both
# resolve to "DataSet" (the previous split("/")[:-2] only worked when rasdir
# ended with "/").
dataset_path = osp.dirname(osp.normpath(rasdir))
output_folder_path = osp.join(dataset_path, "rasterized/values/")
image_folder_path = osp.join(dataset_path, "image/")
label_folder_path = osp.join(dataset_path, "label/")
mkdir_p(output_folder_path)
mkdir_p(image_folder_path)
mkdir_p(label_folder_path)

# vectorList drives the loop; rasterList and vectorName are indexed in
# parallel, so a shorter list still raises IndexError instead of being
# silently truncated.
for idx, fn_vec in enumerate(vectorList):
    fn_ras = rasterList[idx]
    # base name used for the rasterized file and for every tile name
    file_name = vectorName[idx].split(".")[0]
    output = output_folder_path + file_name + ".tif"
    # presumably writes the rasterized vector to `output` -- rasterize() is
    # defined outside this view; confirm against its definition
    rasterize(fn_ras, fn_vec, output)
    # tile the source raster into 512x512 JPEG images ...
    mygridfun(fn_ras, image_folder_path, "jpg", "JPEG", "", 512, 512, file_name)
    # ... and the rasterized labels into matching 512x512 PNG tiles
    mygridfun(output, label_folder_path, "png", "PNG", "", 512, 512, file_name)


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/Youssef-Harby/split-rs-data/main?labpath=Tutorial.ipynb)
  2 | [![CodeFactor](https://www.codefactor.io/repository/github/youssef-harby/split-rs-data/badge)](https://www.codefactor.io/repository/github/youssef-harby/split-rs-data)
  3 | 
  4 | 
  5 | # Tools for splitting raster and vector data into equally sized tiles for machine learning datasets
  6 | 
  7 | ## How To Use
  8 | 
  9 | - Install docker https://docs.docker.com/engine/install/ (macos, Windows or Linux)
 10 | 
 11 | - Clone the Repository : 
 12 |   
 13 |   `git clone https://github.com/Youssef-Harby/split-rs-data.git`
 14 | 
 15 | - Go to project directory :
 16 |   
 17 |    `cd split-rs-data`
 18 | 
 19 | - Copy and paste your raster (.tif) and vector (.shp) files into separate folders:
 20 | 
 21 | - ```
 22 |   
 23 |   ./split-rs-data/DataSet/  # Dataset root directory
 24 |   |--raster  # Original raster data
 25 |   |  |--xxx1.tif (xx1.png)
 26 |   |  |--...
 27 |   |  └--...
 28 |   |
 29 |   |--vector # All shapefiles in the same place (.shx, .shp..etc)
 30 |   |  |--xxx1.shp
 31 |   |  |--xxx1.shx / .prj  / .cpg / .dbf ....
 32 |   |  └--xxx2.shp
 33 |   ```
 34 | 
 35 | - Build the Docker image : ```docker compose up --build ```
 36 | 
 37 | - go to http://127.0.0.1:8888/
 38 | 
 39 | - You will find your token in the CLI output of the running container.
 40 |   
 41 |   ![](https://github.com/Youssef-Harby/Remote-sensing-building-extraction-to-3D-model-using-Paddle-and-Grasshopper/blob/main/md_images/jnb-token.png?raw=true)
 42 | 
 43 | - Open Tutorial.ipynb to learn
 44 | 
 45 | - Or define your vector and raster folders in the multi_raster_vector.py file and run it in Docker by opening a CLI and typing:
 46 |   
 47 |   `python multi_raster_vector.py`
 48 | 
 49 | ## TODO
 50 | 
 51 | - [x] Creating Docker Image for development env.
 52 | - [x] Splitting raster data into equal pieces with [rasterio](https://github.com/rasterio/rasterio) (512×512) thanks to [@geoyee](https://github.com/geoyee).
 53 | - [x] Splitting raster data into equal pieces with [GDAL](https://github.com/OSGeo/gdal) , https://gdal.org/.
 54 | - [x] Rasterize shapefile to raster in the same satellite pixel size and projection.
 55 | - [x] Convert 24 or 16 bit raster to 8 bit.
 56 | - [x] Export as jpg (for raster) and png (for rasterized shapefile) with GDAL.
 57 | - [X] Validation of training and testing datasets for paddlepaddle.
 58 | - [ ] GUI
 59 | - [X] QGIS Plugin ➡️ [Deep Learning Datasets Maker](https://github.com/deepbands/deep-learning-datasets-maker/)
 60 | 
 61 |  ![](/docs/images/vQ9YMn.gif)
 62 | 
 63 | 
 64 | 
 65 | ## Code In Detail ⬇️
 66 | 
 67 | 
 68 | 
 69 | ## First - Preparing Datasets
 70 | 
 71 | # 1.Convert Vector to Raster (Rasterize) with reference coordinate system from raster tiff
 72 | 
 73 | All these tools are made to prepare data for PaddlePaddle.
 74 | 
 75 | ```python
 76 | from osgeo import gdal, ogr
 77 | ```
 78 | 
 79 | - fn\_ras = Input raster data (GTiff)
 80 | - fn\_vec = input vector data (Shapefile)
 81 | 
 82 | ```python
 83 | fn_ras = 'DataSet/raster/01/01.tif'
 84 | fn_vec = 'DataSet/vector/01/01.shp'
 85 | output = 'DataSet/results/lab_all_values.tif'
 86 | ```
 87 | 
 88 | import the GDAL driver "ESRI Shapefile" to open the shapefile
 89 | 
 90 | 
 91 | ```python
 92 | driver = ogr.GetDriverByName("ESRI Shapefile")
 93 | ```
 94 | 
 95 | open raster and shapefile datasets with (shapefile , 1)
 96 | 
 97 | - (shapefile , 1) read and write access to the shapefile
 98 | - (shapefile , 0) read-only access to the shapefile
 99 | 
100 | ```python
101 | ras_ds = gdal.Open(fn_ras)
102 | vec_ds = driver.Open(fn_vec, 1)
103 | ```
104 | 
105 | Get the :
106 | 
107 | - GetLayer (shapefiles have only one layer; other formats may have
108 |   multiple layers) \#VECTOR
109 | - GetGeoTransform \#FROM RASTER
110 | - GetProjection \#FROM RASTER
111 | 
112 | ```python
113 | lyr = vec_ds.GetLayer()
114 | geot = ras_ds.GetGeoTransform()
115 | proj = ras_ds.GetProjection() # Get the projection from original tiff (fn_ras)
116 | geot
117 | ```
118 | 
119 |     (342940.8074133941,
120 |      0.18114600000000536,
121 |      0.0,
122 |      3325329.401211367,
123 |      0.0,
124 |      -0.1811459999999247)
125 | 
126 | Open the shapefile feature to edit in it
127 | 
128 | 
129 | ```python
130 | layerdefinition = lyr.GetLayerDefn()
131 | feature = ogr.Feature(layerdefinition)
132 | ```
133 | 
134 | `feature.GetFieldIndex` returns the index of the specific field name
135 | you want to read/edit/delete
136 | 
137 | - Also you can list all fields on the shapefile by :
138 | 
139 | <!-- end list -->
140 | 
141 |     schema = []
142 |     for n in range(layerdefinition.GetFieldCount()):
143 |         fdefn = layerdefinition.GetFieldDefn(n)
144 |         schema.append(fdefn.name)
145 | 
146 | - Then delete the field called "MLDS" (a field name chosen for this tutorial) if it already exists
147 | 
148 | ```python
149 | yy = feature.GetFieldIndex("MLDS")
150 | if yy < 0:
151 |     print("MLDS field not found, we will create one for you and make all values to 1")
152 | else:
153 |     lyr.DeleteField(yy)
154 | ```
155 | 
156 | Add a new field to the shapefile with a default value of `1`, and don't
157 | forget to release each feature after the edits
158 | 
159 | ```python
160 | new_field = ogr.FieldDefn("MLDS", ogr.OFTInteger)
161 | lyr.CreateField(new_field)
162 | for feature in lyr:
163 |         feature.SetField("MLDS", 1)
164 |         lyr.SetFeature(feature)
165 |         feature = None
166 | ```
167 | 
168 | Set the projection from original tiff (fn\_ras) to the rasterized tiff
169 | 
170 | ```python
171 | drv_tiff = gdal.GetDriverByName("GTiff")
172 | chn_ras_ds = drv_tiff.Create(
173 |         output, ras_ds.RasterXSize, ras_ds.RasterYSize, 1, gdal.GDT_Byte)
174 | chn_ras_ds.SetGeoTransform(geot)
175 | chn_ras_ds.SetProjection(proj)
176 | chn_ras_ds.FlushCache()
177 | ```
178 | 
179 | ```python
180 | gdal.RasterizeLayer(chn_ras_ds, [1], lyr, burn_values=[1], options=["ATTRIBUTE=MLDS"])
181 | chn_ras_ds = None
182 | vec_ds = None
183 | ```
184 | 
185 | DONE
186 | 
187 | 
188 | ## Second - Splitting raster and rasterized files into small tiles (512×512, depending on your memory)
189 | 
190 | 
191 | ```python
192 | ds = gdal.Open(fn_ras)
193 | gt = ds.GetGeoTransform()
194 | ```
195 | 
196 | Get the coordinates of the upper-left corner and the pixel resolution
197 | 
198 | ```python
199 | xmin = gt[0]
200 | ymax = gt[3]
201 | resx = gt[1]
202 | res_y = gt[5]
203 | resy = abs(res_y)
204 | ```
205 | 
206 | ```python
207 | import math
208 | import os.path as osp
209 | ```
210 | 
211 | The desired tile size (use 256×256 if memory is limited)
212 | 
213 | 
214 | ```python
215 | needed_out_x = 512
216 | needed_out_y = 512
217 | ```
218 | 
219 | round up to the nearest int
220 | 
221 | ```python
222 | xnotround = ds.RasterXSize / needed_out_x
223 | xround = math.ceil(xnotround)
224 | ynotround = ds.RasterYSize / needed_out_y
225 | yround = math.ceil(ynotround)
226 | 
227 | print(xnotround)
228 | print(xround)
229 | print(ynotround)
230 | print(yround)
231 | ```
232 | 
233 |     9.30078125
234 |     10
235 |     5.689453125
236 |     6
237 | 
238 | pixel to meter - 512×10×0.18
239 | 
240 | ```python
241 | pixtomX = needed_out_x * xround * resx
242 | pixtomy = needed_out_y * yround * resy
243 | 
244 | print (pixtomX)
245 | print (pixtomy)
246 | ```
247 | 
248 |     927.4675200000274
249 |     556.4805119997686
250 | 
251 | size of a single tile
252 | 
253 | ```python
254 | xsize = pixtomX / xround
255 | ysize = pixtomy / yround
256 | 
257 | print (xsize)
258 | print (ysize)
259 | ```
260 | 
261 |     92.74675200000274
262 |     92.74675199996143
263 | 
264 | create lists of x and y coordinates
265 | 
266 | 
267 | ```python
268 | xsteps = [xmin + xsize * i for i in range(xround + 1)]
269 | ysteps = [ymax - ysize * i for i in range(yround + 1)]
270 | xsteps
271 | ```
272 | 
273 |     [342940.8074133941,
274 |      343033.5541653941,
275 |      343126.3009173941,
276 |      343219.0476693941,
277 |      343311.7944213941,
278 |      343404.54117339413,
279 |      343497.28792539414,
280 |      343590.03467739414,
281 |      343682.78142939415,
282 |      343775.5281813941,
283 |      343868.2749333941]
284 | 
285 | set the output path
286 | 
287 | ```python
288 | cdpath = "DataSet/image/"
289 | ```
290 | 
291 | loop over min and max x and y coordinates
292 | 
293 | ```python
294 | for i in range(xround):
295 |     for j in range(yround):
296 |         xmin = xsteps[i]
297 |         xmax = xsteps[i + 1]
298 |         ymax = ysteps[j]
299 |         ymin = ysteps[j + 1]
300 | 
301 |         # gdal translate to subset the input raster
302 | 
303 |         gdal.Translate(osp.join(cdpath,  \
304 |                         (str("01") + "-" + str(j) + "-" + str(i) + "." + "jpg")), 
305 |                 ds, 
306 |                 projWin=(abs(xmin), abs(ymax), abs(xmax), abs(ymin)),
307 |                 xRes=resx, 
308 |                 yRes=-resy, 
309 |                 outputType=gdal.gdalconst.GDT_Byte, 
310 |                 format="JPEG")
311 | ds = None
312 | ```
313 | 
314 | ### Third - Split Custom Dataset and Generate File List
315 | 
316 | For all data that is not divided into training set, validation set, and
317 | test set, PaddleSeg provides a script to generate segmented data and
318 | generate a file list.
319 | 
320 | #### Use scripts to randomly split the custom dataset proportionally and generate a file list
321 | 
322 | The data file structure is as follows:
323 | 
324 |     ./DataSet/  # Dataset root directory
325 |     |--image  # Original image catalog
326 |     |  |--xxx1.jpg (xx1.png)
327 |     |  |--...
328 |     |  └--...
329 |     |
330 |     |--label  # Annotated image catalog
331 |     |  |--xxx1.png
332 |     |  |--...
333 |     |  └--...
334 | 
335 | Among them, the corresponding file name can be defined according to
336 | needs.
337 | 
338 | The commands used are as follows, which supports enabling specific
339 | functions through different Flags.
340 | 
341 |     python tools/split_dataset_list.py <dataset_root> <images_dir_name> <labels_dir_name> ${FLAGS}
342 | 
343 | Parameters:
344 | 
345 | - dataset\_root: Dataset root directory
346 | - images\_dir\_name: Original image catalog
347 | - labels\_dir\_name: Annotated image catalog
348 | 
349 | FLAGS:
350 | 
351 | | FLAG            | Meaning                                                                                                                  | Default                                   | Parameter numbers |
352 | | --------------- | ------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------- | ----------------- |
353 | | \--split        | Dataset segmentation ratio                                                                                               | 0.7 0.3 0                                 | 3                 |
354 | | \--separator    | File list separator                                                                                                      | "\|"                                      | 1                 |
355 | | \--format       | Data format of pictures and label sets                                                                                   | "jpg" "png"                               | 2                 |
356 | | \--label\_class | Label category                                                                                                           | '\_\_background\_\_' '\_\_foreground\_\_' | several           |
357 | | \--postfix      | Filter pictures and label sets according to whether the main file name (without extension) contains the specified suffix | "" ""（2 null characters）                  | 2                 |
358 | 
359 | After running, `train.txt`, `val.txt`, `test.txt` and `labels.txt` will
360 | be generated in the root directory of the dataset.
361 | 
362 | **Note:** Requirements for generating the file list: either the original
363 | image and the number of annotated images are the same, or there is only
364 | the original image without annotated images. If the dataset lacks
365 | annotated images, a file list without separators and annotated image
366 | paths will be generated.
367 | 
368 | #### Example
369 | 
370 |     python tools/split_dataset_list.py <dataset_root> images annotations --split 0.6 0.2 0.2 --format jpg png
371 | 
372 | ## Dataset file organization
373 | 
374 | - If you need to use a custom dataset for training, it is recommended
375 |   to organize it into the following structure: a `custom_dataset` root
376 |   containing an `images/` folder (`image1.jpg`, `image2.jpg`, ...), a
377 |   `labels/` folder (`label1.png`, `label2.png`, ...), and the file lists
378 |   `train.txt`, `val.txt` and `test.txt`.
379 | 
380 | The contents of train.txt and val.txt are as follows:
381 | 
382 |     image/image1.jpg label/label1.png
383 |     image/image2.jpg label/label2.png
384 |     ...
385 | 
386 | Full Docs :
387 | <https://github.com/PaddlePaddle/PaddleSeg/blob/release/2.3/docs/data/custom/data_prepare.md>
388 | 
389 | 
390 | ```python
391 | import sys
392 | import subprocess
393 | ```
394 | 
395 | ```python
396 | theproc = subprocess.Popen([
397 | "python", 
398 | r"C:\Users\Youss\Documents\pp\New folder\split-rs-data\split_dataset_list.py", #Split text py script
399 | r"C:\Users\Youss\Documents\pp\New folder\split-rs-data\DataSet",  # Root DataSet ath
400 | r"C:\Users\Youss\Documents\pp\New folder\split-rs-data\DataSet\image",  #images path
401 | r"C:\Users\Youss\Documents\pp\New folder\split-rs-data\DataSet\label", 
402 | # "--split", 
403 | # "0.6",  # 60% training
404 | # "0.2",  # 20% validating
405 | # "0.2",  # 20% testing
406 | "--format", 
407 | "jpg", 
408 | "png"])
409 | theproc.communicate()
410 | ```
411 | 
412 |     (None, None)
413 | 
414 | 
415 | 


--------------------------------------------------------------------------------