├── S3netCDF4 ├── __init__.py ├── CFA │ ├── Parsers │ │ ├── __init__.py │ │ ├── _CFAParser.pyx │ │ └── _CFAnetCDFParser.pyx │ ├── _CFAExceptions.pyx │ ├── __init__.py │ └── _CFASplitter.pyx ├── Managers │ ├── __init__.py │ ├── _ConnectionPool.pyx │ └── _ConfigManager.pyx ├── utils │ ├── __init__.py │ ├── split.py │ └── agg.py ├── Backends │ ├── __init__.py │ ├── _s3FileObject.pyx │ └── _s3aioFileObject.pyx └── _Exceptions.pyx ├── requirements.txt ├── pyproject.toml ├── Makefile ├── ROADMAP.md ├── config └── .s3nc.json.template ├── tutorial ├── readme.txt └── lesson_1.py ├── LICENSE ├── .gitignore ├── bin ├── s3nc_cfa_agg.py ├── s3nc_cfa_split.py ├── s3nc_cfa_mv.py └── s3nc_cfa_info.py ├── CHANGELOG.md ├── test ├── test_split.py ├── test_s3FileObject.py ├── test_s3aioFileObject.py └── test_s3Dataset.py └── setup.py /S3netCDF4/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /S3netCDF4/CFA/Parsers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /S3netCDF4/Managers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /S3netCDF4/utils/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.19.4 2 | Cython==0.29.21 3 | netCDF4==1.5.5.1 4 | botocore==1.19.20 5 | aiobotocore==1.1.2 6 | psutil==5.7.3 7 | -------------------------------------------------------------------------------- /S3netCDF4/Backends/__init__.py: -------------------------------------------------------------------------------- 1 | # need to import all the backends 2 | from ._s3FileObject import s3FileObject 3 | from ._s3aioFileObject import s3aioFileObject 4 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | # Minimum requirements for the build system to execute. 3 | requires = ["setuptools", "wheel", "Cython", "numpy"] # PEP 508 specifications. 
4 | build-backend = "setuptools.build_meta" 5 | -------------------------------------------------------------------------------- /S3netCDF4/_Exceptions.pyx: -------------------------------------------------------------------------------- 1 | #!python 2 | #cython: language_level=3 3 | 4 | __copyright__ = "(C) 2019-2021 Science and Technology Facilities Council" 5 | __license__ = "BSD - see LICENSE file in top-level directory" 6 | __authors__ = "Neil Massey" 7 | 8 | # Exception classes to indicate they come from the s3 component of the library 9 | class IOException(BaseException): 10 | pass 11 | 12 | class MemoryException(BaseException): 13 | pass 14 | 15 | class APIException(BaseException): 16 | pass 17 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # S3-netcdf-python Makefile 2 | # Simple makefile for compiling the Cython externals when developing 3 | 4 | # Setup.py will build these externals once on installation, so it is not 5 | # necessary to run this Makefile on installation for a user. 6 | # This Makefile only needs to be used when developing. 7 | 8 | all: 9 | python setup.py build_ext --inplace 10 | 11 | clean: 12 | rm -f *.so *.c 13 | rm -f ./S3netCDF4/Backends/*.so ./S3netCDF4/Backends/*.c 14 | rm -f ./S3netCDF4/CFA/Parsers/*.so ./S3netCDF4/CFA/Parsers/*.c 15 | rm -f ./S3netCDF4/CFA/*.so ./S3netCDF4/CFA/*.c 16 | rm -f ./S3netCDF4/Managers/*.so ./S3netCDF4/Managers/*.c 17 | rm -f ./S3netCDF4/*.so ./S3netCDF4/*.c 18 | -------------------------------------------------------------------------------- /S3netCDF4/CFA/_CFAExceptions.pyx: -------------------------------------------------------------------------------- 1 | #!python 2 | #cython: language_level=3 3 | 4 | __copyright__ = "(C) 2019-2021 Science and Technology Facilities Council" 5 | __license__ = "BSD - see LICENSE file in top-level directory" 6 | __authors__ = "Neil Massey" 7 | 8 | """Exceptions for the _CFAClasses""" 9 | 10 | class CFAError(BaseException): 11 | pass 12 | 13 | class CFAGroupError(CFAError): 14 | pass 15 | 16 | class CFADimensionError(CFAError): 17 | pass 18 | 19 | class CFAVariableError(CFAError): 20 | pass 21 | 22 | class CFAVariableIndexError(CFAError, IndexError): 23 | pass 24 | 25 | class CFAPartitionError(CFAError): 26 | pass 27 | 28 | class CFAPartitionIndexError(CFAError, IndexError): 29 | pass 30 | 31 | class CFASubArrayError(CFAError): 32 | pass 33 | 34 | class CFAParserError(CFAError): 35 | pass 36 | -------------------------------------------------------------------------------- /ROADMAP.md: -------------------------------------------------------------------------------- 1 | Roadmap for improvements to s3netCDF-python 2 | =========================================== 3 | 4 | 1. Improve documentation, provide more examples and tutorials 5 | 2. Add support for unequal partition sizes (completed in v2.0.5) 6 | 3. Add support for striding in slices e.g. [1:20:2] 7 | 4. Add support for streaming files greater than memory to disk / cache 8 | 5. Make more use of Cython features - add types for all variables in .pyx files 9 | 6. More unit tests and continuous integration 10 | 7. Add Compatibility with xarray and Zarr: read and write xarray / Zarr files, 11 | i.e. the master array file is an xarray JSON attributes file, and provide 12 | support for Zarr with a CFA master-array file, i.e. the chunks are Zarr but the 13 | master-array file is CFA-netCDF. 14 | 8. 
Upgrade aiobotocore to latest. v1.0+ has an API that breaks previous 15 | version. (completed in v2.0.5) 16 | 9. Add Dask support for parallel workflows. 17 | -------------------------------------------------------------------------------- /config/.s3nc.json.template: -------------------------------------------------------------------------------- 1 | { 2 | "version": "9", 3 | "hosts": { 4 | "{{ hostname0 }}": { 5 | "alias": "{{ host0_alias }}", 6 | "url": "{{ host0_url }}", 7 | "credentials": { 8 | "accessKey": "{{ host0_access_key }}", 9 | "secretKey": "{{ host0_secret_key }}" 10 | }, 11 | "backend": "s3aioFileObject", 12 | "api": "S3v4" 13 | } 14 | }, 15 | "backends": { 16 | "s3aioFileObject" : { 17 | "maximum_part_size": "50MB", 18 | "maximum_parts": 8, 19 | "enable_multipart_download": true, 20 | "enable_multipart_upload": true, 21 | "connect_timeout": 30.0, 22 | "read_timeout": 30.0 23 | }, 24 | "s3FileObject" : { 25 | "maximum_part_size": "50MB", 26 | "maximum_parts": 4, 27 | "enable_multipart_download": true, 28 | "enable_multipart_upload": true, 29 | "connect_timeout": 30.0, 30 | "read_timeout": 30.0 31 | 32 | } 33 | }, 34 | "cache_location" : "{{ cache_location }}", 35 | "resource_allocation" : { 36 | "memory": "{{ memory_allocation_limit }}", 37 | "filehandles": {{ filehandle_allocation_limit }} 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /S3netCDF4/CFA/Parsers/_CFAParser.pyx: -------------------------------------------------------------------------------- 1 | #!python 2 | #cython: language_level=3 3 | 4 | __copyright__ = "(C) 2020 Science and Technology Facilities Council" 5 | __license__ = "BSD - see LICENSE file in top-level directory" 6 | __authors__ = "Neil Massey" 7 | 8 | """ 9 | Collection of functions that parse files with embedded CFA metadata and 10 | return a hierarchy of objects instantiated from the _CFAClasses. 11 | See the class definitions and documentation in _CFAClasses.pyx for this 12 | hierarchy. 13 | 14 | See: 15 | http://www.met.reading.ac.uk/~david/cfa/0.4/index.html 16 | for the specification of the CFA conventions. 17 | 18 | s3netCDF-python uses an updated version (v0.5) of the CFA conventions which, 19 | rather than writing the partition information to a netCDF attribute as a 20 | string, writes the partition information to variables inside a group. 21 | """ 22 | 23 | class CFA_Parser(object): 24 | """Base class for CFA Parser - pure abstract so raise an exception.""" 25 | def __init__(self): 26 | raise NotImplementedError 27 | 28 | def read(self, input_object): 29 | raise NotImplementedError 30 | 31 | def write(self, cfa_dataset, output_object): 32 | raise NotImplementedError 33 | 34 | def is_file(self, input_object): 35 | raise NotImplementedError 36 | -------------------------------------------------------------------------------- /tutorial/readme.txt: -------------------------------------------------------------------------------- 1 | S3-netCDF-python tutorials for JASMIN users 2 | =========================================== 3 | 4 | Setup 5 | ----- 6 | To access these tutorials you will need access to the cedadev-o Caringo tenancy. 
7 | Please see the following webpage to set up the account: 8 | 9 | https://help.jasmin.ac.uk/article/4847-using-the-jasmin-object-store 10 | 11 | > module load jaspy 12 | > create a venv 13 | > pip install -e git+https://github.com/cedadev/S3-netcdf-python.git@version2 14 | 15 | Config 16 | ------ 17 | You will need to create a configuration file in your home directory: 18 | Using nano text editor: 19 | 20 | > nano ~/.s3nc.json 21 | 22 | Copy this text into the file opened in nano: 23 | { 24 | "version": "9", 25 | "hosts": { 26 | "s3://cedadev-o": { 27 | "alias": "cedadev-o", 28 | "url": "http://cedadev-o.s3.jc.rl.ac.uk", 29 | "credentials": { 30 | "accessKey": "access_key", 31 | "secretKey": "secret_key" 32 | }, 33 | "backend": "s3aioFileObject", 34 | "api": "S3v4" 35 | } 36 | }, 37 | "cache_location": "~/.cache" 38 | } 39 | 40 | replace "access_key" and "secret_key" with the credentials you got from the Caringo 41 | Swarm portal. 42 | 43 | Contents 44 | -------- 45 | Tutorial_1 - Read data from a CMIP6 file 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2019-2021, Centre of Environmental Data Analysis Developers, 4 | Scientific and Technical Facilities Council (STFC), 5 | UK Research and Innovation (UKRI). 6 | All rights reserved. 7 | 8 | Redistribution and use in source and binary forms, with or without 9 | modification, are permitted provided that the following conditions are met: 10 | 11 | * Redistributions of source code must retain the above copyright notice, this 12 | list of conditions and the following disclaimer. 13 | 14 | * Redistributions in binary form must reproduce the above copyright notice, 15 | this list of conditions and the following disclaimer in the documentation 16 | and/or other materials provided with the distribution. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | -------------------------------------------------------------------------------- /tutorial/lesson_1.py: -------------------------------------------------------------------------------- 1 | __copyright__ = "(C) 2019-2021 Science and Technology Facilities Council" 2 | __license__ = "BSD - see LICENSE file in top-level directory" 3 | __authors__ = "Neil Massey" 4 | 5 | # s3-netCDF-python Tutorial 1 6 | # Purpose: Read a time series from a CMIP6 dataset 7 | # Author : Neil Massey 8 | # Date : 12/05/2020 9 | 10 | from S3netCDF4._s3netCDF4 import s3Dataset as Dataset 11 | import numpy as np 12 | 13 | # Dataset (Master Array File) location, this is on the Caringo object store, 14 | # using the alias defined in the config file in the user's home directory: 15 | # ~/.s3nc.json 16 | data_location = "s3://cedadev-o/cmip6/CMIP/MOHC/HadGEM3-GC31-MM/historical/r1i1p1f3/day/tas/gn/tas_day_HadGEM3-GC31-MM_historical_r1i1p1f3_gn.nc" 17 | var_name = "tas" 18 | 19 | # We open the Master Array File just like opening a netCDF Dataset 20 | s3_ds = Dataset(data_location, 'r') 21 | 22 | # We can inspect the dataset by printing it, just like in netcdf4-python 23 | print("CFA DATASET: ", s3_ds) 24 | 25 | # We can also examine the variables in the Dataset 26 | print("VARIABLES: ", s3_ds.variables) 27 | 28 | # and the groups in the Dataset 29 | print("GROUPS: ", s3_ds.groups) 30 | 31 | # We can then get a variable from the Dataset 32 | var = s3_ds.variables[var_name] 33 | # and inspect it 34 | print("TAS: ", var) 35 | 36 | # we can get a timeseries for one year timeseries by slicing the variable: 37 | # this will return a numpy array 38 | var_data = var[:360, 45, 45] 39 | 40 | # calculate seasonal means 41 | #print(np.mean(var_data[0:90]), np.mean(var_data[90:180]), np.mean(var_data[180:240]), np.mean(var_data[240:360])) 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.c 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # Jupyter Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # dotenv 84 | .env 85 | 86 | # virtualenv 87 | .venv 88 | venv/ 89 | ENV/ 90 | 91 | # Spyder project settings 92 | .spyderproject 93 | .spyproject 94 | 95 | # Rope project settings 96 | .ropeproject 97 | 98 | # mkdocs documentation 99 | /site 100 | 101 | # mypy 102 | .mypy_cache/ 103 | 104 | archive/ 105 | .idea/ 106 | 107 | .s3config.json 108 | -------------------------------------------------------------------------------- /bin/s3nc_cfa_agg.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | __copyright__ = "(C) 2019-2021 Science and Technology Facilities Council" 4 | __license__ = "BSD - see LICENSE file in top-level directory" 5 | __authors__ = "Neil Massey" 6 | 7 | """Program to aggregate netCDF-CFA files from disk or s3. 8 | This program will produce a master array file, containing references to the 9 | files that have been aggregated. 10 | """ 11 | 12 | import argparse 13 | from S3netCDF4.utils.agg import aggregate_into_CFA 14 | 15 | if __name__ == "__main__": 16 | # set up and parse the arguments 17 | parser = argparse.ArgumentParser( 18 | prog="s3nc_cfa_agg", 19 | formatter_class=argparse.RawTextHelpFormatter, 20 | description=( 21 | "Aggregate a number of netCDF files into a CFA-netCDF " 22 | "master-array file." 23 | ) 24 | ) 25 | 26 | parser.add_argument( 27 | "output", action="store", default="", metavar="", 28 | help=( 29 | "Path of the output master-array file." 30 | ) 31 | ) 32 | 33 | parser.add_argument( 34 | "dir", action="store", default="", metavar="", 35 | help=( 36 | "Path of a directory containing netCDF files to aggregate into a " 37 | "CFA-netCDF master-array file." 38 | ) 39 | ) 40 | 41 | parser.add_argument( 42 | "--cfa_version", action="store", default="0.5", 43 | help=("Version of CFA conventions to use, 0.4|0.5") 44 | ) 45 | 46 | parser.add_argument( 47 | "--axis", action="store", default="time", 48 | help=("Axis to aggregate along, default=time") 49 | ) 50 | 51 | parser.add_argument( 52 | "--common_date", action="store", default=None, 53 | help=("Common start time across all files") 54 | ) 55 | 56 | args = parser.parse_args() 57 | 58 | if args.output and args.dir: 59 | aggregate_into_CFA(args.output, 60 | args.dir, 61 | args.axis, 62 | args.cfa_version, 63 | args.common_date) 64 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | S3netCDF-python 2 | --------------- 3 | 4 | Changes between v2.0.12 and v2.1-rc1 5 | ------------------------------------ 6 | 1. Tidied LICENSE information 7 | 8 | Changes between v2.0.8 and v2.0.12 9 | ---------------------------------- 10 | 1. 
Added a s3_nc_cfa_split.py program to split a large netCDF file into smaller netCDF subarray files and produce a CFA-netCDF master array file. 11 | 2. Bugfixes to the splitter. 12 | 13 | Changes between v2.0.6 and v2.0.8 14 | --------------------------------- 15 | 1. Bug fix for indexing. 16 | 2. Changed the name of the template config file and the name in the README.md file to match the code. 17 | 3. Allow an environment variable "S3_NC_CONFIG" to be set to point to the config file. 18 | 4. Fixed bug in previous file that prevented writing CFA sub-array files (!) 19 | 20 | Changes between v2.0.5 and v2.0.6 21 | --------------------------------- 22 | 1. Update the s3_nc_cfa_agg.py program so that it is compatible with more models and Datasets in CMIP6. This relates mostly to the way the time dimension is recorded, and the calendar type. 23 | 2. Changed the way that the indexing for unequal sized partitions is calculated. It is now (potentially) slower, but more robust. 24 | 25 | Changes between v2.0.4 and v2.0.5 26 | --------------------------------- 27 | 1. Added support for reading unequal sized partitions. These may occur in files written by the s3_nc_cfa_agg.py program. 28 | 29 | Changes between v2.0.3 and v2.0.4 30 | --------------------------------- 31 | 1. s3nc_cfa_agg.py now uses FileManager.request_file rather than FileManager._open. More elegant and API focused. 32 | 2. FileManager.request_file is now compatible with passing globs into it as the filename parameter. 33 | 34 | Changes between v2.0.2 and v2.0.3 35 | --------------------------------- 36 | 1. Fixed a problem where a BytesIO buffer was being passed by reference rather than copied, leading to a "file operation on unopened file" error. 37 | 2. Corrected install procedure in README. 38 | 3. Corrected bug in test_s3Dataset_read. 39 | 40 | Changes between v2.0.1 and v2.0.2 41 | --------------------------------- 42 | 1. Fixed unreleased file for Datasets on disk 43 | 2. Fixed incorrect parsing for CFA 0.4 44 | 45 | Changes between v0.2 and v2.0.1: 46 | -------------------------------- 47 | 1. complete rewrite 48 | 2. v0.5 CFA 49 | 3. partition matrix represented internally by netCDF Dataset 50 | 4. user can supply sub array size when creating variable 51 | 5. cacheless operation, except for read of very large files 52 | 6. intelligent memory handling 53 | 7. excellent sparse-array handling 54 | 8. complete compliance with netCDF4 API interface 55 | -------------------------------------------------------------------------------- /bin/s3nc_cfa_split.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | __copyright__ = "(C) 2019-2021 Science and Technology Facilities Council" 4 | __license__ = "BSD - see LICENSE file in top-level directory" 5 | __authors__ = "Neil Massey" 6 | 7 | """Program to split a netCDF file into a netCDF-CFA master file and a number 8 | of netCDF sub array files. 9 | """ 10 | import argparse 11 | 12 | from S3netCDF4.utils.split import split_into_CFA 13 | 14 | if __name__ == "__main__": 15 | # set up and parse the arguments 16 | parser = argparse.ArgumentParser( 17 | prog="s3nc_cfa_split", 18 | formatter_class=argparse.RawTextHelpFormatter, 19 | description=( 20 | "Split a netCDF file into a netCDF-CFA master file and a number" 21 | "of netCDF sub array files." 22 | ) 23 | ) 24 | 25 | parser.add_argument( 26 | "output", action="store", default="", metavar="", 27 | help=( 28 | "Path of the output CFA-netCDF master-array file." 
29 | ) 30 | ) 31 | 32 | parser.add_argument( 33 | "input", action="store", default="", metavar="", 34 | help=( 35 | "Path of the input netCDF file" 36 | ) 37 | ) 38 | 39 | parser.add_argument( 40 | "--subarray_path", action="store", default="", 41 | metavar="", 42 | help=( 43 | "Common path of the output sub array files (optional). Without " 44 | "this argument, the output will be in a directory below the path of" 45 | " the output netCDF-CFA master array file." 46 | ) 47 | ) 48 | 49 | parser.add_argument( 50 | "--subarray_shape", action="store", default=[], 51 | metavar="", 52 | help=( 53 | "Shape for the subarray files (optional). Without this argument, " 54 | "the shape will be automatically determined." 55 | ) 56 | ) 57 | 58 | parser.add_argument( 59 | "--subarray_size", action="store", default=50*1024*1024, 60 | metavar="", 61 | help=( 62 | "Size for the subarray files (optional). With this argument, the " 63 | "shape will be automatically determined, with this target size. " 64 | "The units for the size is , not " 65 | "any magnitude of bytes." 66 | ) 67 | ) 68 | 69 | parser.add_argument( 70 | "--cfa_version", action="store", default="0.5", 71 | help=("Version of CFA conventions to use, 0.4|0.5") 72 | ) 73 | 74 | args = parser.parse_args() 75 | 76 | if args.output and args.input: 77 | split_into_CFA(args.output, args.input, 78 | args.subarray_path, 79 | args.subarray_shape, 80 | int(args.subarray_size), 81 | args.cfa_version) 82 | -------------------------------------------------------------------------------- /test/test_split.py: -------------------------------------------------------------------------------- 1 | 2 | import unittest, os 3 | 4 | from S3netCDF4.utils.split import split_into_CFA 5 | from S3netCDF4.utils.agg import aggregate_into_CFA 6 | from S3netCDF4.CFA._CFAClasses import CFADataset 7 | from S3netCDF4._s3netCDF4 import s3Dataset 8 | 9 | TESTFILE = "/Users/dhk63261/Archive/cmip5/ta_Amon_HadCM3_rcp45_r10i1p1_203101-203512.nc" 10 | 11 | 12 | def nca_equivalence(ncfile1, ncfile2, variable='ta'): 13 | """ Do these two files describe the same content?""" 14 | # Let's start by comparing a few important things 15 | 16 | x = s3Dataset(ncfile1) 17 | y = s3Dataset(ncfile2) 18 | 19 | # First let's just check a data record 20 | xx = x.variables[variable] 21 | yy = y.variables[variable] 22 | 23 | assert (xx.shape == yy.shape).all(), "CFA data arrays are not the same shape" 24 | 25 | assert len(xx.shape) == 4, "Unexpected variable shape for comparison" 26 | 27 | xx = xx[:, 0, 0, 0].flatten() 28 | yy = yy[:, 0, 0, 0].flatten() 29 | 30 | # We don't do all data coz it would take a long time 31 | assert (xx == yy).all(), "Data in arrays does not match" 32 | 33 | x.close() 34 | y.close() 35 | # now check file headers 36 | 37 | raise NotImplementedError("This doesn't mean the test has failed, just the test code is not finished") 38 | 39 | #return statement needed 40 | 41 | class TestSplit(unittest.TestCase): 42 | """ All the necessary splitter tests""" 43 | 44 | def setUp(self): 45 | self.ncafile1 = '/Users/dhk63261/Archive/things1.nca' 46 | self.ncapath = '/Users/dhk63261/Archive/things1/things1.ta.*' 47 | self.ncafile2 = '/Users/dhk63261/Archive/things2.nca' 48 | 49 | def _split_and_aggregate(self, cfa1, cfa2): 50 | # for now use real disk ... 
51 | input = TESTFILE 52 | subarray_size = 50 * 1024 * 1024 53 | subarray_path = "" 54 | subarray_shape = "[1, 17, 73, 96]" 55 | 56 | split_into_CFA(self.ncafile1, input, 57 | subarray_path, 58 | subarray_shape, 59 | int(subarray_size), 60 | cfa1) 61 | 62 | axis = 'time' 63 | common_date = None 64 | 65 | aggregate_into_CFA(self.ncafile2, 66 | self.ncapath, 67 | axis, 68 | cfa2, 69 | common_date) 70 | 71 | def test_data_available(self): 72 | """ Test there is an input dataset available.""" 73 | assert os.path.exists(TESTFILE) 74 | 75 | def test_file_handles(self): 76 | """ Test we can open a file for write without fully qualifying it's name. 77 | See issue:24 """ 78 | raise NotImplementedError 79 | 80 | def test_auto_split_and_agg_round_trip1(self): 81 | """ Test the sensible split and aggregate 82 | with both at CFA 0.4 """ 83 | 84 | self._split_and_aggregate("0.4", "0.4") 85 | 86 | self.assertTrue(nca_equivalence(self.ncafile1, self.ncafile2)) 87 | 88 | def test_auto_split_and_agg_round_trip2(self): 89 | """ Test the sensible split and aggregate 90 | with different CFA versions """ 91 | 92 | self._split_and_aggregate("0.4", "0.5") 93 | 94 | self.assertTrue(nca_equivalence(self.ncafile1, self.ncafile2)) 95 | 96 | if __name__ == '__main__': 97 | unittest.main() 98 | -------------------------------------------------------------------------------- /S3netCDF4/Managers/_ConnectionPool.pyx: -------------------------------------------------------------------------------- 1 | #!python 2 | #cython: language_level=3 3 | 4 | __copyright__ = "(C) 2019-2021 Science and Technology Facilities Council" 5 | __license__ = "BSD - see LICENSE file in top-level directory" 6 | __authors__ = "Neil Massey" 7 | 8 | """ 9 | A very simple connection pool for S3netCDF. This allows connections to be 10 | maintained to (for example) a AWS or object store. The pool allows for the 11 | following behaviour: 12 | o. The backend File Object makes a request for a connection. The pool either 13 | returns a connection or None, if no connections are available or if all 14 | available connections are locked 15 | o. If None is returned, the backend is expected to make a connection and add it 16 | to the pool 17 | o. When connections are added they are locked and they can later be released so 18 | that they can be reused without having to re-establish the connection. 19 | o. When a connection is closed it is removed from the pool. 20 | """ 21 | from S3netCDF4._Exceptions import APIException 22 | 23 | class ConnectionObject(object): 24 | """A small class to hold connection information.""" 25 | def __init__(self, conn=None, uri="", available=False): 26 | self.conn = conn 27 | self.uri = uri 28 | self.conn_refs = 0 29 | 30 | def __str__(self): 31 | return "{} : ({})".format(self.uri, self.conn_refs) 32 | 33 | class ConnectionPool(object): 34 | """Connection pool for S3 netCDF. Stores connections to external storage in 35 | a pool, and keeps track of how many connections have been made to them. 36 | This maintains connections to servers to enhance performance by not 37 | incurring the time penalty of establishing a connection 38 | """ 39 | 40 | def __init__(self): 41 | self._connection_pool = {} 42 | 43 | def add(self, conn, conn_uri): 44 | """Add a connection to the ConnectionPool. 45 | Args: 46 | conn : the connection, e.g. a botocore client 47 | conn_uri: the uri of the connection, e.g. 
URL address 48 | Returns: 49 | None 50 | """ 51 | # Use the conn_uri as the key to the dictionary 52 | # If the conn_uri already exists in the connection pool then increase 53 | # the reference count 54 | # If it doesn't then create the connection with a reference count of 55 | # zero 56 | if conn_uri in self._connection_pool: 57 | conn_obj = self._connection_pool[conn_uri] 58 | conn_obj.conn_refs += 1 59 | else: 60 | conn_obj = ConnectionObject(conn, conn_uri) 61 | conn_obj.conn_refs = 1 62 | self._connection_pool[conn_uri] = conn_obj 63 | return conn_obj 64 | 65 | def get(self, conn_uri): 66 | """Get a connection from the ConnectionPool. 67 | Args: 68 | conn_uri: the uri of the connection, e.g. URL address 69 | Returns: 70 | ConnectionObject | None 71 | """ 72 | # Use the conn_uri to the dictionary to try to find a free connection 73 | if conn_uri in self._connection_pool: 74 | conn_obj = self._connection_pool[conn_uri] 75 | conn_obj.conn_refs += 1 76 | return conn_obj 77 | 78 | return None 79 | 80 | def release(self, conn_obj): 81 | """Release the connection for the connection uri. 82 | Args: 83 | conn : the ConnectionObject created in add""" 84 | if not conn_obj.uri in self._connection_pool: 85 | raise APIException( 86 | "Connection is not in the connection pool {}".format( 87 | conn_obj.uri 88 | ) 89 | ) 90 | conn_obj.conn_refs -= 1 91 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | __copyright__ = "(C) 2019-2021 Science and Technology Facilities Council" 2 | __license__ = "BSD - see LICENSE file in top-level directory" 3 | __authors__ = "Neil Massey" 4 | 5 | import os 6 | from setuptools import Extension, setup 7 | from Cython.Build import cythonize 8 | s3nc_define_macros = [( 9 | "NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION" 10 | )] 11 | import numpy 12 | 13 | with open(os.path.join(os.path.dirname(__file__), 'README.md')) as readme: 14 | README = readme.read() 15 | 16 | # allow setup.py to be run from any path 17 | os.chdir(os.path.normpath(os.path.join(os.path.abspath(__file__), os.pardir))) 18 | 19 | s3nc_extra_compile_args = ['-fno-strict-aliasing', '-O3'] 20 | 21 | extensions = [ 22 | Extension( 23 | name="S3netCDF4.Backends._s3aioFileObject", 24 | sources=["S3netCDF4/Backends/_s3aioFileObject.pyx"], 25 | define_macros=s3nc_define_macros, 26 | extra_compile_args=s3nc_extra_compile_args, 27 | include_dirs=[numpy.get_include()], 28 | inplace=True 29 | ), 30 | Extension( 31 | name="S3netCDF4.Backends._s3FileObject", 32 | sources=["S3netCDF4/Backends/_s3FileObject.pyx"], 33 | define_macros=s3nc_define_macros, 34 | extra_compile_args=s3nc_extra_compile_args, 35 | include_dirs=[numpy.get_include()], 36 | ), 37 | Extension( 38 | name="S3netCDF4.CFA._CFAClasses", 39 | sources=["S3netCDF4/CFA/_CFAClasses.pyx"], 40 | define_macros=s3nc_define_macros, 41 | extra_compile_args=s3nc_extra_compile_args, 42 | include_dirs=[numpy.get_include()], 43 | ), 44 | Extension( 45 | name="S3netCDF4.CFA._CFAExceptions", 46 | sources=["S3netCDF4/CFA/_CFAExceptions.pyx"], 47 | define_macros=s3nc_define_macros, 48 | extra_compile_args=s3nc_extra_compile_args, 49 | include_dirs=[numpy.get_include()], 50 | ), 51 | Extension( 52 | name="S3netCDF4.CFA._CFASplitter", 53 | sources=["S3netCDF4/CFA/_CFASplitter.pyx"], 54 | define_macros=s3nc_define_macros, 55 | extra_compile_args=s3nc_extra_compile_args, 56 | include_dirs=[numpy.get_include()], 57 | ), 58 | Extension( 59 | 
name="S3netCDF4.CFA.Parsers._CFAnetCDFParser", 60 | sources=["S3netCDF4/CFA/Parsers/_CFAnetCDFParser.pyx"], 61 | define_macros=s3nc_define_macros, 62 | extra_compile_args=s3nc_extra_compile_args, 63 | include_dirs=[numpy.get_include()], 64 | ), 65 | Extension( 66 | name="S3netCDF4.CFA.Parsers._CFAParser", 67 | sources=["S3netCDF4/CFA/Parsers/_CFAParser.pyx"], 68 | define_macros=s3nc_define_macros, 69 | extra_compile_args=s3nc_extra_compile_args, 70 | include_dirs=[numpy.get_include()], 71 | ), 72 | Extension( 73 | name="S3netCDF4.Managers._ConfigManager", 74 | sources=["S3netCDF4/Managers/_ConfigManager.pyx"], 75 | define_macros=s3nc_define_macros, 76 | extra_compile_args=s3nc_extra_compile_args, 77 | include_dirs=[numpy.get_include()], 78 | ), 79 | Extension( 80 | name="S3netCDF4.Managers._ConnectionPool", 81 | sources=["S3netCDF4/Managers/_ConnectionPool.pyx"], 82 | define_macros=s3nc_define_macros, 83 | extra_compile_args=s3nc_extra_compile_args, 84 | include_dirs=[numpy.get_include()], 85 | ), 86 | Extension( 87 | name="S3netCDF4.Managers._FileManager", 88 | sources=["S3netCDF4/Managers/_FileManager.pyx"], 89 | define_macros=s3nc_define_macros, 90 | extra_compile_args=s3nc_extra_compile_args, 91 | include_dirs=[numpy.get_include()], 92 | ), 93 | Extension( 94 | name="S3netCDF4._Exceptions", 95 | sources=["S3netCDF4/_Exceptions.pyx"], 96 | define_macros=s3nc_define_macros, 97 | extra_compile_args=s3nc_extra_compile_args, 98 | include_dirs=[numpy.get_include()], 99 | ), 100 | Extension( 101 | name="S3netCDF4._s3netCDF4", 102 | sources=["S3netCDF4/_s3netCDF4.pyx"], 103 | define_macros=s3nc_define_macros, 104 | extra_compile_args=s3nc_extra_compile_args, 105 | include_dirs=[numpy.get_include()], 106 | ), 107 | ] 108 | 109 | setup( 110 | name='S3netCDF4', 111 | version='2.1-rc1', 112 | packages=['S3netCDF4'], 113 | install_requires=[ 114 | 'numpy>=1.19.0', 115 | 'cython', 116 | 'netcdf4', 117 | 'botocore', 118 | 'aiobotocore', 119 | 'psutil', 120 | ], 121 | ext_modules=cythonize(extensions), 122 | zip_safe=False, 123 | include_package_data=True, 124 | license='my License', # example license 125 | description='A library to facilitate the storage of netCDF files on ObjectStores in an efficient manner.', 126 | long_description=README, 127 | long_description_content_type="text/markdown", 128 | url='http://www.ceda.ac.uk/', 129 | author='Neil Massey', 130 | author_email='neil.massey@stfc.ac.uk', 131 | classifiers=[ 132 | 'Development Status :: 4 - Beta', 133 | 'Intended Audience :: Science/Research', 134 | 'License :: OSI Approved :: BSD License', # example license 135 | 'Topic :: Software Development :: Libraries :: Python Modules', 136 | 'Topic :: System :: Archiving :: Compression', 137 | 'Operating System :: OS Independent', 138 | 'Programming Language :: Python', 139 | 'Programming Language :: Python :: 3', 140 | 'Programming Language :: Python :: 3.7', 141 | 'Programming Language :: Python :: 3.8', 142 | 'Programming Language :: Python :: 3.9', 143 | ] 144 | ) 145 | -------------------------------------------------------------------------------- /S3netCDF4/Managers/_ConfigManager.pyx: -------------------------------------------------------------------------------- 1 | #!python 2 | #cython: language_level=3 3 | 4 | __copyright__ = "(C) 2019-2021 Science and Technology Facilities Council" 5 | __license__ = "BSD - see LICENSE file in top-level directory" 6 | __authors__ = "Neil Massey" 7 | 8 | """ 9 | Configuration management for S3netCDF. 
Configuration is stored for each user 10 | in a JSON file in their home directory: ~/.s3nc.json 11 | """ 12 | 13 | import os 14 | import json 15 | import psutil 16 | import resource 17 | from .._Exceptions import IOException, APIException 18 | 19 | COMPATIBLE_VERSIONS = ["9"] 20 | 21 | def convert_file_size_string(value): 22 | """Convert a string containing a file size and suffix to an integer number 23 | of bytes. 24 | value : string containing integer number and an optional suffix 25 | """ 26 | # list of file format sizes 27 | file_format_sizes = ("kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") 28 | # dictionary mapping to multiplier 29 | file_format_scale = {"B" : 1, 30 | "kB" : 1e3, 31 | "MB" : 1e6, 32 | "GB" : 1e9, 33 | "TB" : 1e12, 34 | "EB" : 1e15, 35 | "ZB" : 1e18, 36 | "YB" : 1e21} 37 | if isinstance(value, str): 38 | if value.endswith(file_format_sizes): 39 | suffix = value[-2:] 40 | size = int(value[:-2]) 41 | elif value[-1] == "B": 42 | suffix = "B" 43 | size = int(value[:-1]) 44 | else: 45 | suffix = "B" 46 | size = int(value) 47 | # multiply by scalar 48 | size *= file_format_scale[suffix] 49 | return size 50 | else: 51 | return value 52 | 53 | def interpret_config_file(node, keys_to_convert): 54 | """Recursively search the dictionary for keys to convert, and convert them 55 | using the convert_file_size_string function above.""" 56 | # First time entry node == dictionary 57 | for key, item in node.items(): 58 | if type(item) is dict: 59 | interpret_config_file(item, keys_to_convert) 60 | elif key in keys_to_convert: 61 | # reassign to the dictionary 62 | node[key] = convert_file_size_string(item) 63 | 64 | 65 | class Config(object): 66 | """Class to read in config file, interpret it and make the information 67 | available. 68 | """ 69 | 70 | def __init__(self): 71 | """Initialise S3netCDF4 for this user by reading the config file from 72 | their home directory. Config file is called ~/.s3nc.json""" 73 | # Read the JSON config file from the user home directory or from a path 74 | # set by the environment variable "S3_NC_CONFIG" 75 | # get user home directory 76 | user_home = os.environ["HOME"] 77 | 78 | # create the default path to the config file 79 | sl_config_default = os.path.join(user_home, ".s3nc.json") 80 | 81 | # try to get the path from the environment variable, but default to 82 | # above if environment variable not set 83 | sl_config_path = os.getenv("S3_NC_CONFIG", sl_config_default) 84 | # open the file 85 | try: 86 | fp = open(sl_config_path) 87 | # deserialize from the JSON 88 | self._sl_user_config = json.load(fp) 89 | # check the version number 90 | if ("version" not in self._sl_user_config or 91 | self._sl_user_config["version"] not in COMPATIBLE_VERSIONS): 92 | raise APIException( 93 | "User config file is not compatible with current version of" 94 | " S3netCDF4. Please update the config file at: {}".format( 95 | sl_config_path 96 | ) 97 | ) 98 | # add the filename to the config so we can refer to it in error 99 | # messages 100 | self._sl_user_config["filename"] = sl_config_path 101 | # keys to convert between text sizes and integer sizes 102 | # (e.g.) 
50MB to 50*1024*1024 103 | keys_to_convert = [ 104 | "maximum_part_size", 105 | "memory" 106 | ] 107 | # interpret the config file, converting the above keys 108 | interpret_config_file(self._sl_user_config, keys_to_convert) 109 | # close the config file 110 | fp.close() 111 | # configure some defaults if they are not in the config file 112 | # note that default configs for the backends are handled in the 113 | # constructor of the backend class, e.g. _s3aioFileObject 114 | avail_mem = psutil.virtual_memory().available 115 | fhandles = resource.getrlimit(resource.RLIMIT_NOFILE)[0] 116 | if "resource_allocation" in self._sl_user_config: 117 | if not "memory" in self._sl_user_config["resource_allocation"]: 118 | self._sl_user_config["resource_allocation"]["memory"] = avail_mem 119 | if (not "filehandles" in 120 | self._sl_user_config["resource_allocation"]): 121 | self._sl_user_config["resource_allocation"]["filehandles"] = fhandles 122 | else: 123 | self._sl_user_config["resource_allocation"] = { 124 | "memory" : avail_mem, 125 | "filehandles" : fhandles 126 | } 127 | 128 | except IOError: 129 | raise IOException( 130 | "User config file does not exist with path: {}".format( 131 | sl_config_path 132 | ) 133 | ) 134 | 135 | def __getitem__(self, name): 136 | """Get a value from the s3 config""" 137 | return self._sl_user_config[name] 138 | 139 | @property 140 | def items(self): 141 | """Return the items in the dictionary / config definition""" 142 | return self._sl_user_config.items() 143 | 144 | @items.setter 145 | def items(self, value): 146 | raise AttributeError("items cannot be altered") 147 | -------------------------------------------------------------------------------- /S3netCDF4/CFA/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Classes containing the structure of CFA-netCDF files (master array) and the 3 | CF-netcdf subarray files. 4 | See: 5 | http://www.met.reading.ac.uk/~david/cfa/0.4/index.html 6 | for the specification of the CFA conventions. 7 | 8 | Only a subset of the CFA-netCDF specification is implemented - just what we 9 | use to fragment the files to store as multiple objects on the object storage. 
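
As a rough illustration of how this hierarchy is navigated in practice, the
sketch below walks from a CFADataset down to the sub-array file referenced by
each partition. It follows the usage in bin/s3nc_cfa_mv.py and the class
summaries in the diagram further down; "s3_dataset" is assumed to be an
already-open s3Dataset and numpy is assumed to be imported as np:

    # the CFA structure is attached to an open s3Dataset
    cfa_dataset = s3_dataset._cfa_dataset
    for grp_name in cfa_dataset.getGroups():
        cfa_group = cfa_dataset.getGroup(grp_name)
        for var_name in cfa_group.getVariables():
            cfa_var = cfa_group.getVariable(var_name)
            # the partition matrix indexes every sub-array of the variable
            for index in np.ndindex(*cfa_var.getPartitionMatrixShape()):
                partition = cfa_var.getPartition(index)
                # each partition records the sub-array file it points to
                print(partition.file)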
10 | 11 | The classes here are organised to reflect the implied hierarchy in the CFA 12 | conventions : 13 | (NC = netCDF) 14 | 15 | +------------------------------------------------+ 16 | | CFADataset | 17 | +------------------------------------------------+ 18 | | format string | 19 | | metadata dict | 20 | | cfa_groups dict | 21 | +------------------------------------------------+ 22 | | bool createGroup(string grp_name)| 23 | | CFAGroup getGroup(string grp_name) | 24 | | bool renameGroup(string old_name,| 25 | | string new_name)| 26 | | list getGroups() | 27 | | dict getMetadata() | 28 | +------------------------------------------------+ 29 | | 30 | | 31 | | 32 | +------------------------------------------------+ 33 | | CFAGroup | 34 | +------------------------------------------------+ 35 | | cfa_dims dict | 36 | | grp_name string | 37 | | metadata dict | 38 | | cfa_vars dict | 39 | +------------------------------------------------+ 40 | | CFAVariable createVariable(string var_name, | 41 | | array shape, | 42 | | np.dtype dtype, | 43 | | list dim_names | 44 | | dict metadata) | 45 | | CFAVariable getVariable(string var_name) | 46 | | list getVariables() | 47 | | bool renameVariable(string old_name, | 48 | | string new_name) | 49 | | | 50 | | CFADim createDimension(string dim_name, | 51 | | int len, | 52 | | dictmetadata) | 53 | | CFADim getDimension(string dim_name) | 54 | | list getDimensions() | 55 | | bool renameDimension(string old_name, | 56 | | string new_name) | 57 | | | 58 | | string getName() | 59 | | dict getMetadata() | 60 | +------------------------------------------------+ 61 | | 62 | +--------------------------------------------------------------+ 63 | | | 64 | +------------------------------------------------+ +------------------------------------------------+ 65 | | CFAVariable | | CFADim | 66 | +------------------------------------------------+ +------------------------------------------------+ 67 | | var_name string | | dim_name string | 68 | | metadata dict | | dim_len int | 69 | | cf_role string | | metadata dict | 70 | | pmdimensions array | | axis_type string | 71 | | pmshape array | +------------------------------------------------+ 72 | | base string | | string getName() | 73 | | partitions array | | dict getMetadata() | 74 | +------------------------------------------------+ | array getIndices() | 75 | | string getName() | | string getAxisType | 76 | | dict getMetadata() | +------------------------------------------------+ 77 | | list getDimensions() | 78 | | bool parse(dict cfa_metadata) | 79 | | CFAPartition getPartition(array index) | 80 | +------------------------------------------------+ 81 | | 82 | | 83 | | 84 | +------------------------------------------------+ 85 | | CFAPartition | 86 | +------------------------------------------------+ 87 | | array index | 88 | | array location | 89 | | CFASubArray subarray | 90 | +------------------------------------------------+ 91 | | bool parse(dict cfa_metadata) | 92 | | array getIndex() | 93 | | array getLocation() | 94 | | CFASubarray getSubArray() | 95 | +------------------------------------------------+ 96 | | 97 | | 98 | | 99 | +------------------------------------------------+ 100 | | CFASubarray | 101 | +------------------------------------------------+ 102 | | ncvar string | 103 | | file string | 104 | | format string | 105 | | shape array | 106 | +------------------------------------------------+ 107 | | bool parse(dict cfa_metadata) | 108 | | string getncVar() | 109 | | string getFile() | 110 | | string 
getFormat() | 111 | | array getShape() | 112 | +------------------------------------------------+ 113 | """ 114 | -------------------------------------------------------------------------------- /test/test_s3FileObject.py: -------------------------------------------------------------------------------- 1 | from S3netCDF4.Backends._s3FileObject import s3FileObject 2 | from S3netCDF4._Exceptions import IOException, APIException 3 | import unittest 4 | import json 5 | import io 6 | 7 | """To run the tests, you need to create a .s3config.json file in the same 8 | directory as these tests. This file should contain: 9 | { 10 | "url": "", 11 | "credentials": { 12 | "accessKey": "", 13 | "secretKey": "" 14 | } 15 | } 16 | """ 17 | 18 | class s3FileObjectGeneralTest(object): 19 | """All of the general tests for either a read or write transaction.""" 20 | 21 | def tearDown(self): 22 | self.s3c.close() 23 | self.s3c_lines.close() 24 | 25 | def test_connect(self): 26 | self.assertTrue(self.s3c.connect()) 27 | 28 | def test_detach(self): 29 | self.assertTrue(self.s3c.connect()) 30 | self.assertRaises(io.UnsupportedOperation, self.s3c.detach) 31 | 32 | def test_close(self): 33 | self.assertTrue(self.s3c.connect()) 34 | self.assertTrue(self.s3c.close()) 35 | 36 | def test_readable(self): 37 | self.assertTrue(self.s3c.connect()) 38 | self.assertTrue(self.s3c.readable()) 39 | 40 | def test_truncate(self): 41 | self.assertTrue(self.s3c.connect()) 42 | self.assertRaises(io.UnsupportedOperation, self.s3c.truncate) 43 | 44 | def test_fileno(self): 45 | self.assertTrue(self.s3c.connect()) 46 | self.assertRaises(io.UnsupportedOperation, self.s3c.fileno) 47 | 48 | def test_seekable(self): 49 | self.assertTrue(self.s3c.connect()) 50 | self.assertTrue(self.s3c.seekable()) 51 | 52 | def test_tell(self): 53 | self.assertTrue(self.s3c.connect()) 54 | self.assertEqual(self.s3c.tell(), 0) 55 | 56 | def test_seek(self): 57 | self.assertTrue(self.s3c.connect()) 58 | # Three different methods for seek: 59 | # whence = io.SEEK_SET 60 | # whence = io.SEEK_CUR 61 | # whence = io.SEEK_END 62 | # the current pointer is on zero 63 | self.assertEqual(0, self.s3c.seek(0, whence=io.SEEK_SET)) 64 | self.assertEqual(10, self.s3c.seek(10, whence=io.SEEK_SET)) 65 | # now on 10 66 | with self.assertRaises(IOException) as contx: 67 | self.s3c.seek(-1, whence=io.SEEK_SET) 68 | # failed so still on 10 69 | 70 | # the current pointer is on ten (10) 71 | self.assertEqual(0, self.s3c.seek(-10, whence=io.SEEK_CUR)) 72 | # now on 0 - should raise an exception if we seek below 0 73 | with self.assertRaises(IOException): 74 | self.s3c.seek(-1, whence=io.SEEK_CUR) 75 | # still on zero: get the size to seek past it 76 | size = self.s3c._getsize() 77 | with self.assertRaises(IOException): 78 | self.s3c.seek(size+1, whence=io.SEEK_CUR) 79 | 80 | # still on zero - seek from the end 81 | with self.assertRaises(IOException): 82 | self.s3c.seek(size+1, whence=io.SEEK_END) 83 | # still on 0 - seek backwards from the end 84 | with self.assertRaises(IOException): 85 | self.s3c.seek(-1, whence=io.SEEK_END) 86 | # seek just a normal amount from the end 87 | self.assertEqual(size-10, self.s3c.seek(10, whence=io.SEEK_END)) 88 | 89 | 90 | class s3t1FileObjectWriteTest(unittest.TestCase, s3FileObjectGeneralTest): 91 | 92 | def setUp(self): 93 | """Set up the s3FileObject but don't connect.""" 94 | # load the credentials from the hidden file 95 | fh = open(".s3config.json") 96 | cfg = json.load(fh) 97 | fh.close() 98 | self.s3c = s3FileObject( 99 | cfg["url"] + 
"/buckettest/thefox1a.nc", 100 | credentials=cfg["credentials"], 101 | mode="w" 102 | ) 103 | 104 | # for writing with the write line methods 105 | self.s3c_lines = s3FileObject( 106 | cfg["url"] + "/buckettest/thefox1b.txt", 107 | credentials=cfg["credentials"], 108 | mode="w" 109 | ) 110 | 111 | def test_seek(self): 112 | with self.assertRaises(IOException): 113 | self.s3c.seek(0) 114 | 115 | def test_readable(self): 116 | self.assertTrue(self.s3c.connect()) 117 | self.assertFalse(self.s3c.readable()) 118 | 119 | def test_writable(self): 120 | self.assertTrue(self.s3c.connect()) 121 | self.assertTrue(self.s3c.writable()) 122 | 123 | def test_write(self): 124 | self.assertTrue(self.s3c.connect()) 125 | # create random bytes - if we keep it below s3c._getsize() then it will 126 | # only do one upload 127 | size = self.s3c._getsize() 128 | bytes = bytearray(size) 129 | for b in range(0, size): 130 | bytes[b] = 128 131 | self.assertNotEqual(0, self.s3c.write(bytes)) 132 | 133 | def test_write_multipart(self): 134 | self.assertTrue(self.s3c.connect()) 135 | # create random bytes - if we make it above 3c._getsize() then it will 136 | # do a multipart upload 137 | size = 3 * self.s3c._getsize() 138 | bytes = bytearray(size) 139 | for b in range(0, size): 140 | bytes[b] = 128 141 | self.assertNotEqual(0, self.s3c.write(bytes)) 142 | 143 | def test_write_lines(self): 144 | self.assertTrue(self.s3c_lines.connect()) 145 | lines = ["The","quick","brown","fox","jumped", 146 | "over","the","lazy","red","hen"] 147 | self.assertTrue(self.s3c_lines.writelines(lines)) 148 | 149 | 150 | class s3t2FileObjectReadTest(unittest.TestCase, s3FileObjectGeneralTest): 151 | 152 | def setUp(self): 153 | """Set up the s3FileObject but don't connect.""" 154 | # load the credentials from the hidden file 155 | fh = open(".s3config.json") 156 | cfg = json.load(fh) 157 | fh.close() 158 | self.s3c = s3FileObject( 159 | cfg["url"] + "/buckettest/thefox1a.nc", 160 | credentials=cfg["credentials"], 161 | mode="r" 162 | ) 163 | 164 | self.s3c_lines = s3FileObject( 165 | cfg["url"] + "/buckettest/thefox1b.txt", 166 | credentials=cfg["credentials"], 167 | mode="r" 168 | ) 169 | 170 | def test_writable(self): 171 | self.assertTrue(self.s3c.connect()) 172 | self.assertFalse(self.s3c.writable()) 173 | 174 | def testread(self): 175 | self.assertTrue(self.s3c.connect()) 176 | self.assertNotEqual(0, len(self.s3c.read())) 177 | 178 | def testreadrange(self): 179 | self.assertTrue(self.s3c.connect()) 180 | self.assertEqual(1024, len(self.s3c.read(size=1024))) 181 | self.assertNotEqual(0, len(self.s3c.read(size=1024))) 182 | 183 | def testreadinto(self): 184 | buffer = bytearray() 185 | self.assertTrue(self.s3c.connect()) 186 | self.assertNotEqual(0, self.s3c.readinto(buffer)) 187 | self.assertNotEqual(0, len(buffer)) 188 | 189 | def testreadline(self): 190 | self.assertTrue(self.s3c_lines.connect()) 191 | self.s3c_lines.seek(0) 192 | self.assertNotEqual(0, len(self.s3c_lines.readline())) 193 | self.s3c_lines.seek(0) 194 | self.assertNotEqual(0, len(self.s3c_lines.readlines())) 195 | 196 | if __name__ == '__main__': 197 | unittest.main() 198 | -------------------------------------------------------------------------------- /S3netCDF4/utils/split.py: -------------------------------------------------------------------------------- 1 | from S3netCDF4._s3netCDF4 import s3Dataset as s3Dataset 2 | from S3netCDF4.CFA._CFAExceptions import CFAError 3 | from netCDF4 import Dataset 4 | import numpy as np 5 | 6 | 7 | def copy_dims(nc_object, 
s3_object): 8 | nc_md_dims = nc_object.dimensions 9 | for d in nc_md_dims: 10 | # get the original dimension 11 | nc_dim = nc_object.dimensions[d] 12 | # create in the s3Dataset 13 | if nc_dim.isunlimited(): 14 | size = nc_dim.size 15 | else: 16 | size = nc_dim.size 17 | s3_object.createDimension(d, size) 18 | 19 | 20 | def copy_vars(nc_object, s3_object, subarray_size, subarray_shape=[]): 21 | nc_md_vars = nc_object.variables 22 | for v in nc_md_vars: 23 | # get the original variable 24 | nc_var = nc_object.variables[v] 25 | # create the variable if the sub array shape is given 26 | nc_var_md_keys = nc_var.ncattrs() 27 | if "_FillValue" in nc_var_md_keys: 28 | fill_value = nc_var.getncattr("_FillValue") 29 | else: 30 | fill_value = None 31 | # create the variable - the createVariable method needs to distinguish 32 | # between whether the shape or size has been passed in 33 | # Also, if the subarray_shape has been passed in then only attempt to 34 | # do it for variables with the same number of dimensions as the subarray 35 | # shape. 36 | use_shape = False 37 | if subarray_shape!=[]: 38 | shape_list = [int(x) for x in subarray_shape.strip("[]").split(",")] 39 | shape_array = np.array(shape_list) 40 | # check if shape_array.size is the same as the number of dims in the 41 | # netCDF variable 42 | if(shape_array.size == nc_var.ndim): 43 | use_shape = True 44 | 45 | if use_shape: 46 | # subarray shape at this moment is a string, [a,b,c,d] 47 | s3_var = s3_object.createVariable( 48 | # can only fill in endian from original dataset as 49 | # other initialisation variables are not stored in the 50 | # nc_var object 51 | nc_var.name, 52 | nc_var.dtype, 53 | endian=nc_var.endian(), 54 | fill_value=fill_value, 55 | dimensions=nc_var.dimensions, 56 | subarray_shape=shape_array) 57 | else: 58 | s3_var = s3_object.createVariable( 59 | # can only fill in endian from original dataset as 60 | # other initialisation variables are not stored in the 61 | # nc_var object 62 | nc_var.name, 63 | nc_var.dtype, 64 | endian=nc_var.endian(), 65 | fill_value=fill_value, 66 | dimensions=nc_var.dimensions, 67 | max_subarray_size=subarray_size) 68 | # copy the variable's metadata 69 | nc_var_md_keys = nc_var.ncattrs() 70 | for k in nc_var_md_keys: 71 | if k != "_FillValue": 72 | s3_var.setncattr(k, nc_var.getncattr(k)) 73 | 74 | # now copy the data - iterate over every partition 75 | if (s3_var._cfa_var): 76 | # it's a CFA variable so we want to copy the data in an intelligent 77 | # way - by copying it partition by partition. 
This will avoid 78 | # reading the large (potentially huge) dataset into memory all at 79 | # once 80 | pm_shape = tuple(s3_var._cfa_var.getPartitionMatrixShape()) 81 | for i in np.ndindex(pm_shape): 82 | partition = s3_var._cfa_var.getPartition(i) 83 | location = [] 84 | # this is a bit less obvious as we are using the partition 85 | # information to get the slices, rather than going from the 86 | # slices to the partition information, which happens in the 87 | # _CFAClasses 88 | for l in partition.location: 89 | s = slice(l[0], l[1], 1) 90 | location.append(s) 91 | location = tuple(location) 92 | nc_data = nc_var[location] 93 | s3_var[location] = nc_data 94 | else: 95 | # not a CFA variable so just copy the data 96 | s3_var[:] = nc_var[:] 97 | 98 | 99 | def split_into_CFA(output_path, input_path, 100 | subarray_path="", 101 | subarray_shape=[], subarray_size=50*1024*1024, 102 | cfa_version="0.5", ): 103 | """Split a netCDF file into a number of subarray files and write the CFA 104 | master array file.""" 105 | # if the subarray path is empty then get it from the output_path 106 | if subarray_path == "": 107 | if ".nca" in output_path: 108 | subarray_path = output_path[:-4] 109 | elif ".nc" in output_path: 110 | subarray_path = output_path[:-3] 111 | else: 112 | subarray_path = output_path 113 | output_path += ".nca" 114 | 115 | # open the input file 116 | nc_ds = Dataset(input_path, 'r') 117 | 118 | # get the output format for the new Dataset 119 | # if it's netCDF4 then the output is CFA4 120 | # if it's netCDF3 then the output is CFA3 121 | if nc_ds.file_format in ['NETCDF4', 'NETCDF4_CLASSIC']: 122 | s3_file_format = "CFA4" 123 | elif nc_ds.file_format == "NETCDF3_CLASSIC": 124 | s3_file_format = "CFA3" 125 | else: 126 | raise CFAError("Cannot split file with format: {}".format( 127 | nc_ds.file_format) 128 | ) 129 | 130 | # open the output file - copy the input from the input file to the output 131 | # file(s), whilst using the subarray settings to chunk the data 132 | s3_ds = s3Dataset(output_path, 'w', 133 | format=s3_file_format, 134 | cfa_version=cfa_version) 135 | 136 | # we now want to copy the information from the original dataset 137 | # netCDF files have: 138 | # global metadata 139 | # global dimensions 140 | # global variables 141 | # Each variable has 142 | # metadata 143 | # field data 144 | # 145 | # global groups 146 | # Each group has 147 | # metadata 148 | # dimensions 149 | # variables 150 | # Each variable has 151 | # metadata 152 | # field data 153 | 154 | # global metadata 155 | nc_md_keys = nc_ds.ncattrs() 156 | for k in nc_md_keys: 157 | s3_ds.setncattr(k, nc_ds.getncattr(k)) 158 | 159 | # global dimensions 160 | copy_dims(nc_ds, s3_ds) 161 | 162 | # global variables 163 | copy_vars(nc_ds, s3_ds, subarray_size, subarray_shape) 164 | 165 | # now do the groups 166 | for grp in nc_ds.groups: 167 | nc_grp = nc_ds.groups[grp] 168 | # create s3 group in the s3 dataset 169 | s3_grp = s3_ds.createGroup(nc_grp.name) 170 | # copy group metadata 171 | nc_md_keys = nc_grp.ncattrs() 172 | for k in nc_md_keys: 173 | s3_ds.setncattr(k, nc_grp.getncattr(k)) 174 | 175 | # copy group dimensions 176 | copy_dims(nc_ds, s3_ds) 177 | 178 | # copy group variables 179 | copy_vars(nc_ds, s3_ds, subarray_size, subarray_shape) 180 | 181 | # close the s3Dataset - super important as everything gets written on close 182 | s3_ds.close() -------------------------------------------------------------------------------- /bin/s3nc_cfa_mv.py: 
-------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | __copyright__ = "(C) 2019-2021 Science and Technology Facilities Council" 4 | __license__ = "BSD - see LICENSE file in top-level directory" 5 | __authors__ = "Neil Massey" 6 | 7 | """Program to rewrite partition infomation in a CFA-netCDF master-array file to reflect that a sub-array file has moved. 8 | """ 9 | 10 | import argparse 11 | from urllib.parse import urlparse 12 | import os 13 | import numpy as np 14 | import sys 15 | 16 | from S3netCDF4._s3netCDF4 import s3Dataset as s3Dataset 17 | from S3netCDF4.CFA._CFAClasses import CFAPartition 18 | 19 | def split_file_name(input_name): 20 | # split into prefix and filename 21 | # this should work on urls and file paths 22 | file_split = input_name.split("/") 23 | file_path = "/".join(file_split[:-1]) 24 | file_name = file_split[-1] 25 | return file_path, file_name 26 | 27 | def update_file_in_partition(prefix, cfa_var, partition_index): 28 | """Update the file_information in a variable for a given partition. 29 | Args: 30 | prefix (string): new prefix for files 31 | cfa_var (CFAVariable): variable to alter the partition for 32 | partition_index (np.ndarray): index of the partition to alter 33 | Returns: 34 | None 35 | """ 36 | # get the partition from the index 37 | partition = cfa_var.getPartition(partition_index) 38 | # get the file name and file path: 39 | file_path, file_name = split_file_name(partition.file) 40 | # new file path: 41 | new_file_path = prefix + "/" + file_name 42 | # construct a new partition 43 | new_part = CFAPartition( 44 | index = partition.index, 45 | location = partition.location, 46 | ncvar = partition.ncvar, 47 | file = new_file_path, 48 | format = partition.format, 49 | shape = partition.shape 50 | ) 51 | # write (and replace) the old partition 52 | cfa_var.writePartition(new_part) 53 | 54 | def update_file_in_variable(cfa_var, prefix, partition="all"): 55 | """Update the file_information in a variable for a given partition. 56 | Args: 57 | cfa_var (CFAVariable): CFA variable to alter, containing the partitions 58 | prefix (string): new prefix for files 59 | partition (string): index of the partition to alter, or 'all' 60 | Returns: 61 | None 62 | """ 63 | if partition == "all": 64 | pmshape = cfa_var.getPartitionMatrixShape() 65 | for partition_index in np.ndindex(*pmshape): 66 | update_file_in_partition(prefix, cfa_var, partition_index) 67 | else: 68 | # convert from partition string 69 | partition_index = np.fromstring(args.partition, dtype='i', sep=', ') 70 | update_file_in_partition(prefix, cfa_var, partition_index) 71 | 72 | def update_file_in_group(cfa_group, prefix, variable="all", partition="all"): 73 | """Update the file_information in a group for a given partition. 
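
# Hedged sketch of the prefix rewrite that update_file_in_partition() performs
# above: only the directory part of the sub-array file path is replaced and
# the filename is kept. The CFAPartition construction is omitted because it
# depends on the S3netCDF4 CFA classes; the paths shown are illustrative only.
def replace_prefix(partition_file, new_prefix):
    file_name = partition_file.split("/")[-1]
    return new_prefix + "/" + file_name

# e.g. move sub-array files from local disk to an S3 alias
print(replace_prefix("/data/cfa/tmp/tmp.file_[0].nc", "s3://tenancy/bucket/tmp"))
# -> s3://tenancy/bucket/tmp/tmp.file_[0].nc
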
74 | Args: 75 | cfa_group (CFAGroup): CFA group to alter, containing the cfa_variables 76 | prefix (string): new prefix for files 77 | variable (string): name of the variable to alter, or 'all' 78 | partition (string): index of the partition to alter, or 'all' 79 | Returns: 80 | None 81 | """ 82 | if variable == "all": 83 | for var in cfa_group.getVariables(): 84 | cfa_var = cfa_group.getVariable(var) 85 | update_file_in_variable(cfa_var, prefix, partition) 86 | else: 87 | if variable in cfa_group.getVariables(): 88 | cfa_var = cfa_group.getVariable(variable) 89 | update_file_in_variable(cfa_var, prefix, partition) 90 | 91 | 92 | def update_file_in_partitions(input_dataset, 93 | prefix, 94 | group="all", 95 | variable="all", 96 | partition="all"): 97 | """Update the file information in the given partition. 98 | This partition could be all or a single partition specified by [t,z,x,y] 99 | for example. 100 | 101 | Args: 102 | input_dataset (s3Dataset): dataset to alter 103 | prefix (string): new prefix for files 104 | group (string): name of group to alter, or 'all', or 'none' 105 | variable (string): name of variable to alter, or 'all' 106 | partition (string): name of partition to alter, or 'all' 107 | 108 | Returns: 109 | None 110 | """ 111 | # get the cfa structure from the dataset 112 | cfa_dataset = input_dataset._cfa_dataset 113 | if group == "all": 114 | for grp in cfa_dataset.getGroups(): 115 | cfa_group = cfa_dataset.getGroup(grp) 116 | update_file_in_group(cfa_group, prefix, variable, partition) 117 | else: 118 | # named group 119 | cfa_group = input_dataset.getGroup(group) 120 | update_file_in_group(cfa_group, prefix, variable, partition) 121 | 122 | 123 | if __name__ == "__main__": 124 | """Utility program to alter the structure of a CFA-netCDF master array 125 | file, either on the disk or remotely on S3 storage, to change the 126 | location of the sub-array file. Note that it doesn't actually move any 127 | files, it just updates the record in the partition matrix. 128 | It will only update the prefix of the file location, not the actual 129 | filename. i.e. it replaces os.path.dirname 130 | Options are: 131 | 1. The input master-array file, write back to the same file 132 | 2. The partition to change 133 | --partition=all|none| default: --partition=all 134 | 3. The prefix of the new address for the file location 135 | --prefix= 136 | """ 137 | # set up and parse the arguments 138 | parser = argparse.ArgumentParser( 139 | prog="s3nc_cfa_mv", 140 | formatter_class=argparse.RawTextHelpFormatter, 141 | description=( 142 | "Alter the paths of the sub-array files in the master-array file to" 143 | " reflect that those sub-array files have been moved to a new " 144 | " location. It will only update the prefix of the file location, " " not the actual filename." 145 | ) 146 | ) 147 | 148 | parser.add_argument( 149 | "input", action="store", default="", metavar="", 150 | help=( 151 | "Path of the CFA-netCDF master-array file to alter." 152 | ) 153 | ) 154 | 155 | parser.add_argument( 156 | "--group", action="store", default="all", 157 | metavar="", 158 | help=( 159 | "Name of a group to change file prefix for, or change all groups. " 160 | "--group=all|" 161 | ) 162 | ) 163 | 164 | parser.add_argument( 165 | "--variable", action="store", default="all", 166 | metavar="", 167 | help=( 168 | "Name of a variable to change file prefix for, or change all " "variables." 
169 | "--variable=all|" 170 | ) 171 | ) 172 | 173 | parser.add_argument( 174 | "--partition", action = "store", default="all", 175 | metavar="", 176 | help=( 177 | "Choose the partition to change the file location prefix for." 178 | "--partition=all" 179 | ) 180 | ) 181 | 182 | parser.add_argument( 183 | "--prefix", action = "store", default="none", required=True, 184 | metavar="", 185 | help=( 186 | "New file location prefix" 187 | ) 188 | ) 189 | args = parser.parse_args() 190 | 191 | # get the input file 192 | input_path = os.path.expanduser(args.input) 193 | # open the input dataset in append mode 194 | input_dataset = s3Dataset(input_path, mode='a') 195 | # Update the prefix in the partitions 196 | update_file_in_partitions(input_dataset, args.prefix, args.group, 197 | args.variable, args.partition) 198 | # close the file to save the changes 199 | input_dataset.close() 200 | -------------------------------------------------------------------------------- /test/test_s3aioFileObject.py: -------------------------------------------------------------------------------- 1 | from S3netCDF4.Backends._s3aioFileObject import s3aioFileObject 2 | from S3netCDF4._Exceptions import IOException, APIException 3 | import unittest 4 | import asyncio 5 | import json 6 | import time 7 | import io 8 | import inspect 9 | 10 | class AsyncIOTestFactory(type): 11 | """Metaclass that creates a `test_something` function for all those functions 12 | called `_test_something` which simply calls asyncio.run(`_test_something`)""" 13 | def __new__(cls, name, bases, dct): 14 | def mapper(attribute): 15 | if inspect.iscoroutinefunction(attribute): 16 | def async_wrapper(*args, **kwargs): 17 | loop = asyncio.get_event_loop() 18 | loop.run_until_complete(attribute(*args, **kwargs)) 19 | return async_wrapper 20 | else: 21 | return attribute 22 | return super().__new__( 23 | cls, 24 | name, 25 | bases, 26 | { k: mapper(v) for k, v in dct.items() } 27 | ) 28 | 29 | class s3aioFileObjectGeneralTest(object, metaclass=AsyncIOTestFactory): 30 | """All of the general tests for either a read or write transaction.""" 31 | 32 | async def test_detach(self): 33 | async with s3aioFileObject( 34 | self.cfg["STFC"]["url"] + "/buckettest/thefox2a.nc", 35 | credentials=self.cfg["STFC"]["credentials"], 36 | mode="w" 37 | ) as s3c: 38 | try: 39 | s3c.detach() 40 | except io.UnsupportedOperation: 41 | return 42 | self.fail( 43 | "s3aioFileObject.detach did not raise io.UnsupportedOperation" 44 | ) 45 | 46 | async def test_close(self): 47 | async with s3aioFileObject( 48 | self.cfg["STFC"]["url"] + "/buckettest/thefox2a.nc", 49 | credentials=self.cfg["STFC"]["credentials"], 50 | mode="rw" 51 | ) as s3c: 52 | if await s3c.close(): 53 | return 54 | else: 55 | self.fail("s3aioFileObject.close returned False") 56 | 57 | async def test_readable(self): 58 | async with s3aioFileObject( 59 | self.cfg["STFC"]["url"] + "/buckettest/thefox2a.nc", 60 | credentials=self.cfg["STFC"]["credentials"], 61 | mode="rw" 62 | ) as s3c: 63 | if s3c.readable(): 64 | return 65 | else: 66 | self.fail("s3aioFileObject.readable returned False") 67 | 68 | async def test_truncate(self): 69 | async with s3aioFileObject( 70 | self.cfg["STFC"]["url"] + "/buckettest/thefox2a.nc", 71 | credentials=self.cfg["STFC"]["credentials"], 72 | mode="w" 73 | ) as s3c: 74 | try: 75 | s3c.truncate() 76 | except io.UnsupportedOperation: 77 | return 78 | self.fail( 79 | "s3aioFileObject.truncate did not raise io.UnsupportedOperation" 80 | ) 81 | 82 | async def test_fileno(self): 83 | async 
with s3aioFileObject( 84 | self.cfg["STFC"]["url"] + "/buckettest/thefox2a.nc", 85 | credentials=self.cfg["STFC"]["credentials"], 86 | mode="w" 87 | ) as s3c: 88 | try: 89 | s3c.fileno() 90 | except io.UnsupportedOperation: 91 | return 92 | self.fail( 93 | "s3aioFileObject.fileno did not raise io.UnsupportedOperation" 94 | ) 95 | 96 | async def test_seekable(self): 97 | async with s3aioFileObject( 98 | self.cfg["STFC"]["url"] + "/buckettest/thefox2a.nc", 99 | credentials=self.cfg["STFC"]["credentials"], 100 | mode="rw" 101 | ) as s3c: 102 | if s3c.seekable(): 103 | return 104 | else: 105 | self.fail("s3aioFileObject.seekable returned False") 106 | 107 | async def test_tell(self): 108 | async with s3aioFileObject( 109 | self.cfg["STFC"]["url"] + "/buckettest/thefox2a.nc", 110 | credentials=self.cfg["STFC"]["credentials"], 111 | mode="rw" 112 | ) as s3c: 113 | if s3c.tell() == 0: 114 | return 115 | else: 116 | self.fail("s3aioFileObject.tell did not return 0") 117 | 118 | async def test_seek(self): 119 | async with s3aioFileObject( 120 | self.cfg["STFC"]["url"] + "/buckettest/thefox2a.nc", 121 | credentials=self.cfg["STFC"]["credentials"], 122 | mode="rw" 123 | ) as s3c: 124 | # Three different methods for seek: 125 | # whence = io.SEEK_SET 126 | # whence = io.SEEK_CUR 127 | # whence = io.SEEK_END 128 | # the current pointer is on zero 129 | if not await s3c.seek(0, whence=io.SEEK_SET) == 0: 130 | self.fail("s3aioFileObject.seek did not return 0") 131 | 132 | if not await s3c.seek(10, whence=io.SEEK_SET) == 10: 133 | self.fail("s3aioFileObject.seek did not return 10") 134 | # now on 10 135 | try: 136 | await s3c.seek(-1, whence=io.SEEK_SET) 137 | except IOException: 138 | pass 139 | else: 140 | self.fail("s3aioFileObject.seek did not raise IOException") 141 | # should have failed so still on 10 142 | 143 | # the current pointer is on ten (10) 144 | if not await s3c.seek(-10, whence=io.SEEK_CUR) == 0: 145 | self.fail("s3aioFileObject.seek did not return 0") 146 | 147 | # now on 0 - should raise an exception if we seek below 0 148 | try: 149 | await s3c.seek(-1, whence=io.SEEK_CUR) 150 | except IOException: 151 | pass 152 | else: 153 | self.fail("s3aioFileObject.seek did not raise IOException") 154 | 155 | # still on zero: get the size to seek past it 156 | size = await s3c._getsize() 157 | try: 158 | await s3c.seek(size+1, whence=io.SEEK_CUR) 159 | except IOException: 160 | pass 161 | else: 162 | self.fail("s3aioFileObject.seek did not raise IOException") 163 | 164 | # still on zero - seek from the end 165 | try: 166 | await s3c.seek(size+1, whence=io.SEEK_END) 167 | except IOException: 168 | pass 169 | else: 170 | self.fail("s3aioFileObject.seek did not raise IOException") 171 | 172 | # still on 0 - seek backwards from the end 173 | try: 174 | await s3c.seek(-1, whence=io.SEEK_END) 175 | except IOException: 176 | pass 177 | else: 178 | self.fail("s3aioFileObject.seek did not raise IOException") 179 | 180 | if await s3c.seek(10, whence=io.SEEK_END) != size-10: 181 | self.fail("s3aioFileObject.seek did not return {}".format( 182 | size-10 183 | )) 184 | 185 | 186 | class s3aiot1FileObjectWriteTest(unittest.TestCase, s3aioFileObjectGeneralTest): 187 | 188 | def setUp(self): 189 | """Set up the s3FileObject but don't connect.""" 190 | # load the credentials from the hidden file 191 | fh = open(".s3config.json") 192 | self.cfg = json.load(fh) 193 | fh.close() 194 | 195 | async def test_1writable(self): 196 | async with s3aioFileObject( 197 | self.cfg["STFC"]["url"] + "/buckettest/thefox2a.nc", 
198 | credentials=self.cfg["STFC"]["credentials"], 199 | mode="w" 200 | ) as s3c: 201 | if s3c.writable(): 202 | return 203 | else: 204 | self.fail("s3aioFileObject.writable returned False") 205 | 206 | async def test_1write(self): 207 | async with s3aioFileObject( 208 | self.cfg["STFC"]["url"] + "/buckettest/thefox2a.nc", 209 | credentials=self.cfg["STFC"]["credentials"], 210 | mode="w" 211 | ) as s3c: 212 | # create random bytes - if we keep it below s3c._getsize() then it will 213 | # only do one upload 214 | size = await s3c._getsize() 215 | bytes = bytearray(size) 216 | for b in range(0, size): 217 | bytes[b] = 128 218 | # convert bytes to io.BytesIO 219 | if await s3c.write(bytes) == 0: 220 | self.fail("s3aioFileObject.write returned zero") 221 | 222 | if __name__ == '__main__': 223 | loop = asyncio.get_event_loop() 224 | unittest.main() 225 | loop.close() 226 | -------------------------------------------------------------------------------- /test/test_s3Dataset.py: -------------------------------------------------------------------------------- 1 | from S3netCDF4._s3netCDF4 import s3Dataset as s3Dataset 2 | from S3netCDF4._Exceptions import APIException 3 | import numpy as np 4 | import unittest 5 | import os 6 | 7 | DEBUG = False 8 | 9 | def create_test_dataset(s3_ds, format, cfa_version, shape=[30,1,192,145]): 10 | """Create a test dataset for a netCDF file""" 11 | s3_ds.history = "Test of s3netCDF: format: {} cfa_version: {}".format( 12 | format, cfa_version 13 | ) 14 | 15 | # create a group if this is a netCDF4 (or CFA4 equivalent) file 16 | if format == "NETCDF4" or format == "CFA4": 17 | group = s3_ds.createGroup("test_group") 18 | # otherwise for netCDF3 files the group is the dataset 19 | else: 20 | group = s3_ds 21 | group.group_class = "Surface variables" 22 | 23 | # create the dimension, the variable, add the variable values and some 24 | # metadata 25 | if DEBUG: 26 | print("\t . Creating time") 27 | time_dim = group.createDimension("time", shape[0]) 28 | time_var = group.createVariable("time", np.float32, ("time",)) 29 | time_var[:] = np.arange(0, shape[0]) 30 | time_var.units = "days since 2000-01-01" 31 | time_var.axis = "T" 32 | 33 | if DEBUG: 34 | print("\t . Creating level") 35 | level_dim = group.createDimension("level", shape[1]) 36 | level_var = group.createVariable("level", np.float32, ("level",)) 37 | level_var[:] = np.arange(0, shape[1])*100 38 | level_var.standard_name = "height above sea-level" 39 | level_var.units = "m" 40 | 41 | if DEBUG: 42 | print("\t . Creating latitude") 43 | latitude_dim = group.createDimension("latitude", shape[2]) 44 | latitude_var = group.createVariable("latitude", np.float32, ("latitude",)) 45 | latitude_vals = 90.0 - np.arange(0, shape[2]) * 180.0/(shape[2]-1) 46 | latitude_var[:] = latitude_vals 47 | latitude_var.standard_name = "latitude" 48 | latitude_var.units = "degrees north" 49 | latitude_var.setncatts({"name": "value", "test":234235}) 50 | 51 | if DEBUG: 52 | print("\t . Creating longitude") 53 | longitude_dim = group.createDimension("longitude", shape[3]) 54 | longitude_var = group.createVariable("longitude", np.float32, ("longitude",)) 55 | longitude_vals = np.arange(0, shape[3]) * 360.0/shape[3] 56 | longitude_var[:] = longitude_vals 57 | longitude_var.standard_name = "longitude" 58 | longitude_var.units = "degrees east" 59 | 60 | if DEBUG: 61 | print("\t . 
Creating tmp") 62 | # create the field variable and data 63 | subarray_shape = np.array( 64 | [12, shape[1], shape[2], shape[3]], 65 | dtype='i' 66 | ) 67 | tmp_var = group.createVariable("tmp", np.float32, 68 | ("time", "level", "latitude", "longitude"), 69 | fill_value=2e2, 70 | subarray_shape=subarray_shape 71 | ) 72 | tmp_var.standard_name = "temperature" 73 | tmp_var.units = "degrees C" 74 | tmp_var.setncattr("long_name", "Surface temperature at 1m") 75 | tmp_var._FillValue = np.float32(2e20) # strict typing matches variable 76 | 77 | if DEBUG: 78 | print("\t . Writing data") 79 | 80 | # write a single scalar of data 81 | scl_var = s3_ds.createVariable("scl", np.float32) 82 | 83 | # write a vector of data 84 | vec_dim = s3_ds.createDimension("vector", 128) 85 | vec_var = s3_ds.createVariable("vector", np.int32, ("vector",)) 86 | vec_var[:] = 12+np.arange(0,128) 87 | velocity = s3_ds.createVariable("velocity", np.float32, ("vector",)) 88 | velocity.units = "ms-1" 89 | 90 | def get_file_path(path_stub, format, cfa_version=None): 91 | """Get the path to the file for reading or writing. 92 | Based on the path_stub, the format and cfa_version. 93 | """ 94 | file_name = "{}_{}".format(path_stub, format) 95 | if cfa_version is not None: 96 | file_name += "_cfa{}".format(cfa_version) 97 | file_name += ".nc" 98 | return file_name 99 | 100 | def test_s3Dataset_write(path_stub, format="NETCDF4", cfa_version="0.4", 101 | resolution_degrees=1.5): 102 | """Test writing out a s3Dataset, for one of the various permutations of: 103 | 1. file format (netCDF3 or netCDF4) 104 | 2. whether it is a S3-netCDF / CFA file or a plain netCDF file 105 | 3. the CFA version (0.4 or 0.5) 106 | """ 107 | # build a file name from the path stub, the format and the cfa_version 108 | # don't use os.path.join as it doesn't handle URLs and paths 109 | file_name = get_file_path(path_stub, format, cfa_version) 110 | if DEBUG: 111 | print("Test writing {}".format(file_name)) 112 | # open the dataset 113 | ds = s3Dataset(file_name, format=format, mode='w', cfa_version=cfa_version, 114 | diskless=False, persist=False) 115 | # construct the shape: 116 | shape=[365, 1, 180.0/resolution_degrees+1, 360.0/resolution_degrees] 117 | # create the data inside the dataset 118 | create_test_dataset(ds, format, cfa_version, shape) 119 | if DEBUG: 120 | print(ds.groups["test_group"].variables["tmp"]) 121 | print(ds.variables["scl"]) 122 | 123 | if format == "CFA4" or format == "NETCDF4": 124 | tmp_var = ds.groups["test_group"].variables["tmp"] 125 | else: 126 | tmp_var = ds.variables["tmp"] 127 | tmp_var[:,:,:,:] = 250.0 128 | vel_var = ds.variables["velocity"] 129 | vel_var[0] = 10.0 130 | ds.close() 131 | return True 132 | 133 | def test_s3Dataset_read(path_stub, format="NETCDF4", cfa_version=None): 134 | """Test writing out a s3Dataset, for one of the various permutations of: 135 | 1. file format (netCDF3 or netCDF4) 136 | 2. whether it is a S3-netCDF / CFA file or a plain netCDF file 137 | 3. 
the CFA version (0.4 or 0.5) 138 | """ 139 | file_name = get_file_path(path_stub, format, cfa_version) 140 | if DEBUG: 141 | print("Test reading {}".format(file_name)) 142 | # open the dataset 143 | dr = s3Dataset(file_name, mode='r') 144 | if DEBUG: 145 | print(dr.groups) 146 | 147 | if format == "NETCDF4" or format == "CFA4": 148 | grp = dr.groups["test_group"] 149 | else: 150 | grp = dr 151 | 152 | if DEBUG: 153 | print(grp.variables["tmp"]) 154 | print(dr.variables["scl"]) 155 | 156 | tmp_var = grp.variables["tmp"] 157 | x = tmp_var[:,0,0,0] 158 | dr.close() 159 | return True 160 | 161 | class s3DatasetTest(unittest.TestCase): 162 | # static class members 163 | # all path stubs the same 164 | path_stub = os.environ["HOME"] + "/Test/s3Dataset_test" 165 | res_deg = 2.5 166 | 167 | # 168 | def test_NETCDF4_CFA0_4(self): 169 | self.assertTrue( 170 | test_s3Dataset_write( 171 | s3DatasetTest.path_stub, "NETCDF4", "0.4", s3DatasetTest.res_deg 172 | ) 173 | ) 174 | self.assertTrue( 175 | test_s3Dataset_read(s3DatasetTest.path_stub, "NETCDF4", "0.4") 176 | ) 177 | 178 | def test_NETCDF4_CFA0_5(self): 179 | self.assertTrue( 180 | test_s3Dataset_write( 181 | s3DatasetTest.path_stub, "NETCDF4", "0.5", s3DatasetTest.res_deg 182 | ) 183 | ) 184 | self.assertTrue( 185 | test_s3Dataset_read(s3DatasetTest.path_stub, "NETCDF4", "0.5") 186 | ) 187 | 188 | def test_NETCDF3_CFA0_4(self): 189 | self.assertTrue( 190 | test_s3Dataset_write( 191 | s3DatasetTest.path_stub, "NETCDF3_CLASSIC", "0.4", s3DatasetTest.res_deg 192 | ) 193 | ) 194 | self.assertTrue( 195 | test_s3Dataset_read(s3DatasetTest.path_stub, "NETCDF3_CLASSIC", "0.4") 196 | ) 197 | 198 | def test_NETCDF3_CFA0_5(self): 199 | with self.assertRaises(APIException): 200 | test_s3Dataset_write( 201 | s3DatasetTest.path_stub, "NETCDF3_CLASSIC", "0.5", s3DatasetTest.res_deg 202 | ) 203 | 204 | def test_CFA4_CFA0_4(self): 205 | self.assertTrue( 206 | test_s3Dataset_write( 207 | s3DatasetTest.path_stub, "CFA4", "0.4", s3DatasetTest.res_deg 208 | ) 209 | ) 210 | self.assertTrue( 211 | test_s3Dataset_read(s3DatasetTest.path_stub, "CFA4", "0.4") 212 | ) 213 | 214 | def test_CFA4_CFA0_5(self): 215 | self.assertTrue( 216 | test_s3Dataset_write( 217 | s3DatasetTest.path_stub, "CFA4", "0.5", s3DatasetTest.res_deg 218 | ) 219 | ) 220 | self.assertTrue( 221 | test_s3Dataset_read(s3DatasetTest.path_stub, "CFA4", "0.5") 222 | ) 223 | 224 | def test_CFA3_CFA0_4(self): 225 | self.assertTrue( 226 | test_s3Dataset_write( 227 | s3DatasetTest.path_stub, "CFA3", "0.4", s3DatasetTest.res_deg 228 | ) 229 | ) 230 | self.assertTrue( 231 | test_s3Dataset_read(s3DatasetTest.path_stub, "CFA3", "0.4") 232 | ) 233 | 234 | def test_CFA3_CFA0_5(self): 235 | with self.assertRaises(APIException): 236 | test_s3Dataset_write( 237 | s3DatasetTest.path_stub, "CFA3", "0.5", s3DatasetTest.res_deg 238 | ) 239 | 240 | if __name__ == '__main__': 241 | unittest.main() 242 | -------------------------------------------------------------------------------- /S3netCDF4/CFA/_CFASplitter.pyx: -------------------------------------------------------------------------------- 1 | #!python 2 | #cython: language_level=3 3 | 4 | __copyright__ = "(C) 2019-2021 Science and Technology Facilities Council" 5 | __license__ = "BSD - see LICENSE file in top-level directory" 6 | __authors__ = "Neil Massey" 7 | 8 | """ 9 | CFASplitter class containing the routines required to take a 10 | multi-dimensional array and split it into subarrays according to the protocol 11 | that each subarray should have a maximum size, 
and that the number of 12 | operations required to read the entire array in any direction should be 13 | equal. 14 | 15 | """ 16 | 17 | import numpy as np 18 | cimport numpy as np 19 | 20 | cdef class CFASplitter: 21 | """ 22 | Class containing the methods required to return optimised subarrays for 23 | creating CFAVariables. 24 | """ 25 | 26 | cdef np.ndarray shape 27 | cdef np.ndarray subarray_shape 28 | cdef list axis_types 29 | cdef int max_subarray_size 30 | 31 | 32 | def __init__(self, 33 | np.ndarray shape, 34 | int max_subarray_size=0, 35 | list axis_types=[], 36 | ): 37 | """Initialise the CFA array splitter. 38 | 39 | Args: 40 | shape (np.ndarray): the shape of the array to split into subarrays. 41 | axis_types (list): a list of the types of axis, in order, for the 42 | shape of the array. These axis types can be: 43 | 'X' - X axis 44 | 'Y' - Y axis 45 | 'Z' - Z / level axis 46 | 'T' - Time axis 47 | 'N' - non of the above axis 48 | 'U' - unspecified axis, this needs to be overwritten 49 | """ 50 | DEFAULT_SUBARRAY_SIZE = 50*1024*1024 # 50MB default object size 51 | self.shape = shape 52 | if len(axis_types) == 0: 53 | # build the axis_types by guessing what they should be 54 | # this order follows CF conventions 55 | default_axis_types = ["T", "Z", "Y", "X"] 56 | new_axis_types = np.empty(shape.size) 57 | # position in default axis array 58 | p = len(default_axis_types)-1 59 | for i in range(shape.size, 0, -1): 60 | # calculate the default axis position 61 | if p >= 0: 62 | new_axis_types[i] = default_axis_types[p] 63 | # go to the next (previous) default axis type 64 | p -= 1 65 | else: 66 | new_axis_types[i] = 'N' 67 | self.axis_types = new_axis_types 68 | else: 69 | self.axis_types = axis_types 70 | 71 | if max_subarray_size == 0: 72 | self.max_subarray_size = DEFAULT_SUBARRAY_SIZE 73 | else: 74 | self.max_subarray_size = max_subarray_size 75 | 76 | self.subarray_shape = np.array([]) 77 | 78 | 79 | cdef _numVals(self, np.ndarray shape): 80 | """Return number of values in subarray of specified shape, given by a 81 | list of dimension lengths. 82 | 83 | shape -- list of subarray dimension sizes""" 84 | if (len(shape) == 0): 85 | return 1 86 | return np.prod(shape) 87 | 88 | 89 | cdef _subdivideArray(self, 90 | np.ndarray c_subarray_divs, 91 | list permitted_axes=["T"]): 92 | # calculate the number of elements per sub for the linear axis types 93 | n_per_subf = np.empty((len(self.shape),),'i') 94 | for i in range(0, len(self.shape)): 95 | if self.axis_types[i] not in permitted_axes: 96 | n_per_subf[i] = int(1e6) 97 | # check that we are not going to subdivide more than the axis length! 98 | elif c_subarray_divs[i] > self.shape[i]: 99 | n_per_subf[i] = int(1e6) 100 | else: 101 | n_per_subf[i] = c_subarray_divs[i] 102 | # get the minimum index 103 | min_i = np.argmin(n_per_subf) 104 | c_subarray_divs[min_i] += 1 105 | return c_subarray_divs 106 | 107 | 108 | cdef _getLinearOperations(self, np.ndarray c_subarray_divs): 109 | """Get the number of operations required to read one spatial point for 110 | every timestep through the dataset. 
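
# Hedged worked example for the size test used by CFASplitter above: _numVals
# returns the number of elements in a candidate subarray shape, and
# calculateSubarrayShape keeps subdividing while that count exceeds
# max_subarray_size (50*1024*1024 by default). The array shape below is
# illustrative.
import numpy as np

def num_vals(shape):
    # mirrors CFASplitter._numVals: product of the dimension lengths
    return 1 if len(shape) == 0 else int(np.prod(shape))

shape = np.array([365, 1, 145, 192], dtype='d')   # (T, Z, Y, X)
divs = np.ones(4, dtype='i')                      # one subarray per axis
print(num_vals(shape / divs))   # 10161600, below the default 52428800 limit,
                                # so this array needs no further subdivision
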
111 | This is equal to: number of subarrays in the T axis.""" 112 | # get the t axis index, if it exists, otherwise the Z axis, otherwise 113 | # the N axis 114 | t_ax = -1 115 | if "T" in self.axis_types: 116 | t_ax = self.axis_types.index("T") 117 | elif "Z" in self.axis_types: 118 | t_ax = self.axis_types.index("Z") 119 | elif "N" in self.axis_types: 120 | t_ax = self.axis_types.index("N") 121 | 122 | # calculate number of operations 123 | if t_ax != -1: 124 | return c_subarray_divs[t_ax] 125 | else: 126 | # otherwise return -1 127 | return -1 128 | 129 | 130 | cdef _getFieldOperations(self, np.ndarray c_subarray_divs): 131 | """Get the number of operations required to read one 2D field of data at 132 | a particular timestep or level throughout the dataset. 133 | This is equal to: (subarrays in the X axis) * 134 | (subarrays in the Y axis) 135 | """ 136 | # get the X and Y axes, if they exists 137 | x_ax = -1 138 | y_ax = -1 139 | if "X" in self.axis_types: 140 | x_ax = self.axis_types.index("X") 141 | if "Y" in self.axis_types: 142 | y_ax = self.axis_types.index("Y") 143 | 144 | # four possibilities: 145 | # 1. X & Y exist : return subarrays in X * subarrays in Y 146 | # 2. X exists but Y doesn't : return subarrays in X 147 | # 3. Y exists but X doesn't : return subarrays in Y 148 | # 4. Neither X or Y exists : return -1 149 | 150 | # logic optimised 151 | if not (x_ax == -1 or y_ax == -1): 152 | n_ops = c_subarray_divs[x_ax] * c_subarray_divs[y_ax] 153 | elif y_ax != -1: 154 | n_ops = c_subarray_divs[y_ax] 155 | elif x_ax != -1: 156 | n_ops = c_subarray_divs[x_ax] 157 | else: 158 | n_ops = -1 159 | 160 | return n_ops 161 | 162 | 163 | cpdef calculateSubarrayShape(self): 164 | """ 165 | Return a 'good shape' for the sub-arrays for an any-D variable, 166 | assuming balanced 1D/(n-1)D access 167 | 168 | Returns floating point field lengths of a field shape that provides 169 | balanced access of 1D subsets and 2D subsets of a netCDF or HDF5 170 | variable with any shape. 171 | 'Good shape' for fields means that the number of fields accessed to read 172 | either kind of 1D or 2D subset is approximately equal, and the size of 173 | each field is no more than max_subarray_size. 174 | An extra complication here is that we wish to be able to optimise for any number of 175 | dimensions (1,2,3,4, etc.) but ensure that the algorithm knows which axis it is 176 | operating on. For example, a 2D field with X and Y axes should not be split in 177 | the same way as a 2D field with T and Z axes. 178 | 179 | The algorithm follows a sub-division process, in this order (if they 180 | exist): 181 | 1. sub divide the X axis 182 | 2. sub divide the T axis 183 | 3. sub divide the Y axis 184 | 4. sub divide the Z axis 185 | 5. 
sub divide any N axes 186 | 187 | Calculating the access operations: 188 | There are two "types" of access operations 189 | - linear (accessing a single spatial point across timesteps) 190 | - field (accessing a 2D field of data at a particular timestep) 191 | The number of access operations are: 192 | - linear : T dimension / number of subfields in the T axis 193 | - field : (X dimension / number of subfields in the X axis)* 194 | (Y dimension / number of subfields in the Y axis) 195 | """ 196 | 197 | # the algorithm first calculates how many partitions each dimension 198 | # should be split into - this is stored in c_subfield_divs 199 | # current subfield_repeats shape defaults to var shape 200 | c_subarray_divs = np.ones((len(self.shape),), 'i') 201 | 202 | # if the number of values in the field_shape is greater than 203 | # max_subarray_size then divide 204 | while (self._numVals(self.shape / c_subarray_divs)) > self.max_subarray_size: 205 | # get the linear access and the field access operations 206 | linear_ops = self._getLinearOperations(c_subarray_divs) 207 | field_ops = self._getFieldOperations(c_subarray_divs) 208 | # choose to divide on field ops first, if the number of ops are equal 209 | if field_ops <= linear_ops: 210 | c_subarray_divs = self._subdivideArray(c_subarray_divs, 211 | ["X", "Y"] 212 | ) 213 | else: 214 | c_subarray_divs = self._subdivideArray(c_subarray_divs, 215 | ["T", "Z", "N"] 216 | ) 217 | 218 | # we have so far calculated the optimum number of times each axis will 219 | # be divided 220 | # - translate this into a (floating point) number of elements in each 221 | # chunk, for each axis 222 | c_subarray_shape = np.array(self.shape, 'd') / c_subarray_divs 223 | self.subarray_shape = c_subarray_shape 224 | return c_subarray_shape 225 | 226 | 227 | cpdef setSubarrayShape(self, np.ndarray subarray_shape): 228 | """Set the shape of the subarray, for when the user wishes to define it. 229 | """ 230 | self.subarray_shape = subarray_shape 231 | return subarray_shape 232 | -------------------------------------------------------------------------------- /bin/s3nc_cfa_info.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | __copyright__ = "(C) 2019-2021 Science and Technology Facilities Council" 4 | __license__ = "BSD - see LICENSE file in top-level directory" 5 | __authors__ = "Neil Massey" 6 | 7 | """Program to return information about a netCDF-CFA file from disk or S3. 8 | Modelled after ncdump and cdo info. 9 | """ 10 | 11 | import argparse 12 | from urllib.parse import urlparse 13 | import os 14 | import numpy as np 15 | 16 | from S3netCDF4._s3netCDF4 import s3Dataset as s3Dataset 17 | 18 | def print_dimension_info(input_dim, metadata): 19 | """Print the information for the dimension.""" 20 | dim_size = input_dim.getLen() 21 | print(" {} = {}".format(input_dim.getName(), dim_size)) 22 | # print the metadata 23 | if metadata: 24 | md = input_dim.getMetadata() 25 | for key in md: 26 | if key[0:4] != "cfa_": 27 | print (" {}:{} = {}".format( 28 | input_dim.getName(), key, md[key]) 29 | ) 30 | 31 | def print_dimensions(group, metadata): 32 | """Print all the dimensions in a group.""" 33 | for d in group.getDimensions(): 34 | input_dimension = group.getDimension(d) 35 | print_dimension_info(input_dimension, metadata) 36 | 37 | def print_partition_info(input_var, partition_index): 38 | """Print the partition information for a single partition. 
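
# Hedged, pure-numpy restatement of the balancing loop in
# CFASplitter.calculateSubarrayShape above. It is not the Cython
# implementation: it assumes T, Y and X axes are all present and simply
# increments the division count on the spatial axis with the fewest divisions,
# but it shows how the linear (T) and field (X*Y) operation counts are kept
# roughly equal while the per-subarray element count is pushed under the limit.
import numpy as np

def good_divisions(shape, axis_types, max_elems=50 * 1024 * 1024):
    divs = np.ones(len(shape), dtype='i')
    t_ax = axis_types.index("T")
    y_ax = axis_types.index("Y")
    x_ax = axis_types.index("X")
    while np.prod(shape / divs) > max_elems:
        linear_ops = divs[t_ax]
        field_ops = divs[x_ax] * divs[y_ax]
        if field_ops <= linear_ops:
            # subdivide the spatial axis with the fewest divisions so far
            target = x_ax if divs[x_ax] <= divs[y_ax] else y_ax
        else:
            target = t_ax
        divs[target] += 1
    return divs

# illustrative: ~1 degree global field, daily values for ten years
print(good_divisions(np.array([3650, 1, 181, 360], dtype='d'),
                     ["T", "Z", "Y", "X"]))   # -> [2 1 2 2]
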
39 | By this point partition should be a numpy array of the number of 40 | dimensions of the partition.""" 41 | partition = input_var.getPartition(partition_index) 42 | var_name_len = len(input_var.getName()) + 16 43 | just_str = "" 44 | for x in range(0, var_name_len): 45 | just_str += " " 46 | print(" {}:{} {} =".format( 47 | input_var.getName(), "partition", partition_index 48 | ) 49 | ) 50 | # location 51 | location_string = ":location = {}".format(partition.location.tolist()) 52 | print(just_str + location_string) 53 | # shape 54 | shape_string = ":shape = {}".format(partition.shape.tolist()) 55 | print(just_str + shape_string) 56 | # filename 57 | filename_string = ":filename = {}".format(partition.file) 58 | print(just_str + filename_string) 59 | # varname 60 | varname_string = ":variable = {}".format(partition.ncvar) 61 | print(just_str + varname_string) 62 | # format 63 | format_string = ":format = {}".format(partition.format) 64 | print(just_str + format_string) 65 | 66 | def print_variable_info(input_var, partition, metadata): 67 | """Print the information for the variable.""" 68 | print(" {} {}({})".format( 69 | input_var.getType(), 70 | input_var.getName(), 71 | ",".join(input_var.getDimensions()) 72 | ) 73 | ) 74 | # print the metadata 75 | if metadata: 76 | md = input_var.getMetadata() 77 | for key in md: 78 | if key[0:4] != "cfa_": 79 | print (" {}:{} = {}".format( 80 | input_var.getName(), key, md[key]) 81 | ) 82 | # print the minimum partition information 83 | # print the partition matrix shape 84 | pmshape_str = "(" 85 | for x in input_var.getPartitionMatrixShape(): 86 | pmshape_str += str(x) + ", " 87 | pmshape_str = pmshape_str[:-2] + ")" 88 | print (" {}:{} = {}".format( 89 | input_var.getName(), "pmshape", pmshape_str) 90 | ) 91 | # print the partition matrix dimensions 92 | pmdims = "(" + ", ".join(input_var.getPartitionMatrixDimensions()) + ")" 93 | print (" {}:{} = {}".format( 94 | input_var.getName(), "pmdimensions", pmdims) 95 | ) 96 | # print the partition 97 | if partition == "all": 98 | pmshape = input_var.getPartitionMatrixShape() 99 | for index in np.ndindex(*pmshape): 100 | print_partition_info(input_var, index) 101 | elif partition == "none": 102 | pass # do not print anything for partition==none 103 | else: 104 | partition_index = np.fromstring(args.partition, dtype='i', sep=', ') 105 | print_partition_info(input_var, np.array(partition_index)) 106 | 107 | def print_variables(group, partition, metadata): 108 | for v in group.getVariables(): 109 | input_var = group.getVariable(v) 110 | print_variable_info(input_var, partition, metadata) 111 | 112 | def print_group_info(input_grp, variable, partition, metadata): 113 | """Print the information for the group, and all the dimensions and 114 | variables in the group.""" 115 | if variable == "none": 116 | print(" {}".format(input_grp.getName())) 117 | else: 118 | print("group: {} ".format(input_grp.getName())+"{") 119 | # print the dimensions 120 | print(" dimensions:") 121 | print_dimensions(input_grp, metadata) 122 | 123 | # print the variables in the group 124 | print(" variables:") 125 | if variable == "all": 126 | print_variables(input_grp, partition, metadata) 127 | else: 128 | input_var = input_grp.getVariable(variable) 129 | print_variable_info(input_var, partition, metadata) 130 | print(" }") 131 | if metadata: 132 | print(" // group attributes") 133 | md = input_grp.getMetadata() 134 | for key in md: 135 | if key[0:4] != "cfa_": 136 | print (" :{} = {}".format( 137 | key, md[key]) 138 | ) 139 | 140 | 
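
# Hedged sketch of how a --partition value such as "3, 0, 0, 0" becomes an
# index into the partition matrix, mirroring the np.fromstring call used in
# print_variable_info above. Note that print_variable_info reads
# args.partition (the module-level argparse namespace) rather than its own
# partition argument, so an explicit index only takes effect when this module
# is run as a script.
import numpy as np

def parse_partition_index(partition):
    # an equivalent without np.fromstring would be:
    # np.array([int(x) for x in partition.split(",")], dtype='i')
    return np.fromstring(partition, dtype='i', sep=', ')

print(parse_partition_index("3, 0, 0, 0"))   # -> [3 0 0 0]
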
def print_dataset_info(input_dataset, group, variable, partition, metadata): 141 | """Print the information for the dataset. Use the CFA class. 142 | Print the name, metadata and groups. Recurse into the group to print the 143 | variables if variable==all or variable==.""" 144 | cfa_d = input_dataset._cfa_dataset 145 | print(cfa_d.getName() + " {") 146 | # print the root group if group == "all" or group == "root" 147 | if (group in ["all", "root"]): 148 | root_grp = cfa_d["root"] 149 | print("dimensions:") 150 | print_dimensions(root_grp, metadata) 151 | print("variables:") 152 | if variable == "all": 153 | print_variables(root_grp, partition, metadata) 154 | else: 155 | input_var = root_grp.getVariable(variable) 156 | print_variable_info(input_var, partition, metadata) 157 | # print the group names, unless just the root group is requested 158 | if (group != "root"): 159 | if (variable == "none"): 160 | print("groups:") 161 | for g in cfa_d.getGroups(): 162 | input_grp = cfa_d[g] 163 | if (g != "root" and g[0:4] != "cfa_"): 164 | print_group_info(input_grp, variable, partition, metadata) 165 | else: 166 | if (variable == "none"): 167 | print("groups:") 168 | input_grp = cfa_d[group] 169 | print_group_info(input_grp, variable, partition, metadata) 170 | 171 | # print the global attributes 172 | if metadata: 173 | print("// global attributes") 174 | md = cfa_d.getMetadata() 175 | for key in md: 176 | print (" :{} = {}".format(key, md[key])) 177 | print("}") 178 | 179 | if __name__ == "__main__": 180 | """Utility program to display the structure of a CFA-netCDF master array 181 | file, either on the disk or remotely on S3 storage. 182 | This program is inspired by ncdump and cdo info / sinfo. 183 | We need options to control three things: 184 | 1. Whether to output all the groups, or a particular group and whether 185 | to output the variables in the group(s) 186 | --group=all| default: --group=all 187 | --variable=all| default: --variable=all 188 | 2. Whether to output the metadata or not 189 | --metadata default: --metadata(on) 190 | 3. Whether to output partition information for the variables, either 191 | all the partition information or for a particular partition. 192 | --partition=all|none| default: --partion=none (off) 193 | """ 194 | # set up and parse the arguments 195 | parser = argparse.ArgumentParser( 196 | prog="s3nc_cfa_info", 197 | formatter_class=argparse.RawTextHelpFormatter, 198 | description=( 199 | "Output information about a CFA-netCDF file, or netCDF file either " 200 | "on disk or on S3" 201 | ) 202 | ) 203 | 204 | parser.add_argument( 205 | "input", action="store", default="", 206 | metavar="", 207 | help=( 208 | "Path of the CFA-netCDF or netCDF file input file, either on disk" 209 | " or S3." 210 | ) 211 | ) 212 | 213 | parser.add_argument( 214 | "--group", action="store", default="all", 215 | metavar="", 216 | help=( 217 | "Name of a group to print information about, or print all groups. " 218 | "--group=all|" 219 | ) 220 | ) 221 | 222 | parser.add_argument( 223 | "--variable", action="store", default="all", 224 | metavar="", 225 | help=( 226 | "Name of a variable to print information about, print all or no" "variables. " 227 | "--variable=all|none|" 228 | ) 229 | ) 230 | 231 | parser.add_argument( 232 | "--partition", action = "store", default="none", 233 | metavar="", 234 | help=( 235 | "Print the information about a partition. 
" 236 | "--partition=all|none|" 237 | ) 238 | ) 239 | 240 | parser.add_argument( 241 | "--metadata", action = "store_true", default=False, 242 | help=( 243 | "Print the metadata for groups, dimensions and variables" 244 | "--metadata" 245 | ) 246 | ) 247 | 248 | args = parser.parse_args() 249 | 250 | if args.input: 251 | input_file = args.input 252 | else: 253 | input_file = None 254 | 255 | if args.group: 256 | group = args.group 257 | else: 258 | group = "all" 259 | 260 | if args.variable: 261 | variable = args.variable 262 | else: 263 | variable = "all" 264 | 265 | if args.partition: 266 | # convert the partition string to a numpy array 267 | partition = args.partition 268 | else: 269 | partition = "none" 270 | 271 | if args.metadata: 272 | metadata = True 273 | else: 274 | metadata = False 275 | 276 | if input_file: 277 | # Get the input file. 278 | path = os.path.expanduser(input_file) 279 | input_dataset = s3Dataset(path, mode='r') 280 | # Print the global dataset information 281 | print_dataset_info( 282 | input_dataset, 283 | group, 284 | variable, 285 | partition, 286 | metadata 287 | ) 288 | #else: 289 | -------------------------------------------------------------------------------- /S3netCDF4/utils/agg.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse 2 | import os 3 | from glob import glob 4 | import numpy as np 5 | 6 | from S3netCDF4._s3netCDF4 import s3Dataset as s3Dataset 7 | from S3netCDF4.CFA._CFAClasses import CFAPartition 8 | from S3netCDF4.Managers._FileManager import FileManager 9 | 10 | from netCDF4 import num2date, date2num 11 | 12 | def get_universal_times(nc_var, common_date): 13 | # get the start date and calendar 14 | if ("units" in nc_var.ncattrs() and 15 | "calendar" in nc_var.ncattrs() and 16 | common_date is not None): 17 | date_values = num2date(nc_var[:], 18 | nc_var.units, 19 | nc_var.calendar) 20 | axis_dim_values = date2num(date_values, 21 | common_date, 22 | nc_var.calendar) 23 | else: 24 | axis_dim_values = nc_var[:] 25 | return axis_dim_values 26 | 27 | 28 | def add_var_dims(in_object, out_object, axis, fname, common_date): 29 | """Add the variables and dimensions to the s3Dataset or s3Group""" 30 | # create dimension, get the axis dimension location 31 | axis_dim_n = -1 32 | for d, dim in enumerate(in_object.dimensions): 33 | in_dim = in_object.dimensions[dim] 34 | if dim not in out_object.dimensions: 35 | # get the dim size, 0 is UNLIMITED if dim == axis 36 | if axis == dim: 37 | dim_size = 0 38 | else: 39 | dim_size = in_dim.size 40 | 41 | out_dim = out_object.createDimension( 42 | dim, dim_size 43 | ) 44 | else: 45 | out_dim = out_object.dimensions[dim] 46 | # get the axis dimension 47 | if axis == dim: 48 | axis_dim_n = d 49 | 50 | # create variable 51 | for var in in_object.variables: 52 | in_var = in_object.variables[var] 53 | # get the variable metadata 54 | in_var_attrs = { 55 | x: in_var.getncattr(x) for x in in_var.ncattrs() 56 | } 57 | # if the variable does not already exist then create it 58 | if var not in out_object.variables: 59 | # get the subarray shape 60 | shp = in_var.shape 61 | subarray_shape = np.array(shp, 'i') 62 | if len(in_var.dimensions) > 0: 63 | # rejig axis to be unlimited 64 | if len(subarray_shape) > axis_dim_n: 65 | subarray_shape[axis_dim_n] = 0 66 | # create the variable with subarray 67 | out_var = out_object.createVariable( 68 | var, in_var.dtype, in_var.dimensions, 69 | subarray_shape=subarray_shape 70 | ) 71 | else: # no dimensions, just 
a scalar variable 72 | out_var = out_object.createVariable( 73 | var, in_var.dtype 74 | ) 75 | else: 76 | # variable has already been created so get it 77 | out_var = out_object.variables[var] 78 | 79 | # only write partitions for field variables - those with _cfa_var != None 80 | if out_var._cfa_var: 81 | # get the current partition matrix shape 82 | c_shape = out_var._cfa_var.getPartitionMatrixShape() 83 | # create the index to append at the end of the currently used 84 | # indices 85 | n_dims = len(out_var.dimensions) 86 | if n_dims > 0: 87 | index = np.zeros(n_dims, 'i') 88 | index[axis_dim_n] = c_shape[0] 89 | # get the location along the aggregation axis in the Master Array, 90 | # from the axis dimension variable 91 | location = np.zeros([n_dims, 2],'i') 92 | 93 | # check whether the axis is in the dimensions of the input_variable 94 | # and calculate the location from it if it is 95 | if axis in in_var.dimensions: 96 | # get the values of the axis variable 97 | axis_dim_var = in_object.variables[axis] 98 | # if this is a time variable then covert the values to a common 99 | # calendar 100 | if axis_dim_var.name == "time" or axis_dim_var.name[0] == "t": 101 | # get the start date and calendar 102 | axis_dim_values = get_universal_times( 103 | axis_dim_var, common_date 104 | ) 105 | 106 | # get the axis resolution - i.e. the difference for each step 107 | # along the axis 108 | try: 109 | axis_res = (axis_dim_values[-1] - axis_dim_values[0]) / len(axis_dim_values) 110 | except IndexError: 111 | axis_res = 1 112 | # prevent divide by zero 113 | if (axis_res == 0.0): 114 | axis_res = 1.0 115 | # set the location for the aggregating axis dimension 116 | location[axis_dim_n, 0] = int(axis_dim_values[0] / axis_res) 117 | location[axis_dim_n, 1] = location[axis_dim_n, 0] + len(axis_dim_var) 118 | # set the locations for the other dimensions - equal to 0 to the 119 | # shape of the array 120 | for d, dim in enumerate(out_var.dimensions): 121 | # don't redo the above for axis_dim_n 122 | if d != axis_dim_n: 123 | location[d, 0] = 0 124 | location[d, 1] = in_var.shape[d] 125 | else: 126 | for d in range(0, len(in_var.shape)): 127 | location[d, 0] = 0 128 | location[d, 1] = in_var.shape[d] 129 | 130 | # get the datamodel from the parent object 131 | try: 132 | datamodel = out_object._nc_grp.data_model 133 | except (KeyError, AttributeError): 134 | datamodel = out_object._nc_dataset.data_model 135 | 136 | # create the partition for none scalar variables 137 | if len(out_var._cfa_var.getPartitionMatrixShape() != 0): 138 | partition = CFAPartition( 139 | index=tuple(index), 140 | location=location, 141 | ncvar=var, 142 | file=fname, 143 | format=datamodel, 144 | shape=in_var.shape 145 | ) 146 | # write the partition 147 | out_var._cfa_var.writePartition(partition) 148 | # add the attributes to the s3Dataset by updating the dictionary 149 | out_var._cfa_var.metadata.update(in_var_attrs) 150 | else: 151 | # assign the values from the input variable to the output variable 152 | # if it is the axis variable then append / concatenate 153 | if var == axis: 154 | var_vals = in_object.variables[var] 155 | axl = out_var._nc_var.shape[axis_dim_n] 156 | # convert times here as well 157 | out_var[axl:] = get_universal_times(var_vals, common_date) 158 | else: 159 | out_var[:] = in_object.variables[var][:] 160 | # update the in_var_attrs to the new common_date if applicable 161 | if (common_date is not None and 162 | "units" in in_var_attrs and 163 | in_var.name == axis): 164 | in_var_attrs["units"] = 
common_date 165 | out_var.setncatts(in_var_attrs) 166 | 167 | 168 | def create_partitions_from_files(out_dataset, files, axis, 169 | cfa_version, common_date): 170 | """Create the CFA partitions from a list of files.""" 171 | # loop over the files and open as a regular netCDF4 Dataset 172 | for fname in files: 173 | in_dataset = s3Dataset(fname, "r") 174 | # get the global metadata 175 | in_dataset_attrs = { 176 | x: in_dataset.getncattr(x) for x in in_dataset.ncattrs() 177 | } 178 | # add the attributes to the s3Dataset by updating the dictionary 179 | out_dataset._cfa_dataset.metadata.update(in_dataset_attrs) 180 | # loop over the groups 181 | for grp in in_dataset.groups: 182 | in_group = in_dataset[grp] 183 | # create a group if one with this name does not exist 184 | if grp not in out_dataset.groups: 185 | out_group = out_dataset.createGroup(grp) 186 | else: 187 | out_group = out_dataset.groups[grp] 188 | # update the metadata 189 | in_group_attrs = { 190 | x: in_group.getncattr(x) for x in in_group.ncattrs() 191 | } 192 | out_group._cfa_grp.metadata.update(in_group_attrs) 193 | add_var_dims(in_group, out_group, axis, fname, common_date) 194 | 195 | # add the variables in the root group 196 | add_var_dims(in_dataset, out_dataset, axis, fname, common_date) 197 | in_dataset.close() 198 | 199 | 200 | def sort_partition_matrix(out_var, axis): 201 | """Sort the partition matrix for a single variable.""" 202 | # get the index of the axis that we are aggregating over 203 | try: 204 | axis_dim_n = out_var._cfa_var.getPartitionMatrixDimensions().index(axis) 205 | # create the index 206 | n_dims = len(out_var._cfa_var.getDimensions()) 207 | # get the location values from the values 208 | locs = out_var._cfa_var.getPartitionValues(key="location").squeeze() 209 | # get the first (start) location values and get the order to sort them 210 | # in 211 | sort_order = np.argsort(locs[:,axis_dim_n,0]) 212 | # loop over the sort order and write the partition information into 213 | # the new location 214 | # keep a list of partitions 215 | new_parts = [] 216 | for i, s in enumerate(sort_order): 217 | # build the index to get the partition, in the sort order 218 | index = np.zeros(n_dims,'i') 219 | index[axis_dim_n] = s 220 | # get the partition 221 | source_part = out_var._cfa_var.getPartition(index) 222 | # reassign the index 223 | source_part.index[axis_dim_n] = i 224 | # add to the list 225 | new_parts.append(source_part) 226 | 227 | # now rewrite the partitions, and ensure their integrity - i.e. 
make 228 | # sure that the axis partitions are the right length 229 | for p in range(len(new_parts)): 230 | # get the first new partition and the first location - this is the 231 | # offset which we will need to subtract from the other locations 232 | # in the loop as it changes in the loop 233 | axis_offset = new_parts[0].location[axis_dim_n, 0] 234 | 235 | part = new_parts[p] 236 | if p > 0: 237 | # align with previous partition 238 | prev_part = new_parts[p-1] 239 | part.location[axis_dim_n,0] = prev_part.location[axis_dim_n,1] 240 | # make sure end of partition aligns with shape of array 241 | part.location[axis_dim_n,1] = (part.location[axis_dim_n,0] + 242 | part.shape[axis_dim_n]) 243 | part.location[axis_dim_n,:] -= axis_offset 244 | # subtract the offset 245 | out_var._cfa_var.writePartition(part) 246 | 247 | except ValueError: 248 | axis_dim_n = 0 249 | 250 | 251 | def sort_axis_variable(out_object, axis): 252 | # sort the axis variable and write back out to the netCDF object 253 | try: 254 | axis_dim_var = out_object.variables[axis] 255 | axis_dim_var[:] = np.sort(axis_dim_var[:]) 256 | except KeyError: 257 | pass 258 | 259 | 260 | def sort_partition_matrices(out_dataset, axis): 261 | """Sort the partition matrices for all the variables. Sort is based on the 262 | first element of the location.""" 263 | # need to sort all groups 264 | for grp in out_dataset.groups: 265 | out_group = out_dataset.groups[grp] 266 | # need to sort all variables in the group 267 | for var in out_group.variables: 268 | out_var = out_group.variables[var] 269 | if out_var._cfa_var: 270 | sort_partition_matrix(out_var, axis) 271 | 272 | # sort the axis variable in the group 273 | sort_axis_variable(out_group, axis) 274 | 275 | # need to sort all the variables just in the database 276 | for var in out_dataset.variables: 277 | out_var = out_dataset.variables[var] 278 | if out_var._cfa_var: 279 | sort_partition_matrix(out_var, axis) 280 | 281 | # sort the axis variable in the dataset 282 | sort_axis_variable(out_dataset, axis) 283 | 284 | 285 | def get_file_list(path): 286 | """Get a list of files given the path. 287 | The path can be: 288 | a directory 289 | a glob with multiple wildcards 290 | a 'path' on a S3 storage device 291 | """ 292 | # open the directory as a FileManager object 293 | fm = FileManager() 294 | path = os.path.expanduser(path) 295 | request_object = fm.request_file(path) 296 | file_object = request_object.file_object 297 | 298 | # get a list of files using the file object if it is a remote system 299 | if (file_object.remote_system): 300 | # split the url into the scheme, netloc, etc. 
301 | url_o = urlparse(path) 302 | # the alias is the scheme + "://" + netloc 303 | alias = url_o.scheme + "://" + url_o.netloc 304 | # use a paginator to get multiple pages of the objects in the bucket 305 | files = file_object.glob() 306 | # add the alias and bucket to each of the files 307 | bucket = file_object.file_handle._bucket 308 | for i, f in enumerate(files): 309 | files[i] = alias + "/" + bucket + "/" + f 310 | else: 311 | if os.path.isdir(path): 312 | rawfiles = os.listdir(path) 313 | files = [os.path.join(path, f) for f in rawfiles] 314 | # or get a list of files using glob 315 | else: 316 | files = glob(path) 317 | return files 318 | 319 | 320 | def aggregate_into_CFA(output_master_array, path, axis, 321 | cfa_version, common_date=None): 322 | """Aggregate the netCDF files in directory into a CFA master-array file""" 323 | # get the list of files first of all 324 | files = get_file_list(path) 325 | # create the s3Dataset 326 | # create the output master array file 327 | out_dataset = s3Dataset( 328 | output_master_array, 329 | mode='w', 330 | clobber=True, 331 | diskless=False, 332 | cfa_version=cfa_version 333 | ) 334 | # create the partitions from the list - these will be created in the order 335 | # that the files are read in 336 | create_partitions_from_files(out_dataset, files, axis, 337 | cfa_version, common_date) 338 | # we need to sort the partition matrices for each variable - i.e. there is 339 | # one matrix per variable 340 | sort_partition_matrices(out_dataset, axis) 341 | # close the dataset to write / upload it 342 | out_dataset.close() 343 | -------------------------------------------------------------------------------- /S3netCDF4/CFA/Parsers/_CFAnetCDFParser.pyx: -------------------------------------------------------------------------------- 1 | #!python 2 | #cython: language_level=3 3 | 4 | __copyright__ = "(C) 2020 Science and Technology Facilities Council" 5 | __license__ = "BSD - see LICENSE file in top-level directory" 6 | __authors__ = "Neil Massey" 7 | 8 | """ 9 | Parser to read / write CFA metadata from / to a netCDF file. 10 | 11 | See: 12 | http://www.met.reading.ac.uk/~david/cfa/0.4/index.html 13 | for the v0.4 specification of the CFA conventions and an overview of the 14 | CFA conventions 15 | 16 | s3netCDF-python uses an updated version (v0.5) of the CFA conventions which, 17 | rather than writing the partition information to a netCDF attribute as a 18 | string, writes the partition information to variables inside a group. 
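
# Hedged sketch of the calendar rebasing done by get_universal_times() in
# S3netCDF4/utils/agg.py above: time values from files with different base
# dates are converted to datetimes and re-encoded against a single common
# units string, so that partition locations along the time axis are
# comparable. Requires the netCDF4 / cftime packages; the units strings are
# illustrative only.
import numpy as np
from netCDF4 import num2date, date2num

values = np.array([0.0, 1.0, 2.0])        # as read from one sub-file
units = "days since 2000-02-01"           # that file's own time units
calendar = "standard"
common_date = "days since 2000-01-01"     # units shared by the aggregation

dates = num2date(values, units, calendar)
print(date2num(dates, common_date, calendar))   # ~ [31, 32, 33] on the common axis
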
19 | """ 20 | 21 | from S3netCDF4.CFA._CFAExceptions import CFAParserError 22 | from S3netCDF4.CFA._CFAClasses import ( 23 | CFADataset, CFAGroup, CFAVariable, CFADimension 24 | ) 25 | import netCDF4._netCDF4 as netCDF4 26 | import json 27 | 28 | from S3netCDF4.CFA.Parsers._CFAParser import CFA_Parser 29 | 30 | class CFA_netCDFParser(CFA_Parser): 31 | 32 | def __init__(self): 33 | """Do nothing but set the CFA version used, but don't call the base 34 | class as that will raise NotImplementedError""" 35 | self.CFA_conventions = "CFA" 36 | 37 | def is_file(self, nc_dataset): 38 | """Return whether this input nc_dataset has the requisite metadata to 39 | mark it as a CFA file.""" 40 | if not "Conventions" in nc_dataset.ncattrs(): 41 | return False 42 | if not "CFA" in nc_dataset.getncattr("Conventions"): 43 | return False 44 | return True 45 | 46 | def __get_cfa_version(self, nc_dataset): 47 | """Parse the Conventions attribute to get the CFA version.""" 48 | if not "Conventions" in nc_dataset.ncattrs(): 49 | raise CFAParserError("Not a CFA file.") 50 | else: 51 | conventions = nc_dataset.getncattr("Conventions").split(" ") 52 | cfa_version = "0.0" 53 | for c in conventions: 54 | if "CFA-" in c: 55 | cfa_version = c[4:] 56 | if cfa_version == "0.0": 57 | raise CFAParserError("Not a CFA file.") 58 | return cfa_version 59 | 60 | def __create_s3vars_and_dims(self, s3_object, nc_object, cfa_object): 61 | """Consolidate the variables and dimensions in the nc_object (which may 62 | be a dataset or a group) into the s3_object (which may also be a dataset 63 | or a group), matching them up with the variables and dimensions in the 64 | cfa_object (again, which may be a dataset or group) 65 | """ 66 | from S3netCDF4._s3netCDF4 import s3Dimension, s3Variable 67 | # loop over the variables 68 | s3_object._s3_variables = {} # reset to empty 69 | for var in nc_object.variables: 70 | nc_var = nc_object.variables[var] 71 | if var in cfa_object.getVariables(): 72 | cfa_var = cfa_object.getVariable(var) 73 | # create the s3Variable with links to the cfa variable and nc_var 74 | s3_object._s3_variables[var] = s3Variable( 75 | nc_var=nc_var, 76 | cfa_var=cfa_var, 77 | parent=s3_object 78 | ) 79 | 80 | else: 81 | s3_object._s3_variables[var] = nc_var 82 | 83 | # loop over the dimensions 84 | s3_object._s3_dimensions = {} # reset to empty 85 | for dim in nc_object.dimensions: 86 | nc_dim = nc_object.dimensions[dim] 87 | if dim in cfa_object.getDimensions(): 88 | cfa_dim = cfa_object.getDimension(dim) 89 | # set the datatype for the cfa_dim by getting the type from the 90 | # associated variable 91 | if dim in nc_object.variables: 92 | nc_var = nc_object.variables[dim] 93 | cfa_dim.setType(nc_var.dtype) 94 | # create the s3Dimension with links to the cfa dimension and 95 | # nc_dim 96 | s3_object._s3_dimensions[dim] = s3Dimension( 97 | nc_dim=nc_dim, 98 | cfa_dim=cfa_dim, 99 | parent=s3_object 100 | ) 101 | else: 102 | s3_object._s3_dimensions[dim] = nc_dim 103 | 104 | def __consolidate_from_read(self, s3_dataset): 105 | """Consolidate a s3_dataset from the file read in. 106 | s3_dataset contains a netCDF dataset (_nc_dataset). This contains all 107 | the definitions of the variables, dimensions and groups as netCDF4 108 | variables, dimensions and groups. We want to convert these into their 109 | s3 equivalents (s3_variable, s3_dimension and s3_group). 110 | This involves directly manipulating the s3_dataset object. 
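
# Hedged sketch of the Conventions parsing performed by __get_cfa_version()
# above: the attribute is split on spaces and the token starting with "CFA-"
# supplies the version. The attribute value and the ValueError (in place of
# CFAParserError) are illustrative.
def get_cfa_version(conventions_attr):
    cfa_version = "0.0"
    for c in conventions_attr.split(" "):
        if "CFA-" in c:
            cfa_version = c[4:]
    if cfa_version == "0.0":
        raise ValueError("Not a CFA file.")
    return cfa_version

print(get_cfa_version("CF-1.7 CFA-0.5"))   # -> 0.5
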
111 | """ 112 | from S3netCDF4._s3netCDF4 import s3Group 113 | nc_dataset = s3_dataset._nc_dataset 114 | cfa_dataset = s3_dataset._cfa_dataset 115 | 116 | # loop over the variables and dimensions (in the root group) 117 | if "root" in s3_dataset._cfa_dataset.getGroups(): 118 | cfa_grp = s3_dataset._cfa_dataset.getGroup("root") 119 | else: 120 | cfa_grp = s3_dataset._cfa_dataset.createGroup("root") 121 | self.__create_s3vars_and_dims(s3_dataset, nc_dataset, cfa_grp) 122 | 123 | # loop over the groups 124 | for grp in nc_dataset.groups: 125 | nc_grp = nc_dataset.groups[grp] 126 | if grp in cfa_dataset.getGroups(): 127 | cfa_grp = cfa_dataset.getGroup(grp) 128 | # create the s3Group with links to the cfa group and nc_grp 129 | s3_dataset._s3_groups[grp] = s3Group( 130 | cfa_grp=cfa_grp, 131 | nc_grp=nc_grp, 132 | parent=s3_dataset 133 | ) 134 | # create the vars and dims in the group 135 | self.__create_s3vars_and_dims( 136 | s3_dataset._s3_groups[grp], 137 | nc_grp, 138 | cfa_grp 139 | ) 140 | 141 | else: 142 | s3_dataset._s3_groups[grp] = nc_grp 143 | 144 | 145 | def read(self, s3_dataset, filename=""): 146 | """Parse an already open s3_dataset to build the _CFAClasses 147 | hierarchy. 148 | 149 | Args: 150 | netcdf_dataset (Dataset): the open dataset from the netcdf4-python 151 | library. 152 | 153 | Returns: 154 | CFADataset: The CFADataset object, populated with CFAGroups, which 155 | are in turn populated with CFADims and CFAVariables. 156 | """ 157 | # get the netCDF dataset from the s3 dataset 158 | nc_dataset = s3_dataset._nc_dataset 159 | # check this is a CFA file 160 | if not self.is_file(nc_dataset): 161 | raise CFAParserError("Not a CFA file.") 162 | 163 | # get the cfa version so we can interpret it as CFA-0.5 (in netCDF4 164 | # format) or CFA-0.4 (in netCDF3, CLASSIC or netCDF4 format) 165 | cfa_version = self.__get_cfa_version(nc_dataset) 166 | # check to see if there are any groups and, if there is, create a CFAgroup 167 | # and add the nc_group to a dictionary of groups. 
Start with the root 168 | # group pointing to the base Dataset 169 | nc_groups = {"root" : nc_dataset} 170 | if len(nc_dataset.groups) != 0: 171 | for grp_name in nc_dataset.groups: 172 | nc_groups[grp_name] = nc_dataset.groups[grp_name] 173 | 174 | # get the metadata from the dataset in a new dictionary 175 | nc_dataset_md = {a:nc_dataset.getncattr(a) for a in nc_dataset.ncattrs()} 176 | # create the CFADataset, with the metadata and format, and empty groups 177 | cfa_dataset = CFADataset( 178 | name=filename, 179 | format=nc_dataset.data_model, 180 | metadata=nc_dataset_md, 181 | cfa_version=cfa_version 182 | ) 183 | # now loop over all the groups, and add a CFAGroup to each dataset, then 184 | # the CFAVariables and CFADimensions contained in that group 185 | output_groups = {} 186 | for group_name in nc_groups: 187 | nc_group = nc_groups[group_name] 188 | nc_group_md = {a:nc_group.getncattr(a) for a in nc_group.ncattrs()} 189 | cfa_group = cfa_dataset.createGroup(group_name, nc_group_md) 190 | # next parse the dimensions 191 | for nc_dimname in nc_group.dimensions: 192 | # get the dimension 193 | nc_dim = nc_group.dimensions[nc_dimname] 194 | # get the dimension's associated variable 195 | try: 196 | nc_dim_var = nc_group.variables[nc_dimname] 197 | # get the metadata from the dim var 198 | nc_dim_var_md = { 199 | a:nc_dim_var.getncattr(a) for a in nc_dim_var.ncattrs() 200 | } 201 | except KeyError: 202 | nc_dim_var_md = {} 203 | # create the dimension and append to list of cfa_dims 204 | cfa_dim = cfa_group.createDimension( 205 | dim_name=nc_dimname, 206 | dim_len=nc_dim.size, 207 | metadata=nc_dim_var_md 208 | ) 209 | 210 | # loop over the variables in the group / dataset 211 | for nc_varname in nc_group.variables: 212 | nc_var = nc_group.variables[nc_varname] 213 | nc_var_md = {a:nc_var.getncattr(a) for a in nc_var.ncattrs()} 214 | if "cf_role" in nc_var_md: 215 | cfa_var = cfa_group.createVariable( 216 | var_name=nc_varname, 217 | nc_dtype=nc_var.dtype, 218 | metadata=nc_var_md 219 | ) 220 | if cfa_version == "0.4": 221 | # this parses from the 0.4 version - i.e all the 222 | # metadata is stored in the netCDF attributes 223 | cfa_var.parse(nc_var_md) 224 | elif cfa_version == "0.5": 225 | # this parses from the 0.5 version - i.e. all the 226 | # metadata is stored in a variable in a group 227 | cfa_var.load(nc_var_md, nc_group) 228 | else: 229 | raise CFAParserError( 230 | "Unsupported CFA version ({}) in file.".format( 231 | cfa_version 232 | ) 233 | ) 234 | # load the cfa_dataset into the s3_dataset that was passed in 235 | s3_dataset._cfa_dataset = cfa_dataset 236 | # need to "consolidate" the dataset - create s3 variants of the netCDF 237 | # groups, variables and dimensions - call .consolidate_from_read 238 | # on the s3_dataset passed in 239 | self.__consolidate_from_read(s3_dataset) 240 | 241 | def write(self, cfa_dataset, s3_dataset): 242 | """Write the _CFAClasses hierarchy to an already open netcdf_dataset 243 | (opened with 'w' write flag). 244 | 245 | Args: 246 | cfa_dataset (CFADataset): the top class in the _CFAClasses hierarchy 247 | s3_dataset (Dataset): the open dataset from the netcdf4-python 248 | library. Has to have been opened with 'w' flag. 
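        As an illustrative sketch (the variable and dimension names are
        assumptions, and the cf_role string is whatever cfa_var.getRole()
        reports), the attributes attached to a master-array variable are
        roughly:

            # CFA-0.4: partition description serialised into an attribute
            {"cf_role": "...", "cfa_dimensions": "time latitude longitude",
             "cfa_array": "<JSON dump of the partition information>"}
            # CFA-0.5: partition data lives in a group named by an attribute
            {"cf_role": "...", "cfa_dimensions": "time latitude longitude",
             "cfa_group": "cfa_tas"}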
249 | 250 | Returns: 251 | None 252 | """ 253 | # add the CFA conventions into the metadata 254 | dataset_metadata = cfa_dataset.getMetadata() 255 | cfa_version = cfa_dataset.getCFAVersion() 256 | cfa_conventions = self.CFA_conventions + "-{}".format(cfa_version) 257 | # get the underlying netCDF4 dataset 258 | nc_dataset = s3_dataset._nc_dataset 259 | if "Conventions" in dataset_metadata: 260 | dataset_metadata["Conventions"] += " " + cfa_conventions 261 | else: 262 | dataset_metadata["Conventions"] = cfa_conventions 263 | 264 | # set the global metadata 265 | nc_dataset.setncatts(dataset_metadata) 266 | # get the groups 267 | for group in cfa_dataset.getGroups(): 268 | # get the actual group 269 | cfa_group = cfa_dataset.getGroup(group) 270 | if (group == "root"): 271 | s3_group = s3_dataset 272 | nc_group = nc_dataset # just a shortcut to the nc_group 273 | else: 274 | s3_group = s3_dataset.groups[group] 275 | nc_group = s3_group._nc_grp # shortcut 276 | # set the metadata for the group 277 | nc_group.setncatts(cfa_group.getMetadata()) 278 | 279 | # set the metadata for the variables 280 | for var in cfa_group.getVariables(): 281 | # get the actual cfa variable 282 | cfa_var = cfa_group.getVariable(var) 283 | # get the variable 284 | nc_var = s3_group._s3_variables[var]._nc_var 285 | # get the variable metadata 286 | var_md = dict(cfa_var.getMetadata()) 287 | # add the cfa metadata - if it is a cfa variable 288 | if cfa_var.getRole() != "": 289 | var_md['cf_role'] = cfa_var.getRole() 290 | var_md['cfa_dimensions'] = " ".join(cfa_var.getDimensions()) 291 | # if the convention version is >= 0.5 then the data has 292 | # already been written into the cfa metagroup 293 | # for v0.4 we need to dump it into the attribute string 294 | if cfa_version == "0.4": 295 | # write the partition data 296 | var_md['cfa_array'] = json.dumps(cfa_var.dump()['cfa_array']) 297 | elif cfa_version == "0.5": 298 | # just need to name the cfa_metagroup as an attribute in 299 | # the original variable 300 | var_md['cfa_group'] = "cfa_" + var 301 | else: 302 | raise CFAParserError( 303 | "Unsupported CFA version ({}) in file.".format( 304 | cfa_version 305 | ) 306 | ) 307 | # set the metadata for the variable 308 | nc_var.setncatts(var_md) 309 | # set the metadata for the dimension variables 310 | for dim_var in cfa_group.getDimensions(): 311 | # get the actual cfa dimensions 312 | cfa_dim = cfa_group.getDimension(dim_var) 313 | # get the netCDF variable for this dimension 314 | try: 315 | nc_dimvar = s3_group.variables[cfa_dim.getName()]._nc_var 316 | # copy the dimension metadata into the (dimension) variable 317 | # metadata 318 | dim_md = dict(cfa_dim.getMetadata()) 319 | # set the metadata for the variable 320 | nc_dimvar.setncatts(dim_md) 321 | except KeyError: 322 | pass # don't try to write to dimension with no associated 323 | # variable 324 | -------------------------------------------------------------------------------- /S3netCDF4/Backends/_s3FileObject.pyx: -------------------------------------------------------------------------------- 1 | #!python 2 | #cython: language_level=3 3 | 4 | __copyright__ = "(C) 2019-2021 Science and Technology Facilities Council" 5 | __license__ = "BSD - see LICENSE file in top-level directory" 6 | __authors__ = "Neil Massey" 7 | 8 | import io 9 | from fnmatch import fnmatch 10 | from urllib.parse import urlparse 11 | 12 | from botocore.exceptions import ClientError 13 | import botocore.session 14 | 15 | from S3netCDF4.Managers._ConnectionPool import ConnectionPool 16 | 
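# Illustrative usage sketch (not part of this module's code): the file object
# below is normally driven through a with statement.  The endpoint, bucket,
# key and credential values are assumptions; the credential dictionary keys
# ("accessKey" / "secretKey") match those read in connect().
#
#     credentials = {"accessKey": "my-key", "secretKey": "my-secret"}
#     uri = "http://s3.example.com/my-bucket/path/to/object.nc"
#     with s3FileObject(uri, credentials, mode='w') as fo:
#         fo.write(b"some bytes")   # buffered locally, uploaded on flush/close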
from S3netCDF4.Managers._ConfigManager import Config 17 | from S3netCDF4._Exceptions import APIException, IOException 18 | 19 | class s3FileObject(io.BufferedIOBase): 20 | """Custom file object class, inheriting from Python io.Base, to read from 21 | an S3 object store / AWS cloud storage.""" 22 | 23 | """Static connection pool object - i.e. shared across the file objects.""" 24 | _connection_pool = ConnectionPool() 25 | 26 | # The defaults for MAXIMUM_PART_SIZE etc. are now assigned in 27 | # __init__ if no values are found in ~/.s3nc.json 28 | """Static config object for the backend options""" 29 | _config = Config() 30 | 31 | def _get_server_bucket_object(uri): 32 | """Get the server name from the URI""" 33 | # First split the uri into the network location and path, and build the 34 | # server 35 | url_p = urlparse(uri) 36 | # check that the uri contains a scheme and a netloc 37 | if url_p.scheme == '' or url_p.netloc == '': 38 | raise APIException( 39 | "URI supplied to s3FileObject is not well-formed: {}".format(uri) 40 | ) 41 | server = url_p.scheme + "://" + url_p.netloc 42 | split_path = url_p.path.split("/") 43 | # get the bucket 44 | try: 45 | bucket = split_path[1] 46 | except IndexError as e: 47 | raise APIException( 48 | "URI supplied has no bucket contained within it: {}".format(uri) 49 | ) 50 | # get the path 51 | try: 52 | path = "/".join(split_path[2:]) 53 | except IndexError as e: 54 | raise APIException( 55 | "URI supplied has no path contained within it: {}".format(uri) 56 | ) 57 | return server, bucket, path 58 | 59 | def __init__(self, uri, credentials, mode='r', create_bucket=True, 60 | part_size=None, max_parts=None, multipart_upload=None, 61 | multipart_download=None, connect_timeout=None, 62 | read_timeout=None): 63 | """Initialise the file object by creating or reusing a connection in the 64 | connection pool.""" 65 | # get the server, bucket and the key from the endpoint url 66 | self._server, self._bucket, self._path = s3FileObject._get_server_bucket_object(uri) 67 | self._closed = False # set the file to be not closed 68 | self._mode = mode 69 | self._seek_pos = 0 70 | self._buffer = [io.BytesIO()] # have a list of objects that can stream 71 | self._credentials = credentials 72 | self._create_bucket = create_bucket 73 | self._uri = uri 74 | 75 | """Either get the backend config from the parameters, or the config file 76 | or use defaults.""" 77 | if "s3FileObject" in s3FileObject._config["backends"]: 78 | backend_config = s3FileObject._config["backends"]["s3FileObject"] 79 | else: 80 | backend_config = {} 81 | 82 | if part_size: 83 | self._part_size = int(part_size) 84 | elif "maximum_part_size" in backend_config: 85 | self._part_size = int(backend_config["maximum_part_size"]) 86 | else: 87 | self._part_size = int(50 * 1024 * 1024) 88 | 89 | if max_parts: 90 | self._max_parts = int(max_parts) 91 | elif "maximum_parts" in backend_config: 92 | self._max_parts = int(backend_config["maximum_parts"]) 93 | else: 94 | self._max_parts = 8 95 | 96 | if multipart_upload: 97 | self._multipart_upload = multipart_upload 98 | elif "multipart_upload" in backend_config: 99 | self._multipart_upload = backend_config["multipart_upload"] 100 | else: 101 | self._multipart_upload = True 102 | 103 | if multipart_download: 104 | self._multipart_download = multipart_download 105 | elif "multipart_download" in backend_config: 106 | self._multipart_download = backend_config["multipart_download"] 107 | else: 108 | self._multipart_download = True 109 | 110 | if connect_timeout: 111 | 
self._connect_timeout = connect_timeout 112 | elif "connect_timeout" in backend_config: 113 | self._connect_timeout = backend_config["connect_timeout"] 114 | else: 115 | self._connect_timeout = 30.0 116 | 117 | if read_timeout: 118 | self._read_timeout = read_timeout 119 | elif "read_timeout" in backend_config: 120 | self._read_timeout = backend_config["read_timeout"] 121 | else: 122 | self._read_timeout = 30.0 123 | 124 | def __enter__(self): 125 | """Create the connection on an enter.""" 126 | self.connect() 127 | return self 128 | 129 | def __exit__(self, exc_type, exc_value, exc_tb): 130 | """Close the file on the exit of a with statement, or by the garbage 131 | collector removing the object.""" 132 | self.close() 133 | # check for any exceptions 134 | if exc_type is not None: 135 | return False 136 | return True 137 | 138 | def _getsize(self): 139 | # Use content length in the head object to determine the size of 140 | # the file / object 141 | # If we are writing then the size should be the buffer size 142 | try: 143 | if 'w' in self._mode: 144 | size = self._part_size 145 | else: 146 | response = self._conn_obj.conn.head_object( 147 | Bucket=self._bucket, 148 | Key=self._path 149 | ) 150 | size = response['ContentLength'] 151 | except ClientError as e: 152 | raise IOException( 153 | "Could not get size of object {}".format(self._path) 154 | ) 155 | except AttributeError as e: 156 | self._handle_connection_exception(e) 157 | return size 158 | 159 | def _get_bucket_list(self): 160 | # get the names of the buckets in a list 161 | try: 162 | bl = self._conn_obj.conn.list_buckets()['Buckets'] # this returns a dict 163 | bucket_list = [b['Name'] for b in bl] 164 | except AttributeError as e: 165 | self._handle_connection_exception(e) 166 | return bucket_list 167 | 168 | def _handle_connection_exception(self, e): 169 | # Check if connection made 170 | if ("_conn_obj" in e.args[0] or "_current_part" in e.args[0]): 171 | raise APIException( 172 | "Connection to S3 server is not established. Use either the " 173 | ".connect method or a with statement." 174 | ) 175 | else: 176 | # other AttributeError - handle that separately 177 | raise e 178 | 179 | def connect(self): 180 | """Connect to the s3 server with the details passed in via the __init__ 181 | method.""" 182 | # if the connection returns None then either there isn't a connection to 183 | # the server in the pool, or there is no connection that is available 184 | self._conn_obj = s3FileObject._connection_pool.get(self._server) 185 | if self._conn_obj is None: 186 | try: 187 | session = botocore.session.get_session() 188 | config = botocore.config.Config( 189 | connect_timeout=self._connect_timeout, 190 | read_timeout=self._connect_timeout 191 | ) 192 | s3c = session.create_client( 193 | "s3", 194 | endpoint_url=self._server, 195 | aws_access_key_id=self._credentials["accessKey"], 196 | aws_secret_access_key=self._credentials["secretKey"], 197 | config=config 198 | ) 199 | # add the connection to the connection pool 200 | self._conn_obj = s3FileObject._connection_pool.add( 201 | s3c, self._server 202 | ) 203 | except ClientError as e: 204 | raise IOException( 205 | "Could not connect to S3 endpoint {} {}".format( 206 | self._server, e) 207 | ) 208 | if ('r' in self._mode and '*' not in self._path and 209 | '?' 
not in self._path): 210 | # if this is a read method then check the file exists 211 | response = self._conn_obj.conn.list_objects_v2( 212 | Bucket=self._bucket, 213 | Prefix=self._path 214 | ) 215 | exists = False 216 | for obj in response.get('Contents', []): 217 | if obj['Key'] == self._path: 218 | exists = True 219 | break 220 | if not exists: 221 | raise IOException( 222 | "Object does not exist: {}/{}/{}".format( 223 | self._server, self._bucket, self._path 224 | ) 225 | ) 226 | if 'w' in self._mode: 227 | # if this is a write method then create a bytes array 228 | self._current_part = 1 229 | if 'a' in self._mode or '+' in self._mode: 230 | raise APIException( 231 | "Appending to files is not supported {}".format(self._path) 232 | ) 233 | return True 234 | 235 | def detach(self): 236 | """Separate the underlying raw stream from the buffer and return it. 237 | Not supported in S3.""" 238 | raise io.UnsupportedOperation 239 | 240 | def read(self, size=-1): 241 | """Read and return up to size bytes. For the S3 implementation the size 242 | can be used for RangeGet. If size==-1 then the whole object is streamed 243 | into memory.""" 244 | # read the object using the bucket and path already determined in 245 | # __init__, and using the connection object 246 | try: 247 | if size== -1: 248 | s3_object = self._conn_obj.conn.get_object( 249 | Bucket = self._bucket, 250 | Key = self._path 251 | ) 252 | body = s3_object['Body'] 253 | else: 254 | # do the partial / range get version, and increment the seek 255 | # pointer 256 | range_end = self._seek_pos+size-1 257 | file_size = self._getsize() 258 | if range_end >= file_size: 259 | range_end = file_size-1 260 | 261 | if not self._multipart_download: 262 | s3_object = self._conn_obj.conn.get_object( 263 | Bucket = self._bucket, 264 | Key = self._path, 265 | ) 266 | body = s3_object['Body'] 267 | else: 268 | s3_object = self._conn_obj.conn.get_object( 269 | Bucket = self._bucket, 270 | Key = self._path, 271 | Range = 'bytes={}-{}'.format( 272 | self._seek_pos, range_end 273 | ) 274 | ) 275 | self._seek_pos += size 276 | body = s3_object['Body'] 277 | except ClientError as e: 278 | raise IOException( 279 | "Could not read from object {} {}".format(self._path, e) 280 | ) 281 | except AttributeError as e: 282 | self._handle_connection_exception(e) 283 | return body.read() 284 | 285 | def read1(self, size=-1): 286 | """Just call read.""" 287 | return self.read(size=size) 288 | 289 | def readinto(self, b): 290 | """Read bytes into a pre-allocated, writable bytes-like object b and 291 | return the number of bytes read. 292 | In S3 the entire file is read into the bytesbuffer. It is important 293 | that the bytesbuffer is big enough to hold the entire file.""" 294 | # get the size of the file 295 | size = self._getsize() 296 | b[:size] = self.read(size) 297 | return size 298 | 299 | def readinto1(self, b): 300 | """Just call readinto""" 301 | return self.readinto(b) 302 | 303 | def _multipart_upload_from_buffer(self): 304 | """Do a multipart upload from the buffer. 305 | There are two cases: 306 | 1. The size is exactly the same size as the self._part_size 307 | 2. 
The size is greater than the self._part_size 308 | """ 309 | # check to see if bucket needs to be created 310 | if self._create_bucket: 311 | # check whether the bucket exists 312 | bucket_list = self._get_bucket_list() 313 | if not self._bucket in bucket_list: 314 | self._conn_obj.conn.create_bucket(Bucket=self._bucket) 315 | 316 | # if the current part is 1 we have to create the multipart upload 317 | if self._current_part == 1: 318 | response = self._conn_obj.conn.create_multipart_upload( 319 | Bucket = self._bucket, 320 | Key = self._path 321 | ) 322 | self._upload_id = response['UploadId'] 323 | # we need to keep a track of the multipart info 324 | self._multipart_info = {'Parts' : []} 325 | 326 | # upload from a buffer - do we need to split into more than one 327 | # multiparts? Remember: self._buffer is a list of BytesIO objects 328 | new_buffer = [] 329 | for buffer_part in range(0, len(self._buffer)): 330 | # is the current part of the buffer larger than the maximum 331 | # upload size? split if it is 332 | data_buf = self._buffer[buffer_part] 333 | data_len = data_buf.tell() 334 | if data_len >= self._part_size: 335 | data_buf.seek(0) 336 | data_pos = 0 337 | # split the file up 338 | while data_pos < data_len: 339 | new_buffer.append(io.BytesIO()) 340 | # copy the data - don't overstep the buffer 341 | if data_pos + self._part_size >= data_len: 342 | sub_data = data_buf.read(data_len-data_pos) 343 | else: 344 | sub_data = data_buf.read(self._part_size) 345 | new_buffer[-1].write(sub_data) 346 | # increment to next 347 | data_pos += self._part_size 348 | 349 | # free the old memory 350 | self._buffer[buffer_part].close() 351 | else: 352 | self._buffer[buffer_part].seek(0) 353 | new_buffer.append(io.BytesIO(self._buffer[buffer_part].read())) 354 | 355 | self._buffer = new_buffer 356 | 357 | for buffer_part in range(0, len(self._buffer)): 358 | # seek in the BytesIO buffer to get to the beginning after the 359 | # writing§ 360 | self._buffer[buffer_part].seek(0) 361 | # upload here 362 | part = self._conn_obj.conn.upload_part( 363 | Bucket=self._bucket, 364 | Key=self._path, 365 | UploadId=self._upload_id, 366 | PartNumber=self._current_part, 367 | Body=self._buffer[buffer_part] 368 | ) 369 | # insert into the multipart info list of dictionaries 370 | self._multipart_info['Parts'].append( 371 | { 372 | 'PartNumber' : self._current_part, 373 | 'ETag' : part['ETag'] 374 | } 375 | ) 376 | self._current_part += 1 377 | 378 | # reset all the byte buffers and their positions 379 | for buffer_part in range(0, len(self._buffer)): 380 | self._buffer[buffer_part].close() 381 | self._buffer = [io.BytesIO()] 382 | self._seek_pos = 0 383 | self._current_part += 1 384 | 385 | def write(self, b): 386 | """Write the given bytes-like object, b, and return the number of bytes 387 | written (always equal to the length of b in bytes, since if the write 388 | fails an OSError will be raised). 389 | For the S3 file object we just write the file to a temporary bytearray 390 | and increment the seek_pos. 391 | This data will be uploaded to an object when .flush is called. 392 | """ 393 | if "w" not in self._mode: 394 | raise APIException( 395 | "Trying to write to a read only file, where mode != 'w'." 396 | ) 397 | try: 398 | # add to local, temporary bytearray 399 | size = len(b) 400 | self._buffer[-1].write(b) 401 | self._seek_pos += size 402 | # test to see whether we should do a multipart upload now 403 | # this occurs when the number of buffers is > the maximum number of 404 | # parts. 
self._current_part is indexed from 1 405 | if (self._multipart_upload and 406 | self._seek_pos > self._part_size): 407 | if len(self._buffer) == self._max_parts: 408 | self._multipart_upload_from_buffer() 409 | else: 410 | # add another buffer to write to 411 | self._buffer.append(io.BytesIO()) 412 | 413 | except ClientError as e: 414 | raise IOException( 415 | "Could not write to object {} {}".format(self._path, e) 416 | ) 417 | except AttributeError as e: 418 | self._handle_connection_exception(e) 419 | 420 | return size 421 | 422 | def close(self): 423 | """Flush and close this stream. This method has no effect if the file is 424 | already closed. Once the file is closed, any operation on the file (e.g. 425 | reading or writing) will raise a ValueError. 426 | 427 | As a convenience, it is allowed to call this method more than once; only 428 | the first call, however, will have an effect.""" 429 | try: 430 | if not self._closed: 431 | # self.flush will upload the buffer to the S3 store 432 | self.flush() 433 | s3FileObject._connection_pool.release(self._conn_obj) 434 | self._closed = True 435 | except AttributeError as e: 436 | self._handle_connection_exception(e) 437 | return True 438 | 439 | def seek(self, offset, whence=io.SEEK_SET): 440 | """Change the stream position to the given byte offset. offset is 441 | interpreted relative to the position indicated by whence. The default 442 | value for whence is SEEK_SET. Values for whence are: 443 | 444 | SEEK_SET or 0 – start of the stream (the default); offset should be zero 445 | or positive 446 | SEEK_CUR or 1 – current stream position; offset may be negative 447 | SEEK_END or 2 – end of the stream; offset is usually negative 448 | Return the new absolute position. 449 | 450 | Note: currently cannot seek when writing a file. 451 | 452 | """ 453 | if self._mode == 'w': 454 | raise IOException( 455 | "Cannot seek within a file that is being written to." 456 | ) 457 | 458 | size = self._getsize() 459 | error_string = "Seek {} is outside file size bounds 0->{} for file {}" 460 | seek_pos = self._seek_pos 461 | if whence == io.SEEK_SET: 462 | # range check 463 | seek_pos = offset 464 | elif whence == io.SEEK_CUR: 465 | seek_pos += offset 466 | elif whence == io.SEEK_END: 467 | seek_pos = size - offset 468 | 469 | # range checks 470 | if (seek_pos >= size): 471 | raise IOException(error_string.format( 472 | seek_pos, 473 | size, 474 | self._path) 475 | ) 476 | elif (seek_pos < 0): 477 | raise IOException(error_string.format( 478 | seek_pos, 479 | size, 480 | self._path) 481 | ) 482 | self._seek_pos = seek_pos 483 | return self._seek_pos 484 | 485 | def seekable(self): 486 | """We can seek in s3 streams using the range get and range put features. 487 | """ 488 | return True 489 | 490 | def tell(self): 491 | """Return True if the stream supports random access. If False, seek(), 492 | tell() and truncate() will raise OSError.""" 493 | return self._seek_pos 494 | 495 | def fileno(self): 496 | """Return the underlying file descriptor (an integer) of the stream if 497 | it exists. An IOError is raised if the IO object does not use a file 498 | descriptor.""" 499 | raise io.UnsupportedOperation 500 | 501 | def flush(self): 502 | """Flush the write buffers of the stream. 
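        (Illustrative aside on the read path of this class, since seek() above
        only moves the internal pointer: a subsequent read(size) issues a
        ranged GET.  The uri, credentials and offsets below are assumptions.)

            credentials = {"accessKey": "my-key", "secretKey": "my-secret"}
            uri = "http://s3.example.com/my-bucket/path/to/object.nc"
            with s3FileObject(uri, credentials, mode='r') as fo:
                fo.seek(1024)          # no request made; only _seek_pos moves
                chunk = fo.read(512)   # GET with Range: bytes=1024-1535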
This will upload the contents 503 | of the final multipart upload of self._buffer to the S3 store.""" 504 | try: 505 | if 'w' in self._mode: 506 | # if the size is less than the MAXIMUM UPLOAD SIZE 507 | # then just write the data 508 | size = self._buffer[0].tell() 509 | if self._current_part == 1 and size < self._part_size: 510 | if self._create_bucket: 511 | # check whether the bucket exists and create if not 512 | bucket_list = self._get_bucket_list() 513 | if not self._bucket in bucket_list: 514 | self._conn_obj.conn.create_bucket( 515 | Bucket=self._bucket 516 | ) 517 | # upload the whole buffer - seek back to the start first 518 | self._buffer[0].seek(0) 519 | self._conn_obj.conn.put_object( 520 | Bucket=self._bucket, 521 | Key=self._path, 522 | Body=self._buffer[0].read(size) 523 | ) 524 | else: 525 | # upload as multipart 526 | self._multipart_upload_from_buffer() 527 | # finalise the multipart upload 528 | self._conn_obj.conn.complete_multipart_upload( 529 | Bucket=self._bucket, 530 | Key=self._path, 531 | UploadId=self._upload_id, 532 | MultipartUpload=self._multipart_info 533 | ) 534 | except AttributeError as e: 535 | self._handle_connection_exception(e) 536 | return True 537 | 538 | def readable(self): 539 | """Return True if the stream can be read from. If False, read() will 540 | raise IOError.""" 541 | return 'r' in self._mode or '+' in self._mode 542 | 543 | def readline(self, size=-1): 544 | """Read and return one line from the stream. 545 | If size is specified, at most size bytes will be read.""" 546 | if 'b' in self._mode: 547 | raise APIException( 548 | "readline on a binary file is not permitted: {}".format( 549 | self._uri) 550 | ) 551 | # only read a set number of bytes if size is passed in, otherwise 552 | # read upto the file size 553 | if size == -1: 554 | size = self._getsize() 555 | 556 | # use the BytesIO readline methods 557 | if self.tell() == 0: 558 | buffer = self.read(size=size) 559 | self._buffer[-1].write(buffer) 560 | self._buffer[-1].seek(0) 561 | 562 | line = self._buffer[-1].readline().decode().strip() 563 | return line 564 | 565 | def readlines(self, hint=-1): 566 | """Read and return a list of lines from the stream. hint can be 567 | specified to control the number of lines read: no more lines will be 568 | read if the total size (in bytes/characters) of all lines so far exceeds 569 | hint.""" 570 | if 'b' in self._mode: 571 | raise APIException( 572 | "readline on a binary file is not permitted: {}".format( 573 | self._uri) 574 | ) 575 | # read the entire file in and decode it 576 | lines = self.read().decode().split("\n") 577 | return lines 578 | 579 | def truncate(self, size=None): 580 | """Not supported""" 581 | raise io.UnsupportedOperation 582 | 583 | def writable(self): 584 | """Return True if the stream supports writing. If False, write() and 585 | truncate() will raise IOError.""" 586 | return 'w' in self._mode 587 | 588 | def writelines(self, lines): 589 | """Write a list of lines to the stream.""" 590 | # first check if the file is binary or not 591 | if 'b' in self._mode: 592 | raise APIException( 593 | "writelines on a binary file is not permitted: {}".format( 594 | self._uri) 595 | ) 596 | # write all but the last line with a line break 597 | for l in lines: 598 | self.write((l+"\n").encode('utf-8')) 599 | return True 600 | 601 | def glob(self): 602 | """Emulate glob on an open bucket. 
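        (A hedged sketch of the prefix / wildcard split performed below; the
        key pattern is an assumption.)

            from fnmatch import fnmatch
            path = "subarrays/temp_*.nc"                # assumed key pattern
            pi = min(path.index("*") if "*" in path else len(path),
                     path.index("?") if "?" in path else len(path))
            prefix, wildcard = path[:pi], path[pi:]     # "subarrays/temp_", "*.nc"
            fnmatch("subarrays/temp_000.nc", wildcard)  # True: key would be kept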
The glob has been passed in via 603 | self._path, created on connection to the server and bucket.""" 604 | # get the path string up to the wildcards 605 | try: 606 | pi1 = self._path.index("*") 607 | except ValueError: 608 | pi1 = len(self._path) 609 | try: 610 | pi2 = self._path.index("?") 611 | except ValueError: 612 | pi2 = len(self._path) 613 | pi = min(pi1, pi2) 614 | # using the prefix will cut down on the search space 615 | prefix = self._path[:pi] 616 | # get the wildcard 617 | wildcard = self._path[pi:] 618 | # set up the paginator 619 | paginator = self._conn_obj.conn.get_paginator("list_objects_v2") 620 | parameters = { 621 | 'Bucket': self._bucket, 622 | 'Prefix': prefix 623 | } 624 | page_iterator = paginator.paginate(**parameters) 625 | files = [] 626 | for page in page_iterator: 627 | for item in page.get('Contents', []): 628 | fname = item['Key'] 629 | # check that it matches against wildcard 630 | if fnmatch(fname, wildcard): 631 | files.append(item['Key']) 632 | return files 633 | -------------------------------------------------------------------------------- /S3netCDF4/Backends/_s3aioFileObject.pyx: -------------------------------------------------------------------------------- 1 | #!python 2 | #cython: language_level=3 3 | 4 | __copyright__ = "(C) 2019-2021 Science and Technology Facilities Council" 5 | __license__ = "BSD - see LICENSE file in top-level directory" 6 | __authors__ = "Neil Massey" 7 | 8 | import io 9 | from fnmatch import fnmatch 10 | from urllib.parse import urlparse 11 | 12 | import asyncio 13 | import aiobotocore 14 | from botocore.exceptions import ClientError 15 | import botocore.config 16 | 17 | from S3netCDF4.Managers._ConnectionPool import ConnectionPool, ConnectionObject 18 | from S3netCDF4.Managers._ConfigManager import Config 19 | from S3netCDF4._Exceptions import APIException, IOException 20 | 21 | class s3aioFileObject(object): 22 | """Custom file object class, inheriting from Python io.Base, to read from 23 | an S3 object store / AWS cloud storage.""" 24 | 25 | """Static connection pool object - i.e. shared across the file objects.""" 26 | _connection_pool = ConnectionPool() 27 | 28 | # The defaults for MAXIMUM_PART_SIZE etc. are now assigned in 29 | # __init__ if no values are found in ~/.s3nc.json 30 | """Static config object for the backend options""" 31 | _config = Config() 32 | 33 | def _get_server_bucket_object(uri): 34 | """Get the server name from the URI""" 35 | # First split the uri into the network location and path, and build the 36 | # server 37 | url_p = urlparse(uri) 38 | # check that the uri contains a scheme and a netloc 39 | if url_p.scheme == '' or url_p.netloc == '': 40 | raise APIException( 41 | "URI supplied to s3aioFileObject is not well-formed: {}". 
format(uri) 42 | ) 43 | server = url_p.scheme + "://" + url_p.netloc 44 | split_path = url_p.path.split("/") 45 | # get the bucket 46 | try: 47 | bucket = split_path[1] 48 | except IndexError as e: 49 | raise APIException( 50 | "URI supplied has no bucket contained within it: {}".format(uri) 51 | ) 52 | # get the path 53 | try: 54 | path = "/".join(split_path[2:]) 55 | except IndexError as e: 56 | raise APIException( 57 | "URI supplied has no path contained within it: {}".format(uri) 58 | ) 59 | return server, bucket, path 60 | 61 | def __init__(self, uri, credentials, mode='r', create_bucket=True, 62 | part_size=None, max_parts=None, multipart_upload=None, 63 | multipart_download=None, connect_timeout=None, 64 | read_timeout=None): 65 | """Initialise the file object by creating or reusing a connection in the 66 | connection pool.""" 67 | # get the server, bucket and the key from the endpoint url 68 | self._server, self._bucket, self._path = s3aioFileObject._get_server_bucket_object(uri) 69 | self._closed = False # set the file to be not closed 70 | self._mode = mode 71 | self._seek_pos = 0 72 | self._buffer = [io.BytesIO()] # have a list of objects that can stream 73 | self._credentials = credentials 74 | self._create_bucket = create_bucket 75 | self._uri = uri 76 | 77 | """Either get the backend config from the parameters, or the config file 78 | or use defaults.""" 79 | if "s3aioFileObject" in s3aioFileObject._config["backends"]: 80 | backend_config = s3aioFileObject._config["backends"]["s3aioFileObject"] 81 | else: 82 | backend_config = {} 83 | 84 | if part_size: 85 | self._part_size = int(part_size) 86 | elif "maximum_part_size" in backend_config: 87 | self._part_size = int(backend_config["maximum_part_size"]) 88 | else: 89 | self._part_size = int(50 * 1024 * 1024) 90 | 91 | if max_parts: 92 | self._max_parts = int(max_parts) 93 | elif "maximum_parts" in backend_config: 94 | self._max_parts = int(backend_config["maximum_parts"]) 95 | else: 96 | self._max_parts = 8 97 | 98 | if multipart_upload: 99 | self._multipart_upload = multipart_upload 100 | elif "multipart_upload" in backend_config: 101 | self._multipart_upload = backend_config["multipart_upload"] 102 | else: 103 | self._multipart_upload = True 104 | 105 | if multipart_download: 106 | self._multipart_download = multipart_download 107 | elif "multipart_download" in backend_config: 108 | self._multipart_download = backend_config["multipart_download"] 109 | else: 110 | self._multipart_download = True 111 | 112 | if connect_timeout: 113 | self._connect_timeout = connect_timeout 114 | elif "connect_timeout" in backend_config: 115 | self._connect_timeout = backend_config["connect_timeout"] 116 | else: 117 | self._connect_timeout = 30.0 118 | 119 | if read_timeout: 120 | self._read_timeout = read_timeout 121 | elif "read_timeout" in backend_config: 122 | self._read_timeout = backend_config["read_timeout"] 123 | else: 124 | self._read_timeout = 30.0 125 | 126 | async def __aenter__(self): 127 | """Async version of the enter context method.""" 128 | await self.connect() 129 | return self 130 | 131 | async def __aexit__(self, exc_type, exc_value, exc_tb): 132 | """Close the file on the exit of a with statement, or by the garbage 133 | collector removing the object.""" 134 | await self.close() 135 | # check for any exceptions 136 | if exc_type is not None: 137 | return False 138 | return True 139 | 140 | async def _getsize(self): 141 | # Use content length in the head object to determine how the size of 142 | # the file / object 143 | # If 
we are writing then the size should be the buffer size 144 | try: 145 | if 'w' in self._mode: 146 | size = self._part_size 147 | else: 148 | response = await self._conn_obj.conn.head_object( 149 | Bucket=self._bucket, 150 | Key=self._path 151 | ) 152 | size = response['ContentLength'] 153 | except ClientError as e: 154 | raise IOException( 155 | "Could not get size of object {}".format(self._path) 156 | ) 157 | except AttributeError as e: 158 | self._handle_connection_exception(e) 159 | return size 160 | 161 | async def _get_bucket_list(self): 162 | # get the names of the buckets in a list 163 | try: 164 | bl = await self._conn_obj.conn.list_buckets() 165 | bucket_list = [b['Name'] for b in bl['Buckets']] 166 | except AttributeError as e: 167 | self._handle_connection_exception(e) 168 | return bucket_list 169 | 170 | def _handle_connection_exception(self, e): 171 | # Check if connection made 172 | if ("_conn_obj" in e.args[0] or "_current_part" in e.args[0]): 173 | raise APIException( 174 | "Connection to S3 server is not established. Use either the " 175 | ".connect method or a with statement." 176 | ) 177 | else: 178 | # other AttributeError - handle that separately 179 | raise e 180 | 181 | async def connect(self): 182 | """Connect to the s3 server with the details passed in via the __init__ 183 | method.""" 184 | # if the connection returns None then either there isn't a connection to 185 | # the server in the pool, or there is no connection that is available 186 | self._conn_obj = s3aioFileObject._connection_pool.get(self._server) 187 | if self._conn_obj is None: 188 | try: 189 | session = aiobotocore.get_session() 190 | config = botocore.config.Config( 191 | connect_timeout=self._connect_timeout, 192 | read_timeout=self._read_timeout 193 | ) 194 | s3c = session.create_client( 195 | "s3", 196 | endpoint_url=self._server, 197 | aws_access_key_id=self._credentials["accessKey"], 198 | aws_secret_access_key=self._credentials["secretKey"], 199 | config=config 200 | ) 201 | # call await s3c.__aenter__ : this is needed for newer versions 202 | # of aiobotocore 203 | s3c = await s3c.__aenter__() 204 | # add the connection to the connection pool 205 | self._conn_obj = s3aioFileObject._connection_pool.add( 206 | s3c, self._server 207 | ) 208 | except ClientError as e: 209 | raise IOException( 210 | "Could not connect to S3 endpoint {} {}".format( 211 | self._server, e) 212 | ) 213 | 214 | if ('r' in self._mode and '*' not in self._path and 215 | '?' not in self._path): 216 | # if this is a read method then check the file exists 217 | response = await self._conn_obj.conn.list_objects_v2( 218 | Bucket=self._bucket, 219 | Prefix=self._path 220 | ) 221 | exists = False 222 | for obj in response.get('Contents', []): 223 | if obj['Key'] == self._path: 224 | exists = True 225 | if not exists: 226 | raise IOException( 227 | "Object does not exist: {}/{}/{}".format( 228 | self._server, self._bucket, self._path 229 | ) 230 | ) 231 | if 'w' in self._mode: 232 | # if this is a write method then create a bytes array 233 | self._current_part = 1 234 | if 'a' in self._mode or '+' in self._mode: 235 | raise APIException( 236 | "Appending to files is not supported {}".format(self._path) 237 | ) 238 | return True 239 | 240 | def detach(self): 241 | """Separate the underlying raw stream from the buffer and return it. 
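        (Illustrative aside on driving this class as a whole, placed here for
        reference; the endpoint, bucket, key and credentials are assumptions.)

            import asyncio

            async def fetch():
                credentials = {"accessKey": "my-key", "secretKey": "my-secret"}
                uri = "http://s3.example.com/my-bucket/path/to/object.nc"
                async with s3aioFileObject(uri, credentials, mode='r') as fo:
                    # objects larger than the configured part size are fetched
                    # as several concurrent ranged GETs and reassembled
                    return await fo.read()

            data = asyncio.run(fetch())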
242 | Not supported in S3.""" 243 | raise io.UnsupportedOperation 244 | 245 | async def _read_partial_file(self, part_num, part_size): 246 | s = int(part_num*part_size) 247 | e = int((part_num+1)*part_size)-1 248 | range_fmt = 'bytes={}-{}'.format(s,e) 249 | s3_object = await self._conn_obj.conn.get_object( 250 | Bucket = self._bucket, 251 | Key = self._path, 252 | Range = range_fmt 253 | ) 254 | body = s3_object['Body'] 255 | return await body.read() 256 | 257 | async def read(self, size=-1): 258 | """Read and return up to size bytes. For the S3 implementation the size 259 | can be used for RangeGet. If size==-1 then the whole object is streamed 260 | into memory.""" 261 | # read the object using the bucket and path already determined in 262 | # __init__, and using the connection object 263 | try: 264 | # get the file size first 265 | file_size = await self._getsize() 266 | if size== -1: 267 | range_start = 0 268 | range_end = file_size 269 | range_size = file_size 270 | else: 271 | range_start = self._seek_pos 272 | range_end = self._seek_pos+size-1 273 | if range_end > file_size: 274 | range_end = file_size-1 275 | range_size = range_end-range_start+1 276 | 277 | # if multipart download is not supported 278 | if not self._multipart_download: 279 | # get the full file 280 | s3_object = await self._conn_obj.conn.get_object( 281 | Bucket = self._bucket, 282 | Key = self._path, 283 | ) 284 | body = s3_object['Body'] 285 | data = await body.read() 286 | # if the file is smaller than the MAXIMUM_PART_SIZE 287 | elif (range_size < self._part_size): 288 | # the requested range is the full file, it is fastest to 289 | # not specify the range 290 | if (range_start == 0 and range_size == file_size): 291 | # get the full file 292 | s3_object = await self._conn_obj.conn.get_object( 293 | Bucket = self._bucket, 294 | Key = self._path, 295 | ) 296 | # a portion of the file is requested 297 | else: 298 | s3_object = await self._conn_obj.conn.get_object( 299 | Bucket = self._bucket, 300 | Key = self._path, 301 | Range = 'bytes={}-{}'.format( 302 | range_start, range_end 303 | ) 304 | ) 305 | body = s3_object['Body'] 306 | data = await body.read() 307 | # multipart download version 308 | else: 309 | """Use range get to split up a file into the MAXIMUM_PART_SIZE 310 | and download each part asynchronously.""" 311 | # calculate the number of necessary parts 312 | n_parts = int(range_size / self._part_size + 1) 313 | # don't go above the maximum number downloadable 314 | if n_parts > self._max_parts: 315 | n_parts = self._max_parts 316 | # (re)calculate the download size 317 | part_size = float(range_size) / n_parts 318 | # create the tasks and assign the return data buffer 319 | tasks = [] 320 | data_buf = io.BytesIO() 321 | 322 | for p in range(0, n_parts): 323 | event_loop = asyncio.get_event_loop() 324 | task = event_loop.create_task(self._read_partial_file( 325 | p, part_size 326 | )) 327 | tasks.append(task) 328 | # wait for all the tasks to finish 329 | results = await asyncio.gather(*tasks) 330 | # read each chunk of data and write into the global buffer 331 | for r in results: 332 | data_buf.write(r) 333 | r = None # indicate ready for garbage collection 334 | data_buf.seek(0) 335 | data = data_buf.read() 336 | 337 | except ClientError as e: 338 | raise IOException( 339 | "Could not read from object {} {}".format(self._path, e) 340 | ) 341 | except AttributeError as e: 342 | self._handle_connection_exception(e) 343 | return data 344 | 345 | async def read1(self, size=-1): 346 | """Just call 
read.""" 347 | return await self.read(size=size) 348 | 349 | async def readinto(self, b): 350 | """Read bytes into a pre-allocated, writable bytes-like object b and 351 | return the number of bytes read. 352 | In S3 the entire file is read into the bytesbuffer. It is important 353 | that the bytesbuffer is big enough to hold the entire file.""" 354 | # get the size of the file 355 | size = await self._getsize() 356 | b[:size] = await self.read(size) 357 | return size 358 | 359 | async def readinto1(self, b): 360 | """Just call readinto""" 361 | return await self.readinto(b) 362 | 363 | async def _multipart_upload_from_buffer(self): 364 | """Do a multipart upload from the buffer. 365 | There are three cases: 366 | 1. The size is exactly the same size as the MAXIMUM_PART_SIZE 367 | 2. The size is greater than the MAXIMUM_PART_SIZE 368 | 3. The size is multiple times greater than the MAX_UPLOAD_SIZE and 369 | requires splitting into smaller chunks 370 | """ 371 | # check to see if bucket needs to be created 372 | if self._create_bucket: 373 | # check whether the bucket exists 374 | bucket_list = await self._get_bucket_list() 375 | if not self._bucket in bucket_list: 376 | await self._conn_obj.conn.create_bucket(Bucket=self._bucket) 377 | 378 | # if the current part is 1 we have to create the multipart upload 379 | if self._current_part == 1: 380 | response = await self._conn_obj.conn.create_multipart_upload( 381 | Bucket = self._bucket, 382 | Key = self._path 383 | ) 384 | self._upload_id = response['UploadId'] 385 | # we need to keep a track of the multipart info 386 | self._multipart_info = {'Parts' : []} 387 | 388 | # upload from a buffer - do we need to split into more than one 389 | # multiparts? 390 | new_buffer = [] 391 | for buffer_part in range(0, len(self._buffer)): 392 | # is the current part of the buffer larger than the maximum 393 | # upload size? 
split if it is 394 | data_buf = self._buffer[buffer_part] 395 | data_len = data_buf.tell() 396 | if data_len >= self._part_size: 397 | data_buf.seek(0) 398 | data_pos = 0 399 | # split the file up 400 | while data_pos < data_len: 401 | new_buffer.append(io.BytesIO()) 402 | # copy the data - don't overstep the buffer 403 | if data_pos + self._part_size >= data_len: 404 | sub_data = data_buf.read(data_len-data_pos) 405 | else: 406 | sub_data = data_buf.read( 407 | self._part_size 408 | ) 409 | new_buffer[-1].write(sub_data) 410 | # increment to next 411 | data_pos += self._part_size 412 | 413 | # free the old memory 414 | self._buffer[buffer_part].close() 415 | else: 416 | # copy the old buffer into a new one 417 | self._buffer[buffer_part].seek(0) 418 | new_buffer.append(io.BytesIO(self._buffer[buffer_part].read())) 419 | 420 | # close other buffers first 421 | for b in self._buffer: 422 | b.close() 423 | self._buffer = new_buffer 424 | 425 | tasks = [] 426 | 427 | for buffer_part in range(0, len(self._buffer)): 428 | # seek in the BytesIO buffer to get to the beginning after the 429 | # writing 430 | self._buffer[buffer_part].seek(0) 431 | # upload here 432 | # schedule the uploads 433 | event_loop = asyncio.get_event_loop() 434 | task = event_loop.create_task(self._conn_obj.conn.upload_part( 435 | Bucket=self._bucket, 436 | Key=self._path, 437 | UploadId=self._upload_id, 438 | PartNumber=self._current_part + buffer_part, 439 | Body=self._buffer[buffer_part] 440 | )) 441 | tasks.append(task) 442 | 443 | # await the completion of the uploads 444 | res = await asyncio.gather(*tasks) 445 | for buffer_part in range(0, len(self._buffer)): 446 | # insert into the multipart info list of dictionaries 447 | part = res[buffer_part] 448 | self._multipart_info['Parts'].append( 449 | { 450 | 'PartNumber' : self._current_part + buffer_part, 451 | 'ETag' : part['ETag'] 452 | } 453 | ) 454 | 455 | # add the total number of uploads to the current part 456 | self._current_part += len(self._buffer) 457 | 458 | # reset all the byte buffers and their positions 459 | for buffer_part in range(0, len(self._buffer)): 460 | self._buffer[buffer_part].close() 461 | self._buffer = [io.BytesIO()] 462 | self._seek_pos = 0 463 | 464 | async def write(self, b): 465 | """Write the given bytes-like object, b, and return the number of bytes 466 | written (always equal to the length of b in bytes, since if the write 467 | fails an OSError will be raised). 468 | For the S3 file object we just write the file to a temporary bytearray 469 | and increment the seek_pos. 470 | This data will be uploaded to an object when .flush is called. 471 | """ 472 | if "w" not in self._mode: 473 | raise APIException( 474 | "Trying to write to a read only file, where mode != 'w'." 475 | ) 476 | try: 477 | # add to local, temporary bytearray 478 | size = len(b) 479 | self._buffer[-1].write(b) 480 | self._seek_pos += size 481 | # test to see whether we should do a multipart upload now 482 | # this occurs when the number of buffers is > the maximum number of 483 | # parts. 
self._current_part is indexed from 1 484 | if (self._multipart_upload and 485 | self._seek_pos > self._part_size): 486 | if len(self._buffer) == self._max_parts: 487 | await self._multipart_upload_from_buffer() 488 | else: 489 | # add another buffer to write to 490 | self._buffer.append(io.BytesIO()) 491 | 492 | except ClientError as e: 493 | raise IOException( 494 | "Could not write to object {} {}".format(self._path, e) 495 | ) 496 | except AttributeError as e: 497 | self._handle_connection_exception(e) 498 | 499 | return size 500 | 501 | async def close(self): 502 | """Flush and close this stream. This method has no effect if the file is 503 | already closed. Once the file is closed, any operation on the file (e.g. 504 | reading or writing) will raise a ValueError. 505 | 506 | As a convenience, it is allowed to call this method more than once; only 507 | the first call, however, will have an effect.""" 508 | try: 509 | if not self._closed: 510 | # self.flush will upload the bytesarray to the S3 store 511 | await self.flush() 512 | s3aioFileObject._connection_pool.release(self._conn_obj) 513 | self._closed = True 514 | except AttributeError as e: 515 | self._handle_connection_exception(e) 516 | return True 517 | 518 | async def seek(self, offset, whence=io.SEEK_SET): 519 | """Change the stream position to the given byte offset. offset is 520 | interpreted relative to the position indicated by whence. The default 521 | value for whence is SEEK_SET. Values for whence are: 522 | 523 | SEEK_SET or 0 – start of the stream (the default); offset should be zero 524 | or positive 525 | SEEK_CUR or 1 – current stream position; offset may be negative 526 | SEEK_END or 2 – end of the stream; offset is usually negative 527 | Return the new absolute position. 528 | 529 | Note: currently cannot seek when writing a file. 530 | 531 | """ 532 | 533 | if self._mode == 'w': 534 | raise IOException( 535 | "Cannot seek within a file that is being written to." 536 | ) 537 | 538 | size = await self._getsize() 539 | error_string = "Seek {} is outside file size bounds 0->{} for file {}" 540 | seek_pos = self._seek_pos 541 | if whence == io.SEEK_SET: 542 | # range check 543 | seek_pos = offset 544 | elif whence == io.SEEK_CUR: 545 | seek_pos += offset 546 | elif whence == io.SEEK_END: 547 | seek_pos = size - offset 548 | 549 | # range checks 550 | if (seek_pos >= size): 551 | raise IOException(error_string.format( 552 | seek_pos, 553 | size, 554 | self._path) 555 | ) 556 | elif (seek_pos < 0): 557 | raise IOException(error_string.format( 558 | seek_pos, 559 | size, 560 | self._path) 561 | ) 562 | self._seek_pos = seek_pos 563 | return self._seek_pos 564 | 565 | def seekable(self): 566 | """We can seek in s3 streams using the range get and range put features. 567 | """ 568 | return True 569 | 570 | def tell(self): 571 | """Return True if the stream supports random access. If False, seek(), 572 | tell() and truncate() will raise OSError.""" 573 | return self._seek_pos 574 | 575 | def fileno(self): 576 | """Return the underlying file descriptor (an integer) of the stream if 577 | it exists. An IOError is raised if the IO object does not use a file 578 | descriptor.""" 579 | raise io.UnsupportedOperation 580 | 581 | async def flush(self): 582 | """Flush the write buffers of the stream. 
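        (For reference, a hedged sketch of the bookkeeping handed to
        complete_multipart_upload below; the part numbers and ETag values are
        assumptions.  _multipart_upload_from_buffer above appends one entry
        per uploaded part.)

            multipart_info = {
                "Parts": [
                    {"PartNumber": 1, "ETag": '"9b2cf535f27731c974343645a398"'},
                    {"PartNumber": 2, "ETag": '"d41d8cd98f00b204e9800998ecf8"'},
                ]
            }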
This will upload the contents 583 | of the final multipart upload of self._buffer to the S3 store.""" 584 | try: 585 | if 'w' in self._mode: 586 | # if the size is less than the MAXIMUM UPLOAD SIZE 587 | # then just write the data 588 | size = self._buffer[0].tell() 589 | if ((self._current_part == 1 and 590 | size < self._part_size) or 591 | not self._multipart_upload 592 | ): 593 | if self._create_bucket: 594 | # check whether the bucket exists and create if not 595 | bucket_list = await self._get_bucket_list() 596 | if not self._bucket in bucket_list: 597 | await self._conn_obj.conn.create_bucket( 598 | Bucket=self._bucket 599 | ) 600 | # upload the whole buffer - seek back to the start first 601 | self._buffer[0].seek(0) 602 | await self._conn_obj.conn.put_object( 603 | Bucket=self._bucket, 604 | Key=self._path, 605 | Body=self._buffer[0].read(size) 606 | ) 607 | else: 608 | # upload as multipart 609 | await self._multipart_upload_from_buffer() 610 | # finalise the multipart upload 611 | await self._conn_obj.conn.complete_multipart_upload( 612 | Bucket=self._bucket, 613 | Key=self._path, 614 | UploadId=self._upload_id, 615 | MultipartUpload=self._multipart_info 616 | ) 617 | # clear the buffers 618 | for b in self._buffer: 619 | b.close() 620 | 621 | except AttributeError as e: 622 | self._handle_connection_exception(e) 623 | return True 624 | 625 | def readable(self): 626 | """Return True if the stream can be read from. If False, read() will 627 | raise IOError.""" 628 | return 'r' in self._mode or '+' in self._mode 629 | 630 | async def readline(self, size=-1): 631 | """Read and return one line from the stream. 632 | If size is specified, at most size bytes will be read.""" 633 | if 'b' in self._mode: 634 | raise APIException( 635 | "readline on a binary file is not permitted: {}".format( 636 | self._uri) 637 | ) 638 | # only read a set number of bytes if size is passed in, otherwise 639 | # read upto the file size 640 | if size == -1: 641 | size = self._getsize() 642 | 643 | # use the BytesIO readline methods 644 | if self.tell() == 0: 645 | buffer = await self.read(size=size) 646 | self._buffer[-1].write(buffer) 647 | self._buffer[-1].seek(0) 648 | 649 | line = self._buffer[-1].readline().decode().strip() 650 | return line 651 | 652 | async def readlines(self, hint=-1): 653 | """Read and return a list of lines from the stream. hint can be 654 | specified to control the number of lines read: no more lines will be 655 | read if the total size (in bytes/characters) of all lines so far exceeds 656 | hint.""" 657 | if 'b' in self._mode: 658 | raise APIException( 659 | "readline on a binary file is not permitted: {}".format( 660 | self._uri) 661 | ) 662 | # read the entire file in and decode it 663 | lines = await self.read().decode().split("\n") 664 | return lines 665 | 666 | def truncate(self, size=None): 667 | """Not supported""" 668 | raise io.UnsupportedOperation 669 | 670 | def writable(self): 671 | """Return True if the stream supports writing. 
If False, write() and 672 | truncate() will raise IOError.""" 673 | return 'w' in self._mode 674 | 675 | async def writelines(self, lines): 676 | """Write a list of lines to the stream.""" 677 | # first check if the file is binary or not 678 | if 'b' in self._mode: 679 | raise APIException( 680 | "writelines on a binary file is not permitted: {}".format( 681 | self._uri) 682 | ) 683 | # write all but the last line with a line break 684 | for l in lines: 685 | await self.write((l+"\n").encode('utf-8')) 686 | return True 687 | 688 | async def glob(self): 689 | """Emulate glob on an open bucket. The glob has been passed in via 690 | self._path, created on connection to the server and bucket.""" 691 | # get the path string up to the wildcards 692 | try: 693 | pi1 = self._path.index("*") 694 | except ValueError: 695 | pi1 = len(self._path) 696 | try: 697 | pi2 = self._path.index("?") 698 | except ValueError: 699 | pi2 = len(self._path) 700 | pi = min(pi1, pi2) 701 | # using the prefix will cut down on the search space 702 | prefix = self._path[:pi] 703 | # get the wildcard 704 | wildcard = self._path[pi:] 705 | # set up the paginator 706 | paginator = self._conn_obj.conn.get_paginator("list_objects_v2") 707 | parameters = { 708 | 'Bucket': self._bucket, 709 | 'Prefix': prefix 710 | } 711 | page_iterator = paginator.paginate(**parameters) 712 | files = [] 713 | async for page in page_iterator: 714 | for item in page.get('Contents', []): 715 | fname = item['Key'] 716 | # check that it matches against wildcard 717 | if fnmatch(fname, wildcard): 718 | files.append(item['Key']) 719 | return files 720 | --------------------------------------------------------------------------------
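For orientation, a hedged usage sketch of the glob() implementations above (the
endpoint, bucket, key pattern and credentials are assumptions): the wildcard
travels in the object path, connect() skips its existence check for wildcard
paths, and glob() returns the matching keys, which callers such as utils/agg.py
then re-prefix with the alias and bucket.

    import asyncio
    from S3netCDF4.Backends._s3aioFileObject import s3aioFileObject

    async def list_subarrays():
        credentials = {"accessKey": "my-key", "secretKey": "my-secret"}
        uri = "http://s3.example.com/my-bucket/subarrays/temp_*.nc"
        async with s3aioFileObject(uri, credentials, mode='r') as fo:
            return await fo.glob()   # keys under "subarrays/temp_" ending ".nc"

    keys = asyncio.run(list_subarrays())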