├── S3netCDF4 ├── __init__.py ├── CFA │ ├── Parsers │ │ ├── __init__.py │ │ ├── _CFAParser.pyx │ │ └── _CFAnetCDFParser.pyx │ ├── _CFAExceptions.pyx │ ├── __init__.py │ └── _CFASplitter.pyx ├── Managers │ ├── __init__.py │ ├── _ConnectionPool.pyx │ └── _ConfigManager.pyx ├── utils │ ├── __init__.py │ ├── split.py │ └── agg.py ├── Backends │ ├── __init__.py │ ├── _s3FileObject.pyx │ └── _s3aioFileObject.pyx └── _Exceptions.pyx ├── requirements.txt ├── pyproject.toml ├── Makefile ├── ROADMAP.md ├── config └── .s3nc.json.template ├── tutorial ├── readme.txt └── lesson_1.py ├── LICENSE ├── .gitignore ├── bin ├── s3nc_cfa_agg.py ├── s3nc_cfa_split.py ├── s3nc_cfa_mv.py └── s3nc_cfa_info.py ├── CHANGELOG.md ├── test ├── test_split.py ├── test_s3FileObject.py ├── test_s3aioFileObject.py └── test_s3Dataset.py └── setup.py /S3netCDF4/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /S3netCDF4/CFA/Parsers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /S3netCDF4/Managers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /S3netCDF4/utils/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.19.4 2 | Cython==0.29.21 3 | netCDF4==1.5.5.1 4 | botocore==1.19.20 5 | aiobotocore==1.1.2 6 | psutil==5.7.3 7 | -------------------------------------------------------------------------------- /S3netCDF4/Backends/__init__.py: -------------------------------------------------------------------------------- 1 | # need to import all the backends 2 | from ._s3FileObject import s3FileObject 3 | from ._s3aioFileObject import s3aioFileObject 4 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | # Minimum requirements for the build system to execute. 3 | requires = ["setuptools", "wheel", "Cython", "numpy"] # PEP 508 specifications. 
4 | build-backend = "setuptools.build_meta" 5 | -------------------------------------------------------------------------------- /S3netCDF4/_Exceptions.pyx: -------------------------------------------------------------------------------- 1 | #!python 2 | #cython: language_level=3 3 | 4 | __copyright__ = "(C) 2019-2021 Science and Technology Facilities Council" 5 | __license__ = "BSD - see LICENSE file in top-level directory" 6 | __authors__ = "Neil Massey" 7 | 8 | # Exception classes to indicate they come from the s3 component of the library 9 | class IOException(BaseException): 10 | pass 11 | 12 | class MemoryException(BaseException): 13 | pass 14 | 15 | class APIException(BaseException): 16 | pass 17 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # S3-netcdf-python Makefile 2 | # Simple makefile for compiling the Cython externals when developing 3 | 4 | # Setup.py will build these externals once on installation, so it is not 5 | # necessary to run this Makefile on installation for a user. 6 | # This Makefile only needs to be used when developing. 7 | 8 | all: 9 | python setup.py build_ext --inplace 10 | 11 | clean: 12 | rm -f *.so *.c 13 | rm -f ./S3netCDF4/Backends/*.so ./S3netCDF4/Backends/*.c 14 | rm -f ./S3netCDF4/CFA/Parsers/*.so ./S3netCDF4/CFA/Parsers/*.c 15 | rm -f ./S3netCDF4/CFA/*.so ./S3netCDF4/CFA/*.c 16 | rm -f ./S3netCDF4/Managers/*.so ./S3netCDF4/Managers/*.c 17 | rm -f ./S3netCDF4/*.so ./S3netCDF4/*.c 18 | -------------------------------------------------------------------------------- /S3netCDF4/CFA/_CFAExceptions.pyx: -------------------------------------------------------------------------------- 1 | #!python 2 | #cython: language_level=3 3 | 4 | __copyright__ = "(C) 2019-2021 Science and Technology Facilities Council" 5 | __license__ = "BSD - see LICENSE file in top-level directory" 6 | __authors__ = "Neil Massey" 7 | 8 | """Exceptions for the _CFAClasses""" 9 | 10 | class CFAError(BaseException): 11 | pass 12 | 13 | class CFAGroupError(CFAError): 14 | pass 15 | 16 | class CFADimensionError(CFAError): 17 | pass 18 | 19 | class CFAVariableError(CFAError): 20 | pass 21 | 22 | class CFAVariableIndexError(CFAError, IndexError): 23 | pass 24 | 25 | class CFAPartitionError(CFAError): 26 | pass 27 | 28 | class CFAPartitionIndexError(CFAError, IndexError): 29 | pass 30 | 31 | class CFASubArrayError(CFAError): 32 | pass 33 | 34 | class CFAParserError(CFAError): 35 | pass 36 | -------------------------------------------------------------------------------- /ROADMAP.md: -------------------------------------------------------------------------------- 1 | Roadmap for improvements to s3netCDF-python 2 | =========================================== 3 | 4 | 1. Improve documentation, provide more examples and tutorials 5 | 2. Add support for unequal partition sizes (completed in v2.0.5) 6 | 3. Add support for striding in slices e.g. [1:20:2] 7 | 4. Add support for streaming files greater than memory to disk / cache 8 | 5. Make more use of Cython features - add types for all variables in .pyx files 9 | 6. More unit tests and continuous integration 10 | 7. Add Compatibility with xarray and Zarr: read and write xarray / Zarr files, 11 | i.e. the master array file is an xarray JSON attributes file, and provide 12 | support for Zarr with a CFA master-array file, i.e. the chunks are Zarr but the 13 | master-array file is CFA-netCDF. 14 | 8. 
Upgrade aiobotocore to latest. v1.0+ has an API that breaks previous 15 | version. (completed in v2.0.5) 16 | 9. Add Dask support for parallel workflows. 17 | -------------------------------------------------------------------------------- /config/.s3nc.json.template: -------------------------------------------------------------------------------- 1 | { 2 | "version": "9", 3 | "hosts": { 4 | "{{ hostname0 }}": { 5 | "alias": "{{ host0_alias }}", 6 | "url": "{{ host0_url }}", 7 | "credentials": { 8 | "accessKey": "{{ host0_access_key }}", 9 | "secretKey": "{{ host0_secret_key }}" 10 | }, 11 | "backend": "s3aioFileObject", 12 | "api": "S3v4" 13 | } 14 | }, 15 | "backends": { 16 | "s3aioFileObject" : { 17 | "maximum_part_size": "50MB", 18 | "maximum_parts": 8, 19 | "enable_multipart_download": true, 20 | "enable_multipart_upload": true, 21 | "connect_timeout": 30.0, 22 | "read_timeout": 30.0 23 | }, 24 | "s3FileObject" : { 25 | "maximum_part_size": "50MB", 26 | "maximum_parts": 4, 27 | "enable_multipart_download": true, 28 | "enable_multipart_upload": true, 29 | "connect_timeout": 30.0, 30 | "read_timeout": 30.0 31 | 32 | } 33 | }, 34 | "cache_location" : "{{ cache_location }}", 35 | "resource_allocation" : { 36 | "memory": "{{ memory_allocation_limit }}", 37 | "filehandles": {{ filehandle_allocation_limit }} 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /S3netCDF4/CFA/Parsers/_CFAParser.pyx: -------------------------------------------------------------------------------- 1 | #!python 2 | #cython: language_level=3 3 | 4 | __copyright__ = "(C) 2020 Science and Technology Facilities Council" 5 | __license__ = "BSD - see LICENSE file in top-level directory" 6 | __authors__ = "Neil Massey" 7 | 8 | """ 9 | Collection of functions that parse files with embedded CFA metadata and 10 | return a hierarchy of objects instantiated from the _CFAClasses. 11 | See the class definitions and documentation in _CFAClasses.pyx for this 12 | hierarchy. 13 | 14 | See: 15 | http://www.met.reading.ac.uk/~david/cfa/0.4/index.html 16 | for the specification of the CFA conventions. 17 | 18 | s3netCDF-python uses an updated version (v0.5) of the CFA conventions which, 19 | rather than writing the partition information to a netCDF attribute as a 20 | string, writes the partition information to variables inside a group. 21 | """ 22 | 23 | class CFA_Parser(object): 24 | """Base class for CFA Parser - pure abstract so raise an exception.""" 25 | def __init__(self): 26 | raise NotImplementedError 27 | 28 | def read(self, input_object): 29 | raise NotImplementedError 30 | 31 | def write(self, cfa_dataset, output_object): 32 | raise NotImplementedError 33 | 34 | def is_file(self, input_object): 35 | raise NotImplementedError 36 | -------------------------------------------------------------------------------- /tutorial/readme.txt: -------------------------------------------------------------------------------- 1 | S3-netCDF-python tutorials for JASMIN users 2 | =========================================== 3 | 4 | Setup 5 | ----- 6 | To access these tutorials you will need access to the cedadev-o Caringo tenancy. 
7 | Please see the following webpage to set up the account: 8 | 9 | https://help.jasmin.ac.uk/article/4847-using-the-jasmin-object-store 10 | 11 | > module load jaspy 12 | > create a venv 13 | > pip install -e git+https://github.com/cedadev/S3-netcdf-python.git@version2 14 | 15 | Config 16 | ------ 17 | You will need to create a configuration file in your home directory: 18 | Using nano text editor: 19 | 20 | > nano ~/.s3nc.json 21 | 22 | Copy this text into the file opened in nano: 23 | { 24 | "version": "9", 25 | "hosts": { 26 | "s3://cedadev-o": { 27 | "alias": "cedadev-o", 28 | "url": "http://cedadev-o.s3.jc.rl.ac.uk", 29 | "credentials": { 30 | "accessKey": "access_key", 31 | "secretKey": "secret_key" 32 | }, 33 | "backend": "s3aioFileObject", 34 | "api": "S3v4" 35 | } 36 | }, 37 | "cache_location": "~/.cache" 38 | } 39 | 40 | replace "access_key" and "secret_key" with the credentials you got from the Caringo 41 | Swarm portal. 42 | 43 | Contents 44 | -------- 45 | Tutorial_1 - Read data from a CMIP6 file 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2019-2021, Centre of Environmental Data Analysis Developers, 4 | Scientific and Technical Facilities Council (STFC), 5 | UK Research and Innovation (UKRI). 6 | All rights reserved. 7 | 8 | Redistribution and use in source and binary forms, with or without 9 | modification, are permitted provided that the following conditions are met: 10 | 11 | * Redistributions of source code must retain the above copyright notice, this 12 | list of conditions and the following disclaimer. 13 | 14 | * Redistributions in binary form must reproduce the above copyright notice, 15 | this list of conditions and the following disclaimer in the documentation 16 | and/or other materials provided with the distribution. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
28 | -------------------------------------------------------------------------------- /tutorial/lesson_1.py: -------------------------------------------------------------------------------- 1 | __copyright__ = "(C) 2019-2021 Science and Technology Facilities Council" 2 | __license__ = "BSD - see LICENSE file in top-level directory" 3 | __authors__ = "Neil Massey" 4 | 5 | # s3-netCDF-python Tutorial 1 6 | # Purpose: Read a time series from a CMIP6 dataset 7 | # Author : Neil Massey 8 | # Date : 12/05/2020 9 | 10 | from S3netCDF4._s3netCDF4 import s3Dataset as Dataset 11 | import numpy as np 12 | 13 | # Dataset (Master Array File) location, this is on the Caringo object store, 14 | # using the alias defined in the config file in the user's home directory: 15 | # ~/.s3nc.json 16 | data_location = "s3://cedadev-o/cmip6/CMIP/MOHC/HadGEM3-GC31-MM/historical/r1i1p1f3/day/tas/gn/tas_day_HadGEM3-GC31-MM_historical_r1i1p1f3_gn.nc" 17 | var_name = "tas" 18 | 19 | # We open the Master Array File just like opening a netCDF Dataset 20 | s3_ds = Dataset(data_location, 'r') 21 | 22 | # We can inspect the dataset by printing it, just like in netcdf4-python 23 | print("CFA DATASET: ", s3_ds) 24 | 25 | # We can also examine the variables in the Dataset 26 | print("VARIABLES: ", s3_ds.variables) 27 | 28 | # and the groups in the Dataset 29 | print("GROUPS: ", s3_ds.groups) 30 | 31 | # We can then get a variable from the Dataset 32 | var = s3_ds.variables[var_name] 33 | # and inspect it 34 | print("TAS: ", var) 35 | 36 | # we can get a timeseries for one year timeseries by slicing the variable: 37 | # this will return a numpy array 38 | var_data = var[:360, 45, 45] 39 | 40 | # calculate seasonal means 41 | #print(np.mean(var_data[0:90]), np.mean(var_data[90:180]), np.mean(var_data[180:240]), np.mean(var_data[240:360])) 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.c 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | nosetests.xml 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # Jupyter Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # dotenv 84 | .env 85 | 86 | # virtualenv 87 | .venv 88 | venv/ 89 | ENV/ 90 | 91 | # Spyder project settings 92 | .spyderproject 93 | .spyproject 94 | 95 | # Rope project settings 96 | .ropeproject 97 | 98 | # mkdocs documentation 99 | /site 100 | 101 | # mypy 102 | .mypy_cache/ 103 | 104 | archive/ 105 | .idea/ 106 | 107 | .s3config.json 108 | -------------------------------------------------------------------------------- /bin/s3nc_cfa_agg.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | __copyright__ = "(C) 2019-2021 Science and Technology Facilities Council" 4 | __license__ = "BSD - see LICENSE file in top-level directory" 5 | __authors__ = "Neil Massey" 6 | 7 | """Program to aggregate netCDF-CFA files from disk or s3. 8 | This program will produce a master array file, containing references to the 9 | files that have been aggregated. 10 | """ 11 | 12 | import argparse 13 | from S3netCDF4.utils.agg import aggregate_into_CFA 14 | 15 | if __name__ == "__main__": 16 | # set up and parse the arguments 17 | parser = argparse.ArgumentParser( 18 | prog="s3nc_cfa_agg", 19 | formatter_class=argparse.RawTextHelpFormatter, 20 | description=( 21 | "Aggregate a number of netCDF files into a CFA-netCDF " 22 | "master-array file." 23 | ) 24 | ) 25 | 26 | parser.add_argument( 27 | "output", action="store", default="", metavar="", 28 | help=( 29 | "Path of the output master-array file." 30 | ) 31 | ) 32 | 33 | parser.add_argument( 34 | "dir", action="store", default="", metavar="", 35 | help=( 36 | "Path of a directory containing netCDF files to aggregate into a " 37 | "CFA-netCDF master-array file." 38 | ) 39 | ) 40 | 41 | parser.add_argument( 42 | "--cfa_version", action="store", default="0.5", 43 | help=("Version of CFA conventions to use, 0.4|0.5") 44 | ) 45 | 46 | parser.add_argument( 47 | "--axis", action="store", default="time", 48 | help=("Axis to aggregate along, default=time") 49 | ) 50 | 51 | parser.add_argument( 52 | "--common_date", action="store", default=None, 53 | help=("Common start time across all files") 54 | ) 55 | 56 | args = parser.parse_args() 57 | 58 | if args.output and args.dir: 59 | aggregate_into_CFA(args.output, 60 | args.dir, 61 | args.axis, 62 | args.cfa_version, 63 | args.common_date) 64 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | S3netCDF-python 2 | --------------- 3 | 4 | Changes between v2.0.12 and v2.1-rc1 5 | ------------------------------------ 6 | 1. Tidied LICENSE information 7 | 8 | Changes between v2.0.8 and v2.0.12 9 | ---------------------------------- 10 | 1. 
Added a s3_nc_cfa_split.py program to split a large netCDF file into smaller netCDF subarray files and produce a CFA-netCDF master array file. 11 | 2. Bugfixes to the splitter. 12 | 13 | Changes between v2.0.6 and v2.0.8 14 | --------------------------------- 15 | 1. Bug fix for indexing. 16 | 2. Changed the name of the template config file and the name in the README.md file to match the code. 17 | 3. Allow an environment variable "S3_NC_CONFIG" to be set to point to the config file. 18 | 4. Fixed bug in previous file that prevented writing CFA sub-array files (!) 19 | 20 | Changes between v2.0.5 and v2.0.6 21 | --------------------------------- 22 | 1. Update the s3_nc_cfa_agg.py program so that it is compatible with more models and Datasets in CMIP6. This relates mostly to the way the time dimension is recorded, and the calendar type. 23 | 2. Changed the way that the indexing for unequal sized partitions is calculated. It is now (potentially) slower, but more robust. 24 | 25 | Changes between v2.0.4 and v2.0.5 26 | --------------------------------- 27 | 1. Added support for reading unequal sized partitions. These may occur in files written by the s3_nc_cfa_agg.py program. 28 | 29 | Changes between v2.0.3 and v2.0.4 30 | --------------------------------- 31 | 1. s3nc_cfa_agg.py now uses FileManager.request_file rather than FileManager._open. More elegant and API focused. 32 | 2. FileManager.request_file is now compatible with passing globs into it as the filename parameter. 33 | 34 | Changes between v2.0.2 and v2.0.3 35 | --------------------------------- 36 | 1. Fixed a problem where a BytesIO buffer was being passed by reference rather than copied, leading to a "file operation on unopened file" error. 37 | 2. Corrected install procedure in README. 38 | 3. Corrected bug in test_s3Dataset_read. 39 | 40 | Changes between v2.0.1 and v2.0.2 41 | --------------------------------- 42 | 1. Fixed unreleased file for Datasets on disk 43 | 2. Fixed incorrect parsing for CFA 0.4 44 | 45 | Changes between v0.2 and v2.0.1: 46 | -------------------------------- 47 | 1. complete rewrite 48 | 2. v0.5 CFA 49 | 3. partition matrix represented internally by netCDF Dataset 50 | 4. user can supply sub array size when creating variable 51 | 5. cacheless operation, except for read of very large files 52 | 6. intelligent memory handling 53 | 7. excellent sparse-array handling 54 | 8. complete compliance with netCDF4 API interface 55 | -------------------------------------------------------------------------------- /bin/s3nc_cfa_split.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | __copyright__ = "(C) 2019-2021 Science and Technology Facilities Council" 4 | __license__ = "BSD - see LICENSE file in top-level directory" 5 | __authors__ = "Neil Massey" 6 | 7 | """Program to split a netCDF file into a netCDF-CFA master file and a number 8 | of netCDF sub array files. 9 | """ 10 | import argparse 11 | 12 | from S3netCDF4.utils.split import split_into_CFA 13 | 14 | if __name__ == "__main__": 15 | # set up and parse the arguments 16 | parser = argparse.ArgumentParser( 17 | prog="s3nc_cfa_split", 18 | formatter_class=argparse.RawTextHelpFormatter, 19 | description=( 20 | "Split a netCDF file into a netCDF-CFA master file and a number" 21 | "of netCDF sub array files." 22 | ) 23 | ) 24 | 25 | parser.add_argument( 26 | "output", action="store", default="", metavar="", 27 | help=( 28 | "Path of the output CFA-netCDF master-array file." 
29 | ) 30 | ) 31 | 32 | parser.add_argument( 33 | "input", action="store", default="", metavar="", 34 | help=( 35 | "Path of the input netCDF file" 36 | ) 37 | ) 38 | 39 | parser.add_argument( 40 | "--subarray_path", action="store", default="", 41 | metavar="", 42 | help=( 43 | "Common path of the output sub array files (optional). Without " 44 | "this argument, the output will be in a directory below the path of" 45 | " the output netCDF-CFA master array file." 46 | ) 47 | ) 48 | 49 | parser.add_argument( 50 | "--subarray_shape", action="store", default=[], 51 | metavar="", 52 | help=( 53 | "Shape for the subarray files (optional). Without this argument, " 54 | "the shape will be automatically determined." 55 | ) 56 | ) 57 | 58 | parser.add_argument( 59 | "--subarray_size", action="store", default=50*1024*1024, 60 | metavar="", 61 | help=( 62 | "Size for the subarray files (optional). With this argument, the " 63 | "shape will be automatically determined, with this target size. " 64 | "The units for the size is , not " 65 | "any magnitude of bytes." 66 | ) 67 | ) 68 | 69 | parser.add_argument( 70 | "--cfa_version", action="store", default="0.5", 71 | help=("Version of CFA conventions to use, 0.4|0.5") 72 | ) 73 | 74 | args = parser.parse_args() 75 | 76 | if args.output and args.input: 77 | split_into_CFA(args.output, args.input, 78 | args.subarray_path, 79 | args.subarray_shape, 80 | int(args.subarray_size), 81 | args.cfa_version) 82 | -------------------------------------------------------------------------------- /test/test_split.py: -------------------------------------------------------------------------------- 1 | 2 | import unittest, os 3 | 4 | from S3netCDF4.utils.split import split_into_CFA 5 | from S3netCDF4.utils.agg import aggregate_into_CFA 6 | from S3netCDF4.CFA._CFAClasses import CFADataset 7 | from S3netCDF4._s3netCDF4 import s3Dataset 8 | 9 | TESTFILE = "/Users/dhk63261/Archive/cmip5/ta_Amon_HadCM3_rcp45_r10i1p1_203101-203512.nc" 10 | 11 | 12 | def nca_equivalence(ncfile1, ncfile2, variable='ta'): 13 | """ Do these two files describe the same content?""" 14 | # Let's start by comparing a few important things 15 | 16 | x = s3Dataset(ncfile1) 17 | y = s3Dataset(ncfile2) 18 | 19 | # First let's just check a data record 20 | xx = x.variables[variable] 21 | yy = y.variables[variable] 22 | 23 | assert (xx.shape == yy.shape).all(), "CFA data arrays are not the same shape" 24 | 25 | assert len(xx.shape) == 4, "Unexpected variable shape for comparison" 26 | 27 | xx = xx[:, 0, 0, 0].flatten() 28 | yy = yy[:, 0, 0, 0].flatten() 29 | 30 | # We don't do all data coz it would take a long time 31 | assert (xx == yy).all(), "Data in arrays does not match" 32 | 33 | x.close() 34 | y.close() 35 | # now check file headers 36 | 37 | raise NotImplementedError("This doesn't mean the test has failed, just the test code is not finished") 38 | 39 | #return statement needed 40 | 41 | class TestSplit(unittest.TestCase): 42 | """ All the necessary splitter tests""" 43 | 44 | def setUp(self): 45 | self.ncafile1 = '/Users/dhk63261/Archive/things1.nca' 46 | self.ncapath = '/Users/dhk63261/Archive/things1/things1.ta.*' 47 | self.ncafile2 = '/Users/dhk63261/Archive/things2.nca' 48 | 49 | def _split_and_aggregate(self, cfa1, cfa2): 50 | # for now use real disk ... 
51 | input = TESTFILE 52 | subarray_size = 50 * 1024 * 1024 53 | subarray_path = "" 54 | subarray_shape = "[1, 17, 73, 96]" 55 | 56 | split_into_CFA(self.ncafile1, input, 57 | subarray_path, 58 | subarray_shape, 59 | int(subarray_size), 60 | cfa1) 61 | 62 | axis = 'time' 63 | common_date = None 64 | 65 | aggregate_into_CFA(self.ncafile2, 66 | self.ncapath, 67 | axis, 68 | cfa2, 69 | common_date) 70 | 71 | def test_data_available(self): 72 | """ Test there is an input dataset available.""" 73 | assert os.path.exists(TESTFILE) 74 | 75 | def test_file_handles(self): 76 | """ Test we can open a file for write without fully qualifying it's name. 77 | See issue:24 """ 78 | raise NotImplementedError 79 | 80 | def test_auto_split_and_agg_round_trip1(self): 81 | """ Test the sensible split and aggregate 82 | with both at CFA 0.4 """ 83 | 84 | self._split_and_aggregate("0.4", "0.4") 85 | 86 | self.assertTrue(nca_equivalence(self.ncafile1, self.ncafile2)) 87 | 88 | def test_auto_split_and_agg_round_trip2(self): 89 | """ Test the sensible split and aggregate 90 | with different CFA versions """ 91 | 92 | self._split_and_aggregate("0.4", "0.5") 93 | 94 | self.assertTrue(nca_equivalence(self.ncafile1, self.ncafile2)) 95 | 96 | if __name__ == '__main__': 97 | unittest.main() 98 | -------------------------------------------------------------------------------- /S3netCDF4/Managers/_ConnectionPool.pyx: -------------------------------------------------------------------------------- 1 | #!python 2 | #cython: language_level=3 3 | 4 | __copyright__ = "(C) 2019-2021 Science and Technology Facilities Council" 5 | __license__ = "BSD - see LICENSE file in top-level directory" 6 | __authors__ = "Neil Massey" 7 | 8 | """ 9 | A very simple connection pool for S3netCDF. This allows connections to be 10 | maintained to (for example) a AWS or object store. The pool allows for the 11 | following behaviour: 12 | o. The backend File Object makes a request for a connection. The pool either 13 | returns a connection or None, if no connections are available or if all 14 | available connections are locked 15 | o. If None is returned, the backend is expected to make a connection and add it 16 | to the pool 17 | o. When connections are added they are locked and they can later be released so 18 | that they can be reused without having to re-establish the connection. 19 | o. When a connection is closed it is removed from the pool. 20 | """ 21 | from S3netCDF4._Exceptions import APIException 22 | 23 | class ConnectionObject(object): 24 | """A small class to hold connection information.""" 25 | def __init__(self, conn=None, uri="", available=False): 26 | self.conn = conn 27 | self.uri = uri 28 | self.conn_refs = 0 29 | 30 | def __str__(self): 31 | return "{} : ({})".format(self.uri, self.conn_refs) 32 | 33 | class ConnectionPool(object): 34 | """Connection pool for S3 netCDF. Stores connections to external storage in 35 | a pool, and keeps track of how many connections have been made to them. 36 | This maintains connections to servers to enhance performance by not 37 | incurring the time penalty of establishing a connection 38 | """ 39 | 40 | def __init__(self): 41 | self._connection_pool = {} 42 | 43 | def add(self, conn, conn_uri): 44 | """Add a connection to the ConnectionPool. 45 | Args: 46 | conn : the connection, e.g. a botocore client 47 | conn_uri: the uri of the connection, e.g. 
URL address 48 | Returns: 49 | None 50 | """ 51 | # Use the conn_uri as the key to the dictionary 52 | # If the conn_uri already exists in the connection pool then increase 53 | # the reference count 54 | # If it doesn't then create the connection with a reference count of 55 | # zero 56 | if conn_uri in self._connection_pool: 57 | conn_obj = self._connection_pool[conn_uri] 58 | conn_obj.conn_refs += 1 59 | else: 60 | conn_obj = ConnectionObject(conn, conn_uri) 61 | conn_obj.conn_refs = 1 62 | self._connection_pool[conn_uri] = conn_obj 63 | return conn_obj 64 | 65 | def get(self, conn_uri): 66 | """Get a connection from the ConnectionPool. 67 | Args: 68 | conn_uri: the uri of the connection, e.g. URL address 69 | Returns: 70 | ConnectionObject | None 71 | """ 72 | # Use the conn_uri to the dictionary to try to find a free connection 73 | if conn_uri in self._connection_pool: 74 | conn_obj = self._connection_pool[conn_uri] 75 | conn_obj.conn_refs += 1 76 | return conn_obj 77 | 78 | return None 79 | 80 | def release(self, conn_obj): 81 | """Release the connection for the connection uri. 82 | Args: 83 | conn : the ConnectionObject created in add""" 84 | if not conn_obj.uri in self._connection_pool: 85 | raise APIException( 86 | "Connection is not in the connection pool {}".format( 87 | conn_obj.uri 88 | ) 89 | ) 90 | conn_obj.conn_refs -= 1 91 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | __copyright__ = "(C) 2019-2021 Science and Technology Facilities Council" 2 | __license__ = "BSD - see LICENSE file in top-level directory" 3 | __authors__ = "Neil Massey" 4 | 5 | import os 6 | from setuptools import Extension, setup 7 | from Cython.Build import cythonize 8 | s3nc_define_macros = [( 9 | "NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION" 10 | )] 11 | import numpy 12 | 13 | with open(os.path.join(os.path.dirname(__file__), 'README.md')) as readme: 14 | README = readme.read() 15 | 16 | # allow setup.py to be run from any path 17 | os.chdir(os.path.normpath(os.path.join(os.path.abspath(__file__), os.pardir))) 18 | 19 | s3nc_extra_compile_args = ['-fno-strict-aliasing', '-O3'] 20 | 21 | extensions = [ 22 | Extension( 23 | name="S3netCDF4.Backends._s3aioFileObject", 24 | sources=["S3netCDF4/Backends/_s3aioFileObject.pyx"], 25 | define_macros=s3nc_define_macros, 26 | extra_compile_args=s3nc_extra_compile_args, 27 | include_dirs=[numpy.get_include()], 28 | inplace=True 29 | ), 30 | Extension( 31 | name="S3netCDF4.Backends._s3FileObject", 32 | sources=["S3netCDF4/Backends/_s3FileObject.pyx"], 33 | define_macros=s3nc_define_macros, 34 | extra_compile_args=s3nc_extra_compile_args, 35 | include_dirs=[numpy.get_include()], 36 | ), 37 | Extension( 38 | name="S3netCDF4.CFA._CFAClasses", 39 | sources=["S3netCDF4/CFA/_CFAClasses.pyx"], 40 | define_macros=s3nc_define_macros, 41 | extra_compile_args=s3nc_extra_compile_args, 42 | include_dirs=[numpy.get_include()], 43 | ), 44 | Extension( 45 | name="S3netCDF4.CFA._CFAExceptions", 46 | sources=["S3netCDF4/CFA/_CFAExceptions.pyx"], 47 | define_macros=s3nc_define_macros, 48 | extra_compile_args=s3nc_extra_compile_args, 49 | include_dirs=[numpy.get_include()], 50 | ), 51 | Extension( 52 | name="S3netCDF4.CFA._CFASplitter", 53 | sources=["S3netCDF4/CFA/_CFASplitter.pyx"], 54 | define_macros=s3nc_define_macros, 55 | extra_compile_args=s3nc_extra_compile_args, 56 | include_dirs=[numpy.get_include()], 57 | ), 58 | Extension( 59 | 
name="S3netCDF4.CFA.Parsers._CFAnetCDFParser", 60 | sources=["S3netCDF4/CFA/Parsers/_CFAnetCDFParser.pyx"], 61 | define_macros=s3nc_define_macros, 62 | extra_compile_args=s3nc_extra_compile_args, 63 | include_dirs=[numpy.get_include()], 64 | ), 65 | Extension( 66 | name="S3netCDF4.CFA.Parsers._CFAParser", 67 | sources=["S3netCDF4/CFA/Parsers/_CFAParser.pyx"], 68 | define_macros=s3nc_define_macros, 69 | extra_compile_args=s3nc_extra_compile_args, 70 | include_dirs=[numpy.get_include()], 71 | ), 72 | Extension( 73 | name="S3netCDF4.Managers._ConfigManager", 74 | sources=["S3netCDF4/Managers/_ConfigManager.pyx"], 75 | define_macros=s3nc_define_macros, 76 | extra_compile_args=s3nc_extra_compile_args, 77 | include_dirs=[numpy.get_include()], 78 | ), 79 | Extension( 80 | name="S3netCDF4.Managers._ConnectionPool", 81 | sources=["S3netCDF4/Managers/_ConnectionPool.pyx"], 82 | define_macros=s3nc_define_macros, 83 | extra_compile_args=s3nc_extra_compile_args, 84 | include_dirs=[numpy.get_include()], 85 | ), 86 | Extension( 87 | name="S3netCDF4.Managers._FileManager", 88 | sources=["S3netCDF4/Managers/_FileManager.pyx"], 89 | define_macros=s3nc_define_macros, 90 | extra_compile_args=s3nc_extra_compile_args, 91 | include_dirs=[numpy.get_include()], 92 | ), 93 | Extension( 94 | name="S3netCDF4._Exceptions", 95 | sources=["S3netCDF4/_Exceptions.pyx"], 96 | define_macros=s3nc_define_macros, 97 | extra_compile_args=s3nc_extra_compile_args, 98 | include_dirs=[numpy.get_include()], 99 | ), 100 | Extension( 101 | name="S3netCDF4._s3netCDF4", 102 | sources=["S3netCDF4/_s3netCDF4.pyx"], 103 | define_macros=s3nc_define_macros, 104 | extra_compile_args=s3nc_extra_compile_args, 105 | include_dirs=[numpy.get_include()], 106 | ), 107 | ] 108 | 109 | setup( 110 | name='S3netCDF4', 111 | version='2.1-rc1', 112 | packages=['S3netCDF4'], 113 | install_requires=[ 114 | 'numpy>=1.19.0', 115 | 'cython', 116 | 'netcdf4', 117 | 'botocore', 118 | 'aiobotocore', 119 | 'psutil', 120 | ], 121 | ext_modules=cythonize(extensions), 122 | zip_safe=False, 123 | include_package_data=True, 124 | license='my License', # example license 125 | description='A library to facilitate the storage of netCDF files on ObjectStores in an efficient manner.', 126 | long_description=README, 127 | long_description_content_type="text/markdown", 128 | url='http://www.ceda.ac.uk/', 129 | author='Neil Massey', 130 | author_email='neil.massey@stfc.ac.uk', 131 | classifiers=[ 132 | 'Development Status :: 4 - Beta', 133 | 'Intended Audience :: Science/Research', 134 | 'License :: OSI Approved :: BSD License', # example license 135 | 'Topic :: Software Development :: Libraries :: Python Modules', 136 | 'Topic :: System :: Archiving :: Compression', 137 | 'Operating System :: OS Independent', 138 | 'Programming Language :: Python', 139 | 'Programming Language :: Python :: 3', 140 | 'Programming Language :: Python :: 3.7', 141 | 'Programming Language :: Python :: 3.8', 142 | 'Programming Language :: Python :: 3.9', 143 | ] 144 | ) 145 | -------------------------------------------------------------------------------- /S3netCDF4/Managers/_ConfigManager.pyx: -------------------------------------------------------------------------------- 1 | #!python 2 | #cython: language_level=3 3 | 4 | __copyright__ = "(C) 2019-2021 Science and Technology Facilities Council" 5 | __license__ = "BSD - see LICENSE file in top-level directory" 6 | __authors__ = "Neil Massey" 7 | 8 | """ 9 | Configuration management for S3netCDF. 
Configuration is stored for each user 10 | in a JSON file in their home directory: ~/.s3nc.json 11 | """ 12 | 13 | import os 14 | import json 15 | import psutil 16 | import resource 17 | from .._Exceptions import IOException, APIException 18 | 19 | COMPATIBLE_VERSIONS = ["9"] 20 | 21 | def convert_file_size_string(value): 22 | """Convert a string containing a file size and suffix to an integer number 23 | of bytes. 24 | value : string containing integer number and an optional suffix 25 | """ 26 | # list of file format sizes 27 | file_format_sizes = ("kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB") 28 | # dictionary mapping to multiplier 29 | file_format_scale = {"B" : 1, 30 | "kB" : 1e3, 31 | "MB" : 1e6, 32 | "GB" : 1e9, 33 | "TB" : 1e12, 34 | "EB" : 1e15, 35 | "ZB" : 1e18, 36 | "YB" : 1e21} 37 | if isinstance(value, str): 38 | if value.endswith(file_format_sizes): 39 | suffix = value[-2:] 40 | size = int(value[:-2]) 41 | elif value[-1] == "B": 42 | suffix = "B" 43 | size = int(value[:-1]) 44 | else: 45 | suffix = "B" 46 | size = int(value) 47 | # multiply by scalar 48 | size *= file_format_scale[suffix] 49 | return size 50 | else: 51 | return value 52 | 53 | def interpret_config_file(node, keys_to_convert): 54 | """Recursively search the dictionary for keys to convert, and convert them 55 | using the convert_file_size_string function above.""" 56 | # First time entry node == dictionary 57 | for key, item in node.items(): 58 | if type(item) is dict: 59 | interpret_config_file(item, keys_to_convert) 60 | elif key in keys_to_convert: 61 | # reassign to the dictionary 62 | node[key] = convert_file_size_string(item) 63 | 64 | 65 | class Config(object): 66 | """Class to read in config file, interpret it and make the information 67 | available. 68 | """ 69 | 70 | def __init__(self): 71 | """Initialise S3netCDF4 for this user by reading the config file from 72 | their home directory. Config file is called ~/.s3nc.json""" 73 | # Read the JSON config file from the user home directory or from a path 74 | # set by the environment variable "S3_NC_CONFIG" 75 | # get user home directory 76 | user_home = os.environ["HOME"] 77 | 78 | # create the default path to the config file 79 | sl_config_default = os.path.join(user_home, ".s3nc.json") 80 | 81 | # try to get the path from the environment variable, but default to 82 | # above if environment variable not set 83 | sl_config_path = os.getenv("S3_NC_CONFIG", sl_config_default) 84 | # open the file 85 | try: 86 | fp = open(sl_config_path) 87 | # deserialize from the JSON 88 | self._sl_user_config = json.load(fp) 89 | # check the version number 90 | if ("version" not in self._sl_user_config or 91 | self._sl_user_config["version"] not in COMPATIBLE_VERSIONS): 92 | raise APIException( 93 | "User config file is not compatible with current version of" 94 | " S3netCDF4. Please update the config file at: {}".format( 95 | sl_config_path 96 | ) 97 | ) 98 | # add the filename to the config so we can refer to it in error 99 | # messages 100 | self._sl_user_config["filename"] = sl_config_path 101 | # keys to convert between text sizes and integer sizes 102 | # (e.g.) 
50MB to 50*1024*1024 103 | keys_to_convert = [ 104 | "maximum_part_size", 105 | "memory" 106 | ] 107 | # interpret the config file, converting the above keys 108 | interpret_config_file(self._sl_user_config, keys_to_convert) 109 | # close the config file 110 | fp.close() 111 | # configure some defaults if they are not in the config file 112 | # note that default configs for the backends are handled in the 113 | # constructor of the backend class, e.g. _s3aioFileObject 114 | avail_mem = psutil.virtual_memory().available 115 | fhandles = resource.getrlimit(resource.RLIMIT_NOFILE)[0] 116 | if "resource_allocation" in self._sl_user_config: 117 | if not "memory" in self._sl_user_config["resource_allocation"]: 118 | self._sl_user_config["resource_allocation"]["memory"] = avail_mem 119 | if (not "filehandles" in 120 | self._sl_user_config["resource_allocation"]): 121 | self._sl_user_config["resource_allocation"]["filehandles"] = fhandles 122 | else: 123 | self._sl_user_config["resource_allocation"] = { 124 | "memory" : avail_mem, 125 | "filehandles" : fhandles 126 | } 127 | 128 | except IOError: 129 | raise IOException( 130 | "User config file does not exist with path: {}".format( 131 | sl_config_path 132 | ) 133 | ) 134 | 135 | def __getitem__(self, name): 136 | """Get a value from the s3 config""" 137 | return self._sl_user_config[name] 138 | 139 | @property 140 | def items(self): 141 | """Return the items in the dictionary / config definition""" 142 | return self._sl_user_config.items() 143 | 144 | @items.setter 145 | def items(self, value): 146 | raise AttributeError("items cannot be altered") 147 | -------------------------------------------------------------------------------- /S3netCDF4/CFA/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Classes containing the structure of CFA-netCDF files (master array) and the 3 | CF-netcdf subarray files. 4 | See: 5 | http://www.met.reading.ac.uk/~david/cfa/0.4/index.html 6 | for the specification of the CFA conventions. 7 | 8 | Only a subset of the CFA-netCDF specification is implemented - just what we 9 | use to fragment the files to store as multiple objects on the object storage. 
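
As a rough illustration of how this hierarchy is navigated in practice, the
sketch below walks from a CFADataset down to the sub-array file referenced by
each partition. It follows the usage in bin/s3nc_cfa_mv.py and the class
summaries in the diagram further down; "s3_dataset" is assumed to be an
already-open s3Dataset and numpy is assumed to be imported as np:

    # the CFA structure is attached to an open s3Dataset
    cfa_dataset = s3_dataset._cfa_dataset
    for grp_name in cfa_dataset.getGroups():
        cfa_group = cfa_dataset.getGroup(grp_name)
        for var_name in cfa_group.getVariables():
            cfa_var = cfa_group.getVariable(var_name)
            # the partition matrix indexes every sub-array of the variable
            for index in np.ndindex(*cfa_var.getPartitionMatrixShape()):
                partition = cfa_var.getPartition(index)
                # each partition records the sub-array file it points to
                print(partition.file)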
10 | 11 | The classes here are organised to reflect the implied hierarchy in the CFA 12 | conventions : 13 | (NC = netCDF) 14 | 15 | +------------------------------------------------+ 16 | | CFADataset | 17 | +------------------------------------------------+ 18 | | format string | 19 | | metadata dict | 20 | | cfa_groups dict | 21 | +------------------------------------------------+ 22 | | bool createGroup(string grp_name)| 23 | | CFAGroup getGroup(string grp_name) | 24 | | bool renameGroup(string old_name,| 25 | | string new_name)| 26 | | list getGroups() | 27 | | dict getMetadata() | 28 | +------------------------------------------------+ 29 | | 30 | | 31 | | 32 | +------------------------------------------------+ 33 | | CFAGroup | 34 | +------------------------------------------------+ 35 | | cfa_dims dict | 36 | | grp_name string | 37 | | metadata dict | 38 | | cfa_vars dict | 39 | +------------------------------------------------+ 40 | | CFAVariable createVariable(string var_name, | 41 | | array shape, | 42 | | np.dtype dtype, | 43 | | list dim_names | 44 | | dict metadata) | 45 | | CFAVariable getVariable(string var_name) | 46 | | list getVariables() | 47 | | bool renameVariable(string old_name, | 48 | | string new_name) | 49 | | | 50 | | CFADim createDimension(string dim_name, | 51 | | int len, | 52 | | dictmetadata) | 53 | | CFADim getDimension(string dim_name) | 54 | | list getDimensions() | 55 | | bool renameDimension(string old_name, | 56 | | string new_name) | 57 | | | 58 | | string getName() | 59 | | dict getMetadata() | 60 | +------------------------------------------------+ 61 | | 62 | +--------------------------------------------------------------+ 63 | | | 64 | +------------------------------------------------+ +------------------------------------------------+ 65 | | CFAVariable | | CFADim | 66 | +------------------------------------------------+ +------------------------------------------------+ 67 | | var_name string | | dim_name string | 68 | | metadata dict | | dim_len int | 69 | | cf_role string | | metadata dict | 70 | | pmdimensions array | | axis_type string | 71 | | pmshape array | +------------------------------------------------+ 72 | | base string | | string getName() | 73 | | partitions array | | dict getMetadata() | 74 | +------------------------------------------------+ | array getIndices() | 75 | | string getName() | | string getAxisType | 76 | | dict getMetadata() | +------------------------------------------------+ 77 | | list getDimensions() | 78 | | bool parse(dict cfa_metadata) | 79 | | CFAPartition getPartition(array index) | 80 | +------------------------------------------------+ 81 | | 82 | | 83 | | 84 | +------------------------------------------------+ 85 | | CFAPartition | 86 | +------------------------------------------------+ 87 | | array index | 88 | | array location | 89 | | CFASubArray subarray | 90 | +------------------------------------------------+ 91 | | bool parse(dict cfa_metadata) | 92 | | array getIndex() | 93 | | array getLocation() | 94 | | CFASubarray getSubArray() | 95 | +------------------------------------------------+ 96 | | 97 | | 98 | | 99 | +------------------------------------------------+ 100 | | CFASubarray | 101 | +------------------------------------------------+ 102 | | ncvar string | 103 | | file string | 104 | | format string | 105 | | shape array | 106 | +------------------------------------------------+ 107 | | bool parse(dict cfa_metadata) | 108 | | string getncVar() | 109 | | string getFile() | 110 | | string 
getFormat() | 111 | | array getShape() | 112 | +------------------------------------------------+ 113 | """ 114 | -------------------------------------------------------------------------------- /test/test_s3FileObject.py: -------------------------------------------------------------------------------- 1 | from S3netCDF4.Backends._s3FileObject import s3FileObject 2 | from S3netCDF4._Exceptions import IOException, APIException 3 | import unittest 4 | import json 5 | import io 6 | 7 | """To run the tests, you need to create a .s3config.json file in the same 8 | directory as these tests. This file should contain: 9 | { 10 | "url": "", 11 | "credentials": { 12 | "accessKey": "", 13 | "secretKey": "" 14 | } 15 | } 16 | """ 17 | 18 | class s3FileObjectGeneralTest(object): 19 | """All of the general tests for either a read or write transaction.""" 20 | 21 | def tearDown(self): 22 | self.s3c.close() 23 | self.s3c_lines.close() 24 | 25 | def test_connect(self): 26 | self.assertTrue(self.s3c.connect()) 27 | 28 | def test_detach(self): 29 | self.assertTrue(self.s3c.connect()) 30 | self.assertRaises(io.UnsupportedOperation, self.s3c.detach) 31 | 32 | def test_close(self): 33 | self.assertTrue(self.s3c.connect()) 34 | self.assertTrue(self.s3c.close()) 35 | 36 | def test_readable(self): 37 | self.assertTrue(self.s3c.connect()) 38 | self.assertTrue(self.s3c.readable()) 39 | 40 | def test_truncate(self): 41 | self.assertTrue(self.s3c.connect()) 42 | self.assertRaises(io.UnsupportedOperation, self.s3c.truncate) 43 | 44 | def test_fileno(self): 45 | self.assertTrue(self.s3c.connect()) 46 | self.assertRaises(io.UnsupportedOperation, self.s3c.fileno) 47 | 48 | def test_seekable(self): 49 | self.assertTrue(self.s3c.connect()) 50 | self.assertTrue(self.s3c.seekable()) 51 | 52 | def test_tell(self): 53 | self.assertTrue(self.s3c.connect()) 54 | self.assertEqual(self.s3c.tell(), 0) 55 | 56 | def test_seek(self): 57 | self.assertTrue(self.s3c.connect()) 58 | # Three different methods for seek: 59 | # whence = io.SEEK_SET 60 | # whence = io.SEEK_CUR 61 | # whence = io.SEEK_END 62 | # the current pointer is on zero 63 | self.assertEqual(0, self.s3c.seek(0, whence=io.SEEK_SET)) 64 | self.assertEqual(10, self.s3c.seek(10, whence=io.SEEK_SET)) 65 | # now on 10 66 | with self.assertRaises(IOException) as contx: 67 | self.s3c.seek(-1, whence=io.SEEK_SET) 68 | # failed so still on 10 69 | 70 | # the current pointer is on ten (10) 71 | self.assertEqual(0, self.s3c.seek(-10, whence=io.SEEK_CUR)) 72 | # now on 0 - should raise an exception if we seek below 0 73 | with self.assertRaises(IOException): 74 | self.s3c.seek(-1, whence=io.SEEK_CUR) 75 | # still on zero: get the size to seek past it 76 | size = self.s3c._getsize() 77 | with self.assertRaises(IOException): 78 | self.s3c.seek(size+1, whence=io.SEEK_CUR) 79 | 80 | # still on zero - seek from the end 81 | with self.assertRaises(IOException): 82 | self.s3c.seek(size+1, whence=io.SEEK_END) 83 | # still on 0 - seek backwards from the end 84 | with self.assertRaises(IOException): 85 | self.s3c.seek(-1, whence=io.SEEK_END) 86 | # seek just a normal amount from the end 87 | self.assertEqual(size-10, self.s3c.seek(10, whence=io.SEEK_END)) 88 | 89 | 90 | class s3t1FileObjectWriteTest(unittest.TestCase, s3FileObjectGeneralTest): 91 | 92 | def setUp(self): 93 | """Set up the s3FileObject but don't connect.""" 94 | # load the credentials from the hidden file 95 | fh = open(".s3config.json") 96 | cfg = json.load(fh) 97 | fh.close() 98 | self.s3c = s3FileObject( 99 | cfg["url"] + 
"/buckettest/thefox1a.nc", 100 | credentials=cfg["credentials"], 101 | mode="w" 102 | ) 103 | 104 | # for writing with the write line methods 105 | self.s3c_lines = s3FileObject( 106 | cfg["url"] + "/buckettest/thefox1b.txt", 107 | credentials=cfg["credentials"], 108 | mode="w" 109 | ) 110 | 111 | def test_seek(self): 112 | with self.assertRaises(IOException): 113 | self.s3c.seek(0) 114 | 115 | def test_readable(self): 116 | self.assertTrue(self.s3c.connect()) 117 | self.assertFalse(self.s3c.readable()) 118 | 119 | def test_writable(self): 120 | self.assertTrue(self.s3c.connect()) 121 | self.assertTrue(self.s3c.writable()) 122 | 123 | def test_write(self): 124 | self.assertTrue(self.s3c.connect()) 125 | # create random bytes - if we keep it below s3c._getsize() then it will 126 | # only do one upload 127 | size = self.s3c._getsize() 128 | bytes = bytearray(size) 129 | for b in range(0, size): 130 | bytes[b] = 128 131 | self.assertNotEqual(0, self.s3c.write(bytes)) 132 | 133 | def test_write_multipart(self): 134 | self.assertTrue(self.s3c.connect()) 135 | # create random bytes - if we make it above 3c._getsize() then it will 136 | # do a multipart upload 137 | size = 3 * self.s3c._getsize() 138 | bytes = bytearray(size) 139 | for b in range(0, size): 140 | bytes[b] = 128 141 | self.assertNotEqual(0, self.s3c.write(bytes)) 142 | 143 | def test_write_lines(self): 144 | self.assertTrue(self.s3c_lines.connect()) 145 | lines = ["The","quick","brown","fox","jumped", 146 | "over","the","lazy","red","hen"] 147 | self.assertTrue(self.s3c_lines.writelines(lines)) 148 | 149 | 150 | class s3t2FileObjectReadTest(unittest.TestCase, s3FileObjectGeneralTest): 151 | 152 | def setUp(self): 153 | """Set up the s3FileObject but don't connect.""" 154 | # load the credentials from the hidden file 155 | fh = open(".s3config.json") 156 | cfg = json.load(fh) 157 | fh.close() 158 | self.s3c = s3FileObject( 159 | cfg["url"] + "/buckettest/thefox1a.nc", 160 | credentials=cfg["credentials"], 161 | mode="r" 162 | ) 163 | 164 | self.s3c_lines = s3FileObject( 165 | cfg["url"] + "/buckettest/thefox1b.txt", 166 | credentials=cfg["credentials"], 167 | mode="r" 168 | ) 169 | 170 | def test_writable(self): 171 | self.assertTrue(self.s3c.connect()) 172 | self.assertFalse(self.s3c.writable()) 173 | 174 | def testread(self): 175 | self.assertTrue(self.s3c.connect()) 176 | self.assertNotEqual(0, len(self.s3c.read())) 177 | 178 | def testreadrange(self): 179 | self.assertTrue(self.s3c.connect()) 180 | self.assertEqual(1024, len(self.s3c.read(size=1024))) 181 | self.assertNotEqual(0, len(self.s3c.read(size=1024))) 182 | 183 | def testreadinto(self): 184 | buffer = bytearray() 185 | self.assertTrue(self.s3c.connect()) 186 | self.assertNotEqual(0, self.s3c.readinto(buffer)) 187 | self.assertNotEqual(0, len(buffer)) 188 | 189 | def testreadline(self): 190 | self.assertTrue(self.s3c_lines.connect()) 191 | self.s3c_lines.seek(0) 192 | self.assertNotEqual(0, len(self.s3c_lines.readline())) 193 | self.s3c_lines.seek(0) 194 | self.assertNotEqual(0, len(self.s3c_lines.readlines())) 195 | 196 | if __name__ == '__main__': 197 | unittest.main() 198 | -------------------------------------------------------------------------------- /S3netCDF4/utils/split.py: -------------------------------------------------------------------------------- 1 | from S3netCDF4._s3netCDF4 import s3Dataset as s3Dataset 2 | from S3netCDF4.CFA._CFAExceptions import CFAError 3 | from netCDF4 import Dataset 4 | import numpy as np 5 | 6 | 7 | def copy_dims(nc_object, 
s3_object): 8 | nc_md_dims = nc_object.dimensions 9 | for d in nc_md_dims: 10 | # get the original dimension 11 | nc_dim = nc_object.dimensions[d] 12 | # create in the s3Dataset 13 | if nc_dim.isunlimited(): 14 | size = nc_dim.size 15 | else: 16 | size = nc_dim.size 17 | s3_object.createDimension(d, size) 18 | 19 | 20 | def copy_vars(nc_object, s3_object, subarray_size, subarray_shape=[]): 21 | nc_md_vars = nc_object.variables 22 | for v in nc_md_vars: 23 | # get the original variable 24 | nc_var = nc_object.variables[v] 25 | # create the variable if the sub array shape is given 26 | nc_var_md_keys = nc_var.ncattrs() 27 | if "_FillValue" in nc_var_md_keys: 28 | fill_value = nc_var.getncattr("_FillValue") 29 | else: 30 | fill_value = None 31 | # create the variable - the createVariable method needs to distinguish 32 | # between whether the shape or size has been passed in 33 | # Also, if the subarray_shape has been passed in then only attempt to 34 | # do it for variables with the same number of dimensions as the subarray 35 | # shape. 36 | use_shape = False 37 | if subarray_shape!=[]: 38 | shape_list = [int(x) for x in subarray_shape.strip("[]").split(",")] 39 | shape_array = np.array(shape_list) 40 | # check if shape_array.size is the same as the number of dims in the 41 | # netCDF variable 42 | if(shape_array.size == nc_var.ndim): 43 | use_shape = True 44 | 45 | if use_shape: 46 | # subarray shape at this moment is a string, [a,b,c,d] 47 | s3_var = s3_object.createVariable( 48 | # can only fill in endian from original dataset as 49 | # other initialisation variables are not stored in the 50 | # nc_var object 51 | nc_var.name, 52 | nc_var.dtype, 53 | endian=nc_var.endian(), 54 | fill_value=fill_value, 55 | dimensions=nc_var.dimensions, 56 | subarray_shape=shape_array) 57 | else: 58 | s3_var = s3_object.createVariable( 59 | # can only fill in endian from original dataset as 60 | # other initialisation variables are not stored in the 61 | # nc_var object 62 | nc_var.name, 63 | nc_var.dtype, 64 | endian=nc_var.endian(), 65 | fill_value=fill_value, 66 | dimensions=nc_var.dimensions, 67 | max_subarray_size=subarray_size) 68 | # copy the variable's metadata 69 | nc_var_md_keys = nc_var.ncattrs() 70 | for k in nc_var_md_keys: 71 | if k != "_FillValue": 72 | s3_var.setncattr(k, nc_var.getncattr(k)) 73 | 74 | # now copy the data - iterate over every partition 75 | if (s3_var._cfa_var): 76 | # it's a CFA variable so we want to copy the data in an intelligent 77 | # way - by copying it partition by partition. 
This will avoid 78 | # reading the large (potentially huge) dataset into memory all at 79 | # once 80 | pm_shape = tuple(s3_var._cfa_var.getPartitionMatrixShape()) 81 | for i in np.ndindex(pm_shape): 82 | partition = s3_var._cfa_var.getPartition(i) 83 | location = [] 84 | # this is a bit less obvious as we are using the partition 85 | # information to get the slices, rather than going from the 86 | # slices to the partition information, which happens in the 87 | # _CFAClasses 88 | for l in partition.location: 89 | s = slice(l[0], l[1], 1) 90 | location.append(s) 91 | location = tuple(location) 92 | nc_data = nc_var[location] 93 | s3_var[location] = nc_data 94 | else: 95 | # not a CFA variable so just copy the data 96 | s3_var[:] = nc_var[:] 97 | 98 | 99 | def split_into_CFA(output_path, input_path, 100 | subarray_path="", 101 | subarray_shape=[], subarray_size=50*1024*1024, 102 | cfa_version="0.5", ): 103 | """Split a netCDF file into a number of subarray files and write the CFA 104 | master array file.""" 105 | # if the subarray path is empty then get it from the output_path 106 | if subarray_path == "": 107 | if ".nca" in output_path: 108 | subarray_path = output_path[:-4] 109 | elif ".nc" in output_path: 110 | subarray_path = output_path[:-3] 111 | else: 112 | subarray_path = output_path 113 | output_path += ".nca" 114 | 115 | # open the input file 116 | nc_ds = Dataset(input_path, 'r') 117 | 118 | # get the output format for the new Dataset 119 | # if it's netCDF4 then the output is CFA4 120 | # if it's netCDF3 then the output is CFA3 121 | if nc_ds.file_format in ['NETCDF4', 'NETCDF4_CLASSIC']: 122 | s3_file_format = "CFA4" 123 | elif nc_ds.file_format == "NETCDF3_CLASSIC": 124 | s3_file_format = "CFA3" 125 | else: 126 | raise CFAError("Cannot split file with format: {}".format( 127 | nc_ds.file_format) 128 | ) 129 | 130 | # open the output file - copy the input from the input file to the output 131 | # file(s), whilst using the subarray settings to chunk the data 132 | s3_ds = s3Dataset(output_path, 'w', 133 | format=s3_file_format, 134 | cfa_version=cfa_version) 135 | 136 | # we now want to copy the information from the original dataset 137 | # netCDF files have: 138 | # global metadata 139 | # global dimensions 140 | # global variables 141 | # Each variable has 142 | # metadata 143 | # field data 144 | # 145 | # global groups 146 | # Each group has 147 | # metadata 148 | # dimensions 149 | # variables 150 | # Each variable has 151 | # metadata 152 | # field data 153 | 154 | # global metadata 155 | nc_md_keys = nc_ds.ncattrs() 156 | for k in nc_md_keys: 157 | s3_ds.setncattr(k, nc_ds.getncattr(k)) 158 | 159 | # global dimensions 160 | copy_dims(nc_ds, s3_ds) 161 | 162 | # global variables 163 | copy_vars(nc_ds, s3_ds, subarray_size, subarray_shape) 164 | 165 | # now do the groups 166 | for grp in nc_ds.groups: 167 | nc_grp = nc_ds.groups[grp] 168 | # create s3 group in the s3 dataset 169 | s3_grp = s3_ds.createGroup(nc_grp.name) 170 | # copy group metadata 171 | nc_md_keys = nc_grp.ncattrs() 172 | for k in nc_md_keys: 173 | s3_ds.setncattr(k, nc_grp.getncattr(k)) 174 | 175 | # copy group dimensions 176 | copy_dims(nc_ds, s3_ds) 177 | 178 | # copy group variables 179 | copy_vars(nc_ds, s3_ds, subarray_size, subarray_shape) 180 | 181 | # close the s3Dataset - super important as everything gets written on close 182 | s3_ds.close() -------------------------------------------------------------------------------- /bin/s3nc_cfa_mv.py: 
-------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | __copyright__ = "(C) 2019-2021 Science and Technology Facilities Council" 4 | __license__ = "BSD - see LICENSE file in top-level directory" 5 | __authors__ = "Neil Massey" 6 | 7 | """Program to rewrite partition infomation in a CFA-netCDF master-array file to reflect that a sub-array file has moved. 8 | """ 9 | 10 | import argparse 11 | from urllib.parse import urlparse 12 | import os 13 | import numpy as np 14 | import sys 15 | 16 | from S3netCDF4._s3netCDF4 import s3Dataset as s3Dataset 17 | from S3netCDF4.CFA._CFAClasses import CFAPartition 18 | 19 | def split_file_name(input_name): 20 | # split into prefix and filename 21 | # this should work on urls and file paths 22 | file_split = input_name.split("/") 23 | file_path = "/".join(file_split[:-1]) 24 | file_name = file_split[-1] 25 | return file_path, file_name 26 | 27 | def update_file_in_partition(prefix, cfa_var, partition_index): 28 | """Update the file_information in a variable for a given partition. 29 | Args: 30 | prefix (string): new prefix for files 31 | cfa_var (CFAVariable): variable to alter the partition for 32 | partition_index (np.ndarray): index of the partition to alter 33 | Returns: 34 | None 35 | """ 36 | # get the partition from the index 37 | partition = cfa_var.getPartition(partition_index) 38 | # get the file name and file path: 39 | file_path, file_name = split_file_name(partition.file) 40 | # new file path: 41 | new_file_path = prefix + "/" + file_name 42 | # construct a new partition 43 | new_part = CFAPartition( 44 | index = partition.index, 45 | location = partition.location, 46 | ncvar = partition.ncvar, 47 | file = new_file_path, 48 | format = partition.format, 49 | shape = partition.shape 50 | ) 51 | # write (and replace) the old partition 52 | cfa_var.writePartition(new_part) 53 | 54 | def update_file_in_variable(cfa_var, prefix, partition="all"): 55 | """Update the file_information in a variable for a given partition. 56 | Args: 57 | cfa_var (CFAVariable): CFA variable to alter, containing the partitions 58 | prefix (string): new prefix for files 59 | partition (string): index of the partition to alter, or 'all' 60 | Returns: 61 | None 62 | """ 63 | if partition == "all": 64 | pmshape = cfa_var.getPartitionMatrixShape() 65 | for partition_index in np.ndindex(*pmshape): 66 | update_file_in_partition(prefix, cfa_var, partition_index) 67 | else: 68 | # convert from partition string 69 | partition_index = np.fromstring(args.partition, dtype='i', sep=', ') 70 | update_file_in_partition(prefix, cfa_var, partition_index) 71 | 72 | def update_file_in_group(cfa_group, prefix, variable="all", partition="all"): 73 | """Update the file_information in a group for a given partition. 
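
# Hedged sketch of the prefix rewrite that update_file_in_partition() performs
# above: only the directory part of the sub-array file path is replaced and
# the filename is kept. The CFAPartition construction is omitted because it
# depends on the S3netCDF4 CFA classes; the paths shown are illustrative only.
def replace_prefix(partition_file, new_prefix):
    file_name = partition_file.split("/")[-1]
    return new_prefix + "/" + file_name

# e.g. move sub-array files from local disk to an S3 alias
print(replace_prefix("/data/cfa/tmp/tmp.file_[0].nc", "s3://tenancy/bucket/tmp"))
# -> s3://tenancy/bucket/tmp/tmp.file_[0].nc
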
74 | Args: 75 | cfa_group (CFAGroup): CFA group to alter, containing the cfa_variables 76 | prefix (string): new prefix for files 77 | variable (string): name of the variable to alter, or 'all' 78 | partition (string): index of the partition to alter, or 'all' 79 | Returns: 80 | None 81 | """ 82 | if variable == "all": 83 | for var in cfa_group.getVariables(): 84 | cfa_var = cfa_group.getVariable(var) 85 | update_file_in_variable(cfa_var, prefix, partition) 86 | else: 87 | if variable in cfa_group.getVariables(): 88 | cfa_var = cfa_group.getVariable(variable) 89 | update_file_in_variable(cfa_var, prefix, partition) 90 | 91 | 92 | def update_file_in_partitions(input_dataset, 93 | prefix, 94 | group="all", 95 | variable="all", 96 | partition="all"): 97 | """Update the file information in the given partition. 98 | This partition could be all or a single partition specified by [t,z,x,y] 99 | for example. 100 | 101 | Args: 102 | input_dataset (s3Dataset): dataset to alter 103 | prefix (string): new prefix for files 104 | group (string): name of group to alter, or 'all', or 'none' 105 | variable (string): name of variable to alter, or 'all' 106 | partition (string): name of partition to alter, or 'all' 107 | 108 | Returns: 109 | None 110 | """ 111 | # get the cfa structure from the dataset 112 | cfa_dataset = input_dataset._cfa_dataset 113 | if group == "all": 114 | for grp in cfa_dataset.getGroups(): 115 | cfa_group = cfa_dataset.getGroup(grp) 116 | update_file_in_group(cfa_group, prefix, variable, partition) 117 | else: 118 | # named group 119 | cfa_group = input_dataset.getGroup(group) 120 | update_file_in_group(cfa_group, prefix, variable, partition) 121 | 122 | 123 | if __name__ == "__main__": 124 | """Utility program to alter the structure of a CFA-netCDF master array 125 | file, either on the disk or remotely on S3 storage, to change the 126 | location of the sub-array file. Note that it doesn't actually move any 127 | files, it just updates the record in the partition matrix. 128 | It will only update the prefix of the file location, not the actual 129 | filename. i.e. it replaces os.path.dirname 130 | Options are: 131 | 1. The input master-array file, write back to the same file 132 | 2. The partition to change 133 | --partition=all|none| default: --partition=all 134 | 3. The prefix of the new address for the file location 135 | --prefix= 136 | """ 137 | # set up and parse the arguments 138 | parser = argparse.ArgumentParser( 139 | prog="s3nc_cfa_mv", 140 | formatter_class=argparse.RawTextHelpFormatter, 141 | description=( 142 | "Alter the paths of the sub-array files in the master-array file to" 143 | " reflect that those sub-array files have been moved to a new " 144 | " location. It will only update the prefix of the file location, " " not the actual filename." 145 | ) 146 | ) 147 | 148 | parser.add_argument( 149 | "input", action="store", default="", metavar="", 150 | help=( 151 | "Path of the CFA-netCDF master-array file to alter." 152 | ) 153 | ) 154 | 155 | parser.add_argument( 156 | "--group", action="store", default="all", 157 | metavar="", 158 | help=( 159 | "Name of a group to change file prefix for, or change all groups. " 160 | "--group=all|" 161 | ) 162 | ) 163 | 164 | parser.add_argument( 165 | "--variable", action="store", default="all", 166 | metavar="", 167 | help=( 168 | "Name of a variable to change file prefix for, or change all " "variables." 
169 | "--variable=all|" 170 | ) 171 | ) 172 | 173 | parser.add_argument( 174 | "--partition", action = "store", default="all", 175 | metavar="", 176 | help=( 177 | "Choose the partition to change the file location prefix for." 178 | "--partition=all" 179 | ) 180 | ) 181 | 182 | parser.add_argument( 183 | "--prefix", action = "store", default="none", required=True, 184 | metavar="", 185 | help=( 186 | "New file location prefix" 187 | ) 188 | ) 189 | args = parser.parse_args() 190 | 191 | # get the input file 192 | input_path = os.path.expanduser(args.input) 193 | # open the input dataset in append mode 194 | input_dataset = s3Dataset(input_path, mode='a') 195 | # Update the prefix in the partitions 196 | update_file_in_partitions(input_dataset, args.prefix, args.group, 197 | args.variable, args.partition) 198 | # close the file to save the changes 199 | input_dataset.close() 200 | -------------------------------------------------------------------------------- /test/test_s3aioFileObject.py: -------------------------------------------------------------------------------- 1 | from S3netCDF4.Backends._s3aioFileObject import s3aioFileObject 2 | from S3netCDF4._Exceptions import IOException, APIException 3 | import unittest 4 | import asyncio 5 | import json 6 | import time 7 | import io 8 | import inspect 9 | 10 | class AsyncIOTestFactory(type): 11 | """Metaclass that creates a `test_something` function for all those functions 12 | called `_test_something` which simply calls asyncio.run(`_test_something`)""" 13 | def __new__(cls, name, bases, dct): 14 | def mapper(attribute): 15 | if inspect.iscoroutinefunction(attribute): 16 | def async_wrapper(*args, **kwargs): 17 | loop = asyncio.get_event_loop() 18 | loop.run_until_complete(attribute(*args, **kwargs)) 19 | return async_wrapper 20 | else: 21 | return attribute 22 | return super().__new__( 23 | cls, 24 | name, 25 | bases, 26 | { k: mapper(v) for k, v in dct.items() } 27 | ) 28 | 29 | class s3aioFileObjectGeneralTest(object, metaclass=AsyncIOTestFactory): 30 | """All of the general tests for either a read or write transaction.""" 31 | 32 | async def test_detach(self): 33 | async with s3aioFileObject( 34 | self.cfg["STFC"]["url"] + "/buckettest/thefox2a.nc", 35 | credentials=self.cfg["STFC"]["credentials"], 36 | mode="w" 37 | ) as s3c: 38 | try: 39 | s3c.detach() 40 | except io.UnsupportedOperation: 41 | return 42 | self.fail( 43 | "s3aioFileObject.detach did not raise io.UnsupportedOperation" 44 | ) 45 | 46 | async def test_close(self): 47 | async with s3aioFileObject( 48 | self.cfg["STFC"]["url"] + "/buckettest/thefox2a.nc", 49 | credentials=self.cfg["STFC"]["credentials"], 50 | mode="rw" 51 | ) as s3c: 52 | if await s3c.close(): 53 | return 54 | else: 55 | self.fail("s3aioFileObject.close returned False") 56 | 57 | async def test_readable(self): 58 | async with s3aioFileObject( 59 | self.cfg["STFC"]["url"] + "/buckettest/thefox2a.nc", 60 | credentials=self.cfg["STFC"]["credentials"], 61 | mode="rw" 62 | ) as s3c: 63 | if s3c.readable(): 64 | return 65 | else: 66 | self.fail("s3aioFileObject.readable returned False") 67 | 68 | async def test_truncate(self): 69 | async with s3aioFileObject( 70 | self.cfg["STFC"]["url"] + "/buckettest/thefox2a.nc", 71 | credentials=self.cfg["STFC"]["credentials"], 72 | mode="w" 73 | ) as s3c: 74 | try: 75 | s3c.truncate() 76 | except io.UnsupportedOperation: 77 | return 78 | self.fail( 79 | "s3aioFileObject.truncate did not raise io.UnsupportedOperation" 80 | ) 81 | 82 | async def test_fileno(self): 83 | async 
with s3aioFileObject( 84 | self.cfg["STFC"]["url"] + "/buckettest/thefox2a.nc", 85 | credentials=self.cfg["STFC"]["credentials"], 86 | mode="w" 87 | ) as s3c: 88 | try: 89 | s3c.fileno() 90 | except io.UnsupportedOperation: 91 | return 92 | self.fail( 93 | "s3aioFileObject.fileno did not raise io.UnsupportedOperation" 94 | ) 95 | 96 | async def test_seekable(self): 97 | async with s3aioFileObject( 98 | self.cfg["STFC"]["url"] + "/buckettest/thefox2a.nc", 99 | credentials=self.cfg["STFC"]["credentials"], 100 | mode="rw" 101 | ) as s3c: 102 | if s3c.seekable(): 103 | return 104 | else: 105 | self.fail("s3aioFileObject.seekable returned False") 106 | 107 | async def test_tell(self): 108 | async with s3aioFileObject( 109 | self.cfg["STFC"]["url"] + "/buckettest/thefox2a.nc", 110 | credentials=self.cfg["STFC"]["credentials"], 111 | mode="rw" 112 | ) as s3c: 113 | if s3c.tell() == 0: 114 | return 115 | else: 116 | self.fail("s3aioFileObject.tell did not return 0") 117 | 118 | async def test_seek(self): 119 | async with s3aioFileObject( 120 | self.cfg["STFC"]["url"] + "/buckettest/thefox2a.nc", 121 | credentials=self.cfg["STFC"]["credentials"], 122 | mode="rw" 123 | ) as s3c: 124 | # Three different methods for seek: 125 | # whence = io.SEEK_SET 126 | # whence = io.SEEK_CUR 127 | # whence = io.SEEK_END 128 | # the current pointer is on zero 129 | if not await s3c.seek(0, whence=io.SEEK_SET) == 0: 130 | self.fail("s3aioFileObject.seek did not return 0") 131 | 132 | if not await s3c.seek(10, whence=io.SEEK_SET) == 10: 133 | self.fail("s3aioFileObject.seek did not return 10") 134 | # now on 10 135 | try: 136 | await s3c.seek(-1, whence=io.SEEK_SET) 137 | except IOException: 138 | pass 139 | else: 140 | self.fail("s3aioFileObject.seek did not raise IOException") 141 | # should have failed so still on 10 142 | 143 | # the current pointer is on ten (10) 144 | if not await s3c.seek(-10, whence=io.SEEK_CUR) == 0: 145 | self.fail("s3aioFileObject.seek did not return 0") 146 | 147 | # now on 0 - should raise an exception if we seek below 0 148 | try: 149 | await s3c.seek(-1, whence=io.SEEK_CUR) 150 | except IOException: 151 | pass 152 | else: 153 | self.fail("s3aioFileObject.seek did not raise IOException") 154 | 155 | # still on zero: get the size to seek past it 156 | size = await s3c._getsize() 157 | try: 158 | await s3c.seek(size+1, whence=io.SEEK_CUR) 159 | except IOException: 160 | pass 161 | else: 162 | self.fail("s3aioFileObject.seek did not raise IOException") 163 | 164 | # still on zero - seek from the end 165 | try: 166 | await s3c.seek(size+1, whence=io.SEEK_END) 167 | except IOException: 168 | pass 169 | else: 170 | self.fail("s3aioFileObject.seek did not raise IOException") 171 | 172 | # still on 0 - seek backwards from the end 173 | try: 174 | await s3c.seek(-1, whence=io.SEEK_END) 175 | except IOException: 176 | pass 177 | else: 178 | self.fail("s3aioFileObject.seek did not raise IOException") 179 | 180 | if await s3c.seek(10, whence=io.SEEK_END) != size-10: 181 | self.fail("s3aioFileObject.seek did not return {}".format( 182 | size-10 183 | )) 184 | 185 | 186 | class s3aiot1FileObjectWriteTest(unittest.TestCase, s3aioFileObjectGeneralTest): 187 | 188 | def setUp(self): 189 | """Set up the s3FileObject but don't connect.""" 190 | # load the credentials from the hidden file 191 | fh = open(".s3config.json") 192 | self.cfg = json.load(fh) 193 | fh.close() 194 | 195 | async def test_1writable(self): 196 | async with s3aioFileObject( 197 | self.cfg["STFC"]["url"] + "/buckettest/thefox2a.nc", 
198 | credentials=self.cfg["STFC"]["credentials"], 199 | mode="w" 200 | ) as s3c: 201 | if s3c.writable(): 202 | return 203 | else: 204 | self.fail("s3aioFileObject.writable returned False") 205 | 206 | async def test_1write(self): 207 | async with s3aioFileObject( 208 | self.cfg["STFC"]["url"] + "/buckettest/thefox2a.nc", 209 | credentials=self.cfg["STFC"]["credentials"], 210 | mode="w" 211 | ) as s3c: 212 | # create random bytes - if we keep it below s3c._getsize() then it will 213 | # only do one upload 214 | size = await s3c._getsize() 215 | bytes = bytearray(size) 216 | for b in range(0, size): 217 | bytes[b] = 128 218 | # convert bytes to io.BytesIO 219 | if await s3c.write(bytes) == 0: 220 | self.fail("s3aioFileObject.write returned zero") 221 | 222 | if __name__ == '__main__': 223 | loop = asyncio.get_event_loop() 224 | unittest.main() 225 | loop.close() 226 | -------------------------------------------------------------------------------- /test/test_s3Dataset.py: -------------------------------------------------------------------------------- 1 | from S3netCDF4._s3netCDF4 import s3Dataset as s3Dataset 2 | from S3netCDF4._Exceptions import APIException 3 | import numpy as np 4 | import unittest 5 | import os 6 | 7 | DEBUG = False 8 | 9 | def create_test_dataset(s3_ds, format, cfa_version, shape=[30,1,192,145]): 10 | """Create a test dataset for a netCDF file""" 11 | s3_ds.history = "Test of s3netCDF: format: {} cfa_version: {}".format( 12 | format, cfa_version 13 | ) 14 | 15 | # create a group if this is a netCDF4 (or CFA4 equivalent) file 16 | if format == "NETCDF4" or format == "CFA4": 17 | group = s3_ds.createGroup("test_group") 18 | # otherwise for netCDF3 files the group is the dataset 19 | else: 20 | group = s3_ds 21 | group.group_class = "Surface variables" 22 | 23 | # create the dimension, the variable, add the variable values and some 24 | # metadata 25 | if DEBUG: 26 | print("\t . Creating time") 27 | time_dim = group.createDimension("time", shape[0]) 28 | time_var = group.createVariable("time", np.float32, ("time",)) 29 | time_var[:] = np.arange(0, shape[0]) 30 | time_var.units = "days since 2000-01-01" 31 | time_var.axis = "T" 32 | 33 | if DEBUG: 34 | print("\t . Creating level") 35 | level_dim = group.createDimension("level", shape[1]) 36 | level_var = group.createVariable("level", np.float32, ("level",)) 37 | level_var[:] = np.arange(0, shape[1])*100 38 | level_var.standard_name = "height above sea-level" 39 | level_var.units = "m" 40 | 41 | if DEBUG: 42 | print("\t . Creating latitude") 43 | latitude_dim = group.createDimension("latitude", shape[2]) 44 | latitude_var = group.createVariable("latitude", np.float32, ("latitude",)) 45 | latitude_vals = 90.0 - np.arange(0, shape[2]) * 180.0/(shape[2]-1) 46 | latitude_var[:] = latitude_vals 47 | latitude_var.standard_name = "latitude" 48 | latitude_var.units = "degrees north" 49 | latitude_var.setncatts({"name": "value", "test":234235}) 50 | 51 | if DEBUG: 52 | print("\t . Creating longitude") 53 | longitude_dim = group.createDimension("longitude", shape[3]) 54 | longitude_var = group.createVariable("longitude", np.float32, ("longitude",)) 55 | longitude_vals = np.arange(0, shape[3]) * 360.0/shape[3] 56 | longitude_var[:] = longitude_vals 57 | longitude_var.standard_name = "longitude" 58 | longitude_var.units = "degrees east" 59 | 60 | if DEBUG: 61 | print("\t . 
Creating tmp") 62 | # create the field variable and data 63 | subarray_shape = np.array( 64 | [12, shape[1], shape[2], shape[3]], 65 | dtype='i' 66 | ) 67 | tmp_var = group.createVariable("tmp", np.float32, 68 | ("time", "level", "latitude", "longitude"), 69 | fill_value=2e2, 70 | subarray_shape=subarray_shape 71 | ) 72 | tmp_var.standard_name = "temperature" 73 | tmp_var.units = "degrees C" 74 | tmp_var.setncattr("long_name", "Surface temperature at 1m") 75 | tmp_var._FillValue = np.float32(2e20) # strict typing matches variable 76 | 77 | if DEBUG: 78 | print("\t . Writing data") 79 | 80 | # write a single scalar of data 81 | scl_var = s3_ds.createVariable("scl", np.float32) 82 | 83 | # write a vector of data 84 | vec_dim = s3_ds.createDimension("vector", 128) 85 | vec_var = s3_ds.createVariable("vector", np.int32, ("vector",)) 86 | vec_var[:] = 12+np.arange(0,128) 87 | velocity = s3_ds.createVariable("velocity", np.float32, ("vector",)) 88 | velocity.units = "ms-1" 89 | 90 | def get_file_path(path_stub, format, cfa_version=None): 91 | """Get the path to the file for reading or writing. 92 | Based on the path_stub, the format and cfa_version. 93 | """ 94 | file_name = "{}_{}".format(path_stub, format) 95 | if cfa_version is not None: 96 | file_name += "_cfa{}".format(cfa_version) 97 | file_name += ".nc" 98 | return file_name 99 | 100 | def test_s3Dataset_write(path_stub, format="NETCDF4", cfa_version="0.4", 101 | resolution_degrees=1.5): 102 | """Test writing out a s3Dataset, for one of the various permutations of: 103 | 1. file format (netCDF3 or netCDF4) 104 | 2. whether it is a S3-netCDF / CFA file or a plain netCDF file 105 | 3. the CFA version (0.4 or 0.5) 106 | """ 107 | # build a file name from the path stub, the format and the cfa_version 108 | # don't use os.path.join as it doesn't handle URLs and paths 109 | file_name = get_file_path(path_stub, format, cfa_version) 110 | if DEBUG: 111 | print("Test writing {}".format(file_name)) 112 | # open the dataset 113 | ds = s3Dataset(file_name, format=format, mode='w', cfa_version=cfa_version, 114 | diskless=False, persist=False) 115 | # construct the shape: 116 | shape=[365, 1, 180.0/resolution_degrees+1, 360.0/resolution_degrees] 117 | # create the data inside the dataset 118 | create_test_dataset(ds, format, cfa_version, shape) 119 | if DEBUG: 120 | print(ds.groups["test_group"].variables["tmp"]) 121 | print(ds.variables["scl"]) 122 | 123 | if format == "CFA4" or format == "NETCDF4": 124 | tmp_var = ds.groups["test_group"].variables["tmp"] 125 | else: 126 | tmp_var = ds.variables["tmp"] 127 | tmp_var[:,:,:,:] = 250.0 128 | vel_var = ds.variables["velocity"] 129 | vel_var[0] = 10.0 130 | ds.close() 131 | return True 132 | 133 | def test_s3Dataset_read(path_stub, format="NETCDF4", cfa_version=None): 134 | """Test writing out a s3Dataset, for one of the various permutations of: 135 | 1. file format (netCDF3 or netCDF4) 136 | 2. whether it is a S3-netCDF / CFA file or a plain netCDF file 137 | 3. 
the CFA version (0.4 or 0.5) 138 | """ 139 | file_name = get_file_path(path_stub, format, cfa_version) 140 | if DEBUG: 141 | print("Test reading {}".format(file_name)) 142 | # open the dataset 143 | dr = s3Dataset(file_name, mode='r') 144 | if DEBUG: 145 | print(dr.groups) 146 | 147 | if format == "NETCDF4" or format == "CFA4": 148 | grp = dr.groups["test_group"] 149 | else: 150 | grp = dr 151 | 152 | if DEBUG: 153 | print(grp.variables["tmp"]) 154 | print(dr.variables["scl"]) 155 | 156 | tmp_var = grp.variables["tmp"] 157 | x = tmp_var[:,0,0,0] 158 | dr.close() 159 | return True 160 | 161 | class s3DatasetTest(unittest.TestCase): 162 | # static class members 163 | # all path stubs the same 164 | path_stub = os.environ["HOME"] + "/Test/s3Dataset_test" 165 | res_deg = 2.5 166 | 167 | # 168 | def test_NETCDF4_CFA0_4(self): 169 | self.assertTrue( 170 | test_s3Dataset_write( 171 | s3DatasetTest.path_stub, "NETCDF4", "0.4", s3DatasetTest.res_deg 172 | ) 173 | ) 174 | self.assertTrue( 175 | test_s3Dataset_read(s3DatasetTest.path_stub, "NETCDF4", "0.4") 176 | ) 177 | 178 | def test_NETCDF4_CFA0_5(self): 179 | self.assertTrue( 180 | test_s3Dataset_write( 181 | s3DatasetTest.path_stub, "NETCDF4", "0.5", s3DatasetTest.res_deg 182 | ) 183 | ) 184 | self.assertTrue( 185 | test_s3Dataset_read(s3DatasetTest.path_stub, "NETCDF4", "0.5") 186 | ) 187 | 188 | def test_NETCDF3_CFA0_4(self): 189 | self.assertTrue( 190 | test_s3Dataset_write( 191 | s3DatasetTest.path_stub, "NETCDF3_CLASSIC", "0.4", s3DatasetTest.res_deg 192 | ) 193 | ) 194 | self.assertTrue( 195 | test_s3Dataset_read(s3DatasetTest.path_stub, "NETCDF3_CLASSIC", "0.4") 196 | ) 197 | 198 | def test_NETCDF3_CFA0_5(self): 199 | with self.assertRaises(APIException): 200 | test_s3Dataset_write( 201 | s3DatasetTest.path_stub, "NETCDF3_CLASSIC", "0.5", s3DatasetTest.res_deg 202 | ) 203 | 204 | def test_CFA4_CFA0_4(self): 205 | self.assertTrue( 206 | test_s3Dataset_write( 207 | s3DatasetTest.path_stub, "CFA4", "0.4", s3DatasetTest.res_deg 208 | ) 209 | ) 210 | self.assertTrue( 211 | test_s3Dataset_read(s3DatasetTest.path_stub, "CFA4", "0.4") 212 | ) 213 | 214 | def test_CFA4_CFA0_5(self): 215 | self.assertTrue( 216 | test_s3Dataset_write( 217 | s3DatasetTest.path_stub, "CFA4", "0.5", s3DatasetTest.res_deg 218 | ) 219 | ) 220 | self.assertTrue( 221 | test_s3Dataset_read(s3DatasetTest.path_stub, "CFA4", "0.5") 222 | ) 223 | 224 | def test_CFA3_CFA0_4(self): 225 | self.assertTrue( 226 | test_s3Dataset_write( 227 | s3DatasetTest.path_stub, "CFA3", "0.4", s3DatasetTest.res_deg 228 | ) 229 | ) 230 | self.assertTrue( 231 | test_s3Dataset_read(s3DatasetTest.path_stub, "CFA3", "0.4") 232 | ) 233 | 234 | def test_CFA3_CFA0_5(self): 235 | with self.assertRaises(APIException): 236 | test_s3Dataset_write( 237 | s3DatasetTest.path_stub, "CFA3", "0.5", s3DatasetTest.res_deg 238 | ) 239 | 240 | if __name__ == '__main__': 241 | unittest.main() 242 | -------------------------------------------------------------------------------- /S3netCDF4/CFA/_CFASplitter.pyx: -------------------------------------------------------------------------------- 1 | #!python 2 | #cython: language_level=3 3 | 4 | __copyright__ = "(C) 2019-2021 Science and Technology Facilities Council" 5 | __license__ = "BSD - see LICENSE file in top-level directory" 6 | __authors__ = "Neil Massey" 7 | 8 | """ 9 | CFASplitter class containing the routines required to take a 10 | multi-dimensional array and split it into subarrays according to the protocol 11 | that each subarray should have a maximum size, 
and that the number of 12 | operations required to read the entire array in any direction should be 13 | equal. 14 | 15 | """ 16 | 17 | import numpy as np 18 | cimport numpy as np 19 | 20 | cdef class CFASplitter: 21 | """ 22 | Class containing the methods required to return optimised subarrays for 23 | creating CFAVariables. 24 | """ 25 | 26 | cdef np.ndarray shape 27 | cdef np.ndarray subarray_shape 28 | cdef list axis_types 29 | cdef int max_subarray_size 30 | 31 | 32 | def __init__(self, 33 | np.ndarray shape, 34 | int max_subarray_size=0, 35 | list axis_types=[], 36 | ): 37 | """Initialise the CFA array splitter. 38 | 39 | Args: 40 | shape (np.ndarray): the shape of the array to split into subarrays. 41 | axis_types (list): a list of the types of axis, in order, for the 42 | shape of the array. These axis types can be: 43 | 'X' - X axis 44 | 'Y' - Y axis 45 | 'Z' - Z / level axis 46 | 'T' - Time axis 47 | 'N' - non of the above axis 48 | 'U' - unspecified axis, this needs to be overwritten 49 | """ 50 | DEFAULT_SUBARRAY_SIZE = 50*1024*1024 # 50MB default object size 51 | self.shape = shape 52 | if len(axis_types) == 0: 53 | # build the axis_types by guessing what they should be 54 | # this order follows CF conventions 55 | default_axis_types = ["T", "Z", "Y", "X"] 56 | new_axis_types = np.empty(shape.size) 57 | # position in default axis array 58 | p = len(default_axis_types)-1 59 | for i in range(shape.size, 0, -1): 60 | # calculate the default axis position 61 | if p >= 0: 62 | new_axis_types[i] = default_axis_types[p] 63 | # go to the next (previous) default axis type 64 | p -= 1 65 | else: 66 | new_axis_types[i] = 'N' 67 | self.axis_types = new_axis_types 68 | else: 69 | self.axis_types = axis_types 70 | 71 | if max_subarray_size == 0: 72 | self.max_subarray_size = DEFAULT_SUBARRAY_SIZE 73 | else: 74 | self.max_subarray_size = max_subarray_size 75 | 76 | self.subarray_shape = np.array([]) 77 | 78 | 79 | cdef _numVals(self, np.ndarray shape): 80 | """Return number of values in subarray of specified shape, given by a 81 | list of dimension lengths. 82 | 83 | shape -- list of subarray dimension sizes""" 84 | if (len(shape) == 0): 85 | return 1 86 | return np.prod(shape) 87 | 88 | 89 | cdef _subdivideArray(self, 90 | np.ndarray c_subarray_divs, 91 | list permitted_axes=["T"]): 92 | # calculate the number of elements per sub for the linear axis types 93 | n_per_subf = np.empty((len(self.shape),),'i') 94 | for i in range(0, len(self.shape)): 95 | if self.axis_types[i] not in permitted_axes: 96 | n_per_subf[i] = int(1e6) 97 | # check that we are not going to subdivide more than the axis length! 98 | elif c_subarray_divs[i] > self.shape[i]: 99 | n_per_subf[i] = int(1e6) 100 | else: 101 | n_per_subf[i] = c_subarray_divs[i] 102 | # get the minimum index 103 | min_i = np.argmin(n_per_subf) 104 | c_subarray_divs[min_i] += 1 105 | return c_subarray_divs 106 | 107 | 108 | cdef _getLinearOperations(self, np.ndarray c_subarray_divs): 109 | """Get the number of operations required to read one spatial point for 110 | every timestep through the dataset. 
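
# Hedged worked example for the size test used by CFASplitter above: _numVals
# returns the number of elements in a candidate subarray shape, and
# calculateSubarrayShape keeps subdividing while that count exceeds
# max_subarray_size (50*1024*1024 by default). The array shape below is
# illustrative.
import numpy as np

def num_vals(shape):
    # mirrors CFASplitter._numVals: product of the dimension lengths
    return 1 if len(shape) == 0 else int(np.prod(shape))

shape = np.array([365, 1, 145, 192], dtype='d')   # (T, Z, Y, X)
divs = np.ones(4, dtype='i')                      # one subarray per axis
print(num_vals(shape / divs))   # 10161600, below the default 52428800 limit,
                                # so this array needs no further subdivision
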
111 | This is equal to: number of subarrays in the T axis.""" 112 | # get the t axis index, if it exists, otherwise the Z axis, otherwise 113 | # the N axis 114 | t_ax = -1 115 | if "T" in self.axis_types: 116 | t_ax = self.axis_types.index("T") 117 | elif "Z" in self.axis_types: 118 | t_ax = self.axis_types.index("Z") 119 | elif "N" in self.axis_types: 120 | t_ax = self.axis_types.index("N") 121 | 122 | # calculate number of operations 123 | if t_ax != -1: 124 | return c_subarray_divs[t_ax] 125 | else: 126 | # otherwise return -1 127 | return -1 128 | 129 | 130 | cdef _getFieldOperations(self, np.ndarray c_subarray_divs): 131 | """Get the number of operations required to read one 2D field of data at 132 | a particular timestep or level throughout the dataset. 133 | This is equal to: (subarrays in the X axis) * 134 | (subarrays in the Y axis) 135 | """ 136 | # get the X and Y axes, if they exists 137 | x_ax = -1 138 | y_ax = -1 139 | if "X" in self.axis_types: 140 | x_ax = self.axis_types.index("X") 141 | if "Y" in self.axis_types: 142 | y_ax = self.axis_types.index("Y") 143 | 144 | # four possibilities: 145 | # 1. X & Y exist : return subarrays in X * subarrays in Y 146 | # 2. X exists but Y doesn't : return subarrays in X 147 | # 3. Y exists but X doesn't : return subarrays in Y 148 | # 4. Neither X or Y exists : return -1 149 | 150 | # logic optimised 151 | if not (x_ax == -1 or y_ax == -1): 152 | n_ops = c_subarray_divs[x_ax] * c_subarray_divs[y_ax] 153 | elif y_ax != -1: 154 | n_ops = c_subarray_divs[y_ax] 155 | elif x_ax != -1: 156 | n_ops = c_subarray_divs[x_ax] 157 | else: 158 | n_ops = -1 159 | 160 | return n_ops 161 | 162 | 163 | cpdef calculateSubarrayShape(self): 164 | """ 165 | Return a 'good shape' for the sub-arrays for an any-D variable, 166 | assuming balanced 1D/(n-1)D access 167 | 168 | Returns floating point field lengths of a field shape that provides 169 | balanced access of 1D subsets and 2D subsets of a netCDF or HDF5 170 | variable with any shape. 171 | 'Good shape' for fields means that the number of fields accessed to read 172 | either kind of 1D or 2D subset is approximately equal, and the size of 173 | each field is no more than max_subarray_size. 174 | An extra complication here is that we wish to be able to optimise for any number of 175 | dimensions (1,2,3,4, etc.) but ensure that the algorithm knows which axis it is 176 | operating on. For example, a 2D field with X and Y axes should not be split in 177 | the same way as a 2D field with T and Z axes. 178 | 179 | The algorithm follows a sub-division process, in this order (if they 180 | exist): 181 | 1. sub divide the X axis 182 | 2. sub divide the T axis 183 | 3. sub divide the Y axis 184 | 4. sub divide the Z axis 185 | 5. 
sub divide any N axes 186 | 187 | Calculating the access operations: 188 | There are two "types" of access operations 189 | - linear (accessing a single spatial point across timesteps) 190 | - field (accessing a 2D field of data at a particular timestep) 191 | The number of access operations are: 192 | - linear : T dimension / number of subfields in the T axis 193 | - field : (X dimension / number of subfields in the X axis)* 194 | (Y dimension / number of subfields in the Y axis) 195 | """ 196 | 197 | # the algorithm first calculates how many partitions each dimension 198 | # should be split into - this is stored in c_subfield_divs 199 | # current subfield_repeats shape defaults to var shape 200 | c_subarray_divs = np.ones((len(self.shape),), 'i') 201 | 202 | # if the number of values in the field_shape is greater than 203 | # max_subarray_size then divide 204 | while (self._numVals(self.shape / c_subarray_divs)) > self.max_subarray_size: 205 | # get the linear access and the field access operations 206 | linear_ops = self._getLinearOperations(c_subarray_divs) 207 | field_ops = self._getFieldOperations(c_subarray_divs) 208 | # choose to divide on field ops first, if the number of ops are equal 209 | if field_ops <= linear_ops: 210 | c_subarray_divs = self._subdivideArray(c_subarray_divs, 211 | ["X", "Y"] 212 | ) 213 | else: 214 | c_subarray_divs = self._subdivideArray(c_subarray_divs, 215 | ["T", "Z", "N"] 216 | ) 217 | 218 | # we have so far calculated the optimum number of times each axis will 219 | # be divided 220 | # - translate this into a (floating point) number of elements in each 221 | # chunk, for each axis 222 | c_subarray_shape = np.array(self.shape, 'd') / c_subarray_divs 223 | self.subarray_shape = c_subarray_shape 224 | return c_subarray_shape 225 | 226 | 227 | cpdef setSubarrayShape(self, np.ndarray subarray_shape): 228 | """Set the shape of the subarray, for when the user wishes to define it. 229 | """ 230 | self.subarray_shape = subarray_shape 231 | return subarray_shape 232 | -------------------------------------------------------------------------------- /bin/s3nc_cfa_info.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | __copyright__ = "(C) 2019-2021 Science and Technology Facilities Council" 4 | __license__ = "BSD - see LICENSE file in top-level directory" 5 | __authors__ = "Neil Massey" 6 | 7 | """Program to return information about a netCDF-CFA file from disk or S3. 8 | Modelled after ncdump and cdo info. 9 | """ 10 | 11 | import argparse 12 | from urllib.parse import urlparse 13 | import os 14 | import numpy as np 15 | 16 | from S3netCDF4._s3netCDF4 import s3Dataset as s3Dataset 17 | 18 | def print_dimension_info(input_dim, metadata): 19 | """Print the information for the dimension.""" 20 | dim_size = input_dim.getLen() 21 | print(" {} = {}".format(input_dim.getName(), dim_size)) 22 | # print the metadata 23 | if metadata: 24 | md = input_dim.getMetadata() 25 | for key in md: 26 | if key[0:4] != "cfa_": 27 | print (" {}:{} = {}".format( 28 | input_dim.getName(), key, md[key]) 29 | ) 30 | 31 | def print_dimensions(group, metadata): 32 | """Print all the dimensions in a group.""" 33 | for d in group.getDimensions(): 34 | input_dimension = group.getDimension(d) 35 | print_dimension_info(input_dimension, metadata) 36 | 37 | def print_partition_info(input_var, partition_index): 38 | """Print the partition information for a single partition. 
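
# Hedged, pure-numpy restatement of the balancing loop in
# CFASplitter.calculateSubarrayShape above. It is not the Cython
# implementation: it assumes T, Y and X axes are all present and simply
# increments the division count on the spatial axis with the fewest divisions,
# but it shows how the linear (T) and field (X*Y) operation counts are kept
# roughly equal while the per-subarray element count is pushed under the limit.
import numpy as np

def good_divisions(shape, axis_types, max_elems=50 * 1024 * 1024):
    divs = np.ones(len(shape), dtype='i')
    t_ax = axis_types.index("T")
    y_ax = axis_types.index("Y")
    x_ax = axis_types.index("X")
    while np.prod(shape / divs) > max_elems:
        linear_ops = divs[t_ax]
        field_ops = divs[x_ax] * divs[y_ax]
        if field_ops <= linear_ops:
            # subdivide the spatial axis with the fewest divisions so far
            target = x_ax if divs[x_ax] <= divs[y_ax] else y_ax
        else:
            target = t_ax
        divs[target] += 1
    return divs

# illustrative: ~1 degree global field, daily values for ten years
print(good_divisions(np.array([3650, 1, 181, 360], dtype='d'),
                     ["T", "Z", "Y", "X"]))   # -> [2 1 2 2]
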
39 | By this point partition should be a numpy array of the number of 40 | dimensions of the partition.""" 41 | partition = input_var.getPartition(partition_index) 42 | var_name_len = len(input_var.getName()) + 16 43 | just_str = "" 44 | for x in range(0, var_name_len): 45 | just_str += " " 46 | print(" {}:{} {} =".format( 47 | input_var.getName(), "partition", partition_index 48 | ) 49 | ) 50 | # location 51 | location_string = ":location = {}".format(partition.location.tolist()) 52 | print(just_str + location_string) 53 | # shape 54 | shape_string = ":shape = {}".format(partition.shape.tolist()) 55 | print(just_str + shape_string) 56 | # filename 57 | filename_string = ":filename = {}".format(partition.file) 58 | print(just_str + filename_string) 59 | # varname 60 | varname_string = ":variable = {}".format(partition.ncvar) 61 | print(just_str + varname_string) 62 | # format 63 | format_string = ":format = {}".format(partition.format) 64 | print(just_str + format_string) 65 | 66 | def print_variable_info(input_var, partition, metadata): 67 | """Print the information for the variable.""" 68 | print(" {} {}({})".format( 69 | input_var.getType(), 70 | input_var.getName(), 71 | ",".join(input_var.getDimensions()) 72 | ) 73 | ) 74 | # print the metadata 75 | if metadata: 76 | md = input_var.getMetadata() 77 | for key in md: 78 | if key[0:4] != "cfa_": 79 | print (" {}:{} = {}".format( 80 | input_var.getName(), key, md[key]) 81 | ) 82 | # print the minimum partition information 83 | # print the partition matrix shape 84 | pmshape_str = "(" 85 | for x in input_var.getPartitionMatrixShape(): 86 | pmshape_str += str(x) + ", " 87 | pmshape_str = pmshape_str[:-2] + ")" 88 | print (" {}:{} = {}".format( 89 | input_var.getName(), "pmshape", pmshape_str) 90 | ) 91 | # print the partition matrix dimensions 92 | pmdims = "(" + ", ".join(input_var.getPartitionMatrixDimensions()) + ")" 93 | print (" {}:{} = {}".format( 94 | input_var.getName(), "pmdimensions", pmdims) 95 | ) 96 | # print the partition 97 | if partition == "all": 98 | pmshape = input_var.getPartitionMatrixShape() 99 | for index in np.ndindex(*pmshape): 100 | print_partition_info(input_var, index) 101 | elif partition == "none": 102 | pass # do not print anything for partition==none 103 | else: 104 | partition_index = np.fromstring(args.partition, dtype='i', sep=', ') 105 | print_partition_info(input_var, np.array(partition_index)) 106 | 107 | def print_variables(group, partition, metadata): 108 | for v in group.getVariables(): 109 | input_var = group.getVariable(v) 110 | print_variable_info(input_var, partition, metadata) 111 | 112 | def print_group_info(input_grp, variable, partition, metadata): 113 | """Print the information for the group, and all the dimensions and 114 | variables in the group.""" 115 | if variable == "none": 116 | print(" {}".format(input_grp.getName())) 117 | else: 118 | print("group: {} ".format(input_grp.getName())+"{") 119 | # print the dimensions 120 | print(" dimensions:") 121 | print_dimensions(input_grp, metadata) 122 | 123 | # print the variables in the group 124 | print(" variables:") 125 | if variable == "all": 126 | print_variables(input_grp, partition, metadata) 127 | else: 128 | input_var = input_grp.getVariable(variable) 129 | print_variable_info(input_var, partition, metadata) 130 | print(" }") 131 | if metadata: 132 | print(" // group attributes") 133 | md = input_grp.getMetadata() 134 | for key in md: 135 | if key[0:4] != "cfa_": 136 | print (" :{} = {}".format( 137 | key, md[key]) 138 | ) 139 | 140 | 
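
# Hedged sketch of how a --partition value such as "3, 0, 0, 0" becomes an
# index into the partition matrix, mirroring the np.fromstring call used in
# print_variable_info above. Note that print_variable_info reads
# args.partition (the module-level argparse namespace) rather than its own
# partition argument, so an explicit index only takes effect when this module
# is run as a script.
import numpy as np

def parse_partition_index(partition):
    # an equivalent without np.fromstring would be:
    # np.array([int(x) for x in partition.split(",")], dtype='i')
    return np.fromstring(partition, dtype='i', sep=', ')

print(parse_partition_index("3, 0, 0, 0"))   # -> [3 0 0 0]
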
def print_dataset_info(input_dataset, group, variable, partition, metadata): 141 | """Print the information for the dataset. Use the CFA class. 142 | Print the name, metadata and groups. Recurse into the group to print the 143 | variables if variable==all or variable==.""" 144 | cfa_d = input_dataset._cfa_dataset 145 | print(cfa_d.getName() + " {") 146 | # print the root group if group == "all" or group == "root" 147 | if (group in ["all", "root"]): 148 | root_grp = cfa_d["root"] 149 | print("dimensions:") 150 | print_dimensions(root_grp, metadata) 151 | print("variables:") 152 | if variable == "all": 153 | print_variables(root_grp, partition, metadata) 154 | else: 155 | input_var = root_grp.getVariable(variable) 156 | print_variable_info(input_var, partition, metadata) 157 | # print the group names, unless just the root group is requested 158 | if (group != "root"): 159 | if (variable == "none"): 160 | print("groups:") 161 | for g in cfa_d.getGroups(): 162 | input_grp = cfa_d[g] 163 | if (g != "root" and g[0:4] != "cfa_"): 164 | print_group_info(input_grp, variable, partition, metadata) 165 | else: 166 | if (variable == "none"): 167 | print("groups:") 168 | input_grp = cfa_d[group] 169 | print_group_info(input_grp, variable, partition, metadata) 170 | 171 | # print the global attributes 172 | if metadata: 173 | print("// global attributes") 174 | md = cfa_d.getMetadata() 175 | for key in md: 176 | print (" :{} = {}".format(key, md[key])) 177 | print("}") 178 | 179 | if __name__ == "__main__": 180 | """Utility program to display the structure of a CFA-netCDF master array 181 | file, either on the disk or remotely on S3 storage. 182 | This program is inspired by ncdump and cdo info / sinfo. 183 | We need options to control three things: 184 | 1. Whether to output all the groups, or a particular group and whether 185 | to output the variables in the group(s) 186 | --group=all| default: --group=all 187 | --variable=all| default: --variable=all 188 | 2. Whether to output the metadata or not 189 | --metadata default: --metadata(on) 190 | 3. Whether to output partition information for the variables, either 191 | all the partition information or for a particular partition. 192 | --partition=all|none| default: --partion=none (off) 193 | """ 194 | # set up and parse the arguments 195 | parser = argparse.ArgumentParser( 196 | prog="s3nc_cfa_info", 197 | formatter_class=argparse.RawTextHelpFormatter, 198 | description=( 199 | "Output information about a CFA-netCDF file, or netCDF file either " 200 | "on disk or on S3" 201 | ) 202 | ) 203 | 204 | parser.add_argument( 205 | "input", action="store", default="", 206 | metavar="", 207 | help=( 208 | "Path of the CFA-netCDF or netCDF file input file, either on disk" 209 | " or S3." 210 | ) 211 | ) 212 | 213 | parser.add_argument( 214 | "--group", action="store", default="all", 215 | metavar="", 216 | help=( 217 | "Name of a group to print information about, or print all groups. " 218 | "--group=all|" 219 | ) 220 | ) 221 | 222 | parser.add_argument( 223 | "--variable", action="store", default="all", 224 | metavar="", 225 | help=( 226 | "Name of a variable to print information about, print all or no" "variables. " 227 | "--variable=all|none|" 228 | ) 229 | ) 230 | 231 | parser.add_argument( 232 | "--partition", action = "store", default="none", 233 | metavar="", 234 | help=( 235 | "Print the information about a partition. 
" 236 | "--partition=all|none|" 237 | ) 238 | ) 239 | 240 | parser.add_argument( 241 | "--metadata", action = "store_true", default=False, 242 | help=( 243 | "Print the metadata for groups, dimensions and variables" 244 | "--metadata" 245 | ) 246 | ) 247 | 248 | args = parser.parse_args() 249 | 250 | if args.input: 251 | input_file = args.input 252 | else: 253 | input_file = None 254 | 255 | if args.group: 256 | group = args.group 257 | else: 258 | group = "all" 259 | 260 | if args.variable: 261 | variable = args.variable 262 | else: 263 | variable = "all" 264 | 265 | if args.partition: 266 | # convert the partition string to a numpy array 267 | partition = args.partition 268 | else: 269 | partition = "none" 270 | 271 | if args.metadata: 272 | metadata = True 273 | else: 274 | metadata = False 275 | 276 | if input_file: 277 | # Get the input file. 278 | path = os.path.expanduser(input_file) 279 | input_dataset = s3Dataset(path, mode='r') 280 | # Print the global dataset information 281 | print_dataset_info( 282 | input_dataset, 283 | group, 284 | variable, 285 | partition, 286 | metadata 287 | ) 288 | #else: 289 | -------------------------------------------------------------------------------- /S3netCDF4/utils/agg.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse 2 | import os 3 | from glob import glob 4 | import numpy as np 5 | 6 | from S3netCDF4._s3netCDF4 import s3Dataset as s3Dataset 7 | from S3netCDF4.CFA._CFAClasses import CFAPartition 8 | from S3netCDF4.Managers._FileManager import FileManager 9 | 10 | from netCDF4 import num2date, date2num 11 | 12 | def get_universal_times(nc_var, common_date): 13 | # get the start date and calendar 14 | if ("units" in nc_var.ncattrs() and 15 | "calendar" in nc_var.ncattrs() and 16 | common_date is not None): 17 | date_values = num2date(nc_var[:], 18 | nc_var.units, 19 | nc_var.calendar) 20 | axis_dim_values = date2num(date_values, 21 | common_date, 22 | nc_var.calendar) 23 | else: 24 | axis_dim_values = nc_var[:] 25 | return axis_dim_values 26 | 27 | 28 | def add_var_dims(in_object, out_object, axis, fname, common_date): 29 | """Add the variables and dimensions to the s3Dataset or s3Group""" 30 | # create dimension, get the axis dimension location 31 | axis_dim_n = -1 32 | for d, dim in enumerate(in_object.dimensions): 33 | in_dim = in_object.dimensions[dim] 34 | if dim not in out_object.dimensions: 35 | # get the dim size, 0 is UNLIMITED if dim == axis 36 | if axis == dim: 37 | dim_size = 0 38 | else: 39 | dim_size = in_dim.size 40 | 41 | out_dim = out_object.createDimension( 42 | dim, dim_size 43 | ) 44 | else: 45 | out_dim = out_object.dimensions[dim] 46 | # get the axis dimension 47 | if axis == dim: 48 | axis_dim_n = d 49 | 50 | # create variable 51 | for var in in_object.variables: 52 | in_var = in_object.variables[var] 53 | # get the variable metadata 54 | in_var_attrs = { 55 | x: in_var.getncattr(x) for x in in_var.ncattrs() 56 | } 57 | # if the variable does not already exist then create it 58 | if var not in out_object.variables: 59 | # get the subarray shape 60 | shp = in_var.shape 61 | subarray_shape = np.array(shp, 'i') 62 | if len(in_var.dimensions) > 0: 63 | # rejig axis to be unlimited 64 | if len(subarray_shape) > axis_dim_n: 65 | subarray_shape[axis_dim_n] = 0 66 | # create the variable with subarray 67 | out_var = out_object.createVariable( 68 | var, in_var.dtype, in_var.dimensions, 69 | subarray_shape=subarray_shape 70 | ) 71 | else: # no dimensions, just 
a scalar variable 72 | out_var = out_object.createVariable( 73 | var, in_var.dtype 74 | ) 75 | else: 76 | # variable has already been created so get it 77 | out_var = out_object.variables[var] 78 | 79 | # only write partitions for field variables - those with _cfa_var != None 80 | if out_var._cfa_var: 81 | # get the current partition matrix shape 82 | c_shape = out_var._cfa_var.getPartitionMatrixShape() 83 | # create the index to append at the end of the currently used 84 | # indices 85 | n_dims = len(out_var.dimensions) 86 | if n_dims > 0: 87 | index = np.zeros(n_dims, 'i') 88 | index[axis_dim_n] = c_shape[0] 89 | # get the location along the aggregation axis in the Master Array, 90 | # from the axis dimension variable 91 | location = np.zeros([n_dims, 2],'i') 92 | 93 | # check whether the axis is in the dimensions of the input_variable 94 | # and calculate the location from it if it is 95 | if axis in in_var.dimensions: 96 | # get the values of the axis variable 97 | axis_dim_var = in_object.variables[axis] 98 | # if this is a time variable then covert the values to a common 99 | # calendar 100 | if axis_dim_var.name == "time" or axis_dim_var.name[0] == "t": 101 | # get the start date and calendar 102 | axis_dim_values = get_universal_times( 103 | axis_dim_var, common_date 104 | ) 105 | 106 | # get the axis resolution - i.e. the difference for each step 107 | # along the axis 108 | try: 109 | axis_res = (axis_dim_values[-1] - axis_dim_values[0]) / len(axis_dim_values) 110 | except IndexError: 111 | axis_res = 1 112 | # prevent divide by zero 113 | if (axis_res == 0.0): 114 | axis_res = 1.0 115 | # set the location for the aggregating axis dimension 116 | location[axis_dim_n, 0] = int(axis_dim_values[0] / axis_res) 117 | location[axis_dim_n, 1] = location[axis_dim_n, 0] + len(axis_dim_var) 118 | # set the locations for the other dimensions - equal to 0 to the 119 | # shape of the array 120 | for d, dim in enumerate(out_var.dimensions): 121 | # don't redo the above for axis_dim_n 122 | if d != axis_dim_n: 123 | location[d, 0] = 0 124 | location[d, 1] = in_var.shape[d] 125 | else: 126 | for d in range(0, len(in_var.shape)): 127 | location[d, 0] = 0 128 | location[d, 1] = in_var.shape[d] 129 | 130 | # get the datamodel from the parent object 131 | try: 132 | datamodel = out_object._nc_grp.data_model 133 | except (KeyError, AttributeError): 134 | datamodel = out_object._nc_dataset.data_model 135 | 136 | # create the partition for none scalar variables 137 | if len(out_var._cfa_var.getPartitionMatrixShape() != 0): 138 | partition = CFAPartition( 139 | index=tuple(index), 140 | location=location, 141 | ncvar=var, 142 | file=fname, 143 | format=datamodel, 144 | shape=in_var.shape 145 | ) 146 | # write the partition 147 | out_var._cfa_var.writePartition(partition) 148 | # add the attributes to the s3Dataset by updating the dictionary 149 | out_var._cfa_var.metadata.update(in_var_attrs) 150 | else: 151 | # assign the values from the input variable to the output variable 152 | # if it is the axis variable then append / concatenate 153 | if var == axis: 154 | var_vals = in_object.variables[var] 155 | axl = out_var._nc_var.shape[axis_dim_n] 156 | # convert times here as well 157 | out_var[axl:] = get_universal_times(var_vals, common_date) 158 | else: 159 | out_var[:] = in_object.variables[var][:] 160 | # update the in_var_attrs to the new common_date if applicable 161 | if (common_date is not None and 162 | "units" in in_var_attrs and 163 | in_var.name == axis): 164 | in_var_attrs["units"] = 
common_date 165 | out_var.setncatts(in_var_attrs) 166 | 167 | 168 | def create_partitions_from_files(out_dataset, files, axis, 169 | cfa_version, common_date): 170 | """Create the CFA partitions from a list of files.""" 171 | # loop over the files and open as a regular netCDF4 Dataset 172 | for fname in files: 173 | in_dataset = s3Dataset(fname, "r") 174 | # get the global metadata 175 | in_dataset_attrs = { 176 | x: in_dataset.getncattr(x) for x in in_dataset.ncattrs() 177 | } 178 | # add the attributes to the s3Dataset by updating the dictionary 179 | out_dataset._cfa_dataset.metadata.update(in_dataset_attrs) 180 | # loop over the groups 181 | for grp in in_dataset.groups: 182 | in_group = in_dataset[grp] 183 | # create a group if one with this name does not exist 184 | if grp not in out_dataset.groups: 185 | out_group = out_dataset.createGroup(grp) 186 | else: 187 | out_group = out_dataset.groups[grp] 188 | # update the metadata 189 | in_group_attrs = { 190 | x: in_group.getncattr(x) for x in in_group.ncattrs() 191 | } 192 | out_group._cfa_grp.metadata.update(in_group_attrs) 193 | add_var_dims(in_group, out_group, axis, fname, common_date) 194 | 195 | # add the variables in the root group 196 | add_var_dims(in_dataset, out_dataset, axis, fname, common_date) 197 | in_dataset.close() 198 | 199 | 200 | def sort_partition_matrix(out_var, axis): 201 | """Sort the partition matrix for a single variable.""" 202 | # get the index of the axis that we are aggregating over 203 | try: 204 | axis_dim_n = out_var._cfa_var.getPartitionMatrixDimensions().index(axis) 205 | # create the index 206 | n_dims = len(out_var._cfa_var.getDimensions()) 207 | # get the location values from the values 208 | locs = out_var._cfa_var.getPartitionValues(key="location").squeeze() 209 | # get the first (start) location values and get the order to sort them 210 | # in 211 | sort_order = np.argsort(locs[:,axis_dim_n,0]) 212 | # loop over the sort order and write the partition information into 213 | # the new location 214 | # keep a list of partitions 215 | new_parts = [] 216 | for i, s in enumerate(sort_order): 217 | # build the index to get the partition, in the sort order 218 | index = np.zeros(n_dims,'i') 219 | index[axis_dim_n] = s 220 | # get the partition 221 | source_part = out_var._cfa_var.getPartition(index) 222 | # reassign the index 223 | source_part.index[axis_dim_n] = i 224 | # add to the list 225 | new_parts.append(source_part) 226 | 227 | # now rewrite the partitions, and ensure their integrity - i.e. 
make 228 | # sure that the axis partitions are the right length 229 | for p in range(len(new_parts)): 230 | # get the first new partition and the first location - this is the 231 | # offset which we will need to subtract from the other locations 232 | # in the loop as it changes in the loop 233 | axis_offset = new_parts[0].location[axis_dim_n, 0] 234 | 235 | part = new_parts[p] 236 | if p > 0: 237 | # align with previous partition 238 | prev_part = new_parts[p-1] 239 | part.location[axis_dim_n,0] = prev_part.location[axis_dim_n,1] 240 | # make sure end of partition aligns with shape of array 241 | part.location[axis_dim_n,1] = (part.location[axis_dim_n,0] + 242 | part.shape[axis_dim_n]) 243 | part.location[axis_dim_n,:] -= axis_offset 244 | # subtract the offset 245 | out_var._cfa_var.writePartition(part) 246 | 247 | except ValueError: 248 | axis_dim_n = 0 249 | 250 | 251 | def sort_axis_variable(out_object, axis): 252 | # sort the axis variable and write back out to the netCDF object 253 | try: 254 | axis_dim_var = out_object.variables[axis] 255 | axis_dim_var[:] = np.sort(axis_dim_var[:]) 256 | except KeyError: 257 | pass 258 | 259 | 260 | def sort_partition_matrices(out_dataset, axis): 261 | """Sort the partition matrices for all the variables. Sort is based on the 262 | first element of the location.""" 263 | # need to sort all groups 264 | for grp in out_dataset.groups: 265 | out_group = out_dataset.groups[grp] 266 | # need to sort all variables in the group 267 | for var in out_group.variables: 268 | out_var = out_group.variables[var] 269 | if out_var._cfa_var: 270 | sort_partition_matrix(out_var, axis) 271 | 272 | # sort the axis variable in the group 273 | sort_axis_variable(out_group, axis) 274 | 275 | # need to sort all the variables just in the database 276 | for var in out_dataset.variables: 277 | out_var = out_dataset.variables[var] 278 | if out_var._cfa_var: 279 | sort_partition_matrix(out_var, axis) 280 | 281 | # sort the axis variable in the dataset 282 | sort_axis_variable(out_dataset, axis) 283 | 284 | 285 | def get_file_list(path): 286 | """Get a list of files given the path. 287 | The path can be: 288 | a directory 289 | a glob with multiple wildcards 290 | a 'path' on a S3 storage device 291 | """ 292 | # open the directory as a FileManager object 293 | fm = FileManager() 294 | path = os.path.expanduser(path) 295 | request_object = fm.request_file(path) 296 | file_object = request_object.file_object 297 | 298 | # get a list of files using the file object if it is a remote system 299 | if (file_object.remote_system): 300 | # split the url into the scheme, netloc, etc. 
301 | url_o = urlparse(path) 302 | # the alias is the scheme + "://" + netloc 303 | alias = url_o.scheme + "://" + url_o.netloc 304 | # use a paginator to get multiple pages of the objects in the bucket 305 | files = file_object.glob() 306 | # add the alias and bucket to each of the files 307 | bucket = file_object.file_handle._bucket 308 | for i, f in enumerate(files): 309 | files[i] = alias + "/" + bucket + "/" + f 310 | else: 311 | if os.path.isdir(path): 312 | rawfiles = os.listdir(path) 313 | files = [os.path.join(path, f) for f in rawfiles] 314 | # or get a list of files using glob 315 | else: 316 | files = glob(path) 317 | return files 318 | 319 | 320 | def aggregate_into_CFA(output_master_array, path, axis, 321 | cfa_version, common_date=None): 322 | """Aggregate the netCDF files in directory into a CFA master-array file""" 323 | # get the list of files first of all 324 | files = get_file_list(path) 325 | # create the s3Dataset 326 | # create the output master array file 327 | out_dataset = s3Dataset( 328 | output_master_array, 329 | mode='w', 330 | clobber=True, 331 | diskless=False, 332 | cfa_version=cfa_version 333 | ) 334 | # create the partitions from the list - these will be created in the order 335 | # that the files are read in 336 | create_partitions_from_files(out_dataset, files, axis, 337 | cfa_version, common_date) 338 | # we need to sort the partition matrices for each variable - i.e. there is 339 | # one matrix per variable 340 | sort_partition_matrices(out_dataset, axis) 341 | # close the dataset to write / upload it 342 | out_dataset.close() 343 | -------------------------------------------------------------------------------- /S3netCDF4/CFA/Parsers/_CFAnetCDFParser.pyx: -------------------------------------------------------------------------------- 1 | #!python 2 | #cython: language_level=3 3 | 4 | __copyright__ = "(C) 2020 Science and Technology Facilities Council" 5 | __license__ = "BSD - see LICENSE file in top-level directory" 6 | __authors__ = "Neil Massey" 7 | 8 | """ 9 | Parser to read / write CFA metadata from / to a netCDF file. 10 | 11 | See: 12 | http://www.met.reading.ac.uk/~david/cfa/0.4/index.html 13 | for the v0.4 specification of the CFA conventions and an overview of the 14 | CFA conventions 15 | 16 | s3netCDF-python uses an updated version (v0.5) of the CFA conventions which, 17 | rather than writing the partition information to a netCDF attribute as a 18 | string, writes the partition information to variables inside a group. 
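
# Hedged sketch of the calendar rebasing done by get_universal_times() in
# S3netCDF4/utils/agg.py above: time values from files with different base
# dates are converted to datetimes and re-encoded against a single common
# units string, so that partition locations along the time axis are
# comparable. Requires the netCDF4 / cftime packages; the units strings are
# illustrative only.
import numpy as np
from netCDF4 import num2date, date2num

values = np.array([0.0, 1.0, 2.0])        # as read from one sub-file
units = "days since 2000-02-01"           # that file's own time units
calendar = "standard"
common_date = "days since 2000-01-01"     # units shared by the aggregation

dates = num2date(values, units, calendar)
print(date2num(dates, common_date, calendar))   # ~ [31, 32, 33] on the common axis
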
19 | """ 20 | 21 | from S3netCDF4.CFA._CFAExceptions import CFAParserError 22 | from S3netCDF4.CFA._CFAClasses import ( 23 | CFADataset, CFAGroup, CFAVariable, CFADimension 24 | ) 25 | import netCDF4._netCDF4 as netCDF4 26 | import json 27 | 28 | from S3netCDF4.CFA.Parsers._CFAParser import CFA_Parser 29 | 30 | class CFA_netCDFParser(CFA_Parser): 31 | 32 | def __init__(self): 33 | """Do nothing but set the CFA version used, but don't call the base 34 | class as that will raise NotImplementedError""" 35 | self.CFA_conventions = "CFA" 36 | 37 | def is_file(self, nc_dataset): 38 | """Return whether this input nc_dataset has the requisite metadata to 39 | mark it as a CFA file.""" 40 | if not "Conventions" in nc_dataset.ncattrs(): 41 | return False 42 | if not "CFA" in nc_dataset.getncattr("Conventions"): 43 | return False 44 | return True 45 | 46 | def __get_cfa_version(self, nc_dataset): 47 | """Parse the Conventions attribute to get the CFA version.""" 48 | if not "Conventions" in nc_dataset.ncattrs(): 49 | raise CFAParserError("Not a CFA file.") 50 | else: 51 | conventions = nc_dataset.getncattr("Conventions").split(" ") 52 | cfa_version = "0.0" 53 | for c in conventions: 54 | if "CFA-" in c: 55 | cfa_version = c[4:] 56 | if cfa_version == "0.0": 57 | raise CFAParserError("Not a CFA file.") 58 | return cfa_version 59 | 60 | def __create_s3vars_and_dims(self, s3_object, nc_object, cfa_object): 61 | """Consolidate the variables and dimensions in the nc_object (which may 62 | be a dataset or a group) into the s3_object (which may also be a dataset 63 | or a group), matching them up with the variables and dimensions in the 64 | cfa_object (again, which may be a dataset or group) 65 | """ 66 | from S3netCDF4._s3netCDF4 import s3Dimension, s3Variable 67 | # loop over the variables 68 | s3_object._s3_variables = {} # reset to empty 69 | for var in nc_object.variables: 70 | nc_var = nc_object.variables[var] 71 | if var in cfa_object.getVariables(): 72 | cfa_var = cfa_object.getVariable(var) 73 | # create the s3Variable with links to the cfa variable and nc_var 74 | s3_object._s3_variables[var] = s3Variable( 75 | nc_var=nc_var, 76 | cfa_var=cfa_var, 77 | parent=s3_object 78 | ) 79 | 80 | else: 81 | s3_object._s3_variables[var] = nc_var 82 | 83 | # loop over the dimensions 84 | s3_object._s3_dimensions = {} # reset to empty 85 | for dim in nc_object.dimensions: 86 | nc_dim = nc_object.dimensions[dim] 87 | if dim in cfa_object.getDimensions(): 88 | cfa_dim = cfa_object.getDimension(dim) 89 | # set the datatype for the cfa_dim by getting the type from the 90 | # associated variable 91 | if dim in nc_object.variables: 92 | nc_var = nc_object.variables[dim] 93 | cfa_dim.setType(nc_var.dtype) 94 | # create the s3Dimension with links to the cfa dimension and 95 | # nc_dim 96 | s3_object._s3_dimensions[dim] = s3Dimension( 97 | nc_dim=nc_dim, 98 | cfa_dim=cfa_dim, 99 | parent=s3_object 100 | ) 101 | else: 102 | s3_object._s3_dimensions[dim] = nc_dim 103 | 104 | def __consolidate_from_read(self, s3_dataset): 105 | """Consolidate a s3_dataset from the file read in. 106 | s3_dataset contains a netCDF dataset (_nc_dataset). This contains all 107 | the definitions of the variables, dimensions and groups as netCDF4 108 | variables, dimensions and groups. We want to convert these into their 109 | s3 equivalents (s3_variable, s3_dimension and s3_group). 110 | This involves directly manipulating the s3_dataset object. 
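
# Hedged sketch of the Conventions parsing performed by __get_cfa_version()
# above: the attribute is split on spaces and the token starting with "CFA-"
# supplies the version. The attribute value and the ValueError (in place of
# CFAParserError) are illustrative.
def get_cfa_version(conventions_attr):
    cfa_version = "0.0"
    for c in conventions_attr.split(" "):
        if "CFA-" in c:
            cfa_version = c[4:]
    if cfa_version == "0.0":
        raise ValueError("Not a CFA file.")
    return cfa_version

print(get_cfa_version("CF-1.7 CFA-0.5"))   # -> 0.5
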
111 | """ 112 | from S3netCDF4._s3netCDF4 import s3Group 113 | nc_dataset = s3_dataset._nc_dataset 114 | cfa_dataset = s3_dataset._cfa_dataset 115 | 116 | # loop over the variables and dimensions (in the root group) 117 | if "root" in s3_dataset._cfa_dataset.getGroups(): 118 | cfa_grp = s3_dataset._cfa_dataset.getGroup("root") 119 | else: 120 | cfa_grp = s3_dataset._cfa_dataset.createGroup("root") 121 | self.__create_s3vars_and_dims(s3_dataset, nc_dataset, cfa_grp) 122 | 123 | # loop over the groups 124 | for grp in nc_dataset.groups: 125 | nc_grp = nc_dataset.groups[grp] 126 | if grp in cfa_dataset.getGroups(): 127 | cfa_grp = cfa_dataset.getGroup(grp) 128 | # create the s3Group with links to the cfa group and nc_grp 129 | s3_dataset._s3_groups[grp] = s3Group( 130 | cfa_grp=cfa_grp, 131 | nc_grp=nc_grp, 132 | parent=s3_dataset 133 | ) 134 | # create the vars and dims in the group 135 | self.__create_s3vars_and_dims( 136 | s3_dataset._s3_groups[grp], 137 | nc_grp, 138 | cfa_grp 139 | ) 140 | 141 | else: 142 | s3_dataset._s3_groups[grp] = nc_grp 143 | 144 | 145 | def read(self, s3_dataset, filename=""): 146 | """Parse an already open s3_dataset to build the _CFAClasses 147 | hierarchy. 148 | 149 | Args: 150 | netcdf_dataset (Dataset): the open dataset from the netcdf4-python 151 | library. 152 | 153 | Returns: 154 | CFADataset: The CFADataset object, populated with CFAGroups, which 155 | are in turn populated with CFADims and CFAVariables. 156 | """ 157 | # get the netCDF dataset from the s3 dataset 158 | nc_dataset = s3_dataset._nc_dataset 159 | # check this is a CFA file 160 | if not self.is_file(nc_dataset): 161 | raise CFAParserError("Not a CFA file.") 162 | 163 | # get the cfa version so we can interpret it as CFA-0.5 (in netCDF4 164 | # format) or CFA-0.4 (in netCDF3, CLASSIC or netCDF4 format) 165 | cfa_version = self.__get_cfa_version(nc_dataset) 166 | # check to see if there are any groups and, if there is, create a CFAgroup 167 | # and add the nc_group to a dictionary of groups. 
Start with the root 168 | # group pointing to the base Dataset 169 | nc_groups = {"root" : nc_dataset} 170 | if len(nc_dataset.groups) != 0: 171 | for grp_name in nc_dataset.groups: 172 | nc_groups[grp_name] = nc_dataset.groups[grp_name] 173 | 174 | # get the metadata from the dataset in a new dictionary 175 | nc_dataset_md = {a:nc_dataset.getncattr(a) for a in nc_dataset.ncattrs()} 176 | # create the CFADataset, with the metadata and format, and empty groups 177 | cfa_dataset = CFADataset( 178 | name=filename, 179 | format=nc_dataset.data_model, 180 | metadata=nc_dataset_md, 181 | cfa_version=cfa_version 182 | ) 183 | # now loop over all the groups, and add a CFAGroup to each dataset, then 184 | # the CFAVariables and CFADimensions contained in that group 185 | output_groups = {} 186 | for group_name in nc_groups: 187 | nc_group = nc_groups[group_name] 188 | nc_group_md = {a:nc_group.getncattr(a) for a in nc_group.ncattrs()} 189 | cfa_group = cfa_dataset.createGroup(group_name, nc_group_md) 190 | # next parse the dimensions 191 | for nc_dimname in nc_group.dimensions: 192 | # get the dimension 193 | nc_dim = nc_group.dimensions[nc_dimname] 194 | # get the dimension's associated variable 195 | try: 196 | nc_dim_var = nc_group.variables[nc_dimname] 197 | # get the metadata from the dim var 198 | nc_dim_var_md = { 199 | a:nc_dim_var.getncattr(a) for a in nc_dim_var.ncattrs() 200 | } 201 | except KeyError: 202 | nc_dim_var_md = {} 203 | # create the dimension and append to list of cfa_dims 204 | cfa_dim = cfa_group.createDimension( 205 | dim_name=nc_dimname, 206 | dim_len=nc_dim.size, 207 | metadata=nc_dim_var_md 208 | ) 209 | 210 | # loop over the variables in the group / dataset 211 | for nc_varname in nc_group.variables: 212 | nc_var = nc_group.variables[nc_varname] 213 | nc_var_md = {a:nc_var.getncattr(a) for a in nc_var.ncattrs()} 214 | if "cf_role" in nc_var_md: 215 | cfa_var = cfa_group.createVariable( 216 | var_name=nc_varname, 217 | nc_dtype=nc_var.dtype, 218 | metadata=nc_var_md 219 | ) 220 | if cfa_version == "0.4": 221 | # this parses from the 0.4 version - i.e all the 222 | # metadata is stored in the netCDF attributes 223 | cfa_var.parse(nc_var_md) 224 | elif cfa_version == "0.5": 225 | # this parses from the 0.5 version - i.e. all the 226 | # metadata is stored in a variable in a group 227 | cfa_var.load(nc_var_md, nc_group) 228 | else: 229 | raise CFAParserError( 230 | "Unsupported CFA version ({}) in file.".format( 231 | cfa_version 232 | ) 233 | ) 234 | # load the cfa_dataset into the s3_dataset that was passed in 235 | s3_dataset._cfa_dataset = cfa_dataset 236 | # need to "consolidate" the dataset - create s3 variants of the netCDF 237 | # groups, variables and dimensions - call .consolidate_from_read 238 | # on the s3_dataset passed in 239 | self.__consolidate_from_read(s3_dataset) 240 | 241 | def write(self, cfa_dataset, s3_dataset): 242 | """Write the _CFAClasses hierarchy to an already open netcdf_dataset 243 | (opened with 'w' write flag). 244 | 245 | Args: 246 | cfa_dataset (CFADataset): the top class in the _CFAClasses hierarchy 247 | s3_dataset (Dataset): the open dataset from the netcdf4-python 248 | library. Has to have been opened with 'w' flag. 
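        As an illustrative sketch (the variable and dimension names are
        assumptions, and the cf_role string is whatever cfa_var.getRole()
        reports), the attributes attached to a master-array variable are
        roughly:

            # CFA-0.4: partition description serialised into an attribute
            {"cf_role": "...", "cfa_dimensions": "time latitude longitude",
             "cfa_array": "<JSON dump of the partition information>"}
            # CFA-0.5: partition data lives in a group named by an attribute
            {"cf_role": "...", "cfa_dimensions": "time latitude longitude",
             "cfa_group": "cfa_tas"}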
249 | 250 | Returns: 251 | None 252 | """ 253 | # add the CFA conventions into the metadata 254 | dataset_metadata = cfa_dataset.getMetadata() 255 | cfa_version = cfa_dataset.getCFAVersion() 256 | cfa_conventions = self.CFA_conventions + "-{}".format(cfa_version) 257 | # get the underlying netCDF4 dataset 258 | nc_dataset = s3_dataset._nc_dataset 259 | if "Conventions" in dataset_metadata: 260 | dataset_metadata["Conventions"] += " " + cfa_conventions 261 | else: 262 | dataset_metadata["Conventions"] = cfa_conventions 263 | 264 | # set the global metadata 265 | nc_dataset.setncatts(dataset_metadata) 266 | # get the groups 267 | for group in cfa_dataset.getGroups(): 268 | # get the actual group 269 | cfa_group = cfa_dataset.getGroup(group) 270 | if (group == "root"): 271 | s3_group = s3_dataset 272 | nc_group = nc_dataset # just a shortcut to the nc_group 273 | else: 274 | s3_group = s3_dataset.groups[group] 275 | nc_group = s3_group._nc_grp # shortcut 276 | # set the metadata for the group 277 | nc_group.setncatts(cfa_group.getMetadata()) 278 | 279 | # set the metadata for the variables 280 | for var in cfa_group.getVariables(): 281 | # get the actual cfa variable 282 | cfa_var = cfa_group.getVariable(var) 283 | # get the variable 284 | nc_var = s3_group._s3_variables[var]._nc_var 285 | # get the variable metadata 286 | var_md = dict(cfa_var.getMetadata()) 287 | # add the cfa metadata - if it is a cfa variable 288 | if cfa_var.getRole() != "": 289 | var_md['cf_role'] = cfa_var.getRole() 290 | var_md['cfa_dimensions'] = " ".join(cfa_var.getDimensions()) 291 | # if the convention version is >= 0.5 then the data has 292 | # already been written into the cfa metagroup 293 | # for v0.4 we need to dump it into the attribute string 294 | if cfa_version == "0.4": 295 | # write the partition data 296 | var_md['cfa_array'] = json.dumps(cfa_var.dump()['cfa_array']) 297 | elif cfa_version == "0.5": 298 | # just need to name the cfa_metagroup as an attribute in 299 | # the original variable 300 | var_md['cfa_group'] = "cfa_" + var 301 | else: 302 | raise CFAParserError( 303 | "Unsupported CFA version ({}) in file.".format( 304 | cfa_version 305 | ) 306 | ) 307 | # set the metadata for the variable 308 | nc_var.setncatts(var_md) 309 | # set the metadata for the dimension variables 310 | for dim_var in cfa_group.getDimensions(): 311 | # get the actual cfa dimensions 312 | cfa_dim = cfa_group.getDimension(dim_var) 313 | # get the netCDF variable for this dimension 314 | try: 315 | nc_dimvar = s3_group.variables[cfa_dim.getName()]._nc_var 316 | # copy the dimension metadata into the (dimension) variable 317 | # metadata 318 | dim_md = dict(cfa_dim.getMetadata()) 319 | # set the metadata for the variable 320 | nc_dimvar.setncatts(dim_md) 321 | except KeyError: 322 | pass # don't try to write to dimension with no associated 323 | # variable 324 | -------------------------------------------------------------------------------- /S3netCDF4/Backends/_s3FileObject.pyx: -------------------------------------------------------------------------------- 1 | #!python 2 | #cython: language_level=3 3 | 4 | __copyright__ = "(C) 2019-2021 Science and Technology Facilities Council" 5 | __license__ = "BSD - see LICENSE file in top-level directory" 6 | __authors__ = "Neil Massey" 7 | 8 | import io 9 | from fnmatch import fnmatch 10 | from urllib.parse import urlparse 11 | 12 | from botocore.exceptions import ClientError 13 | import botocore.session 14 | 15 | from S3netCDF4.Managers._ConnectionPool import ConnectionPool 16 | 
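# Illustrative usage sketch (not part of this module's code): the file object
# below is normally driven through a with statement.  The endpoint, bucket,
# key and credential values are assumptions; the credential dictionary keys
# ("accessKey" / "secretKey") match those read in connect().
#
#     credentials = {"accessKey": "my-key", "secretKey": "my-secret"}
#     uri = "http://s3.example.com/my-bucket/path/to/object.nc"
#     with s3FileObject(uri, credentials, mode='w') as fo:
#         fo.write(b"some bytes")   # buffered locally, uploaded on flush/close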
from S3netCDF4.Managers._ConfigManager import Config 17 | from S3netCDF4._Exceptions import APIException, IOException 18 | 19 | class s3FileObject(io.BufferedIOBase): 20 | """Custom file object class, inheriting from Python io.Base, to read from 21 | an S3 object store / AWS cloud storage.""" 22 | 23 | """Static connection pool object - i.e. shared across the file objects.""" 24 | _connection_pool = ConnectionPool() 25 | 26 | # The defaults for MAXIMUM_PART_SIZE etc. are now assigned in 27 | # __init__ if no values are found in ~/.s3nc.json 28 | """Static config object for the backend options""" 29 | _config = Config() 30 | 31 | def _get_server_bucket_object(uri): 32 | """Get the server name from the URI""" 33 | # First split the uri into the network location and path, and build the 34 | # server 35 | url_p = urlparse(uri) 36 | # check that the uri contains a scheme and a netloc 37 | if url_p.scheme == '' or url_p.netloc == '': 38 | raise APIException( 39 | "URI supplied to s3FileObject is not well-formed: {}".format(uri) 40 | ) 41 | server = url_p.scheme + "://" + url_p.netloc 42 | split_path = url_p.path.split("/") 43 | # get the bucket 44 | try: 45 | bucket = split_path[1] 46 | except IndexError as e: 47 | raise APIException( 48 | "URI supplied has no bucket contained within it: {}".format(uri) 49 | ) 50 | # get the path 51 | try: 52 | path = "/".join(split_path[2:]) 53 | except IndexError as e: 54 | raise APIException( 55 | "URI supplied has no path contained within it: {}".format(uri) 56 | ) 57 | return server, bucket, path 58 | 59 | def __init__(self, uri, credentials, mode='r', create_bucket=True, 60 | part_size=None, max_parts=None, multipart_upload=None, 61 | multipart_download=None, connect_timeout=None, 62 | read_timeout=None): 63 | """Initialise the file object by creating or reusing a connection in the 64 | connection pool.""" 65 | # get the server, bucket and the key from the endpoint url 66 | self._server, self._bucket, self._path = s3FileObject._get_server_bucket_object(uri) 67 | self._closed = False # set the file to be not closed 68 | self._mode = mode 69 | self._seek_pos = 0 70 | self._buffer = [io.BytesIO()] # have a list of objects that can stream 71 | self._credentials = credentials 72 | self._create_bucket = create_bucket 73 | self._uri = uri 74 | 75 | """Either get the backend config from the parameters, or the config file 76 | or use defaults.""" 77 | if "s3FileObject" in s3FileObject._config["backends"]: 78 | backend_config = s3FileObject._config["backends"]["s3FileObject"] 79 | else: 80 | backend_config = {} 81 | 82 | if part_size: 83 | self._part_size = int(part_size) 84 | elif "maximum_part_size" in backend_config: 85 | self._part_size = int(backend_config["maximum_part_size"]) 86 | else: 87 | self._part_size = int(50 * 1024 * 1024) 88 | 89 | if max_parts: 90 | self._max_parts = int(max_parts) 91 | elif "maximum_parts" in backend_config: 92 | self._max_parts = int(backend_config["maximum_parts"]) 93 | else: 94 | self._max_parts = 8 95 | 96 | if multipart_upload: 97 | self._multipart_upload = multipart_upload 98 | elif "multipart_upload" in backend_config: 99 | self._multipart_upload = backend_config["multipart_upload"] 100 | else: 101 | self._multipart_upload = True 102 | 103 | if multipart_download: 104 | self._multipart_download = multipart_download 105 | elif "multipart_download" in backend_config: 106 | self._multipart_download = backend_config["multipart_download"] 107 | else: 108 | self._multipart_download = True 109 | 110 | if connect_timeout: 111 | 
self._connect_timeout = connect_timeout 112 | elif "connect_timeout" in backend_config: 113 | self._connect_timeout = backend_config["connect_timeout"] 114 | else: 115 | self._connect_timeout = 30.0 116 | 117 | if read_timeout: 118 | self._read_timeout = read_timeout 119 | elif "read_timeout" in backend_config: 120 | self._read_timeout = backend_config["read_timeout"] 121 | else: 122 | self._read_timeout = 30.0 123 | 124 | def __enter__(self): 125 | """Create the connection on an enter.""" 126 | self.connect() 127 | return self 128 | 129 | def __exit__(self, exc_type, exc_value, exc_tb): 130 | """Close the file on the exit of a with statement, or by the garbage 131 | collector removing the object.""" 132 | self.close() 133 | # check for any exceptions 134 | if exc_type is not None: 135 | return False 136 | return True 137 | 138 | def _getsize(self): 139 | # Use content length in the head object to determine the size of 140 | # the file / object 141 | # If we are writing then the size should be the buffer size 142 | try: 143 | if 'w' in self._mode: 144 | size = self._part_size 145 | else: 146 | response = self._conn_obj.conn.head_object( 147 | Bucket=self._bucket, 148 | Key=self._path 149 | ) 150 | size = response['ContentLength'] 151 | except ClientError as e: 152 | raise IOException( 153 | "Could not get size of object {}".format(self._path) 154 | ) 155 | except AttributeError as e: 156 | self._handle_connection_exception(e) 157 | return size 158 | 159 | def _get_bucket_list(self): 160 | # get the names of the buckets in a list 161 | try: 162 | bl = self._conn_obj.conn.list_buckets()['Buckets'] # this returns a dict 163 | bucket_list = [b['Name'] for b in bl] 164 | except AttributeError as e: 165 | self._handle_connection_exception(e) 166 | return bucket_list 167 | 168 | def _handle_connection_exception(self, e): 169 | # Check if connection made 170 | if ("_conn_obj" in e.args[0] or "_current_part" in e.args[0]): 171 | raise APIException( 172 | "Connection to S3 server is not established. Use either the " 173 | ".connect method or a with statement." 174 | ) 175 | else: 176 | # other AttributeError - handle that separately 177 | raise e 178 | 179 | def connect(self): 180 | """Connect to the s3 server with the details passed in via the __init__ 181 | method.""" 182 | # if the connection returns None then either there isn't a connection to 183 | # the server in the pool, or there is no connection that is available 184 | self._conn_obj = s3FileObject._connection_pool.get(self._server) 185 | if self._conn_obj is None: 186 | try: 187 | session = botocore.session.get_session() 188 | config = botocore.config.Config( 189 | connect_timeout=self._connect_timeout, 190 | read_timeout=self._connect_timeout 191 | ) 192 | s3c = session.create_client( 193 | "s3", 194 | endpoint_url=self._server, 195 | aws_access_key_id=self._credentials["accessKey"], 196 | aws_secret_access_key=self._credentials["secretKey"], 197 | config=config 198 | ) 199 | # add the connection to the connection pool 200 | self._conn_obj = s3FileObject._connection_pool.add( 201 | s3c, self._server 202 | ) 203 | except ClientError as e: 204 | raise IOException( 205 | "Could not connect to S3 endpoint {} {}".format( 206 | self._server, e) 207 | ) 208 | if ('r' in self._mode and '*' not in self._path and 209 | '?' 
not in self._path): 210 | # if this is a read method then check the file exists 211 | response = self._conn_obj.conn.list_objects_v2( 212 | Bucket=self._bucket, 213 | Prefix=self._path 214 | ) 215 | exists = False 216 | for obj in response.get('Contents', []): 217 | if obj['Key'] == self._path: 218 | exists = True 219 | break 220 | if not exists: 221 | raise IOException( 222 | "Object does not exist: {}/{}/{}".format( 223 | self._server, self._bucket, self._path 224 | ) 225 | ) 226 | if 'w' in self._mode: 227 | # if this is a write method then create a bytes array 228 | self._current_part = 1 229 | if 'a' in self._mode or '+' in self._mode: 230 | raise APIException( 231 | "Appending to files is not supported {}".format(self._path) 232 | ) 233 | return True 234 | 235 | def detach(self): 236 | """Separate the underlying raw stream from the buffer and return it. 237 | Not supported in S3.""" 238 | raise io.UnsupportedOperation 239 | 240 | def read(self, size=-1): 241 | """Read and return up to size bytes. For the S3 implementation the size 242 | can be used for RangeGet. If size==-1 then the whole object is streamed 243 | into memory.""" 244 | # read the object using the bucket and path already determined in 245 | # __init__, and using the connection object 246 | try: 247 | if size== -1: 248 | s3_object = self._conn_obj.conn.get_object( 249 | Bucket = self._bucket, 250 | Key = self._path 251 | ) 252 | body = s3_object['Body'] 253 | else: 254 | # do the partial / range get version, and increment the seek 255 | # pointer 256 | range_end = self._seek_pos+size-1 257 | file_size = self._getsize() 258 | if range_end >= file_size: 259 | range_end = file_size-1 260 | 261 | if not self._multipart_download: 262 | s3_object = self._conn_obj.conn.get_object( 263 | Bucket = self._bucket, 264 | Key = self._path, 265 | ) 266 | body = s3_object['Body'] 267 | else: 268 | s3_object = self._conn_obj.conn.get_object( 269 | Bucket = self._bucket, 270 | Key = self._path, 271 | Range = 'bytes={}-{}'.format( 272 | self._seek_pos, range_end 273 | ) 274 | ) 275 | self._seek_pos += size 276 | body = s3_object['Body'] 277 | except ClientError as e: 278 | raise IOException( 279 | "Could not read from object {} {}".format(self._path, e) 280 | ) 281 | except AttributeError as e: 282 | self._handle_connection_exception(e) 283 | return body.read() 284 | 285 | def read1(self, size=-1): 286 | """Just call read.""" 287 | return self.read(size=size) 288 | 289 | def readinto(self, b): 290 | """Read bytes into a pre-allocated, writable bytes-like object b and 291 | return the number of bytes read. 292 | In S3 the entire file is read into the bytesbuffer. It is important 293 | that the bytesbuffer is big enough to hold the entire file.""" 294 | # get the size of the file 295 | size = self._getsize() 296 | b[:size] = self.read(size) 297 | return size 298 | 299 | def readinto1(self, b): 300 | """Just call readinto""" 301 | return self.readinto(b) 302 | 303 | def _multipart_upload_from_buffer(self): 304 | """Do a multipart upload from the buffer. 305 | There are two cases: 306 | 1. The size is exactly the same size as the self._part_size 307 | 2. 
The size is greater than the self._part_size 308 | """ 309 | # check to see if bucket needs to be created 310 | if self._create_bucket: 311 | # check whether the bucket exists 312 | bucket_list = self._get_bucket_list() 313 | if not self._bucket in bucket_list: 314 | self._conn_obj.conn.create_bucket(Bucket=self._bucket) 315 | 316 | # if the current part is 1 we have to create the multipart upload 317 | if self._current_part == 1: 318 | response = self._conn_obj.conn.create_multipart_upload( 319 | Bucket = self._bucket, 320 | Key = self._path 321 | ) 322 | self._upload_id = response['UploadId'] 323 | # we need to keep a track of the multipart info 324 | self._multipart_info = {'Parts' : []} 325 | 326 | # upload from a buffer - do we need to split into more than one 327 | # multiparts? Remember: self._buffer is a list of BytesIO objects 328 | new_buffer = [] 329 | for buffer_part in range(0, len(self._buffer)): 330 | # is the current part of the buffer larger than the maximum 331 | # upload size? split if it is 332 | data_buf = self._buffer[buffer_part] 333 | data_len = data_buf.tell() 334 | if data_len >= self._part_size: 335 | data_buf.seek(0) 336 | data_pos = 0 337 | # split the file up 338 | while data_pos < data_len: 339 | new_buffer.append(io.BytesIO()) 340 | # copy the data - don't overstep the buffer 341 | if data_pos + self._part_size >= data_len: 342 | sub_data = data_buf.read(data_len-data_pos) 343 | else: 344 | sub_data = data_buf.read(self._part_size) 345 | new_buffer[-1].write(sub_data) 346 | # increment to next 347 | data_pos += self._part_size 348 | 349 | # free the old memory 350 | self._buffer[buffer_part].close() 351 | else: 352 | self._buffer[buffer_part].seek(0) 353 | new_buffer.append(io.BytesIO(self._buffer[buffer_part].read())) 354 | 355 | self._buffer = new_buffer 356 | 357 | for buffer_part in range(0, len(self._buffer)): 358 | # seek in the BytesIO buffer to get to the beginning after the 359 | # writing§ 360 | self._buffer[buffer_part].seek(0) 361 | # upload here 362 | part = self._conn_obj.conn.upload_part( 363 | Bucket=self._bucket, 364 | Key=self._path, 365 | UploadId=self._upload_id, 366 | PartNumber=self._current_part, 367 | Body=self._buffer[buffer_part] 368 | ) 369 | # insert into the multipart info list of dictionaries 370 | self._multipart_info['Parts'].append( 371 | { 372 | 'PartNumber' : self._current_part, 373 | 'ETag' : part['ETag'] 374 | } 375 | ) 376 | self._current_part += 1 377 | 378 | # reset all the byte buffers and their positions 379 | for buffer_part in range(0, len(self._buffer)): 380 | self._buffer[buffer_part].close() 381 | self._buffer = [io.BytesIO()] 382 | self._seek_pos = 0 383 | self._current_part += 1 384 | 385 | def write(self, b): 386 | """Write the given bytes-like object, b, and return the number of bytes 387 | written (always equal to the length of b in bytes, since if the write 388 | fails an OSError will be raised). 389 | For the S3 file object we just write the file to a temporary bytearray 390 | and increment the seek_pos. 391 | This data will be uploaded to an object when .flush is called. 392 | """ 393 | if "w" not in self._mode: 394 | raise APIException( 395 | "Trying to write to a read only file, where mode != 'w'." 396 | ) 397 | try: 398 | # add to local, temporary bytearray 399 | size = len(b) 400 | self._buffer[-1].write(b) 401 | self._seek_pos += size 402 | # test to see whether we should do a multipart upload now 403 | # this occurs when the number of buffers is > the maximum number of 404 | # parts. 
self._current_part is indexed from 1 405 | if (self._multipart_upload and 406 | self._seek_pos > self._part_size): 407 | if len(self._buffer) == self._max_parts: 408 | self._multipart_upload_from_buffer() 409 | else: 410 | # add another buffer to write to 411 | self._buffer.append(io.BytesIO()) 412 | 413 | except ClientError as e: 414 | raise IOException( 415 | "Could not write to object {} {}".format(self._path, e) 416 | ) 417 | except AttributeError as e: 418 | self._handle_connection_exception(e) 419 | 420 | return size 421 | 422 | def close(self): 423 | """Flush and close this stream. This method has no effect if the file is 424 | already closed. Once the file is closed, any operation on the file (e.g. 425 | reading or writing) will raise a ValueError. 426 | 427 | As a convenience, it is allowed to call this method more than once; only 428 | the first call, however, will have an effect.""" 429 | try: 430 | if not self._closed: 431 | # self.flush will upload the buffer to the S3 store 432 | self.flush() 433 | s3FileObject._connection_pool.release(self._conn_obj) 434 | self._closed = True 435 | except AttributeError as e: 436 | self._handle_connection_exception(e) 437 | return True 438 | 439 | def seek(self, offset, whence=io.SEEK_SET): 440 | """Change the stream position to the given byte offset. offset is 441 | interpreted relative to the position indicated by whence. The default 442 | value for whence is SEEK_SET. Values for whence are: 443 | 444 | SEEK_SET or 0 – start of the stream (the default); offset should be zero 445 | or positive 446 | SEEK_CUR or 1 – current stream position; offset may be negative 447 | SEEK_END or 2 – end of the stream; offset is usually negative 448 | Return the new absolute position. 449 | 450 | Note: currently cannot seek when writing a file. 451 | 452 | """ 453 | if self._mode == 'w': 454 | raise IOException( 455 | "Cannot seek within a file that is being written to." 456 | ) 457 | 458 | size = self._getsize() 459 | error_string = "Seek {} is outside file size bounds 0->{} for file {}" 460 | seek_pos = self._seek_pos 461 | if whence == io.SEEK_SET: 462 | # range check 463 | seek_pos = offset 464 | elif whence == io.SEEK_CUR: 465 | seek_pos += offset 466 | elif whence == io.SEEK_END: 467 | seek_pos = size - offset 468 | 469 | # range checks 470 | if (seek_pos >= size): 471 | raise IOException(error_string.format( 472 | seek_pos, 473 | size, 474 | self._path) 475 | ) 476 | elif (seek_pos < 0): 477 | raise IOException(error_string.format( 478 | seek_pos, 479 | size, 480 | self._path) 481 | ) 482 | self._seek_pos = seek_pos 483 | return self._seek_pos 484 | 485 | def seekable(self): 486 | """We can seek in s3 streams using the range get and range put features. 487 | """ 488 | return True 489 | 490 | def tell(self): 491 | """Return True if the stream supports random access. If False, seek(), 492 | tell() and truncate() will raise OSError.""" 493 | return self._seek_pos 494 | 495 | def fileno(self): 496 | """Return the underlying file descriptor (an integer) of the stream if 497 | it exists. An IOError is raised if the IO object does not use a file 498 | descriptor.""" 499 | raise io.UnsupportedOperation 500 | 501 | def flush(self): 502 | """Flush the write buffers of the stream. 
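        (Illustrative aside on the read path of this class, since seek() above
        only moves the internal pointer: a subsequent read(size) issues a
        ranged GET.  The uri, credentials and offsets below are assumptions.)

            credentials = {"accessKey": "my-key", "secretKey": "my-secret"}
            uri = "http://s3.example.com/my-bucket/path/to/object.nc"
            with s3FileObject(uri, credentials, mode='r') as fo:
                fo.seek(1024)          # no request made; only _seek_pos moves
                chunk = fo.read(512)   # GET with Range: bytes=1024-1535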
This will upload the contents 503 | of the final multipart upload of self._buffer to the S3 store.""" 504 | try: 505 | if 'w' in self._mode: 506 | # if the size is less than the MAXIMUM UPLOAD SIZE 507 | # then just write the data 508 | size = self._buffer[0].tell() 509 | if self._current_part == 1 and size < self._part_size: 510 | if self._create_bucket: 511 | # check whether the bucket exists and create if not 512 | bucket_list = self._get_bucket_list() 513 | if not self._bucket in bucket_list: 514 | self._conn_obj.conn.create_bucket( 515 | Bucket=self._bucket 516 | ) 517 | # upload the whole buffer - seek back to the start first 518 | self._buffer[0].seek(0) 519 | self._conn_obj.conn.put_object( 520 | Bucket=self._bucket, 521 | Key=self._path, 522 | Body=self._buffer[0].read(size) 523 | ) 524 | else: 525 | # upload as multipart 526 | self._multipart_upload_from_buffer() 527 | # finalise the multipart upload 528 | self._conn_obj.conn.complete_multipart_upload( 529 | Bucket=self._bucket, 530 | Key=self._path, 531 | UploadId=self._upload_id, 532 | MultipartUpload=self._multipart_info 533 | ) 534 | except AttributeError as e: 535 | self._handle_connection_exception(e) 536 | return True 537 | 538 | def readable(self): 539 | """Return True if the stream can be read from. If False, read() will 540 | raise IOError.""" 541 | return 'r' in self._mode or '+' in self._mode 542 | 543 | def readline(self, size=-1): 544 | """Read and return one line from the stream. 545 | If size is specified, at most size bytes will be read.""" 546 | if 'b' in self._mode: 547 | raise APIException( 548 | "readline on a binary file is not permitted: {}".format( 549 | self._uri) 550 | ) 551 | # only read a set number of bytes if size is passed in, otherwise 552 | # read upto the file size 553 | if size == -1: 554 | size = self._getsize() 555 | 556 | # use the BytesIO readline methods 557 | if self.tell() == 0: 558 | buffer = self.read(size=size) 559 | self._buffer[-1].write(buffer) 560 | self._buffer[-1].seek(0) 561 | 562 | line = self._buffer[-1].readline().decode().strip() 563 | return line 564 | 565 | def readlines(self, hint=-1): 566 | """Read and return a list of lines from the stream. hint can be 567 | specified to control the number of lines read: no more lines will be 568 | read if the total size (in bytes/characters) of all lines so far exceeds 569 | hint.""" 570 | if 'b' in self._mode: 571 | raise APIException( 572 | "readline on a binary file is not permitted: {}".format( 573 | self._uri) 574 | ) 575 | # read the entire file in and decode it 576 | lines = self.read().decode().split("\n") 577 | return lines 578 | 579 | def truncate(self, size=None): 580 | """Not supported""" 581 | raise io.UnsupportedOperation 582 | 583 | def writable(self): 584 | """Return True if the stream supports writing. If False, write() and 585 | truncate() will raise IOError.""" 586 | return 'w' in self._mode 587 | 588 | def writelines(self, lines): 589 | """Write a list of lines to the stream.""" 590 | # first check if the file is binary or not 591 | if 'b' in self._mode: 592 | raise APIException( 593 | "writelines on a binary file is not permitted: {}".format( 594 | self._uri) 595 | ) 596 | # write all but the last line with a line break 597 | for l in lines: 598 | self.write((l+"\n").encode('utf-8')) 599 | return True 600 | 601 | def glob(self): 602 | """Emulate glob on an open bucket. 
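        (A hedged sketch of the prefix / wildcard split performed below; the
        key pattern is an assumption.)

            from fnmatch import fnmatch
            path = "subarrays/temp_*.nc"                # assumed key pattern
            pi = min(path.index("*") if "*" in path else len(path),
                     path.index("?") if "?" in path else len(path))
            prefix, wildcard = path[:pi], path[pi:]     # "subarrays/temp_", "*.nc"
            fnmatch("subarrays/temp_000.nc", wildcard)  # True: key would be kept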
The glob has been passed in via 603 | self._path, created on connection to the server and bucket.""" 604 | # get the path string up to the wildcards 605 | try: 606 | pi1 = self._path.index("*") 607 | except ValueError: 608 | pi1 = len(self._path) 609 | try: 610 | pi2 = self._path.index("?") 611 | except ValueError: 612 | pi2 = len(self._path) 613 | pi = min(pi1, pi2) 614 | # using the prefix will cut down on the search space 615 | prefix = self._path[:pi] 616 | # get the wildcard 617 | wildcard = self._path[pi:] 618 | # set up the paginator 619 | paginator = self._conn_obj.conn.get_paginator("list_objects_v2") 620 | parameters = { 621 | 'Bucket': self._bucket, 622 | 'Prefix': prefix 623 | } 624 | page_iterator = paginator.paginate(**parameters) 625 | files = [] 626 | for page in page_iterator: 627 | for item in page.get('Contents', []): 628 | fname = item['Key'] 629 | # check that it matches against wildcard 630 | if fnmatch(fname, wildcard): 631 | files.append(item['Key']) 632 | return files 633 | -------------------------------------------------------------------------------- /S3netCDF4/Backends/_s3aioFileObject.pyx: -------------------------------------------------------------------------------- 1 | #!python 2 | #cython: language_level=3 3 | 4 | __copyright__ = "(C) 2019-2021 Science and Technology Facilities Council" 5 | __license__ = "BSD - see LICENSE file in top-level directory" 6 | __authors__ = "Neil Massey" 7 | 8 | import io 9 | from fnmatch import fnmatch 10 | from urllib.parse import urlparse 11 | 12 | import asyncio 13 | import aiobotocore 14 | from botocore.exceptions import ClientError 15 | import botocore.config 16 | 17 | from S3netCDF4.Managers._ConnectionPool import ConnectionPool, ConnectionObject 18 | from S3netCDF4.Managers._ConfigManager import Config 19 | from S3netCDF4._Exceptions import APIException, IOException 20 | 21 | class s3aioFileObject(object): 22 | """Custom file object class, inheriting from Python io.Base, to read from 23 | an S3 object store / AWS cloud storage.""" 24 | 25 | """Static connection pool object - i.e. shared across the file objects.""" 26 | _connection_pool = ConnectionPool() 27 | 28 | # The defaults for MAXIMUM_PART_SIZE etc. are now assigned in 29 | # __init__ if no values are found in ~/.s3nc.json 30 | """Static config object for the backend options""" 31 | _config = Config() 32 | 33 | def _get_server_bucket_object(uri): 34 | """Get the server name from the URI""" 35 | # First split the uri into the network location and path, and build the 36 | # server 37 | url_p = urlparse(uri) 38 | # check that the uri contains a scheme and a netloc 39 | if url_p.scheme == '' or url_p.netloc == '': 40 | raise APIException( 41 | "URI supplied to s3aioFileObject is not well-formed: {}". 
format(uri) 42 | ) 43 | server = url_p.scheme + "://" + url_p.netloc 44 | split_path = url_p.path.split("/") 45 | # get the bucket 46 | try: 47 | bucket = split_path[1] 48 | except IndexError as e: 49 | raise APIException( 50 | "URI supplied has no bucket contained within it: {}".format(uri) 51 | ) 52 | # get the path 53 | try: 54 | path = "/".join(split_path[2:]) 55 | except IndexError as e: 56 | raise APIException( 57 | "URI supplied has no path contained within it: {}".format(uri) 58 | ) 59 | return server, bucket, path 60 | 61 | def __init__(self, uri, credentials, mode='r', create_bucket=True, 62 | part_size=None, max_parts=None, multipart_upload=None, 63 | multipart_download=None, connect_timeout=None, 64 | read_timeout=None): 65 | """Initialise the file object by creating or reusing a connection in the 66 | connection pool.""" 67 | # get the server, bucket and the key from the endpoint url 68 | self._server, self._bucket, self._path = s3aioFileObject._get_server_bucket_object(uri) 69 | self._closed = False # set the file to be not closed 70 | self._mode = mode 71 | self._seek_pos = 0 72 | self._buffer = [io.BytesIO()] # have a list of objects that can stream 73 | self._credentials = credentials 74 | self._create_bucket = create_bucket 75 | self._uri = uri 76 | 77 | """Either get the backend config from the parameters, or the config file 78 | or use defaults.""" 79 | if "s3aioFileObject" in s3aioFileObject._config["backends"]: 80 | backend_config = s3aioFileObject._config["backends"]["s3aioFileObject"] 81 | else: 82 | backend_config = {} 83 | 84 | if part_size: 85 | self._part_size = int(part_size) 86 | elif "maximum_part_size" in backend_config: 87 | self._part_size = int(backend_config["maximum_part_size"]) 88 | else: 89 | self._part_size = int(50 * 1024 * 1024) 90 | 91 | if max_parts: 92 | self._max_parts = int(max_parts) 93 | elif "maximum_parts" in backend_config: 94 | self._max_parts = int(backend_config["maximum_parts"]) 95 | else: 96 | self._max_parts = 8 97 | 98 | if multipart_upload: 99 | self._multipart_upload = multipart_upload 100 | elif "multipart_upload" in backend_config: 101 | self._multipart_upload = backend_config["multipart_upload"] 102 | else: 103 | self._multipart_upload = True 104 | 105 | if multipart_download: 106 | self._multipart_download = multipart_download 107 | elif "multipart_download" in backend_config: 108 | self._multipart_download = backend_config["multipart_download"] 109 | else: 110 | self._multipart_download = True 111 | 112 | if connect_timeout: 113 | self._connect_timeout = connect_timeout 114 | elif "connect_timeout" in backend_config: 115 | self._connect_timeout = backend_config["connect_timeout"] 116 | else: 117 | self._connect_timeout = 30.0 118 | 119 | if read_timeout: 120 | self._read_timeout = read_timeout 121 | elif "read_timeout" in backend_config: 122 | self._read_timeout = backend_config["read_timeout"] 123 | else: 124 | self._read_timeout = 30.0 125 | 126 | async def __aenter__(self): 127 | """Async version of the enter context method.""" 128 | await self.connect() 129 | return self 130 | 131 | async def __aexit__(self, exc_type, exc_value, exc_tb): 132 | """Close the file on the exit of a with statement, or by the garbage 133 | collector removing the object.""" 134 | await self.close() 135 | # check for any exceptions 136 | if exc_type is not None: 137 | return False 138 | return True 139 | 140 | async def _getsize(self): 141 | # Use content length in the head object to determine how the size of 142 | # the file / object 143 | # If 
we are writing then the size should be the buffer size 144 | try: 145 | if 'w' in self._mode: 146 | size = self._part_size 147 | else: 148 | response = await self._conn_obj.conn.head_object( 149 | Bucket=self._bucket, 150 | Key=self._path 151 | ) 152 | size = response['ContentLength'] 153 | except ClientError as e: 154 | raise IOException( 155 | "Could not get size of object {}".format(self._path) 156 | ) 157 | except AttributeError as e: 158 | self._handle_connection_exception(e) 159 | return size 160 | 161 | async def _get_bucket_list(self): 162 | # get the names of the buckets in a list 163 | try: 164 | bl = await self._conn_obj.conn.list_buckets() 165 | bucket_list = [b['Name'] for b in bl['Buckets']] 166 | except AttributeError as e: 167 | self._handle_connection_exception(e) 168 | return bucket_list 169 | 170 | def _handle_connection_exception(self, e): 171 | # Check if connection made 172 | if ("_conn_obj" in e.args[0] or "_current_part" in e.args[0]): 173 | raise APIException( 174 | "Connection to S3 server is not established. Use either the " 175 | ".connect method or a with statement." 176 | ) 177 | else: 178 | # other AttributeError - handle that separately 179 | raise e 180 | 181 | async def connect(self): 182 | """Connect to the s3 server with the details passed in via the __init__ 183 | method.""" 184 | # if the connection returns None then either there isn't a connection to 185 | # the server in the pool, or there is no connection that is available 186 | self._conn_obj = s3aioFileObject._connection_pool.get(self._server) 187 | if self._conn_obj is None: 188 | try: 189 | session = aiobotocore.get_session() 190 | config = botocore.config.Config( 191 | connect_timeout=self._connect_timeout, 192 | read_timeout=self._read_timeout 193 | ) 194 | s3c = session.create_client( 195 | "s3", 196 | endpoint_url=self._server, 197 | aws_access_key_id=self._credentials["accessKey"], 198 | aws_secret_access_key=self._credentials["secretKey"], 199 | config=config 200 | ) 201 | # call await s3c.__aenter__ : this is needed for newer versions 202 | # of aiobotocore 203 | s3c = await s3c.__aenter__() 204 | # add the connection to the connection pool 205 | self._conn_obj = s3aioFileObject._connection_pool.add( 206 | s3c, self._server 207 | ) 208 | except ClientError as e: 209 | raise IOException( 210 | "Could not connect to S3 endpoint {} {}".format( 211 | self._server, e) 212 | ) 213 | 214 | if ('r' in self._mode and '*' not in self._path and 215 | '?' not in self._path): 216 | # if this is a read method then check the file exists 217 | response = await self._conn_obj.conn.list_objects_v2( 218 | Bucket=self._bucket, 219 | Prefix=self._path 220 | ) 221 | exists = False 222 | for obj in response.get('Contents', []): 223 | if obj['Key'] == self._path: 224 | exists = True 225 | if not exists: 226 | raise IOException( 227 | "Object does not exist: {}/{}/{}".format( 228 | self._server, self._bucket, self._path 229 | ) 230 | ) 231 | if 'w' in self._mode: 232 | # if this is a write method then create a bytes array 233 | self._current_part = 1 234 | if 'a' in self._mode or '+' in self._mode: 235 | raise APIException( 236 | "Appending to files is not supported {}".format(self._path) 237 | ) 238 | return True 239 | 240 | def detach(self): 241 | """Separate the underlying raw stream from the buffer and return it. 
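        (Illustrative aside on driving this class as a whole, placed here for
        reference; the endpoint, bucket, key and credentials are assumptions.)

            import asyncio

            async def fetch():
                credentials = {"accessKey": "my-key", "secretKey": "my-secret"}
                uri = "http://s3.example.com/my-bucket/path/to/object.nc"
                async with s3aioFileObject(uri, credentials, mode='r') as fo:
                    # objects larger than the configured part size are fetched
                    # as several concurrent ranged GETs and reassembled
                    return await fo.read()

            data = asyncio.run(fetch())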
242 | Not supported in S3.""" 243 | raise io.UnsupportedOperation 244 | 245 | async def _read_partial_file(self, part_num, part_size): 246 | s = int(part_num*part_size) 247 | e = int((part_num+1)*part_size)-1 248 | range_fmt = 'bytes={}-{}'.format(s,e) 249 | s3_object = await self._conn_obj.conn.get_object( 250 | Bucket = self._bucket, 251 | Key = self._path, 252 | Range = range_fmt 253 | ) 254 | body = s3_object['Body'] 255 | return await body.read() 256 | 257 | async def read(self, size=-1): 258 | """Read and return up to size bytes. For the S3 implementation the size 259 | can be used for RangeGet. If size==-1 then the whole object is streamed 260 | into memory.""" 261 | # read the object using the bucket and path already determined in 262 | # __init__, and using the connection object 263 | try: 264 | # get the file size first 265 | file_size = await self._getsize() 266 | if size== -1: 267 | range_start = 0 268 | range_end = file_size 269 | range_size = file_size 270 | else: 271 | range_start = self._seek_pos 272 | range_end = self._seek_pos+size-1 273 | if range_end > file_size: 274 | range_end = file_size-1 275 | range_size = range_end-range_start+1 276 | 277 | # if multipart download is not supported 278 | if not self._multipart_download: 279 | # get the full file 280 | s3_object = await self._conn_obj.conn.get_object( 281 | Bucket = self._bucket, 282 | Key = self._path, 283 | ) 284 | body = s3_object['Body'] 285 | data = await body.read() 286 | # if the file is smaller than the MAXIMUM_PART_SIZE 287 | elif (range_size < self._part_size): 288 | # the requested range is the full file, it is fastest to 289 | # not specify the range 290 | if (range_start == 0 and range_size == file_size): 291 | # get the full file 292 | s3_object = await self._conn_obj.conn.get_object( 293 | Bucket = self._bucket, 294 | Key = self._path, 295 | ) 296 | # a portion of the file is requested 297 | else: 298 | s3_object = await self._conn_obj.conn.get_object( 299 | Bucket = self._bucket, 300 | Key = self._path, 301 | Range = 'bytes={}-{}'.format( 302 | range_start, range_end 303 | ) 304 | ) 305 | body = s3_object['Body'] 306 | data = await body.read() 307 | # multipart download version 308 | else: 309 | """Use range get to split up a file into the MAXIMUM_PART_SIZE 310 | and download each part asynchronously.""" 311 | # calculate the number of necessary parts 312 | n_parts = int(range_size / self._part_size + 1) 313 | # don't go above the maximum number downloadable 314 | if n_parts > self._max_parts: 315 | n_parts = self._max_parts 316 | # (re)calculate the download size 317 | part_size = float(range_size) / n_parts 318 | # create the tasks and assign the return data buffer 319 | tasks = [] 320 | data_buf = io.BytesIO() 321 | 322 | for p in range(0, n_parts): 323 | event_loop = asyncio.get_event_loop() 324 | task = event_loop.create_task(self._read_partial_file( 325 | p, part_size 326 | )) 327 | tasks.append(task) 328 | # wait for all the tasks to finish 329 | results = await asyncio.gather(*tasks) 330 | # read each chunk of data and write into the global buffer 331 | for r in results: 332 | data_buf.write(r) 333 | r = None # indicate ready for garbage collection 334 | data_buf.seek(0) 335 | data = data_buf.read() 336 | 337 | except ClientError as e: 338 | raise IOException( 339 | "Could not read from object {} {}".format(self._path, e) 340 | ) 341 | except AttributeError as e: 342 | self._handle_connection_exception(e) 343 | return data 344 | 345 | async def read1(self, size=-1): 346 | """Just call 
read.""" 347 | return await self.read(size=size) 348 | 349 | async def readinto(self, b): 350 | """Read bytes into a pre-allocated, writable bytes-like object b and 351 | return the number of bytes read. 352 | In S3 the entire file is read into the bytesbuffer. It is important 353 | that the bytesbuffer is big enough to hold the entire file.""" 354 | # get the size of the file 355 | size = await self._getsize() 356 | b[:size] = await self.read(size) 357 | return size 358 | 359 | async def readinto1(self, b): 360 | """Just call readinto""" 361 | return await self.readinto(b) 362 | 363 | async def _multipart_upload_from_buffer(self): 364 | """Do a multipart upload from the buffer. 365 | There are three cases: 366 | 1. The size is exactly the same size as the MAXIMUM_PART_SIZE 367 | 2. The size is greater than the MAXIMUM_PART_SIZE 368 | 3. The size is multiple times greater than the MAX_UPLOAD_SIZE and 369 | requires splitting into smaller chunks 370 | """ 371 | # check to see if bucket needs to be created 372 | if self._create_bucket: 373 | # check whether the bucket exists 374 | bucket_list = await self._get_bucket_list() 375 | if not self._bucket in bucket_list: 376 | await self._conn_obj.conn.create_bucket(Bucket=self._bucket) 377 | 378 | # if the current part is 1 we have to create the multipart upload 379 | if self._current_part == 1: 380 | response = await self._conn_obj.conn.create_multipart_upload( 381 | Bucket = self._bucket, 382 | Key = self._path 383 | ) 384 | self._upload_id = response['UploadId'] 385 | # we need to keep a track of the multipart info 386 | self._multipart_info = {'Parts' : []} 387 | 388 | # upload from a buffer - do we need to split into more than one 389 | # multiparts? 390 | new_buffer = [] 391 | for buffer_part in range(0, len(self._buffer)): 392 | # is the current part of the buffer larger than the maximum 393 | # upload size? 
split if it is 394 | data_buf = self._buffer[buffer_part] 395 | data_len = data_buf.tell() 396 | if data_len >= self._part_size: 397 | data_buf.seek(0) 398 | data_pos = 0 399 | # split the file up 400 | while data_pos < data_len: 401 | new_buffer.append(io.BytesIO()) 402 | # copy the data - don't overstep the buffer 403 | if data_pos + self._part_size >= data_len: 404 | sub_data = data_buf.read(data_len-data_pos) 405 | else: 406 | sub_data = data_buf.read( 407 | self._part_size 408 | ) 409 | new_buffer[-1].write(sub_data) 410 | # increment to next 411 | data_pos += self._part_size 412 | 413 | # free the old memory 414 | self._buffer[buffer_part].close() 415 | else: 416 | # copy the old buffer into a new one 417 | self._buffer[buffer_part].seek(0) 418 | new_buffer.append(io.BytesIO(self._buffer[buffer_part].read())) 419 | 420 | # close other buffers first 421 | for b in self._buffer: 422 | b.close() 423 | self._buffer = new_buffer 424 | 425 | tasks = [] 426 | 427 | for buffer_part in range(0, len(self._buffer)): 428 | # seek in the BytesIO buffer to get to the beginning after the 429 | # writing 430 | self._buffer[buffer_part].seek(0) 431 | # upload here 432 | # schedule the uploads 433 | event_loop = asyncio.get_event_loop() 434 | task = event_loop.create_task(self._conn_obj.conn.upload_part( 435 | Bucket=self._bucket, 436 | Key=self._path, 437 | UploadId=self._upload_id, 438 | PartNumber=self._current_part + buffer_part, 439 | Body=self._buffer[buffer_part] 440 | )) 441 | tasks.append(task) 442 | 443 | # await the completion of the uploads 444 | res = await asyncio.gather(*tasks) 445 | for buffer_part in range(0, len(self._buffer)): 446 | # insert into the multipart info list of dictionaries 447 | part = res[buffer_part] 448 | self._multipart_info['Parts'].append( 449 | { 450 | 'PartNumber' : self._current_part + buffer_part, 451 | 'ETag' : part['ETag'] 452 | } 453 | ) 454 | 455 | # add the total number of uploads to the current part 456 | self._current_part += len(self._buffer) 457 | 458 | # reset all the byte buffers and their positions 459 | for buffer_part in range(0, len(self._buffer)): 460 | self._buffer[buffer_part].close() 461 | self._buffer = [io.BytesIO()] 462 | self._seek_pos = 0 463 | 464 | async def write(self, b): 465 | """Write the given bytes-like object, b, and return the number of bytes 466 | written (always equal to the length of b in bytes, since if the write 467 | fails an OSError will be raised). 468 | For the S3 file object we just write the file to a temporary bytearray 469 | and increment the seek_pos. 470 | This data will be uploaded to an object when .flush is called. 471 | """ 472 | if "w" not in self._mode: 473 | raise APIException( 474 | "Trying to write to a read only file, where mode != 'w'." 475 | ) 476 | try: 477 | # add to local, temporary bytearray 478 | size = len(b) 479 | self._buffer[-1].write(b) 480 | self._seek_pos += size 481 | # test to see whether we should do a multipart upload now 482 | # this occurs when the number of buffers is > the maximum number of 483 | # parts. 
self._current_part is indexed from 1 484 | if (self._multipart_upload and 485 | self._seek_pos > self._part_size): 486 | if len(self._buffer) == self._max_parts: 487 | await self._multipart_upload_from_buffer() 488 | else: 489 | # add another buffer to write to 490 | self._buffer.append(io.BytesIO()) 491 | 492 | except ClientError as e: 493 | raise IOException( 494 | "Could not write to object {} {}".format(self._path, e) 495 | ) 496 | except AttributeError as e: 497 | self._handle_connection_exception(e) 498 | 499 | return size 500 | 501 | async def close(self): 502 | """Flush and close this stream. This method has no effect if the file is 503 | already closed. Once the file is closed, any operation on the file (e.g. 504 | reading or writing) will raise a ValueError. 505 | 506 | As a convenience, it is allowed to call this method more than once; only 507 | the first call, however, will have an effect.""" 508 | try: 509 | if not self._closed: 510 | # self.flush will upload the bytesarray to the S3 store 511 | await self.flush() 512 | s3aioFileObject._connection_pool.release(self._conn_obj) 513 | self._closed = True 514 | except AttributeError as e: 515 | self._handle_connection_exception(e) 516 | return True 517 | 518 | async def seek(self, offset, whence=io.SEEK_SET): 519 | """Change the stream position to the given byte offset. offset is 520 | interpreted relative to the position indicated by whence. The default 521 | value for whence is SEEK_SET. Values for whence are: 522 | 523 | SEEK_SET or 0 – start of the stream (the default); offset should be zero 524 | or positive 525 | SEEK_CUR or 1 – current stream position; offset may be negative 526 | SEEK_END or 2 – end of the stream; offset is usually negative 527 | Return the new absolute position. 528 | 529 | Note: currently cannot seek when writing a file. 530 | 531 | """ 532 | 533 | if self._mode == 'w': 534 | raise IOException( 535 | "Cannot seek within a file that is being written to." 536 | ) 537 | 538 | size = await self._getsize() 539 | error_string = "Seek {} is outside file size bounds 0->{} for file {}" 540 | seek_pos = self._seek_pos 541 | if whence == io.SEEK_SET: 542 | # range check 543 | seek_pos = offset 544 | elif whence == io.SEEK_CUR: 545 | seek_pos += offset 546 | elif whence == io.SEEK_END: 547 | seek_pos = size - offset 548 | 549 | # range checks 550 | if (seek_pos >= size): 551 | raise IOException(error_string.format( 552 | seek_pos, 553 | size, 554 | self._path) 555 | ) 556 | elif (seek_pos < 0): 557 | raise IOException(error_string.format( 558 | seek_pos, 559 | size, 560 | self._path) 561 | ) 562 | self._seek_pos = seek_pos 563 | return self._seek_pos 564 | 565 | def seekable(self): 566 | """We can seek in s3 streams using the range get and range put features. 567 | """ 568 | return True 569 | 570 | def tell(self): 571 | """Return True if the stream supports random access. If False, seek(), 572 | tell() and truncate() will raise OSError.""" 573 | return self._seek_pos 574 | 575 | def fileno(self): 576 | """Return the underlying file descriptor (an integer) of the stream if 577 | it exists. An IOError is raised if the IO object does not use a file 578 | descriptor.""" 579 | raise io.UnsupportedOperation 580 | 581 | async def flush(self): 582 | """Flush the write buffers of the stream. 
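        (For reference, a hedged sketch of the bookkeeping handed to
        complete_multipart_upload below; the part numbers and ETag values are
        assumptions.  _multipart_upload_from_buffer above appends one entry
        per uploaded part.)

            multipart_info = {
                "Parts": [
                    {"PartNumber": 1, "ETag": '"9b2cf535f27731c974343645a398"'},
                    {"PartNumber": 2, "ETag": '"d41d8cd98f00b204e9800998ecf8"'},
                ]
            }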
This will upload the contents 583 | of the final multipart upload of self._buffer to the S3 store.""" 584 | try: 585 | if 'w' in self._mode: 586 | # if the size is less than the MAXIMUM UPLOAD SIZE 587 | # then just write the data 588 | size = self._buffer[0].tell() 589 | if ((self._current_part == 1 and 590 | size < self._part_size) or 591 | not self._multipart_upload 592 | ): 593 | if self._create_bucket: 594 | # check whether the bucket exists and create if not 595 | bucket_list = await self._get_bucket_list() 596 | if not self._bucket in bucket_list: 597 | await self._conn_obj.conn.create_bucket( 598 | Bucket=self._bucket 599 | ) 600 | # upload the whole buffer - seek back to the start first 601 | self._buffer[0].seek(0) 602 | await self._conn_obj.conn.put_object( 603 | Bucket=self._bucket, 604 | Key=self._path, 605 | Body=self._buffer[0].read(size) 606 | ) 607 | else: 608 | # upload as multipart 609 | await self._multipart_upload_from_buffer() 610 | # finalise the multipart upload 611 | await self._conn_obj.conn.complete_multipart_upload( 612 | Bucket=self._bucket, 613 | Key=self._path, 614 | UploadId=self._upload_id, 615 | MultipartUpload=self._multipart_info 616 | ) 617 | # clear the buffers 618 | for b in self._buffer: 619 | b.close() 620 | 621 | except AttributeError as e: 622 | self._handle_connection_exception(e) 623 | return True 624 | 625 | def readable(self): 626 | """Return True if the stream can be read from. If False, read() will 627 | raise IOError.""" 628 | return 'r' in self._mode or '+' in self._mode 629 | 630 | async def readline(self, size=-1): 631 | """Read and return one line from the stream. 632 | If size is specified, at most size bytes will be read.""" 633 | if 'b' in self._mode: 634 | raise APIException( 635 | "readline on a binary file is not permitted: {}".format( 636 | self._uri) 637 | ) 638 | # only read a set number of bytes if size is passed in, otherwise 639 | # read upto the file size 640 | if size == -1: 641 | size = self._getsize() 642 | 643 | # use the BytesIO readline methods 644 | if self.tell() == 0: 645 | buffer = await self.read(size=size) 646 | self._buffer[-1].write(buffer) 647 | self._buffer[-1].seek(0) 648 | 649 | line = self._buffer[-1].readline().decode().strip() 650 | return line 651 | 652 | async def readlines(self, hint=-1): 653 | """Read and return a list of lines from the stream. hint can be 654 | specified to control the number of lines read: no more lines will be 655 | read if the total size (in bytes/characters) of all lines so far exceeds 656 | hint.""" 657 | if 'b' in self._mode: 658 | raise APIException( 659 | "readline on a binary file is not permitted: {}".format( 660 | self._uri) 661 | ) 662 | # read the entire file in and decode it 663 | lines = await self.read().decode().split("\n") 664 | return lines 665 | 666 | def truncate(self, size=None): 667 | """Not supported""" 668 | raise io.UnsupportedOperation 669 | 670 | def writable(self): 671 | """Return True if the stream supports writing. 
If False, write() and 672 | truncate() will raise IOError.""" 673 | return 'w' in self._mode 674 | 675 | async def writelines(self, lines): 676 | """Write a list of lines to the stream.""" 677 | # first check if the file is binary or not 678 | if 'b' in self._mode: 679 | raise APIException( 680 | "writelines on a binary file is not permitted: {}".format( 681 | self._uri) 682 | ) 683 | # write all but the last line with a line break 684 | for l in lines: 685 | await self.write((l+"\n").encode('utf-8')) 686 | return True 687 | 688 | async def glob(self): 689 | """Emulate glob on an open bucket. The glob has been passed in via 690 | self._path, created on connection to the server and bucket.""" 691 | # get the path string up to the wildcards 692 | try: 693 | pi1 = self._path.index("*") 694 | except ValueError: 695 | pi1 = len(self._path) 696 | try: 697 | pi2 = self._path.index("?") 698 | except ValueError: 699 | pi2 = len(self._path) 700 | pi = min(pi1, pi2) 701 | # using the prefix will cut down on the search space 702 | prefix = self._path[:pi] 703 | # get the wildcard 704 | wildcard = self._path[pi:] 705 | # set up the paginator 706 | paginator = self._conn_obj.conn.get_paginator("list_objects_v2") 707 | parameters = { 708 | 'Bucket': self._bucket, 709 | 'Prefix': prefix 710 | } 711 | page_iterator = paginator.paginate(**parameters) 712 | files = [] 713 | async for page in page_iterator: 714 | for item in page.get('Contents', []): 715 | fname = item['Key'] 716 | # check that it matches against wildcard 717 | if fnmatch(fname, wildcard): 718 | files.append(item['Key']) 719 | return files 720 | --------------------------------------------------------------------------------
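For orientation, a hedged usage sketch of the glob() implementations above (the
endpoint, bucket, key pattern and credentials are assumptions): the wildcard
travels in the object path, connect() skips its existence check for wildcard
paths, and glob() returns the matching keys, which callers such as utils/agg.py
then re-prefix with the alias and bucket.

    import asyncio
    from S3netCDF4.Backends._s3aioFileObject import s3aioFileObject

    async def list_subarrays():
        credentials = {"accessKey": "my-key", "secretKey": "my-secret"}
        uri = "http://s3.example.com/my-bucket/subarrays/temp_*.nc"
        async with s3aioFileObject(uri, credentials, mode='r') as fo:
            return await fo.glob()   # keys under "subarrays/temp_" ending ".nc"

    keys = asyncio.run(list_subarrays())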