├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── NOTICE.txt ├── README.md ├── THIRD-PARTY-LICENSES ├── awsio ├── __init__.py ├── csrc │ └── io │ │ └── s3 │ │ ├── s3_file_wrapper.cpp │ │ ├── s3_io.cpp │ │ └── s3_io.h └── python │ ├── __init__.py │ └── lib │ ├── __init__.py │ └── io │ ├── __init__.py │ └── s3 │ ├── __init__.py │ └── s3dataset.py ├── examples ├── s3_cv_iterable_example.py ├── s3_cv_iterable_shuffle_example.py ├── s3_cv_map_example.py ├── s3_cv_transform.py ├── s3_imagenet_example.py └── s3_nlp_iterable_example.py ├── setup.cfg ├── setup.py ├── tests ├── py-tests │ ├── test_integration.py │ ├── test_read_datasets.py │ ├── test_regions.py │ ├── test_s3dataset.py │ ├── test_s3iterabledataset.py │ └── test_utils.py └── smoke_tests │ └── import_awsio.sh ├── third_party ├── CMakeLists.txt └── cmake │ └── AwsSDK.cmake ├── tools └── get_version.py └── version.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore Mac system files 2 | .DS_store 3 | 4 | # Ignore file extensions below 5 | *.coverage 6 | *.egg-info 7 | *.log 8 | *.pyc 9 | 10 | awsio/_version.py 11 | build/ 12 | dist/ 13 | 14 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.13) 2 | project(_pywrap_s3_io) 3 | 4 | set(CMAKE_CXX_STANDARD 17) 5 | SET(TORCH_MIN_VERSION "1.5.1") 6 | 7 | find_package(Python3 COMPONENTS Interpreter Development) 8 | 9 | find_package(AWSSDK REQUIRED COMPONENTS s3 transfer) 10 | 11 | set(CMAKE_POSITION_INDEPENDENT_CODE ON) 12 | set(INCLUDE_DIRS "awsio/csrc/io/s3") 13 | 14 | set(SOURCES "${INCLUDE_DIRS}/s3_io.cpp" ) 15 | 16 | include_directories(${INCLUDE_DIRS}) 17 | find_package(pybind11 REQUIRED) 18 | pybind11_add_module(_pywrap_s3_io ${SOURCES} "${INCLUDE_DIRS}/s3_file_wrapper.cpp") 19 | 20 | Message(STATUS "All linked libs: ${AWSSDK_LINK_LIBRARIES}") 21 | 22 | target_link_libraries(_pywrap_s3_io PRIVATE ${AWSSDK_LINK_LIBRARIES} ${AWSSDK_PLATFORM_DEPS}) 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2020 Amazon Web Services 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
-------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | amazon-s3-plugin-for-pytorch 2 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # S3 Plugin 2 | 3 | **Note: As of April 5th, 2022, this plugin is in maintenance mode. [The S3 IO is in the process of being upstreamed into `torchdata` package](https://github.com/pytorch/data/tree/main/torchdata/datapipes/iter/load#readme). In the future, we will support the new `torchdata` package, and be continuously improving the user experience and performance of the S3 IO datapipes. Please support and comment for the new S3 IO datapipes. Raise issues and create PRs if necessary.** 4 | 5 | S3-plugin is a high performance PyTorch dataset library to efficiently access datasets stored in S3 buckets. It provides streaming data access to datasets of any size and thus eliminates the need to provision local storage capacity. The library is designed to leverage the high throughput that S3 offers to access objects with minimal latency. 6 | 7 | The users have the flexibility to use either map-style or iterable-style dataset interfaces based on their needs. The library itself is file-format agnostic and presents objects in S3 as a binary buffer(blob). Users are free to apply any additional transformation on the data received from S3. 8 | 9 | ## Compatible Images 10 | 11 | Only the following images are compatible with the Amazon S3 plugin for PyTorch: 12 | 13 | **Ubuntu 20.04** 14 | - **CPU**: 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.9.0-cpu-py38-ubuntu20.04-v1.1 15 | - **GPU**: 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.9.0-gpu-py38-cu111-ubuntu20.04-v1.1 16 | 17 | **Ubuntu 18.04** 18 | - **CPU**: 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.8.1-cpu-py36-ubuntu18.04-v1.6 19 | - **GPU**: 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.8.1-gpu-py36-cu111-ubuntu18.04-v1.7 20 | 21 | ## Installation 22 | 23 | You can install this package by following the below instructions. 24 | 25 | #### Prerequisite 26 | 27 | - Python 3.6 (or Python 3.7) is required for this installation. 28 | 29 | - [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-install.html) for configuring S3 access. 30 | 31 | - Pytorch >= 1.5 (If not available, S3-plugin installs latest Torch) 32 | 33 | - *Note:* To run on Mac, [AWS_SDK_CPP](https://github.com/aws/aws-sdk-cpp) must be installed. 34 | 35 | 36 | #### Installing S3-Plugin via Wheel 37 | 38 | ```shell script 39 | # List of wheels on Linux: 40 | # python 3.7: https://aws-s3-plugin.s3.us-west-2.amazonaws.com/binaries/0.0.1/bd37e27/awsio-0.0.1%2Bbd37e27-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl 41 | # python 3.8: https://aws-s3-plugin.s3.us-west-2.amazonaws.com/binaries/0.0.1/bd37e27/awsio-0.0.1%2Bbd37e27-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl 42 | # python 3.9: https://aws-s3-plugin.s3.us-west-2.amazonaws.com/binaries/0.0.1/bd37e27/awsio-0.0.1%2Bbd37e27-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl 43 | aws s3 cp . 44 | pip install 45 | ``` 46 | 47 | #### Installing S3-Plugin from source 48 | 49 | ```shell 50 | # install [aws-sdk-cpp](https://github.com/aws/aws-sdk-cpp). 
example installation guide 51 | git clone --recurse-submodules https://github.com/aws/aws-sdk-cpp 52 | cd aws-sdk-cpp/ 53 | mkdir sdk-build 54 | cd sdk-build 55 | cmake .. -DCMAKE_BUILD_TYPE=Release -DBUILD_ONLY="s3;transfer" 56 | make 57 | make install # may need sudo 58 | 59 | # install pybind11. example: 60 | conda install pybind11 61 | export CMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH:/usr/local/lib/python3.7/site-packages/pybind11 62 | 63 | # install from source 64 | python setup.py install 65 | ``` 66 | 67 | ### Configuration 68 | 69 | Before reading data from S3 bucket, you need to provide bucket region parameter: 70 | 71 | * `AWS_REGION`: By default, regional endpoint is used for S3, with region controlled by `AWS_REGION`. If `AWS_REGION` is not specified, then `us-west-2` is used by default. 72 | 73 | To read objects in a bucket that is not publicly accessible, AWS credentials must be provided through one of the following methods: 74 | 75 | * Install and configure [awscli](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-install.html) by `aws configure`. 76 | * Set credentials in the AWS credentials profile file on the local system, located at: `~/.aws/credentials` on Linux, macOS, or Unix 77 | * Set the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables. 78 | * If you are using this library on an EC2 instance, specify an IAM role and then give the EC2 instance access to that role. 79 | 80 | #### Smoke Test 81 | To test your setup, run: 82 | ``` 83 | bash tests/smoke_tests/import_awsio.sh 84 | ``` 85 | 86 | The test will first make sure that the package imports correctly by printing the commit hash related to the build. 87 | Then, it will prompt the user for a S3 url to a file and return whether or not the file exists. 88 | 89 | For example: 90 | ``` 91 | $ bash tests/smoke_tests/import_awsio.sh 92 | Testing: import awsio 93 | 0.0.1+b119a6d 94 | import awsio succeeded 95 | S3 URL : 's3://path/to/bucket/test_0.JPEG' 96 | Testing: checking setup by quering whether or not 's3://path/to/bucket/test_0.JPEG' is an existing file 97 | file_exists: True 98 | Smoke test was successful. 99 | ``` 100 | 101 | ### Usage 102 | 103 | Once the above setup is complete, you can interact with S3 bucket in following ways: 104 | 105 | Accepted input S3 url formats: 106 | 107 | * Single url 108 | 109 | * `url = 's3://path/to/bucket/abc.tfrecord'` 110 | 111 | * List of urls as follows: 112 | 113 | ```urls = ['s3://path/to/bucket/abc.tfrecord','s3://path/to/bucket/def.tfrecord']``` 114 | 115 | * Prefix to S3 bucket to include all files under 's3_prefix' folder starting with '0' 116 | 117 | ```urls = 's3://path/to/s3_prefix/0'``` 118 | 119 | * Using `list_files()` function, which can be used to manipulate input list of urls to fetch as follows: 120 | ```shell 121 | from awsio.python.lib.io.s3.s3dataset import list_files 122 | urls = list_files('s3://path/to/s3_prefix/0') 123 | ``` 124 | 125 | #### Map-Style Dataset 126 | 127 | If each object in S3 contains a single training sample, then map-style dataset i.e. S3Dataset can be used. To partition data across nodes and to shuffle data, this dataset can be used with PyTorch distributed sampler. Additionally, pre-processing can be applied to the data in S3 by extending the S3Dataset class. 
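For example, partitioning across ranks with PyTorch's `DistributedSampler` can be sketched as follows (a minimal sketch, not part of this repository: it assumes the process group has already been initialized and reuses the `S3ImageSet` class and `preproc` transform defined in the example that follows):

```python
import torch
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

# Assumes torch.distributed.init_process_group(...) has been called and that
# S3ImageSet and preproc are defined as in the map-style example below.
dataset = S3ImageSet('s3://path/to/s3_prefix/', transform=preproc)
sampler = DistributedSampler(dataset, shuffle=True)  # splits indices across ranks
dataloader = DataLoader(dataset, batch_size=32, sampler=sampler, num_workers=4)

num_epochs = 10  # example value
for epoch in range(num_epochs):
    sampler.set_epoch(epoch)  # re-shuffles the per-rank partition each epoch
    for batch in dataloader:
        pass  # training step goes here
```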
Following example illustrates use of map-style S3Dataset for image datasets: 128 | 129 | ```python 130 | from awsio.python.lib.io.s3.s3dataset import S3Dataset 131 | from torch.utils.data import DataLoader 132 | from torchvision import transforms 133 | from PIL import Image 134 | import io 135 | 136 | class S3ImageSet(S3Dataset): 137 | def __init__(self, urls, transform=None): 138 | super().__init__(urls) 139 | self.transform = transform 140 | 141 | def __getitem__(self, idx): 142 | img_name, img = super(S3ImageSet, self).__getitem__(idx) 143 | # Convert bytes object to image 144 | img = Image.open(io.BytesIO(img)).convert('RGB') 145 | 146 | # Apply preprocessing functions on data 147 | if self.transform is not None: 148 | img = self.transform(img) 149 | return img 150 | 151 | batch_size = 32 152 | 153 | preproc = transforms.Compose([ 154 | transforms.ToTensor(), 155 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 156 | transforms.Resize((100, 100)) 157 | ]) 158 | 159 | # urls can be S3 prefix containing images or list of all individual S3 images 160 | urls = 's3://path/to/s3_prefix/' 161 | 162 | dataset = S3ImageSet(urls, transform=preproc) 163 | dataloader = DataLoader(dataset, 164 | batch_size=batch_size, 165 | num_workers=64) 166 | 167 | ``` 168 | 169 | 170 | #### Iterable-style dataset 171 | 172 | If each object in S3 contains multiple training samples e.g. archive files containing multiple small images or TF record files/shards containing multiple records, then it is advisable to use the Iterable-style dataset implementation i.e. S3IterableDataset. For the specific case of zip/tar archival files, each file contained in the archival is returned during each iteration in a streaming fashion. For all other file formats, binary blob for the whole shard is returned and users need to implement the appropriate parsing logic. Besides, S3IterableDataset takes care of partitioning the data across nodes and workers in a distributed setting. 173 | 174 | `Note:` For datasets consisting of a large number of smaller objects, accessing each object individually can be inefficient. For such datasets, it is recommended to create shards of the training data and use S3IterableDataset for better performance. 175 | ```shell 176 | # tar file containing label and image files as below 177 | tar --list --file=file1.tar | sed 4q 178 | 179 | 1234.cls 180 | 1234.jpg 181 | 5678.cls 182 | 5678.jpg 183 | ``` 184 | 185 | Consider tar file for image classification. It can be easily loaded by writing a custom python generator function using the iterator returned by S3IterableDataset. (Note: To create shards from a file dataset refer this [link](https://github.com/tmbdev/pytorch-imagenet-wds).) 186 | 187 | 188 | ```python 189 | from torch.utils.data import IterableDataset 190 | from awsio.python.lib.io.s3.s3dataset import S3IterableDataset 191 | from PIL import Image 192 | import io 193 | import numpy as np 194 | from torchvision import transforms 195 | 196 | class ImageS3(IterableDataset): 197 | def __init__(self, urls, shuffle_urls=False, transform=None): 198 | self.s3_iter_dataset = S3IterableDataset(urls, 199 | shuffle_urls) 200 | self.transform = transform 201 | 202 | def data_generator(self): 203 | try: 204 | while True: 205 | # Based on alphabetical order of files, sequence of label and image may change. 
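                # For the tar layout shown above (1234.cls, 1234.jpg, ...) the '.cls'
                # label file sorts before the matching '.jpg' image, so the label is
                # read first; swap the two next() calls (or dispatch on the file
                # extension) if your shards pair files in a different order.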
206 | label_fname, label_fobj = next(self.s3_iter_dataset_iterator) 207 | image_fname, image_fobj = next(self.s3_iter_dataset_iterator) 208 | 209 | label = int(label_fobj) 210 | image_np = Image.open(io.BytesIO(image_fobj)).convert('RGB') 211 | 212 | # Apply torch vision transforms if provided 213 | if self.transform is not None: 214 | image_np = self.transform(image_np) 215 | yield image_np, label 216 | 217 | except StopIteration: 218 | return 219 | 220 | def __iter__(self): 221 | self.s3_iter_dataset_iterator = iter(self.s3_iter_dataset) 222 | return self.data_generator() 223 | 224 | def set_epoch(self, epoch): 225 | self.s3_iter_dataset.set_epoch(epoch) 226 | 227 | # urls can be a S3 prefix containing all the shards or a list of S3 paths for all the shards 228 | urls = ["s3://path/to/file1.tar", "s3://path/to/file2.tar"] 229 | 230 | # Example Torchvision transforms to apply on data 231 | preproc = transforms.Compose([ 232 | transforms.ToTensor(), 233 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 234 | transforms.Resize((100, 100)) 235 | ]) 236 | 237 | dataset = ImageS3(urls, transform=preproc) 238 | 239 | ``` 240 | 241 | This dataset can be easily used with dataloader for parallel data loading and preprocessing: 242 | 243 | ```python 244 | dataloader = torch.utils.data.DataLoader(dataset, num_workers=4, batch_size=32) 245 | ``` 246 | 247 | We can shuffle the sequence of fetching shards by setting shuffle_urls=True and calling set_epoch method at the beginning of every epochs as: 248 | ```python 249 | dataset = ImageS3(urls, transform=preproc, shuffle_urls=True) 250 | for epoch in range(epochs): 251 | dataset.set_epoch(epoch) 252 | # training code ... 253 | ``` 254 | 255 | Note that the above code will only shuffle sequence of shards, the individual training samples within shards will be fetched in the same order. To shuffle the order of training samples across shards, use ShuffleDataset. ShuffleDataset maintains a buffer of data samples read from multiple shards and returns a random sample from it. The count of samples to be buffered is specified by buffer_size. To use ShuffleDataset, update the above example as follows: 256 | 257 | ```python 258 | dataset = ShuffleDataset(ImageS3(urls), buffer_size=4000) 259 | ``` 260 | 261 | #### Iterable-style dataset (NLP) 262 | The data set can be similarly used for NLP tasks. Following example demonstrates use for S3IterableDataset for BERT data loading. 263 | 264 | ```shell script 265 | # Consider S3 prefix containing hdf5 files. 266 | # Each hdf5 file contains numpy arrays for different variables required for BERT 267 | # training such as next sentence labels, masks etc. 
268 | aws s3 ls --human-readable s3://path/to/s3_prefix | sed 3q 269 | 270 | 271 | file_1.hdf5 272 | file_2.hdf5 273 | file_3.hdf5 274 | 275 | ``` 276 | 277 | ```python 278 | 279 | import torch 280 | from torch.utils.data import IterableDataset, DataLoader 281 | from itertools import islice 282 | import h5py 283 | import numpy as np 284 | import io 285 | from awsio.python.lib.io.s3.s3dataset import S3IterableDataset 286 | 287 | def create_data_samples_from_file(fileobj): 288 | # Converts bytes data to numpy arrays 289 | keys = ['input_ids', 'input_mask', 'segment_ids', \ 290 | 'masked_lm_positions', 'masked_lm_ids', 'next_sentence_labels'] 291 | dataset = io.BytesIO(fileobj) 292 | with h5py.File(dataset, "r") as f: 293 | data_file = [np.asarray(f[key][:]) for key in keys] 294 | return data_file 295 | 296 | class s3_dataset(IterableDataset): 297 | 298 | def __init__(self, urls): 299 | self.urls = urls 300 | self.dataset = S3IterableDataset(self.urls, shuffle_urls=True) 301 | 302 | def data_generator(self): 303 | try: 304 | while True: 305 | filename, fileobj = next(self.dataset_iter) 306 | # data_samples: list of six numpy arrays 307 | data_samples = create_data_samples_from_file(fileobj) 308 | 309 | for sample in list(zip(*data_samples)): 310 | # Preprocess sample if required and then yield 311 | yield sample 312 | 313 | except StopIteration as e: 314 | return 315 | 316 | def __iter__(self): 317 | self.dataset_iter = iter(self.dataset) 318 | return self.data_generator() 319 | 320 | urls = "s3://path/to/s3_prefix" 321 | train_dataset = s3_dataset(urls) 322 | 323 | ``` 324 | 325 | ### Test Coverage 326 | 327 | To check python test coverage, install [`coverage.py`](https://coverage.readthedocs.io/en/latest/index.html) as follows: 328 | 329 | ``` 330 | pip install coverage 331 | ``` 332 | 333 | To make sure that all tests are run, please also install `pytest`, `boto3`, and `pandas` as follows: 334 | ``` 335 | pip install pytest boto3 pandas 336 | ``` 337 | 338 | To run tests and calculate coverage: 339 | 340 | ```asm 341 | coverage erase 342 | coverage run -p --source=awsio -m pytest -v tests/py-tests/test_regions.py \ 343 | tests/py-tests/test_utils.py \ 344 | tests/py-tests/test_s3dataset.py \ 345 | tests/py-tests/test_s3iterabledataset.py \ 346 | tests/py-tests/test_read_datasets.py \ 347 | tests/py-tests/test_integration.py 348 | coverage combine 349 | coverage report -m 350 | ``` 351 | -------------------------------------------------------------------------------- /THIRD-PARTY-LICENSES: -------------------------------------------------------------------------------- 1 | ** tensorflow; version 2.4.0 -- https://github.com/tensorflow/tensorflow 2 | 3 | Apache License 4 | 5 | Version 2.0, January 2004 6 | 7 | http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND 8 | DISTRIBUTION 9 | 10 | 1. Definitions. 11 | 12 | "License" shall mean the terms and conditions for use, reproduction, and 13 | distribution as defined by Sections 1 through 9 of this document. 14 | 15 | "Licensor" shall mean the copyright owner or entity authorized by the 16 | copyright owner that is granting the License. 17 | 18 | "Legal Entity" shall mean the union of the acting entity and all other 19 | entities that control, are controlled by, or are under common control 20 | with that entity. 
For the purposes of this definition, "control" means 21 | (i) the power, direct or indirect, to cause the direction or management 22 | of such entity, whether by contract or otherwise, or (ii) ownership of 23 | fifty percent (50%) or more of the outstanding shares, or (iii) 24 | beneficial ownership of such entity. 25 | 26 | "You" (or "Your") shall mean an individual or Legal Entity exercising 27 | permissions granted by this License. 28 | 29 | "Source" form shall mean the preferred form for making modifications, 30 | including but not limited to software source code, documentation source, 31 | and configuration files. 32 | 33 | "Object" form shall mean any form resulting from mechanical 34 | transformation or translation of a Source form, including but not limited 35 | to compiled object code, generated documentation, and conversions to 36 | other media types. 37 | 38 | "Work" shall mean the work of authorship, whether in Source or Object 39 | form, made available under the License, as indicated by a copyright 40 | notice that is included in or attached to the work (an example is 41 | provided in the Appendix below). 42 | 43 | "Derivative Works" shall mean any work, whether in Source or Object form, 44 | that is based on (or derived from) the Work and for which the editorial 45 | revisions, annotations, elaborations, or other modifications represent, 46 | as a whole, an original work of authorship. For the purposes of this 47 | License, Derivative Works shall not include works that remain separable 48 | from, or merely link (or bind by name) to the interfaces of, the Work and 49 | Derivative Works thereof. 50 | 51 | "Contribution" shall mean any work of authorship, including the original 52 | version of the Work and any modifications or additions to that Work or 53 | Derivative Works thereof, that is intentionally submitted to Licensor for 54 | inclusion in the Work by the copyright owner or by an individual or Legal 55 | Entity authorized to submit on behalf of the copyright owner. For the 56 | purposes of this definition, "submitted" means any form of electronic, 57 | verbal, or written communication sent to the Licensor or its 58 | representatives, including but not limited to communication on electronic 59 | mailing lists, source code control systems, and issue tracking systems 60 | that are managed by, or on behalf of, the Licensor for the purpose of 61 | discussing and improving the Work, but excluding communication that is 62 | conspicuously marked or otherwise designated in writing by the copyright 63 | owner as "Not a Contribution." 64 | 65 | "Contributor" shall mean Licensor and any individual or Legal Entity on 66 | behalf of whom a Contribution has been received by Licensor and 67 | subsequently incorporated within the Work. 68 | 69 | 2. Grant of Copyright License. Subject to the terms and conditions of this 70 | License, each Contributor hereby grants to You a perpetual, worldwide, 71 | non-exclusive, no-charge, royalty-free, irrevocable copyright license to 72 | reproduce, prepare Derivative Works of, publicly display, publicly perform, 73 | sublicense, and distribute the Work and such Derivative Works in Source or 74 | Object form. 75 | 76 | 3. Grant of Patent License. 
Subject to the terms and conditions of this 77 | License, each Contributor hereby grants to You a perpetual, worldwide, 78 | non-exclusive, no-charge, royalty-free, irrevocable (except as stated in 79 | this section) patent license to make, have made, use, offer to sell, sell, 80 | import, and otherwise transfer the Work, where such license applies only to 81 | those patent claims licensable by such Contributor that are necessarily 82 | infringed by their Contribution(s) alone or by combination of their 83 | Contribution(s) with the Work to which such Contribution(s) was submitted. 84 | If You institute patent litigation against any entity (including a 85 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a 86 | Contribution incorporated within the Work constitutes direct or contributory 87 | patent infringement, then any patent licenses granted to You under this 88 | License for that Work shall terminate as of the date such litigation is 89 | filed. 90 | 91 | 4. Redistribution. You may reproduce and distribute copies of the Work or 92 | Derivative Works thereof in any medium, with or without modifications, and 93 | in Source or Object form, provided that You meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or Derivative Works a 96 | copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices stating 99 | that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works that You 102 | distribute, all copyright, patent, trademark, and attribution notices 103 | from the Source form of the Work, excluding those notices that do not 104 | pertain to any part of the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must include 108 | a readable copy of the attribution notices contained within such NOTICE 109 | file, excluding those notices that do not pertain to any part of the 110 | Derivative Works, in at least one of the following places: within a 111 | NOTICE text file distributed as part of the Derivative Works; within the 112 | Source form or documentation, if provided along with the Derivative 113 | Works; or, within a display generated by the Derivative Works, if and 114 | wherever such third-party notices normally appear. The contents of the 115 | NOTICE file are for informational purposes only and do not modify the 116 | License. You may add Your own attribution notices within Derivative Works 117 | that You distribute, alongside or as an addendum to the NOTICE text from 118 | the Work, provided that such additional attribution notices cannot be 119 | construed as modifying the License. 120 | 121 | You may add Your own copyright statement to Your modifications and may 122 | provide additional or different license terms and conditions for use, 123 | reproduction, or distribution of Your modifications, or for any such 124 | Derivative Works as a whole, provided Your use, reproduction, and 125 | distribution of the Work otherwise complies with the conditions stated in 126 | this License. 127 | 128 | 5. Submission of Contributions. Unless You explicitly state otherwise, any 129 | Contribution intentionally submitted for inclusion in the Work by You to the 130 | Licensor shall be under the terms and conditions of this License, without 131 | any additional terms or conditions. 
Notwithstanding the above, nothing 132 | herein shall supersede or modify the terms of any separate license agreement 133 | you may have executed with Licensor regarding such Contributions. 134 | 135 | 6. Trademarks. This License does not grant permission to use the trade 136 | names, trademarks, service marks, or product names of the Licensor, except 137 | as required for reasonable and customary use in describing the origin of the 138 | Work and reproducing the content of the NOTICE file. 139 | 140 | 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in 141 | writing, Licensor provides the Work (and each Contributor provides its 142 | Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 143 | KIND, either express or implied, including, without limitation, any 144 | warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or 145 | FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining 146 | the appropriateness of using or redistributing the Work and assume any risks 147 | associated with Your exercise of permissions under this License. 148 | 149 | 8. Limitation of Liability. In no event and under no legal theory, whether 150 | in tort (including negligence), contract, or otherwise, unless required by 151 | applicable law (such as deliberate and grossly negligent acts) or agreed to 152 | in writing, shall any Contributor be liable to You for damages, including 153 | any direct, indirect, special, incidental, or consequential damages of any 154 | character arising as a result of this License or out of the use or inability 155 | to use the Work (including but not limited to damages for loss of goodwill, 156 | work stoppage, computer failure or malfunction, or any and all other 157 | commercial damages or losses), even if such Contributor has been advised of 158 | the possibility of such damages. 159 | 160 | 9. Accepting Warranty or Additional Liability. While redistributing the Work 161 | or Derivative Works thereof, You may choose to offer, and charge a fee for, 162 | acceptance of support, warranty, indemnity, or other liability obligations 163 | and/or rights consistent with this License. However, in accepting such 164 | obligations, You may act only on Your own behalf and on Your sole 165 | responsibility, not on behalf of any other Contributor, and only if You 166 | agree to indemnify, defend, and hold each Contributor harmless for any 167 | liability incurred by, or claims asserted against, such Contributor by 168 | reason of your accepting any such warranty or additional liability. END OF 169 | TERMS AND CONDITIONS 170 | 171 | APPENDIX: How to apply the Apache License to your work. 172 | 173 | To apply the Apache License to your work, attach the following boilerplate 174 | notice, with the fields enclosed by brackets "[]" replaced with your own 175 | identifying information. (Don't include the brackets!) The text should be 176 | enclosed in the appropriate comment syntax for the file format. We also 177 | recommend that a file or class name and description of purpose be included on 178 | the same "printed page" as the copyright notice for easier identification 179 | within third-party archives. 180 | 181 | Copyright 2020 Amazon Web Services 182 | 183 | Licensed under the Apache License, Version 2.0 (the "License"); 184 | 185 | you may not use this file except in compliance with the License. 
186 | 187 | You may obtain a copy of the License at 188 | 189 | http://www.apache.org/licenses/LICENSE-2.0 190 | 191 | Unless required by applicable law or agreed to in writing, software 192 | 193 | distributed under the License is distributed on an "AS IS" BASIS, 194 | 195 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 196 | 197 | See the License for the specific language governing permissions and 198 | 199 | limitations under the License. 200 | 201 | * For tensorflow see also this required NOTICE: 202 | Copyright 2019 The TensorFlow Authors. All rights reserved. 203 | -------------------------------------------------------------------------------- /awsio/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from awsio import python 16 | from awsio._version import __version__ 17 | -------------------------------------------------------------------------------- /awsio/csrc/io/s3/s3_file_wrapper.cpp: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | // Licensed under the Apache License, Version 2.0 (the "License"). 4 | // You may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include 16 | 17 | #include 18 | #include 19 | 20 | #include "pybind11/pybind11.h" 21 | #include "s3_io.h" 22 | 23 | namespace { 24 | namespace py = pybind11; 25 | using awsio::S3Init; 26 | PYBIND11_MODULE(_pywrap_s3_io, m) { 27 | py::class_(m, "S3Init") 28 | .def(py::init<>()) 29 | .def("s3_read", 30 | [](S3Init* self, const std::string& file_url) { 31 | std::string result; 32 | self->s3_read(file_url, &result); 33 | return py::bytes(result); 34 | }) 35 | .def("list_files", 36 | [](S3Init* self, const std::string& file_url) { 37 | std::vector filenames; 38 | self->list_files(file_url, &filenames); 39 | return filenames; 40 | }) 41 | .def("file_exists", 42 | [](S3Init* self, const std::string& file_url) { 43 | return self->file_exists(file_url); 44 | }) 45 | .def("get_file_size", 46 | [](S3Init* self, const std::string& file_url) { 47 | return self->get_file_size(file_url); 48 | }); 49 | } 50 | } // namespace 51 | -------------------------------------------------------------------------------- /awsio/csrc/io/s3/s3_io.cpp: -------------------------------------------------------------------------------- 1 | // Original Copyright 2015 The TensorFlow Authors. Licensed under the Apache License, Version 2.0 2 | // Modifications Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | 4 | // Licensed under the Apache License, Version 2.0 (the "License"). 5 | // You may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | 16 | #include "s3_io.h" 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | 37 | #include 38 | #include 39 | 40 | namespace awsio { 41 | namespace { 42 | static const size_t s3ReadBufferSize = 120 * 1024 * 1024; // 16 MB 43 | static const uint64_t s3MultiPartDownloadChunkSize = 50 * 1024 * 1024; // 50 MB 44 | static const int downloadRetries = 3; 45 | static const int64_t s3TimeoutMsec = 300000; 46 | static const int executorPoolSize = 25; 47 | static const int S3GetFilesMaxKeys = 100; 48 | 49 | Aws::Client::ClientConfiguration &setUpS3Config() { 50 | static Aws::Client::ClientConfiguration cfg; 51 | Aws::String config_file; 52 | const char *config_file_env = getenv("AWS_CONFIG_FILE"); 53 | if (config_file_env) { 54 | config_file = config_file_env; 55 | } else { 56 | const char *home_env = getenv("HOME"); 57 | if (home_env) { 58 | config_file = home_env; 59 | config_file += "/.aws/config"; 60 | } 61 | } 62 | Aws::Config::AWSConfigFileProfileConfigLoader loader(config_file); 63 | loader.Load(); 64 | 65 | const char *use_https = getenv("S3_USE_HTTPS"); 66 | if (use_https) { 67 | if (use_https[0] == '0') { 68 | cfg.scheme = Aws::Http::Scheme::HTTP; 69 | } else { 70 | cfg.scheme = Aws::Http::Scheme::HTTPS; 71 | } 72 | } 73 | const char *verify_ssl = getenv("S3_VERIFY_SSL"); 74 | if (verify_ssl) { 75 | if (verify_ssl[0] == '0') { 76 | cfg.verifySSL = false; 77 | } else { 78 | cfg.verifySSL = true; 79 | } 80 | } 81 | 82 | const char *region = getenv("AWS_REGION"); 83 | if (region) { 84 | cfg.region = region; 85 | } else { 86 | cfg.region = "us-west-2"; 87 | } 88 | 89 | const char *endpoint_url = getenv("S3_ENDPOINT_URL"); 90 | if (endpoint_url) { 91 | cfg.endpointOverride = endpoint_url; 92 | } 93 | 94 | const char *proxy_host = getenv("S3_PROXY_HOST"); 95 | if (proxy_host) { 96 | cfg.proxyHost = proxy_host; 97 | } 98 | 99 | const char *proxy_port = getenv("S3_PROXY_PORT"); 100 | if (proxy_port) { 101 | cfg.proxyPort = atoi(proxy_port); 102 | } 103 | return cfg; 104 | } 105 | 106 | void ShutdownClient(std::shared_ptr *s3_client) { 107 | if (s3_client != nullptr) { 108 | delete s3_client; 109 | Aws::SDKOptions options; 110 | Aws::ShutdownAPI(options); 111 | } 112 | } 113 | 114 | void ShutdownTransferManager( 115 | std::shared_ptr *transfer_manager) { 116 | if (transfer_manager != nullptr) { 117 | delete transfer_manager; 118 | } 119 | } 120 | 121 | void ShutdownExecutor(Aws::Utils::Threading::PooledThreadExecutor *executor) { 122 | if (executor != nullptr) { 123 | delete executor; 124 | } 125 | } 126 | 127 | void parseS3Path(const std::string &fname, std::string *bucket, 128 | std::string *object) { 129 | if (fname.empty()) { 130 | throw std::invalid_argument{"The filename cannot be an empty string."}; 131 | } 132 | 133 | if (fname.size() < 5 || fname.substr(0, 5) != "s3://") { 134 | throw std::invalid_argument{ 135 | "The filename must start with the S3 scheme."}; 136 | } 137 | 138 | std::string path = fname.substr(5); 139 | 140 | if (path.empty()) { 141 | throw std::invalid_argument{"The filename cannot be an empty string."}; 142 | } 143 | 144 | auto pos = path.find_first_of('/'); 145 | if (pos == 0) { 146 | throw std::invalid_argument{ 147 | "The filename does not contain a bucket name."}; 148 | } 149 | 150 | *bucket = path.substr(0, pos); 151 | *object = path.substr(pos + 1); 152 | 
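    // If the path contained no '/', find_first_of returned npos: substr(0, npos)
    // above already stored the whole remainder as the bucket name, and
    // substr(npos + 1) wrapped around to substr(0), so the check below resets
    // the object key to an empty string for that bucket-only case.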
if (pos == std::string::npos) { 153 | *object = ""; 154 | } 155 | } 156 | 157 | class S3FS { 158 | public: 159 | S3FS(const std::string &bucket, const std::string &object, 160 | const bool multi_part_download, 161 | std::shared_ptr transfer_manager, 162 | std::shared_ptr s3_client) 163 | : bucket_name_(bucket), 164 | object_name_(object), 165 | multi_part_download_(multi_part_download), 166 | transfer_manager_(transfer_manager), 167 | s3_client_(s3_client) {} 168 | 169 | size_t read(uint64_t offset, size_t n, char *buffer) { 170 | if (multi_part_download_) { 171 | return readS3TransferManager(offset, n, buffer); 172 | } else { 173 | return readS3Client(offset, n, buffer); 174 | } 175 | } 176 | 177 | size_t readS3Client(uint64_t offset, size_t n, char *buffer) { 178 | Aws::S3::Model::GetObjectRequest getObjectRequest; 179 | 180 | getObjectRequest.WithBucket(this->bucket_name_.c_str()) 181 | .WithKey(this->object_name_.c_str()); 182 | 183 | std::string bytes = "bytes="; 184 | bytes += std::to_string(offset) + "-" + std::to_string(offset + n - 1); 185 | 186 | getObjectRequest.SetRange(bytes.c_str()); 187 | 188 | // When you don’t want to load the entire file into memory, 189 | // you can use IOStreamFactory in AmazonWebServiceRequest to pass a 190 | // lambda to create a string stream. 191 | getObjectRequest.SetResponseStreamFactory( 192 | []() { return Aws::New("S3IOAllocationTag"); }); 193 | // get the object 194 | auto getObjectOutcome = this->s3_client_->GetObject(getObjectRequest); 195 | 196 | if (!getObjectOutcome.IsSuccess()) { 197 | auto error = getObjectOutcome.GetError(); 198 | std::cout << "ERROR: " << error.GetExceptionName() << ": " 199 | << error.GetMessage() << std::endl; 200 | return 0; 201 | } else { 202 | n = getObjectOutcome.GetResult().GetContentLength(); 203 | // read data as a block: 204 | getObjectOutcome.GetResult().GetBody().read(buffer, n); 205 | return n; 206 | } 207 | } 208 | 209 | size_t readS3TransferManager(uint64_t offset, size_t n, char *buffer) { 210 | auto create_stream_fn = [&]() { // create stream lambda fn 211 | return Aws::New( 212 | "S3ReadStream", 213 | Aws::New( 214 | "S3ReadStream", reinterpret_cast(buffer), 215 | n)); 216 | }; // This buffer is what we used to initialize streambuf and is in memory 217 | 218 | std::shared_ptr downloadHandle = 219 | this->transfer_manager_.get()->DownloadFile( 220 | this->bucket_name_.c_str(), this->object_name_.c_str(), offset, 221 | n, create_stream_fn); 222 | downloadHandle->WaitUntilFinished(); 223 | 224 | Aws::OFStream storeFile(object_name_.c_str(), 225 | Aws::OFStream::out | Aws::OFStream::trunc); 226 | 227 | if (downloadHandle->GetStatus() != 228 | Aws::Transfer::TransferStatus::COMPLETED) { 229 | auto error = downloadHandle->GetLastError(); 230 | std::cout << "ERROR: " << error.GetExceptionName() << ": " 231 | << error.GetMessage() << std::endl; 232 | return 0; 233 | } else { 234 | return downloadHandle->GetBytesTransferred(); 235 | } 236 | } 237 | 238 | private: 239 | std::string bucket_name_; 240 | std::string object_name_; 241 | bool multi_part_download_; 242 | std::shared_ptr s3_client_; 243 | std::shared_ptr transfer_manager_; 244 | }; 245 | } // namespace 246 | 247 | S3Init::S3Init() 248 | : s3_client_(nullptr, ShutdownClient), 249 | transfer_manager_(nullptr, ShutdownTransferManager), 250 | executor_(nullptr, ShutdownExecutor), 251 | initialization_lock_() { 252 | // Load reading parameters 253 | buffer_size_ = s3ReadBufferSize; 254 | const char *bufferSizeStr = getenv("S3_BUFFER_SIZE"); 255 | if 
(bufferSizeStr) { 256 | buffer_size_ = std::stoull(bufferSizeStr); 257 | } 258 | multi_part_download_ = true; 259 | const char *multi_download_disable_char = 260 | getenv("S3_DISABLE_MULTI_PART_DOWNLOAD"); 261 | if (multi_download_disable_char) { 262 | std::string multi_download_disable_str(multi_download_disable_char); 263 | if (multi_download_disable_str == "ON") { 264 | multi_part_download_ = false; 265 | } 266 | } 267 | initializeS3Client(); 268 | } 269 | 270 | S3Init::~S3Init() {} 271 | 272 | std::shared_ptr S3Init::initializeS3Client() { 273 | std::lock_guard lock(this->initialization_lock_); 274 | if (this->s3_client_.get() == nullptr) { 275 | Aws::SDKOptions options; 276 | Aws::InitAPI(options); 277 | 278 | // Set up the request 279 | this->s3_client_ = 280 | std::shared_ptr(new Aws::S3::S3Client( 281 | setUpS3Config(), 282 | Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, 283 | false)); 284 | } 285 | return this->s3_client_; 286 | } 287 | 288 | std::shared_ptr 289 | S3Init::initializeExecutor() { 290 | if (this->executor_.get() == nullptr) { 291 | this->executor_ = 292 | Aws::MakeShared( 293 | "executor", executorPoolSize); 294 | } 295 | return this->executor_; 296 | } 297 | 298 | std::shared_ptr 299 | S3Init::initializeTransferManager() { 300 | std::shared_ptr s3_client = initializeS3Client(); 301 | std::lock_guard lock(this->initialization_lock_); 302 | 303 | if (this->transfer_manager_.get() == nullptr) { 304 | Aws::Transfer::TransferManagerConfiguration transfer_config( 305 | initializeExecutor().get()); 306 | transfer_config.s3Client = s3_client; 307 | // This buffer is what we used to initialize streambuf and is in memory 308 | transfer_config.bufferSize = s3MultiPartDownloadChunkSize; 309 | transfer_config.transferBufferMaxHeapSize = 310 | (executorPoolSize + 1) * s3MultiPartDownloadChunkSize; 311 | this->transfer_manager_ = 312 | Aws::Transfer::TransferManager::Create(transfer_config); 313 | } 314 | return this->transfer_manager_; 315 | } 316 | 317 | void S3Init::s3_read(const std::string &file_url, std::string *result) { 318 | std::string bucket, object; 319 | parseS3Path(file_url, &bucket, &object); 320 | S3FS s3handler(bucket, object, multi_part_download_, 321 | initializeTransferManager(), initializeS3Client()); 322 | 323 | uint64_t offset = 0; 324 | uint64_t result_size = 0; 325 | uint64_t file_size = this->get_file_size(bucket, object); 326 | std::size_t part_count = (std::max)( 327 | static_cast((file_size + buffer_size_ - 1) / buffer_size_), 328 | static_cast(1)); 329 | result->resize(file_size); 330 | 331 | for (int i = 0; i < part_count; i++) { 332 | 333 | offset = result_size; 334 | 335 | size_t buf_len = std::min(buffer_size_, file_size - result_size); 336 | 337 | size_t read_len = 338 | s3handler.read(offset, buf_len, (char *)(result->data()) + offset); 339 | 340 | result_size += read_len; 341 | 342 | if (result_size == file_size) { 343 | break; 344 | } 345 | 346 | if (read_len != buf_len) { 347 | std::cout << "Result size and buffer size did not match"; 348 | break; 349 | } 350 | } 351 | } 352 | 353 | bool S3Init::file_exists(const std::string &file_url) { 354 | std::string bucket, object; 355 | parseS3Path(file_url, &bucket, &object); 356 | Aws::S3::Model::HeadObjectRequest headObjectRequest; 357 | headObjectRequest.WithBucket(bucket.c_str()).WithKey(object.c_str()); 358 | auto headObjectOutcome = 359 | this->initializeS3Client()->HeadObject(headObjectRequest); 360 | if (headObjectOutcome.IsSuccess()) { 361 | return true; 362 | } 363 | return 
false; 364 | } 365 | 366 | size_t S3Init::get_file_size(const std::string &bucket, 367 | const std::string &object) { 368 | Aws::S3::Model::HeadObjectRequest headObjectRequest; 369 | headObjectRequest.WithBucket(bucket.c_str()).WithKey(object.c_str()); 370 | auto headObjectOutcome = 371 | this->initializeS3Client()->HeadObject(headObjectRequest); 372 | if (headObjectOutcome.IsSuccess()) { 373 | return headObjectOutcome.GetResult().GetContentLength(); 374 | } 375 | Aws::String const &error_aws = headObjectOutcome.GetError().GetMessage(); 376 | std::string error_str(error_aws.c_str(), error_aws.size()); 377 | throw std::invalid_argument(error_str); 378 | return 0; 379 | } 380 | 381 | size_t S3Init::get_file_size(const std::string &file_url){ 382 | std::string bucket, object; 383 | parseS3Path(file_url, &bucket, &object); 384 | return this->get_file_size(bucket, object); 385 | } 386 | 387 | void S3Init::list_files(const std::string &file_url, 388 | std::vector *filenames) { 389 | std::string bucket, prefix; 390 | parseS3Path(file_url, &bucket, &prefix); 391 | Aws::String default_key = ""; 392 | if (prefix.empty()) { 393 | default_key = "/"; 394 | } 395 | 396 | Aws::S3::Model::ListObjectsRequest listObjectsRequest; 397 | listObjectsRequest.WithBucket(bucket.c_str()) 398 | .WithPrefix(prefix.c_str()) 399 | .WithMaxKeys(S3GetFilesMaxKeys); 400 | 401 | Aws::S3::Model::ListObjectsResult listObjectsResult; 402 | do { 403 | auto listObjectsOutcome = 404 | this->initializeS3Client()->ListObjects(listObjectsRequest); 405 | if (!listObjectsOutcome.IsSuccess()) { 406 | Aws::String const &error_aws = 407 | listObjectsOutcome.GetError().GetMessage(); 408 | std::string error_str(error_aws.c_str(), error_aws.size()); 409 | throw std::invalid_argument(error_str); 410 | } 411 | 412 | listObjectsResult = listObjectsOutcome.GetResult(); 413 | Aws::Vector objects = listObjectsResult.GetContents(); 414 | if (!objects.empty()) { 415 | for (const auto &object : objects) { 416 | Aws::String key = default_key + object.GetKey(); 417 | if (key.back() == '/') { 418 | continue; 419 | } 420 | Aws::String bucket_aws(bucket.c_str(), bucket.size()); 421 | Aws::String entry = "s3://" + bucket_aws + "/" + object.GetKey(); 422 | filenames->push_back(entry.c_str()); 423 | } 424 | listObjectsRequest.SetMarker(listObjectsResult.GetContents().back().GetKey()); 425 | } 426 | } while (listObjectsResult.GetIsTruncated()); 427 | } 428 | 429 | } // namespace awsio 430 | -------------------------------------------------------------------------------- /awsio/csrc/io/s3/s3_io.h: -------------------------------------------------------------------------------- 1 | // Original Copyright 2015 The TensorFlow Authors. Licensed under the Apache License, Version 2.0 2 | // Modifications Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | 4 | // Licensed under the Apache License, Version 2.0 (the "License"). 5 | // You may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | 16 | #ifndef AWSIO_S3_IO_H 17 | #define AWSIO_S3_IO_H 18 | 19 | #include <aws/core/Aws.h> 20 | #include <aws/core/utils/threading/Executor.h> 21 | #include <aws/s3/S3Client.h> 22 | #include <aws/transfer/TransferManager.h> 23 | 24 | #include <mutex> 25 | 26 | namespace awsio { 27 | // In-memory stream implementation 28 | class S3UnderlyingStream : public Aws::IOStream { 29 | public: 30 | using Base = Aws::IOStream; 31 | 32 | // provide a caller-controlled streambuf so that all transferred 33 | // data is written into this in-memory buffer. 34 | S3UnderlyingStream(std::streambuf *buf) : Base(buf) {} 35 | 36 | virtual ~S3UnderlyingStream() = default; 37 | }; 38 | 39 | class S3Init { 40 | private: 41 | std::shared_ptr<Aws::S3::S3Client> s3_client_; 42 | std::shared_ptr<Aws::Utils::Threading::PooledThreadExecutor> executor_; 43 | std::shared_ptr<Aws::Transfer::TransferManager> transfer_manager_; 44 | size_t buffer_size_; 45 | bool multi_part_download_; 46 | 47 | size_t get_file_size(const std::string &bucket, const std::string &object); 48 | 49 | public: 50 | S3Init(); 51 | 52 | ~S3Init(); 53 | 54 | std::mutex initialization_lock_; 55 | 56 | std::shared_ptr<Aws::S3::S3Client> initializeS3Client(); 57 | std::shared_ptr<Aws::Utils::Threading::PooledThreadExecutor> 58 | initializeExecutor(); 59 | std::shared_ptr<Aws::Transfer::TransferManager> initializeTransferManager(); 60 | 61 | void s3_read(const std::string &file_url, std::string *result); 62 | size_t get_file_size(const std::string &file_url); 63 | bool file_exists(const std::string &file_url); 64 | void list_files(const std::string &file_url, 65 | std::vector<std::string> *filenames); 66 | }; 67 | } // namespace awsio 68 | 69 | #endif // AWSIO_S3_IO_H 70 | -------------------------------------------------------------------------------- /awsio/python/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from . import lib 16 | -------------------------------------------------------------------------------- /awsio/python/lib/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from . import io 16 | -------------------------------------------------------------------------------- /awsio/python/lib/io/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License").
4 | # You may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from . import s3 16 | -------------------------------------------------------------------------------- /awsio/python/lib/io/s3/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .s3dataset import S3Dataset, S3IterableDataset, ShuffleDataset 16 | from .s3dataset import list_files, get_file_size, file_exists 17 | -------------------------------------------------------------------------------- /awsio/python/lib/io/s3/s3dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import tarfile 16 | import io 17 | import zipfile 18 | import re 19 | from torch.utils.data import IterableDataset, Dataset 20 | import torch 21 | import torch.distributed as dist 22 | import _pywrap_s3_io 23 | import random 24 | from itertools import chain 25 | 26 | meta_prefix = "__" 27 | meta_suffix = "__" 28 | 29 | def reraise_exception(exn): # pragma: no cover 30 | """Called in an exception handler to re-raise the exception.""" 31 | raise exn 32 | 33 | 34 | def tardata(fileobj, skip_meta=r"__[^/]*__($|/)", handler=reraise_exception): 35 | """Iterator yielding filename, content pairs for the given tar stream. 
36 | """ 37 | # eliminated from test coverage since checking requires invalid tarfile 38 | try: 39 | stream = tarfile.open(fileobj=io.BytesIO(fileobj), mode="r|*") 40 | for tarinfo in stream: 41 | try: 42 | if not tarinfo.isreg(): # pragma: no cover 43 | continue 44 | fname = tarinfo.name 45 | if fname is None: # pragma: no cover 46 | continue 47 | if ("/" not in fname and fname.startswith(meta_prefix) 48 | and fname.endswith(meta_suffix)): # pragma: no cover 49 | # skipping metadata for now 50 | continue 51 | if skip_meta is not None and re.match(skip_meta, fname): # pragma: no cover 52 | continue 53 | data = stream.extractfile(tarinfo).read() 54 | yield fname, data 55 | except Exception as exn: # pragma: no cover 56 | if handler(exn): 57 | continue 58 | else: 59 | break 60 | del stream 61 | except Exception as exn: # pragma: no cover 62 | handler(exn) 63 | 64 | 65 | def zipdata(fileobj, handler=reraise_exception): 66 | """Iterator yielding filename, content pairs for the given zip stream. 67 | """ 68 | # eliminated from test coverage since checking requires invalid zipfile 69 | try: 70 | with zipfile.ZipFile(io.BytesIO(fileobj), 'r') as zfile: 71 | try: 72 | for file_ in zfile.namelist(): 73 | data = zfile.read(file_) 74 | yield file_, data 75 | except Exception as exn: # pragma: no cover 76 | print("Error:", exn) 77 | except Exception as exn: # pragma: no cover 78 | print("Error:", exn) 79 | 80 | 81 | def file_exists(url): 82 | """Return if file exists or not""" 83 | handler = _pywrap_s3_io.S3Init() 84 | return handler.file_exists(url) 85 | 86 | 87 | def get_file_size(url): 88 | """Return the file size of the specified file""" 89 | handler = _pywrap_s3_io.S3Init() 90 | return handler.get_file_size(url) 91 | 92 | 93 | def list_files(url): 94 | """Returns a list of entries under the same prefix. 95 | """ 96 | handler = _pywrap_s3_io.S3Init() 97 | return handler.list_files(url) 98 | 99 | 100 | class S3BaseClass(object): 101 | """A base class for defining urls_list for S3Dataset and S3IterableDataset 102 | """ 103 | def __init__(self, urls_list): 104 | urls = [urls_list] if isinstance(urls_list, str) else urls_list 105 | self._urls_list = self.create_urls_list(urls) 106 | 107 | def create_urls_list(self, urls): 108 | handler = _pywrap_s3_io.S3Init() 109 | urls_list = list() 110 | for url in urls: 111 | if not handler.file_exists(url): 112 | url_objects = handler.list_files(url) 113 | assert len(url_objects) != 0, \ 114 | f"The directory {url} does not contain any objects." 115 | urls_list.extend(url_objects) 116 | elif urls_list: 117 | urls_list.append(url) 118 | else: 119 | urls_list = [url] 120 | return urls_list 121 | 122 | @property 123 | def urls_list(self): 124 | return self._urls_list 125 | 126 | 127 | class S3Dataset(S3BaseClass, Dataset): 128 | """A mapped-style dataset for objects in s3. 129 | """ 130 | def __init__(self, urls_list): 131 | """ 132 | Args: 133 | urls_list (string or list of strings): the prefix(es) and 134 | filenames starting with 's3://'. Each string is assumed 135 | as a filename first. If the file doesn't exist, the string 136 | is assumed as a prefix. 
137 | """ 138 | S3BaseClass.__init__(self, urls_list) 139 | # Initialize the handler in the worker since we want each worker to have 140 | # it's own handler 141 | self.handler = None 142 | 143 | def __len__(self): 144 | return len(self.urls_list) 145 | 146 | def __getitem__(self, idx): 147 | if self.handler == None: 148 | self.handler = _pywrap_s3_io.S3Init() 149 | filename = self.urls_list[idx] 150 | fileobj = self.handler.s3_read(filename) 151 | return filename, fileobj 152 | 153 | 154 | class S3IterableDataset(S3BaseClass, IterableDataset): 155 | """Iterate over s3 dataset. 156 | It handles some bookkeeping related to DataLoader. 157 | """ 158 | def __init__(self, urls_list, shuffle_urls=False): 159 | self.epoch = 0 160 | self.shuffle_urls = shuffle_urls 161 | self.dist = dist.is_initialized() if dist.is_available() else False 162 | if self.dist: 163 | self.world_size = dist.get_world_size() 164 | self.rank = dist.get_rank() 165 | S3BaseClass.__init__(self, urls_list) 166 | 167 | @property 168 | def shuffled_list(self): 169 | if self.shuffle_urls: 170 | random.seed(self.epoch) 171 | return random.sample(self.urls_list, len(self.urls_list)) 172 | else: 173 | return self.urls_list 174 | 175 | def download_data(self, filename): 176 | if filename[-3:] == "tar": 177 | tarfile = tardata(self.handler.s3_read(filename)) 178 | for fname, content in tarfile: 179 | yield fname, content 180 | elif filename[-3:] == "zip": 181 | zipfile = zipdata(self.handler.s3_read(filename)) 182 | for fname, content in zipfile: 183 | yield fname, content 184 | else: 185 | yield filename, self.handler.s3_read(filename) 186 | 187 | def get_stream(self, urls_list): 188 | return chain.from_iterable(map(self.download_data, urls_list)) 189 | 190 | def worker_dist(self, urls): 191 | if self.dist: 192 | total_size = len(urls) 193 | urls = urls[self.rank:total_size:self.world_size] 194 | 195 | worker_info = torch.utils.data.get_worker_info() 196 | if worker_info is not None: 197 | wid = worker_info.id 198 | num_workers = worker_info.num_workers 199 | length = len(urls) 200 | return urls[wid:length:num_workers] 201 | else: 202 | return urls 203 | 204 | def __iter__(self): 205 | self.handler = _pywrap_s3_io.S3Init() 206 | urls = self.worker_dist(self.shuffled_list) 207 | return self.get_stream(urls) 208 | 209 | def __len__(self): 210 | return len(self.urls_list) 211 | 212 | def set_epoch(self, epoch): 213 | self.epoch = epoch 214 | 215 | 216 | class ShuffleDataset(torch.utils.data.IterableDataset): 217 | def __init__(self, dataset, buffer_size): 218 | super().__init__() 219 | self.dataset = dataset 220 | self.buffer_size = buffer_size 221 | 222 | def __iter__(self): 223 | shufbuf = [] 224 | try: 225 | dataset_iter = iter(self.dataset) 226 | for _ in range(self.buffer_size): 227 | shufbuf.append(next(dataset_iter)) 228 | except StopIteration: 229 | self.buffer_size = len(shufbuf) 230 | 231 | try: 232 | while True: 233 | try: 234 | if self.buffer_size == 0: 235 | break 236 | evict_idx = random.randint(0, self.buffer_size - 1) 237 | yield shufbuf.pop(evict_idx) 238 | item = next(dataset_iter) 239 | shufbuf.append(item) 240 | except StopIteration: 241 | break 242 | while len(shufbuf) > 0: 243 | evict_idx = random.randint(0, len(shufbuf) - 1) 244 | yield shufbuf.pop(evict_idx) 245 | except GeneratorExit: # pragma: no cover 246 | pass 247 | -------------------------------------------------------------------------------- /examples/s3_cv_iterable_example.py: 
-------------------------------------------------------------------------------- 1 | from torch.utils.data import IterableDataset, DataLoader 2 | from awsio.python.lib.io.s3.s3dataset import S3IterableDataset 3 | from itertools import islice 4 | from PIL import Image 5 | import io 6 | from torchvision import transforms 7 | 8 | 9 | class ImageNetS3(IterableDataset): 10 | def __init__(self, url_list, shuffle_urls=False, transform=None): 11 | self.s3_iter_dataset = S3IterableDataset(url_list, 12 | shuffle_urls) 13 | self.transform = transform 14 | 15 | 16 | def data_generator(self): 17 | try: 18 | while True: 19 | # Based on aplhabetical order of files sequence of label and image will change. 20 | # e.g. for files 0186304.cls 0186304.jpg, 0186304.cls will be fetched first 21 | label_fname, label_fobj = next(self.s3_iter_dataset_iterator) 22 | image_fname, image_fobj = next(self.s3_iter_dataset_iterator) 23 | label = int(label_fobj) 24 | image_np = Image.open(io.BytesIO(image_fobj)).convert('RGB') 25 | 26 | # Apply torch visioin transforms if provided 27 | if self.transform is not None: 28 | image_np = self.transform(image_np) 29 | yield image_np, label 30 | 31 | except StopIteration: 32 | raise StopIteration 33 | 34 | def __iter__(self): 35 | self.s3_iter_dataset_iterator = iter(self.s3_iter_dataset) 36 | return self.data_generator() 37 | 38 | batch_size = 32 39 | 40 | url_list = ["s3://image-data-bucket/imagenet-train-000000.tar"] 41 | # Torchvision transforms to apply on data 42 | 43 | preproc = transforms.Compose([ 44 | transforms.ToTensor(), 45 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 46 | transforms.Resize((100, 100)) 47 | ]) 48 | 49 | dataset = ImageNetS3(url_list, transform=preproc) 50 | 51 | dataloader = DataLoader(dataset, 52 | batch_size=batch_size, 53 | num_workers=64) 54 | 55 | for image, label in islice(dataset, 0, 3): 56 | print(image.shape, label) -------------------------------------------------------------------------------- /examples/s3_cv_iterable_shuffle_example.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import IterableDataset, DataLoader 2 | from awsio.python.lib.io.s3.s3dataset import S3IterableDataset 3 | from PIL import Image 4 | import io 5 | from torchvision import transforms 6 | 7 | 8 | class ImageNetS3(IterableDataset): 9 | def __init__(self, url_list, shuffle_urls=False, transform=None): 10 | self.s3_iter_dataset = S3IterableDataset(url_list, 11 | shuffle_urls) 12 | self.transform = transform 13 | 14 | 15 | def data_generator(self): 16 | try: 17 | while True: 18 | # Based on aplhabetical order of files sequence of label and image will change. 19 | # e.g. 
for files 0186304.cls 0186304.jpg, 0186304.cls will be fetched first 20 | label_fname, label_fobj = next(self.s3_iter_dataset_iterator) 21 | image_fname, image_fobj = next(self.s3_iter_dataset_iterator) 22 | label = int(label_fobj) 23 | image_np = Image.open(io.BytesIO(image_fobj)).convert('RGB') 24 | 25 | # Apply torch visioin transforms if provided 26 | if self.transform is not None: 27 | image_np = self.transform(image_np) 28 | yield image_np, label 29 | 30 | except StopIteration: 31 | raise StopIteration 32 | 33 | def set_epoch(self, epoch): 34 | self.s3_iter_dataset.set_epoch(epoch) 35 | 36 | def __iter__(self): 37 | self.s3_iter_dataset_iterator = iter(self.s3_iter_dataset) 38 | return self.data_generator() 39 | 40 | 41 | url_list = ["s3://pt-s3plugin-test-data-west2/integration_tests/imagenet-train-000000.tar"] 42 | # Torchvision transforms to apply on data 43 | 44 | preproc = transforms.Compose([ 45 | transforms.ToTensor(), 46 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 47 | transforms.Resize((100, 100)) 48 | ]) 49 | 50 | dataset = ImageNetS3(url_list, transform=preproc, shuffle_urls=True) 51 | 52 | dataloader = DataLoader(dataset, num_workers=4, batch_size=32) 53 | 54 | for e in range(5): 55 | dataset.set_epoch(e) 56 | -------------------------------------------------------------------------------- /examples/s3_cv_map_example.py: -------------------------------------------------------------------------------- 1 | 2 | from awsio.python.lib.io.s3.s3dataset import S3Dataset 3 | from torch.utils.data import DataLoader 4 | 5 | url_list = ['s3://image-data-bucket/train/n01440764/n01440764_10026.JPEG', 6 | 's3://image-data-bucket/train/n01440764/n01440764_10027.JPEG', 7 | 's3://image-data-bucket/train/n01440764/n01440764_10029.JPEG'] 8 | 9 | dataset = S3Dataset(url_list) 10 | dataloader = DataLoader(dataset, 11 | batch_size=2, 12 | num_workers=64) 13 | 14 | for i, (image, label) in enumerate(dataloader): 15 | print(type(image), len(image)) 16 | 17 | -------------------------------------------------------------------------------- /examples/s3_cv_transform.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import DataLoader 2 | from awsio.python.lib.io.s3.s3dataset import S3Dataset 3 | from torchvision import transforms 4 | from PIL import Image 5 | import io 6 | 7 | url_list = ['s3://pt-s3plugin-test-data-west2/images/test_9970.JPEG', 8 | 's3://pt-s3plugin-test-data-west2/images/test_9971.JPEG', 9 | 's3://pt-s3plugin-test-data-west2/images/test_9972.JPEG'] 10 | 11 | class S3ImageSet(S3Dataset): 12 | def __init__(self, url, transform=None): 13 | super().__init__(url) 14 | self.transform = transform 15 | 16 | def __getitem__(self, idx) : 17 | img_name, img = super(S3ImageSet, self).__getitem__(idx) 18 | img = Image.open(io.BytesIO(img)).convert('RGB') 19 | if self.transform is not None: 20 | img = self.transform(img) 21 | return img 22 | 23 | preproc = transforms.Compose([ 24 | transforms.ToTensor(), 25 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 26 | transforms.Resize((100, 100)) 27 | ]) 28 | dataset = S3ImageSet(url_list,transform=preproc) 29 | 30 | dataloader = DataLoader(dataset, 31 | batch_size=2, 32 | num_workers=64) 33 | 34 | for i in range(len(dataset)): 35 | print(dataset[i]) 36 | -------------------------------------------------------------------------------- /examples/s3_imagenet_example.py: 
-------------------------------------------------------------------------------- 1 | # Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | # Based on https://github.com/pytorch/examples/blob/master/imagenet/main.py 15 | 16 | import argparse 17 | import os 18 | import random 19 | import shutil 20 | import time 21 | import warnings 22 | 23 | import torch 24 | import torch.nn as nn 25 | import torch.nn.parallel 26 | import torch.backends.cudnn as cudnn 27 | import torch.distributed as dist 28 | import torch.optim 29 | import torch.multiprocessing as mp 30 | import torch.utils.data 31 | import torch.utils.data.distributed 32 | import torchvision.transforms as transforms 33 | #import torchvision.datasets as datasets 34 | from torch.utils.data import IterableDataset, DataLoader 35 | from awsio.python.lib.io.s3.s3dataset import S3IterableDataset 36 | 37 | import torchvision.models as models 38 | from PIL import Image 39 | import io 40 | from itertools import islice 41 | 42 | model_names = sorted(name for name in models.__dict__ 43 | if name.islower() and not name.startswith("__") 44 | and callable(models.__dict__[name])) 45 | 46 | parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') 47 | 48 | parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18', 49 | choices=model_names, 50 | help='model architecture: ' + 51 | ' | '.join(model_names) + 52 | ' (default: resnet18)') 53 | parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', 54 | help='number of data loading workers (default: 4)') 55 | parser.add_argument('--epochs', default=2, type=int, metavar='N', 56 | help='number of total epochs to run') 57 | parser.add_argument('--start-epoch', default=0, type=int, metavar='N', 58 | help='manual epoch number (useful on restarts)') 59 | parser.add_argument('-b', '--batch-size', default=256, type=int, 60 | metavar='N', 61 | help='mini-batch size (default: 256), this is the total ' 62 | 'batch size of all GPUs on the current node when ' 63 | 'using Data Parallel or Distributed Data Parallel') 64 | parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, 65 | metavar='LR', help='initial learning rate', dest='lr') 66 | parser.add_argument('--momentum', default=0.9, type=float, metavar='M', 67 | help='momentum') 68 | parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, 69 | metavar='W', help='weight decay (default: 1e-4)', 70 | dest='weight_decay') 71 | parser.add_argument('-p', '--print-freq', default=10, type=int, 72 | metavar='N', help='print frequency (default: 10)') 73 | parser.add_argument('--resume', default='', type=str, metavar='PATH', 74 | help='path to latest checkpoint (default: none)') 75 | parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', 76 | help='evaluate model on validation set') 77 | parser.add_argument('--pretrained', dest='pretrained', action='store_true', 78 | help='use pre-trained model') 79 | 
parser.add_argument('--world-size', default=-1, type=int, 80 | help='number of nodes for distributed training') 81 | parser.add_argument('--rank', default=-1, type=int, 82 | help='node rank for distributed training') 83 | parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, 84 | help='url used to set up distributed training') 85 | parser.add_argument('--dist-backend', default='nccl', type=str, 86 | help='distributed backend') 87 | parser.add_argument('--seed', default=None, type=int, 88 | help='seed for initializing training. ') 89 | parser.add_argument('--gpu', default=None, type=int, 90 | help='GPU id to use.') 91 | parser.add_argument('--multiprocessing-distributed', action='store_true', 92 | help='Use multi-processing distributed training to launch ' 93 | 'N processes per node, which has N GPUs. This is the ' 94 | 'fastest way to use PyTorch for either single node or ' 95 | 'multi node data parallel training') 96 | 97 | best_acc1 = 0 98 | 99 | 100 | class ImageNetS3(IterableDataset): 101 | def __init__(self, url_list, shuffle_urls=False, transform=None): 102 | self.s3_iter_dataset = S3IterableDataset(url_list, 103 | shuffle_urls) 104 | self.transform = transform 105 | 106 | 107 | def data_generator(self): 108 | try: 109 | while True: 110 | # Based on aplhabetical order of files sequence of label and image will change. 111 | # e.g. for files 0186304.cls 0186304.jpg, 0186304.cls will be fetched first 112 | label_fname, label_fobj = next(self.s3_iter_dataset_iterator) 113 | image_fname, image_fobj = next(self.s3_iter_dataset_iterator) 114 | label = int(label_fobj) 115 | image_np = Image.open(io.BytesIO(image_fobj)).convert('RGB') 116 | 117 | # Apply torch visioin transforms if provided 118 | if self.transform is not None: 119 | image_np = self.transform(image_np) 120 | yield image_np, label 121 | 122 | except StopIteration: 123 | return 124 | 125 | def __iter__(self): 126 | self.s3_iter_dataset_iterator = iter(self.s3_iter_dataset) 127 | return self.data_generator() 128 | 129 | def __len__(self): 130 | return 1000 131 | 132 | def main(): 133 | args = parser.parse_args() 134 | 135 | if args.seed is not None: 136 | random.seed(args.seed) 137 | torch.manual_seed(args.seed) 138 | cudnn.deterministic = True 139 | warnings.warn('You have chosen to seed training. ' 140 | 'This will turn on the CUDNN deterministic setting, ' 141 | 'which can slow down your training considerably! ' 142 | 'You may see unexpected behavior when restarting ' 143 | 'from checkpoints.') 144 | 145 | if args.gpu is not None: 146 | warnings.warn('You have chosen a specific GPU. 
This will completely ' 147 | 'disable data parallelism.') 148 | 149 | if args.dist_url == "env://" and args.world_size == -1: 150 | args.world_size = int(os.environ["WORLD_SIZE"]) 151 | 152 | args.distributed = args.world_size > 1 or args.multiprocessing_distributed 153 | 154 | ngpus_per_node = torch.cuda.device_count() 155 | if args.multiprocessing_distributed: 156 | # Since we have ngpus_per_node processes per node, the total world_size 157 | # needs to be adjusted accordingly 158 | args.world_size = ngpus_per_node * args.world_size 159 | # Use torch.multiprocessing.spawn to launch distributed processes: the 160 | # main_worker process function 161 | mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) 162 | else: 163 | # Simply call main_worker function 164 | main_worker(args.gpu, ngpus_per_node, args) 165 | 166 | 167 | def main_worker(gpu, ngpus_per_node, args): 168 | global best_acc1 169 | args.gpu = gpu 170 | 171 | if args.gpu is not None: 172 | print("Use GPU: {} for training".format(args.gpu)) 173 | 174 | if args.distributed: 175 | if args.dist_url == "env://" and args.rank == -1: 176 | args.rank = int(os.environ["RANK"]) 177 | if args.multiprocessing_distributed: 178 | # For multiprocessing distributed training, rank needs to be the 179 | # global rank among all the processes 180 | args.rank = args.rank * ngpus_per_node + gpu 181 | dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, 182 | world_size=args.world_size, rank=args.rank) 183 | # create model 184 | if args.pretrained: 185 | print("=> using pre-trained model '{}'".format(args.arch)) 186 | model = models.__dict__[args.arch](pretrained=True) 187 | else: 188 | print("=> creating model '{}'".format(args.arch)) 189 | model = models.__dict__[args.arch]() 190 | 191 | if not torch.cuda.is_available(): 192 | print('using CPU, this will be slow') 193 | elif args.distributed: 194 | # For multiprocessing distributed, DistributedDataParallel constructor 195 | # should always set the single device scope, otherwise, 196 | # DistributedDataParallel will use all available devices. 
197 | if args.gpu is not None: 198 | torch.cuda.set_device(args.gpu) 199 | model.cuda(args.gpu) 200 | # When using a single GPU per process and per 201 | # DistributedDataParallel, we need to divide the batch size 202 | # ourselves based on the total number of GPUs we have 203 | args.batch_size = int(args.batch_size / ngpus_per_node) 204 | args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) 205 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) 206 | else: 207 | model.cuda() 208 | # DistributedDataParallel will divide and allocate batch_size to all 209 | # available GPUs if device_ids are not set 210 | model = torch.nn.parallel.DistributedDataParallel(model) 211 | elif args.gpu is not None: 212 | torch.cuda.set_device(args.gpu) 213 | model = model.cuda(args.gpu) 214 | else: 215 | # DataParallel will divide and allocate batch_size to all available GPUs 216 | if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): 217 | model.features = torch.nn.DataParallel(model.features) 218 | model.cuda() 219 | else: 220 | model = torch.nn.DataParallel(model).cuda() 221 | 222 | # define loss function (criterion) and optimizer 223 | criterion = nn.CrossEntropyLoss().cuda(args.gpu) 224 | 225 | optimizer = torch.optim.SGD(model.parameters(), args.lr, 226 | momentum=args.momentum, 227 | weight_decay=args.weight_decay) 228 | 229 | # optionally resume from a checkpoint 230 | if args.resume: 231 | if os.path.isfile(args.resume): 232 | print("=> loading checkpoint '{}'".format(args.resume)) 233 | if args.gpu is None: 234 | checkpoint = torch.load(args.resume) 235 | else: 236 | # Map model to be loaded to specified single gpu. 237 | loc = 'cuda:{}'.format(args.gpu) 238 | checkpoint = torch.load(args.resume, map_location=loc) 239 | args.start_epoch = checkpoint['epoch'] 240 | best_acc1 = checkpoint['best_acc1'] 241 | if args.gpu is not None: 242 | # best_acc1 may be from a checkpoint from a different GPU 243 | best_acc1 = best_acc1.to(args.gpu) 244 | model.load_state_dict(checkpoint['state_dict']) 245 | optimizer.load_state_dict(checkpoint['optimizer']) 246 | print("=> loaded checkpoint '{}' (epoch {})" 247 | .format(args.resume, checkpoint['epoch'])) 248 | else: 249 | print("=> no checkpoint found at '{}'".format(args.resume)) 250 | 251 | cudnn.benchmark = True 252 | 253 | url_list = ["s3://pt-s3plugin-test-data-west2/integration_tests/imagenet-train-000000.tar"] 254 | 255 | preproc = transforms.Compose([ 256 | transforms.RandomResizedCrop(224), 257 | transforms.RandomHorizontalFlip(), 258 | transforms.ToTensor(), 259 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 260 | ]) 261 | 262 | os.environ['AWS_REGION'] = 'us-west-2' 263 | 264 | train_dataset = ImageNetS3(url_list, transform=preproc) 265 | train_sampler = None 266 | 267 | train_loader = torch.utils.data.DataLoader( 268 | train_dataset, batch_size=args.batch_size, 269 | num_workers=args.workers, pin_memory=True, sampler=train_sampler) 270 | 271 | for epoch in range(args.start_epoch, args.epochs): 272 | if args.distributed: 273 | train_sampler.set_epoch(epoch) 274 | adjust_learning_rate(optimizer, epoch, args) 275 | 276 | # train for one epoch 277 | train(train_loader, model, criterion, optimizer, epoch, args) 278 | 279 | 280 | def train(train_loader, model, criterion, optimizer, epoch, args): 281 | batch_time = AverageMeter('Time', ':6.3f') 282 | data_time = AverageMeter('Data', ':6.3f') 283 | losses = AverageMeter('Loss', ':.4e') 284 | top1 = AverageMeter('Acc@1', ':6.2f') 
285 | top5 = AverageMeter('Acc@5', ':6.2f') 286 | progress = ProgressMeter( 287 | len(train_loader), 288 | [batch_time, data_time, losses, top1, top5], 289 | prefix="Epoch: [{}]".format(epoch)) 290 | 291 | # switch to train mode 292 | model.train() 293 | 294 | end = time.time() 295 | for i, (images, target) in enumerate(train_loader): 296 | # measure data loading time 297 | data_time.update(time.time() - end) 298 | 299 | if args.gpu is not None: 300 | images = images.cuda(args.gpu, non_blocking=True) 301 | if torch.cuda.is_available(): 302 | target = target.cuda(args.gpu, non_blocking=True) 303 | 304 | # compute output 305 | output = model(images) 306 | loss = criterion(output, target) 307 | 308 | # measure accuracy and record loss 309 | acc1, acc5 = accuracy(output, target, topk=(1, 5)) 310 | losses.update(loss.item(), images.size(0)) 311 | top1.update(acc1[0], images.size(0)) 312 | top5.update(acc5[0], images.size(0)) 313 | 314 | # compute gradient and do SGD step 315 | optimizer.zero_grad() 316 | loss.backward() 317 | optimizer.step() 318 | 319 | # measure elapsed time 320 | batch_time.update(time.time() - end) 321 | end = time.time() 322 | 323 | if i % args.print_freq == 0: 324 | progress.display(i) 325 | 326 | 327 | class AverageMeter(object): 328 | """Computes and stores the average and current value""" 329 | def __init__(self, name, fmt=':f'): 330 | self.name = name 331 | self.fmt = fmt 332 | self.reset() 333 | 334 | def reset(self): 335 | self.val = 0 336 | self.avg = 0 337 | self.sum = 0 338 | self.count = 0 339 | 340 | def update(self, val, n=1): 341 | self.val = val 342 | self.sum += val * n 343 | self.count += n 344 | self.avg = self.sum / self.count 345 | 346 | def __str__(self): 347 | fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' 348 | return fmtstr.format(**self.__dict__) 349 | 350 | 351 | class ProgressMeter(object): 352 | def __init__(self, num_batches, meters, prefix=""): 353 | self.batch_fmtstr = self._get_batch_fmtstr(num_batches) 354 | self.meters = meters 355 | self.prefix = prefix 356 | 357 | def display(self, batch): 358 | entries = [self.prefix + self.batch_fmtstr.format(batch)] 359 | entries += [str(meter) for meter in self.meters] 360 | print('\t'.join(entries)) 361 | 362 | def _get_batch_fmtstr(self, num_batches): 363 | num_digits = len(str(num_batches // 1)) 364 | fmt = '{:' + str(num_digits) + 'd}' 365 | return '[' + fmt + '/' + fmt.format(num_batches) + ']' 366 | 367 | 368 | def adjust_learning_rate(optimizer, epoch, args): 369 | """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" 370 | lr = args.lr * (0.1 ** (epoch // 30)) 371 | for param_group in optimizer.param_groups: 372 | param_group['lr'] = lr 373 | 374 | 375 | def accuracy(output, target, topk=(1,)): 376 | """Computes the accuracy over the k top predictions for the specified values of k""" 377 | with torch.no_grad(): 378 | maxk = max(topk) 379 | batch_size = target.size(0) 380 | 381 | _, pred = output.topk(maxk, 1, True, True) 382 | pred = pred.t() 383 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 384 | 385 | res = [] 386 | for k in topk: 387 | correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) 388 | res.append(correct_k.mul_(100.0 / batch_size)) 389 | return res 390 | 391 | 392 | if __name__ == '__main__': 393 | main() 394 | 395 | -------------------------------------------------------------------------------- /examples/s3_nlp_iterable_example.py: -------------------------------------------------------------------------------- 1 | 
import torch 2 | from torch.utils.data import IterableDataset, DataLoader 3 | from itertools import islice 4 | # data is in hdf5 format and converted to numpy 5 | import h5py 6 | import numpy as np 7 | 8 | # packages for this example 9 | import io 10 | from awsio.python.lib.io.s3.s3dataset import S3IterableDataset 11 | 12 | def create_data_samples_from_file(fileobj): 13 | """Convert bytes from S3IterableDataset to numpy arrays. 14 | Helper function for class s3_dataset. 15 | Returns a list of six numpy arrays which each contain 16 | data (by key) for all samples in a file. 17 | Keyword arguments: 18 | fileobj -- the bytes string provided by S3IterableDataset 19 | """ 20 | keys = ['input_ids', 'input_mask', 'segment_ids', \ 21 | 'masked_lm_positions', 'masked_lm_ids', 'next_sentence_labels'] 22 | dataset = io.BytesIO(fileobj) 23 | with h5py.File(dataset, "r") as f: 24 | data_file = [np.asarray(f[key][:]) for key in keys] 25 | return data_file 26 | 27 | 28 | class s3_dataset(IterableDataset): 29 | """Dataset used for training. 30 | Yields one sample at a time. 31 | """ 32 | def __init__(self, s3_directory): 33 | self.s3_directory = s3_directory 34 | self.dataset = S3IterableDataset(self.s3_directory, shuffle_urls=True) 35 | 36 | def data_generator(self): 37 | try: 38 | while True: 39 | filename, fileobj = next(self.dataset_iter) 40 | # data_samples: list of six numpy arrays (each array contains all samples) 41 | data_samples = create_data_samples_from_file(fileobj) 42 | # transpose data_samples so that each index represents one sample 43 | for sample in list(zip(*data_samples)): 44 | yield sample 45 | 46 | except StopIteration as e: 47 | raise e 48 | 49 | def __iter__(self): 50 | self.dataset_iter = iter(self.dataset) 51 | return self.data_generator() 52 | 53 | 54 | s3_directory = "s3://bert-data-bucket/training/wiki_books_corpus_training" 55 | train_dataset = s3_dataset(s3_directory=s3_directory) 56 | for sample in islice(train_dataset, 0, 1): 57 | print(sample) 58 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | 4 | [metadata] 5 | license_file = LICENSE 6 | 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
13 | 14 | 15 | import os 16 | import re 17 | import sys 18 | import platform 19 | import subprocess 20 | 21 | from pathlib import Path 22 | from setuptools import setup, Extension, find_packages 23 | from setuptools.command.build_ext import build_ext 24 | from distutils.version import LooseVersion 25 | 26 | class CMakeExtension(Extension): 27 | def __init__(self, name, sourcedir=''): 28 | Extension.__init__(self, name, sources=[]) 29 | self.sourcedir = os.path.abspath(sourcedir) 30 | 31 | 32 | class CMakeBuild(build_ext): 33 | def run(self): 34 | try: 35 | out = subprocess.check_output(['cmake', '--version']) 36 | except OSError: 37 | raise RuntimeError("CMake must be installed to build the following extensions: " + 38 | ", ".join(e.name for e in self.extensions)) 39 | 40 | if platform.system() == "Windows": 41 | cmake_version = LooseVersion(re.search(r'version\s*([\d.]+)', out.decode()).group(1)) 42 | if cmake_version < '3.1.0': 43 | raise RuntimeError("CMake >= 3.1.0 is required on Windows") 44 | 45 | for ext in self.extensions: 46 | self.build_extension(ext) 47 | 48 | def build_extension(self, ext): 49 | extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name))) 50 | # required for auto-detection of auxiliary "native" libs 51 | if not extdir.endswith(os.path.sep): 52 | extdir += os.path.sep 53 | 54 | cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir, 55 | '-DPYTHON_EXECUTABLE=' + sys.executable, 56 | '-DCMAKE_PREFIX_PATH=' + os.environ['CMAKE_PREFIX_PATH'], 57 | '-DCMAKE_CXX_FLAGS=' + "-fPIC"] 58 | 59 | cfg = 'Debug' if self.debug else 'Release' 60 | build_args = ['--config', cfg] 61 | 62 | if platform.system() == "Windows": 63 | cmake_args += ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}'.format(cfg.upper(), extdir)] 64 | if sys.maxsize > 2**32: 65 | cmake_args += ['-A', 'x64'] 66 | build_args += ['--', '/m'] 67 | else: 68 | cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg] 69 | build_args += ['--', '-j2'] 70 | 71 | env = os.environ.copy() 72 | env['CXXFLAGS'] = '{} -DVERSION_INFO=\\"{}\\"'.format(env.get('CXXFLAGS', ''), 73 | self.distribution.get_version()) 74 | if not os.path.exists(self.build_temp): 75 | os.makedirs(self.build_temp) 76 | subprocess.check_call(['cmake', ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env) 77 | subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp) 78 | 79 | 80 | def get_sha(): 81 | try: 82 | return subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode('ascii').strip() 83 | except Exception: 84 | return 'Unknown' 85 | 86 | def get_version(sha): 87 | version = open('version.txt', 'r').read().strip() 88 | if sha != 'Unknown': 89 | version += '+' + sha[:7] 90 | return version 91 | 92 | def write_version_file(): 93 | sha = get_sha() 94 | version = get_version(sha) 95 | version_path = os.path.join(Path.cwd(), 'awsio', '_version.py') 96 | with open(version_path, 'w') as f: 97 | f.write(f"__version__ = \"{version}\"\n") 98 | 99 | if __name__ == "__main__": 100 | # metadata 101 | package_name = 'awsio' 102 | required_packages = ["torch>=1.5.1"] 103 | 104 | # define __version__ 105 | write_version_file() 106 | exec(open("awsio/_version.py").read()) 107 | print(f"Building wheel for {package_name}-{__version__}") 108 | 109 | with open('README.md') as f: 110 | readme = f.read() 111 | 112 | setup( 113 | name=package_name, 114 | version=__version__, 115 | author='Amazon Web Services', 116 | author_email='aws-pytorch@amazon.com', 117 | description='A package for creating PyTorch Datasets using objects in AWS S3 
buckets', 118 | long_description=readme, 119 | license='Apache License 2.0', 120 | keywords='ML Amazon AWS AI PyTorch', 121 | 122 | # Package info 123 | packages=find_packages(exclude=('test',)), 124 | zip_safe=False, 125 | install_requires=required_packages, 126 | extras_require={ 127 | "scipy": ["scipy"], 128 | }, 129 | ext_modules=[CMakeExtension('aws_io')], 130 | cmdclass=dict(build_ext=CMakeBuild), 131 | classifiers=[ 132 | "Programming Language :: Python :: 3", 133 | "License :: OSI Approved :: Apache Software License", 134 | "Operating System :: OS Independent", 135 | ], 136 | ) 137 | -------------------------------------------------------------------------------- /tests/py-tests/test_integration.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import io 16 | import math 17 | import boto3 18 | from collections import defaultdict 19 | from torch.utils.data import DataLoader 20 | 21 | from awsio.python.lib.io.s3.s3dataset import S3Dataset, S3IterableDataset, ShuffleDataset 22 | from awsio.python.lib.io.s3.s3dataset import tardata, zipdata 23 | 24 | def read_using_boto(bucket, prefix_list): 25 | s= boto3.client('s3') 26 | s3_obj_set = set() 27 | 28 | for prefix in prefix_list: 29 | fs = io.BytesIO() 30 | s.download_fileobj(bucket, 31 | prefix, 32 | fs) 33 | file_content = fs.getvalue() 34 | 35 | if prefix[-3:] == "tar": 36 | tarfile = tardata(file_content) 37 | for fname, content in tarfile: 38 | s3_obj_set.add((fname, content)) 39 | elif prefix[-3:] == "zip": 40 | zipfile = zipdata(file_content) 41 | for fname, content in zipfile: 42 | s3_obj_set.add((fname, content)) 43 | else: 44 | s3_obj_set.add((prefix.split("/")[-1], file_content)) 45 | return s3_obj_set 46 | 47 | def get_file_list(bucket, files_prefix): 48 | s3 = boto3.resource('s3') 49 | my_bucket = s3.Bucket(bucket) 50 | 51 | file_list = [summary.key for summary in my_bucket.objects.filter(Prefix=files_prefix)] 52 | return file_list[1:] 53 | 54 | def run_workers(dataset_type, url_list, batch_size, boto_obj_set): 55 | epochs = 2 56 | dataset_class = eval(dataset_type) 57 | for num_workers in [ 0, 4, 16]: 58 | s3_obj_set = set() 59 | dataset = dataset_class(url_list) 60 | dataloader = DataLoader(dataset, 61 | batch_size=batch_size, 62 | num_workers=num_workers) 63 | for epoch in range(epochs): 64 | print ("\nTesting " + dataset_type + " with {} workers for epoch {}".format( 65 | num_workers, epoch + 1)) 66 | num_batches = 0 67 | for fname, fobj in dataloader: 68 | fname = [x.split("/")[-1] for x in fname] 69 | batch_set = set(map(tuple, zip(fname, fobj))) 70 | s3_obj_set.update(batch_set) 71 | num_batches += 1 72 | 73 | assert s3_obj_set == boto_obj_set, "Test fails for {} workers for".format( 74 | num_workers) + dataset_type 75 | print ("All data correctly loaded for " + dataset_type + " for {} workers".format(num_workers)) 76 | 77 | 
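# Illustrative sketch (hypothetical bucket and key): the plugin-vs-boto3 equivalence
# that read_using_boto() and run_workers() verify over whole datasets reduces, for a
# single object, to the check below. It reuses the io, boto3 and S3Dataset imports
# already at the top of this module and assumes readable AWS credentials.
def _single_object_matches_boto3(bucket="my-test-bucket", key="data/sample.bin"):
    url = "s3://" + bucket + "/" + key
    # read the object through the plugin's map-style dataset
    _, plugin_bytes = S3Dataset(url)[0]
    # read the same object directly with boto3
    fs = io.BytesIO()
    boto3.client("s3").download_fileobj(bucket, key, fs)
    return plugin_bytes == fs.getvalue()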
def test_tarfiles(): 78 | bucket = "pt-s3plugin-test-data-west2" 79 | tarfiles_list = ["integration_tests/imagenet-train-000000.tar"] 80 | 81 | print("\nINITIATING: TARFILES READ TEST") 82 | boto_obj_set = read_using_boto(bucket, tarfiles_list) 83 | batch_size = 32 84 | url_list = ["s3://" + bucket + "/" + tarfile for tarfile in tarfiles_list] 85 | run_workers("S3IterableDataset", url_list, batch_size, boto_obj_set) 86 | 87 | def test_files(): 88 | bucket = "pt-s3plugin-test-data-west2" 89 | files_prefix = "integration_tests/files" 90 | assert files_prefix[-1] != "/", "Enter Prefix without trailing \"/\" else error" 91 | 92 | prefix_list = get_file_list(bucket, files_prefix) 93 | boto_obj_set = read_using_boto(bucket, prefix_list) 94 | batch_size = 32 95 | 96 | print ("\nINITIATING: INDIVIDUAL FILE READ TEST") 97 | url_list = ["s3://" + bucket + "/" + prefix for prefix in prefix_list] 98 | run_workers("S3IterableDataset", url_list, batch_size, boto_obj_set) 99 | run_workers("S3Dataset", url_list, batch_size, boto_obj_set) 100 | 101 | print ("\nINITIATING: READ FILES FROM PREFIX TEST") 102 | url_list = ["s3://" + bucket + "/" + files_prefix] 103 | run_workers("S3IterableDataset", url_list, batch_size, boto_obj_set) 104 | run_workers("S3Dataset", url_list, batch_size, boto_obj_set) 105 | 106 | def test_shuffleurls(): 107 | """ 108 | Args: 109 | bucket : name of the bucket 110 | files_prefix : prefix of the location where files stored 111 | 112 | Logic: 113 | Loop over dataloader twice, once with shuffle_urls as True and once as False 114 | After both runs, 115 | the dataloaded should be the same, the loading order should be different 116 | 117 | Maintains a dictionary each of sets and lists. 118 | The keys of the dictionary is the state of shuffle_urls(True/False) 119 | Values are the set/list of the samples 120 | 121 | Test passes if the set of samples loaded in both cases is same and the list of 122 | samples is diffrent(loading order different - data being shuffled) 123 | """ 124 | bucket = "pt-s3plugin-test-data-west2" 125 | files_prefix = "integration_tests/files" 126 | assert files_prefix[-1] != "/", "Enter Prefix without trailing \"/\" else error" 127 | 128 | prefix_list = get_file_list(bucket, files_prefix) 129 | url_list = ["s3://" + bucket + "/" + prefix for prefix in prefix_list] 130 | batch_size = 32 131 | shuffled_sets = defaultdict(set) 132 | shuffled_lists = defaultdict(list) 133 | 134 | print ("\nINITIATING SHUFFLE TEST") 135 | for shuffle_urls in [True, False]: 136 | dataset = S3IterableDataset(url_list, shuffle_urls=shuffle_urls) 137 | dataloader = DataLoader(dataset, 138 | batch_size=batch_size) 139 | 140 | for fname, fobj in dataloader: 141 | fname = [x.split("/")[-1] for x in fname] 142 | batch_set = set(map(tuple, zip(fname, fobj))) 143 | batch_list = list(map(tuple, zip(fname, fobj))) 144 | shuffled_sets[str(shuffle_urls)].update(batch_set) 145 | shuffled_lists[str(shuffle_urls)].append(batch_list) 146 | assert shuffled_sets['True'] == shuffled_sets['False'] and shuffled_lists['True'] != shuffled_lists['False'], \ 147 | "Shuffling not working correctly" 148 | print ("Shuffle test passed for S3IterableDataset") 149 | 150 | def test_ShuffleDataset(): 151 | """ 152 | Args: 153 | bucket: name of the bucket 154 | tarfiles_list: list of all tarfiles with the prefix 155 | buffer_size: number of files the ShuffleDataset object caches 156 | 157 | Logic: 158 | Loop over the ShuffleDataset Dataloader twice 159 | For the runs, the corresponding batches returned should not be the 
same 160 | - ensures that shuffling is happening within tarfile constituents 161 | After both the runs, the overall dataloaded should be the same 162 | 163 | If either of these conditions fails, then test fails 164 | """ 165 | bucket = "pt-s3plugin-test-data-west2" 166 | tarfiles_list = ["integration_tests/imagenet-train-000000.tar", 167 | "integration_tests/imagenet-train-000001.tar"] 168 | 169 | url_list = ["s3://" + bucket + "/" + tarfile for tarfile in tarfiles_list] 170 | batch_size = 32 171 | 172 | buffer_size = 300 173 | for num_workers in [0, 16]: 174 | for buffer_size in [30, 300, 3000]: 175 | dataset = ShuffleDataset(S3IterableDataset(url_list), buffer_size=buffer_size) 176 | dataloader = DataLoader(dataset, 177 | batch_size=batch_size, 178 | num_workers=num_workers) 179 | batch_list1 = get_batches(dataloader) 180 | batch_list2 = get_batches(dataloader) 181 | 182 | assert batches_shuffled(batch_list1, batch_list2), "ShuffleDataset Test fails: batches not shuffled" 183 | assert batches_congruent(batch_list1, batch_list2), "ShuffleDataset Test fails: data mismatch" 184 | print ("ShuffleDataset test passes for {} buffer_size & {} workers ".format( 185 | buffer_size, num_workers)) 186 | 187 | def get_batches(dataloader): 188 | """ 189 | Args: Pytorch Dataloader object 190 | 191 | returns a list of samples from the dataloader 192 | """ 193 | batch_list = [] 194 | count = 0 195 | for fname, fobj in dataloader: 196 | fname = [x.split("/")[-1] for x in fname] 197 | batch_list.append(list(zip(fname, fobj))) 198 | count += 1 199 | return batch_list 200 | 201 | def batches_shuffled(batch_list1, batch_list2): 202 | """ 203 | Ars: two lists of batches 204 | 205 | Returns True if the corresponding batches in lists are different 206 | Returns False otherwise 207 | """ 208 | for b1, b2 in zip(batch_list1, batch_list2): 209 | if b1 == b2: 210 | return False 211 | return True 212 | 213 | def batches_congruent(batch_list1, batch_list2): 214 | """ 215 | Args: two lists of batches 216 | 217 | Returns True if the samples in both the lists matches 218 | returns False otherwise 219 | """ 220 | batches1_flat = [sample for batch in batch_list1 for sample in batch] 221 | batches2_flat = [sample for batch in batch_list2 for sample in batch] 222 | return set(batches1_flat) == set(batches2_flat) 223 | -------------------------------------------------------------------------------- /tests/py-tests/test_read_datasets.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import os 16 | import io 17 | import pytest 18 | from awsio.python.lib.io.s3.s3dataset import S3Dataset, S3IterableDataset 19 | from awsio.python.lib.io.s3.s3dataset import list_files, file_exists 20 | import boto3 21 | 22 | 23 | def get_tar(s3_dataset_path): 24 | s3 = boto3.client('s3') 25 | s3.download_file( 26 | s3_dataset_path.split('/')[2], 27 | s3_dataset_path.split('/')[3], '/tmp/input_file.tar') 28 | import tarfile 29 | stream = tarfile.open('/tmp/input_file.tar') 30 | filenames_boto3 = [] 31 | for tarinfo in stream: 32 | fname = tarinfo.name 33 | stream.extractfile(tarinfo).read() 34 | filenames_boto3.append(fname) 35 | return filenames_boto3 36 | 37 | 38 | def test_tar_file_s3dataset(): 39 | s3_dataset_path = 's3://pt-s3plugin-test-data-west2/tinyimagenet.tar' 40 | dataset = S3Dataset(s3_dataset_path) 41 | fileobj = io.BytesIO(dataset[0][1]) 42 | import tarfile 43 | with tarfile.open(fileobj=fileobj, mode="r|*") as tar: 44 | result1 = len(tar.getmembers()) 45 | result2 = get_tar(s3_dataset_path) 46 | assert result1 == len(result2) 47 | 48 | 49 | def test_tar_file_s3iterabledataset(): 50 | s3_dataset_path = 's3://pt-s3plugin-test-data-west2/tinyimagenet.tar' 51 | dataset = S3IterableDataset(s3_dataset_path) 52 | list_of_files = [] 53 | for files in dataset: 54 | list_of_files.append(files[0][0]) 55 | result1 = len(list_of_files) 56 | result2 = get_tar(s3_dataset_path) 57 | assert result1 == len(result2) 58 | 59 | 60 | def get_zip(s3_dataset_path): 61 | s3 = boto3.client('s3') 62 | s3.download_file( 63 | s3_dataset_path.split('/')[2], 64 | s3_dataset_path.split('/')[3], '/tmp/input_file.zip') 65 | import zipfile 66 | filenames_boto3 = [] 67 | with zipfile.ZipFile('/tmp/input_file.zip', 'r') as zfile: 68 | for file_ in zfile.namelist(): 69 | zfile.read(file_) 70 | filenames_boto3.append(file_) 71 | return filenames_boto3 72 | 73 | 74 | def test_zip_file_s3dataset(): 75 | s3_dataset_path = 's3://pt-s3plugin-test-data-west2/tiny-imagenet-200.zip' 76 | dataset = S3Dataset(s3_dataset_path) 77 | fileobj = io.BytesIO(dataset[0][1]) 78 | import zipfile 79 | with zipfile.ZipFile(fileobj, 'r') as zfile: 80 | result1 = len(zfile.namelist()) 81 | result2 = get_zip(s3_dataset_path) 82 | assert result1 == len(result2) 83 | 84 | 85 | def test_zip_file_s3iterabledataset(): 86 | s3_dataset_path = 's3://pt-s3plugin-test-data-west2/tiny-imagenet-200.zip' 87 | dataset = S3IterableDataset(s3_dataset_path) 88 | list_of_files = [] 89 | for files in dataset: 90 | list_of_files.append(files[0][0]) 91 | result1 = len(list_of_files) 92 | result2 = get_zip(s3_dataset_path) 93 | assert result1 == len(result2) 94 | 95 | 96 | def test_csv_file_s3dataset(): 97 | os.environ['AWS_REGION'] = 'us-east-1' 98 | s3_dataset_path = 's3://pt-s3plugin-test-data-east1/genome-scores.csv' 99 | dataset = S3Dataset(s3_dataset_path) 100 | import pandas as pd 101 | result1 = pd.read_csv(io.BytesIO(dataset[0][1])) 102 | s3 = boto3.client('s3') 103 | obj = s3.get_object(Bucket=s3_dataset_path.split('/')[2], 104 | Key=s3_dataset_path.split('/')[3]) 105 | result2 = pd.read_csv(io.BytesIO(obj['Body'].read())) 106 | assert result1.equals(result2) 107 | del os.environ['AWS_REGION'] 108 | 109 | 110 | def test_csv_file_s3iterabledataset(): 111 | os.environ['AWS_REGION'] = 'us-east-1' 112 | s3_dataset_path = 's3://pt-s3plugin-test-data-east1/genome-scores.csv' 113 | dataset = S3IterableDataset(s3_dataset_path) 114 | import pandas as pd 115 | for files in dataset: 116 | result1 = pd.read_csv(io.BytesIO(files[1])) 117 | s3 = 
boto3.client('s3') 118 | obj = s3.get_object(Bucket=s3_dataset_path.split('/')[2], Key=s3_dataset_path.split('/')[3]) 119 | result2 = pd.read_csv(io.BytesIO(obj['Body'].read())) 120 | assert result1.equals(result2) 121 | del os.environ['AWS_REGION'] 122 | -------------------------------------------------------------------------------- /tests/py-tests/test_regions.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import numpy as np 16 | import os 17 | import io 18 | import pytest 19 | from awsio.python.lib.io.s3.s3dataset import S3Dataset 20 | from awsio.python.lib.io.s3.s3dataset import (list_files, file_exists, 21 | get_file_size) 22 | import boto3 23 | 24 | def test_regions(): 25 | os.environ['AWS_REGION'] = 'us-east-1' 26 | s3_dataset_path = 's3://pt-s3plugin-test-data-east1/images/n' 27 | bucket_name = 'pt-s3plugin-test-data-east1' 28 | prefix = 'images/n' 29 | result1 = list_files(s3_dataset_path) 30 | s3 = boto3.resource('s3') 31 | test_bucket = s3.Bucket(bucket_name) 32 | result2 = [] 33 | for url in test_bucket.objects.filter(Prefix=prefix): 34 | result2.append('s3://' + url.bucket_name + '/' + url.key) 35 | assert isinstance(result1, list) 36 | assert isinstance(result2, list) 37 | assert result1 == result2 38 | del os.environ['AWS_REGION'] 39 | 40 | 41 | def test_csv_file(): 42 | os.environ['AWS_REGION'] = 'us-east-1' 43 | s3_dataset_path = 's3://pt-s3plugin-test-data-east1/genome-scores.csv' 44 | dataset = S3Dataset(s3_dataset_path) 45 | import pandas as pd 46 | for files in dataset: 47 | result1 = pd.read_csv(io.BytesIO(files[1])) 48 | s3 = boto3.client('s3') 49 | obj = s3.get_object(Bucket=s3_dataset_path.split('/')[2], Key=s3_dataset_path.split('/')[3]) 50 | result2 = pd.read_csv(io.BytesIO(obj['Body'].read())) 51 | assert result1.equals(result2) 52 | del os.environ['AWS_REGION'] 53 | -------------------------------------------------------------------------------- /tests/py-tests/test_s3dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import os 16 | import pytest 17 | from awsio.python.lib.io.s3.s3dataset import S3Dataset 18 | import boto3 19 | 20 | 21 | def test_file_path(): 22 | """ 23 | Test S3Dataset for existing and nonexistent path 24 | """ 25 | # existing path 26 | s3_path = 's3://pt-s3plugin-test-data-west2/images/test' 27 | s3_dataset = S3Dataset(s3_path) 28 | assert s3_dataset 29 | 30 | # non-existent path 31 | s3_path_none = 's3://pt-s3plugin-test-data-west2/non_existent_path/test' 32 | with pytest.raises(AssertionError) as excinfo: 33 | s3_dataset = S3Dataset(s3_path_none) 34 | assert 'does not contain any objects' in str(excinfo.value) 35 | 36 | 37 | def test_urls_list(): 38 | """ 39 | Test whether urls_list input for S3Dataset works properly 40 | """ 41 | os.environ['AWS_REGION'] = 'us-west-2' 42 | # provide url prefix (path within bucket) 43 | prefix_to_directory = 'images/test' 44 | prefix_to_file = 'test_1.JPEG' 45 | prefix_list = [prefix_to_directory, prefix_to_file] 46 | 47 | # set up boto3 48 | s3 = boto3.resource('s3') 49 | bucket_name = 'pt-s3plugin-test-data-west2' 50 | test_bucket = s3.Bucket(bucket_name) 51 | 52 | # try individual valid urls and collect urls_list and all_boto3_files to test url list input 53 | urls_list = list() 54 | all_boto3_files = list() 55 | for prefix in prefix_list: 56 | # collect list of all file names using S3Dataset 57 | url = os.path.join('s3://', bucket_name, prefix) 58 | urls_list.append(url) 59 | s3_dataset = S3Dataset(url) 60 | s3_files = [item[0] for item in s3_dataset] 61 | 62 | # collect list of all file names using boto3 63 | boto3_files = [os.path.join('s3://', url.bucket_name, url.key) \ 64 | for url in test_bucket.objects.filter(Prefix=prefix)] 65 | all_boto3_files.extend(boto3_files) 66 | 67 | assert s3_files == boto3_files 68 | 69 | # test list of two valid urls as input 70 | s3_dataset = S3Dataset(urls_list) 71 | s3_files = [item[0] for item in s3_dataset] 72 | 73 | assert s3_files == all_boto3_files 74 | 75 | # add a non-existent url to list of urls 76 | url_to_non_existent = 's3://pt-s3plugin-test-data-west2/non_existent_directory' 77 | urls_list.append(url_to_non_existent) 78 | with pytest.raises(AssertionError) as excinfo: 79 | s3_dataset = S3Dataset(urls_list) 80 | assert 'does not contain any objects' in str(excinfo.value) 81 | 82 | del os.environ['AWS_REGION'] 83 | 84 | 85 | def test_multi_download(): 86 | """ 87 | Test whether S3Dataset with multiple downloads in one url works properly 88 | """ 89 | s3_dataset_path = 's3://pt-s3plugin-test-data-west2/images/test' 90 | bucket_name = 'pt-s3plugin-test-data-west2' 91 | prefix = 'images/test' 92 | 93 | if 'S3_DISABLE_MULTI_PART_DOWNLOAD' in os.environ: 94 | del os.environ['S3_DISABLE_MULTI_PART_DOWNLOAD'] 95 | 96 | dataset = S3Dataset(s3_dataset_path) 97 | # collect filename from each item in dataset 98 | result1 = [item[0] for item in dataset] 99 | s3 = boto3.resource('s3') 100 | test_bucket = s3.Bucket(bucket_name) 101 | result2 = [] 102 | for url in test_bucket.objects.filter(Prefix=prefix): 103 | result2.append('s3://' + url.bucket_name + '/' + url.key) 104 | assert isinstance(result1, list) 105 | assert isinstance(result2, list) 106 | assert result1 == result2 107 | 108 | 109 | def test_disable_multi_download(): 110 | s3_dataset_path = 's3://pt-s3plugin-test-data-west2/images/test' 111 | os.environ['S3_DISABLE_MULTI_PART_DOWNLOAD'] = "ON" 112 | dataset = S3Dataset(s3_dataset_path) 113 | result1 = [item[0] for item in dataset] 114 | 115 | # boto3 116 | bucket_name = 
'pt-s3plugin-test-data-west2' 117 | prefix = 'images/test' 118 | s3 = boto3.resource('s3') 119 | test_bucket = s3.Bucket(bucket_name) 120 | result2 = ['s3://' + url.bucket_name + '/' + url.key \ 121 | for url in test_bucket.objects.filter(Prefix=prefix)] 122 | 123 | assert isinstance(result1, list) 124 | assert isinstance(result2, list) 125 | assert result1 == result2 126 | del os.environ['S3_DISABLE_MULTI_PART_DOWNLOAD'] 127 | -------------------------------------------------------------------------------- /tests/py-tests/test_s3iterabledataset.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import io 17 | import pytest 18 | from awsio.python.lib.io.s3.s3dataset import S3IterableDataset, ShuffleDataset 19 | import boto3 20 | 21 | 22 | def test_file_path(): 23 | """ 24 | Test S3IterableDataset for existing and nonexistent path 25 | """ 26 | # existing path 27 | s3_path = 's3://pt-s3plugin-test-data-west2/images/test' 28 | s3_dataset = S3IterableDataset(s3_path) 29 | assert s3_dataset 30 | 31 | # non-existent path 32 | s3_path_none = 's3://pt-s3plugin-test-data-west2/non_existent_path/test' 33 | with pytest.raises(AssertionError) as excinfo: 34 | s3_dataset = S3IterableDataset(s3_path_none) 35 | assert 'does not contain any objects' in str(excinfo.value) 36 | 37 | 38 | def test_urls_list(): 39 | """ 40 | Test whether urls_list input for S3IterableDataset works properly 41 | """ 42 | os.environ['AWS_REGION'] = 'us-west-2' 43 | # provide url prefix (path within bucket) 44 | prefix_to_directory = 'images/test' 45 | prefix_to_file = 'test_1.JPEG' 46 | prefix_list = [prefix_to_directory, prefix_to_file] 47 | 48 | # set up boto3 49 | s3 = boto3.resource('s3') 50 | bucket_name = 'pt-s3plugin-test-data-west2' 51 | test_bucket = s3.Bucket(bucket_name) 52 | 53 | # try individual valid urls and collect urls_list and all_boto3_files to test url list input 54 | urls_list = list() 55 | all_boto3_files = list() 56 | for prefix in prefix_list: 57 | # collect list of all file names using S3IterableDataset 58 | url = os.path.join('s3://', bucket_name, prefix) 59 | urls_list.append(url) 60 | s3_dataset = S3IterableDataset(url) 61 | s3_files = [item[0] for item in s3_dataset] 62 | 63 | # collect list of all file names using boto3 64 | boto3_files = [os.path.join('s3://', url.bucket_name, url.key) \ 65 | for url in test_bucket.objects.filter(Prefix=prefix)] 66 | all_boto3_files.extend(boto3_files) 67 | 68 | assert s3_files == boto3_files 69 | 70 | # test list of two valid urls as input 71 | s3_dataset = S3IterableDataset(urls_list) 72 | s3_files = [item[0] for item in s3_dataset] 73 | 74 | assert s3_files == all_boto3_files 75 | 76 | # add a non-existent url to list of urls 77 | url_to_non_existent = 's3://pt-s3plugin-test-data-west2/non_existent_directory' 78 | 
urls_list.append(url_to_non_existent) 79 | with pytest.raises(AssertionError) as excinfo: 80 | s3_dataset = S3IterableDataset(urls_list) 81 | assert 'does not contain any objects' in str(excinfo.value) 82 | 83 | del os.environ['AWS_REGION'] 84 | 85 | 86 | def test_shuffle_true(): 87 | """ 88 | Tests shuffle_urls parameter, len and set_epoch functions 89 | """ 90 | os.environ['AWS_REGION'] = 'us-west-2' 91 | 92 | # create two datasets, one shuffled with self.epoch 93 | s3_dataset_path = 's3://pt-s3plugin-test-data-west2/images/test' 94 | s3_dataset0 = S3IterableDataset(s3_dataset_path) 95 | s3_dataset1 = S3IterableDataset(s3_dataset_path, shuffle_urls=True) 96 | s3_dataset1.set_epoch(5) 97 | 98 | # len is defined as the length of the urls_list created by the path 99 | assert len(s3_dataset0) == len(s3_dataset1) 100 | 101 | # check to make sure shuffling works 102 | filenames0 = [item[0] for item in s3_dataset0] 103 | filenames1 = [item[0] for item in s3_dataset1] 104 | 105 | assert len(filenames0) == len(filenames1) 106 | assert filenames0 != filenames1 107 | del os.environ['AWS_REGION'] 108 | 109 | 110 | def test_multi_download(): 111 | s3_dataset_path = 's3://pt-s3plugin-test-data-east1/genome-scores.csv' 112 | 113 | if 'S3_DISABLE_MULTI_PART_DOWNLOAD' in os.environ: 114 | del os.environ['S3_DISABLE_MULTI_PART_DOWNLOAD'] 115 | os.environ['AWS_REGION'] = 'us-east-1' 116 | 117 | dataset = S3IterableDataset(s3_dataset_path) 118 | import pandas as pd 119 | for files in dataset: 120 | result1 = pd.read_csv(io.BytesIO(files[1])) 121 | s3 = boto3.client('s3') 122 | obj = s3.get_object(Bucket=s3_dataset_path.split('/')[2], 123 | Key=s3_dataset_path.split('/')[3]) 124 | result2 = pd.read_csv(io.BytesIO(obj['Body'].read())) 125 | assert result1.equals(result2) 126 | 127 | 128 | def test_disable_multi_download(): 129 | s3_dataset_path = 's3://pt-s3plugin-test-data-east1/genome-scores.csv' 130 | os.environ['S3_DISABLE_MULTI_PART_DOWNLOAD'] = "ON" 131 | os.environ['AWS_REGION'] = 'us-east-1' 132 | dataset = S3IterableDataset(s3_dataset_path) 133 | import pandas as pd 134 | for files in dataset: 135 | result1 = pd.read_csv(io.BytesIO(files[1])) 136 | s3 = boto3.client('s3') 137 | obj = s3.get_object(Bucket=s3_dataset_path.split('/')[2], 138 | Key=s3_dataset_path.split('/')[3]) 139 | result2 = pd.read_csv(io.BytesIO(obj['Body'].read())) 140 | assert result1.equals(result2) 141 | del os.environ['S3_DISABLE_MULTI_PART_DOWNLOAD'], os.environ['AWS_REGION'] 142 | 143 | 144 | def test_shuffle_dataset(): 145 | 146 | dataset = [i for i in range(10)] 147 | 148 | # buffer_size 1 should yield the dataset without shuffling 149 | shuffle_dataset = ShuffleDataset(dataset=dataset, buffer_size=1) 150 | shuffle_content = [item for item in shuffle_dataset] 151 | assert dataset == shuffle_content 152 | 153 | # buffer_size smaller than dataset size 154 | shuffle_dataset = ShuffleDataset(dataset=dataset, buffer_size=2) 155 | shuffle_content = [item for item in shuffle_dataset] 156 | assert set(dataset) == set(shuffle_content) 157 | assert len(dataset) == len(shuffle_content) 158 | 159 | # buffer_size greater than dataset size 160 | shuffle_dataset = ShuffleDataset(dataset=dataset, buffer_size=15) 161 | shuffle_content = [item for item in shuffle_dataset] 162 | assert set(dataset) == set(shuffle_content) 163 | assert len(dataset) == len(shuffle_content) 164 | -------------------------------------------------------------------------------- /tests/py-tests/test_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import pytest 17 | from awsio.python.lib.io.s3.s3dataset import (list_files, file_exists, 18 | get_file_size) 19 | import boto3 20 | 21 | 22 | def test_wrong_filenames(): 23 | filenames = ['', 'shor', 'not_start_s3', 's3://', 's3:///no_bucket'] 24 | functions = [list_files, file_exists, get_file_size] 25 | exception = False 26 | for function in functions: 27 | for filename in filenames: 28 | try: 29 | function(filename) 30 | except ValueError: 31 | exception = True 32 | assert exception 33 | exception = False 34 | 35 | 36 | def test_list_files_prefix(): 37 | # default region is us-west-2 38 | s3_dataset_path = 's3://pt-s3plugin-test-data-west2/images/test' 39 | result1 = list_files(s3_dataset_path) 40 | s3 = boto3.resource('s3') 41 | test_bucket = s3.Bucket('pt-s3plugin-test-data-west2') 42 | result2 = [] 43 | for url in test_bucket.objects.filter(Prefix='images/test'): 44 | result2.append('s3://' + url.bucket_name + '/' + url.key) 45 | assert isinstance(result1, list) 46 | assert isinstance(result2, list) 47 | assert len(result1) == len(result2) 48 | assert result1 == result2 49 | 50 | 51 | def test_list_files_bucket(): 52 | os.environ['AWS_REGION'] = 'us-west-2' 53 | # default region is us-west-2 54 | s3_dataset_path = 's3://pt-s3plugin-test-data-west2' 55 | result1 = list_files(s3_dataset_path) 56 | s3 = boto3.resource('s3') 57 | test_bucket = s3.Bucket('pt-s3plugin-test-data-west2') 58 | result2 = [] 59 | for url in test_bucket.objects.all(): 60 | if url.key[-1] == '/': 61 | continue 62 | result2.append('s3://' + url.bucket_name + '/' + url.key) 63 | assert isinstance(result1, list) 64 | assert isinstance(result2, list) 65 | assert result1 == result2 66 | del os.environ['AWS_REGION'] 67 | 68 | 69 | def test_file_exists(): 70 | """ 71 | There are four kinds of inputs for file_exists: 72 | 1. object_name refers to a file (True) 73 | 2. object_name refers to a folder (False) 74 | 3. bucket_name does not refer to an existing bucket (False) 75 | 4. 
object_name does not refer to an existing object (False) 76 | """ 77 | s3_bucket = 's3://pt-s3plugin-test-data-west2' 78 | 79 | # case 1 80 | assert file_exists(os.path.join(s3_bucket, 'test_0.JPEG')) 81 | 82 | # case 2 83 | assert not file_exists(os.path.join(s3_bucket, 'folder_1')) 84 | 85 | # case 3 86 | assert not file_exists(os.path.join(s3_bucket, 'non_existent_folder')) 87 | 88 | # case 4 89 | assert not file_exists(os.path.join(s3_bucket, 'test_new_file.JPEG')) 90 | 91 | 92 | def test_get_file_size(): 93 | bucket_name = 'pt-s3plugin-test-data-west2' 94 | object_name = 'test_0.JPEG' 95 | 96 | result1 = get_file_size('s3://' + bucket_name + '/' + object_name) 97 | 98 | s3 = boto3.resource('s3') 99 | bucket = s3.Bucket(bucket_name) 100 | result2 = bucket.Object(object_name).content_length 101 | 102 | assert result1 == result2 103 | -------------------------------------------------------------------------------- /tests/smoke_tests/import_awsio.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | echo "Testing: import awsio" 4 | python -c "import awsio; print(awsio.__version__)" 5 | echo "import awsio succeeded" 6 | 7 | read -p "S3 URL : " s3_url 8 | echo "Testing: checking setup by querying whether or not ${s3_url} is an existing file" 9 | python -c "from awsio.python.lib.io.s3.s3dataset import file_exists; print(f\"file_exists: {file_exists('$s3_url')}\")" 10 | echo "Smoke test was successful." 11 | 12 | -------------------------------------------------------------------------------- /third_party/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.1) 2 | 3 | if(POLICY CMP0054) 4 | cmake_policy(SET CMP0054 NEW) 5 | endif() 6 | 7 | project(AWSIO_DEPS) 8 | 9 | list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") 10 | 11 | include(ExternalProject) 12 | 13 | set(AWS_DEPS_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}" CACHE PATH "Dependencies install directory.") 14 | set(AWS_DEPS_BUILD_DIR "${CMAKE_BINARY_DIR}/build" CACHE PATH "Dependencies build directory.") 15 | set(AWS_DEPS_DOWNLOAD_DIR "${AWS_DEPS_BUILD_DIR}/downloads" CACHE PATH "Dependencies download directory.") 16 | 17 | set(AWS_C_COMMON_URL "https://github.com/awslabs/aws-c-common.git") 18 | set(AWS_C_COMMON_TAG "v0.4.15") 19 | include(BuildAwsSDK) 20 | 21 | add_dependencies(AwsSDK) -------------------------------------------------------------------------------- /third_party/cmake/AwsSDK.cmake: -------------------------------------------------------------------------------- 1 | find_package(AWSSDK REQUIRED COMPONENTS transfer s3-encryption dynamodb) 2 | target_link_libraries(target ${AWSSDK_LINK_LIBRARIES}) 3 | 4 | -------------------------------------------------------------------------------- /tools/get_version.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | from pathlib import Path 4 | 5 | project_root = Path(__file__).resolve().parent.parent 6 | 7 | def get_sha(): 8 | try: 9 | return subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=project_root).decode('ascii').strip() 10 | except Exception: 11 | return 'Unknown' 12 | 13 | def get_version(): 14 | sha = get_sha() 15 | version = (project_root / 'version.txt').read_text().strip() 16 | if sha != 'Unknown': 17 | version += '+' + sha[:7] 18 | return version 19 | 20 | -------------------------------------------------------------------------------- /version.txt: 
-------------------------------------------------------------------------------- 1 | 0.0.1 --------------------------------------------------------------------------------