├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── NOTICE.txt ├── README.md ├── THIRD-PARTY-LICENSES ├── awsio ├── __init__.py ├── csrc │ └── io │ │ └── s3 │ │ ├── s3_file_wrapper.cpp │ │ ├── s3_io.cpp │ │ └── s3_io.h └── python │ ├── __init__.py │ └── lib │ ├── __init__.py │ └── io │ ├── __init__.py │ └── s3 │ ├── __init__.py │ └── s3dataset.py ├── examples ├── s3_cv_iterable_example.py ├── s3_cv_iterable_shuffle_example.py ├── s3_cv_map_example.py ├── s3_cv_transform.py ├── s3_imagenet_example.py └── s3_nlp_iterable_example.py ├── setup.cfg ├── setup.py ├── tests ├── py-tests │ ├── test_integration.py │ ├── test_read_datasets.py │ ├── test_regions.py │ ├── test_s3dataset.py │ ├── test_s3iterabledataset.py │ └── test_utils.py └── smoke_tests │ └── import_awsio.sh ├── third_party ├── CMakeLists.txt └── cmake │ └── AwsSDK.cmake ├── tools └── get_version.py └── version.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore Mac system files 2 | .DS_store 3 | 4 | # Ignore file extensions below 5 | *.coverage 6 | *.egg-info 7 | *.log 8 | *.pyc 9 | 10 | awsio/_version.py 11 | build/ 12 | dist/ 13 | 14 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.13) 2 | project(_pywrap_s3_io) 3 | 4 | set(CMAKE_CXX_STANDARD 17) 5 | SET(TORCH_MIN_VERSION "1.5.1") 6 | 7 | find_package(Python3 COMPONENTS Interpreter Development) 8 | 9 | find_package(AWSSDK REQUIRED COMPONENTS s3 transfer) 10 | 11 | set(CMAKE_POSITION_INDEPENDENT_CODE ON) 12 | set(INCLUDE_DIRS "awsio/csrc/io/s3") 13 | 14 | set(SOURCES "${INCLUDE_DIRS}/s3_io.cpp" ) 15 | 16 | include_directories(${INCLUDE_DIRS}) 17 | find_package(pybind11 REQUIRED) 18 | pybind11_add_module(_pywrap_s3_io ${SOURCES} "${INCLUDE_DIRS}/s3_file_wrapper.cpp") 19 | 20 | Message(STATUS "All linked libs: ${AWSSDK_LINK_LIBRARIES}") 21 | 22 | target_link_libraries(_pywrap_s3_io PRIVATE ${AWSSDK_LINK_LIBRARIES} ${AWSSDK_PLATFORM_DEPS}) 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2020 Amazon Web Services 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
-------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | amazon-s3-plugin-for-pytorch 2 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # S3 Plugin 2 | 3 | **Note: As of April 5th, 2022, this plugin is in maintenance mode. [The S3 IO is in the process of being upstreamed into `torchdata` package](https://github.com/pytorch/data/tree/main/torchdata/datapipes/iter/load#readme). In the future, we will support the new `torchdata` package, and be continuously improving the user experience and performance of the S3 IO datapipes. Please support and comment for the new S3 IO datapipes. Raise issues and create PRs if necessary.** 4 | 5 | S3-plugin is a high performance PyTorch dataset library to efficiently access datasets stored in S3 buckets. It provides streaming data access to datasets of any size and thus eliminates the need to provision local storage capacity. The library is designed to leverage the high throughput that S3 offers to access objects with minimal latency. 6 | 7 | The users have the flexibility to use either map-style or iterable-style dataset interfaces based on their needs. The library itself is file-format agnostic and presents objects in S3 as a binary buffer(blob). Users are free to apply any additional transformation on the data received from S3. 8 | 9 | ## Compatible Images 10 | 11 | Only the following images are compatible with the Amazon S3 plugin for PyTorch: 12 | 13 | **Ubuntu 20.04** 14 | - **CPU**: 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.9.0-cpu-py38-ubuntu20.04-v1.1 15 | - **GPU**: 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.9.0-gpu-py38-cu111-ubuntu20.04-v1.1 16 | 17 | **Ubuntu 18.04** 18 | - **CPU**: 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.8.1-cpu-py36-ubuntu18.04-v1.6 19 | - **GPU**: 763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.8.1-gpu-py36-cu111-ubuntu18.04-v1.7 20 | 21 | ## Installation 22 | 23 | You can install this package by following the below instructions. 24 | 25 | #### Prerequisite 26 | 27 | - Python 3.6 (or Python 3.7) is required for this installation. 28 | 29 | - [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-install.html) for configuring S3 access. 30 | 31 | - Pytorch >= 1.5 (If not available, S3-plugin installs latest Torch) 32 | 33 | - *Note:* To run on Mac, [AWS_SDK_CPP](https://github.com/aws/aws-sdk-cpp) must be installed. 34 | 35 | 36 | #### Installing S3-Plugin via Wheel 37 | 38 | ```shell script 39 | # List of wheels on Linux: 40 | # python 3.7: https://aws-s3-plugin.s3.us-west-2.amazonaws.com/binaries/0.0.1/bd37e27/awsio-0.0.1%2Bbd37e27-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl 41 | # python 3.8: https://aws-s3-plugin.s3.us-west-2.amazonaws.com/binaries/0.0.1/bd37e27/awsio-0.0.1%2Bbd37e27-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl 42 | # python 3.9: https://aws-s3-plugin.s3.us-west-2.amazonaws.com/binaries/0.0.1/bd37e27/awsio-0.0.1%2Bbd37e27-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl 43 | aws s3 cp . 44 | pip install 45 | ``` 46 | 47 | #### Installing S3-Plugin from source 48 | 49 | ```shell 50 | # install [aws-sdk-cpp](https://github.com/aws/aws-sdk-cpp). 
example installation guide 51 | git clone --recurse-submodules https://github.com/aws/aws-sdk-cpp 52 | cd aws-sdk-cpp/ 53 | mkdir sdk-build 54 | cd sdk-build 55 | cmake .. -DCMAKE_BUILD_TYPE=Release -DBUILD_ONLY="s3;transfer" 56 | make 57 | make install # may need sudo 58 | 59 | # install pybind11. example: 60 | conda install pybind11 61 | export CMAKE_PREFIX_PATH=$CMAKE_PREFIX_PATH:/usr/local/lib/python3.7/site-packages/pybind11 62 | 63 | # install from source 64 | python setup.py install 65 | ``` 66 | 67 | ### Configuration 68 | 69 | Before reading data from S3 bucket, you need to provide bucket region parameter: 70 | 71 | * `AWS_REGION`: By default, regional endpoint is used for S3, with region controlled by `AWS_REGION`. If `AWS_REGION` is not specified, then `us-west-2` is used by default. 72 | 73 | To read objects in a bucket that is not publicly accessible, AWS credentials must be provided through one of the following methods: 74 | 75 | * Install and configure [awscli](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-install.html) by `aws configure`. 76 | * Set credentials in the AWS credentials profile file on the local system, located at: `~/.aws/credentials` on Linux, macOS, or Unix 77 | * Set the `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY` environment variables. 78 | * If you are using this library on an EC2 instance, specify an IAM role and then give the EC2 instance access to that role. 79 | 80 | #### Smoke Test 81 | To test your setup, run: 82 | ``` 83 | bash tests/smoke_tests/import_awsio.sh 84 | ``` 85 | 86 | The test will first make sure that the package imports correctly by printing the commit hash related to the build. 87 | Then, it will prompt the user for a S3 url to a file and return whether or not the file exists. 88 | 89 | For example: 90 | ``` 91 | $ bash tests/smoke_tests/import_awsio.sh 92 | Testing: import awsio 93 | 0.0.1+b119a6d 94 | import awsio succeeded 95 | S3 URL : 's3://path/to/bucket/test_0.JPEG' 96 | Testing: checking setup by quering whether or not 's3://path/to/bucket/test_0.JPEG' is an existing file 97 | file_exists: True 98 | Smoke test was successful. 99 | ``` 100 | 101 | ### Usage 102 | 103 | Once the above setup is complete, you can interact with S3 bucket in following ways: 104 | 105 | Accepted input S3 url formats: 106 | 107 | * Single url 108 | 109 | * `url = 's3://path/to/bucket/abc.tfrecord'` 110 | 111 | * List of urls as follows: 112 | 113 | ```urls = ['s3://path/to/bucket/abc.tfrecord','s3://path/to/bucket/def.tfrecord']``` 114 | 115 | * Prefix to S3 bucket to include all files under 's3_prefix' folder starting with '0' 116 | 117 | ```urls = 's3://path/to/s3_prefix/0'``` 118 | 119 | * Using `list_files()` function, which can be used to manipulate input list of urls to fetch as follows: 120 | ```shell 121 | from awsio.python.lib.io.s3.s3dataset import list_files 122 | urls = list_files('s3://path/to/s3_prefix/0') 123 | ``` 124 | 125 | #### Map-Style Dataset 126 | 127 | If each object in S3 contains a single training sample, then map-style dataset i.e. S3Dataset can be used. To partition data across nodes and to shuffle data, this dataset can be used with PyTorch distributed sampler. Additionally, pre-processing can be applied to the data in S3 by extending the S3Dataset class. 
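For example, partitioning across ranks with PyTorch's `DistributedSampler` can be sketched as follows (a minimal sketch, not part of this repository: it assumes the process group has already been initialized and reuses the `S3ImageSet` class and `preproc` transform defined in the example that follows):

```python
import torch
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

# Assumes torch.distributed.init_process_group(...) has been called and that
# S3ImageSet and preproc are defined as in the map-style example below.
dataset = S3ImageSet('s3://path/to/s3_prefix/', transform=preproc)
sampler = DistributedSampler(dataset, shuffle=True)  # splits indices across ranks
dataloader = DataLoader(dataset, batch_size=32, sampler=sampler, num_workers=4)

num_epochs = 10  # example value
for epoch in range(num_epochs):
    sampler.set_epoch(epoch)  # re-shuffles the per-rank partition each epoch
    for batch in dataloader:
        pass  # training step goes here
```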
Following example illustrates use of map-style S3Dataset for image datasets: 128 | 129 | ```python 130 | from awsio.python.lib.io.s3.s3dataset import S3Dataset 131 | from torch.utils.data import DataLoader 132 | from torchvision import transforms 133 | from PIL import Image 134 | import io 135 | 136 | class S3ImageSet(S3Dataset): 137 | def __init__(self, urls, transform=None): 138 | super().__init__(urls) 139 | self.transform = transform 140 | 141 | def __getitem__(self, idx): 142 | img_name, img = super(S3ImageSet, self).__getitem__(idx) 143 | # Convert bytes object to image 144 | img = Image.open(io.BytesIO(img)).convert('RGB') 145 | 146 | # Apply preprocessing functions on data 147 | if self.transform is not None: 148 | img = self.transform(img) 149 | return img 150 | 151 | batch_size = 32 152 | 153 | preproc = transforms.Compose([ 154 | transforms.ToTensor(), 155 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 156 | transforms.Resize((100, 100)) 157 | ]) 158 | 159 | # urls can be S3 prefix containing images or list of all individual S3 images 160 | urls = 's3://path/to/s3_prefix/' 161 | 162 | dataset = S3ImageSet(urls, transform=preproc) 163 | dataloader = DataLoader(dataset, 164 | batch_size=batch_size, 165 | num_workers=64) 166 | 167 | ``` 168 | 169 | 170 | #### Iterable-style dataset 171 | 172 | If each object in S3 contains multiple training samples e.g. archive files containing multiple small images or TF record files/shards containing multiple records, then it is advisable to use the Iterable-style dataset implementation i.e. S3IterableDataset. For the specific case of zip/tar archival files, each file contained in the archival is returned during each iteration in a streaming fashion. For all other file formats, binary blob for the whole shard is returned and users need to implement the appropriate parsing logic. Besides, S3IterableDataset takes care of partitioning the data across nodes and workers in a distributed setting. 173 | 174 | `Note:` For datasets consisting of a large number of smaller objects, accessing each object individually can be inefficient. For such datasets, it is recommended to create shards of the training data and use S3IterableDataset for better performance. 175 | ```shell 176 | # tar file containing label and image files as below 177 | tar --list --file=file1.tar | sed 4q 178 | 179 | 1234.cls 180 | 1234.jpg 181 | 5678.cls 182 | 5678.jpg 183 | ``` 184 | 185 | Consider tar file for image classification. It can be easily loaded by writing a custom python generator function using the iterator returned by S3IterableDataset. (Note: To create shards from a file dataset refer this [link](https://github.com/tmbdev/pytorch-imagenet-wds).) 186 | 187 | 188 | ```python 189 | from torch.utils.data import IterableDataset 190 | from awsio.python.lib.io.s3.s3dataset import S3IterableDataset 191 | from PIL import Image 192 | import io 193 | import numpy as np 194 | from torchvision import transforms 195 | 196 | class ImageS3(IterableDataset): 197 | def __init__(self, urls, shuffle_urls=False, transform=None): 198 | self.s3_iter_dataset = S3IterableDataset(urls, 199 | shuffle_urls) 200 | self.transform = transform 201 | 202 | def data_generator(self): 203 | try: 204 | while True: 205 | # Based on alphabetical order of files, sequence of label and image may change. 
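                # For the tar layout shown above (1234.cls, 1234.jpg, ...) the '.cls'
                # label file sorts before the matching '.jpg' image, so the label is
                # read first; swap the two next() calls (or dispatch on the file
                # extension) if your shards pair files in a different order.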
206 | label_fname, label_fobj = next(self.s3_iter_dataset_iterator) 207 | image_fname, image_fobj = next(self.s3_iter_dataset_iterator) 208 | 209 | label = int(label_fobj) 210 | image_np = Image.open(io.BytesIO(image_fobj)).convert('RGB') 211 | 212 | # Apply torch vision transforms if provided 213 | if self.transform is not None: 214 | image_np = self.transform(image_np) 215 | yield image_np, label 216 | 217 | except StopIteration: 218 | return 219 | 220 | def __iter__(self): 221 | self.s3_iter_dataset_iterator = iter(self.s3_iter_dataset) 222 | return self.data_generator() 223 | 224 | def set_epoch(self, epoch): 225 | self.s3_iter_dataset.set_epoch(epoch) 226 | 227 | # urls can be a S3 prefix containing all the shards or a list of S3 paths for all the shards 228 | urls = ["s3://path/to/file1.tar", "s3://path/to/file2.tar"] 229 | 230 | # Example Torchvision transforms to apply on data 231 | preproc = transforms.Compose([ 232 | transforms.ToTensor(), 233 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 234 | transforms.Resize((100, 100)) 235 | ]) 236 | 237 | dataset = ImageS3(urls, transform=preproc) 238 | 239 | ``` 240 | 241 | This dataset can be easily used with dataloader for parallel data loading and preprocessing: 242 | 243 | ```python 244 | dataloader = torch.utils.data.DataLoader(dataset, num_workers=4, batch_size=32) 245 | ``` 246 | 247 | We can shuffle the sequence of fetching shards by setting shuffle_urls=True and calling set_epoch method at the beginning of every epochs as: 248 | ```python 249 | dataset = ImageS3(urls, transform=preproc, shuffle_urls=True) 250 | for epoch in range(epochs): 251 | dataset.set_epoch(epoch) 252 | # training code ... 253 | ``` 254 | 255 | Note that the above code will only shuffle sequence of shards, the individual training samples within shards will be fetched in the same order. To shuffle the order of training samples across shards, use ShuffleDataset. ShuffleDataset maintains a buffer of data samples read from multiple shards and returns a random sample from it. The count of samples to be buffered is specified by buffer_size. To use ShuffleDataset, update the above example as follows: 256 | 257 | ```python 258 | dataset = ShuffleDataset(ImageS3(urls), buffer_size=4000) 259 | ``` 260 | 261 | #### Iterable-style dataset (NLP) 262 | The data set can be similarly used for NLP tasks. Following example demonstrates use for S3IterableDataset for BERT data loading. 263 | 264 | ```shell script 265 | # Consider S3 prefix containing hdf5 files. 266 | # Each hdf5 file contains numpy arrays for different variables required for BERT 267 | # training such as next sentence labels, masks etc. 
268 | aws s3 ls --human-readable s3://path/to/s3_prefix | sed 3q 269 | 270 | 271 | file_1.hdf5 272 | file_2.hdf5 273 | file_3.hdf5 274 | 275 | ``` 276 | 277 | ```python 278 | 279 | import torch 280 | from torch.utils.data import IterableDataset, DataLoader 281 | from itertools import islice 282 | import h5py 283 | import numpy as np 284 | import io 285 | from awsio.python.lib.io.s3.s3dataset import S3IterableDataset 286 | 287 | def create_data_samples_from_file(fileobj): 288 | # Converts bytes data to numpy arrays 289 | keys = ['input_ids', 'input_mask', 'segment_ids', \ 290 | 'masked_lm_positions', 'masked_lm_ids', 'next_sentence_labels'] 291 | dataset = io.BytesIO(fileobj) 292 | with h5py.File(dataset, "r") as f: 293 | data_file = [np.asarray(f[key][:]) for key in keys] 294 | return data_file 295 | 296 | class s3_dataset(IterableDataset): 297 | 298 | def __init__(self, urls): 299 | self.urls = urls 300 | self.dataset = S3IterableDataset(self.urls, shuffle_urls=True) 301 | 302 | def data_generator(self): 303 | try: 304 | while True: 305 | filename, fileobj = next(self.dataset_iter) 306 | # data_samples: list of six numpy arrays 307 | data_samples = create_data_samples_from_file(fileobj) 308 | 309 | for sample in list(zip(*data_samples)): 310 | # Preprocess sample if required and then yield 311 | yield sample 312 | 313 | except StopIteration as e: 314 | return 315 | 316 | def __iter__(self): 317 | self.dataset_iter = iter(self.dataset) 318 | return self.data_generator() 319 | 320 | urls = "s3://path/to/s3_prefix" 321 | train_dataset = s3_dataset(urls) 322 | 323 | ``` 324 | 325 | ### Test Coverage 326 | 327 | To check python test coverage, install [`coverage.py`](https://coverage.readthedocs.io/en/latest/index.html) as follows: 328 | 329 | ``` 330 | pip install coverage 331 | ``` 332 | 333 | To make sure that all tests are run, please also install `pytest`, `boto3`, and `pandas` as follows: 334 | ``` 335 | pip install pytest boto3 pandas 336 | ``` 337 | 338 | To run tests and calculate coverage: 339 | 340 | ```asm 341 | coverage erase 342 | coverage run -p --source=awsio -m pytest -v tests/py-tests/test_regions.py \ 343 | tests/py-tests/test_utils.py \ 344 | tests/py-tests/test_s3dataset.py \ 345 | tests/py-tests/test_s3iterabledataset.py \ 346 | tests/py-tests/test_read_datasets.py \ 347 | tests/py-tests/test_integration.py 348 | coverage combine 349 | coverage report -m 350 | ``` 351 | -------------------------------------------------------------------------------- /THIRD-PARTY-LICENSES: -------------------------------------------------------------------------------- 1 | ** tensorflow; version 2.4.0 -- https://github.com/tensorflow/tensorflow 2 | 3 | Apache License 4 | 5 | Version 2.0, January 2004 6 | 7 | http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND 8 | DISTRIBUTION 9 | 10 | 1. Definitions. 11 | 12 | "License" shall mean the terms and conditions for use, reproduction, and 13 | distribution as defined by Sections 1 through 9 of this document. 14 | 15 | "Licensor" shall mean the copyright owner or entity authorized by the 16 | copyright owner that is granting the License. 17 | 18 | "Legal Entity" shall mean the union of the acting entity and all other 19 | entities that control, are controlled by, or are under common control 20 | with that entity. 
For the purposes of this definition, "control" means 21 | (i) the power, direct or indirect, to cause the direction or management 22 | of such entity, whether by contract or otherwise, or (ii) ownership of 23 | fifty percent (50%) or more of the outstanding shares, or (iii) 24 | beneficial ownership of such entity. 25 | 26 | "You" (or "Your") shall mean an individual or Legal Entity exercising 27 | permissions granted by this License. 28 | 29 | "Source" form shall mean the preferred form for making modifications, 30 | including but not limited to software source code, documentation source, 31 | and configuration files. 32 | 33 | "Object" form shall mean any form resulting from mechanical 34 | transformation or translation of a Source form, including but not limited 35 | to compiled object code, generated documentation, and conversions to 36 | other media types. 37 | 38 | "Work" shall mean the work of authorship, whether in Source or Object 39 | form, made available under the License, as indicated by a copyright 40 | notice that is included in or attached to the work (an example is 41 | provided in the Appendix below). 42 | 43 | "Derivative Works" shall mean any work, whether in Source or Object form, 44 | that is based on (or derived from) the Work and for which the editorial 45 | revisions, annotations, elaborations, or other modifications represent, 46 | as a whole, an original work of authorship. For the purposes of this 47 | License, Derivative Works shall not include works that remain separable 48 | from, or merely link (or bind by name) to the interfaces of, the Work and 49 | Derivative Works thereof. 50 | 51 | "Contribution" shall mean any work of authorship, including the original 52 | version of the Work and any modifications or additions to that Work or 53 | Derivative Works thereof, that is intentionally submitted to Licensor for 54 | inclusion in the Work by the copyright owner or by an individual or Legal 55 | Entity authorized to submit on behalf of the copyright owner. For the 56 | purposes of this definition, "submitted" means any form of electronic, 57 | verbal, or written communication sent to the Licensor or its 58 | representatives, including but not limited to communication on electronic 59 | mailing lists, source code control systems, and issue tracking systems 60 | that are managed by, or on behalf of, the Licensor for the purpose of 61 | discussing and improving the Work, but excluding communication that is 62 | conspicuously marked or otherwise designated in writing by the copyright 63 | owner as "Not a Contribution." 64 | 65 | "Contributor" shall mean Licensor and any individual or Legal Entity on 66 | behalf of whom a Contribution has been received by Licensor and 67 | subsequently incorporated within the Work. 68 | 69 | 2. Grant of Copyright License. Subject to the terms and conditions of this 70 | License, each Contributor hereby grants to You a perpetual, worldwide, 71 | non-exclusive, no-charge, royalty-free, irrevocable copyright license to 72 | reproduce, prepare Derivative Works of, publicly display, publicly perform, 73 | sublicense, and distribute the Work and such Derivative Works in Source or 74 | Object form. 75 | 76 | 3. Grant of Patent License. 
Subject to the terms and conditions of this 77 | License, each Contributor hereby grants to You a perpetual, worldwide, 78 | non-exclusive, no-charge, royalty-free, irrevocable (except as stated in 79 | this section) patent license to make, have made, use, offer to sell, sell, 80 | import, and otherwise transfer the Work, where such license applies only to 81 | those patent claims licensable by such Contributor that are necessarily 82 | infringed by their Contribution(s) alone or by combination of their 83 | Contribution(s) with the Work to which such Contribution(s) was submitted. 84 | If You institute patent litigation against any entity (including a 85 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a 86 | Contribution incorporated within the Work constitutes direct or contributory 87 | patent infringement, then any patent licenses granted to You under this 88 | License for that Work shall terminate as of the date such litigation is 89 | filed. 90 | 91 | 4. Redistribution. You may reproduce and distribute copies of the Work or 92 | Derivative Works thereof in any medium, with or without modifications, and 93 | in Source or Object form, provided that You meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or Derivative Works a 96 | copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices stating 99 | that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works that You 102 | distribute, all copyright, patent, trademark, and attribution notices 103 | from the Source form of the Work, excluding those notices that do not 104 | pertain to any part of the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must include 108 | a readable copy of the attribution notices contained within such NOTICE 109 | file, excluding those notices that do not pertain to any part of the 110 | Derivative Works, in at least one of the following places: within a 111 | NOTICE text file distributed as part of the Derivative Works; within the 112 | Source form or documentation, if provided along with the Derivative 113 | Works; or, within a display generated by the Derivative Works, if and 114 | wherever such third-party notices normally appear. The contents of the 115 | NOTICE file are for informational purposes only and do not modify the 116 | License. You may add Your own attribution notices within Derivative Works 117 | that You distribute, alongside or as an addendum to the NOTICE text from 118 | the Work, provided that such additional attribution notices cannot be 119 | construed as modifying the License. 120 | 121 | You may add Your own copyright statement to Your modifications and may 122 | provide additional or different license terms and conditions for use, 123 | reproduction, or distribution of Your modifications, or for any such 124 | Derivative Works as a whole, provided Your use, reproduction, and 125 | distribution of the Work otherwise complies with the conditions stated in 126 | this License. 127 | 128 | 5. Submission of Contributions. Unless You explicitly state otherwise, any 129 | Contribution intentionally submitted for inclusion in the Work by You to the 130 | Licensor shall be under the terms and conditions of this License, without 131 | any additional terms or conditions. 
Notwithstanding the above, nothing 132 | herein shall supersede or modify the terms of any separate license agreement 133 | you may have executed with Licensor regarding such Contributions. 134 | 135 | 6. Trademarks. This License does not grant permission to use the trade 136 | names, trademarks, service marks, or product names of the Licensor, except 137 | as required for reasonable and customary use in describing the origin of the 138 | Work and reproducing the content of the NOTICE file. 139 | 140 | 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in 141 | writing, Licensor provides the Work (and each Contributor provides its 142 | Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 143 | KIND, either express or implied, including, without limitation, any 144 | warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or 145 | FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining 146 | the appropriateness of using or redistributing the Work and assume any risks 147 | associated with Your exercise of permissions under this License. 148 | 149 | 8. Limitation of Liability. In no event and under no legal theory, whether 150 | in tort (including negligence), contract, or otherwise, unless required by 151 | applicable law (such as deliberate and grossly negligent acts) or agreed to 152 | in writing, shall any Contributor be liable to You for damages, including 153 | any direct, indirect, special, incidental, or consequential damages of any 154 | character arising as a result of this License or out of the use or inability 155 | to use the Work (including but not limited to damages for loss of goodwill, 156 | work stoppage, computer failure or malfunction, or any and all other 157 | commercial damages or losses), even if such Contributor has been advised of 158 | the possibility of such damages. 159 | 160 | 9. Accepting Warranty or Additional Liability. While redistributing the Work 161 | or Derivative Works thereof, You may choose to offer, and charge a fee for, 162 | acceptance of support, warranty, indemnity, or other liability obligations 163 | and/or rights consistent with this License. However, in accepting such 164 | obligations, You may act only on Your own behalf and on Your sole 165 | responsibility, not on behalf of any other Contributor, and only if You 166 | agree to indemnify, defend, and hold each Contributor harmless for any 167 | liability incurred by, or claims asserted against, such Contributor by 168 | reason of your accepting any such warranty or additional liability. END OF 169 | TERMS AND CONDITIONS 170 | 171 | APPENDIX: How to apply the Apache License to your work. 172 | 173 | To apply the Apache License to your work, attach the following boilerplate 174 | notice, with the fields enclosed by brackets "[]" replaced with your own 175 | identifying information. (Don't include the brackets!) The text should be 176 | enclosed in the appropriate comment syntax for the file format. We also 177 | recommend that a file or class name and description of purpose be included on 178 | the same "printed page" as the copyright notice for easier identification 179 | within third-party archives. 180 | 181 | Copyright 2020 Amazon Web Services 182 | 183 | Licensed under the Apache License, Version 2.0 (the "License"); 184 | 185 | you may not use this file except in compliance with the License. 
186 | 187 | You may obtain a copy of the License at 188 | 189 | http://www.apache.org/licenses/LICENSE-2.0 190 | 191 | Unless required by applicable law or agreed to in writing, software 192 | 193 | distributed under the License is distributed on an "AS IS" BASIS, 194 | 195 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 196 | 197 | See the License for the specific language governing permissions and 198 | 199 | limitations under the License. 200 | 201 | * For tensorflow see also this required NOTICE: 202 | Copyright 2019 The TensorFlow Authors. All rights reserved. 203 | -------------------------------------------------------------------------------- /awsio/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from awsio import python 16 | from awsio._version import __version__ 17 | -------------------------------------------------------------------------------- /awsio/csrc/io/s3/s3_file_wrapper.cpp: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | // Licensed under the Apache License, Version 2.0 (the "License"). 4 | // You may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include 16 | 17 | #include 18 | #include 19 | 20 | #include "pybind11/pybind11.h" 21 | #include "s3_io.h" 22 | 23 | namespace { 24 | namespace py = pybind11; 25 | using awsio::S3Init; 26 | PYBIND11_MODULE(_pywrap_s3_io, m) { 27 | py::class_(m, "S3Init") 28 | .def(py::init<>()) 29 | .def("s3_read", 30 | [](S3Init* self, const std::string& file_url) { 31 | std::string result; 32 | self->s3_read(file_url, &result); 33 | return py::bytes(result); 34 | }) 35 | .def("list_files", 36 | [](S3Init* self, const std::string& file_url) { 37 | std::vector filenames; 38 | self->list_files(file_url, &filenames); 39 | return filenames; 40 | }) 41 | .def("file_exists", 42 | [](S3Init* self, const std::string& file_url) { 43 | return self->file_exists(file_url); 44 | }) 45 | .def("get_file_size", 46 | [](S3Init* self, const std::string& file_url) { 47 | return self->get_file_size(file_url); 48 | }); 49 | } 50 | } // namespace 51 | -------------------------------------------------------------------------------- /awsio/csrc/io/s3/s3_io.cpp: -------------------------------------------------------------------------------- 1 | // Original Copyright 2015 The TensorFlow Authors. Licensed under the Apache License, Version 2.0 2 | // Modifications Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | 4 | // Licensed under the Apache License, Version 2.0 (the "License"). 5 | // You may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | 16 | #include "s3_io.h" 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | 37 | #include 38 | #include 39 | 40 | namespace awsio { 41 | namespace { 42 | static const size_t s3ReadBufferSize = 120 * 1024 * 1024; // 16 MB 43 | static const uint64_t s3MultiPartDownloadChunkSize = 50 * 1024 * 1024; // 50 MB 44 | static const int downloadRetries = 3; 45 | static const int64_t s3TimeoutMsec = 300000; 46 | static const int executorPoolSize = 25; 47 | static const int S3GetFilesMaxKeys = 100; 48 | 49 | Aws::Client::ClientConfiguration &setUpS3Config() { 50 | static Aws::Client::ClientConfiguration cfg; 51 | Aws::String config_file; 52 | const char *config_file_env = getenv("AWS_CONFIG_FILE"); 53 | if (config_file_env) { 54 | config_file = config_file_env; 55 | } else { 56 | const char *home_env = getenv("HOME"); 57 | if (home_env) { 58 | config_file = home_env; 59 | config_file += "/.aws/config"; 60 | } 61 | } 62 | Aws::Config::AWSConfigFileProfileConfigLoader loader(config_file); 63 | loader.Load(); 64 | 65 | const char *use_https = getenv("S3_USE_HTTPS"); 66 | if (use_https) { 67 | if (use_https[0] == '0') { 68 | cfg.scheme = Aws::Http::Scheme::HTTP; 69 | } else { 70 | cfg.scheme = Aws::Http::Scheme::HTTPS; 71 | } 72 | } 73 | const char *verify_ssl = getenv("S3_VERIFY_SSL"); 74 | if (verify_ssl) { 75 | if (verify_ssl[0] == '0') { 76 | cfg.verifySSL = false; 77 | } else { 78 | cfg.verifySSL = true; 79 | } 80 | } 81 | 82 | const char *region = getenv("AWS_REGION"); 83 | if (region) { 84 | cfg.region = region; 85 | } else { 86 | cfg.region = "us-west-2"; 87 | } 88 | 89 | const char *endpoint_url = getenv("S3_ENDPOINT_URL"); 90 | if (endpoint_url) { 91 | cfg.endpointOverride = endpoint_url; 92 | } 93 | 94 | const char *proxy_host = getenv("S3_PROXY_HOST"); 95 | if (proxy_host) { 96 | cfg.proxyHost = proxy_host; 97 | } 98 | 99 | const char *proxy_port = getenv("S3_PROXY_PORT"); 100 | if (proxy_port) { 101 | cfg.proxyPort = atoi(proxy_port); 102 | } 103 | return cfg; 104 | } 105 | 106 | void ShutdownClient(std::shared_ptr *s3_client) { 107 | if (s3_client != nullptr) { 108 | delete s3_client; 109 | Aws::SDKOptions options; 110 | Aws::ShutdownAPI(options); 111 | } 112 | } 113 | 114 | void ShutdownTransferManager( 115 | std::shared_ptr *transfer_manager) { 116 | if (transfer_manager != nullptr) { 117 | delete transfer_manager; 118 | } 119 | } 120 | 121 | void ShutdownExecutor(Aws::Utils::Threading::PooledThreadExecutor *executor) { 122 | if (executor != nullptr) { 123 | delete executor; 124 | } 125 | } 126 | 127 | void parseS3Path(const std::string &fname, std::string *bucket, 128 | std::string *object) { 129 | if (fname.empty()) { 130 | throw std::invalid_argument{"The filename cannot be an empty string."}; 131 | } 132 | 133 | if (fname.size() < 5 || fname.substr(0, 5) != "s3://") { 134 | throw std::invalid_argument{ 135 | "The filename must start with the S3 scheme."}; 136 | } 137 | 138 | std::string path = fname.substr(5); 139 | 140 | if (path.empty()) { 141 | throw std::invalid_argument{"The filename cannot be an empty string."}; 142 | } 143 | 144 | auto pos = path.find_first_of('/'); 145 | if (pos == 0) { 146 | throw std::invalid_argument{ 147 | "The filename does not contain a bucket name."}; 148 | } 149 | 150 | *bucket = path.substr(0, pos); 151 | *object = path.substr(pos + 1); 152 | 
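    // If the path contained no '/', find_first_of returned npos: substr(0, npos)
    // above already stored the whole remainder as the bucket name, and
    // substr(npos + 1) wrapped around to substr(0), so the check below resets
    // the object key to an empty string for that bucket-only case.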
if (pos == std::string::npos) { 153 | *object = ""; 154 | } 155 | } 156 | 157 | class S3FS { 158 | public: 159 | S3FS(const std::string &bucket, const std::string &object, 160 | const bool multi_part_download, 161 | std::shared_ptr transfer_manager, 162 | std::shared_ptr s3_client) 163 | : bucket_name_(bucket), 164 | object_name_(object), 165 | multi_part_download_(multi_part_download), 166 | transfer_manager_(transfer_manager), 167 | s3_client_(s3_client) {} 168 | 169 | size_t read(uint64_t offset, size_t n, char *buffer) { 170 | if (multi_part_download_) { 171 | return readS3TransferManager(offset, n, buffer); 172 | } else { 173 | return readS3Client(offset, n, buffer); 174 | } 175 | } 176 | 177 | size_t readS3Client(uint64_t offset, size_t n, char *buffer) { 178 | Aws::S3::Model::GetObjectRequest getObjectRequest; 179 | 180 | getObjectRequest.WithBucket(this->bucket_name_.c_str()) 181 | .WithKey(this->object_name_.c_str()); 182 | 183 | std::string bytes = "bytes="; 184 | bytes += std::to_string(offset) + "-" + std::to_string(offset + n - 1); 185 | 186 | getObjectRequest.SetRange(bytes.c_str()); 187 | 188 | // When you don’t want to load the entire file into memory, 189 | // you can use IOStreamFactory in AmazonWebServiceRequest to pass a 190 | // lambda to create a string stream. 191 | getObjectRequest.SetResponseStreamFactory( 192 | []() { return Aws::New("S3IOAllocationTag"); }); 193 | // get the object 194 | auto getObjectOutcome = this->s3_client_->GetObject(getObjectRequest); 195 | 196 | if (!getObjectOutcome.IsSuccess()) { 197 | auto error = getObjectOutcome.GetError(); 198 | std::cout << "ERROR: " << error.GetExceptionName() << ": " 199 | << error.GetMessage() << std::endl; 200 | return 0; 201 | } else { 202 | n = getObjectOutcome.GetResult().GetContentLength(); 203 | // read data as a block: 204 | getObjectOutcome.GetResult().GetBody().read(buffer, n); 205 | return n; 206 | } 207 | } 208 | 209 | size_t readS3TransferManager(uint64_t offset, size_t n, char *buffer) { 210 | auto create_stream_fn = [&]() { // create stream lambda fn 211 | return Aws::New( 212 | "S3ReadStream", 213 | Aws::New( 214 | "S3ReadStream", reinterpret_cast(buffer), 215 | n)); 216 | }; // This buffer is what we used to initialize streambuf and is in memory 217 | 218 | std::shared_ptr downloadHandle = 219 | this->transfer_manager_.get()->DownloadFile( 220 | this->bucket_name_.c_str(), this->object_name_.c_str(), offset, 221 | n, create_stream_fn); 222 | downloadHandle->WaitUntilFinished(); 223 | 224 | Aws::OFStream storeFile(object_name_.c_str(), 225 | Aws::OFStream::out | Aws::OFStream::trunc); 226 | 227 | if (downloadHandle->GetStatus() != 228 | Aws::Transfer::TransferStatus::COMPLETED) { 229 | auto error = downloadHandle->GetLastError(); 230 | std::cout << "ERROR: " << error.GetExceptionName() << ": " 231 | << error.GetMessage() << std::endl; 232 | return 0; 233 | } else { 234 | return downloadHandle->GetBytesTransferred(); 235 | } 236 | } 237 | 238 | private: 239 | std::string bucket_name_; 240 | std::string object_name_; 241 | bool multi_part_download_; 242 | std::shared_ptr s3_client_; 243 | std::shared_ptr transfer_manager_; 244 | }; 245 | } // namespace 246 | 247 | S3Init::S3Init() 248 | : s3_client_(nullptr, ShutdownClient), 249 | transfer_manager_(nullptr, ShutdownTransferManager), 250 | executor_(nullptr, ShutdownExecutor), 251 | initialization_lock_() { 252 | // Load reading parameters 253 | buffer_size_ = s3ReadBufferSize; 254 | const char *bufferSizeStr = getenv("S3_BUFFER_SIZE"); 255 | if 
(bufferSizeStr) { 256 | buffer_size_ = std::stoull(bufferSizeStr); 257 | } 258 | multi_part_download_ = true; 259 | const char *multi_download_disable_char = 260 | getenv("S3_DISABLE_MULTI_PART_DOWNLOAD"); 261 | if (multi_download_disable_char) { 262 | std::string multi_download_disable_str(multi_download_disable_char); 263 | if (multi_download_disable_str == "ON") { 264 | multi_part_download_ = false; 265 | } 266 | } 267 | initializeS3Client(); 268 | } 269 | 270 | S3Init::~S3Init() {} 271 | 272 | std::shared_ptr S3Init::initializeS3Client() { 273 | std::lock_guard lock(this->initialization_lock_); 274 | if (this->s3_client_.get() == nullptr) { 275 | Aws::SDKOptions options; 276 | Aws::InitAPI(options); 277 | 278 | // Set up the request 279 | this->s3_client_ = 280 | std::shared_ptr(new Aws::S3::S3Client( 281 | setUpS3Config(), 282 | Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, 283 | false)); 284 | } 285 | return this->s3_client_; 286 | } 287 | 288 | std::shared_ptr 289 | S3Init::initializeExecutor() { 290 | if (this->executor_.get() == nullptr) { 291 | this->executor_ = 292 | Aws::MakeShared( 293 | "executor", executorPoolSize); 294 | } 295 | return this->executor_; 296 | } 297 | 298 | std::shared_ptr 299 | S3Init::initializeTransferManager() { 300 | std::shared_ptr s3_client = initializeS3Client(); 301 | std::lock_guard lock(this->initialization_lock_); 302 | 303 | if (this->transfer_manager_.get() == nullptr) { 304 | Aws::Transfer::TransferManagerConfiguration transfer_config( 305 | initializeExecutor().get()); 306 | transfer_config.s3Client = s3_client; 307 | // This buffer is what we used to initialize streambuf and is in memory 308 | transfer_config.bufferSize = s3MultiPartDownloadChunkSize; 309 | transfer_config.transferBufferMaxHeapSize = 310 | (executorPoolSize + 1) * s3MultiPartDownloadChunkSize; 311 | this->transfer_manager_ = 312 | Aws::Transfer::TransferManager::Create(transfer_config); 313 | } 314 | return this->transfer_manager_; 315 | } 316 | 317 | void S3Init::s3_read(const std::string &file_url, std::string *result) { 318 | std::string bucket, object; 319 | parseS3Path(file_url, &bucket, &object); 320 | S3FS s3handler(bucket, object, multi_part_download_, 321 | initializeTransferManager(), initializeS3Client()); 322 | 323 | uint64_t offset = 0; 324 | uint64_t result_size = 0; 325 | uint64_t file_size = this->get_file_size(bucket, object); 326 | std::size_t part_count = (std::max)( 327 | static_cast((file_size + buffer_size_ - 1) / buffer_size_), 328 | static_cast(1)); 329 | result->resize(file_size); 330 | 331 | for (int i = 0; i < part_count; i++) { 332 | 333 | offset = result_size; 334 | 335 | size_t buf_len = std::min(buffer_size_, file_size - result_size); 336 | 337 | size_t read_len = 338 | s3handler.read(offset, buf_len, (char *)(result->data()) + offset); 339 | 340 | result_size += read_len; 341 | 342 | if (result_size == file_size) { 343 | break; 344 | } 345 | 346 | if (read_len != buf_len) { 347 | std::cout << "Result size and buffer size did not match"; 348 | break; 349 | } 350 | } 351 | } 352 | 353 | bool S3Init::file_exists(const std::string &file_url) { 354 | std::string bucket, object; 355 | parseS3Path(file_url, &bucket, &object); 356 | Aws::S3::Model::HeadObjectRequest headObjectRequest; 357 | headObjectRequest.WithBucket(bucket.c_str()).WithKey(object.c_str()); 358 | auto headObjectOutcome = 359 | this->initializeS3Client()->HeadObject(headObjectRequest); 360 | if (headObjectOutcome.IsSuccess()) { 361 | return true; 362 | } 363 | return 
false; 364 | } 365 | 366 | size_t S3Init::get_file_size(const std::string &bucket, 367 | const std::string &object) { 368 | Aws::S3::Model::HeadObjectRequest headObjectRequest; 369 | headObjectRequest.WithBucket(bucket.c_str()).WithKey(object.c_str()); 370 | auto headObjectOutcome = 371 | this->initializeS3Client()->HeadObject(headObjectRequest); 372 | if (headObjectOutcome.IsSuccess()) { 373 | return headObjectOutcome.GetResult().GetContentLength(); 374 | } 375 | Aws::String const &error_aws = headObjectOutcome.GetError().GetMessage(); 376 | std::string error_str(error_aws.c_str(), error_aws.size()); 377 | throw std::invalid_argument(error_str); 378 | return 0; 379 | } 380 | 381 | size_t S3Init::get_file_size(const std::string &file_url){ 382 | std::string bucket, object; 383 | parseS3Path(file_url, &bucket, &object); 384 | return this->get_file_size(bucket, object); 385 | } 386 | 387 | void S3Init::list_files(const std::string &file_url, 388 | std::vector *filenames) { 389 | std::string bucket, prefix; 390 | parseS3Path(file_url, &bucket, &prefix); 391 | Aws::String default_key = ""; 392 | if (prefix.empty()) { 393 | default_key = "/"; 394 | } 395 | 396 | Aws::S3::Model::ListObjectsRequest listObjectsRequest; 397 | listObjectsRequest.WithBucket(bucket.c_str()) 398 | .WithPrefix(prefix.c_str()) 399 | .WithMaxKeys(S3GetFilesMaxKeys); 400 | 401 | Aws::S3::Model::ListObjectsResult listObjectsResult; 402 | do { 403 | auto listObjectsOutcome = 404 | this->initializeS3Client()->ListObjects(listObjectsRequest); 405 | if (!listObjectsOutcome.IsSuccess()) { 406 | Aws::String const &error_aws = 407 | listObjectsOutcome.GetError().GetMessage(); 408 | std::string error_str(error_aws.c_str(), error_aws.size()); 409 | throw std::invalid_argument(error_str); 410 | } 411 | 412 | listObjectsResult = listObjectsOutcome.GetResult(); 413 | Aws::Vector objects = listObjectsResult.GetContents(); 414 | if (!objects.empty()) { 415 | for (const auto &object : objects) { 416 | Aws::String key = default_key + object.GetKey(); 417 | if (key.back() == '/') { 418 | continue; 419 | } 420 | Aws::String bucket_aws(bucket.c_str(), bucket.size()); 421 | Aws::String entry = "s3://" + bucket_aws + "/" + object.GetKey(); 422 | filenames->push_back(entry.c_str()); 423 | } 424 | listObjectsRequest.SetMarker(listObjectsResult.GetContents().back().GetKey()); 425 | } 426 | } while (listObjectsResult.GetIsTruncated()); 427 | } 428 | 429 | } // namespace awsio 430 | -------------------------------------------------------------------------------- /awsio/csrc/io/s3/s3_io.h: -------------------------------------------------------------------------------- 1 | // Original Copyright 2015 The TensorFlow Authors. Licensed under the Apache License, Version 2.0 2 | // Modifications Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | 4 | // Licensed under the Apache License, Version 2.0 (the "License"). 5 | // You may not use this file except in compliance with the License. 6 | // You may obtain a copy of the License at 7 | 8 | // http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | // Unless required by applicable law or agreed to in writing, software 11 | // distributed under the License is distributed on an "AS IS" BASIS, 12 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | // See the License for the specific language governing permissions and 14 | // limitations under the License. 
15 | 16 | #ifndef AWSIO_S3_IO_H 17 | #define AWSIO_S3_IO_H 18 | 19 | #include <aws/core/Aws.h> 20 | #include <aws/core/utils/threading/Executor.h> 21 | #include <aws/s3/S3Client.h> 22 | #include <aws/transfer/TransferManager.h> 23 | 24 | #include <mutex> 25 | 26 | namespace awsio { 27 | // In-memory stream implementation 28 | class S3UnderlyingStream : public Aws::IOStream { 29 | public: 30 | using Base = Aws::IOStream; 31 | 32 | // provide a caller-controlled streambuf so that all transferred 33 | // data is written into this in-memory buffer. 34 | S3UnderlyingStream(std::streambuf *buf) : Base(buf) {} 35 | 36 | virtual ~S3UnderlyingStream() = default; 37 | }; 38 | 39 | class S3Init { 40 | private: 41 | std::shared_ptr<Aws::S3::S3Client> s3_client_; 42 | std::shared_ptr<Aws::Utils::Threading::PooledThreadExecutor> executor_; 43 | std::shared_ptr<Aws::Transfer::TransferManager> transfer_manager_; 44 | size_t buffer_size_; 45 | bool multi_part_download_; 46 | 47 | size_t get_file_size(const std::string &bucket, const std::string &object); 48 | 49 | public: 50 | S3Init(); 51 | 52 | ~S3Init(); 53 | 54 | std::mutex initialization_lock_; 55 | 56 | std::shared_ptr<Aws::S3::S3Client> initializeS3Client(); 57 | std::shared_ptr<Aws::Utils::Threading::PooledThreadExecutor> 58 | initializeExecutor(); 59 | std::shared_ptr<Aws::Transfer::TransferManager> initializeTransferManager(); 60 | 61 | void s3_read(const std::string &file_url, std::string *result); 62 | size_t get_file_size(const std::string &file_url); 63 | bool file_exists(const std::string &file_url); 64 | void list_files(const std::string &file_url, 65 | std::vector<std::string> *filenames); 66 | }; 67 | } // namespace awsio 68 | 69 | #endif // AWSIO_S3_IO_H 70 | -------------------------------------------------------------------------------- /awsio/python/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from . import lib 16 | -------------------------------------------------------------------------------- /awsio/python/lib/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from . import io 16 | -------------------------------------------------------------------------------- /awsio/python/lib/io/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License").
4 | # You may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from . import s3 16 | -------------------------------------------------------------------------------- /awsio/python/lib/io/s3/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .s3dataset import S3Dataset, S3IterableDataset, ShuffleDataset 16 | from .s3dataset import list_files, get_file_size, file_exists 17 | -------------------------------------------------------------------------------- /awsio/python/lib/io/s3/s3dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import tarfile 16 | import io 17 | import zipfile 18 | import re 19 | from torch.utils.data import IterableDataset, Dataset 20 | import torch 21 | import torch.distributed as dist 22 | import _pywrap_s3_io 23 | import random 24 | from itertools import chain 25 | 26 | meta_prefix = "__" 27 | meta_suffix = "__" 28 | 29 | def reraise_exception(exn): # pragma: no cover 30 | """Called in an exception handler to re-raise the exception.""" 31 | raise exn 32 | 33 | 34 | def tardata(fileobj, skip_meta=r"__[^/]*__($|/)", handler=reraise_exception): 35 | """Iterator yielding filename, content pairs for the given tar stream. 
36 | """ 37 | # eliminated from test coverage since checking requires invalid tarfile 38 | try: 39 | stream = tarfile.open(fileobj=io.BytesIO(fileobj), mode="r|*") 40 | for tarinfo in stream: 41 | try: 42 | if not tarinfo.isreg(): # pragma: no cover 43 | continue 44 | fname = tarinfo.name 45 | if fname is None: # pragma: no cover 46 | continue 47 | if ("/" not in fname and fname.startswith(meta_prefix) 48 | and fname.endswith(meta_suffix)): # pragma: no cover 49 | # skipping metadata for now 50 | continue 51 | if skip_meta is not None and re.match(skip_meta, fname): # pragma: no cover 52 | continue 53 | data = stream.extractfile(tarinfo).read() 54 | yield fname, data 55 | except Exception as exn: # pragma: no cover 56 | if handler(exn): 57 | continue 58 | else: 59 | break 60 | del stream 61 | except Exception as exn: # pragma: no cover 62 | handler(exn) 63 | 64 | 65 | def zipdata(fileobj, handler=reraise_exception): 66 | """Iterator yielding filename, content pairs for the given zip stream. 67 | """ 68 | # eliminated from test coverage since checking requires invalid zipfile 69 | try: 70 | with zipfile.ZipFile(io.BytesIO(fileobj), 'r') as zfile: 71 | try: 72 | for file_ in zfile.namelist(): 73 | data = zfile.read(file_) 74 | yield file_, data 75 | except Exception as exn: # pragma: no cover 76 | print("Error:", exn) 77 | except Exception as exn: # pragma: no cover 78 | print("Error:", exn) 79 | 80 | 81 | def file_exists(url): 82 | """Return if file exists or not""" 83 | handler = _pywrap_s3_io.S3Init() 84 | return handler.file_exists(url) 85 | 86 | 87 | def get_file_size(url): 88 | """Return the file size of the specified file""" 89 | handler = _pywrap_s3_io.S3Init() 90 | return handler.get_file_size(url) 91 | 92 | 93 | def list_files(url): 94 | """Returns a list of entries under the same prefix. 95 | """ 96 | handler = _pywrap_s3_io.S3Init() 97 | return handler.list_files(url) 98 | 99 | 100 | class S3BaseClass(object): 101 | """A base class for defining urls_list for S3Dataset and S3IterableDataset 102 | """ 103 | def __init__(self, urls_list): 104 | urls = [urls_list] if isinstance(urls_list, str) else urls_list 105 | self._urls_list = self.create_urls_list(urls) 106 | 107 | def create_urls_list(self, urls): 108 | handler = _pywrap_s3_io.S3Init() 109 | urls_list = list() 110 | for url in urls: 111 | if not handler.file_exists(url): 112 | url_objects = handler.list_files(url) 113 | assert len(url_objects) != 0, \ 114 | f"The directory {url} does not contain any objects." 115 | urls_list.extend(url_objects) 116 | elif urls_list: 117 | urls_list.append(url) 118 | else: 119 | urls_list = [url] 120 | return urls_list 121 | 122 | @property 123 | def urls_list(self): 124 | return self._urls_list 125 | 126 | 127 | class S3Dataset(S3BaseClass, Dataset): 128 | """A mapped-style dataset for objects in s3. 129 | """ 130 | def __init__(self, urls_list): 131 | """ 132 | Args: 133 | urls_list (string or list of strings): the prefix(es) and 134 | filenames starting with 's3://'. Each string is assumed 135 | as a filename first. If the file doesn't exist, the string 136 | is assumed as a prefix. 
137 | """ 138 | S3BaseClass.__init__(self, urls_list) 139 | # Initialize the handler in the worker since we want each worker to have 140 | # it's own handler 141 | self.handler = None 142 | 143 | def __len__(self): 144 | return len(self.urls_list) 145 | 146 | def __getitem__(self, idx): 147 | if self.handler == None: 148 | self.handler = _pywrap_s3_io.S3Init() 149 | filename = self.urls_list[idx] 150 | fileobj = self.handler.s3_read(filename) 151 | return filename, fileobj 152 | 153 | 154 | class S3IterableDataset(S3BaseClass, IterableDataset): 155 | """Iterate over s3 dataset. 156 | It handles some bookkeeping related to DataLoader. 157 | """ 158 | def __init__(self, urls_list, shuffle_urls=False): 159 | self.epoch = 0 160 | self.shuffle_urls = shuffle_urls 161 | self.dist = dist.is_initialized() if dist.is_available() else False 162 | if self.dist: 163 | self.world_size = dist.get_world_size() 164 | self.rank = dist.get_rank() 165 | S3BaseClass.__init__(self, urls_list) 166 | 167 | @property 168 | def shuffled_list(self): 169 | if self.shuffle_urls: 170 | random.seed(self.epoch) 171 | return random.sample(self.urls_list, len(self.urls_list)) 172 | else: 173 | return self.urls_list 174 | 175 | def download_data(self, filename): 176 | if filename[-3:] == "tar": 177 | tarfile = tardata(self.handler.s3_read(filename)) 178 | for fname, content in tarfile: 179 | yield fname, content 180 | elif filename[-3:] == "zip": 181 | zipfile = zipdata(self.handler.s3_read(filename)) 182 | for fname, content in zipfile: 183 | yield fname, content 184 | else: 185 | yield filename, self.handler.s3_read(filename) 186 | 187 | def get_stream(self, urls_list): 188 | return chain.from_iterable(map(self.download_data, urls_list)) 189 | 190 | def worker_dist(self, urls): 191 | if self.dist: 192 | total_size = len(urls) 193 | urls = urls[self.rank:total_size:self.world_size] 194 | 195 | worker_info = torch.utils.data.get_worker_info() 196 | if worker_info is not None: 197 | wid = worker_info.id 198 | num_workers = worker_info.num_workers 199 | length = len(urls) 200 | return urls[wid:length:num_workers] 201 | else: 202 | return urls 203 | 204 | def __iter__(self): 205 | self.handler = _pywrap_s3_io.S3Init() 206 | urls = self.worker_dist(self.shuffled_list) 207 | return self.get_stream(urls) 208 | 209 | def __len__(self): 210 | return len(self.urls_list) 211 | 212 | def set_epoch(self, epoch): 213 | self.epoch = epoch 214 | 215 | 216 | class ShuffleDataset(torch.utils.data.IterableDataset): 217 | def __init__(self, dataset, buffer_size): 218 | super().__init__() 219 | self.dataset = dataset 220 | self.buffer_size = buffer_size 221 | 222 | def __iter__(self): 223 | shufbuf = [] 224 | try: 225 | dataset_iter = iter(self.dataset) 226 | for _ in range(self.buffer_size): 227 | shufbuf.append(next(dataset_iter)) 228 | except StopIteration: 229 | self.buffer_size = len(shufbuf) 230 | 231 | try: 232 | while True: 233 | try: 234 | if self.buffer_size == 0: 235 | break 236 | evict_idx = random.randint(0, self.buffer_size - 1) 237 | yield shufbuf.pop(evict_idx) 238 | item = next(dataset_iter) 239 | shufbuf.append(item) 240 | except StopIteration: 241 | break 242 | while len(shufbuf) > 0: 243 | evict_idx = random.randint(0, len(shufbuf) - 1) 244 | yield shufbuf.pop(evict_idx) 245 | except GeneratorExit: # pragma: no cover 246 | pass 247 | -------------------------------------------------------------------------------- /examples/s3_cv_iterable_example.py: 
-------------------------------------------------------------------------------- 1 | from torch.utils.data import IterableDataset, DataLoader 2 | from awsio.python.lib.io.s3.s3dataset import S3IterableDataset 3 | from itertools import islice 4 | from PIL import Image 5 | import io 6 | from torchvision import transforms 7 | 8 | 9 | class ImageNetS3(IterableDataset): 10 | def __init__(self, url_list, shuffle_urls=False, transform=None): 11 | self.s3_iter_dataset = S3IterableDataset(url_list, 12 | shuffle_urls) 13 | self.transform = transform 14 | 15 | 16 | def data_generator(self): 17 | try: 18 | while True: 19 | # Based on aplhabetical order of files sequence of label and image will change. 20 | # e.g. for files 0186304.cls 0186304.jpg, 0186304.cls will be fetched first 21 | label_fname, label_fobj = next(self.s3_iter_dataset_iterator) 22 | image_fname, image_fobj = next(self.s3_iter_dataset_iterator) 23 | label = int(label_fobj) 24 | image_np = Image.open(io.BytesIO(image_fobj)).convert('RGB') 25 | 26 | # Apply torch visioin transforms if provided 27 | if self.transform is not None: 28 | image_np = self.transform(image_np) 29 | yield image_np, label 30 | 31 | except StopIteration: 32 | raise StopIteration 33 | 34 | def __iter__(self): 35 | self.s3_iter_dataset_iterator = iter(self.s3_iter_dataset) 36 | return self.data_generator() 37 | 38 | batch_size = 32 39 | 40 | url_list = ["s3://image-data-bucket/imagenet-train-000000.tar"] 41 | # Torchvision transforms to apply on data 42 | 43 | preproc = transforms.Compose([ 44 | transforms.ToTensor(), 45 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 46 | transforms.Resize((100, 100)) 47 | ]) 48 | 49 | dataset = ImageNetS3(url_list, transform=preproc) 50 | 51 | dataloader = DataLoader(dataset, 52 | batch_size=batch_size, 53 | num_workers=64) 54 | 55 | for image, label in islice(dataset, 0, 3): 56 | print(image.shape, label) -------------------------------------------------------------------------------- /examples/s3_cv_iterable_shuffle_example.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import IterableDataset, DataLoader 2 | from awsio.python.lib.io.s3.s3dataset import S3IterableDataset 3 | from PIL import Image 4 | import io 5 | from torchvision import transforms 6 | 7 | 8 | class ImageNetS3(IterableDataset): 9 | def __init__(self, url_list, shuffle_urls=False, transform=None): 10 | self.s3_iter_dataset = S3IterableDataset(url_list, 11 | shuffle_urls) 12 | self.transform = transform 13 | 14 | 15 | def data_generator(self): 16 | try: 17 | while True: 18 | # Based on aplhabetical order of files sequence of label and image will change. 19 | # e.g. 
for files 0186304.cls 0186304.jpg, 0186304.cls will be fetched first 20 | label_fname, label_fobj = next(self.s3_iter_dataset_iterator) 21 | image_fname, image_fobj = next(self.s3_iter_dataset_iterator) 22 | label = int(label_fobj) 23 | image_np = Image.open(io.BytesIO(image_fobj)).convert('RGB') 24 | 25 | # Apply torch visioin transforms if provided 26 | if self.transform is not None: 27 | image_np = self.transform(image_np) 28 | yield image_np, label 29 | 30 | except StopIteration: 31 | raise StopIteration 32 | 33 | def set_epoch(self, epoch): 34 | self.s3_iter_dataset.set_epoch(epoch) 35 | 36 | def __iter__(self): 37 | self.s3_iter_dataset_iterator = iter(self.s3_iter_dataset) 38 | return self.data_generator() 39 | 40 | 41 | url_list = ["s3://pt-s3plugin-test-data-west2/integration_tests/imagenet-train-000000.tar"] 42 | # Torchvision transforms to apply on data 43 | 44 | preproc = transforms.Compose([ 45 | transforms.ToTensor(), 46 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 47 | transforms.Resize((100, 100)) 48 | ]) 49 | 50 | dataset = ImageNetS3(url_list, transform=preproc, shuffle_urls=True) 51 | 52 | dataloader = DataLoader(dataset, num_workers=4, batch_size=32) 53 | 54 | for e in range(5): 55 | dataset.set_epoch(e) 56 | -------------------------------------------------------------------------------- /examples/s3_cv_map_example.py: -------------------------------------------------------------------------------- 1 | 2 | from awsio.python.lib.io.s3.s3dataset import S3Dataset 3 | from torch.utils.data import DataLoader 4 | 5 | url_list = ['s3://image-data-bucket/train/n01440764/n01440764_10026.JPEG', 6 | 's3://image-data-bucket/train/n01440764/n01440764_10027.JPEG', 7 | 's3://image-data-bucket/train/n01440764/n01440764_10029.JPEG'] 8 | 9 | dataset = S3Dataset(url_list) 10 | dataloader = DataLoader(dataset, 11 | batch_size=2, 12 | num_workers=64) 13 | 14 | for i, (image, label) in enumerate(dataloader): 15 | print(type(image), len(image)) 16 | 17 | -------------------------------------------------------------------------------- /examples/s3_cv_transform.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import DataLoader 2 | from awsio.python.lib.io.s3.s3dataset import S3Dataset 3 | from torchvision import transforms 4 | from PIL import Image 5 | import io 6 | 7 | url_list = ['s3://pt-s3plugin-test-data-west2/images/test_9970.JPEG', 8 | 's3://pt-s3plugin-test-data-west2/images/test_9971.JPEG', 9 | 's3://pt-s3plugin-test-data-west2/images/test_9972.JPEG'] 10 | 11 | class S3ImageSet(S3Dataset): 12 | def __init__(self, url, transform=None): 13 | super().__init__(url) 14 | self.transform = transform 15 | 16 | def __getitem__(self, idx) : 17 | img_name, img = super(S3ImageSet, self).__getitem__(idx) 18 | img = Image.open(io.BytesIO(img)).convert('RGB') 19 | if self.transform is not None: 20 | img = self.transform(img) 21 | return img 22 | 23 | preproc = transforms.Compose([ 24 | transforms.ToTensor(), 25 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 26 | transforms.Resize((100, 100)) 27 | ]) 28 | dataset = S3ImageSet(url_list,transform=preproc) 29 | 30 | dataloader = DataLoader(dataset, 31 | batch_size=2, 32 | num_workers=64) 33 | 34 | for i in range(len(dataset)): 35 | print(dataset[i]) 36 | -------------------------------------------------------------------------------- /examples/s3_imagenet_example.py: 
-------------------------------------------------------------------------------- 1 | # Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | # Based on https://github.com/pytorch/examples/blob/master/imagenet/main.py 15 | 16 | import argparse 17 | import os 18 | import random 19 | import shutil 20 | import time 21 | import warnings 22 | 23 | import torch 24 | import torch.nn as nn 25 | import torch.nn.parallel 26 | import torch.backends.cudnn as cudnn 27 | import torch.distributed as dist 28 | import torch.optim 29 | import torch.multiprocessing as mp 30 | import torch.utils.data 31 | import torch.utils.data.distributed 32 | import torchvision.transforms as transforms 33 | #import torchvision.datasets as datasets 34 | from torch.utils.data import IterableDataset, DataLoader 35 | from awsio.python.lib.io.s3.s3dataset import S3IterableDataset 36 | 37 | import torchvision.models as models 38 | from PIL import Image 39 | import io 40 | from itertools import islice 41 | 42 | model_names = sorted(name for name in models.__dict__ 43 | if name.islower() and not name.startswith("__") 44 | and callable(models.__dict__[name])) 45 | 46 | parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') 47 | 48 | parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18', 49 | choices=model_names, 50 | help='model architecture: ' + 51 | ' | '.join(model_names) + 52 | ' (default: resnet18)') 53 | parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', 54 | help='number of data loading workers (default: 4)') 55 | parser.add_argument('--epochs', default=2, type=int, metavar='N', 56 | help='number of total epochs to run') 57 | parser.add_argument('--start-epoch', default=0, type=int, metavar='N', 58 | help='manual epoch number (useful on restarts)') 59 | parser.add_argument('-b', '--batch-size', default=256, type=int, 60 | metavar='N', 61 | help='mini-batch size (default: 256), this is the total ' 62 | 'batch size of all GPUs on the current node when ' 63 | 'using Data Parallel or Distributed Data Parallel') 64 | parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, 65 | metavar='LR', help='initial learning rate', dest='lr') 66 | parser.add_argument('--momentum', default=0.9, type=float, metavar='M', 67 | help='momentum') 68 | parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, 69 | metavar='W', help='weight decay (default: 1e-4)', 70 | dest='weight_decay') 71 | parser.add_argument('-p', '--print-freq', default=10, type=int, 72 | metavar='N', help='print frequency (default: 10)') 73 | parser.add_argument('--resume', default='', type=str, metavar='PATH', 74 | help='path to latest checkpoint (default: none)') 75 | parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', 76 | help='evaluate model on validation set') 77 | parser.add_argument('--pretrained', dest='pretrained', action='store_true', 78 | help='use pre-trained model') 79 | 
parser.add_argument('--world-size', default=-1, type=int, 80 | help='number of nodes for distributed training') 81 | parser.add_argument('--rank', default=-1, type=int, 82 | help='node rank for distributed training') 83 | parser.add_argument('--dist-url', default='tcp://224.66.41.62:23456', type=str, 84 | help='url used to set up distributed training') 85 | parser.add_argument('--dist-backend', default='nccl', type=str, 86 | help='distributed backend') 87 | parser.add_argument('--seed', default=None, type=int, 88 | help='seed for initializing training. ') 89 | parser.add_argument('--gpu', default=None, type=int, 90 | help='GPU id to use.') 91 | parser.add_argument('--multiprocessing-distributed', action='store_true', 92 | help='Use multi-processing distributed training to launch ' 93 | 'N processes per node, which has N GPUs. This is the ' 94 | 'fastest way to use PyTorch for either single node or ' 95 | 'multi node data parallel training') 96 | 97 | best_acc1 = 0 98 | 99 | 100 | class ImageNetS3(IterableDataset): 101 | def __init__(self, url_list, shuffle_urls=False, transform=None): 102 | self.s3_iter_dataset = S3IterableDataset(url_list, 103 | shuffle_urls) 104 | self.transform = transform 105 | 106 | 107 | def data_generator(self): 108 | try: 109 | while True: 110 | # Based on aplhabetical order of files sequence of label and image will change. 111 | # e.g. for files 0186304.cls 0186304.jpg, 0186304.cls will be fetched first 112 | label_fname, label_fobj = next(self.s3_iter_dataset_iterator) 113 | image_fname, image_fobj = next(self.s3_iter_dataset_iterator) 114 | label = int(label_fobj) 115 | image_np = Image.open(io.BytesIO(image_fobj)).convert('RGB') 116 | 117 | # Apply torch visioin transforms if provided 118 | if self.transform is not None: 119 | image_np = self.transform(image_np) 120 | yield image_np, label 121 | 122 | except StopIteration: 123 | return 124 | 125 | def __iter__(self): 126 | self.s3_iter_dataset_iterator = iter(self.s3_iter_dataset) 127 | return self.data_generator() 128 | 129 | def __len__(self): 130 | return 1000 131 | 132 | def main(): 133 | args = parser.parse_args() 134 | 135 | if args.seed is not None: 136 | random.seed(args.seed) 137 | torch.manual_seed(args.seed) 138 | cudnn.deterministic = True 139 | warnings.warn('You have chosen to seed training. ' 140 | 'This will turn on the CUDNN deterministic setting, ' 141 | 'which can slow down your training considerably! ' 142 | 'You may see unexpected behavior when restarting ' 143 | 'from checkpoints.') 144 | 145 | if args.gpu is not None: 146 | warnings.warn('You have chosen a specific GPU. 
This will completely ' 147 | 'disable data parallelism.') 148 | 149 | if args.dist_url == "env://" and args.world_size == -1: 150 | args.world_size = int(os.environ["WORLD_SIZE"]) 151 | 152 | args.distributed = args.world_size > 1 or args.multiprocessing_distributed 153 | 154 | ngpus_per_node = torch.cuda.device_count() 155 | if args.multiprocessing_distributed: 156 | # Since we have ngpus_per_node processes per node, the total world_size 157 | # needs to be adjusted accordingly 158 | args.world_size = ngpus_per_node * args.world_size 159 | # Use torch.multiprocessing.spawn to launch distributed processes: the 160 | # main_worker process function 161 | mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) 162 | else: 163 | # Simply call main_worker function 164 | main_worker(args.gpu, ngpus_per_node, args) 165 | 166 | 167 | def main_worker(gpu, ngpus_per_node, args): 168 | global best_acc1 169 | args.gpu = gpu 170 | 171 | if args.gpu is not None: 172 | print("Use GPU: {} for training".format(args.gpu)) 173 | 174 | if args.distributed: 175 | if args.dist_url == "env://" and args.rank == -1: 176 | args.rank = int(os.environ["RANK"]) 177 | if args.multiprocessing_distributed: 178 | # For multiprocessing distributed training, rank needs to be the 179 | # global rank among all the processes 180 | args.rank = args.rank * ngpus_per_node + gpu 181 | dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, 182 | world_size=args.world_size, rank=args.rank) 183 | # create model 184 | if args.pretrained: 185 | print("=> using pre-trained model '{}'".format(args.arch)) 186 | model = models.__dict__[args.arch](pretrained=True) 187 | else: 188 | print("=> creating model '{}'".format(args.arch)) 189 | model = models.__dict__[args.arch]() 190 | 191 | if not torch.cuda.is_available(): 192 | print('using CPU, this will be slow') 193 | elif args.distributed: 194 | # For multiprocessing distributed, DistributedDataParallel constructor 195 | # should always set the single device scope, otherwise, 196 | # DistributedDataParallel will use all available devices. 
197 | if args.gpu is not None: 198 | torch.cuda.set_device(args.gpu) 199 | model.cuda(args.gpu) 200 | # When using a single GPU per process and per 201 | # DistributedDataParallel, we need to divide the batch size 202 | # ourselves based on the total number of GPUs we have 203 | args.batch_size = int(args.batch_size / ngpus_per_node) 204 | args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) 205 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) 206 | else: 207 | model.cuda() 208 | # DistributedDataParallel will divide and allocate batch_size to all 209 | # available GPUs if device_ids are not set 210 | model = torch.nn.parallel.DistributedDataParallel(model) 211 | elif args.gpu is not None: 212 | torch.cuda.set_device(args.gpu) 213 | model = model.cuda(args.gpu) 214 | else: 215 | # DataParallel will divide and allocate batch_size to all available GPUs 216 | if args.arch.startswith('alexnet') or args.arch.startswith('vgg'): 217 | model.features = torch.nn.DataParallel(model.features) 218 | model.cuda() 219 | else: 220 | model = torch.nn.DataParallel(model).cuda() 221 | 222 | # define loss function (criterion) and optimizer 223 | criterion = nn.CrossEntropyLoss().cuda(args.gpu) 224 | 225 | optimizer = torch.optim.SGD(model.parameters(), args.lr, 226 | momentum=args.momentum, 227 | weight_decay=args.weight_decay) 228 | 229 | # optionally resume from a checkpoint 230 | if args.resume: 231 | if os.path.isfile(args.resume): 232 | print("=> loading checkpoint '{}'".format(args.resume)) 233 | if args.gpu is None: 234 | checkpoint = torch.load(args.resume) 235 | else: 236 | # Map model to be loaded to specified single gpu. 237 | loc = 'cuda:{}'.format(args.gpu) 238 | checkpoint = torch.load(args.resume, map_location=loc) 239 | args.start_epoch = checkpoint['epoch'] 240 | best_acc1 = checkpoint['best_acc1'] 241 | if args.gpu is not None: 242 | # best_acc1 may be from a checkpoint from a different GPU 243 | best_acc1 = best_acc1.to(args.gpu) 244 | model.load_state_dict(checkpoint['state_dict']) 245 | optimizer.load_state_dict(checkpoint['optimizer']) 246 | print("=> loaded checkpoint '{}' (epoch {})" 247 | .format(args.resume, checkpoint['epoch'])) 248 | else: 249 | print("=> no checkpoint found at '{}'".format(args.resume)) 250 | 251 | cudnn.benchmark = True 252 | 253 | url_list = ["s3://pt-s3plugin-test-data-west2/integration_tests/imagenet-train-000000.tar"] 254 | 255 | preproc = transforms.Compose([ 256 | transforms.RandomResizedCrop(224), 257 | transforms.RandomHorizontalFlip(), 258 | transforms.ToTensor(), 259 | transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), 260 | ]) 261 | 262 | os.environ['AWS_REGION'] = 'us-west-2' 263 | 264 | train_dataset = ImageNetS3(url_list, transform=preproc) 265 | train_sampler = None 266 | 267 | train_loader = torch.utils.data.DataLoader( 268 | train_dataset, batch_size=args.batch_size, 269 | num_workers=args.workers, pin_memory=True, sampler=train_sampler) 270 | 271 | for epoch in range(args.start_epoch, args.epochs): 272 | if args.distributed: 273 | train_sampler.set_epoch(epoch) 274 | adjust_learning_rate(optimizer, epoch, args) 275 | 276 | # train for one epoch 277 | train(train_loader, model, criterion, optimizer, epoch, args) 278 | 279 | 280 | def train(train_loader, model, criterion, optimizer, epoch, args): 281 | batch_time = AverageMeter('Time', ':6.3f') 282 | data_time = AverageMeter('Data', ':6.3f') 283 | losses = AverageMeter('Loss', ':.4e') 284 | top1 = AverageMeter('Acc@1', ':6.2f') 
285 | top5 = AverageMeter('Acc@5', ':6.2f') 286 | progress = ProgressMeter( 287 | len(train_loader), 288 | [batch_time, data_time, losses, top1, top5], 289 | prefix="Epoch: [{}]".format(epoch)) 290 | 291 | # switch to train mode 292 | model.train() 293 | 294 | end = time.time() 295 | for i, (images, target) in enumerate(train_loader): 296 | # measure data loading time 297 | data_time.update(time.time() - end) 298 | 299 | if args.gpu is not None: 300 | images = images.cuda(args.gpu, non_blocking=True) 301 | if torch.cuda.is_available(): 302 | target = target.cuda(args.gpu, non_blocking=True) 303 | 304 | # compute output 305 | output = model(images) 306 | loss = criterion(output, target) 307 | 308 | # measure accuracy and record loss 309 | acc1, acc5 = accuracy(output, target, topk=(1, 5)) 310 | losses.update(loss.item(), images.size(0)) 311 | top1.update(acc1[0], images.size(0)) 312 | top5.update(acc5[0], images.size(0)) 313 | 314 | # compute gradient and do SGD step 315 | optimizer.zero_grad() 316 | loss.backward() 317 | optimizer.step() 318 | 319 | # measure elapsed time 320 | batch_time.update(time.time() - end) 321 | end = time.time() 322 | 323 | if i % args.print_freq == 0: 324 | progress.display(i) 325 | 326 | 327 | class AverageMeter(object): 328 | """Computes and stores the average and current value""" 329 | def __init__(self, name, fmt=':f'): 330 | self.name = name 331 | self.fmt = fmt 332 | self.reset() 333 | 334 | def reset(self): 335 | self.val = 0 336 | self.avg = 0 337 | self.sum = 0 338 | self.count = 0 339 | 340 | def update(self, val, n=1): 341 | self.val = val 342 | self.sum += val * n 343 | self.count += n 344 | self.avg = self.sum / self.count 345 | 346 | def __str__(self): 347 | fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' 348 | return fmtstr.format(**self.__dict__) 349 | 350 | 351 | class ProgressMeter(object): 352 | def __init__(self, num_batches, meters, prefix=""): 353 | self.batch_fmtstr = self._get_batch_fmtstr(num_batches) 354 | self.meters = meters 355 | self.prefix = prefix 356 | 357 | def display(self, batch): 358 | entries = [self.prefix + self.batch_fmtstr.format(batch)] 359 | entries += [str(meter) for meter in self.meters] 360 | print('\t'.join(entries)) 361 | 362 | def _get_batch_fmtstr(self, num_batches): 363 | num_digits = len(str(num_batches // 1)) 364 | fmt = '{:' + str(num_digits) + 'd}' 365 | return '[' + fmt + '/' + fmt.format(num_batches) + ']' 366 | 367 | 368 | def adjust_learning_rate(optimizer, epoch, args): 369 | """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" 370 | lr = args.lr * (0.1 ** (epoch // 30)) 371 | for param_group in optimizer.param_groups: 372 | param_group['lr'] = lr 373 | 374 | 375 | def accuracy(output, target, topk=(1,)): 376 | """Computes the accuracy over the k top predictions for the specified values of k""" 377 | with torch.no_grad(): 378 | maxk = max(topk) 379 | batch_size = target.size(0) 380 | 381 | _, pred = output.topk(maxk, 1, True, True) 382 | pred = pred.t() 383 | correct = pred.eq(target.view(1, -1).expand_as(pred)) 384 | 385 | res = [] 386 | for k in topk: 387 | correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) 388 | res.append(correct_k.mul_(100.0 / batch_size)) 389 | return res 390 | 391 | 392 | if __name__ == '__main__': 393 | main() 394 | 395 | -------------------------------------------------------------------------------- /examples/s3_nlp_iterable_example.py: -------------------------------------------------------------------------------- 1 | 
import torch 2 | from torch.utils.data import IterableDataset, DataLoader 3 | from itertools import islice 4 | # data is in hdf5 format and converted to numpy 5 | import h5py 6 | import numpy as np 7 | 8 | # packages for this example 9 | import io 10 | from awsio.python.lib.io.s3.s3dataset import S3IterableDataset 11 | 12 | def create_data_samples_from_file(fileobj): 13 | """Convert bytes from S3IterableDataset to numpy arrays. 14 | Helper function for class s3_dataset. 15 | Returns a list of six numpy arrays which each contain 16 | data (by key) for all samples in a file. 17 | Keyword arguments: 18 | fileobj -- the bytes string provided by S3IterableDataset 19 | """ 20 | keys = ['input_ids', 'input_mask', 'segment_ids', \ 21 | 'masked_lm_positions', 'masked_lm_ids', 'next_sentence_labels'] 22 | dataset = io.BytesIO(fileobj) 23 | with h5py.File(dataset, "r") as f: 24 | data_file = [np.asarray(f[key][:]) for key in keys] 25 | return data_file 26 | 27 | 28 | class s3_dataset(IterableDataset): 29 | """Dataset used for training. 30 | Yields one sample at a time. 31 | """ 32 | def __init__(self, s3_directory): 33 | self.s3_directory = s3_directory 34 | self.dataset = S3IterableDataset(self.s3_directory, shuffle_urls=True) 35 | 36 | def data_generator(self): 37 | try: 38 | while True: 39 | filename, fileobj = next(self.dataset_iter) 40 | # data_samples: list of six numpy arrays (each array contains all samples) 41 | data_samples = create_data_samples_from_file(fileobj) 42 | # transpose data_samples so that each index represents one sample 43 | for sample in list(zip(*data_samples)): 44 | yield sample 45 | 46 | except StopIteration as e: 47 | raise e 48 | 49 | def __iter__(self): 50 | self.dataset_iter = iter(self.dataset) 51 | return self.data_generator() 52 | 53 | 54 | s3_directory = "s3://bert-data-bucket/training/wiki_books_corpus_training" 55 | train_dataset = s3_dataset(s3_directory=s3_directory) 56 | for sample in islice(train_dataset, 0, 1): 57 | print(sample) 58 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | 4 | [metadata] 5 | license_file = LICENSE 6 | 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
13 | 14 | 15 | import os 16 | import re 17 | import sys 18 | import platform 19 | import subprocess 20 | 21 | from pathlib import Path 22 | from setuptools import setup, Extension, find_packages 23 | from setuptools.command.build_ext import build_ext 24 | from distutils.version import LooseVersion 25 | 26 | class CMakeExtension(Extension): 27 | def __init__(self, name, sourcedir=''): 28 | Extension.__init__(self, name, sources=[]) 29 | self.sourcedir = os.path.abspath(sourcedir) 30 | 31 | 32 | class CMakeBuild(build_ext): 33 | def run(self): 34 | try: 35 | out = subprocess.check_output(['cmake', '--version']) 36 | except OSError: 37 | raise RuntimeError("CMake must be installed to build the following extensions: " + 38 | ", ".join(e.name for e in self.extensions)) 39 | 40 | if platform.system() == "Windows": 41 | cmake_version = LooseVersion(re.search(r'version\s*([\d.]+)', out.decode()).group(1)) 42 | if cmake_version < '3.1.0': 43 | raise RuntimeError("CMake >= 3.1.0 is required on Windows") 44 | 45 | for ext in self.extensions: 46 | self.build_extension(ext) 47 | 48 | def build_extension(self, ext): 49 | extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name))) 50 | # required for auto-detection of auxiliary "native" libs 51 | if not extdir.endswith(os.path.sep): 52 | extdir += os.path.sep 53 | 54 | cmake_args = ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + extdir, 55 | '-DPYTHON_EXECUTABLE=' + sys.executable, 56 | '-DCMAKE_PREFIX_PATH=' + os.environ['CMAKE_PREFIX_PATH'], 57 | '-DCMAKE_CXX_FLAGS=' + "-fPIC"] 58 | 59 | cfg = 'Debug' if self.debug else 'Release' 60 | build_args = ['--config', cfg] 61 | 62 | if platform.system() == "Windows": 63 | cmake_args += ['-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{}={}'.format(cfg.upper(), extdir)] 64 | if sys.maxsize > 2**32: 65 | cmake_args += ['-A', 'x64'] 66 | build_args += ['--', '/m'] 67 | else: 68 | cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg] 69 | build_args += ['--', '-j2'] 70 | 71 | env = os.environ.copy() 72 | env['CXXFLAGS'] = '{} -DVERSION_INFO=\\"{}\\"'.format(env.get('CXXFLAGS', ''), 73 | self.distribution.get_version()) 74 | if not os.path.exists(self.build_temp): 75 | os.makedirs(self.build_temp) 76 | subprocess.check_call(['cmake', ext.sourcedir] + cmake_args, cwd=self.build_temp, env=env) 77 | subprocess.check_call(['cmake', '--build', '.'] + build_args, cwd=self.build_temp) 78 | 79 | 80 | def get_sha(): 81 | try: 82 | return subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode('ascii').strip() 83 | except Exception: 84 | return 'Unknown' 85 | 86 | def get_version(sha): 87 | version = open('version.txt', 'r').read().strip() 88 | if sha != 'Unknown': 89 | version += '+' + sha[:7] 90 | return version 91 | 92 | def write_version_file(): 93 | sha = get_sha() 94 | version = get_version(sha) 95 | version_path = os.path.join(Path.cwd(), 'awsio', '_version.py') 96 | with open(version_path, 'w') as f: 97 | f.write(f"__version__ = \"{version}\"\n") 98 | 99 | if __name__ == "__main__": 100 | # metadata 101 | package_name = 'awsio' 102 | required_packages = ["torch>=1.5.1"] 103 | 104 | # define __version__ 105 | write_version_file() 106 | exec(open("awsio/_version.py").read()) 107 | print(f"Building wheel for {package_name}-{__version__}") 108 | 109 | with open('README.md') as f: 110 | readme = f.read() 111 | 112 | setup( 113 | name=package_name, 114 | version=__version__, 115 | author='Amazon Web Services', 116 | author_email='aws-pytorch@amazon.com', 117 | description='A package for creating PyTorch Datasets using objects in AWS S3 
buckets', 118 | long_description=readme, 119 | license='Apache License 2.0', 120 | keywords='ML Amazon AWS AI PyTorch', 121 | 122 | # Package info 123 | packages=find_packages(exclude=('test',)), 124 | zip_safe=False, 125 | install_requires=required_packages, 126 | extras_require={ 127 | "scipy": ["scipy"], 128 | }, 129 | ext_modules=[CMakeExtension('aws_io')], 130 | cmdclass=dict(build_ext=CMakeBuild), 131 | classifiers=[ 132 | "Programming Language :: Python :: 3", 133 | "License :: OSI Approved :: Apache Software License", 134 | "Operating System :: OS Independent", 135 | ], 136 | ) 137 | -------------------------------------------------------------------------------- /tests/py-tests/test_integration.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import io 16 | import math 17 | import boto3 18 | from collections import defaultdict 19 | from torch.utils.data import DataLoader 20 | 21 | from awsio.python.lib.io.s3.s3dataset import S3Dataset, S3IterableDataset, ShuffleDataset 22 | from awsio.python.lib.io.s3.s3dataset import tardata, zipdata 23 | 24 | def read_using_boto(bucket, prefix_list): 25 | s= boto3.client('s3') 26 | s3_obj_set = set() 27 | 28 | for prefix in prefix_list: 29 | fs = io.BytesIO() 30 | s.download_fileobj(bucket, 31 | prefix, 32 | fs) 33 | file_content = fs.getvalue() 34 | 35 | if prefix[-3:] == "tar": 36 | tarfile = tardata(file_content) 37 | for fname, content in tarfile: 38 | s3_obj_set.add((fname, content)) 39 | elif prefix[-3:] == "zip": 40 | zipfile = zipdata(file_content) 41 | for fname, content in zipfile: 42 | s3_obj_set.add((fname, content)) 43 | else: 44 | s3_obj_set.add((prefix.split("/")[-1], file_content)) 45 | return s3_obj_set 46 | 47 | def get_file_list(bucket, files_prefix): 48 | s3 = boto3.resource('s3') 49 | my_bucket = s3.Bucket(bucket) 50 | 51 | file_list = [summary.key for summary in my_bucket.objects.filter(Prefix=files_prefix)] 52 | return file_list[1:] 53 | 54 | def run_workers(dataset_type, url_list, batch_size, boto_obj_set): 55 | epochs = 2 56 | dataset_class = eval(dataset_type) 57 | for num_workers in [ 0, 4, 16]: 58 | s3_obj_set = set() 59 | dataset = dataset_class(url_list) 60 | dataloader = DataLoader(dataset, 61 | batch_size=batch_size, 62 | num_workers=num_workers) 63 | for epoch in range(epochs): 64 | print ("\nTesting " + dataset_type + " with {} workers for epoch {}".format( 65 | num_workers, epoch + 1)) 66 | num_batches = 0 67 | for fname, fobj in dataloader: 68 | fname = [x.split("/")[-1] for x in fname] 69 | batch_set = set(map(tuple, zip(fname, fobj))) 70 | s3_obj_set.update(batch_set) 71 | num_batches += 1 72 | 73 | assert s3_obj_set == boto_obj_set, "Test fails for {} workers for".format( 74 | num_workers) + dataset_type 75 | print ("All data correctly loaded for " + dataset_type + " for {} workers".format(num_workers)) 76 | 77 | 
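# Illustrative sketch (hypothetical bucket and key): the plugin-vs-boto3 equivalence
# that read_using_boto() and run_workers() verify over whole datasets reduces, for a
# single object, to the check below. It reuses the io, boto3 and S3Dataset imports
# already at the top of this module and assumes readable AWS credentials.
def _single_object_matches_boto3(bucket="my-test-bucket", key="data/sample.bin"):
    url = "s3://" + bucket + "/" + key
    # read the object through the plugin's map-style dataset
    _, plugin_bytes = S3Dataset(url)[0]
    # read the same object directly with boto3
    fs = io.BytesIO()
    boto3.client("s3").download_fileobj(bucket, key, fs)
    return plugin_bytes == fs.getvalue()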
def test_tarfiles(): 78 | bucket = "pt-s3plugin-test-data-west2" 79 | tarfiles_list = ["integration_tests/imagenet-train-000000.tar"] 80 | 81 | print("\nINITIATING: TARFILES READ TEST") 82 | boto_obj_set = read_using_boto(bucket, tarfiles_list) 83 | batch_size = 32 84 | url_list = ["s3://" + bucket + "/" + tarfile for tarfile in tarfiles_list] 85 | run_workers("S3IterableDataset", url_list, batch_size, boto_obj_set) 86 | 87 | def test_files(): 88 | bucket = "pt-s3plugin-test-data-west2" 89 | files_prefix = "integration_tests/files" 90 | assert files_prefix[-1] != "/", "Enter Prefix without trailing \"/\" else error" 91 | 92 | prefix_list = get_file_list(bucket, files_prefix) 93 | boto_obj_set = read_using_boto(bucket, prefix_list) 94 | batch_size = 32 95 | 96 | print ("\nINITIATING: INDIVIDUAL FILE READ TEST") 97 | url_list = ["s3://" + bucket + "/" + prefix for prefix in prefix_list] 98 | run_workers("S3IterableDataset", url_list, batch_size, boto_obj_set) 99 | run_workers("S3Dataset", url_list, batch_size, boto_obj_set) 100 | 101 | print ("\nINITIATING: READ FILES FROM PREFIX TEST") 102 | url_list = ["s3://" + bucket + "/" + files_prefix] 103 | run_workers("S3IterableDataset", url_list, batch_size, boto_obj_set) 104 | run_workers("S3Dataset", url_list, batch_size, boto_obj_set) 105 | 106 | def test_shuffleurls(): 107 | """ 108 | Args: 109 | bucket : name of the bucket 110 | files_prefix : prefix of the location where files stored 111 | 112 | Logic: 113 | Loop over dataloader twice, once with shuffle_urls as True and once as False 114 | After both runs, 115 | the dataloaded should be the same, the loading order should be different 116 | 117 | Maintains a dictionary each of sets and lists. 118 | The keys of the dictionary is the state of shuffle_urls(True/False) 119 | Values are the set/list of the samples 120 | 121 | Test passes if the set of samples loaded in both cases is same and the list of 122 | samples is diffrent(loading order different - data being shuffled) 123 | """ 124 | bucket = "pt-s3plugin-test-data-west2" 125 | files_prefix = "integration_tests/files" 126 | assert files_prefix[-1] != "/", "Enter Prefix without trailing \"/\" else error" 127 | 128 | prefix_list = get_file_list(bucket, files_prefix) 129 | url_list = ["s3://" + bucket + "/" + prefix for prefix in prefix_list] 130 | batch_size = 32 131 | shuffled_sets = defaultdict(set) 132 | shuffled_lists = defaultdict(list) 133 | 134 | print ("\nINITIATING SHUFFLE TEST") 135 | for shuffle_urls in [True, False]: 136 | dataset = S3IterableDataset(url_list, shuffle_urls=shuffle_urls) 137 | dataloader = DataLoader(dataset, 138 | batch_size=batch_size) 139 | 140 | for fname, fobj in dataloader: 141 | fname = [x.split("/")[-1] for x in fname] 142 | batch_set = set(map(tuple, zip(fname, fobj))) 143 | batch_list = list(map(tuple, zip(fname, fobj))) 144 | shuffled_sets[str(shuffle_urls)].update(batch_set) 145 | shuffled_lists[str(shuffle_urls)].append(batch_list) 146 | assert shuffled_sets['True'] == shuffled_sets['False'] and shuffled_lists['True'] != shuffled_lists['False'], \ 147 | "Shuffling not working correctly" 148 | print ("Shuffle test passed for S3IterableDataset") 149 | 150 | def test_ShuffleDataset(): 151 | """ 152 | Args: 153 | bucket: name of the bucket 154 | tarfiles_list: list of all tarfiles with the prefix 155 | buffer_size: number of files the ShuffleDataset object caches 156 | 157 | Logic: 158 | Loop over the ShuffleDataset Dataloader twice 159 | For the runs, the corresponding batches returned should not be the 
same 160 | - ensures that shuffling is happening within tarfile constituents 161 | After both the runs, the overall dataloaded should be the same 162 | 163 | If either of these conditions fails, then test fails 164 | """ 165 | bucket = "pt-s3plugin-test-data-west2" 166 | tarfiles_list = ["integration_tests/imagenet-train-000000.tar", 167 | "integration_tests/imagenet-train-000001.tar"] 168 | 169 | url_list = ["s3://" + bucket + "/" + tarfile for tarfile in tarfiles_list] 170 | batch_size = 32 171 | 172 | buffer_size = 300 173 | for num_workers in [0, 16]: 174 | for buffer_size in [30, 300, 3000]: 175 | dataset = ShuffleDataset(S3IterableDataset(url_list), buffer_size=buffer_size) 176 | dataloader = DataLoader(dataset, 177 | batch_size=batch_size, 178 | num_workers=num_workers) 179 | batch_list1 = get_batches(dataloader) 180 | batch_list2 = get_batches(dataloader) 181 | 182 | assert batches_shuffled(batch_list1, batch_list2), "ShuffleDataset Test fails: batches not shuffled" 183 | assert batches_congruent(batch_list1, batch_list2), "ShuffleDataset Test fails: data mismatch" 184 | print ("ShuffleDataset test passes for {} buffer_size & {} workers ".format( 185 | buffer_size, num_workers)) 186 | 187 | def get_batches(dataloader): 188 | """ 189 | Args: Pytorch Dataloader object 190 | 191 | returns a list of samples from the dataloader 192 | """ 193 | batch_list = [] 194 | count = 0 195 | for fname, fobj in dataloader: 196 | fname = [x.split("/")[-1] for x in fname] 197 | batch_list.append(list(zip(fname, fobj))) 198 | count += 1 199 | return batch_list 200 | 201 | def batches_shuffled(batch_list1, batch_list2): 202 | """ 203 | Ars: two lists of batches 204 | 205 | Returns True if the corresponding batches in lists are different 206 | Returns False otherwise 207 | """ 208 | for b1, b2 in zip(batch_list1, batch_list2): 209 | if b1 == b2: 210 | return False 211 | return True 212 | 213 | def batches_congruent(batch_list1, batch_list2): 214 | """ 215 | Args: two lists of batches 216 | 217 | Returns True if the samples in both the lists matches 218 | returns False otherwise 219 | """ 220 | batches1_flat = [sample for batch in batch_list1 for sample in batch] 221 | batches2_flat = [sample for batch in batch_list2 for sample in batch] 222 | return set(batches1_flat) == set(batches2_flat) 223 | -------------------------------------------------------------------------------- /tests/py-tests/test_read_datasets.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import os 16 | import io 17 | import pytest 18 | from awsio.python.lib.io.s3.s3dataset import S3Dataset, S3IterableDataset 19 | from awsio.python.lib.io.s3.s3dataset import list_files, file_exists 20 | import boto3 21 | 22 | 23 | def get_tar(s3_dataset_path): 24 | s3 = boto3.client('s3') 25 | s3.download_file( 26 | s3_dataset_path.split('/')[2], 27 | s3_dataset_path.split('/')[3], '/tmp/input_file.tar') 28 | import tarfile 29 | stream = tarfile.open('/tmp/input_file.tar') 30 | filenames_boto3 = [] 31 | for tarinfo in stream: 32 | fname = tarinfo.name 33 | stream.extractfile(tarinfo).read() 34 | filenames_boto3.append(fname) 35 | return filenames_boto3 36 | 37 | 38 | def test_tar_file_s3dataset(): 39 | s3_dataset_path = 's3://pt-s3plugin-test-data-west2/tinyimagenet.tar' 40 | dataset = S3Dataset(s3_dataset_path) 41 | fileobj = io.BytesIO(dataset[0][1]) 42 | import tarfile 43 | with tarfile.open(fileobj=fileobj, mode="r|*") as tar: 44 | result1 = len(tar.getmembers()) 45 | result2 = get_tar(s3_dataset_path) 46 | assert result1 == len(result2) 47 | 48 | 49 | def test_tar_file_s3iterabledataset(): 50 | s3_dataset_path = 's3://pt-s3plugin-test-data-west2/tinyimagenet.tar' 51 | dataset = S3IterableDataset(s3_dataset_path) 52 | list_of_files = [] 53 | for files in dataset: 54 | list_of_files.append(files[0][0]) 55 | result1 = len(list_of_files) 56 | result2 = get_tar(s3_dataset_path) 57 | assert result1 == len(result2) 58 | 59 | 60 | def get_zip(s3_dataset_path): 61 | s3 = boto3.client('s3') 62 | s3.download_file( 63 | s3_dataset_path.split('/')[2], 64 | s3_dataset_path.split('/')[3], '/tmp/input_file.zip') 65 | import zipfile 66 | filenames_boto3 = [] 67 | with zipfile.ZipFile('/tmp/input_file.zip', 'r') as zfile: 68 | for file_ in zfile.namelist(): 69 | zfile.read(file_) 70 | filenames_boto3.append(file_) 71 | return filenames_boto3 72 | 73 | 74 | def test_zip_file_s3dataset(): 75 | s3_dataset_path = 's3://pt-s3plugin-test-data-west2/tiny-imagenet-200.zip' 76 | dataset = S3Dataset(s3_dataset_path) 77 | fileobj = io.BytesIO(dataset[0][1]) 78 | import zipfile 79 | with zipfile.ZipFile(fileobj, 'r') as zfile: 80 | result1 = len(zfile.namelist()) 81 | result2 = get_zip(s3_dataset_path) 82 | assert result1 == len(result2) 83 | 84 | 85 | def test_zip_file_s3iterabledataset(): 86 | s3_dataset_path = 's3://pt-s3plugin-test-data-west2/tiny-imagenet-200.zip' 87 | dataset = S3IterableDataset(s3_dataset_path) 88 | list_of_files = [] 89 | for files in dataset: 90 | list_of_files.append(files[0][0]) 91 | result1 = len(list_of_files) 92 | result2 = get_zip(s3_dataset_path) 93 | assert result1 == len(result2) 94 | 95 | 96 | def test_csv_file_s3dataset(): 97 | os.environ['AWS_REGION'] = 'us-east-1' 98 | s3_dataset_path = 's3://pt-s3plugin-test-data-east1/genome-scores.csv' 99 | dataset = S3Dataset(s3_dataset_path) 100 | import pandas as pd 101 | result1 = pd.read_csv(io.BytesIO(dataset[0][1])) 102 | s3 = boto3.client('s3') 103 | obj = s3.get_object(Bucket=s3_dataset_path.split('/')[2], 104 | Key=s3_dataset_path.split('/')[3]) 105 | result2 = pd.read_csv(io.BytesIO(obj['Body'].read())) 106 | assert result1.equals(result2) 107 | del os.environ['AWS_REGION'] 108 | 109 | 110 | def test_csv_file_s3iterabledataset(): 111 | os.environ['AWS_REGION'] = 'us-east-1' 112 | s3_dataset_path = 's3://pt-s3plugin-test-data-east1/genome-scores.csv' 113 | dataset = S3IterableDataset(s3_dataset_path) 114 | import pandas as pd 115 | for files in dataset: 116 | result1 = pd.read_csv(io.BytesIO(files[1])) 117 | s3 = 
boto3.client('s3') 118 | obj = s3.get_object(Bucket=s3_dataset_path.split('/')[2], Key=s3_dataset_path.split('/')[3]) 119 | result2 = pd.read_csv(io.BytesIO(obj['Body'].read())) 120 | assert result1.equals(result2) 121 | del os.environ['AWS_REGION'] 122 | -------------------------------------------------------------------------------- /tests/py-tests/test_regions.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import numpy as np 16 | import os 17 | import io 18 | import pytest 19 | from awsio.python.lib.io.s3.s3dataset import S3Dataset 20 | from awsio.python.lib.io.s3.s3dataset import (list_files, file_exists, 21 | get_file_size) 22 | import boto3 23 | 24 | def test_regions(): 25 | os.environ['AWS_REGION'] = 'us-east-1' 26 | s3_dataset_path = 's3://pt-s3plugin-test-data-east1/images/n' 27 | bucket_name = 'pt-s3plugin-test-data-east1' 28 | prefix = 'images/n' 29 | result1 = list_files(s3_dataset_path) 30 | s3 = boto3.resource('s3') 31 | test_bucket = s3.Bucket(bucket_name) 32 | result2 = [] 33 | for url in test_bucket.objects.filter(Prefix=prefix): 34 | result2.append('s3://' + url.bucket_name + '/' + url.key) 35 | assert isinstance(result1, list) 36 | assert isinstance(result2, list) 37 | assert result1 == result2 38 | del os.environ['AWS_REGION'] 39 | 40 | 41 | def test_csv_file(): 42 | os.environ['AWS_REGION'] = 'us-east-1' 43 | s3_dataset_path = 's3://pt-s3plugin-test-data-east1/genome-scores.csv' 44 | dataset = S3Dataset(s3_dataset_path) 45 | import pandas as pd 46 | for files in dataset: 47 | result1 = pd.read_csv(io.BytesIO(files[1])) 48 | s3 = boto3.client('s3') 49 | obj = s3.get_object(Bucket=s3_dataset_path.split('/')[2], Key=s3_dataset_path.split('/')[3]) 50 | result2 = pd.read_csv(io.BytesIO(obj['Body'].read())) 51 | assert result1.equals(result2) 52 | del os.environ['AWS_REGION'] 53 | -------------------------------------------------------------------------------- /tests/py-tests/test_s3dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import os 16 | import pytest 17 | from awsio.python.lib.io.s3.s3dataset import S3Dataset 18 | import boto3 19 | 20 | 21 | def test_file_path(): 22 | """ 23 | Test S3Dataset for existing and nonexistent path 24 | """ 25 | # existing path 26 | s3_path = 's3://pt-s3plugin-test-data-west2/images/test' 27 | s3_dataset = S3Dataset(s3_path) 28 | assert s3_dataset 29 | 30 | # non-existent path 31 | s3_path_none = 's3://pt-s3plugin-test-data-west2/non_existent_path/test' 32 | with pytest.raises(AssertionError) as excinfo: 33 | s3_dataset = S3Dataset(s3_path_none) 34 | assert 'does not contain any objects' in str(excinfo.value) 35 | 36 | 37 | def test_urls_list(): 38 | """ 39 | Test whether urls_list input for S3Dataset works properly 40 | """ 41 | os.environ['AWS_REGION'] = 'us-west-2' 42 | # provide url prefix (path within bucket) 43 | prefix_to_directory = 'images/test' 44 | prefix_to_file = 'test_1.JPEG' 45 | prefix_list = [prefix_to_directory, prefix_to_file] 46 | 47 | # set up boto3 48 | s3 = boto3.resource('s3') 49 | bucket_name = 'pt-s3plugin-test-data-west2' 50 | test_bucket = s3.Bucket(bucket_name) 51 | 52 | # try individual valid urls and collect urls_list and all_boto3_files to test url list input 53 | urls_list = list() 54 | all_boto3_files = list() 55 | for prefix in prefix_list: 56 | # collect list of all file names using S3Dataset 57 | url = os.path.join('s3://', bucket_name, prefix) 58 | urls_list.append(url) 59 | s3_dataset = S3Dataset(url) 60 | s3_files = [item[0] for item in s3_dataset] 61 | 62 | # collect list of all file names using boto3 63 | boto3_files = [os.path.join('s3://', url.bucket_name, url.key) \ 64 | for url in test_bucket.objects.filter(Prefix=prefix)] 65 | all_boto3_files.extend(boto3_files) 66 | 67 | assert s3_files == boto3_files 68 | 69 | # test list of two valid urls as input 70 | s3_dataset = S3Dataset(urls_list) 71 | s3_files = [item[0] for item in s3_dataset] 72 | 73 | assert s3_files == all_boto3_files 74 | 75 | # add a non-existent url to list of urls 76 | url_to_non_existent = 's3://pt-s3plugin-test-data-west2/non_existent_directory' 77 | urls_list.append(url_to_non_existent) 78 | with pytest.raises(AssertionError) as excinfo: 79 | s3_dataset = S3Dataset(urls_list) 80 | assert 'does not contain any objects' in str(excinfo.value) 81 | 82 | del os.environ['AWS_REGION'] 83 | 84 | 85 | def test_multi_download(): 86 | """ 87 | Test whether S3Dataset with multiple downloads in one url works properly 88 | """ 89 | s3_dataset_path = 's3://pt-s3plugin-test-data-west2/images/test' 90 | bucket_name = 'pt-s3plugin-test-data-west2' 91 | prefix = 'images/test' 92 | 93 | if 'S3_DISABLE_MULTI_PART_DOWNLOAD' in os.environ: 94 | del os.environ['S3_DISABLE_MULTI_PART_DOWNLOAD'] 95 | 96 | dataset = S3Dataset(s3_dataset_path) 97 | # collect filename from each item in dataset 98 | result1 = [item[0] for item in dataset] 99 | s3 = boto3.resource('s3') 100 | test_bucket = s3.Bucket(bucket_name) 101 | result2 = [] 102 | for url in test_bucket.objects.filter(Prefix=prefix): 103 | result2.append('s3://' + url.bucket_name + '/' + url.key) 104 | assert isinstance(result1, list) 105 | assert isinstance(result2, list) 106 | assert result1 == result2 107 | 108 | 109 | def test_disable_multi_download(): 110 | s3_dataset_path = 's3://pt-s3plugin-test-data-west2/images/test' 111 | os.environ['S3_DISABLE_MULTI_PART_DOWNLOAD'] = "ON" 112 | dataset = S3Dataset(s3_dataset_path) 113 | result1 = [item[0] for item in dataset] 114 | 115 | # boto3 116 | bucket_name = 
'pt-s3plugin-test-data-west2' 117 | prefix = 'images/test' 118 | s3 = boto3.resource('s3') 119 | test_bucket = s3.Bucket(bucket_name) 120 | result2 = ['s3://' + url.bucket_name + '/' + url.key \ 121 | for url in test_bucket.objects.filter(Prefix=prefix)] 122 | 123 | assert isinstance(result1, list) 124 | assert isinstance(result2, list) 125 | assert result1 == result2 126 | del os.environ['S3_DISABLE_MULTI_PART_DOWNLOAD'] 127 | -------------------------------------------------------------------------------- /tests/py-tests/test_s3iterabledataset.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import io 17 | import pytest 18 | from awsio.python.lib.io.s3.s3dataset import S3IterableDataset, ShuffleDataset 19 | import boto3 20 | 21 | 22 | def test_file_path(): 23 | """ 24 | Test S3IterableDataset for existing and nonexistent path 25 | """ 26 | # existing path 27 | s3_path = 's3://pt-s3plugin-test-data-west2/images/test' 28 | s3_dataset = S3IterableDataset(s3_path) 29 | assert s3_dataset 30 | 31 | # non-existent path 32 | s3_path_none = 's3://pt-s3plugin-test-data-west2/non_existent_path/test' 33 | with pytest.raises(AssertionError) as excinfo: 34 | s3_dataset = S3IterableDataset(s3_path_none) 35 | assert 'does not contain any objects' in str(excinfo.value) 36 | 37 | 38 | def test_urls_list(): 39 | """ 40 | Test whether urls_list input for S3IterableDataset works properly 41 | """ 42 | os.environ['AWS_REGION'] = 'us-west-2' 43 | # provide url prefix (path within bucket) 44 | prefix_to_directory = 'images/test' 45 | prefix_to_file = 'test_1.JPEG' 46 | prefix_list = [prefix_to_directory, prefix_to_file] 47 | 48 | # set up boto3 49 | s3 = boto3.resource('s3') 50 | bucket_name = 'pt-s3plugin-test-data-west2' 51 | test_bucket = s3.Bucket(bucket_name) 52 | 53 | # try individual valid urls and collect urls_list and all_boto3_files to test url list input 54 | urls_list = list() 55 | all_boto3_files = list() 56 | for prefix in prefix_list: 57 | # collect list of all file names using S3IterableDataset 58 | url = os.path.join('s3://', bucket_name, prefix) 59 | urls_list.append(url) 60 | s3_dataset = S3IterableDataset(url) 61 | s3_files = [item[0] for item in s3_dataset] 62 | 63 | # collect list of all file names using boto3 64 | boto3_files = [os.path.join('s3://', url.bucket_name, url.key) \ 65 | for url in test_bucket.objects.filter(Prefix=prefix)] 66 | all_boto3_files.extend(boto3_files) 67 | 68 | assert s3_files == boto3_files 69 | 70 | # test list of two valid urls as input 71 | s3_dataset = S3IterableDataset(urls_list) 72 | s3_files = [item[0] for item in s3_dataset] 73 | 74 | assert s3_files == all_boto3_files 75 | 76 | # add a non-existent url to list of urls 77 | url_to_non_existent = 's3://pt-s3plugin-test-data-west2/non_existent_directory' 78 | 
urls_list.append(url_to_non_existent) 79 | with pytest.raises(AssertionError) as excinfo: 80 | s3_dataset = S3IterableDataset(urls_list) 81 | assert 'does not contain any objects' in str(excinfo.value) 82 | 83 | del os.environ['AWS_REGION'] 84 | 85 | 86 | def test_shuffle_true(): 87 | """ 88 | Tests shuffle_urls parameter, len and set_epoch functions 89 | """ 90 | os.environ['AWS_REGION'] = 'us-west-2' 91 | 92 | # create two datasets, one shuffled with self.epoch 93 | s3_dataset_path = 's3://pt-s3plugin-test-data-west2/images/test' 94 | s3_dataset0 = S3IterableDataset(s3_dataset_path) 95 | s3_dataset1 = S3IterableDataset(s3_dataset_path, shuffle_urls=True) 96 | s3_dataset1.set_epoch(5) 97 | 98 | # len is defined as the length of the urls_list created by the path 99 | assert len(s3_dataset0) == len(s3_dataset1) 100 | 101 | # check to make sure shuffling works 102 | filenames0 = [item[0] for item in s3_dataset0] 103 | filenames1 = [item[0] for item in s3_dataset1] 104 | 105 | assert len(filenames0) == len(filenames1) 106 | assert filenames0 != filenames1 107 | del os.environ['AWS_REGION'] 108 | 109 | 110 | def test_multi_download(): 111 | s3_dataset_path = 's3://pt-s3plugin-test-data-east1/genome-scores.csv' 112 | 113 | if 'S3_DISABLE_MULTI_PART_DOWNLOAD' in os.environ: 114 | del os.environ['S3_DISABLE_MULTI_PART_DOWNLOAD'] 115 | os.environ['AWS_REGION'] = 'us-east-1' 116 | 117 | dataset = S3IterableDataset(s3_dataset_path) 118 | import pandas as pd 119 | for files in dataset: 120 | result1 = pd.read_csv(io.BytesIO(files[1])) 121 | s3 = boto3.client('s3') 122 | obj = s3.get_object(Bucket=s3_dataset_path.split('/')[2], 123 | Key=s3_dataset_path.split('/')[3]) 124 | result2 = pd.read_csv(io.BytesIO(obj['Body'].read())) 125 | assert result1.equals(result2) 126 | 127 | 128 | def test_disable_multi_download(): 129 | s3_dataset_path = 's3://pt-s3plugin-test-data-east1/genome-scores.csv' 130 | os.environ['S3_DISABLE_MULTI_PART_DOWNLOAD'] = "ON" 131 | os.environ['AWS_REGION'] = 'us-east-1' 132 | dataset = S3IterableDataset(s3_dataset_path) 133 | import pandas as pd 134 | for files in dataset: 135 | result1 = pd.read_csv(io.BytesIO(files[1])) 136 | s3 = boto3.client('s3') 137 | obj = s3.get_object(Bucket=s3_dataset_path.split('/')[2], 138 | Key=s3_dataset_path.split('/')[3]) 139 | result2 = pd.read_csv(io.BytesIO(obj['Body'].read())) 140 | assert result1.equals(result2) 141 | del os.environ['S3_DISABLE_MULTI_PART_DOWNLOAD'], os.environ['AWS_REGION'] 142 | 143 | 144 | def test_shuffle_dataset(): 145 | 146 | dataset = [i for i in range(10)] 147 | 148 | # buffer_size 1 should yield the dataset without shuffling 149 | shuffle_dataset = ShuffleDataset(dataset=dataset, buffer_size=1) 150 | shuffle_content = [item for item in shuffle_dataset] 151 | assert dataset == shuffle_content 152 | 153 | # buffer_size smaller than dataset size 154 | shuffle_dataset = ShuffleDataset(dataset=dataset, buffer_size=2) 155 | shuffle_content = [item for item in shuffle_dataset] 156 | assert set(dataset) == set(shuffle_content) 157 | assert len(dataset) == len(shuffle_content) 158 | 159 | # buffer_size greater than dataset size 160 | shuffle_dataset = ShuffleDataset(dataset=dataset, buffer_size=15) 161 | shuffle_content = [item for item in shuffle_dataset] 162 | assert set(dataset) == set(shuffle_content) 163 | assert len(dataset) == len(shuffle_content) 164 | -------------------------------------------------------------------------------- /tests/py-tests/test_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). 4 | # You may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import os 16 | import pytest 17 | from awsio.python.lib.io.s3.s3dataset import (list_files, file_exists, 18 | get_file_size) 19 | import boto3 20 | 21 | 22 | def test_wrong_filenames(): 23 | filenames = ['', 'shor', 'not_start_s3', 's3://', 's3:///no_bucket'] 24 | functions = [list_files, file_exists, get_file_size] 25 | exception = False 26 | for function in functions: 27 | for filename in filenames: 28 | try: 29 | function(filename) 30 | except ValueError: 31 | exception = True 32 | assert exception 33 | exception = False 34 | 35 | 36 | def test_list_files_prefix(): 37 | # default region is us-west-2 38 | s3_dataset_path = 's3://pt-s3plugin-test-data-west2/images/test' 39 | result1 = list_files(s3_dataset_path) 40 | s3 = boto3.resource('s3') 41 | test_bucket = s3.Bucket('pt-s3plugin-test-data-west2') 42 | result2 = [] 43 | for url in test_bucket.objects.filter(Prefix='images/test'): 44 | result2.append('s3://' + url.bucket_name + '/' + url.key) 45 | assert isinstance(result1, list) 46 | assert isinstance(result2, list) 47 | assert len(result1) == len(result2) 48 | assert result1 == result2 49 | 50 | 51 | def test_list_files_bucket(): 52 | os.environ['AWS_REGION'] = 'us-west-2' 53 | # default region is us-west-2 54 | s3_dataset_path = 's3://pt-s3plugin-test-data-west2' 55 | result1 = list_files(s3_dataset_path) 56 | s3 = boto3.resource('s3') 57 | test_bucket = s3.Bucket('pt-s3plugin-test-data-west2') 58 | result2 = [] 59 | for url in test_bucket.objects.all(): 60 | if url.key[-1] == '/': 61 | continue 62 | result2.append('s3://' + url.bucket_name + '/' + url.key) 63 | assert isinstance(result1, list) 64 | assert isinstance(result2, list) 65 | assert result1 == result2 66 | del os.environ['AWS_REGION'] 67 | 68 | 69 | def test_file_exists(): 70 | """ 71 | There are four kinds of inputs for file_exists: 72 | 1. object_name refers to a file (True) 73 | 2. object_name refers to a folder (False) 74 | 3. bucket_name does not refer to an existing bucket (False) 75 | 4. 
object_name does not refer to an existing object (False) 76 | """ 77 | s3_bucket = 's3://pt-s3plugin-test-data-west2' 78 | 79 | # case 1 80 | assert file_exists(os.path.join(s3_bucket, 'test_0.JPEG')) 81 | 82 | # case 2 83 | assert not file_exists(os.path.join(s3_bucket, 'folder_1')) 84 | 85 | # case 3 86 | assert not file_exists(os.path.join(s3_bucket, 'non_existent_folder')) 87 | 88 | # case 4 89 | assert not file_exists(os.path.join(s3_bucket, 'test_new_file.JPEG')) 90 | 91 | 92 | def test_get_file_size(): 93 | bucket_name = 'pt-s3plugin-test-data-west2' 94 | object_name = 'test_0.JPEG' 95 | 96 | result1 = get_file_size('s3://' + bucket_name + '/' + object_name) 97 | 98 | s3 = boto3.resource('s3') 99 | bucket = s3.Bucket(bucket_name) 100 | result2 = bucket.Object(object_name).content_length 101 | 102 | assert result1 == result2 103 | -------------------------------------------------------------------------------- /tests/smoke_tests/import_awsio.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | echo "Testing: import awsio" 4 | python -c "import awsio; print(awsio.__version__)" 5 | echo "import awsio succeeded" 6 | 7 | read -p "S3 URL : " s3_url 8 | echo "Testing: checking setup by querying whether or not ${s3_url} is an existing file" 9 | python -c "from awsio.python.lib.io.s3.s3dataset import file_exists; print(f\"file_exists: {file_exists('$s3_url')}\")" 10 | echo "Smoke test was successful." 11 | 12 | -------------------------------------------------------------------------------- /third_party/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.1) 2 | 3 | if(POLICY CMP0054) 4 | cmake_policy(SET CMP0054 NEW) 5 | endif() 6 | 7 | project(AWSIO_DEPS) 8 | 9 | list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") 10 | 11 | include(ExternalProject) 12 | 13 | set(AWS_DEPS_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}" CACHE PATH "Dependencies install directory.") 14 | set(AWS_DEPS_BUILD_DIR "${CMAKE_BINARY_DIR}/build" CACHE PATH "Dependencies build directory.") 15 | set(AWS_DEPS_DOWNLOAD_DIR "${AWS_DEPS_BUILD_DIR}/downloads" CACHE PATH "Dependencies download directory.") 16 | 17 | set(AWS_C_COMMON_URL "https://github.com/awslabs/aws-c-common.git") 18 | set(AWS_C_COMMON_TAG "v0.4.15") 19 | include(BuildAwsSDK) 20 | 21 | add_dependencies(AwsSDK) -------------------------------------------------------------------------------- /third_party/cmake/AwsSDK.cmake: -------------------------------------------------------------------------------- 1 | find_package(AWSSDK REQUIRED COMPONENTS transfer s3-encryption dynamodb) 2 | target_link_libraries(target ${AWSSDK_LINK_LIBRARIES}) 3 | 4 | -------------------------------------------------------------------------------- /tools/get_version.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | from pathlib import Path 4 | 5 | project_root = Path(__file__).resolve().parent.parent 6 | 7 | def get_sha(): 8 | try: 9 | return subprocess.check_output(['git', 'rev-parse', 'HEAD'], cwd=project_root).decode('ascii').strip() 10 | except Exception: 11 | return 'Unknown' 12 | 13 | def get_version(): 14 | sha = get_sha() 15 | version = (project_root / 'version.txt').read_text().strip() 16 | if sha != 'Unknown': 17 | version += '+' + sha[:7] 18 | return version 19 | 20 | -------------------------------------------------------------------------------- /version.txt: 
-------------------------------------------------------------------------------- 1 | 0.0.1 --------------------------------------------------------------------------------