├── .gitignore ├── .pypirc ├── LICENSE ├── MANIFEST ├── README.md ├── requirements.txt ├── s3concurrent ├── __init__.py ├── bin │ ├── s3concurrent_download │ └── s3concurrent_upload ├── s3concurrent.py └── tests │ ├── data │ └── .gitkeep │ └── test_s3concurrent.py ├── setup.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.mo 2 | *.egg-info 3 | *.egg 4 | *.EGG 5 | *.EGG-INFO 6 | build 7 | develop-eggs 8 | downloads 9 | eggs 10 | fake-eggs 11 | parts 12 | dist 13 | .installed.cfg 14 | .mr.developer.cfg 15 | .hg 16 | .bzr 17 | .svn 18 | *.pyc 19 | *.pyo 20 | *.tmp* 21 | /venv/* 22 | -------------------------------------------------------------------------------- /.pypirc: -------------------------------------------------------------------------------- 1 | [distutils] 2 | index-servers = 3 | pypi 4 | pypitest 5 | 6 | [pypi] 7 | repository: https://pypi.python.org/pypi 8 | username: {{your_username}} 9 | password: {{your_password}} 10 | 11 | [pypitest] 12 | repository: https://testpypi.python.org/pypi 13 | username: {{your_username}} 14 | password: {{your_password}} -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Quid, Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST: -------------------------------------------------------------------------------- 1 | # file GENERATED by distutils, do NOT edit 2 | README.txt 3 | setup.py 4 | s3concurrent/__init__.py 5 | s3concurrent/s3concurrent.py 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | s3concurrent uploads/downloads files to/from S3. 4 | 5 | Features include: 6 | 7 | * Handles deep folder structures with many files. 8 | * Uploads/downloads many files concurrently. 9 | * Maintains folder structure between a S3 bucket and local file system. 10 | * Only uploads/downloads a file when a file has changed between S3 bucket and 11 | local file system. 
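Change detection (the last feature above) works by comparing the local file's MD5 checksum against the key's S3 ETag before transferring anything. A minimal sketch of the idea, assuming a boto `Key` object; the helper names below are illustrative only and are not part of the package:

```python
import hashlib

def local_md5(path, blocksize=65536):
    # Hash the file in chunks so large files never have to fit in memory.
    hasher = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(blocksize), b''):
            hasher.update(chunk)
    return hasher.hexdigest()

def needs_sync(key, local_path):
    # S3 wraps the ETag in double quotes. Multipart uploads get an
    # "md5-of-part-md5s-<part count>" ETag instead of a plain MD5; s3concurrent
    # handles that case separately (see _calculate_s3_etag in s3concurrent.py).
    return key.etag.strip('"') != local_md5(local_path)
```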
12 | 
13 | # Installation
14 | 
15 | ```
16 | git clone https://github.com/quid/s3concurrent.git
17 | pip install s3concurrent/
18 | ```
19 | 
20 | # Usage
21 | 
22 | ## s3concurrent_download
23 | 
24 |     usage: s3concurrent_download [-h] [--prefix PREFIX]
25 |                                  [--local_folder LOCAL_FOLDER]
26 |                                  [--thread_count THREAD_COUNT]
27 |                                  [--max_retry MAX_RETRY]
28 |                                  s3_key s3_secret bucket_name
29 | 
30 |     positional arguments:
31 |       s3_key                Your S3 API Key
32 |       s3_secret             Your S3 secret key
33 |       bucket_name           Your S3 bucket name
34 | 
35 |     optional arguments:
36 |       -h, --help            show this help message and exit
37 |       --prefix PREFIX       Path to a folder in the S3 bucket (e.g. my/dest/folder/)
38 |       --local_folder LOCAL_FOLDER
39 |                             Path to a local filesystem folder (e.g. /my/src/folder)
40 |       --thread_count THREAD_COUNT
41 |                             Number of concurrent files to upload/download
42 |       --max_retry MAX_RETRY
43 |                             Max retries for uploading/downloading a file
44 | 
45 | ## s3concurrent_upload
46 | 
47 |     usage: s3concurrent_upload [-h] [--prefix PREFIX]
48 |                                [--local_folder LOCAL_FOLDER]
49 |                                [--thread_count THREAD_COUNT]
50 |                                [--max_retry MAX_RETRY]
51 |                                s3_key s3_secret bucket_name
52 | 
53 |     positional arguments:
54 |       s3_key                Your S3 API Key
55 |       s3_secret             Your S3 secret key
56 |       bucket_name           Your S3 bucket name
57 | 
58 |     optional arguments:
59 |       -h, --help            show this help message and exit
60 |       --prefix PREFIX       Path to a folder in the S3 bucket (e.g. my/dest/folder/)
61 |       --local_folder LOCAL_FOLDER
62 |                             Path to a local filesystem folder (e.g. /my/src/folder)
63 |       --thread_count THREAD_COUNT
64 |                             Number of concurrent files to upload/download
65 |       --max_retry MAX_RETRY
66 |                             Max retries for uploading/downloading a file
67 | 
68 | 
69 | # Examples
70 | 
71 | Download files from the folder 'mirror/pypi' in an S3 bucket to a local folder
72 | '/path/to/mirror/pypi' with 20 concurrent downloads.
73 | 
74 | 
75 | ```
76 | s3concurrent_download <s3_key> <s3_secret> <bucket_name> --local_folder /path/to/mirror/pypi --prefix mirror/pypi --thread_count 20
77 | ```
78 | 
79 | Upload files from the folder '/tmp/benchmark' to a 'benchmark' folder on S3 with
80 | 10 concurrent uploads and 3 retries per upload.
81 | 
82 | ```
83 | s3concurrent_upload <s3_key> <s3_secret> <bucket_name> --local_folder /tmp/benchmark --prefix benchmark --thread_count 10 --max_retry 3
84 | ```
85 | 
86 | # Running the tests
87 | 
88 | To run the s3concurrent tests, use the following command from the repository's root directory after cloning it.
89 | 
90 | ```
91 | python -m unittest discover s3concurrent/tests
92 | ```
93 | 
94 | You should see all 14 tests passing at the end of the console output.
95 | 96 | ---------------------------------------------------------------------- 97 | Ran 14 tests in 0.222s 98 | 99 | OK 100 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | argparse>=1.3.0 2 | boto>=2.32.1 3 | colorlog==2.6.1 4 | mock==1.0.1 5 | nose==1.3.4 6 | -------------------------------------------------------------------------------- /s3concurrent/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quid/s3concurrent/b9c2442bed24e5cd3c536d012d0db39e341323c2/s3concurrent/__init__.py -------------------------------------------------------------------------------- /s3concurrent/bin/s3concurrent_download: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | 6 | # Tell the system to load the s3concurrent package 7 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), '..'))) 8 | 9 | import s3concurrent 10 | 11 | if __name__ == '__main__': 12 | sys.exit(s3concurrent.main('download', sys.argv[1:])) 13 | -------------------------------------------------------------------------------- /s3concurrent/bin/s3concurrent_upload: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | 6 | # Tell the system to load the s3concurrent package 7 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), '..'))) 8 | 9 | import s3concurrent 10 | 11 | if __name__ == '__main__': 12 | sys.exit(s3concurrent.main('upload', sys.argv[1:])) 13 | -------------------------------------------------------------------------------- /s3concurrent/s3concurrent.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import binascii 5 | import colorlog 6 | import hashlib 7 | import logging 8 | import os 9 | import sys 10 | import threading 11 | import time 12 | 13 | from boto.s3.bucket import Bucket 14 | from boto.s3.connection import S3Connection 15 | from boto.s3.key import Key 16 | from Queue import Queue 17 | 18 | # AWS magic chunk size number. Discovered via brute force. 19 | AWS_UPLOAD_PART_SIZE = 64 * 1024 * 1024 20 | 21 | # Max number of items allowed in the queue to keep from blowing up memory 22 | MAX_QUEUE_SIZE = 100000 23 | 24 | # Configure logging 25 | logger = logging.getLogger() 26 | logger.setLevel(logging.DEBUG) 27 | 28 | ch = logging.StreamHandler(sys.stdout) 29 | ch.setLevel(logging.INFO) 30 | formatter = colorlog.ColoredFormatter('%(log_color)s%(levelname)s: %(message)s') 31 | ch.setFormatter(formatter) 32 | logger.addHandler(ch) 33 | 34 | 35 | class ProcessKeyQueue: 36 | ''' 37 | ProcessKeyQueue implements the queuing functions needed for s3concurrent upload/download. 38 | ''' 39 | 40 | def __init__(self): 41 | self.process_able_keys_queue = Queue() 42 | self.enqueued_counter = 0 43 | self.de_queue_counter = 0 44 | self.all_processed = False 45 | self.queuing = False 46 | 47 | def enqueue_item(self, key, local_file_path, enqueue_count=1): 48 | ''' 49 | Enqueues an item to upload/download. 
50 | 
51 |         :param key: s3 key to upload/download
52 |         :param local_file_path: local file path corresponding to the s3 key
53 |         :param enqueue_count: number of times this key has been enqueued
54 |         '''
55 |         self.process_able_keys_queue.put((key, local_file_path, enqueue_count))
56 |         self.enqueued_counter += 1
57 | 
58 |     def is_empty(self):
59 |         '''
60 |         Checks if the queue is empty.
61 | 
62 |         :return: (bool) true if the queue is empty
63 |         '''
64 |         return self.process_able_keys_queue.empty()
65 | 
66 |     def de_queue_an_item(self):
67 |         '''
68 |         De-queues an item from the queue.
69 | 
70 |         :return: an item previously enqueued
71 |         '''
72 |         value = None
73 | 
74 |         if not self.is_empty():
75 |             value = self.process_able_keys_queue.get()
76 |             self.de_queue_counter += 1
77 | 
78 |         return value
79 | 
80 |     def is_queuing(self):
81 |         '''
82 |         Checks if queuing all the process-able keys from S3.
83 | 
84 |         :return: True if still queuing
85 |         '''
86 |         return self.queuing
87 | 
88 |     def queuing_stopped(self):
89 |         '''
90 |         Stops the queuing from S3.
91 |         '''
92 |         self.queuing = False
93 | 
94 |     def queuing_started(self):
95 |         '''
96 |         Starts the queuing from S3.
97 |         '''
98 |         self.queuing = True
99 | 
100 | 
101 | def enqueue_s3_keys_for_download(s3_bucket, prefix, destination_folder, queue):
102 |     '''
103 |     En-queues S3 Keys to be downloaded.
104 | 
105 |     :param s3_bucket: Boto Bucket object that contains the keys to be downloaded
106 |     :param prefix: The path to the S3 folder to be downloaded. Example: bucket_root/folder_1
107 |     :param destination_folder: The relative or absolute path to the folder you wish to download to
108 |     :param queue: A ProcessKeyQueue instance to enqueue all the keys in
109 |     '''
110 |     bucket_list = s3_bucket.list(prefix=prefix)
111 | 
112 |     for key in bucket_list:
113 |         # prepare local destination structure
114 |         destination = destination_folder + (key.name.replace(prefix, '', 1) if prefix else '/' + key.name)
115 |         try:
116 |             containing_dir = os.path.dirname(destination)
117 |             if not os.path.exists(containing_dir):
118 |                 os.makedirs(containing_dir)
119 | 
120 |             # Don't queue more items while over 100,000 to prevent memory explosion
121 |             while MAX_QUEUE_SIZE < queue.process_able_keys_queue.qsize():
122 |                 time.sleep(1)
123 | 
124 |             # enqueue
125 |             queue.enqueue_item(key, destination)
126 | 
127 |         except:
128 |             logger.exception('Cannot enqueue key: {0}'.format(key.name))
129 | 
130 |     logger.info('Initial queuing has completed. {0} keys have been enqueued.'.format(queue.enqueued_counter))
131 |     queue.queuing_stopped()
132 | 
133 | 
134 | def enqueue_s3_keys_for_upload(s3_bucket, prefix, from_folder, queue):
135 |     '''
136 |     En-queues S3 Keys to be uploaded.
137 | 
138 |     :param s3_bucket: Boto Bucket object that the keys will be uploaded to
139 |     :param prefix: The path to the S3 folder to upload to. Example: bucket_root/folder_1
140 |     :param from_folder: The relative or absolute path to the folder you wish to upload from
141 |     :param queue: A ProcessKeyQueue instance to enqueue all the keys in
142 |     '''
143 |     abs_from_folder_path = os.path.abspath(from_folder)
144 | 
145 |     for root, dirs, files in os.walk(abs_from_folder_path):
146 |         for single_file in files:
147 |             abs_file_path = os.path.join(root, single_file)
148 | 
149 |             s3_key_name = abs_file_path.replace(abs_from_folder_path, '', 1)
150 |             if not s3_key_name.startswith('/') and prefix != '':
151 |                 s3_key_name = '/' + s3_key_name
152 |             s3_key_name = (prefix or '') + s3_key_name
153 | 
154 |             key = Key(s3_bucket)
155 |             key.key = s3_key_name
156 | 
157 |             # Don't queue more items to prevent memory explosion
158 |             while MAX_QUEUE_SIZE < queue.process_able_keys_queue.qsize():
159 |                 time.sleep(1)
160 | 
161 |             queue.enqueue_item(key, abs_file_path)
162 | 
163 |     logger.info('Initial queuing has completed. {0} keys have been enqueued.'.format(queue.enqueued_counter))
164 |     queue.queuing_stopped()
165 | 
166 | 
167 | def is_sync_needed(key, local_file_path):
168 |     '''
169 |     Checks whether the S3 key needs to be synced by comparing the local file's checksum against the key's etag.
170 | 
171 |     :param key: The S3 key object.
172 |     :param local_file_path: (str), the local file path corresponding to the key
173 |     '''
174 |     sync_needed = True
175 |     if os.path.isfile(local_file_path) and key.exists():
176 |         try:
177 |             key_etag = key.etag
178 |             if not key_etag:
179 |                 key_etag = key.bucket.lookup(key.name).etag
180 | 
181 |             # S3 wraps etags in double quotes; strip them before comparing
182 |             if _s3_etag_match(key_etag.strip('"'), local_file_path):
183 |                 sync_needed = False
184 | 
185 |         except:
186 |             logger.exception(sys.exc_info())
187 |             logger.error(
188 |                 'Cannot compare local file {0} against remote file {1}. s3concurrent will process it anyway.'
189 |                 .format(local_file_path, key.name))
190 | 
191 |     return sync_needed
192 | 
193 | 
194 | def _s3_etag_match(etag, file_path):
195 |     '''
196 |     Checks if the local file's checksum matches the S3 etag.
197 | 
198 |     :param etag: (str), the S3 etag.
199 |     :param file_path: (str), the local file to check.
200 |     :return: (bool), whether or not the etag matches the checksum of the local file.
201 |     '''
202 |     matches = False
203 | 
204 |     if '-' in etag:
205 |         # If the etag contains a dash, then the file was uploaded in parts
206 |         matches = _calculate_s3_etag(file_path, AWS_UPLOAD_PART_SIZE) == etag
207 | 
208 |     else:
209 |         # Etag will be an MD5 checksum when the file was uploaded as a whole
210 |         matches = _get_md5(file_path) == etag
211 | 
212 |     return matches
213 | 
214 | 
215 | def _calculate_s3_etag(file_path, part_size):
216 |     '''
217 |     Calculates the S3 etag of a file when the upload was performed in parts.
218 | 
219 |     :param file_path: (str), the local file to calculate the etag for.
220 |     :param part_size: (int), the size of the chunks that were used to upload the file to S3.
221 |     :return: (str), the calculated S3 etag of the local file.
222 |     '''
223 |     block_count = 0
224 |     md5string = ''
225 |     with open(file_path, 'rb') as open_file:
226 |         buf = open_file.read(part_size)
227 |         while len(buf) > 0:
228 |             hasher = hashlib.md5()
229 |             hasher.update(buf)
230 |             md5string += binascii.unhexlify(hasher.hexdigest())
231 |             block_count += 1
232 | 
233 |             buf = open_file.read(part_size)
234 | 
235 |     hasher = hashlib.md5()
236 |     hasher.update(md5string)
237 |     return hasher.hexdigest() + '-' + str(block_count)
238 | 
239 | 
240 | def _get_md5(filename, blocksize=65536):
241 |     '''
242 |     Retrieves the MD5 checksum for the given filename.
243 | 
244 |     :param filename: (str), the file path to obtain the checksum of
245 |     :param blocksize: (int), the largest chunk of the file to read into memory at once
246 |     :return: the MD5 checksum
247 |     '''
248 |     hasher = hashlib.md5()
249 |     with open(filename, 'rb') as open_file:
250 |         buf = open_file.read(blocksize)
251 |         while len(buf) > 0:
252 |             hasher.update(buf)
253 |             buf = open_file.read(blocksize)
254 | 
255 |     return hasher.hexdigest()
256 | 
257 | 
258 | def process_a_key(queue, action, max_retry):
259 |     '''
260 |     Processes (downloads or uploads) an S3 key from/to its respective local path.
261 | 
262 |     :param queue: A ProcessKeyQueue instance to de-queue a key from
263 |     :param action: download or upload
264 |     :param max_retry: The max times for s3concurrent to retry uploading/downloading a key
265 |     '''
266 |     if not queue.is_empty():
267 |         key, local_path, enqueue_count = queue.de_queue_an_item()
268 | 
269 |         try:
270 | 
271 |             if is_sync_needed(key, local_path) and enqueue_count <= max_retry:
272 | 
273 |                 # back off according to enqueue_count
274 |                 if enqueue_count > 1:
275 |                     wait_time = enqueue_count ** 2
276 |                     logger.info('Attempt no.{0} to {1} {2}. Wait {3} secs.'.format(enqueue_count, action, key.name, wait_time))
277 |                     time.sleep(wait_time)
278 | 
279 |                 # conduct upload/download
280 |                 if action == 'download':
281 |                     key.get_contents_to_filename(local_path)
282 |                 else:
283 |                     key.set_contents_from_filename(local_path)
284 | 
285 |             elif enqueue_count > max_retry:
286 |                 logger.error('Ignoring {0} since s3concurrent has already tried to {1} it {2} times.'.format(key.name, action, max_retry))
287 | 
288 |         except:
289 |             if key.size == 0:
290 |                 logger.info('%s is a directory, ignoring', key.name)
291 | 
292 |             else:
293 |                 logger.warn('Error {0}ing file with key: {1}, putting it back in the queue'.format(action, key.name))
294 |                 queue.enqueue_item(key, local_path, enqueue_count=enqueue_count + 1)
295 | 
296 |     else:
297 |         # do nothing when the queue is empty
298 |         pass
299 | 
300 | 
301 | def consume_queue(queue, action, thread_pool_size, max_retry):
302 |     '''
303 |     Consumes the queue with the designated thread pool size by uploading/downloading the keys to
304 |     their respective destinations.
305 | 
306 |     :param queue: A ProcessKeyQueue instance to consume all the keys from
307 |     :param action: "download" or "upload"
308 |     :param thread_pool_size: The designated thread pool size (how many files to process concurrently)
309 |     :param max_retry: The max times for s3concurrent to retry uploading/downloading a key
310 |     '''
311 |     thread_pool = []
312 | 
313 |     while queue.is_queuing() or not queue.is_empty() or len(thread_pool) != 0:
314 |         # de-pool finished threads (iterate over a copy so removal is safe)
315 |         for t in list(thread_pool):
316 |             if not t.is_alive():
317 |                 thread_pool.remove(t)
318 | 
319 |         # en-pool new threads
320 |         if not queue.is_empty() and len(thread_pool) < thread_pool_size:
321 |             t = threading.Thread(target=process_a_key, args=[queue, action, max_retry])
322 |             t.start()
323 |             thread_pool.append(t)
324 | 
325 |     queue.all_processed = True
326 | 
327 | 
328 | def process_all(action, s3_key, s3_secret, bucket_name, prefix, local_folder, queue, thread_count, max_retry):
329 |     '''
330 |     Orchestrates the en-queuing and consuming threads in conducting:
331 |         1. Local folder structure construction
332 |         2. S3 key en-queuing
333 |         3. S3 key uploading/downloading when the file has changed
334 | 
335 |     :param action: download or upload
336 |     :param s3_key: Your S3 API Key
337 |     :param s3_secret: Your S3 API Secret
338 |     :param bucket_name: Your S3 bucket name
339 |     :param prefix: The path to the S3 folder to upload to/download from. Example: bucket_root/folder_1
340 |     :param local_folder: The local folder you wish to upload/download the files from/to
341 |     :param queue: A ProcessKeyQueue instance to enqueue all the keys in
342 |     :param thread_count: The number of threads that you wish s3concurrent to use
343 |     :param max_retry: The max times for s3concurrent to retry uploading/downloading a key
344 |     :return: None; queue.all_processed is set to True once every key has been processed
345 |     '''
346 |     conn = S3Connection(s3_key, s3_secret)
347 |     bucket = Bucket(connection=conn, name=bucket_name)
348 | 
349 |     if action == 'download':
350 |         target_function = enqueue_s3_keys_for_download
351 |     else:
352 |         target_function = enqueue_s3_keys_for_upload
353 | 
354 |     enqueue_thread = threading.Thread(target=target_function, args=(bucket, prefix, local_folder, queue))
355 |     enqueue_thread.daemon = True
356 |     enqueue_thread.start()
357 | 
358 |     queue.queuing_started()
359 | 
360 |     consume_thread = threading.Thread(target=consume_queue, args=(queue, action, thread_count, max_retry))
361 |     consume_thread.daemon = True
362 |     consume_thread.start()
363 | 
364 |     while not queue.all_processed:
365 |         # report progress every 10 secs
366 |         logger.info('{0} keys enqueued, and {1} keys {2}ed'.format(queue.enqueued_counter, queue.de_queue_counter, action))
367 |         time.sleep(10)
368 | 
369 |     logger.info('{0} keys enqueued, and {1} keys {2}ed'.format(queue.enqueued_counter, queue.de_queue_counter, action))
370 | 
371 | 
372 | def main(action, command_line_args):
373 |     parser = argparse.ArgumentParser(prog='s3concurrent_{0}'.format(action))
374 |     parser.add_argument('s3_key', help="Your S3 API Key")
375 |     parser.add_argument('s3_secret', help="Your S3 secret key")
376 |     parser.add_argument('bucket_name', help="Your S3 bucket name")
377 |     parser.add_argument('--prefix', default=None, help="Path to a folder in the S3 bucket (e.g. my/dest/folder/)".format(action))
378 |     parser.add_argument('--local_folder', default='.', help="Path to a local filesystem folder (e.g. 
/my/src/folder)".format(action)) 378 | parser.add_argument('--thread_count', default=10, help="Number of concurrent files to upload/download") 379 | parser.add_argument('--max_retry', default=10, help="Max retries for uploading/downloading a file") 380 | 381 | args = parser.parse_args(command_line_args) 382 | 383 | queue = ProcessKeyQueue() 384 | 385 | if args.s3_key and args.s3_secret and args.bucket_name: 386 | process_all(action, args.s3_key, args.s3_secret, args.bucket_name, args.prefix, args.local_folder, queue, int(args.thread_count), int(args.max_retry)) 387 | 388 | if queue.all_processed: 389 | logger.info('All keys are {0}ed'.format(action)) 390 | else: 391 | logger.info('{0} interrupted'.format(action)) 392 | 393 | return queue.all_processed 394 | 395 | 396 | def s3concurrent_download(command_line_args=None): 397 | main('download', command_line_args=command_line_args) 398 | 399 | 400 | def s3concurrent_upload(command_line_args=None): 401 | main('upload', command_line_args=command_line_args) 402 | -------------------------------------------------------------------------------- /s3concurrent/tests/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quid/s3concurrent/b9c2442bed24e5cd3c536d012d0db39e341323c2/s3concurrent/tests/data/.gitkeep -------------------------------------------------------------------------------- /s3concurrent/tests/test_s3concurrent.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import mock 4 | import os 5 | import shutil 6 | import tempfile 7 | import time 8 | import unittest 9 | import uuid 10 | 11 | from s3concurrent import s3concurrent 12 | 13 | sandbox = os.path.dirname(os.path.realpath(__file__)) + '/sandbox/' 14 | 15 | 16 | class TestS3Concurrent(unittest.TestCase): 17 | 18 | def test_enqueue_s3_keys_for_download(self): 19 | 20 | mock_folder1 = 'a/b/' 21 | mocked_key1 = mock.Mock() 22 | mocked_key1.name = mock_folder1 + 'c' 23 | 24 | mock_folder2 = 'b/c/' 25 | mocked_key2 = mock.Mock() 26 | mocked_key2.name = mock_folder2 + 'd' 27 | 28 | mock_folder3 = 'c/d/' 29 | mocked_key3 = mock.Mock() 30 | mocked_key3.name = mock_folder3 + 'e' 31 | 32 | mocked_bucket = mock.Mock() 33 | mocked_bucket.list = lambda prefix: [mocked_key1, mocked_key2, mocked_key3] 34 | 35 | queue = s3concurrent.ProcessKeyQueue() 36 | 37 | s3concurrent.enqueue_s3_keys_for_download(mocked_bucket, 'test/prefix', sandbox, queue) 38 | 39 | self.assertTrue(os.path.exists(sandbox + mock_folder1)) 40 | self.assertTrue(os.path.exists(sandbox + mock_folder2)) 41 | self.assertTrue(os.path.exists(sandbox + mock_folder3)) 42 | 43 | self.assertEquals(queue.enqueued_counter, 3) 44 | self.assertFalse(queue.is_empty()) 45 | 46 | self.assertFalse(queue.is_queuing()) 47 | 48 | @mock.patch('os.path.dirname', side_effect=Exception) 49 | def test_enqueue_s3_keys_for_download_error(self, mocked_dirname): 50 | mock_folder1 = 'a/b/' 51 | mocked_key1 = mock.Mock() 52 | mocked_key1.name = mock_folder1 + 'c' 53 | 54 | mocked_bucket = mock.Mock() 55 | mocked_bucket.list = lambda prefix: [mocked_key1] 56 | 57 | queue = s3concurrent.ProcessKeyQueue() 58 | 59 | s3concurrent.enqueue_s3_keys_for_download(mocked_bucket, 'test/prefix', sandbox, queue) 60 | 61 | self.assertEquals(queue.enqueued_counter, 0) 62 | self.assertTrue(queue.is_empty()) 63 | 64 | self.assertFalse(queue.is_queuing()) 65 | 66 | def test_enqueue_s3_keys_for_upload(self): 67 | # fake files to be 
enqueued 68 | for item in ['a', 'b', 'c']: 69 | with open(sandbox + '{0}.txt'.format(item), 'wb') as f: 70 | f.write('mocked file') 71 | 72 | mocked_bucket = mock.Mock() 73 | 74 | queue = s3concurrent.ProcessKeyQueue() 75 | 76 | s3concurrent.enqueue_s3_keys_for_upload(mocked_bucket, 'test/prefix', sandbox, queue) 77 | 78 | self.assertEquals(queue.enqueued_counter, 3) 79 | self.assertFalse(queue.is_empty()) 80 | 81 | self.assertFalse(queue.is_queuing()) 82 | 83 | def test_download_a_key(self): 84 | mock_folder1 = 'a/b/' 85 | mocked_key1 = mock.Mock() 86 | mocked_key1.name = mock_folder1 + 'c' 87 | mocked_key1.get_contents_to_filename = mock.Mock() 88 | 89 | queue = s3concurrent.ProcessKeyQueue() 90 | queue.enqueue_item(mocked_key1, sandbox) 91 | 92 | self.assertEquals(queue.enqueued_counter, 1) 93 | 94 | s3concurrent.process_a_key(queue, 'download', 1) 95 | 96 | self.assertEquals(queue.de_queue_counter, 1) 97 | self.assertTrue(queue.is_empty()) 98 | mocked_key1.get_contents_to_filename.assert_called_once_with(sandbox) 99 | 100 | def test_download_a_key_error(self): 101 | mock_folder1 = 'a/b/' 102 | mocked_key1 = mock.Mock() 103 | mocked_key1.name = mock_folder1 + 'c' 104 | mocked_key1.get_contents_to_filename = mock.Mock() 105 | mocked_key1.get_contents_to_filename.side_effect = Exception 106 | 107 | queue = s3concurrent.ProcessKeyQueue() 108 | queue.enqueue_item(mocked_key1, sandbox) 109 | 110 | self.assertEquals(queue.enqueued_counter, 1) 111 | 112 | s3concurrent.process_a_key(queue, 'download', 1) 113 | 114 | self.assertEquals(queue.de_queue_counter, 1) 115 | mocked_key1.get_contents_to_filename.assert_called_once_with(sandbox) 116 | self.assertFalse(queue.is_empty()) 117 | 118 | self.assertEquals(queue.enqueued_counter, 2) 119 | 120 | def test_upload_a_key(self): 121 | test_key_name = sandbox + 'test.txt' 122 | 123 | with open(test_key_name, 'wb') as f: 124 | f.write('mocked file') 125 | 126 | mocked_key1 = mock.Mock() 127 | mocked_key1.name = test_key_name 128 | mocked_key1.set_contents_from_filename = mock.Mock() 129 | 130 | queue = s3concurrent.ProcessKeyQueue() 131 | queue.enqueue_item(mocked_key1, test_key_name) 132 | 133 | self.assertEquals(queue.enqueued_counter, 1) 134 | 135 | s3concurrent.process_a_key(queue, 'upload', 1) 136 | 137 | self.assertEquals(queue.de_queue_counter, 1) 138 | self.assertTrue(queue.is_empty()) 139 | mocked_key1.set_contents_from_filename.assert_called_once_with(test_key_name) 140 | 141 | def test_upload_a_key_error(self): 142 | test_key_name = sandbox + 'test.txt' 143 | 144 | mocked_key1 = mock.Mock() 145 | mocked_key1.name = test_key_name 146 | mocked_key1.set_contents_from_filename = mock.Mock() 147 | mocked_key1.set_contents_from_filename.side_effect = Exception 148 | 149 | queue = s3concurrent.ProcessKeyQueue() 150 | queue.enqueue_item(mocked_key1, sandbox) 151 | 152 | self.assertEquals(queue.enqueued_counter, 1) 153 | 154 | s3concurrent.process_a_key(queue, 'upload', 1) 155 | 156 | self.assertEquals(queue.de_queue_counter, 1) 157 | mocked_key1.set_contents_from_filename.assert_called_once_with(sandbox) 158 | self.assertFalse(queue.is_empty()) 159 | 160 | self.assertEquals(queue.enqueued_counter, 2) 161 | 162 | def test_is_sync_needed(self): 163 | mocked_key1 = mock.Mock() 164 | mocked_key1.etag = '' 165 | 166 | mocked_file_path = sandbox + '/a.txt' 167 | 168 | with open(mocked_file_path, 'wb') as f: 169 | f.write('mocked file') 170 | 171 | download = s3concurrent.is_sync_needed(mocked_key1, mocked_file_path) 172 | self.assertTrue(download) 173 | 174 
| def test_is_sync_not_needed(self): 175 | mocked_key1 = mock.Mock() 176 | mocked_key1.etag = '"de3a2ccff42d63dc60c6955634d122da"' 177 | 178 | mocked_file_path = sandbox + '/a.txt' 179 | 180 | with open(mocked_file_path, 'wb') as f: 181 | f.write('mocked file') 182 | 183 | self.assertEquals('de3a2ccff42d63dc60c6955634d122da', s3concurrent._get_md5(mocked_file_path)) 184 | 185 | download = s3concurrent.is_sync_needed(mocked_key1, mocked_file_path) 186 | self.assertFalse(download) 187 | 188 | @mock.patch('hashlib.md5', side_effect=Exception) 189 | def test_is_sync_needed_error(self, mocked_read_md5): 190 | mocked_key1 = mock.Mock() 191 | download = s3concurrent.is_sync_needed(mocked_key1, sandbox + '/a.txt') 192 | self.assertTrue(download) 193 | 194 | @mock.patch('s3concurrent.s3concurrent.process_a_key') 195 | def test_consume_download_queue(self, mocked_consume_a_key): 196 | mocked_key1 = mock.Mock() 197 | mocked_key2 = mock.Mock() 198 | mocked_key3 = mock.Mock() 199 | 200 | queue = s3concurrent.ProcessKeyQueue() 201 | 202 | queue.queuing_started() 203 | queue.enqueue_item(mocked_key1, sandbox) 204 | queue.enqueue_item(mocked_key2, sandbox) 205 | queue.enqueue_item(mocked_key3, sandbox) 206 | queue.queuing_stopped() 207 | 208 | def mock_dequeue_a_key(queue, action, max_retry): 209 | queue.de_queue_an_item() 210 | 211 | mocked_consume_a_key.side_effect = mock_dequeue_a_key 212 | 213 | self.assertFalse(queue.is_empty()) 214 | self.assertEquals(3, queue.enqueued_counter) 215 | 216 | s3concurrent.consume_queue(queue, 'download', 3, 1) 217 | time.sleep(0.1) 218 | 219 | self.assertTrue(queue.is_empty()) 220 | self.assertEquals(3, queue.de_queue_counter) 221 | 222 | @mock.patch('s3concurrent.s3concurrent.process_a_key') 223 | def test_consume_upload_queue(self, mocked_consume_a_key): 224 | mocked_key1 = mock.Mock() 225 | mocked_key2 = mock.Mock() 226 | mocked_key3 = mock.Mock() 227 | 228 | queue = s3concurrent.ProcessKeyQueue() 229 | 230 | queue.queuing_started() 231 | queue.enqueue_item(mocked_key1, sandbox) 232 | queue.enqueue_item(mocked_key2, sandbox) 233 | queue.enqueue_item(mocked_key3, sandbox) 234 | queue.queuing_stopped() 235 | 236 | def mock_dequeue_a_key(queue, action, max_retry): 237 | queue.de_queue_an_item() 238 | 239 | mocked_consume_a_key.side_effect = mock_dequeue_a_key 240 | 241 | self.assertFalse(queue.is_empty()) 242 | self.assertEquals(3, queue.enqueued_counter) 243 | 244 | s3concurrent.consume_queue(queue, 'upload', 3, 1) 245 | time.sleep(0.1) 246 | 247 | self.assertTrue(queue.is_empty()) 248 | self.assertEquals(3, queue.de_queue_counter) 249 | 250 | @mock.patch('time.sleep') 251 | @mock.patch('s3concurrent.s3concurrent.is_sync_needed', return_value=True) 252 | def test_process_a_key_waiting(self, mocked_is_sync_needed, mocked_sleep): 253 | mock_folder1 = 'a/b/' 254 | mocked_key1 = mock.Mock() 255 | mocked_key1.name = mock_folder1 + 'c' 256 | mocked_key1.get_contents_to_filename = mock.Mock() 257 | 258 | queue = s3concurrent.ProcessKeyQueue() 259 | queue.enqueue_item(mocked_key1, sandbox, 2) 260 | 261 | s3concurrent.process_a_key(queue, 'download', 3) 262 | 263 | mocked_sleep.assert_called_once_with(4) 264 | 265 | @mock.patch('time.sleep') 266 | @mock.patch('s3concurrent.s3concurrent.is_sync_needed', return_value=True) 267 | def test_process_a_key_max_retry(self, mocked_is_sync_needed, mocked_sleep): 268 | mock_folder1 = 'a/b/' 269 | mocked_key1 = mock.Mock() 270 | mocked_key1.name = mock_folder1 + 'c' 271 | mocked_key1.get_contents_to_filename = mock.Mock() 272 | 273 | 
queue = s3concurrent.ProcessKeyQueue() 274 | queue.enqueue_item(mocked_key1, sandbox, 2) 275 | 276 | s3concurrent.process_a_key(queue, 'download', 1) 277 | 278 | self.assertEquals(0, mocked_sleep.call_count) 279 | 280 | def test_get_md5(self): 281 | self.assertEquals( 282 | '032b6af31d2d1be87ff63adb423d270f', 283 | s3concurrent._get_md5(self.temp_filename) 284 | ) 285 | 286 | def test_calculate_s3_etag(self): 287 | self.assertEquals( 288 | '3d6c16c58ab63e8b4f66cb09040eb660-5', 289 | s3concurrent._calculate_s3_etag(self.temp_filename, s3concurrent.AWS_UPLOAD_PART_SIZE) 290 | ) 291 | 292 | def test_s3_etag_match_with_multipart_upload(self): 293 | self.assertTrue( 294 | s3concurrent._s3_etag_match( 295 | '3d6c16c58ab63e8b4f66cb09040eb660-5', 296 | self.temp_filename 297 | ) 298 | ) 299 | 300 | def test_s3_etag_match_with_multipart_upload_incorrect(self): 301 | self.assertFalse( 302 | s3concurrent._s3_etag_match( 303 | '3d6d16c58ab63e8b4f66cb09040eb660-5', 304 | self.temp_filename 305 | ) 306 | ) 307 | 308 | def test_s3_etag_match_with_singlepart_upload(self): 309 | self.assertTrue( 310 | s3concurrent._s3_etag_match( 311 | '032b6af31d2d1be87ff63adb423d270f', 312 | self.temp_filename 313 | ) 314 | ) 315 | 316 | def test_s3_etag_match_with_singlepart_upload_incorrect(self): 317 | self.assertFalse( 318 | s3concurrent._s3_etag_match( 319 | '032b6bf31d2d1be87ff63adb423d270f', 320 | self.temp_filename 321 | ) 322 | ) 323 | 324 | @classmethod 325 | def setUpClass(cls): 326 | # Create a predictable 296.1 MB temporary file 327 | elements = [200, 50, 25] * 9999 328 | cls.temp_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data', "%s.bin" % uuid.uuid4()) 329 | tempfile = open(cls.temp_filename, 'wb') 330 | 331 | for i in xrange(0, 9872): 332 | tempfile.write(bytearray(elements)) 333 | 334 | tempfile.close() 335 | 336 | @classmethod 337 | def tearDownClass(cls): 338 | os.remove(cls.temp_filename) 339 | 340 | def setUp(self): 341 | if os.path.exists(sandbox): 342 | shutil.rmtree(sandbox) 343 | os.makedirs(sandbox) 344 | 345 | def tearDown(self): 346 | shutil.rmtree(sandbox) 347 | 348 | 349 | if __name__ == '__main__': 350 | unittest.main() 351 | 352 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quid/s3concurrent/b9c2442bed24e5cd3c536d012d0db39e341323c2/setup.cfg -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from pip.req import parse_requirements 3 | 4 | setup( 5 | name='s3concurrent', 6 | version='0.3.0', 7 | author='Quid Inc.', 8 | author_email='ops@quid.com', 9 | packages=['s3concurrent'], 10 | scripts=[], 11 | url='https://github.com/quid/s3concurrent', 12 | license='MIT', 13 | description='A fast S3 downloader/uploader for deep file structures.', 14 | keywords='s3 download upload tools', 15 | long_description=open('README.md').read(), 16 | install_requires=(str(ir.req) for ir in \ 17 | parse_requirements('requirements.txt', session=False) 18 | ), 19 | entry_points={ 20 | 'console_scripts': [ 21 | 's3concurrent_download=s3concurrent.s3concurrent:s3concurrent_download', 22 | 's3concurrent_upload=s3concurrent.s3concurrent:s3concurrent_upload' 23 | ]} 24 | ) 25 | --------------------------------------------------------------------------------
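The console scripts that setup.py registers (`s3concurrent_download`, `s3concurrent_upload`) are thin wrappers around `s3concurrent.main(action, args)`, so the same work can be driven from Python directly. A small sketch; the credentials and bucket name are placeholders:

```python
from s3concurrent import s3concurrent

# Equivalent to the CLI call:
#   s3concurrent_download <s3_key> <s3_secret> <bucket_name> --prefix mirror/pypi --thread_count 20
all_processed = s3concurrent.main(
    'download',
    ['<s3_key>', '<s3_secret>', '<bucket_name>', '--prefix', 'mirror/pypi', '--thread_count', '20'])
```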