├── .gitignore ├── .pypirc ├── LICENSE ├── MANIFEST ├── README.md ├── requirements.txt ├── s3concurrent ├── __init__.py ├── bin │ ├── s3concurrent_download │ └── s3concurrent_upload ├── s3concurrent.py └── tests │ ├── data │ └── .gitkeep │ └── test_s3concurrent.py ├── setup.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.mo 2 | *.egg-info 3 | *.egg 4 | *.EGG 5 | *.EGG-INFO 6 | build 7 | develop-eggs 8 | downloads 9 | eggs 10 | fake-eggs 11 | parts 12 | dist 13 | .installed.cfg 14 | .mr.developer.cfg 15 | .hg 16 | .bzr 17 | .svn 18 | *.pyc 19 | *.pyo 20 | *.tmp* 21 | /venv/* 22 | -------------------------------------------------------------------------------- /.pypirc: -------------------------------------------------------------------------------- 1 | [distutils] 2 | index-servers = 3 | pypi 4 | pypitest 5 | 6 | [pypi] 7 | repository: https://pypi.python.org/pypi 8 | username: {{your_username}} 9 | password: {{your_password}} 10 | 11 | [pypitest] 12 | repository: https://testpypi.python.org/pypi 13 | username: {{your_username}} 14 | password: {{your_password}} -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Quid, Inc. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST: -------------------------------------------------------------------------------- 1 | # file GENERATED by distutils, do NOT edit 2 | README.txt 3 | setup.py 4 | s3concurrent/__init__.py 5 | s3concurrent/s3concurrent.py 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | s3concurrent uploads/downloads files to/from S3. 4 | 5 | Features include: 6 | 7 | * Handles deep folder structures with many files. 8 | * Uploads/downloads many files concurrently. 9 | * Maintains folder structure between a S3 bucket and local file system. 10 | * Only uploads/downloads a file when a file has changed between S3 bucket and 11 | local file system. 
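Change detection (the last feature above) works by comparing the local file's MD5 checksum against the key's S3 ETag before transferring anything. A minimal sketch of the idea, assuming a boto `Key` object; the helper names below are illustrative only and are not part of the package:

```python
import hashlib

def local_md5(path, blocksize=65536):
    # Hash the file in chunks so large files never have to fit in memory.
    hasher = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(blocksize), b''):
            hasher.update(chunk)
    return hasher.hexdigest()

def needs_sync(key, local_path):
    # S3 wraps the ETag in double quotes. Multipart uploads get an
    # "md5-of-part-md5s-<part count>" ETag instead of a plain MD5; s3concurrent
    # handles that case separately (see _calculate_s3_etag in s3concurrent.py).
    return key.etag.strip('"') != local_md5(local_path)
```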
12 | 
13 | # Installation
14 | 
15 | ```
16 | git clone https://github.com/quid/s3concurrent.git
17 | pip install s3concurrent/
18 | ```
19 | 
20 | # Usage
21 | 
22 | ## s3concurrent_download
23 | 
24 |     usage: s3concurrent_download [-h] [--prefix PREFIX]
25 |                                  [--local_folder LOCAL_FOLDER]
26 |                                  [--thread_count THREAD_COUNT]
27 |                                  [--max_retry MAX_RETRY]
28 |                                  s3_key s3_secret bucket_name
29 | 
30 |     positional arguments:
31 |       s3_key                Your S3 API Key
32 |       s3_secret             Your S3 secret key
33 |       bucket_name           Your S3 bucket name
34 | 
35 |     optional arguments:
36 |       -h, --help            show this help message and exit
37 |       --prefix PREFIX       Path to a folder in the S3 bucket (e.g. my/dest/folder/)
38 |       --local_folder LOCAL_FOLDER
39 |                             Path to a local filesystem folder (e.g. /my/src/folder)
40 |       --thread_count THREAD_COUNT
41 |                             Number of concurrent files to upload/download
42 |       --max_retry MAX_RETRY
43 |                             Max retries for uploading/downloading a file
44 | 
45 | ## s3concurrent_upload
46 | 
47 |     usage: s3concurrent_upload [-h] [--prefix PREFIX]
48 |                                [--local_folder LOCAL_FOLDER]
49 |                                [--thread_count THREAD_COUNT]
50 |                                [--max_retry MAX_RETRY]
51 |                                s3_key s3_secret bucket_name
52 | 
53 |     positional arguments:
54 |       s3_key                Your S3 API Key
55 |       s3_secret             Your S3 secret key
56 |       bucket_name           Your S3 bucket name
57 | 
58 |     optional arguments:
59 |       -h, --help            show this help message and exit
60 |       --prefix PREFIX       Path to a folder in the S3 bucket (e.g. my/dest/folder/)
61 |       --local_folder LOCAL_FOLDER
62 |                             Path to a local filesystem folder (e.g. /my/src/folder)
63 |       --thread_count THREAD_COUNT
64 |                             Number of concurrent files to upload/download
65 |       --max_retry MAX_RETRY
66 |                             Max retries for uploading/downloading a file
67 | 
68 | 
69 | # Examples
70 | 
71 | Download files from the folder 'mirror/pypi' in an S3 bucket to a local folder
72 | '/path/to/mirror/pypi' with 20 concurrent downloads.
73 | 
74 | 
75 | ```
76 | s3concurrent_download <s3_key> <s3_secret> <bucket_name> --local_folder /path/to/mirror/pypi --prefix mirror/pypi --thread_count 20
77 | ```
78 | 
79 | Upload files from the folder '/tmp/benchmark' to a 'benchmark' folder on S3 with
80 | 10 concurrent uploads and 3 retries per upload.
81 | 
82 | ```
83 | s3concurrent_upload <s3_key> <s3_secret> <bucket_name> --local_folder /tmp/benchmark --prefix benchmark --thread_count 10 --max_retry 3
84 | ```
85 | 
86 | # Running the tests
87 | 
88 | To run the s3concurrent tests, use the following command from the repository's root directory after cloning it.
89 | 
90 | ```
91 | python -m unittest discover s3concurrent/tests
92 | ```
93 | 
94 | You should see all 14 tests passing at the end of the console output.
95 | 96 | ---------------------------------------------------------------------- 97 | Ran 14 tests in 0.222s 98 | 99 | OK 100 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | argparse>=1.3.0 2 | boto>=2.32.1 3 | colorlog==2.6.1 4 | mock==1.0.1 5 | nose==1.3.4 6 | -------------------------------------------------------------------------------- /s3concurrent/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quid/s3concurrent/b9c2442bed24e5cd3c536d012d0db39e341323c2/s3concurrent/__init__.py -------------------------------------------------------------------------------- /s3concurrent/bin/s3concurrent_download: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | 6 | # Tell the system to load the s3concurrent package 7 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), '..'))) 8 | 9 | import s3concurrent 10 | 11 | if __name__ == '__main__': 12 | sys.exit(s3concurrent.main('download', sys.argv[1:])) 13 | -------------------------------------------------------------------------------- /s3concurrent/bin/s3concurrent_upload: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os 4 | import sys 5 | 6 | # Tell the system to load the s3concurrent package 7 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), '..'))) 8 | 9 | import s3concurrent 10 | 11 | if __name__ == '__main__': 12 | sys.exit(s3concurrent.main('upload', sys.argv[1:])) 13 | -------------------------------------------------------------------------------- /s3concurrent/s3concurrent.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import binascii 5 | import colorlog 6 | import hashlib 7 | import logging 8 | import os 9 | import sys 10 | import threading 11 | import time 12 | 13 | from boto.s3.bucket import Bucket 14 | from boto.s3.connection import S3Connection 15 | from boto.s3.key import Key 16 | from Queue import Queue 17 | 18 | # AWS magic chunk size number. Discovered via brute force. 19 | AWS_UPLOAD_PART_SIZE = 64 * 1024 * 1024 20 | 21 | # Max number of items allowed in the queue to keep from blowing up memory 22 | MAX_QUEUE_SIZE = 100000 23 | 24 | # Configure logging 25 | logger = logging.getLogger() 26 | logger.setLevel(logging.DEBUG) 27 | 28 | ch = logging.StreamHandler(sys.stdout) 29 | ch.setLevel(logging.INFO) 30 | formatter = colorlog.ColoredFormatter('%(log_color)s%(levelname)s: %(message)s') 31 | ch.setFormatter(formatter) 32 | logger.addHandler(ch) 33 | 34 | 35 | class ProcessKeyQueue: 36 | ''' 37 | ProcessKeyQueue implements the queuing functions needed for s3concurrent upload/download. 38 | ''' 39 | 40 | def __init__(self): 41 | self.process_able_keys_queue = Queue() 42 | self.enqueued_counter = 0 43 | self.de_queue_counter = 0 44 | self.all_processed = False 45 | self.queuing = False 46 | 47 | def enqueue_item(self, key, local_file_path, enqueue_count=1): 48 | ''' 49 | Enqueues an item to upload/download. 
50 | 
51 |         :param key: s3 key to upload/download
52 |         :param local_file_path: local file path corresponding to the s3 key
53 |         :param enqueue_count: number of times this key has been enqueued
54 |         '''
55 |         self.process_able_keys_queue.put((key, local_file_path, enqueue_count))
56 |         self.enqueued_counter += 1
57 | 
58 |     def is_empty(self):
59 |         '''
60 |         Checks if the queue is empty.
61 | 
62 |         :return: (bool) true if the queue is empty
63 |         '''
64 |         return self.process_able_keys_queue.empty()
65 | 
66 |     def de_queue_an_item(self):
67 |         '''
68 |         De-queues an item from the queue.
69 | 
70 |         :return: an item previously enqueued
71 |         '''
72 |         value = None
73 | 
74 |         if not self.is_empty():
75 |             value = self.process_able_keys_queue.get()
76 |             self.de_queue_counter += 1
77 | 
78 |         return value
79 | 
80 |     def is_queuing(self):
81 |         '''
82 |         Checks if queuing all the process-able keys from S3.
83 | 
84 |         :return: True if still queuing
85 |         '''
86 |         return self.queuing
87 | 
88 |     def queuing_stopped(self):
89 |         '''
90 |         Stops the queuing from S3.
91 |         '''
92 |         self.queuing = False
93 | 
94 |     def queuing_started(self):
95 |         '''
96 |         Starts the queuing from S3.
97 |         '''
98 |         self.queuing = True
99 | 
100 | 
101 | def enqueue_s3_keys_for_download(s3_bucket, prefix, destination_folder, queue):
102 |     '''
103 |     En-queues S3 Keys to be downloaded.
104 | 
105 |     :param s3_bucket: Boto Bucket object that contains the keys to be downloaded
106 |     :param prefix: The path to the S3 folder to be downloaded. Example: bucket_root/folder_1
107 |     :param destination_folder: The relative or absolute path to the folder you wish to download to
108 |     :param queue: A ProcessKeyQueue instance to enqueue all the keys in
109 |     '''
110 |     bucket_list = s3_bucket.list(prefix=prefix)
111 | 
112 |     for key in bucket_list:
113 |         # prepare local destination structure
114 |         destination = destination_folder + (key.name.replace(prefix, '', 1) if prefix else '/' + key.name)
115 |         try:
116 |             containing_dir = os.path.dirname(destination)
117 |             if not os.path.exists(containing_dir):
118 |                 os.makedirs(containing_dir)
119 | 
120 |             # Don't queue more items while over 100,000 to prevent memory explosion
121 |             while MAX_QUEUE_SIZE < queue.process_able_keys_queue.qsize():
122 |                 time.sleep(1)
123 | 
124 |             # enqueue
125 |             queue.enqueue_item(key, destination)
126 | 
127 |         except:
128 |             logger.exception('Cannot enqueue key: {0}'.format(key.name))
129 | 
130 |     logger.info('Initial queuing has completed. {0} keys have been enqueued.'.format(queue.enqueued_counter))
131 |     queue.queuing_stopped()
132 | 
133 | 
134 | def enqueue_s3_keys_for_upload(s3_bucket, prefix, from_folder, queue):
135 |     '''
136 |     En-queues S3 Keys to be uploaded.
137 | 
138 |     :param s3_bucket: Boto Bucket object that the keys will be uploaded to
139 |     :param prefix: The path to the S3 folder to upload to. Example: bucket_root/folder_1
140 |     :param from_folder: The relative or absolute path to the folder you wish to upload from
141 |     :param queue: A ProcessKeyQueue instance to enqueue all the keys in
142 |     '''
143 |     abs_from_folder_path = os.path.abspath(from_folder)
144 | 
145 |     for root, dirs, files in os.walk(abs_from_folder_path):
146 |         for single_file in files:
147 |             abs_file_path = os.path.join(root, single_file)
148 | 
149 |             s3_key_name = abs_file_path.replace(abs_from_folder_path, '', 1)
150 |             if not s3_key_name.startswith('/') and prefix != '':
151 |                 s3_key_name = '/' + s3_key_name
152 |             s3_key_name = (prefix or '') + s3_key_name
153 | 
154 |             key = Key(s3_bucket)
155 |             key.key = s3_key_name
156 | 
157 |             # Don't queue more items to prevent memory explosion
158 |             while MAX_QUEUE_SIZE < queue.process_able_keys_queue.qsize():
159 |                 time.sleep(1)
160 | 
161 |             queue.enqueue_item(key, abs_file_path)
162 | 
163 |     logger.info('Initial queuing has completed. {0} keys have been enqueued.'.format(queue.enqueued_counter))
164 |     queue.queuing_stopped()
165 | 
166 | 
167 | def is_sync_needed(key, local_file_path):
168 |     '''
169 |     Checks whether the S3 key needs to be synced by comparing the local file's checksum against the key's etag.
170 | 
171 |     :param key: The S3 key object.
172 |     :param local_file_path: (str), the local file path corresponding to the key
173 |     '''
174 |     sync_needed = True
175 |     if os.path.isfile(local_file_path) and key.exists():
176 |         try:
177 |             key_etag = key.etag
178 |             if not key_etag:
179 |                 key_etag = key.bucket.lookup(key.name).etag
180 | 
181 |             # S3 wraps etags in double quotes; strip them before comparing
182 |             if _s3_etag_match(key_etag.strip('"'), local_file_path):
183 |                 sync_needed = False
184 | 
185 |         except:
186 |             logger.exception(sys.exc_info())
187 |             logger.error(
188 |                 'Cannot compare local file {0} against remote file {1}. s3concurrent will process it anyway.'
189 |                 .format(local_file_path, key.name))
190 | 
191 |     return sync_needed
192 | 
193 | 
194 | def _s3_etag_match(etag, file_path):
195 |     '''
196 |     Checks if the local file's checksum matches the S3 etag.
197 | 
198 |     :param etag: (str), the S3 etag.
199 |     :param file_path: (str), the local file to check.
200 |     :return: (bool), whether or not the etag matches the checksum of the local file.
201 |     '''
202 |     matches = False
203 | 
204 |     if '-' in etag:
205 |         # If the etag contains a dash, then the file was uploaded in parts
206 |         matches = _calculate_s3_etag(file_path, AWS_UPLOAD_PART_SIZE) == etag
207 | 
208 |     else:
209 |         # Etag will be an MD5 checksum when the file was uploaded as a whole
210 |         matches = _get_md5(file_path) == etag
211 | 
212 |     return matches
213 | 
214 | 
215 | def _calculate_s3_etag(file_path, part_size):
216 |     '''
217 |     Calculates the S3 etag of a file when the upload was performed in parts.
218 | 
219 |     :param file_path: (str), the local file to calculate the etag for.
220 |     :param part_size: (int), the size of the chunks that were used to upload the file to S3.
221 |     :return: (str), the calculated S3 etag of the local file.
222 |     '''
223 |     block_count = 0
224 |     md5string = ''
225 |     with open(file_path, 'rb') as open_file:
226 |         buf = open_file.read(part_size)
227 |         while len(buf) > 0:
228 |             hasher = hashlib.md5()
229 |             hasher.update(buf)
230 |             md5string += binascii.unhexlify(hasher.hexdigest())
231 |             block_count += 1
232 | 
233 |             buf = open_file.read(part_size)
234 | 
235 |     hasher = hashlib.md5()
236 |     hasher.update(md5string)
237 |     return hasher.hexdigest() + '-' + str(block_count)
238 | 
239 | 
240 | def _get_md5(filename, blocksize=65536):
241 |     '''
242 |     Retrieves the MD5 checksum for the given filename.
243 | 
244 |     :param filename: (str), the file path to obtain the checksum of
245 |     :param blocksize: (int), the largest chunk of the file to read into memory at once
246 |     :return: the MD5 checksum
247 |     '''
248 |     hasher = hashlib.md5()
249 |     with open(filename, 'rb') as open_file:
250 |         buf = open_file.read(blocksize)
251 |         while len(buf) > 0:
252 |             hasher.update(buf)
253 |             buf = open_file.read(blocksize)
254 | 
255 |     return hasher.hexdigest()
256 | 
257 | 
258 | def process_a_key(queue, action, max_retry):
259 |     '''
260 |     Processes (downloads or uploads) an S3 key from/to its respective local path.
261 | 
262 |     :param queue: A ProcessKeyQueue instance to de-queue a key from
263 |     :param action: download or upload
264 |     :param max_retry: The max times for s3concurrent to retry uploading/downloading a key
265 |     '''
266 |     if not queue.is_empty():
267 |         key, local_path, enqueue_count = queue.de_queue_an_item()
268 | 
269 |         try:
270 | 
271 |             if is_sync_needed(key, local_path) and enqueue_count <= max_retry:
272 | 
273 |                 # back off according to enqueue_count
274 |                 if enqueue_count > 1:
275 |                     wait_time = enqueue_count ** 2
276 |                     logger.info('Attempt no.{0} to {1} {2}. Wait {3} secs.'.format(enqueue_count, action, key.name, wait_time))
277 |                     time.sleep(wait_time)
278 | 
279 |                 # conduct upload/download
280 |                 if action == 'download':
281 |                     key.get_contents_to_filename(local_path)
282 |                 else:
283 |                     key.set_contents_from_filename(local_path)
284 | 
285 |             elif enqueue_count > max_retry:
286 |                 logger.error('Ignoring {0} since s3concurrent has already tried to {1} it {2} times.'.format(key.name, action, max_retry))
287 | 
288 |         except:
289 |             if key.size == 0:
290 |                 logger.info('%s is a directory, ignoring', key.name)
291 | 
292 |             else:
293 |                 logger.warn('Error {0}ing file with key: {1}, putting it back in the queue'.format(action, key.name))
294 |                 queue.enqueue_item(key, local_path, enqueue_count=enqueue_count + 1)
295 | 
296 |     else:
297 |         # do nothing when the queue is empty
298 |         pass
299 | 
300 | 
301 | def consume_queue(queue, action, thread_pool_size, max_retry):
302 |     '''
303 |     Consumes the queue with the designated thread pool size by uploading/downloading the keys to
304 |     their respective destinations.
305 | 
306 |     :param queue: A ProcessKeyQueue instance to consume all the keys from
307 |     :param action: "download" or "upload"
308 |     :param thread_pool_size: The designated thread pool size (how many files to process concurrently)
309 |     :param max_retry: The max times for s3concurrent to retry uploading/downloading a key
310 |     '''
311 |     thread_pool = []
312 | 
313 |     while queue.is_queuing() or not queue.is_empty() or len(thread_pool) != 0:
314 |         # de-pool finished threads (iterate over a copy so removal is safe)
315 |         for t in list(thread_pool):
316 |             if not t.is_alive():
317 |                 thread_pool.remove(t)
318 | 
319 |         # en-pool new threads
320 |         if not queue.is_empty() and len(thread_pool) < thread_pool_size:
321 |             t = threading.Thread(target=process_a_key, args=[queue, action, max_retry])
322 |             t.start()
323 |             thread_pool.append(t)
324 | 
325 |     queue.all_processed = True
326 | 
327 | 
328 | def process_all(action, s3_key, s3_secret, bucket_name, prefix, local_folder, queue, thread_count, max_retry):
329 |     '''
330 |     Orchestrates the en-queuing and consuming threads in conducting:
331 |         1. Local folder structure construction
332 |         2. S3 key en-queuing
333 |         3. S3 key uploading/downloading when the file has changed
334 | 
335 |     :param action: download or upload
336 |     :param s3_key: Your S3 API Key
337 |     :param s3_secret: Your S3 API Secret
338 |     :param bucket_name: Your S3 bucket name
339 |     :param prefix: The path to the S3 folder to upload to/download from. Example: bucket_root/folder_1
340 |     :param local_folder: The local folder you wish to upload/download the files from/to
341 |     :param queue: A ProcessKeyQueue instance to enqueue all the keys in
342 |     :param thread_count: The number of threads that you wish s3concurrent to use
343 |     :param max_retry: The max times for s3concurrent to retry uploading/downloading a key
344 |     :return: None; queue.all_processed is set to True once every key has been processed
345 |     '''
346 |     conn = S3Connection(s3_key, s3_secret)
347 |     bucket = Bucket(connection=conn, name=bucket_name)
348 | 
349 |     if action == 'download':
350 |         target_function = enqueue_s3_keys_for_download
351 |     else:
352 |         target_function = enqueue_s3_keys_for_upload
353 | 
354 |     enqueue_thread = threading.Thread(target=target_function, args=(bucket, prefix, local_folder, queue))
355 |     enqueue_thread.daemon = True
356 |     enqueue_thread.start()
357 | 
358 |     queue.queuing_started()
359 | 
360 |     consume_thread = threading.Thread(target=consume_queue, args=(queue, action, thread_count, max_retry))
361 |     consume_thread.daemon = True
362 |     consume_thread.start()
363 | 
364 |     while not queue.all_processed:
365 |         # report progress every 10 secs
366 |         logger.info('{0} keys enqueued, and {1} keys {2}ed'.format(queue.enqueued_counter, queue.de_queue_counter, action))
367 |         time.sleep(10)
368 | 
369 |     logger.info('{0} keys enqueued, and {1} keys {2}ed'.format(queue.enqueued_counter, queue.de_queue_counter, action))
370 | 
371 | 
372 | def main(action, command_line_args):
373 |     parser = argparse.ArgumentParser(prog='s3concurrent_{0}'.format(action))
374 |     parser.add_argument('s3_key', help="Your S3 API Key")
375 |     parser.add_argument('s3_secret', help="Your S3 secret key")
376 |     parser.add_argument('bucket_name', help="Your S3 bucket name")
377 |     parser.add_argument('--prefix', default=None, help="Path to a folder in the S3 bucket (e.g. my/dest/folder/)".format(action))
378 |     parser.add_argument('--local_folder', default='.', help="Path to a local filesystem folder (e.g. 
/my/src/folder)".format(action)) 378 | parser.add_argument('--thread_count', default=10, help="Number of concurrent files to upload/download") 379 | parser.add_argument('--max_retry', default=10, help="Max retries for uploading/downloading a file") 380 | 381 | args = parser.parse_args(command_line_args) 382 | 383 | queue = ProcessKeyQueue() 384 | 385 | if args.s3_key and args.s3_secret and args.bucket_name: 386 | process_all(action, args.s3_key, args.s3_secret, args.bucket_name, args.prefix, args.local_folder, queue, int(args.thread_count), int(args.max_retry)) 387 | 388 | if queue.all_processed: 389 | logger.info('All keys are {0}ed'.format(action)) 390 | else: 391 | logger.info('{0} interrupted'.format(action)) 392 | 393 | return queue.all_processed 394 | 395 | 396 | def s3concurrent_download(command_line_args=None): 397 | main('download', command_line_args=command_line_args) 398 | 399 | 400 | def s3concurrent_upload(command_line_args=None): 401 | main('upload', command_line_args=command_line_args) 402 | -------------------------------------------------------------------------------- /s3concurrent/tests/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quid/s3concurrent/b9c2442bed24e5cd3c536d012d0db39e341323c2/s3concurrent/tests/data/.gitkeep -------------------------------------------------------------------------------- /s3concurrent/tests/test_s3concurrent.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import mock 4 | import os 5 | import shutil 6 | import tempfile 7 | import time 8 | import unittest 9 | import uuid 10 | 11 | from s3concurrent import s3concurrent 12 | 13 | sandbox = os.path.dirname(os.path.realpath(__file__)) + '/sandbox/' 14 | 15 | 16 | class TestS3Concurrent(unittest.TestCase): 17 | 18 | def test_enqueue_s3_keys_for_download(self): 19 | 20 | mock_folder1 = 'a/b/' 21 | mocked_key1 = mock.Mock() 22 | mocked_key1.name = mock_folder1 + 'c' 23 | 24 | mock_folder2 = 'b/c/' 25 | mocked_key2 = mock.Mock() 26 | mocked_key2.name = mock_folder2 + 'd' 27 | 28 | mock_folder3 = 'c/d/' 29 | mocked_key3 = mock.Mock() 30 | mocked_key3.name = mock_folder3 + 'e' 31 | 32 | mocked_bucket = mock.Mock() 33 | mocked_bucket.list = lambda prefix: [mocked_key1, mocked_key2, mocked_key3] 34 | 35 | queue = s3concurrent.ProcessKeyQueue() 36 | 37 | s3concurrent.enqueue_s3_keys_for_download(mocked_bucket, 'test/prefix', sandbox, queue) 38 | 39 | self.assertTrue(os.path.exists(sandbox + mock_folder1)) 40 | self.assertTrue(os.path.exists(sandbox + mock_folder2)) 41 | self.assertTrue(os.path.exists(sandbox + mock_folder3)) 42 | 43 | self.assertEquals(queue.enqueued_counter, 3) 44 | self.assertFalse(queue.is_empty()) 45 | 46 | self.assertFalse(queue.is_queuing()) 47 | 48 | @mock.patch('os.path.dirname', side_effect=Exception) 49 | def test_enqueue_s3_keys_for_download_error(self, mocked_dirname): 50 | mock_folder1 = 'a/b/' 51 | mocked_key1 = mock.Mock() 52 | mocked_key1.name = mock_folder1 + 'c' 53 | 54 | mocked_bucket = mock.Mock() 55 | mocked_bucket.list = lambda prefix: [mocked_key1] 56 | 57 | queue = s3concurrent.ProcessKeyQueue() 58 | 59 | s3concurrent.enqueue_s3_keys_for_download(mocked_bucket, 'test/prefix', sandbox, queue) 60 | 61 | self.assertEquals(queue.enqueued_counter, 0) 62 | self.assertTrue(queue.is_empty()) 63 | 64 | self.assertFalse(queue.is_queuing()) 65 | 66 | def test_enqueue_s3_keys_for_upload(self): 67 | # fake files to be 
enqueued 68 | for item in ['a', 'b', 'c']: 69 | with open(sandbox + '{0}.txt'.format(item), 'wb') as f: 70 | f.write('mocked file') 71 | 72 | mocked_bucket = mock.Mock() 73 | 74 | queue = s3concurrent.ProcessKeyQueue() 75 | 76 | s3concurrent.enqueue_s3_keys_for_upload(mocked_bucket, 'test/prefix', sandbox, queue) 77 | 78 | self.assertEquals(queue.enqueued_counter, 3) 79 | self.assertFalse(queue.is_empty()) 80 | 81 | self.assertFalse(queue.is_queuing()) 82 | 83 | def test_download_a_key(self): 84 | mock_folder1 = 'a/b/' 85 | mocked_key1 = mock.Mock() 86 | mocked_key1.name = mock_folder1 + 'c' 87 | mocked_key1.get_contents_to_filename = mock.Mock() 88 | 89 | queue = s3concurrent.ProcessKeyQueue() 90 | queue.enqueue_item(mocked_key1, sandbox) 91 | 92 | self.assertEquals(queue.enqueued_counter, 1) 93 | 94 | s3concurrent.process_a_key(queue, 'download', 1) 95 | 96 | self.assertEquals(queue.de_queue_counter, 1) 97 | self.assertTrue(queue.is_empty()) 98 | mocked_key1.get_contents_to_filename.assert_called_once_with(sandbox) 99 | 100 | def test_download_a_key_error(self): 101 | mock_folder1 = 'a/b/' 102 | mocked_key1 = mock.Mock() 103 | mocked_key1.name = mock_folder1 + 'c' 104 | mocked_key1.get_contents_to_filename = mock.Mock() 105 | mocked_key1.get_contents_to_filename.side_effect = Exception 106 | 107 | queue = s3concurrent.ProcessKeyQueue() 108 | queue.enqueue_item(mocked_key1, sandbox) 109 | 110 | self.assertEquals(queue.enqueued_counter, 1) 111 | 112 | s3concurrent.process_a_key(queue, 'download', 1) 113 | 114 | self.assertEquals(queue.de_queue_counter, 1) 115 | mocked_key1.get_contents_to_filename.assert_called_once_with(sandbox) 116 | self.assertFalse(queue.is_empty()) 117 | 118 | self.assertEquals(queue.enqueued_counter, 2) 119 | 120 | def test_upload_a_key(self): 121 | test_key_name = sandbox + 'test.txt' 122 | 123 | with open(test_key_name, 'wb') as f: 124 | f.write('mocked file') 125 | 126 | mocked_key1 = mock.Mock() 127 | mocked_key1.name = test_key_name 128 | mocked_key1.set_contents_from_filename = mock.Mock() 129 | 130 | queue = s3concurrent.ProcessKeyQueue() 131 | queue.enqueue_item(mocked_key1, test_key_name) 132 | 133 | self.assertEquals(queue.enqueued_counter, 1) 134 | 135 | s3concurrent.process_a_key(queue, 'upload', 1) 136 | 137 | self.assertEquals(queue.de_queue_counter, 1) 138 | self.assertTrue(queue.is_empty()) 139 | mocked_key1.set_contents_from_filename.assert_called_once_with(test_key_name) 140 | 141 | def test_upload_a_key_error(self): 142 | test_key_name = sandbox + 'test.txt' 143 | 144 | mocked_key1 = mock.Mock() 145 | mocked_key1.name = test_key_name 146 | mocked_key1.set_contents_from_filename = mock.Mock() 147 | mocked_key1.set_contents_from_filename.side_effect = Exception 148 | 149 | queue = s3concurrent.ProcessKeyQueue() 150 | queue.enqueue_item(mocked_key1, sandbox) 151 | 152 | self.assertEquals(queue.enqueued_counter, 1) 153 | 154 | s3concurrent.process_a_key(queue, 'upload', 1) 155 | 156 | self.assertEquals(queue.de_queue_counter, 1) 157 | mocked_key1.set_contents_from_filename.assert_called_once_with(sandbox) 158 | self.assertFalse(queue.is_empty()) 159 | 160 | self.assertEquals(queue.enqueued_counter, 2) 161 | 162 | def test_is_sync_needed(self): 163 | mocked_key1 = mock.Mock() 164 | mocked_key1.etag = '' 165 | 166 | mocked_file_path = sandbox + '/a.txt' 167 | 168 | with open(mocked_file_path, 'wb') as f: 169 | f.write('mocked file') 170 | 171 | download = s3concurrent.is_sync_needed(mocked_key1, mocked_file_path) 172 | self.assertTrue(download) 173 | 174 
| def test_is_sync_not_needed(self): 175 | mocked_key1 = mock.Mock() 176 | mocked_key1.etag = '"de3a2ccff42d63dc60c6955634d122da"' 177 | 178 | mocked_file_path = sandbox + '/a.txt' 179 | 180 | with open(mocked_file_path, 'wb') as f: 181 | f.write('mocked file') 182 | 183 | self.assertEquals('de3a2ccff42d63dc60c6955634d122da', s3concurrent._get_md5(mocked_file_path)) 184 | 185 | download = s3concurrent.is_sync_needed(mocked_key1, mocked_file_path) 186 | self.assertFalse(download) 187 | 188 | @mock.patch('hashlib.md5', side_effect=Exception) 189 | def test_is_sync_needed_error(self, mocked_read_md5): 190 | mocked_key1 = mock.Mock() 191 | download = s3concurrent.is_sync_needed(mocked_key1, sandbox + '/a.txt') 192 | self.assertTrue(download) 193 | 194 | @mock.patch('s3concurrent.s3concurrent.process_a_key') 195 | def test_consume_download_queue(self, mocked_consume_a_key): 196 | mocked_key1 = mock.Mock() 197 | mocked_key2 = mock.Mock() 198 | mocked_key3 = mock.Mock() 199 | 200 | queue = s3concurrent.ProcessKeyQueue() 201 | 202 | queue.queuing_started() 203 | queue.enqueue_item(mocked_key1, sandbox) 204 | queue.enqueue_item(mocked_key2, sandbox) 205 | queue.enqueue_item(mocked_key3, sandbox) 206 | queue.queuing_stopped() 207 | 208 | def mock_dequeue_a_key(queue, action, max_retry): 209 | queue.de_queue_an_item() 210 | 211 | mocked_consume_a_key.side_effect = mock_dequeue_a_key 212 | 213 | self.assertFalse(queue.is_empty()) 214 | self.assertEquals(3, queue.enqueued_counter) 215 | 216 | s3concurrent.consume_queue(queue, 'download', 3, 1) 217 | time.sleep(0.1) 218 | 219 | self.assertTrue(queue.is_empty()) 220 | self.assertEquals(3, queue.de_queue_counter) 221 | 222 | @mock.patch('s3concurrent.s3concurrent.process_a_key') 223 | def test_consume_upload_queue(self, mocked_consume_a_key): 224 | mocked_key1 = mock.Mock() 225 | mocked_key2 = mock.Mock() 226 | mocked_key3 = mock.Mock() 227 | 228 | queue = s3concurrent.ProcessKeyQueue() 229 | 230 | queue.queuing_started() 231 | queue.enqueue_item(mocked_key1, sandbox) 232 | queue.enqueue_item(mocked_key2, sandbox) 233 | queue.enqueue_item(mocked_key3, sandbox) 234 | queue.queuing_stopped() 235 | 236 | def mock_dequeue_a_key(queue, action, max_retry): 237 | queue.de_queue_an_item() 238 | 239 | mocked_consume_a_key.side_effect = mock_dequeue_a_key 240 | 241 | self.assertFalse(queue.is_empty()) 242 | self.assertEquals(3, queue.enqueued_counter) 243 | 244 | s3concurrent.consume_queue(queue, 'upload', 3, 1) 245 | time.sleep(0.1) 246 | 247 | self.assertTrue(queue.is_empty()) 248 | self.assertEquals(3, queue.de_queue_counter) 249 | 250 | @mock.patch('time.sleep') 251 | @mock.patch('s3concurrent.s3concurrent.is_sync_needed', return_value=True) 252 | def test_process_a_key_waiting(self, mocked_is_sync_needed, mocked_sleep): 253 | mock_folder1 = 'a/b/' 254 | mocked_key1 = mock.Mock() 255 | mocked_key1.name = mock_folder1 + 'c' 256 | mocked_key1.get_contents_to_filename = mock.Mock() 257 | 258 | queue = s3concurrent.ProcessKeyQueue() 259 | queue.enqueue_item(mocked_key1, sandbox, 2) 260 | 261 | s3concurrent.process_a_key(queue, 'download', 3) 262 | 263 | mocked_sleep.assert_called_once_with(4) 264 | 265 | @mock.patch('time.sleep') 266 | @mock.patch('s3concurrent.s3concurrent.is_sync_needed', return_value=True) 267 | def test_process_a_key_max_retry(self, mocked_is_sync_needed, mocked_sleep): 268 | mock_folder1 = 'a/b/' 269 | mocked_key1 = mock.Mock() 270 | mocked_key1.name = mock_folder1 + 'c' 271 | mocked_key1.get_contents_to_filename = mock.Mock() 272 | 273 | 
queue = s3concurrent.ProcessKeyQueue() 274 | queue.enqueue_item(mocked_key1, sandbox, 2) 275 | 276 | s3concurrent.process_a_key(queue, 'download', 1) 277 | 278 | self.assertEquals(0, mocked_sleep.call_count) 279 | 280 | def test_get_md5(self): 281 | self.assertEquals( 282 | '032b6af31d2d1be87ff63adb423d270f', 283 | s3concurrent._get_md5(self.temp_filename) 284 | ) 285 | 286 | def test_calculate_s3_etag(self): 287 | self.assertEquals( 288 | '3d6c16c58ab63e8b4f66cb09040eb660-5', 289 | s3concurrent._calculate_s3_etag(self.temp_filename, s3concurrent.AWS_UPLOAD_PART_SIZE) 290 | ) 291 | 292 | def test_s3_etag_match_with_multipart_upload(self): 293 | self.assertTrue( 294 | s3concurrent._s3_etag_match( 295 | '3d6c16c58ab63e8b4f66cb09040eb660-5', 296 | self.temp_filename 297 | ) 298 | ) 299 | 300 | def test_s3_etag_match_with_multipart_upload_incorrect(self): 301 | self.assertFalse( 302 | s3concurrent._s3_etag_match( 303 | '3d6d16c58ab63e8b4f66cb09040eb660-5', 304 | self.temp_filename 305 | ) 306 | ) 307 | 308 | def test_s3_etag_match_with_singlepart_upload(self): 309 | self.assertTrue( 310 | s3concurrent._s3_etag_match( 311 | '032b6af31d2d1be87ff63adb423d270f', 312 | self.temp_filename 313 | ) 314 | ) 315 | 316 | def test_s3_etag_match_with_singlepart_upload_incorrect(self): 317 | self.assertFalse( 318 | s3concurrent._s3_etag_match( 319 | '032b6bf31d2d1be87ff63adb423d270f', 320 | self.temp_filename 321 | ) 322 | ) 323 | 324 | @classmethod 325 | def setUpClass(cls): 326 | # Create a predictable 296.1 MB temporary file 327 | elements = [200, 50, 25] * 9999 328 | cls.temp_filename = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data', "%s.bin" % uuid.uuid4()) 329 | tempfile = open(cls.temp_filename, 'wb') 330 | 331 | for i in xrange(0, 9872): 332 | tempfile.write(bytearray(elements)) 333 | 334 | tempfile.close() 335 | 336 | @classmethod 337 | def tearDownClass(cls): 338 | os.remove(cls.temp_filename) 339 | 340 | def setUp(self): 341 | if os.path.exists(sandbox): 342 | shutil.rmtree(sandbox) 343 | os.makedirs(sandbox) 344 | 345 | def tearDown(self): 346 | shutil.rmtree(sandbox) 347 | 348 | 349 | if __name__ == '__main__': 350 | unittest.main() 351 | 352 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/quid/s3concurrent/b9c2442bed24e5cd3c536d012d0db39e341323c2/setup.cfg -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from pip.req import parse_requirements 3 | 4 | setup( 5 | name='s3concurrent', 6 | version='0.3.0', 7 | author='Quid Inc.', 8 | author_email='ops@quid.com', 9 | packages=['s3concurrent'], 10 | scripts=[], 11 | url='https://github.com/quid/s3concurrent', 12 | license='MIT', 13 | description='A fast S3 downloader/uploader for deep file structures.', 14 | keywords='s3 download upload tools', 15 | long_description=open('README.md').read(), 16 | install_requires=(str(ir.req) for ir in \ 17 | parse_requirements('requirements.txt', session=False) 18 | ), 19 | entry_points={ 20 | 'console_scripts': [ 21 | 's3concurrent_download=s3concurrent.s3concurrent:s3concurrent_download', 22 | 's3concurrent_upload=s3concurrent.s3concurrent:s3concurrent_upload' 23 | ]} 24 | ) 25 | --------------------------------------------------------------------------------
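The console scripts that setup.py registers (`s3concurrent_download`, `s3concurrent_upload`) are thin wrappers around `s3concurrent.main(action, args)`, so the same work can be driven from Python directly. A small sketch; the credentials and bucket name are placeholders:

```python
from s3concurrent import s3concurrent

# Equivalent to the CLI call:
#   s3concurrent_download <s3_key> <s3_secret> <bucket_name> --prefix mirror/pypi --thread_count 20
all_processed = s3concurrent.main(
    'download',
    ['<s3_key>', '<s3_secret>', '<bucket_name>', '--prefix', 'mirror/pypi', '--thread_count', '20'])
```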