├── README ├── sstable2s3.py ├── s32sstable.py └── sstables3.py /README: -------------------------------------------------------------------------------- 1 | Backup your Cassandra SSTables to S3 2 | 3 | Multithreaded to upload multiple files simultaneously. 4 | Uses Multipart Uploads to handle large files and to support resuming interrupted uploads. 5 | GZips tables on the fly so no extra disk space is required for the upload. 6 | 7 | TODO: Parallelize uploading parts of the same file. 8 | -------------------------------------------------------------------------------- /sstable2s3.py: -------------------------------------------------------------------------------- 1 | #!/opt/ActivePython-2.7/bin/python 2 | import boto 3 | # import pyinotify 4 | 5 | from threading import Thread 6 | from optparse import OptionParser 7 | from StringIO import StringIO 8 | import struct 9 | import resource 10 | import logging 11 | import os.path 12 | import socket 13 | import json 14 | import sys 15 | import os 16 | import time 17 | import threading 18 | import datetime 19 | import math 20 | import mimetypes 21 | import hashlib 22 | import io 23 | import pickle 24 | import sqlite3 25 | import zlib 26 | import binascii 27 | from sstables3 import * 28 | 29 | 30 | def main(): 31 | # fix for http://bugs.python.org/issue7980 with strptime 32 | time.strptime('31 Jan 11', '%d %b %y') 33 | 34 | 35 | parser = OptionParser(usage='%prog [options] <bucket> <prefix> <path> <sqlite_db>') 36 | parser.add_option('-k', '--aws-key', dest='aws_key', default=None) 37 | parser.add_option('-s', '--aws-secret', dest='aws_secret', default=None) 38 | parser.add_option('--resume', action='store_true', dest='resume', default=False) 39 | # TODO - if ignoring compacted files, don't upload them nor put them in the manifest files 40 | parser.add_option('--ignore-compacted', action='store_true', dest='ignore_compacted', default=False) 41 | options, args = parser.parse_args() 42 | 43 | if len(args) < 4: 44 | parser.print_help() 45 | return -1 46 | 47 | bucket = args[0] 48 | prefix = args[1] 49 | path = args[2] 50 | sqlite = args[3] 51 | aws_key = options.aws_key 52 | aws_secret = options.aws_secret 53 | resume = options.resume 54 | ignore_compacted = options.ignore_compacted 55 | 56 | wrapper = SSTableS3(aws_key, aws_secret, bucket, prefix) 57 | wrapper.init_sqlite(sqlite) 58 | if not resume: 59 | wrapper.cancel_pending_uploads() 60 | 61 | wrapper.sync_to_bucketPath(path, ignore_compacted) 62 | 63 | if __name__ == '__main__': 64 | sys.exit(main()) 65 | -------------------------------------------------------------------------------- /s32sstable.py: -------------------------------------------------------------------------------- 1 | #!/opt/ActivePython-2.7/bin/python 2 | import boto 3 | 4 | from threading import Thread 5 | from optparse import OptionParser 6 | from StringIO import StringIO 7 | import struct 8 | import resource 9 | import logging 10 | import os.path 11 | import socket 12 | import json 13 | import sys 14 | import os 15 | import time 16 | import threading 17 | import datetime 18 | import math 19 | import mimetypes 20 | import hashlib 21 | import io 22 | import pickle 23 | import sqlite3 24 | import zlib 25 | import binascii 26 | import re 27 | from sstables3 import * 28 | 29 | MAX_THREADS = 4 30 | 31 | def main(): 32 | # fix for http://bugs.python.org/issue7980 with strptime 33 | time.strptime('31 Jan 11', '%d %b %y') 34 | 35 | parser = OptionParser(usage='%prog [options] <bucket> <prefix> [<manifest> <target_path>]') 36 | parser.add_option('-k', '--aws-key', dest='aws_key', default=None) 37 | parser.add_option('-s', '--aws-secret', 
dest='aws_secret', default=None) 38 | parser.add_option('--restore-compacted', action='store_true', dest='restore_compacted', default=False) 39 | parser.add_option('--delete', action='store_true', dest='delete', default=False) 40 | options, args = parser.parse_args() 41 | 42 | aws_key = options.aws_key 43 | aws_secret = options.aws_secret 44 | restore_compacted = options.restore_compacted 45 | 46 | if len(args) >= 2: 47 | bucket = args[0] 48 | prefix = args[1] 49 | wrapper = SSTableS3(aws_key, aws_secret, bucket, prefix) 50 | else: 51 | parser.print_help() 52 | return -1 53 | 54 | 55 | if len(args)==2: 56 | # search the bucket for manifest files 57 | manifests = wrapper.listManifests() 58 | print repr(manifests) 59 | # echo out the manifest file listings 60 | return -1 61 | 62 | if len(args) < 4: 63 | parser.print_help() 64 | return -1 65 | 66 | manifest = args[2] 67 | target_path = args[3] 68 | 69 | wrapper = SSTableS3(aws_key, aws_secret, bucket, prefix) 70 | local_filelist = wrapper.createPathManifest(target_path) 71 | 72 | # download the requested manifest from s3 73 | manifest_data = wrapper.getManifest(manifest) 74 | # parse the manifests 75 | manifest_files = manifest_data['files'] 76 | manifest_files.sort() 77 | 78 | if (restore_compacted == True): 79 | filtered_files = manifest_files 80 | else: 81 | filtered_files = wrapper.filterCompactedFiles(manifest_files) 82 | 83 | paths = [] 84 | for _filename in filtered_files: 85 | # strip filename to last slash '/' 86 | last_slash_idx = _filename.rfind('/') 87 | _path = _filename[0:last_slash_idx] 88 | if _path not in paths: 89 | paths.append(_path) 90 | 91 | # create the appropriate final directory structure 92 | for _path in paths: 93 | fullpath = os.path.join(target_path, _path) 94 | if not os.path.exists(fullpath): 95 | os.makedirs(fullpath) 96 | 97 | 98 | # for f in manifest: 99 | # while True: 100 | # if threading.activeCount() < MAX_THREADS: 101 | # self.thread_wait = 0.015625 102 | # # sys.stderr.write("starting new thread for " + f + " with " + str(threading.activeCount()) + "/" + str(MAX_THREADS) + " threads running\n") 103 | # t = Thread(target=self.syncFileS3, args=(path, f)) 104 | # t.setDaemon(True) 105 | # t.start() 106 | # threadlist.append(t) 107 | # break 108 | # else: 109 | # # sys.stderr.write("sleeping for " + str(self.thread_wait) + " seconds with " + str(threading.activeCount()) + "/" + str(MAX_THREADS) + " threads running\n") 110 | # self.thread_wait = min(self.thread_wait * 2, 60); 111 | # time.sleep(self.thread_wait) 112 | # for t in threadlist: 113 | # t.join() 114 | 115 | # see what files already exist locally in the path 116 | threadlist = [] 117 | for _filename in filtered_files: 118 | fullpath = os.path.join(target_path, _filename) 119 | key = prefix 120 | if not prefix.endswith('/'): 121 | key = key + '/' 122 | key = key + _filename + '.gz' 123 | if os.path.exists(fullpath): 124 | print fullpath + ' already exists.. 
.skipping' 125 | else: 126 | print 'downloading ' + _filename + ' to ' + fullpath 127 | wrapper.downloadGzipFile(key, fullpath) 128 | wrapper.updateFileMtimeFromS3(key, fullpath) 129 | 130 | # copy down each file to a tmp directory 131 | # gunzip each file into the appropriate directories 132 | # set the correct permissions 133 | # delete files that aren't in manifest if 134 | 135 | if __name__ == '__main__': 136 | sys.exit(main()) 137 | -------------------------------------------------------------------------------- /sstables3.py: -------------------------------------------------------------------------------- 1 | import boto 2 | from threading import Thread 3 | from optparse import OptionParser 4 | from StringIO import StringIO 5 | import struct 6 | import resource 7 | import logging 8 | import os.path 9 | import socket 10 | import json 11 | import sys 12 | import os 13 | import time 14 | import threading 15 | import datetime 16 | import math 17 | import mimetypes 18 | import hashlib 19 | import io 20 | import pickle 21 | import sqlite3 22 | import zlib 23 | import binascii 24 | 25 | 26 | MP_CHUNK_READ = 268435456 # 256MB 27 | MAX_THREADS = 5 28 | CRC_INIT = zlib.crc32("") & 0xffffffffL 29 | 30 | def write32u(output, value): 31 | output.write(struct.pack("> 1) & 0x7FFFFFFF 100 | i = i + 1 101 | return sum 102 | 103 | @staticmethod 104 | def crc32_combine(crc1, crc2, len2): 105 | even = [] 106 | odd = [] 107 | if (len2 == 0): # degenerate case 108 | return crc1 109 | 110 | odd.append(0xEDB88320L) # CRC-32 polynomial 111 | row = 1 112 | 113 | for n in range(1, 32): 114 | odd.append(row) 115 | row = row << 1 116 | 117 | even = zlib_crc32.gf2_matrix_square(even, odd) 118 | odd = zlib_crc32.gf2_matrix_square(odd, even) 119 | 120 | while (len2 != 0): 121 | even = zlib_crc32.gf2_matrix_square(even, odd) 122 | if (len2 & 1): 123 | crc1 = zlib_crc32.gf2_matrix_times(even, crc1) 124 | len2 = len2 >> 1 125 | 126 | if (len2 == 0): 127 | break 128 | 129 | odd = zlib_crc32.gf2_matrix_square(odd, even) 130 | if (len2 & 1): 131 | crc1 = zlib_crc32.gf2_matrix_times(odd, crc1) 132 | len2 = len2 >> 1 133 | 134 | crc1 = crc1 ^ crc2 135 | return crc1 136 | 137 | # class FileGz(): 138 | # def __init__(self, filepath, keyname): 139 | # self.filepath = filepath 140 | # 141 | # def single_read(self): 142 | # 143 | # def part_read(self, part_num, seek, readsize): 144 | # 145 | # def header(self): 146 | # h = '\037\213' # magic header 147 | # h = h + '\010' # compression method 148 | # fname = os.path.basename(self.filepath) 149 | # flags = 0 150 | # if fname: 151 | # flags = 8 # FNAME (filename flag) 152 | # h = h + chr(flags) 153 | # mtime = self.fstats.st_mtime 154 | # h = h + struct.pack("L', binascii.unhexlify(row[1].zfill(8)))[0] 287 | _size = row[2] 288 | crc = zlib_crc32.crc32_combine(crc, _crc, _size) 289 | # if (len(crc_list) != len(size_list): # throw some sort of error 290 | return crc 291 | 292 | def startUpload(self): 293 | # check which parts have already been uploaded 294 | missing_part_ids = self.getMissingParts() 295 | for part_num in missing_part_ids: 296 | sys.stderr.write(time.asctime() + ": Buffering part " + str(part_num) + "/" + str(self.part_count) + " for " + self.key_name + "\n") 297 | if self.key_name.endswith('.gz'): 298 | self.uploadPartGz(part_num) 299 | else: 300 | self.uploadPart(part_num) 301 | sys.stderr.write(time.asctime() + ": COMPLETED uploading part " + str(part_num) + "/" + str(self.part_count) + " for " + self.key_name + "\n") 302 | # recheck all the parts again before 
completing the upload 303 | missing_part_ids2 = self.getMissingParts(False) 304 | if len(missing_part_ids2)==0: 305 | self.mpu.complete_upload() 306 | else: 307 | sys.stderr.write(time.asctime() + ": INCOMPLETE upload due to missing parts for " + self.key_name + ", retry the next time around\n") 308 | 309 | class SSTableS3(object): 310 | 311 | def __init__(self, aws_key, aws_secret, bucket, key_prefix): 312 | self.aws_key = aws_key 313 | self.aws_secret = aws_secret 314 | self.bucket = bucket 315 | # remove a trailing slash if it exists 316 | if key_prefix[-1:] == "/": 317 | key_prefix = key_prefix[0:-1] 318 | self.key_prefix = key_prefix 319 | self.connection = boto.connect_s3(self.aws_key, self.aws_secret) 320 | self.bucket_obj = self.connection.get_bucket(bucket) 321 | 322 | def init_sqlite(self, sqlite): 323 | self.sqlite = sqlite 324 | self.sqlite_connection = sqlite3.connect(sqlite) 325 | c = self.sqlite_connection.cursor() 326 | c.execute('''CREATE TABLE IF NOT EXISTS multipartuploads (key_name text, part_number integer, crc text, size integer, PRIMARY KEY (key_name, part_number))''') 327 | self.sqlite_connection.commit() 328 | 329 | def cancel_pending_uploads(self): 330 | multipart_uploads = self.bucket_obj.list_multipart_uploads() 331 | sys.stderr.write("Canceling outstanding multipart uploads\n") 332 | for mpu in multipart_uploads: 333 | if (self.key_prefix in mpu.key_name): # don't want to clobber things we shouldn't be handling 334 | sys.stderr.write("Canceling multipart upload for " + mpu.key_name + "\n") 335 | mpu.cancel_upload() 336 | sys.stderr.write("COMPLETED canceling outstanding multipart uploads\n") 337 | c = self.sqlite_connection.cursor() 338 | c.execute('''DELETE FROM multipartuploads''') 339 | self.sqlite_connection.commit() 340 | sys.stderr.write("TRUNCATE multipartuploads\n") 341 | time.sleep(5) 342 | 343 | def sync_to_bucketPath(self, path, ignore_compacted=False): 344 | manifest = self.createPathManifest(path) 345 | manifest.sort() 346 | if (ignore_compacted == True): 347 | filtered_manifest = self.filterCompactedFiles(manifest) 348 | else: 349 | filtered_manifest = manifest 350 | sys.stderr.write(str(len(filtered_manifest)) + " files in manifest\n") 351 | ts = time.time() 352 | key = self.bucket_obj.new_key('%s/manifests/%s-%s.manifest.json' % (self.key_prefix, socket.getfqdn(), ts)) 353 | key.set_contents_from_string(json.dumps({'files': filtered_manifest, 'path': path, 'prefix': self.key_prefix, 'hostname': socket.getfqdn(), 'timestamp': ts})) 354 | threadlist = [] 355 | for f in filtered_manifest: 356 | while True: 357 | if threading.activeCount() < MAX_THREADS: 358 | self.thread_wait = 0.015625 359 | # sys.stderr.write("starting new thread for " + f + " with " + str(threading.activeCount()) + "/" + str(MAX_THREADS) + " threads running\n") 360 | t = Thread(target=self.syncFileS3, args=(path, f)) 361 | t.setDaemon(True) 362 | t.start() 363 | threadlist.append(t) 364 | break 365 | else: 366 | # sys.stderr.write("sleeping for " + str(self.thread_wait) + " seconds with " + str(threading.activeCount()) + "/" + str(MAX_THREADS) + " threads running\n") 367 | self.thread_wait = min(self.thread_wait * 2, 60); 368 | time.sleep(self.thread_wait) 369 | for t in threadlist: 370 | t.join() 371 | 372 | def syncFileS3(self, pathhead, pathtail): 373 | filepath = os.path.join(pathhead, pathtail) 374 | if self.key_prefix.endswith('/'): 375 | keyname = self.key_prefix + pathtail + '.gz' 376 | else: 377 | keyname = self.key_prefix + '/' + pathtail + '.gz' 378 | connection = 
boto.connect_s3(self.aws_key, self.aws_secret) 379 | bucket_obj = connection.get_bucket(self.bucket) 380 | s3_key = bucket_obj.get_key(keyname) 381 | 382 | if s3_key: 383 | local_fstat = os.stat(filepath) 384 | s3_datetime = datetime.datetime(*time.strptime(s3_key.last_modified, '%a, %d %b %Y %H:%M:%S %Z')[0:6]) 385 | local_datetime = datetime.datetime.utcfromtimestamp(local_fstat.st_mtime) 386 | s3_size = s3_key.size 387 | local_size = local_fstat.st_size 388 | # if local_datetime >= s3_datetime or s3_size != local_size: 389 | if local_datetime > s3_datetime: 390 | self.uploadFileS3(filepath, keyname, True) 391 | else: 392 | self.uploadFileS3(filepath, keyname, True) 393 | 394 | def uploadFileS3(self, filepath, keyname, replace=False): 395 | file_size = os.stat(filepath).st_size 396 | 397 | # Filter.db files GZip TOO well. Upload as a single gz, not multipart upload or S3 will complain the parts are too small 398 | if ('Filter.db' in filepath): 399 | chunk_size = file_size 400 | else: 401 | chunk_size = MP_CHUNK_READ 402 | parts = int(max(math.ceil(file_size / float(chunk_size)), 1)) 403 | 404 | sys.stderr.write(time.asctime() + ": Starting upload of " + filepath + " with " + str(parts) + " parts\n") 405 | start_time = time.clock() 406 | mpfu = MultiPartFileUploader(filepath, self.bucket, keyname, self.aws_key, self.aws_secret, self.sqlite, chunk_size) 407 | mpfu.startUpload() 408 | sys.stderr.write(time.asctime() + ": Finished upload of " + filepath + " with " + str(parts) + " parts\n") 409 | 410 | def createPathManifest(self, filepath): 411 | lsr = self.listFiles(filepath) 412 | manifest = [] 413 | for i in lsr: 414 | newpath = self.trimPath(i, filepath) 415 | if newpath.find('-tmp') == -1 and newpath.find('snapshots') == -1: 416 | manifest.append(newpath) 417 | return manifest 418 | 419 | def trimPath(self, path, relativeTo): 420 | # path = os.path.relpath(i, filepath) 421 | if path.startswith(relativeTo): 422 | replaced_path = path.replace(relativeTo, '', 1) 423 | if replaced_path.startswith('/'): 424 | replaced_path = replaced_path.replace('/', '', 1) 425 | path = replaced_path 426 | return path 427 | 428 | def listFiles(self, path): 429 | files = [] 430 | if (os.path.isdir(path) and not os.path.islink(path)): 431 | for i in os.listdir(path): 432 | filepath = os.path.join(path, i); 433 | if (os.path.isfile(filepath) and not os.path.islink(filepath)): 434 | files.append(filepath) 435 | else: 436 | files.extend(self.listFiles(os.path.join(path, i))) 437 | return files 438 | 439 | def listManifests(self): 440 | searchPrefix = self.key_prefix + '/manifests/' 441 | manifests = self.bucket_obj.get_all_keys(prefix=searchPrefix) 442 | return manifests 443 | 444 | def getManifest(self, manifest): 445 | key_obj = self.bucket_obj.get_key(manifest) 446 | manifest_contents = json.loads(key_obj.get_contents_as_string()) 447 | return manifest_contents 448 | 449 | def downloadGzipFile(self, key, filepath): 450 | key_obj = self.bucket_obj.get_key(key) 451 | # decompression obj 452 | d = StreamDecompressor(filepath) 453 | key_obj.get_contents_to_file(d) 454 | d.close() 455 | 456 | def updateFileMtimeFromS3(self, key, filepath): 457 | key_obj = self.bucket_obj.get_key(key) 458 | s3_datetime = datetime.datetime(*time.strptime(key_obj.last_modified, '%a, %d %b %Y %H:%M:%S %Z')[0:6]) 459 | mtime = (s3_datetime - datetime.datetime(1970, 1, 1)).total_seconds() 460 | # set the atime and mtime of the local file to the mtime of the s3 object 461 | os.utime(filepath, (mtime, mtime)) 462 | 463 | def 
filterCompactedFiles(self, files): 464 | files.sort() 465 | compacted_list = [] 466 | 467 | # figure out the compacted file prefixes 468 | for _filename in files: 469 | if (_filename.endswith('-Compacted')): 470 | str_idx = _filename.rfind('-Compacted') 471 | compacted_list.append(_filename[0:str_idx+1]) 472 | 473 | filtered_files = [] 474 | 475 | # now loop through the compacted file prefixes and remove all the files related to the compacted sstables 476 | for _filename in files: 477 | found_match = False 478 | for file_prefix in compacted_list: 479 | if file_prefix in _filename: 480 | found_match = True 481 | break 482 | if not found_match: 483 | filtered_files.append(_filename) 484 | 485 | return filtered_files 486 | --------------------------------------------------------------------------------
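
The per-part CRC-32s and sizes kept in the multipartuploads sqlite table are folded together with zlib_crc32.crc32_combine, evidently so that a single CRC-32 covering all the uploaded parts can be produced without re-reading the data. Below is a minimal, self-contained sketch of that combining step; it is not taken from the modules above, and the helper names are illustrative only.

import zlib


def _gf2_matrix_times(mat, vec):
    # multiply a 32x32 GF(2) matrix (list of column vectors) by a vector
    total = 0
    i = 0
    while vec:
        if vec & 1:
            total ^= mat[i]
        vec >>= 1
        i += 1
    return total


def _gf2_matrix_square(mat):
    # square a GF(2) matrix: result[n] = mat * mat[n]
    return [_gf2_matrix_times(mat, mat[n]) for n in range(32)]


def crc32_combine(crc1, crc2, len2):
    # port of zlib's crc32_combine(): shift crc1 past len2 zero bytes,
    # then xor in crc2 (the CRC of the second block)
    if len2 == 0:
        return crc1
    odd = [0xEDB88320]              # CRC-32 polynomial, reflected form
    row = 1
    for _ in range(31):
        odd.append(row)
        row <<= 1
    even = _gf2_matrix_square(odd)  # operator for two zero bits
    odd = _gf2_matrix_square(even)  # operator for four zero bits
    while True:
        # first square yields the operator for one zero byte (eight zero bits)
        even = _gf2_matrix_square(odd)
        if len2 & 1:
            crc1 = _gf2_matrix_times(even, crc1)
        len2 >>= 1
        if len2 == 0:
            break
        odd = _gf2_matrix_square(even)
        if len2 & 1:
            crc1 = _gf2_matrix_times(odd, crc1)
        len2 >>= 1
        if len2 == 0:
            break
    return crc1 ^ crc2


if __name__ == '__main__':
    # combining the CRCs of two chunks matches the CRC of the concatenation
    part1 = b'first chunk of an sstable'
    part2 = b'second chunk of an sstable'
    crc1 = zlib.crc32(part1) & 0xffffffff
    crc2 = zlib.crc32(part2) & 0xffffffff
    assert crc32_combine(crc1, crc2, len(part2)) == (zlib.crc32(part1 + part2) & 0xffffffff)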