├── tests
│   ├── __init__.py
│   ├── mockboto.py
│   └── s3mysqldump_tests.py
├── MANIFEST.in
├── .gitignore
├── README.rst
├── bin
│   └── s3mysqldump
├── setup.py
└── s3mysqldump.py

--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include README.rst

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.#*
*.pyc
*~
*.swp

build
dist
s3mysqldump.egg-info

--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
s3mysqldump
===========

**s3mysqldump** is a tool to dump MySQL tables to S3, so they can be consumed
by Elastic MapReduce, etc.

Installation
============

From source::

    python setup.py install

A Simple Example
================

The following command dumps the ``user`` table in the ``db`` database to the
S3 bucket ``emr-storage``. ``my.cnf`` supplies the MySQL connection
parameters, and ``boto.cfg`` is the configuration file for the S3 connection
(AWS credentials, etc.).

``s3mysqldump -v --force -m my.cnf -s -b boto.cfg db user s3://emr-storage/user.sql``
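The same dump can also be driven from Python. A minimal sketch, equivalent to
the command above and assuming the same ``my.cnf`` and ``boto.cfg`` files
exist (this is exactly how the ``bin/s3mysqldump`` wrapper and the test suite
call the module)::

    import s3mysqldump

    s3mysqldump.main([
        '-v', '--force',
        '-m', 'my.cnf',    # MySQL credentials
        '-s',              # single-row INSERT format
        '-b', 'boto.cfg',  # AWS credentials
        'db', 'user',      # database, then table(s)
        's3://emr-storage/user.sql',
    ])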
--------------------------------------------------------------------------------
/bin/s3mysqldump:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# Copyright 2011 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys

import s3mysqldump

if __name__ == '__main__':
    s3mysqldump.main(sys.argv[1:])

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
try:
    from setuptools import setup
except ImportError:
    from distutils.core import setup


setup(
    author='David Marin',
    author_email='dave@yelp.com',
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: Apache Software License',
        'Natural Language :: English',
        'Operating System :: OS Independent',
        'Programming Language :: Python',
        'Programming Language :: Python :: 2.5',
        'Programming Language :: Python :: 2.6',
        'Programming Language :: Python :: 2.7',
        'Topic :: Database',
        'Topic :: System :: Archiving :: Mirroring',
        'Topic :: System :: Distributed Computing',
        'Topic :: Utilities',
    ],
    description='Dump MySQL tables to S3, and parse them',
    install_requires=['boto>=1.9'],
    license='Apache',
    long_description=open('README.rst').read(),
    name='s3mysqldump',
    provides=['s3mysqldump'],
    py_modules=['s3mysqldump'],
    scripts=['bin/s3mysqldump'],
    url='http://github.com/Yelp/s3mysqldump',
    version='0.1',
)
22 | """ 23 | from __future__ import with_statement 24 | 25 | import boto.exception 26 | 27 | 28 | ### S3 ### 29 | 30 | def add_mock_s3_data(mock_s3_fs, data): 31 | """Update mock_s3_fs (which is just a dictionary mapping bucket to 32 | key to contents) with a map from bucket name to key name to data.""" 33 | for bucket_name, key_name_to_bytes in data.iteritems(): 34 | mock_s3_fs.setdefault(bucket_name, {}) 35 | bucket = mock_s3_fs[bucket_name] 36 | 37 | for key_name, bytes in key_name_to_bytes.iteritems(): 38 | bucket[key_name] = bytes 39 | 40 | 41 | class MockS3Connection(object): 42 | """Mock out boto.s3.Connection 43 | """ 44 | def __init__(self, aws_access_key_id=None, aws_secret_access_key=None, 45 | is_secure=True, port=None, proxy=None, proxy_port=None, 46 | proxy_user=None, proxy_pass=None, 47 | host=None, debug=0, https_connection_factory=None, 48 | calling_format=None, path='/', provider='aws', 49 | bucket_class=None, mock_s3_fs=None): 50 | """Mock out a connection to S3. Most of these args are the same 51 | as for the real S3Connection, and are ignored. 52 | 53 | You can set up a mock filesystem to share with other objects 54 | by specifying mock_s3_fs. The mock filesystem is just a map 55 | from bucket name to key name to bytes. 56 | """ 57 | self.mock_s3_fs = {} if mock_s3_fs is None else mock_s3_fs 58 | self.endpoint = host or 's3.amazonaws.com' 59 | self.aws_access_key_id = aws_access_key_id 60 | self.aws_secret_access_key = aws_secret_access_key 61 | 62 | def get_bucket(self, bucket_name): 63 | if bucket_name in self.mock_s3_fs: 64 | return MockBucket(connection=self, name=bucket_name) 65 | else: 66 | raise boto.exception.S3ResponseError(404, 'Not Found') 67 | 68 | def create_bucket(self, bucket_name, headers=None, location='', 69 | policy=None): 70 | if bucket_name in self.mock_s3_fs: 71 | raise boto.exception.S3CreateError(409, 'Conflict') 72 | else: 73 | self.mock_s3_fs[bucket_name] = {} 74 | 75 | 76 | class MockBucket: 77 | """Mock out boto.s3.Bucket 78 | """ 79 | def __init__(self, connection=None, name=None): 80 | """You can optionally specify a 'data' argument, which will instantiate 81 | mock keys and mock data. data should be a map from key name to bytes. 82 | """ 83 | self.name = name 84 | self.connection = connection 85 | 86 | def new_key(self, key_name): 87 | mock_s3_fs = self.connection.mock_s3_fs 88 | 89 | if key_name not in mock_s3_fs[self.name]: 90 | mock_s3_fs[self.name][key_name] = '' 91 | return MockKey(bucket=self, name=key_name) 92 | 93 | def get_key(self, key_name): 94 | mock_s3_fs = self.connection.mock_s3_fs 95 | 96 | if key_name in mock_s3_fs[self.name]: 97 | return MockKey(bucket=self, name=key_name) 98 | else: 99 | return None 100 | 101 | 102 | class MockKey(object): 103 | """Mock out boto.s3.Key""" 104 | 105 | def __init__(self, bucket=None, name=None): 106 | """You can optionally specify a 'data' argument, which will fill 107 | the key with mock data. 108 | """ 109 | self.bucket = bucket 110 | self.name = name 111 | 112 | def set_contents_from_filename(self, filename, cb=None, num_cb=0): 113 | mock_s3_fs = self.bucket.connection.mock_s3_fs 114 | with open(filename) as f: 115 | f.seek(0) 116 | contents = f.read() 117 | mock_s3_fs[self.bucket.name][self.name] = contents 118 | -------------------------------------------------------------------------------- /tests/s3mysqldump_tests.py: -------------------------------------------------------------------------------- 1 | """Unit tests for s3mysqldump. 2 | 3 | These tests mock out mysqldump and s3. 
4 | """ 5 | from __future__ import with_statement 6 | 7 | import datetime 8 | import logging 9 | import os 10 | import sys 11 | import tempfile 12 | 13 | import boto 14 | import s3mysqldump 15 | 16 | from testify import TestCase 17 | from testify import assert_equal 18 | from testify import assert_in 19 | from testify import assert_raises 20 | from testify import run 21 | from testify import setup 22 | from testify import teardown 23 | 24 | from tests.mockboto import MockS3Connection 25 | from tests.mockboto import add_mock_s3_data 26 | 27 | 28 | class MockS3AndMysqldumpTestCase(TestCase): 29 | 30 | @setup 31 | def use_echo_as_mysqldump(self): 32 | """Replace mysqldump with echo, so we can what command lines 33 | were outputted.""" 34 | self.monkey_patch(s3mysqldump, 'DEFAULT_MYSQLDUMP_BIN', 'echo') 35 | 36 | @setup 37 | def wrap_make_option_parser(self): 38 | """Wrap the option parser so that it doesn't print help or 39 | error messages; instead it updates self.times_help_printed 40 | and self.parser_errors. 41 | 42 | It will still exit on errors; be prepared to catch SystemExit 43 | """ 44 | self.times_help_printed = 0 45 | self.parser_errors = [] 46 | 47 | real_make_option_parser = s3mysqldump.make_option_parser 48 | 49 | def fake_print_help(): 50 | self.times_help_printed += 1 51 | 52 | def fake_error(msg): 53 | self.parser_errors.append(msg) 54 | sys.exit(1) 55 | 56 | def wrapper(): 57 | parser = real_make_option_parser() 58 | parser.print_help = fake_print_help 59 | parser.error = fake_error 60 | return parser 61 | 62 | self.monkey_patch(s3mysqldump, 'make_option_parser', wrapper) 63 | 64 | @setup 65 | def sandbox_s3(self): 66 | """Mock out the S3 filesystem. self.mock_s3_fs will be a map 67 | from bucket name to key name to contents. 68 | 69 | Also, add a single bucket named 'walrus' 70 | """ 71 | self.mock_s3_fs = {} 72 | self.aws_access_key_id = None 73 | self.aws_secret_access_key = None 74 | 75 | def mock_boto_connect_s3(*args, **kwargs): 76 | kwargs['mock_s3_fs'] = self.mock_s3_fs 77 | 78 | # keep track of credentials passed explicitly to connect_s3() 79 | if 'aws_access_key_id' in kwargs: 80 | self.aws_access_key_id = kwargs['aws_access_key_id'] 81 | if 'aws_secret_access_key' in kwargs: 82 | self.aws_secret_access_key = kwargs['aws_secret_access_key'] 83 | 84 | return MockS3Connection(*args, **kwargs) 85 | 86 | self.monkey_patch(boto, 'connect_s3', mock_boto_connect_s3) 87 | 88 | add_mock_s3_data(self.mock_s3_fs, {'walrus': {}}) 89 | 90 | @setup 91 | def disable_s3mysqldump_logging(self): 92 | """Quiet logging messages from s3mysqldump.""" 93 | self.monkey_patch(logging.getLogger('s3mysqldump'), 'disabled', True) 94 | 95 | @setup 96 | def wrap_get_current_time(self): 97 | """Monkey-patch datetime.datetime so that we can set current 98 | time with self.set_now_to() and self.set_utcnow_to()""" 99 | real_get_current_time = s3mysqldump.get_current_time 100 | self._now = None 101 | self._utcnow = None 102 | 103 | def fake_get_current_time(utc=False): 104 | result = self._utcnow if utc else self._now 105 | return result or real_get_current_time(utc=utc) 106 | 107 | self.monkey_patch( 108 | s3mysqldump, 'get_current_time', fake_get_current_time) 109 | 110 | def set_now_to(self, now): 111 | """Monkey-patch the value of now() returned by datetime.datetime.now() 112 | (and utcnow()). 

    def set_now_to(self, now):
        """Set the value returned by get_current_time() for local time.
        Set this to None to use the real current time."""
        self._now = now

    def set_utcnow_to(self, utcnow):
        """Set the value returned by get_current_time(utc=True).
        Set this to None to use the real current time."""
        self._utcnow = utcnow

    def monkey_patch(self, obj, field, value):
        """Monkey-patch obj.field with value. This will be undone
        after each test."""
        if not hasattr(self, '_monkey_patches'):
            self._monkey_patches = []
        real_value = getattr(obj, field)
        self._monkey_patches.append((obj, field, real_value))
        setattr(obj, field, value)

    @teardown
    def un_monkey_patch(self):
        """Undo monkey patching."""
        if not hasattr(self, '_monkey_patches'):
            return
        # I suspect it's better to do this in reverse order, though
        # it may not matter
        for obj, field, real_value in reversed(self._monkey_patches):
            setattr(obj, field, real_value)

    def check_s3(self, bucket, key, args):
        """Check that mysqldump was run with the given args, and the
        contents copied to S3.
        """
        assert_in(bucket, self.mock_s3_fs)
        assert_in(key, self.mock_s3_fs[bucket])
        contents = self.mock_s3_fs[bucket][key]

        # we run "echo" in place of mysqldump, so the key's contents
        # should just be the arguments we passed to echo
        assert_equal(contents.rstrip(), args)


class TestTablesAndDatabases(MockS3AndMysqldumpTestCase):
    """Tests of specifying databases and the %D and %T options"""

    def test_basic_case(self):
        s3mysqldump.main(['foo', 's3://walrus/foo.sql'])
        self.check_s3('walrus', 'foo.sql', '--tables -- foo')

    def test_percent_D(self):
        s3mysqldump.main(['foo', 's3://walrus/%D.sql'])
        self.check_s3('walrus', 'foo.sql', '--tables -- foo')

    def test_no_percent_T_without_tables(self):
        assert_raises(SystemExit,
                      s3mysqldump.main, ['foo', 's3://walrus/%D/%T.sql'])

    def test_one_table(self):
        s3mysqldump.main(['foo', 'bar', 's3://walrus/foo.sql'])
        self.check_s3('walrus', 'foo.sql', '--tables -- foo bar')

    def test_percent_T_on_one_table(self):
        s3mysqldump.main(['foo', 'bar', 's3://walrus/%T.sql'])
        self.check_s3('walrus', 'bar.sql', '--tables -- foo bar')

    def test_percent_D_and_T_on_one_table(self):
        s3mysqldump.main(['foo', 'bar', 's3://walrus/%D/%T.sql'])
        self.check_s3('walrus', 'foo/bar.sql', '--tables -- foo bar')

    def test_many_tables(self):
        s3mysqldump.main(['foo', 'bar', 'baz', 'qux',
                          's3://walrus/foo.sql'])
        self.check_s3('walrus', 'foo.sql', '--tables -- foo bar baz qux')

    def test_percent_T_on_many_tables(self):
        s3mysqldump.main(['foo', 'bar', 'baz', 'qux',
                          's3://walrus/%T.sql'])
        self.check_s3('walrus', 'bar.sql', '--tables -- foo bar')
        self.check_s3('walrus', 'baz.sql', '--tables -- foo baz')
        self.check_s3('walrus', 'qux.sql', '--tables -- foo qux')

    def test_percent_D_and_T_on_many_tables(self):
        s3mysqldump.main(['foo', 'bar', 'baz', 'qux',
                          's3://walrus/%D/%T.sql'])
        self.check_s3('walrus', 'foo/bar.sql', '--tables -- foo bar')
        self.check_s3('walrus', 'foo/baz.sql', '--tables -- foo baz')
        self.check_s3('walrus', 'foo/qux.sql', '--tables -- foo qux')

    def test_one_database(self):
        s3mysqldump.main(['-B', 'foo', 's3://walrus/foo.sql'])
        self.check_s3('walrus', 'foo.sql', '--databases -- foo')

    def test_percent_D_with_one_database(self):
        s3mysqldump.main(['-B', 'foo', 's3://walrus/%D.sql'])
        self.check_s3('walrus', 'foo.sql', '--databases -- foo')

    def test_no_percent_T_with_databases_mode(self):
        assert_raises(SystemExit,
                      s3mysqldump.main, ['-B', 'foo', 's3://walrus/%D/%T.sql'])

    def test_many_databases(self):
        s3mysqldump.main(['-B', 'foo1', 'foo2', 'foo3', 's3://walrus/foo.sql'])
        self.check_s3('walrus', 'foo.sql', '--databases -- foo1 foo2 foo3')

    def test_percent_D_with_many_databases(self):
        s3mysqldump.main(['-B', 'foo1', 'foo2', 'foo3', 's3://walrus/%D.sql'])
        self.check_s3('walrus', 'foo1.sql', '--databases -- foo1')
        self.check_s3('walrus', 'foo2.sql', '--databases -- foo2')
        self.check_s3('walrus', 'foo3.sql', '--databases -- foo3')

    def test_all_databases(self):
        s3mysqldump.main(['-A', 's3://walrus/dbs.sql'])
        self.check_s3('walrus', 'dbs.sql', '--all-databases')

    def test_no_names_with_all_databases(self):
        assert_raises(SystemExit,
                      s3mysqldump.main, ['-A', 'foo', 's3://walrus/foo.sql'])

    def test_no_percent_T_with_all_databases(self):
        assert_raises(SystemExit,
                      s3mysqldump.main, ['-A', 's3://walrus/%T.sql'])

    def test_no_percent_D_with_all_databases(self):
        assert_raises(SystemExit,
                      s3mysqldump.main, ['-A', 's3://walrus/%D.sql'])


class TestInterpolation(MockS3AndMysqldumpTestCase):

    @setup
    def set_now(self):
        self.set_now_to(datetime.datetime(2010, 6, 6, 4, 26))
        self.set_utcnow_to(datetime.datetime(2010, 6, 6, 11, 26))

    def test_date_interpolation(self):
        s3mysqldump.main(['foo', 's3://walrus/%Y/%m/%d/foo.sql'])
        self.check_s3('walrus', '2010/06/06/foo.sql', '--tables -- foo')

    def test_time_interpolation(self):
        s3mysqldump.main(['foo', 's3://walrus/%Y/%m/%d/%H:%M/foo.sql'])
        self.check_s3('walrus', '2010/06/06/04:26/foo.sql', '--tables -- foo')

    def test_utc(self):
        s3mysqldump.main(
            ['foo', '--utc', 's3://walrus/%Y/%m/%d/%H:%M/foo.sql'])
        self.check_s3('walrus', '2010/06/06/11:26/foo.sql', '--tables -- foo')

    def test_date_and_percent_D_and_T(self):
        s3mysqldump.main(['foo', 'bar', 'baz', 'qux',
                          's3://walrus/%Y/%m/%d/%D/%T.sql'])
        self.check_s3(
            'walrus', '2010/06/06/foo/bar.sql', '--tables -- foo bar')
        self.check_s3(
            'walrus', '2010/06/06/foo/baz.sql', '--tables -- foo baz')
        self.check_s3(
            'walrus', '2010/06/06/foo/qux.sql', '--tables -- foo qux')

    def test_percent_escaping(self):
        # %D, %T aren't allowed with -A, so check that we don't
        # interpret %%D and %%T as these fields
        s3mysqldump.main(['-A', 's3://walrus/%%Y%%m%%d/%Y/%m/%d/%%D%%T.sql'])
        self.check_s3(
            'walrus', '%Y%m%d/2010/06/06/%D%T.sql', '--all-databases')
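

# A hedged sketch of the interpolation rules the tests above pin down
# (resolve_s3_uri_format is defined in s3mysqldump.py):
#
#     now = datetime.datetime(2010, 6, 6, 4, 26)
#     s3mysqldump.resolve_s3_uri_format('s3://walrus/%Y/%m/%d/%D/%T.sql',
#                                       now, database='foo', table='bar')
#     # -> 's3://walrus/2010/06/06/foo/bar.sql'
#
#     # doubled percents escape strftime fields as well as %D/%T:
#     s3mysqldump.resolve_s3_uri_format('s3://walrus/%%D/%T.sql',
#                                       now, database='foo', table='bar')
#     # -> 's3://walrus/%D/bar.sql'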


class TestBotoConfig(MockS3AndMysqldumpTestCase):

    @setup
    def make_boto_cfg(self):
        _, self.boto_cfg = tempfile.mkstemp(prefix='boto.cfg')
        with open(self.boto_cfg, 'w') as f:
            f.write('[Credentials]\n')
            f.write('aws_access_key_id = 12345678910\n')
            f.write('aws_secret_access_key = ABCDEFGHIJKLMNOPQRSTUVWXYZ\n')

    @teardown
    def rm_boto_cfg(self):
        os.unlink(self.boto_cfg)

    def test_no_boto_cfg(self):
        s3mysqldump.main(['foo', 's3://walrus/foo.sql'])
        assert_equal(self.aws_access_key_id, None)
        assert_equal(self.aws_secret_access_key, None)

    def test_with_boto_cfg(self):
        s3mysqldump.main(['-b', self.boto_cfg, 'foo', 's3://walrus/foo.sql'])
        assert_equal(self.aws_access_key_id, '12345678910')
        assert_equal(self.aws_secret_access_key, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ')


if __name__ == '__main__':
    run()

--------------------------------------------------------------------------------
/s3mysqldump.py:
--------------------------------------------------------------------------------
# Copyright 2011 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Dump MySQL tables to S3, so they can be consumed by Elastic MapReduce, etc.
"""
from __future__ import with_statement

__author__ = 'David Marin <dave@yelp.com>'
__version__ = '0.1'

import datetime
import glob
import logging
import optparse
import os
import pipes
import re
import shlex
import shutil
import socket
import subprocess
import sys
import tempfile
import time

import boto
import boto.pyami.config


log = logging.getLogger('s3mysqldump')


DEFAULT_MYSQLDUMP_BIN = 'mysqldump'

SINGLE_ROW_FORMAT_OPTS = [
    # --skip-opt causes an out-of-memory error on 5.0.91, so pass the
    # individual options explicitly instead
    # '--skip-opt',
    '--compact',
    '--complete-insert',
    '--default_character_set=utf8',
    '--hex-blob',
    '--no-create-db',
    '--no-create-info',
    '--quick',
    '--skip-lock-tables',
    '--skip-extended-insert',
]


S3_URI_RE = re.compile(r'^s3n?://(.*?)/(.*)$')

S3_MAX_PUT_SIZE = 4 * 1024 * 1024 * 1024  # the real limit is 5GB; use 4GB to be safe

# match directives in a strftime format string (e.g. '%Y-%m-%d')
# for fully correct handling of percent literals (e.g. don't match %T in %%T)
STRFTIME_FIELD_RE = re.compile('%(.)')
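
# A quick sketch of why the scan above handles percent literals: re.findall
# consumes '%%' as a single match whose group is '%', so the 'T' in '%%T'
# is never seen as a field:
#
#     STRFTIME_FIELD_RE.findall('%%T/%T')  # -> ['%', 'T']
#     STRFTIME_FIELD_RE.findall('%%T')     # -> ['%']  (no table field)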


def main(args):
    """Run the mysqldump utility.

    :param list args: alternate command line arguments (normally we read from
                      ``sys.argv[1:]``)
    """
    databases, tables, s3_uri_format, options = parse_args(args)

    now = get_current_time(utc=options.utc)

    # set up logging
    if not options.quiet:
        log_to_stream(name='s3mysqldump', debug=options.verbose)

    s3_conn = connect_s3(boto_cfg=options.boto_cfg, host=options.s3_endpoint)

    extra_opts = parse_opts(options.mysqldump_extra_opts)

    # helper function, to call once, or once per table, below
    def mysqldump_to_s3(s3_uri, databases=None, tables=None):
        if not options.force and s3_key_exists(s3_conn, s3_uri):
            log.warn('%s already exists; use --force to overwrite' % (s3_uri,))
            return

        log.info('dumping %s -> %s' % (dump_desc(databases, tables), s3_uri))
        with tempfile.NamedTemporaryFile(prefix='s3mysqldump-') as file:
            # dump to a temp file
            success = mysqldump_to_file(
                file, databases, tables,
                mysqldump_bin=options.mysqldump_bin,
                my_cnf=options.my_cnf,
                extra_opts=extra_opts,
                single_row_format=options.single_row_format)

            # upload to S3 (if mysqldump worked!)
            if success:
                log.debug(' %s -> %s' % (file.name, s3_uri))
                start = time.time()

                s3_key = make_s3_key(s3_conn, s3_uri)
                if os.path.getsize(file.name) > S3_MAX_PUT_SIZE:
                    upload_multipart(s3_key, file.name)
                else:
                    log.debug('Upload to %r' % s3_key)
                    upload_singlepart(s3_key, file.name)
                log.debug(' Done in %.1fs' % (time.time() - start))

    # output to separate files, if specified by %T and %D
    if has_table_field(s3_uri_format):
        assert len(databases) == 1
        database = databases[0]
        for table in tables:
            s3_uri = resolve_s3_uri_format(s3_uri_format, now, database, table)
            mysqldump_to_s3(s3_uri, [database], [table])
    elif has_database_field(s3_uri_format):
        for database in databases:
            s3_uri = resolve_s3_uri_format(s3_uri_format, now, database)
            mysqldump_to_s3(s3_uri, [database], tables)
    else:
        s3_uri = resolve_s3_uri_format(s3_uri_format, now)
        mysqldump_to_s3(s3_uri, databases, tables)


def get_current_time(utc=False):
    """Get the current time. This is broken out so we can monkey-patch
    it for testing."""
    if utc:
        return datetime.datetime.utcnow()
    else:
        return datetime.datetime.now()


def dump_desc(databases=None, tables=None):
    """Return a description of the given databases and tables, for logging"""
    if not databases:
        return 'all databases'
    elif not tables:
        if len(databases) == 1:
            return databases[0]
        else:
            return '{%s}' % ','.join(databases)
    elif len(tables) == 1:
        return '%s.%s' % (databases[0], tables[0])
    else:
        return '%s.{%s}' % (databases[0], ','.join(tables))
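
# Sketch of the description shapes dump_desc() produces:
#
#     dump_desc()                         # -> 'all databases'
#     dump_desc(['foo', 'bar'])           # -> '{foo,bar}'
#     dump_desc(['foo'], ['bar'])         # -> 'foo.bar'
#     dump_desc(['foo'], ['bar', 'baz'])  # -> 'foo.{bar,baz}'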


def has_database_field(s3_uri_format):
    """Check if s3_uri_format contains %D (which is meant to be replaced
    with the database name). But don't accidentally consume percent
    literals (e.g. ``%%D``).
    """
    return 'D' in STRFTIME_FIELD_RE.findall(s3_uri_format)


def has_table_field(s3_uri_format):
    """Check if s3_uri_format contains %T (which is meant to be replaced
    with the table name). But don't accidentally consume percent
    literals (e.g. ``%%T``).
    """
    return 'T' in STRFTIME_FIELD_RE.findall(s3_uri_format)


def resolve_s3_uri_format(s3_uri_format, now, database=None, table=None):
    """Run :py:meth:`~datetime.datetime.strftime` on *s3_uri_format*,
    and also replace ``%D`` with *database* and ``%T`` with *table*.

    :param string s3_uri_format: s3 URI, possibly with strftime fields
    :param now: current time, as a :py:class:`~datetime.datetime`
    :param string database: database name.
    :param string table: table name.
    """
    def replacer(match):
        if match.group(1) == 'D' and database is not None:
            return database
        elif match.group(1) == 'T' and table is not None:
            return table
        else:
            return match.group(0)

    return now.strftime(STRFTIME_FIELD_RE.sub(replacer, s3_uri_format))


def parse_args(args):
    """Parse command-line arguments.

    :param list args: alternate command line arguments (normally we read from
                      ``sys.argv[1:]``)

    :return: *databases*, *tables*, *s3_uri_format*, *options*
    """
    parser = make_option_parser()

    if not args:
        parser.print_help()
        sys.exit()

    options, args = parser.parse_args(args)

    s3_uri_format = args[-1]
    if not S3_URI_RE.match(s3_uri_format):
        parser.error('Invalid s3_uri_format: %r' % s3_uri_format)

    if options.mode == 'tables':
        if len(args) < 2:
            parser.error('You must specify at least db_name and s3_uri_format')

        databases = args[:1]
        tables = args[1:-1]
    elif options.mode == 'databases':
        if len(args) < 2:
            parser.error('You must specify at least db_name and s3_uri_format')

        databases = args[:-1]
        tables = None
    else:
        assert options.mode == 'all_databases'
        if len(args) > 1:
            parser.error("Don't specify database names with --all-databases")
        databases = None
        tables = None

    if has_table_field(s3_uri_format) and not tables:
        parser.error('If you use %T, you must specify one or more tables')

    if has_database_field(s3_uri_format) and not databases:
        parser.error('If you use %D, you must specify one or more databases'
                     ' (use %d for day of month)')

    return databases, tables, s3_uri_format, options


def connect_s3(boto_cfg=None, **kwargs):
    """Make a connection to S3 using :py:mod:`boto` and return it.

    :param string boto_cfg: Optional path to boto.cfg file to read credentials
                            from
    :param kwargs: Optional additional keyword args to pass to
                   :py:func:`boto.connect_s3`. Keyword args set to ``None``
                   will be filtered out (so we can use boto's defaults).
    """
    if boto_cfg:
        configs = boto.pyami.config.Config(path=boto_cfg)
        kwargs['aws_access_key_id'] = configs.get(
            'Credentials', 'aws_access_key_id')
        kwargs['aws_secret_access_key'] = configs.get(
            'Credentials', 'aws_secret_access_key')
    kwargs = dict((k, v) for k, v in kwargs.iteritems() if v is not None)
    return boto.connect_s3(**kwargs)


def s3_key_exists(s3_conn, s3_uri):
    bucket_name, key_name = parse_s3_uri(s3_uri)
    bucket = s3_conn.get_bucket(bucket_name)
    return bool(bucket.get_key(key_name))
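
# A hedged sketch of the boto.cfg [Credentials] section that
# connect_s3(boto_cfg=...) reads (the same format TestBotoConfig writes):
#
#     [Credentials]
#     aws_access_key_id = <your access key>
#     aws_secret_access_key = <your secret key>
#
# and of putting the two helpers above together ('/path/to/boto.cfg' is a
# hypothetical path):
#
#     conn = connect_s3(boto_cfg='/path/to/boto.cfg')
#     s3_key_exists(conn, 's3://walrus/foo.sql')  # -> True or False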


def make_s3_key(s3_conn, s3_uri):
    """Get the S3 key corresponding to *s3_uri*, creating it if it doesn't
    exist.
    """
    bucket_name, key_name = parse_s3_uri(s3_uri)
    bucket = s3_conn.get_bucket(bucket_name)
    s3_key = bucket.get_key(key_name)
    if s3_key:
        return s3_key
    else:
        return bucket.new_key(key_name)


def sleeping_callback(t):
    """Return a callback function that sleeps for t seconds"""
    return lambda _, __: time.sleep(t)

S3_ATTEMPTS = 4  # number of times to retry failed uploads
S3_THROTTLE = 60  # number of progress callbacks per upload (each one sleeps)


def upload_multipart(s3_key, large_file):
    """Split large_file into chunks suitable for multipart upload, then
    upload each chunk."""
    split_dir = tempfile.mkdtemp(prefix='s3mysqldump-split-')
    split_prefix = '%s/part-' % split_dir

    args = ['split', '--line-bytes=%u' % S3_MAX_PUT_SIZE, '--suffix-length=5',
            '-d', large_file, split_prefix]
    log.info(' '.join(pipes.quote(arg) for arg in args))
    subprocess.check_call(args)

    mp = s3_key.bucket.initiate_multipart_upload(s3_key.name)
    log.info('Multipart upload to %r' % s3_key)
    for part, filename in enumerate(sorted(glob.glob(split_prefix + '*'))):
        for t in xrange(S3_ATTEMPTS):
            try:
                with open(filename, 'rb') as part_file:
                    mp.upload_part_from_file(
                        part_file,
                        part + 1,  # part numbering starts at 1
                        cb=sleeping_callback(t),
                        num_cb=S3_THROTTLE
                    )
                log.debug('Part %s uploaded to %r' % (part + 1, s3_key))
                break
            except socket.error as e:
                log.warn('Part %s, upload attempt %s/%s:'
                         ' upload_part_from_file raised %r' %
                         (part + 1, t + 1, S3_ATTEMPTS, e))
        else:
            raise socket.error("Upload failed")

    mp.complete_upload()

    shutil.rmtree(split_dir, True)


def upload_singlepart(s3_key, filename):
    """Upload a normal-sized file. Retry with sleeping callbacks when
    throttled by S3.
    """
    for t in xrange(S3_ATTEMPTS):
        try:
            s3_key.set_contents_from_filename(
                filename,
                cb=sleeping_callback(t),
                num_cb=S3_THROTTLE
            )
            break
        except socket.error as e:
            log.warn('Upload attempt %s/%s: set_contents_from_filename'
                     ' raised %r' % (t + 1, S3_ATTEMPTS, e))
    else:
        raise socket.error("Upload failed")


def make_option_parser():
    usage = '%prog [options] db_name [tbl_name ...] s3_uri_format'
    description = ('Dump one or more MySQL tables to S3.'
                   ' s3_uri_format may be a strftime() format string, e.g.'
                   ' s3://foo/%Y/%m/%d/, for daily (or hourly) dumps. You can'
                   ' also use %D for database name and %T for table name.'
                   ' Using %T will create one key per table.')
    option_parser = optparse.OptionParser(usage=usage, description=description)

    # trying to pick short opt names that won't get confused with
    # the mysql options
    option_parser.add_option(
        '-A', '--all-databases', dest='mode', default='tables',
        action='store_const', const='all_databases',
        help='Dump all tables from all databases.')
    option_parser.add_option(
        '-B', '--databases', dest='mode', default='tables',
        action='store_const', const='databases',
        help='Dump entire databases rather than tables')
    option_parser.add_option(
        '-b', '--boto-cfg', dest='boto_cfg', default=None,
        help='Alternate path to boto.cfg file (for S3 credentials). See'
             ' http://code.google.com/p/boto/wiki/BotoConfig for details. You'
             ' can also pass in S3 credentials by setting the environment'
             ' variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY.')
    option_parser.add_option(
        '-f', '--force', dest='force', default=False, action='store_true',
        help='Overwrite existing keys on S3')
    option_parser.add_option(
        '-m', '--my-cnf', dest='my_cnf', default=None,
        help='Alternate path to my.cnf (for MySQL credentials). See'
             ' http://dev.mysql.com/doc/refman/5.5/en/option-files.html for'
             ' details. You can also specify this path in the environment'
             ' variable MY_CNF.')
    option_parser.add_option(
        '--mysqldump-bin', dest='mysqldump_bin',
        default=DEFAULT_MYSQLDUMP_BIN,
        help='alternate path to mysqldump binary')
    option_parser.add_option(
        '-M', '--mysqldump-extra-opts', dest='mysqldump_extra_opts',
        default=[], action='append',
        help='extra args to pass to mysqldump (e.g. "-e --comment -vvv").'
             ' Use -m (see above) for passwords and other credentials.')
    option_parser.add_option(
        '-q', '--quiet', dest='quiet', default=False,
        action='store_true',
        help="Don't print to stderr")
    option_parser.add_option(
        '--tables', dest='mode', default='tables',
        action='store_const', const='tables',
        help='Dump tables from one database (the default).')
    option_parser.add_option(
        '--s3-endpoint', dest='s3_endpoint', default=None,
        help=('alternate S3 endpoint to connect to (e.g.'
              ' s3-us-west-1.amazonaws.com).'))
    option_parser.add_option(
        '-s', '--single-row-format', dest='single_row_format', default=False,
        action='store_true',
        help=('Output single-row INSERT statements, and turn off locking, for'
              ' easy data processing. Equivalent to -M "%s"'
              % ' '.join(SINGLE_ROW_FORMAT_OPTS)))
    option_parser.add_option(
        '--utc', dest='utc', default=False, action='store_true',
        help='Use UTC rather than local time to process s3_uri_format')
    option_parser.add_option(
        '-v', '--verbose', dest='verbose', default=False,
        action='store_true',
        help='Print more messages')

    return option_parser


def parse_s3_uri(uri):
    """Parse an S3 URI into (bucket, key)

    >>> parse_s3_uri('s3://walrus/tmp/')
    ('walrus', 'tmp/')

    If ``uri`` is not an S3 URI, raise a ValueError
    """
    match = S3_URI_RE.match(uri)
    if match:
        return match.groups()
    else:
        raise ValueError('Invalid S3 URI: %s' % uri)


def log_to_stream(name=None, stream=None, format=None, level=None,
                  debug=False):
    """Set up logging.

    :type name: str
    :param name: name of the logger, or ``None`` for the root logger
    :type stream: file object
    :param stream: stream to log to (default is ``sys.stderr``)
    :type format: str
    :param format: log message format (default is '%(message)s')
    :param level: log level to use
    :type debug: bool
    :param debug: quick way of setting the log level; if true, use
                  ``logging.DEBUG``; otherwise use ``logging.INFO``
    """
    if level is None:
        level = logging.DEBUG if debug else logging.INFO

    if format is None:
        format = '%(message)s'

    if stream is None:
        stream = sys.stderr

    handler = logging.StreamHandler(stream)
    handler.setLevel(level)
    handler.setFormatter(logging.Formatter(format))

    logger = logging.getLogger(name)
    logger.setLevel(level)
    logger.addHandler(handler)


def parse_opts(list_of_opts):
    """Used to parse :option:`--mysqldump-extra-opts`. Take a list of strings
    containing space-separated arguments, parse them, and return a list
    of arguments."""
    results = []
    for opts in list_of_opts:
        results.extend(shlex.split(opts))
    return results
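
# Sketch: repeated -M values are flattened with shell-style splitting,
# so quoting survives:
#
#     parse_opts(['-e --comment', '--where="id > 100"'])
#     # -> ['-e', '--comment', '--where=id > 100']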


def mysqldump_to_file(file, databases=None, tables=None, mysqldump_bin=None,
                      my_cnf=None, extra_opts=None, single_row_format=False):
    """Run mysqldump and dump the results to *file*.

    :param file: file object to dump to
    :param databases: sequence of MySQL database names, or ``None`` for all
                      databases
    :param tables: sequence of MySQL table names, or ``None`` for all tables.
                   If you specify tables, there must be exactly one database
                   name, due to limitations of :command:`mysqldump`
    :param string mysqldump_bin: alternate path to mysqldump binary
    :param string my_cnf: alternate path to my.cnf file containing MySQL
                          credentials. If not set, this function will also try
                          to read the environment variable :envvar:`MY_CNF`.
    :param extra_opts: a list of additional arguments to pass to mysqldump
                       (e.g. hostname, port, and credentials).
    :param single_row_format: Output single-row INSERT statements, and turn
                              off table locking, for easy data processing.
                              Passes ``--compact --complete-insert
                              --default_character_set=utf8 --hex-blob
                              --no-create-db --no-create-info --quick
                              --skip-lock-tables --skip-extended-insert`` to
                              :command:`mysqldump`. You can override any of
                              this with *extra_opts*.

    If you dump multiple databases in single-row format, you will still get
    one ``USE`` statement per database; :command:`mysqldump` doesn't have a
    way to turn this off.
    """
    if tables and (not databases or len(databases) != 1):
        raise ValueError(
            'If you specify tables you must specify exactly one database')

    args = []
    args.append(mysqldump_bin or DEFAULT_MYSQLDUMP_BIN)

    # --defaults-file apparently has to go before any other options
    if my_cnf:
        args.append('--defaults-file=' + my_cnf)
    elif os.environ.get('MY_CNF'):
        args.append('--defaults-file=' + os.environ['MY_CNF'])

    if single_row_format:
        args.extend(SINGLE_ROW_FORMAT_OPTS)
    if extra_opts:
        args.extend(extra_opts)

    if not databases and not tables:
        args.append('--all-databases')
    elif len(databases) > 1 or tables is None:
        args.append('--databases')
        args.append('--')
        args.extend(databases)
    else:
        assert len(databases) == 1
        args.append('--tables')
        args.append('--')
        args.append(databases[0])
        args.extend(tables)

    # do it!
    log.debug(' %s > %s' % (
        ' '.join(pipes.quote(arg) for arg in args),
        getattr(file, 'name', None) or repr(file)))

    start = time.time()

    returncode = subprocess.call(args, stdout=file)

    if returncode:
        log.debug(' Failed with returncode %d' % returncode)
    else:
        log.debug(' Done in %.1fs' % (time.time() - start))

    return not returncode


if __name__ == '__main__':
    main(sys.argv[1:])

--------------------------------------------------------------------------------