├── LICENSE ├── README.md └── db_converter.py /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2012 Lanyrd Ltd. 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | Redistributions in binary form must reproduce the above copyright notice, this 12 | list of conditions and the following disclaimer in the documentation and/or 13 | other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | MySQL to PostgreSQL Converter 2 | ============================= 3 | 4 | Lanyrd's MySQL to PostgreSQL conversion script. Use with care. 5 | 6 | This script was designed for our specific database and column requirements - 7 | notably, it doubles the lengths of VARCHARs due to a unicode size problem we 8 | had, places indexes on all foreign keys, and presumes you're using Django 9 | for column typing purposes. 10 | 11 | How to use 12 | ---------- 13 | 14 | First, dump your MySQL database in PostgreSQL-compatible format 15 | 16 | mysqldump --compatible=postgresql --default-character-set=utf8 \ 17 | -r databasename.mysql -u root databasename 18 | 19 | Then, convert it using the dbconverter.py script 20 | 21 | `python db_converter.py databasename.mysql databasename.psql` 22 | 23 | It'll print progress to the terminal. 24 | 25 | Finally, load your new dump into a fresh PostgreSQL database using: 26 | 27 | `psql -f databasename.psql` 28 | 29 | More information 30 | ---------------- 31 | 32 | You can learn more about the move which this powered at http://lanyrd.com/blog/2012/lanyrds-big-move/ and some technical details of it at http://www.aeracode.org/2012/11/13/one-change-not-enough/. 33 | -------------------------------------------------------------------------------- /db_converter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Fixes a MySQL dump made with the right format so it can be directly 5 | imported to a new PostgreSQL database. 6 | 7 | Dump using: 8 | mysqldump --compatible=postgresql --default-character-set=utf8 -r databasename.mysql -u root databasename 9 | """ 10 | 11 | import re 12 | import sys 13 | import os 14 | import time 15 | import subprocess 16 | 17 | 18 | def parse(input_filename, output_filename): 19 | "Feed it a file, and it'll output a fixed one" 20 | 21 | # State storage 22 | if input_filename == "-": 23 | num_lines = -1 24 | else: 25 | num_lines = int(subprocess.check_output(["wc", "-l", input_filename]).strip().split()[0]) 26 | tables = {} 27 | current_table = None 28 | creation_lines = [] 29 | enum_types = [] 30 | foreign_key_lines = [] 31 | fulltext_key_lines = [] 32 | sequence_lines = [] 33 | cast_lines = [] 34 | num_inserts = 0 35 | started = time.time() 36 | 37 | # Open output file and write header. Logging file handle will be stdout 38 | # unless we're writing output to stdout, in which case NO PROGRESS FOR YOU. 39 | if output_filename == "-": 40 | output = sys.stdout 41 | logging = open(os.devnull, "w") 42 | else: 43 | output = open(output_filename, "w") 44 | logging = sys.stdout 45 | 46 | if input_filename == "-": 47 | input_fh = sys.stdin 48 | else: 49 | input_fh = open(input_filename) 50 | 51 | 52 | output.write("-- Converted by db_converter\n") 53 | output.write("START TRANSACTION;\n") 54 | output.write("SET standard_conforming_strings=off;\n") 55 | output.write("SET escape_string_warning=off;\n") 56 | output.write("SET CONSTRAINTS ALL DEFERRED;\n\n") 57 | 58 | for i, line in enumerate(input_fh): 59 | time_taken = time.time() - started 60 | percentage_done = (i+1) / float(num_lines) 61 | secs_left = (time_taken / percentage_done) - time_taken 62 | logging.write("\rLine %i (of %s: %.2f%%) [%s tables] [%s inserts] [ETA: %i min %i sec]" % ( 63 | i + 1, 64 | num_lines, 65 | ((i+1)/float(num_lines))*100, 66 | len(tables), 67 | num_inserts, 68 | secs_left // 60, 69 | secs_left % 60, 70 | )) 71 | logging.flush() 72 | line = line.decode("utf8").strip().replace(r"\\", "WUBWUBREALSLASHWUB").replace(r"\'", "''").replace("WUBWUBREALSLASHWUB", r"\\") 73 | # Ignore comment lines 74 | if line.startswith("--") or line.startswith("/*") or line.startswith("LOCK TABLES") or line.startswith("DROP TABLE") or line.startswith("UNLOCK TABLES") or not line: 75 | continue 76 | 77 | # Outside of anything handling 78 | if current_table is None: 79 | # Start of a table creation statement? 80 | if line.startswith("CREATE TABLE"): 81 | current_table = line.split('"')[1] 82 | tables[current_table] = {"columns": []} 83 | creation_lines = [] 84 | # Inserting data into a table? 85 | elif line.startswith("INSERT INTO"): 86 | output.write(line.encode("utf8").replace("'0000-00-00 00:00:00'", "NULL") + "\n") 87 | num_inserts += 1 88 | # ??? 89 | else: 90 | print "\n ! Unknown line in main body: %s" % line 91 | 92 | # Inside-create-statement handling 93 | else: 94 | # Is it a column? 95 | if line.startswith('"'): 96 | useless, name, definition = line.strip(",").split('"',2) 97 | try: 98 | type, extra = definition.strip().split(" ", 1) 99 | 100 | # This must be a tricky enum 101 | if ')' in extra: 102 | type, extra = definition.strip().split(")") 103 | 104 | except ValueError: 105 | type = definition.strip() 106 | extra = "" 107 | extra = re.sub("CHARACTER SET [\w\d]+\s*", "", extra.replace("unsigned", "")) 108 | extra = re.sub("COLLATE [\w\d]+\s*", "", extra.replace("unsigned", "")) 109 | 110 | # See if it needs type conversion 111 | final_type = None 112 | set_sequence = None 113 | if type.startswith("tinyint("): 114 | type = "int4" 115 | set_sequence = True 116 | final_type = "boolean" 117 | elif type.startswith("int("): 118 | type = "integer" 119 | set_sequence = True 120 | elif type.startswith("bigint("): 121 | type = "bigint" 122 | set_sequence = True 123 | elif type == "longtext": 124 | type = "text" 125 | elif type == "mediumtext": 126 | type = "text" 127 | elif type == "tinytext": 128 | type = "text" 129 | elif type.startswith("varchar("): 130 | size = int(type.split("(")[1].rstrip(")")) 131 | type = "varchar(%s)" % (size * 2) 132 | elif type.startswith("smallint("): 133 | type = "int2" 134 | set_sequence = True 135 | elif type == "datetime": 136 | type = "timestamp with time zone" 137 | elif type == "double": 138 | type = "double precision" 139 | elif type.endswith("blob"): 140 | type = "bytea" 141 | elif type.startswith("enum(") or type.startswith("set("): 142 | 143 | types_str = type.split("(")[1].rstrip(")").rstrip('"') 144 | types_arr = [type_str.strip('\'') for type_str in types_str.split(",")] 145 | 146 | # Considered using values to make a name, but its dodgy 147 | # enum_name = '_'.join(types_arr) 148 | enum_name = "{0}_{1}".format(current_table, name) 149 | 150 | if enum_name not in enum_types: 151 | output.write("CREATE TYPE {0} AS ENUM ({1}); \n".format(enum_name, types_str)); 152 | enum_types.append(enum_name) 153 | 154 | type = enum_name 155 | 156 | if final_type: 157 | cast_lines.append("ALTER TABLE \"%s\" ALTER COLUMN \"%s\" DROP DEFAULT, ALTER COLUMN \"%s\" TYPE %s USING CAST(\"%s\" as %s)" % (current_table, name, name, final_type, name, final_type)) 158 | # ID fields need sequences [if they are integers?] 159 | if name == "id" and set_sequence is True: 160 | sequence_lines.append("CREATE SEQUENCE %s_id_seq" % (current_table)) 161 | sequence_lines.append("SELECT setval('%s_id_seq', max(id)) FROM %s" % (current_table, current_table)) 162 | sequence_lines.append("ALTER TABLE \"%s\" ALTER COLUMN \"id\" SET DEFAULT nextval('%s_id_seq')" % (current_table, current_table)) 163 | # Record it 164 | creation_lines.append('"%s" %s %s' % (name, type, extra)) 165 | tables[current_table]['columns'].append((name, type, extra)) 166 | # Is it a constraint or something? 167 | elif line.startswith("PRIMARY KEY"): 168 | creation_lines.append(line.rstrip(",")) 169 | elif line.startswith("CONSTRAINT"): 170 | foreign_key_lines.append("ALTER TABLE \"%s\" ADD CONSTRAINT %s DEFERRABLE INITIALLY DEFERRED" % (current_table, line.split("CONSTRAINT")[1].strip().rstrip(","))) 171 | foreign_key_lines.append("CREATE INDEX ON \"%s\" %s" % (current_table, line.split("FOREIGN KEY")[1].split("REFERENCES")[0].strip().rstrip(","))) 172 | elif line.startswith("UNIQUE KEY"): 173 | creation_lines.append("UNIQUE (%s)" % line.split("(")[1].split(")")[0]) 174 | elif line.startswith("FULLTEXT KEY"): 175 | 176 | fulltext_keys = " || ' ' || ".join( line.split('(')[-1].split(')')[0].replace('"', '').split(',') ) 177 | fulltext_key_lines.append("CREATE INDEX ON %s USING gin(to_tsvector('english', %s))" % (current_table, fulltext_keys)) 178 | 179 | elif line.startswith("KEY"): 180 | pass 181 | # Is it the end of the table? 182 | elif line == ");": 183 | output.write("CREATE TABLE \"%s\" (\n" % current_table) 184 | for i, line in enumerate(creation_lines): 185 | output.write(" %s%s\n" % (line, "," if i != (len(creation_lines) - 1) else "")) 186 | output.write(');\n\n') 187 | current_table = None 188 | # ??? 189 | else: 190 | print "\n ! Unknown line inside table creation: %s" % line 191 | 192 | 193 | # Finish file 194 | output.write("\n-- Post-data save --\n") 195 | output.write("COMMIT;\n") 196 | output.write("START TRANSACTION;\n") 197 | 198 | # Write typecasts out 199 | output.write("\n-- Typecasts --\n") 200 | for line in cast_lines: 201 | output.write("%s;\n" % line) 202 | 203 | # Write FK constraints out 204 | output.write("\n-- Foreign keys --\n") 205 | for line in foreign_key_lines: 206 | output.write("%s;\n" % line) 207 | 208 | # Write sequences out 209 | output.write("\n-- Sequences --\n") 210 | for line in sequence_lines: 211 | output.write("%s;\n" % line) 212 | 213 | # Write full-text indexkeyses out 214 | output.write("\n-- Full Text keys --\n") 215 | for line in fulltext_key_lines: 216 | output.write("%s;\n" % line) 217 | 218 | # Finish file 219 | output.write("\n") 220 | output.write("COMMIT;\n") 221 | print "" 222 | 223 | 224 | if __name__ == "__main__": 225 | parse(sys.argv[1], sys.argv[2]) 226 | --------------------------------------------------------------------------------