├── LICENSE
├── README.md
└── db_converter.py


/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2012 Lanyrd Ltd.
 2 | 
 3 | All rights reserved.
 4 | 
 5 | Redistribution and use in source and binary forms, with or without
 6 | modification, are permitted provided that the following conditions are met:
 7 | 
 8 | Redistributions of source code must retain the above copyright notice, this
 9 | list of conditions and the following disclaimer.
10 | 
11 | Redistributions in binary form must reproduce the above copyright notice, this
12 | list of conditions and the following disclaimer in the documentation and/or
13 | other materials provided with the distribution.
14 | 
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | MySQL to PostgreSQL Converter
 2 | =============================
 3 | 
 4 | Lanyrd's MySQL to PostgreSQL conversion script. Use with care.
 5 | 
 6 | This script was designed for our specific database and column requirements -
 7 | notably, it doubles the lengths of VARCHARs due to a unicode size problem we
 8 | had, places indexes on all foreign keys, and presumes you're using Django
 9 | for column typing purposes.
10 | 
11 | How to use
12 | ----------
13 | 
14 | First, dump your MySQL database in PostgreSQL-compatible format
15 | 
16 |     mysqldump --compatible=postgresql --default-character-set=utf8 \
17 |     -r databasename.mysql -u root databasename
18 | 
19 | Then, convert it using the dbconverter.py script
20 | 
21 | `python db_converter.py databasename.mysql databasename.psql`
22 | 
23 | It'll print progress to the terminal.
24 | 
25 | Finally, load your new dump into a fresh PostgreSQL database using: 
26 | 
27 | `psql -f databasename.psql`
28 | 
29 | More information
30 | ----------------
31 | 
32 | You can learn more about the move which this powered at http://lanyrd.com/blog/2012/lanyrds-big-move/ and some technical details of it at http://www.aeracode.org/2012/11/13/one-change-not-enough/.
33 | 


--------------------------------------------------------------------------------
/db_converter.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | """
  4 | Fixes a MySQL dump made with the right format so it can be directly
  5 | imported to a new PostgreSQL database.
  6 | 
  7 | Dump using:
  8 | mysqldump --compatible=postgresql --default-character-set=utf8 -r databasename.mysql -u root databasename
  9 | """
 10 | 
 11 | import re
 12 | import sys
 13 | import os
 14 | import time
 15 | import subprocess
 16 | 
 17 | 
 18 | def parse(input_filename, output_filename):
 19 |     "Feed it a file, and it'll output a fixed one"
 20 | 
 21 |     # State storage
 22 |     if input_filename == "-":
 23 |         num_lines = -1
 24 |     else:
 25 |         num_lines = int(subprocess.check_output(["wc", "-l", input_filename]).strip().split()[0])
 26 |     tables = {}
 27 |     current_table = None
 28 |     creation_lines = []
 29 |     enum_types = []
 30 |     foreign_key_lines = []
 31 |     fulltext_key_lines = []
 32 |     sequence_lines = []
 33 |     cast_lines = []
 34 |     num_inserts = 0
 35 |     started = time.time()
 36 | 
 37 |     # Open output file and write header. Logging file handle will be stdout
 38 |     # unless we're writing output to stdout, in which case NO PROGRESS FOR YOU.
 39 |     if output_filename == "-":
 40 |         output = sys.stdout
 41 |         logging = open(os.devnull, "w")
 42 |     else:
 43 |         output = open(output_filename, "w")
 44 |         logging = sys.stdout
 45 | 
 46 |     if input_filename == "-":
 47 |         input_fh = sys.stdin
 48 |     else:
 49 |         input_fh = open(input_filename)
 50 | 
 51 | 
 52 |     output.write("-- Converted by db_converter\n")
 53 |     output.write("START TRANSACTION;\n")
 54 |     output.write("SET standard_conforming_strings=off;\n")
 55 |     output.write("SET escape_string_warning=off;\n")
 56 |     output.write("SET CONSTRAINTS ALL DEFERRED;\n\n")
 57 | 
 58 |     for i, line in enumerate(input_fh):
 59 |         time_taken = time.time() - started
 60 |         percentage_done = (i+1) / float(num_lines)
 61 |         secs_left = (time_taken / percentage_done) - time_taken
 62 |         logging.write("\rLine %i (of %s: %.2f%%) [%s tables] [%s inserts] [ETA: %i min %i sec]" % (
 63 |             i + 1,
 64 |             num_lines,
 65 |             ((i+1)/float(num_lines))*100,
 66 |             len(tables),
 67 |             num_inserts,
 68 |             secs_left // 60,
 69 |             secs_left % 60,
 70 |         ))
 71 |         logging.flush()
 72 |         line = line.decode("utf8").strip().replace(r"\\", "WUBWUBREALSLASHWUB").replace(r"\'", "''").replace("WUBWUBREALSLASHWUB", r"\\")
 73 |         # Ignore comment lines
 74 |         if line.startswith("--") or line.startswith("/*") or line.startswith("LOCK TABLES") or line.startswith("DROP TABLE") or line.startswith("UNLOCK TABLES") or not line:
 75 |             continue
 76 | 
 77 |         # Outside of anything handling
 78 |         if current_table is None:
 79 |             # Start of a table creation statement?
 80 |             if line.startswith("CREATE TABLE"):
 81 |                 current_table = line.split('"')[1]
 82 |                 tables[current_table] = {"columns": []}
 83 |                 creation_lines = []
 84 |             # Inserting data into a table?
 85 |             elif line.startswith("INSERT INTO"):
 86 |                 output.write(line.encode("utf8").replace("'0000-00-00 00:00:00'", "NULL") + "\n")
 87 |                 num_inserts += 1
 88 |             # ???
 89 |             else:
 90 |                 print "\n ! Unknown line in main body: %s" % line
 91 | 
 92 |         # Inside-create-statement handling
 93 |         else:
 94 |             # Is it a column?
 95 |             if line.startswith('"'):
 96 |                 useless, name, definition = line.strip(",").split('"',2)
 97 |                 try:
 98 |                     type, extra = definition.strip().split(" ", 1)
 99 | 
100 |                     # This must be a tricky enum
101 |                     if ')' in extra:
102 |                         type, extra = definition.strip().split(")")
103 | 
104 |                 except ValueError:
105 |                     type = definition.strip()
106 |                     extra = ""
107 |                 extra = re.sub("CHARACTER SET [\w\d]+\s*", "", extra.replace("unsigned", ""))
108 |                 extra = re.sub("COLLATE [\w\d]+\s*", "", extra.replace("unsigned", ""))
109 | 
110 |                 # See if it needs type conversion
111 |                 final_type = None
112 |                 set_sequence = None
113 |                 if type.startswith("tinyint("):
114 |                     type = "int4"
115 |                     set_sequence = True
116 |                     final_type = "boolean"
117 |                 elif type.startswith("int("):
118 |                     type = "integer"
119 |                     set_sequence = True
120 |                 elif type.startswith("bigint("):
121 |                     type = "bigint"
122 |                     set_sequence = True
123 |                 elif type == "longtext":
124 |                     type = "text"
125 |                 elif type == "mediumtext":
126 |                     type = "text"
127 |                 elif type == "tinytext":
128 |                     type = "text"
129 |                 elif type.startswith("varchar("):
130 |                     size = int(type.split("(")[1].rstrip(")"))
131 |                     type = "varchar(%s)" % (size * 2)
132 |                 elif type.startswith("smallint("):
133 |                     type = "int2"
134 |                     set_sequence = True
135 |                 elif type == "datetime":
136 |                     type = "timestamp with time zone"
137 |                 elif type == "double":
138 |                     type = "double precision"
139 |                 elif type.endswith("blob"):
140 |                     type = "bytea"
141 |                 elif type.startswith("enum(") or type.startswith("set("):
142 | 
143 |                     types_str = type.split("(")[1].rstrip(")").rstrip('"')
144 |                     types_arr = [type_str.strip('\'') for type_str in types_str.split(",")]
145 | 
146 |                     # Considered using values to make a name, but its dodgy
147 |                     # enum_name = '_'.join(types_arr)
148 |                     enum_name = "{0}_{1}".format(current_table, name)
149 | 
150 |                     if enum_name not in enum_types:
151 |                         output.write("CREATE TYPE {0} AS ENUM ({1}); \n".format(enum_name, types_str));
152 |                         enum_types.append(enum_name)
153 | 
154 |                     type = enum_name
155 | 
156 |                 if final_type:
157 |                     cast_lines.append("ALTER TABLE \"%s\" ALTER COLUMN \"%s\" DROP DEFAULT, ALTER COLUMN \"%s\" TYPE %s USING CAST(\"%s\" as %s)" % (current_table, name, name, final_type, name, final_type))
158 |                 # ID fields need sequences [if they are integers?]
159 |                 if name == "id" and set_sequence is True:
160 |                     sequence_lines.append("CREATE SEQUENCE %s_id_seq" % (current_table))
161 |                     sequence_lines.append("SELECT setval('%s_id_seq', max(id)) FROM %s" % (current_table, current_table))
162 |                     sequence_lines.append("ALTER TABLE \"%s\" ALTER COLUMN \"id\" SET DEFAULT nextval('%s_id_seq')" % (current_table, current_table))
163 |                 # Record it
164 |                 creation_lines.append('"%s" %s %s' % (name, type, extra))
165 |                 tables[current_table]['columns'].append((name, type, extra))
166 |             # Is it a constraint or something?
167 |             elif line.startswith("PRIMARY KEY"):
168 |                 creation_lines.append(line.rstrip(","))
169 |             elif line.startswith("CONSTRAINT"):
170 |                 foreign_key_lines.append("ALTER TABLE \"%s\" ADD CONSTRAINT %s DEFERRABLE INITIALLY DEFERRED" % (current_table, line.split("CONSTRAINT")[1].strip().rstrip(",")))
171 |                 foreign_key_lines.append("CREATE INDEX ON \"%s\" %s" % (current_table, line.split("FOREIGN KEY")[1].split("REFERENCES")[0].strip().rstrip(",")))
172 |             elif line.startswith("UNIQUE KEY"):
173 |                 creation_lines.append("UNIQUE (%s)" % line.split("(")[1].split(")")[0])
174 |             elif line.startswith("FULLTEXT KEY"):
175 | 
176 |                 fulltext_keys = " || ' ' || ".join( line.split('(')[-1].split(')')[0].replace('"', '').split(',') )
177 |                 fulltext_key_lines.append("CREATE INDEX ON %s USING gin(to_tsvector('english', %s))" % (current_table, fulltext_keys))
178 | 
179 |             elif line.startswith("KEY"):
180 |                 pass
181 |             # Is it the end of the table?
182 |             elif line == ");":
183 |                 output.write("CREATE TABLE \"%s\" (\n" % current_table)
184 |                 for i, line in enumerate(creation_lines):
185 |                     output.write("    %s%s\n" % (line, "," if i != (len(creation_lines) - 1) else ""))
186 |                 output.write(');\n\n')
187 |                 current_table = None
188 |             # ???
189 |             else:
190 |                 print "\n ! Unknown line inside table creation: %s" % line
191 | 
192 | 
193 |     # Finish file
194 |     output.write("\n-- Post-data save --\n")
195 |     output.write("COMMIT;\n")
196 |     output.write("START TRANSACTION;\n")
197 | 
198 |     # Write typecasts out
199 |     output.write("\n-- Typecasts --\n")
200 |     for line in cast_lines:
201 |         output.write("%s;\n" % line)
202 | 
203 |     # Write FK constraints out
204 |     output.write("\n-- Foreign keys --\n")
205 |     for line in foreign_key_lines:
206 |         output.write("%s;\n" % line)
207 | 
208 |     # Write sequences out
209 |     output.write("\n-- Sequences --\n")
210 |     for line in sequence_lines:
211 |         output.write("%s;\n" % line)
212 | 
213 |     # Write full-text indexkeyses out
214 |     output.write("\n-- Full Text keys --\n")
215 |     for line in fulltext_key_lines:
216 |         output.write("%s;\n" % line)
217 | 
218 |     # Finish file
219 |     output.write("\n")
220 |     output.write("COMMIT;\n")
221 |     print ""
222 | 
223 | 
224 | if __name__ == "__main__":
225 |     parse(sys.argv[1], sys.argv[2])
226 | 


--------------------------------------------------------------------------------