├── hive_compared_bq ├── __init__.py ├── hive.py ├── bq.py └── hive_compared_bq.py ├── .gitignore ├── udf ├── hcbq.jar └── README ├── docs └── images │ ├── differences_sha.png │ └── differences_count.png ├── src └── main │ └── java │ └── org │ └── apache │ └── hadoop │ └── hive │ └── ql │ └── udf │ └── generic │ └── GenericUDFDecodeCP1252.java ├── LICENSE └── README.md /hive_compared_bq/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | launch.sh 3 | *.swp 4 | *.pyc 5 | -------------------------------------------------------------------------------- /udf/hcbq.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bolcom/hive_compared_bq/HEAD/udf/hcbq.jar -------------------------------------------------------------------------------- /docs/images/differences_sha.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bolcom/hive_compared_bq/HEAD/docs/images/differences_sha.png -------------------------------------------------------------------------------- /docs/images/differences_count.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bolcom/hive_compared_bq/HEAD/docs/images/differences_count.png -------------------------------------------------------------------------------- /udf/README: -------------------------------------------------------------------------------- 1 | 2 | This jar contains 2 UDFs: 3 | - UDFSha1, which is the unmodified sha1 UDF that appeared with Hive 1.3.0 4 | - GenericUDFDecodeCP1252, whose source code is in src/main/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFDecodeCP1252.java 5 | 6 | -------------------------------------------------------------------------------- /src/main/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFDecodeCP1252.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * Copyright 2017 bol.com. All Rights Reserved 4 | * 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | 19 | package org.apache.hadoop.hive.ql.udf.generic; 20 | 21 | import org.apache.hadoop.hive.ql.exec.Description; 22 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException; 23 | import org.apache.hadoop.hive.ql.metadata.HiveException; 24 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 25 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter; 26 | import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; 27 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; 28 | import org.apache.hadoop.io.Text; 29 | 30 | import java.nio.ByteBuffer; 31 | import java.nio.charset.Charset; 32 | 33 | import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils.PrimitiveGrouping.STRING_GROUP; 34 | 35 | /** 36 | * UDFDecodeCP1252. 37 | */ 38 | @Description( 39 | name = "UDFDecodeCP1252", 40 | value = "_FUNC_(str) - Returns str, which is the same string but in a Google's UTF8 encoding instead of the " 41 | + "initial CP1252 encoding") 42 | public class GenericUDFDecodeCP1252 extends GenericUDF { 43 | private transient PrimitiveCategory[] inputTypes = new PrimitiveCategory[1]; 44 | private transient Converter[] converters = new Converter[1]; 45 | private final Text output = new Text(); 46 | private final static Charset CP1252 = Charset.forName( "CP1252"); 47 | 48 | @Override 49 | public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { 50 | checkArgsSize(arguments, 1, 1); 51 | 52 | checkArgPrimitive(arguments, 0); 53 | 54 | checkArgGroups(arguments, 0, inputTypes, STRING_GROUP); 55 | 56 | obtainStringConverter(arguments, 0, inputTypes, converters); 57 | 58 | ObjectInspector outputOI = PrimitiveObjectInspectorFactory.writableStringObjectInspector; 59 | return outputOI; 60 | } 61 | 62 | @Override 63 | public Object evaluate(DeferredObject[] arguments) throws HiveException { 64 | final String stringToDecode = getStringValue(arguments, 0, converters); 65 | if (stringToDecode == null) { 66 | return null; 67 | } 68 | 69 | final char[] charactersToDecode = stringToDecode.toCharArray(); 70 | final byte[] bytesArray = new byte[1]; 71 | StringBuilder utf8_text = new StringBuilder(charactersToDecode.length); 72 | for (int i = 0; i < charactersToDecode.length; i++){ 73 | char character = charactersToDecode[i]; 74 | 75 | if( Character.isISOControl( character)){ // encoding problems happened only with control characters 76 | int asciiCode = (int)character; 77 | if( asciiCode >= 1 && asciiCode <= 9 ){ 78 | utf8_text.append(" "); // those control characters are transformed to spaces by BigQuery 79 | } 80 | else { 81 | int codePoint = Character.codePointAt( charactersToDecode,i); 82 | // First of all, let's handle the control characters that are transformed to space by BigQuery 83 | // (about those characters see: http://www.fileformat.info/info/unicode/char/0081/index.htm, 84 | // and http://www.ltg.ed.ac.uk/~richard/utf-8.cgi?input=129&mode=decimal 85 | // TODO add missing characters (so far I have just seen those 3) 86 | if( codePoint == 129 || codePoint == 144 || codePoint == 157 ){ 87 | utf8_text.append(" "); 88 | } 89 | else { 90 | // this part tries to solve the cases like: 91 | // C299 (hex) CP1252 character being transformed by BigQuery into E284A2 (hex) in UTF8 92 | 93 | // first we need to get rid of the control character (in our example, we get rid of 'C2' and 94 | // we just keep 
'99') 95 | String last_2_hexa_bytes = String.format("%02x", (int) character); 96 | // gets the byte array from above "hexa string representation" 97 | // (solution extracted from https://stackoverflow.com/a/140861/4064443) 98 | bytesArray[0] = (byte) ((Character.digit(last_2_hexa_bytes.charAt(0), 16) << 4) 99 | | Character.digit(last_2_hexa_bytes.charAt(1), 16)); 100 | ByteBuffer wrappedBytes = ByteBuffer.wrap(bytesArray); 101 | 102 | // now we can properly decode from CP1252 charset 103 | String decodedString = CP1252.decode(wrappedBytes).toString(); 104 | utf8_text.append(decodedString); 105 | } 106 | } 107 | } 108 | else{ // "standard" characters are OK. We just copied them "as is" 109 | utf8_text.append(character); 110 | } 111 | } 112 | output.set(utf8_text.toString()); 113 | return output; 114 | } 115 | 116 | @Override 117 | public String getDisplayString(String[] children) { 118 | return getStandardDisplayString(getFuncName(), children); 119 | } 120 | 121 | @Override 122 | protected String getFuncName() { 123 | return "decodeCP1252"; 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /hive_compared_bq/hive.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Copyright 2017 bol.com. All Rights Reserved 4 | 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | """ 18 | 19 | import logging 20 | import sys 21 | import time 22 | # noinspection PyProtectedMember 23 | from hive_compared_bq import _Table 24 | import pyhs2 # TODO switch to another module since this one is deprecated and does not support Python 3 25 | # see notes in : https://github.com/BradRuderman/pyhs2 26 | 27 | 28 | class THive(_Table): 29 | """Hive implementation of the _Table object""" 30 | 31 | def __init__(self, database, table, parent, hs2_server, jar_path): 32 | _Table.__init__(self, database, table, parent) 33 | self.server = hs2_server 34 | self.connection = self._create_connection() 35 | self.jarPath = jar_path 36 | 37 | def get_type(self): 38 | return "hive" 39 | 40 | def _create_connection(self): 41 | """Connect to the table and return the connection object that we will use to launch queries""" 42 | return pyhs2.connect(host=self.server, port=10000, authMechanism="KERBEROS", database=self.database) 43 | 44 | def get_ddl_columns(self): 45 | if len(self._ddl_columns) > 0: 46 | return self._ddl_columns 47 | 48 | is_col_def = True 49 | cur = self.connection.cursor() 50 | cur.execute("describe " + self.full_name) 51 | all_columns = [] 52 | while cur.hasMoreRows: 53 | row = cur.fetchone() 54 | if row is None: 55 | continue 56 | col_name = row[0] 57 | col_type = row[1] 58 | 59 | if col_name == "" or col_name == "None": 60 | continue 61 | if col_name.startswith('#'): 62 | if "Partition Information" in col_name: 63 | is_col_def = False 64 | continue 65 | 66 | my_dic = {"name": col_name, "type": col_type} 67 | if is_col_def: 68 | all_columns.append(my_dic) 69 | else: 70 | self._ddl_partitions.append(my_dic) 71 | cur.close() 72 | 73 | self.filter_columns_from_cli(all_columns) 74 | 75 | return self._ddl_columns 76 | 77 | def get_column_statistics(self, query, selected_columns): 78 | cur = self.connection.cursor() 79 | cur.execute(query) 80 | while cur.hasMoreRows: 81 | fetched = cur.fetchone() 82 | if fetched is not None: 83 | for idx, col in enumerate(selected_columns): 84 | value_column = fetched[idx] 85 | col["Counter"][value_column] += 1 # TODO what happens with NULL? 
86 | cur.close() 87 | 88 | def create_sql_groupby_count(self): 89 | where_condition = "" 90 | if self.where_condition is not None: 91 | where_condition = "WHERE " + self.where_condition 92 | query = "SELECT hash( cast( %s as STRING)) %% %i AS gb, count(*) AS count FROM %s %s GROUP BY " \ 93 | "hash( cast( %s as STRING)) %% %i" \ 94 | % (self.get_groupby_column(), self.tc.number_of_group_by, self.full_name, where_condition, 95 | self.get_groupby_column(), self.tc.number_of_group_by) 96 | logging.debug("Hive query is: %s", query) 97 | 98 | return query 99 | 100 | def create_sql_show_bucket_columns(self, extra_columns_str, buckets_values): 101 | gb_column = self.get_groupby_column() 102 | where_condition = "" 103 | if self.where_condition is not None: 104 | where_condition = self.where_condition + " AND" 105 | hive_query = "SELECT hash( cast( %s as STRING)) %% %i as bucket, %s, %s FROM %s WHERE %s " \ 106 | "hash( cast( %s as STRING)) %% %i IN (%s)" \ 107 | % (gb_column, self.tc.number_of_group_by, gb_column, extra_columns_str, self.full_name, 108 | where_condition, gb_column, self.tc.number_of_group_by, buckets_values) 109 | logging.debug("Hive query to show the buckets and the extra columns is: %s", hive_query) 110 | 111 | return hive_query 112 | 113 | def create_sql_intermediate_checksums(self): 114 | column_blocks = self.get_column_blocks(self.get_ddl_columns()) 115 | number_of_blocks = len(column_blocks) 116 | logging.debug("%i column_blocks (with a size of %i columns) have been considered: %s", number_of_blocks, 117 | self.tc.block_size, str(column_blocks)) 118 | 119 | # Generate the concatenations for the column_blocks 120 | hive_basic_shas = "" 121 | for idx, block in enumerate(column_blocks): 122 | hive_basic_shas += "base64( unhex( SHA1( concat( " 123 | for col in block: 124 | name = col["name"] 125 | hive_value_name = name 126 | if col["type"] == 'date': 127 | hive_value_name = "cast( %s as STRING)" % name 128 | elif col["type"] == 'float' or col["type"] == 'double': 129 | hive_value_name = "cast( floor( %s * 10000 ) as bigint)" % name 130 | elif col["type"] == 'string' and name in self.decodeCP1252_columns: 131 | hive_value_name = "DecodeCP1252( %s)" % name 132 | hive_basic_shas += "CASE WHEN %s IS NULL THEN 'n_%s' ELSE %s END, '|'," % (name, name[:2], 133 | hive_value_name) 134 | hive_basic_shas = hive_basic_shas[:-6] + ")))) as block_%i,\n" % idx 135 | hive_basic_shas = hive_basic_shas[:-2] 136 | 137 | where_condition = "" 138 | if self.where_condition is not None: 139 | where_condition = "WHERE " + self.where_condition 140 | 141 | hive_query = "WITH blocks AS (\nSELECT hash( cast( %s as STRING)) %% %i as gb,\n%s\nFROM %s %s\n),\n" \ 142 | % (self.get_groupby_column(), self.tc.number_of_group_by, hive_basic_shas, self.full_name, 143 | where_condition) # 1st CTE with the basic block shas 144 | list_blocks = ", ".join(["block_%i" % i for i in range(number_of_blocks)]) 145 | hive_query += "full_lines AS(\nSELECT gb, base64( unhex( SHA1( concat( %s)))) as row_sha, %s FROM blocks\n)\n" \ 146 | % (list_blocks, list_blocks) # 2nd CTE to get all the info of a row 147 | hive_list_shas = ", ".join(["base64( unhex( SHA1( concat_ws( '|', sort_array( collect_list( block_%i)))))) as " 148 | "block_%i_gb " % (i, i) for i in range(number_of_blocks)]) 149 | hive_query += "SELECT gb, base64( unhex( SHA1( concat_ws( '|', sort_array( collect_list( row_sha)))))) as " \ 150 | "row_sha_gb, %s FROM full_lines GROUP BY gb" % hive_list_shas # final query where all the shas 151 | # are grouped by 
row-blocks 152 | logging.debug("##### Final Hive query is:\n%s\n", hive_query) 153 | 154 | return hive_query 155 | 156 | def delete_temporary_table(self, table_name): 157 | self.query("DROP TABLE " + table_name).close() 158 | 159 | def query(self, query): 160 | """Execute the received query in Hive and return the cursor which is ready to be fetched and MUST be closed after 161 | 162 | :type query: str 163 | :param query: query to execute in Hive 164 | 165 | :rtype: :class:`pyhs2.cursor.Cursor` 166 | :returns: the cursor for this query 167 | 168 | :raises: IOError if the query has some execution errors 169 | """ 170 | logging.debug("Launching Hive query") 171 | # TODO split number should be done in function of file format (ORC, Avro...) and number of columns 172 | # split_maxsize = 256000000 173 | # split_maxsize = 64000000 174 | split_maxsize = 8000000 175 | # split_maxsize = 16000000 176 | try: 177 | cur = self.connection.cursor() 178 | cur.execute("set mapreduce.input.fileinputformat.split.maxsize = %i" % split_maxsize) 179 | cur.execute("set hive.fetch.task.conversion=minimal") # force a MapReduce, because simple 'fetch' queries 180 | # on a large table may generate some timeout otherwise 181 | cur.execute(query) 182 | except: 183 | raise IOError("There was a problem in executing the query in Hive: %s", sys.exc_info()[1]) 184 | logging.debug("Fetching Hive results") 185 | return cur 186 | 187 | def launch_query_dict_result(self, query, result_dic, all_columns_from_2=False): 188 | try: 189 | cur = self.query(query) 190 | while cur.hasMoreRows: 191 | row = cur.fetchone() 192 | if row is not None: 193 | if not all_columns_from_2: 194 | result_dic[row[0]] = row[1] 195 | else: 196 | result_dic[row[0]] = row[2:] 197 | except: 198 | result_dic["error"] = sys.exc_info()[1] 199 | raise 200 | finally: 201 | cur.close() 202 | logging.debug("All %i Hive rows fetched", len(result_dic)) 203 | 204 | def launch_query_csv_compare_result(self, query, rows): 205 | cur = self.query(query) 206 | while cur.hasMoreRows: 207 | row = cur.fetchone() 208 | if row is not None: 209 | line = "^ " + " | ".join([str(col) for col in row]) + " $" 210 | rows.append(line) 211 | logging.debug("All %i Hive rows fetched", len(rows)) 212 | cur.close() 213 | 214 | def launch_query_with_intermediate_table(self, query, result): 215 | try: 216 | cur = self.query("add jar " + self.jarPath) # must be in a separated execution 217 | cur.execute("create temporary function SHA1 as 'org.apache.hadoop.hive.ql.udf.UDFSha1'") 218 | cur.execute("create temporary function DecodeCP1252 as " 219 | "'org.apache.hadoop.hive.ql.udf.generic.GenericUDFDecodeCP1252'") 220 | except: 221 | result["error"] = sys.exc_info()[1] 222 | raise 223 | 224 | if "error" in result: 225 | cur.close() 226 | return # let's stop the thread if some error popped up elsewhere 227 | 228 | tmp_table = "%s.temp_hiveCmpBq_%s_%s" % (self.database, self.full_name.replace('.', '_'), 229 | str(time.time()).replace('.', '_')) 230 | cur.execute("CREATE TABLE " + tmp_table + " AS\n" + query) 231 | cur.close() 232 | result["names_sha_tables"][self.get_id_string()] = tmp_table # we confirm this table has been created 233 | result["cleaning"].append((tmp_table, self)) 234 | 235 | logging.debug("The temporary table for Hive is " + tmp_table) 236 | 237 | if "error" in result: # A problem happened in the other query of the other table (usually BQ, since it is 238 | # faster than Hive) so there is no need to pursue or have the temp table 239 | return 240 | 241 | projection_hive_row_sha = 
"SELECT gb, row_sha_gb FROM %s" % tmp_table 242 | self.launch_query_dict_result(projection_hive_row_sha, result["sha_dictionaries"][self.get_id_string()]) 243 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2017 bol.com 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /hive_compared_bq/bq.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Copyright 2017 bol.com. All Rights Reserved 4 | 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | """ 18 | 19 | import logging 20 | import sys 21 | import time 22 | # noinspection PyProtectedMember 23 | from hive_compared_bq import _Table 24 | from google.cloud import bigquery 25 | 26 | 27 | class TBigQuery(_Table): 28 | """BigQuery implementation of the _Table object""" 29 | 30 | hash2_js_udf = '''create temp function hash2(text STRING) 31 | returns INT64 32 | LANGUAGE js AS """ 33 | if(text === null){ 34 | return 0 // same behaviour as in Hive 35 | } 36 | let myHash = 0 37 | for (let character of text){ 38 | myHash = myHash * 31 + character.charCodeAt(0) 39 | if (myHash >= 4294967296){ // because in Hive hash() is computed on integers range 40 | myHash = myHash % 4294967296 41 | } 42 | } 43 | if (myHash >= 2147483648){ 44 | myHash = myHash - 4294967296 45 | } 46 | return myHash 47 | """; 48 | ''' 49 | 50 | def __init__(self, database, table, parent, project): 51 | _Table.__init__(self, database, table, parent) 52 | 53 | self.project = project # the Google Cloud project where this dataset/table belongs.If Null, then the default 54 | # environment where this script is executed is used. 55 | self.connection = self._create_connection() 56 | 57 | # check that we can reach dataset and table 58 | dataset = self.connection.dataset(database) 59 | if not dataset.exists(): 60 | raise AttributeError("The dataset %s:%s does not seem to exist or is unreachable" % (project, database)) 61 | 62 | mytable = dataset.table(table) 63 | if not mytable.exists(): 64 | raise AttributeError("The table %s:%s.%s does not seem to exist or is unreachable" % 65 | (project, database, table)) 66 | 67 | def get_type(self): 68 | return "bigQuery" 69 | 70 | def _create_connection(self): 71 | """Connect to the table and return the connection object that we will use to launch queries""" 72 | if self.project is None: 73 | return bigquery.Client() 74 | else: 75 | return bigquery.Client(project=self.project) 76 | 77 | def get_ddl_columns(self): 78 | if len(self._ddl_columns) > 0: 79 | return self._ddl_columns 80 | else: 81 | dataset = self.connection.dataset(self.database) 82 | table = dataset.table(self.table) 83 | table.reload() 84 | schema = table.schema 85 | 86 | all_columns = [] 87 | for field in schema: 88 | col_name = str(field.name) 89 | col_type = str( 90 | field.field_type.lower()) # force 'str' to remove unicode notation and align it to Hive format 91 | # let's align the types with the ones in Hive 92 | if col_type == 'integer': 93 | col_type = 'bigint' 94 | my_dic = {"name": col_name, "type": col_type} 95 | all_columns.append(my_dic) 96 | 97 | self.filter_columns_from_cli(all_columns) 98 | 99 | return self._ddl_columns 100 | 101 | def get_column_statistics(self, query, selected_columns): 102 | for row in self.query(query): 103 | for idx, col in enumerate(selected_columns): 104 | value_column = row[idx] 105 | col["Counter"][value_column] += 1 106 | 107 | def create_sql_groupby_count(self): 108 | where_condition = "" 109 | if self.where_condition is not None: 110 | where_condition = "WHERE " + self.where_condition 111 | query = self.hash2_js_udf + "SELECT MOD( hash2( cast(%s as 
STRING)), %i) as gb, count(*) as count FROM %s %s " \ 112 | "GROUP BY gb ORDER BY gb" \ 113 | % (self.get_groupby_column(), self.tc.number_of_group_by, self.full_name, 114 | where_condition) 115 | logging.debug("BigQuery query is: %s", query) 116 | return query 117 | 118 | def create_sql_show_bucket_columns(self, extra_columns_str, buckets_values): 119 | where_condition = "" 120 | if self.where_condition is not None: 121 | where_condition = self.where_condition + " AND" 122 | gb_column = self.get_groupby_column() 123 | bq_query = self.hash2_js_udf + "SELECT MOD( hash2( cast(%s as STRING)), %i) as bucket, %s as gb, %s FROM %s " \ 124 | "WHERE %s MOD( hash2( cast(%s as STRING)), %i) IN (%s)" \ 125 | % (gb_column, self.tc.number_of_group_by, gb_column, extra_columns_str, 126 | self.full_name, where_condition, gb_column, self.tc.number_of_group_by, 127 | buckets_values) 128 | logging.debug("BQ query to show the buckets and the extra columns is: %s", bq_query) 129 | 130 | return bq_query 131 | 132 | def create_sql_intermediate_checksums(self): 133 | column_blocks = self.get_column_blocks(self.get_ddl_columns()) 134 | number_of_blocks = len(column_blocks) 135 | logging.debug("%i column_blocks (with a size of %i columns) have been considered: %s", number_of_blocks, 136 | self.tc.block_size, str(column_blocks)) 137 | 138 | # Generate the concatenations for the column_blocks 139 | bq_basic_shas = "" 140 | for idx, block in enumerate(column_blocks): 141 | bq_basic_shas += "TO_BASE64( sha1( concat( " 142 | for col in block: 143 | name = col["name"] 144 | bq_value_name = name 145 | if col["type"] == 'decimal': # removing trailing & unnecessary 'zero decimal' (*.0) 146 | bq_value_name = 'regexp_replace( %s, "\\.0$", "")' % name 147 | elif col["type"] == 'float' or col["type"] == 'double': 148 | bq_value_name = "cast( cast( FLOOR( %s * 10000) as INT64) as STRING)" % name 149 | elif not col["type"] == 'string': 150 | bq_value_name = "cast( %s as STRING)" % name 151 | bq_basic_shas += "CASE WHEN %s IS NULL THEN 'n_%s' ELSE %s END, '|'," % (name, name[:2], bq_value_name) 152 | bq_basic_shas = bq_basic_shas[:-6] + "))) as block_%i,\n" % idx 153 | bq_basic_shas = bq_basic_shas[:-2] 154 | 155 | where_condition = "" 156 | if self.where_condition is not None: 157 | where_condition = "WHERE " + self.where_condition 158 | 159 | bq_query = self.hash2_js_udf + "WITH blocks AS (\nSELECT MOD( hash2( cast(%s as STRING)), %i) as gb,\n%s\n" \ 160 | "FROM %s %s\n),\n" \ 161 | % (self.get_groupby_column(), self.tc.number_of_group_by, bq_basic_shas, 162 | self.full_name, where_condition) # 1st CTE with the basic block shas 163 | list_blocks = ", ".join(["block_%i" % i for i in range(number_of_blocks)]) 164 | bq_query += "full_lines AS(\nSELECT gb, TO_BASE64( sha1( concat( %s))) as row_sha, %s FROM blocks\n)\n" \ 165 | % (list_blocks, list_blocks) # 2nd CTE to get all the info of a row 166 | bq_list_shas = ", ".join(["TO_BASE64( sha1( STRING_AGG( block_%i, '|' ORDER BY block_%i))) as block_%i_gb " 167 | % (i, i, i) for i in range(number_of_blocks)]) 168 | bq_query += "SELECT gb, TO_BASE64( sha1( STRING_AGG( row_sha, '|' ORDER BY row_sha))) as row_sha_gb, %s FROM " \ 169 | "full_lines GROUP BY gb" % bq_list_shas # final query where all the shas are grouped by row-blocks 170 | logging.debug("##### Final BigQuery query is:\n%s\n", bq_query) 171 | 172 | return bq_query 173 | 174 | def delete_temporary_table(self, table_name): 175 | pass # The temporary (cached) tables in BigQuery are deleted after 24 hours 176 | 177 | def 
query(self, query): 178 | """Execute the received query in BigQuery and return an iterate Result object 179 | 180 | :type query: str 181 | :param query: query to execute in BigQuery 182 | 183 | :rtype: list of rows 184 | :returns: the QueryResults for this query 185 | """ 186 | logging.debug("Launching BigQuery query") 187 | q = self.connection.run_sync_query(query) 188 | q.timeout_ms = 600000 # 10 minutes to execute the BQ query should be more than enough. 1 minute was too short 189 | # TODO use maxResults https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query? : 190 | q.use_legacy_sql = False 191 | q.run() 192 | logging.debug("Fetching BigQuery results") 193 | return q.fetch_data() 194 | 195 | def query_ctas_bq(self, query): 196 | """Execute the received query in BigQuery and return the name of the cache results table 197 | 198 | This is the equivalent of a "Create Table As a Select" in Hive. The advantage is that BigQuery only keeps that 199 | table during 24 hours (we don't have to delete it just like in the case of Hive), and we're not charged for the 200 | space used. 201 | 202 | :type query: str 203 | :param query: query to execute in BigQuery 204 | 205 | :rtype: str 206 | :returns: the full name of the cache table (dataset.table) that stores those results 207 | 208 | :raises: IOError if the query has some execution errors 209 | """ 210 | logging.debug("Launching BigQuery CTAS query") 211 | job_name = "job_hive_compared_bq_%f" % time.time() # Job ID must be unique 212 | job = self.connection.run_async_query(job_name.replace('.', '_'), 213 | query) # replace(): Job IDs must be alphanumeric 214 | job.use_legacy_sql = False 215 | job.begin() 216 | time.sleep(3) # 3 second is the minimum latency we get in BQ in general. So no need to try fetching before 217 | retry_count = 300 # 10 minutes (because of below time sleep of 2 seconds). This should be enough 218 | while retry_count > 0 and job.state != 'DONE': 219 | retry_count -= 1 220 | time.sleep(2) 221 | job.reload() 222 | logging.debug("BigQuery CTAS query finished") 223 | 224 | if job.errors is not None: 225 | raise IOError("There was a problem in executing the query in BigQuery: %s" % str(job.errors)) 226 | 227 | cache_table = job.destination.dataset_name + '.' 
+ job.destination.name 228 | logging.debug("The cache table of the final comparison query in BigQuery is: " + cache_table) 229 | 230 | return cache_table 231 | 232 | def launch_query_dict_result(self, query, result_dic, all_columns_from_2=False): 233 | for row in self.query(query): 234 | if not all_columns_from_2: 235 | result_dic[row[0]] = row[1] 236 | else: 237 | result_dic[row[0]] = row[2:] 238 | logging.debug("All %i BigQuery rows fetched", len(result_dic)) 239 | 240 | def launch_query_csv_compare_result(self, query, rows): 241 | for row in self.query(query): 242 | line = "^ " + " | ".join([str(col) for col in row]) + " $" 243 | rows.append(line) 244 | logging.debug("All %i BigQuery rows fetched", len(rows)) 245 | 246 | def launch_query_with_intermediate_table(self, query, result): 247 | try: 248 | result["names_sha_tables"][self.get_id_string()] = self.query_ctas_bq(query) 249 | projection_gb_row_sha = "SELECT gb, row_sha_gb FROM %s" % result["names_sha_tables"][self.get_id_string()] 250 | self.launch_query_dict_result(projection_gb_row_sha, result["sha_dictionaries"][self.get_id_string()]) 251 | except: 252 | result["error"] = sys.exc_info()[1] 253 | raise 254 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # hive_compared_bq 2 | 3 | hive_compared_bq is a Python program that compares 2 (SQL like) tables, and graphically shows the rows/columns that are different if any are found. 4 | 5 | Comparing the full content of some tables in a scalable way has proved to be a difficult task. 6 | 7 | hive_compared_bq tackles this problem by: 8 | * Not moving the data, avoiding long data transfer typically needed for a "Join compared" approach. Instead, only the aggregated data is moved. 
9 | * Working on the full datasets and all the columns, so that we are 100% sure that the 2 tables are the same 10 | * Leveraging the tools (currently: Hive, BigQuery) to be as scalable as possible 11 | * Showing the developer in a graphical way the differences found, so that the developer can easily understands its mistakes and fix them 12 | 13 | # Table of contents 14 | * [Features](#features) 15 | * [Installation](#installation) 16 | + [Note Python version 2.6](#note-python-version-26) 17 | + [Installing Python 2.7](#installing-python-27) 18 | + [For Hive](#for-hive) 19 | - [Installation of the required UDF](#installation-of-the-required-udf) 20 | + [For Big Query](#for-big-query) 21 | * [Usage](#usage) 22 | + [Get Help](#get-help) 23 | + [Basic execution](#basic-execution) 24 | + [Explanation of results](#explanation-of-results) 25 | - [Case of identical tables](#case-of-identical-tables) 26 | - [Case of number of rows not matching](#case-of-number-of-rows-not-matching) 27 | - [Case of differences inside the rows](#case-of-differences-inside-the-rows) 28 | + [Advanced executions](#advanced-executions) 29 | - [Faster executions](#faster-executions) 30 | - [Skewing problem](#skewing-problem) 31 | - [Schema not matching](#schema-not-matching) 32 | - [HBase tables](#hbase-tables) 33 | - [Encoding differences between Hive and BigQuery](#encoding-differences-between-hive-and-bigquery) 34 | - [Problems in the selection of the GroupBy column](#problems-in-the-selection-of-the-groupby-column) 35 | * [Algorithm](#algorithm) 36 | + [Imprecision due to "float" of "double" types](#imprecision-due-to--float--of--double--types) 37 | 38 | ## Features 39 | 40 | * Engines supported: Hive, BigQuery (and HBase to some extent). In theory, it is easy to extend it to other SQL backends such as Spanner, CloudSQL, Oracle... Help is welcomed :) ! 41 | * Possibility to only select specific columns or to remove some of them (useful if the schema between the tables is not exactly the same, or if we know that some columns are different and we don't want them to "pollute" the results) 42 | * Possibility to just do a quick check (counting the rows in an advanced way) instead of complete checksum verification 43 | * Detection of skew 44 | 45 | ## Installation 46 | 47 | This software works with Mac (tested on 10.11 and 10.12) and on Linux (tested on Redhat 6). It is not expected to work on Windows. 48 | 49 | Make sure the following software is available: 50 | * Python = 2.7 or 2.6 (see [restrictions for 2.6](#note-python-version-26)) 51 | * pyhs2 (just for Hive) 52 | * google.cloud.bigquery (just for BigQuery) 53 | 54 | ### Note Python version 2.6 55 | BigQuery is not supported by this version of Python (Google Cloud SDK only supports Python 2.7) 56 | 57 | It is needed to install some backports for Python's collection. For instance, for a Redhat server it would be: 58 | ```bash 59 | yum install python-backport_collections.noarch 60 | ``` 61 | 62 | ### Installing Python 2.7 63 | On RHEL6, Python 2.6 was installed by default. 
And the RPM version in EPEL for Python2.7 had no RPM available, so I compiled it this way: 64 | ```bash 65 | su -c "yum install gcc gcc-c++ zlib-devel sqlite-devel openssl-devel" 66 | wget https://www.python.org/ftp/python/2.7.13/Python-2.7.13.tgz 67 | tar xvfz Python-2.7.13.tgz 68 | cd Python-2.7.13 69 | ./configure --with-ensurepip=install 70 | make 71 | su -c "make altinstall" 72 | su -c "echo 'export CLOUDSDK_PYTHON=/usr/local/bin/python2.7' >> /etc/bashrc" 73 | export CLOUDSDK_PYTHON=/usr/local/bin/python2.7 74 | ``` 75 | 76 | ### For Hive 77 | Execute this as "root": 78 | ```bash 79 | yum install cyrus-sasl-gssapi cyrus-sasl-devel 80 | pip install pyhs2 81 | ``` 82 | 83 | 84 | #### Installation of the required UDF 85 | 86 | On the Hadoop cluster where Hive executes, you must copy the library: `udf/hcbq.jar` (this jar contains 2 UDFs: one to compute the SHA1s, and the other to handle encoding translation). 87 | 88 | Place this Jar on a HDFS directory where you have read access (in the example below, we will consider that this jar has been copied in `/user/sluangsay/lib/hcbq.jar`). 89 | 90 | ### For Big Query 91 | 92 | (steps extracted from https://cloud.google.com/bigquery/docs/reference/libraries#client-libraries-install-python) 93 | 94 | To execute below commands, make sure that you already have a Google Cloud account with a BigQuery project created. 95 | 96 | ```bash 97 | mkdir -p ~/bin/googleSdk 98 | cd ~/bin/googleSdk 99 | su -c "pip install --upgrade google-cloud-bigquery" 100 | 101 | wget https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-170.0.0-linux-x86_64.tar.gz 102 | tar xvfz google-cloud-sdk-170.0.0-linux-x86_64.tar.gz 103 | 104 | ./google-cloud-sdk/bin/gcloud init 105 | ./google-cloud-sdk/bin/gcloud auth application-default login 106 | ./google-cloud-sdk/install.sh 107 | ``` 108 | 109 | This last command tells you to source 2 files in every session. So include them in your bash profile. For instance in my case it would be: 110 | ```bash 111 | echo "source ${HOME}/bin/googlesdk/google-cloud-sdk/path.bash.inc" >> ~/.bash_profile 112 | echo "source ${HOME}/bin/googlesdk/google-cloud-sdk/completion.bash.inc" >> ~/.bash_profile 113 | ``` 114 | Open a new bash session to activate those changes. 115 | 116 | ## Usage 117 | 118 | ### Get Help 119 | 120 | To see all the quick options of hive_compared_bq.py, just execute it without any arguments: 121 | ```bash 122 | python hive_compared_bq.py 123 | ``` 124 | 125 | When calling the help, you get more detailled on the different options and also some examples: 126 | ```bash 127 | python hive_compared_bq.py --help 128 | ``` 129 | 130 | ### Basic execution 131 | 132 | You must indicate the 2 tables to compare as arguments (the first table will be considered as the "source" table, and the second one as the "destination" table). 133 | 134 | Each table must have the following format: `type`/`database`.`table` 135 | where: 136 | * `type` is the technology of your database (currently, only 'hive' and 'bq' (BigQuery) are supported) 137 | * `database` is the name of your database (also called "dataset" in BigQuery) 138 | * `table` is of course the name of your table 139 | 140 | About the location of those databases: 141 | * In the case of BigQuery, the default Google Cloud project configured in your environment is selected.
142 | If you want to specify another project, you must indicate it through the `project` parameter of the `-s` or `-d` options. 143 | * In the case of Hive, you must specify the hostname of the HiveServer2, using the `hs2` parameter of the `-s` or `-d` options. 144 | 145 | Another note for Hive: you need to pass the HDFS location of the jar of the required UDF (see the Hive installation section above), using the 'jar' option. 146 | 147 | To clarify all the above, let's consider that we want to compare the following 2 tables: 148 | * A Hive table called `hive_compared_bq_table`, inside the database `sluangsay`.
149 | With those parameters, the argument to give is: `hive/sluangsay.hive_compared_bq_table`.
150 | Let's suppose also that the hostname of HiveServer2 is `master-003.bol.net`, and that we installed the required Jar in the HDFS path 'hdfs://hdp/user/sluangsay/lib/hcbq.jar'. Then, since this table is the first one on our command line, it is the source table and we need to define the option: `-s "{'jar': 'hdfs://hdp/user/sluangsay/lib/hcbq.jar', 'hs2': 'master-003.bol.net'}"` 151 | * A BigQuery table also alled `hive_compared_bq_table`, inside the dataset `bidwh2`. 152 | 153 | To compare above 2 tables, you need to execute: 154 | ```bash 155 | python hive_compared_bq.py -s "{'jar': 'hdfs://hdp/user/sluangsay/lib/hcbq.jar', 'hs2': 'master-003.bol.net'}" hive/sluangsay.hive_compared_bq_table bq/bidwh2.hive_compared_bq_table 156 | ``` 157 | 158 | ### Explanation of results 159 | 160 | #### Case of identical tables 161 | 162 | When executing above command, if the 2 tables have exactly the same data, then we would get an output similar to: 163 | 164 | ``` 165 | [INFO] [2017-09-15 10:59:20,851] (MainThread) Analyzing the columns ['rowkey', 'calc_timestamp', 'categorization_step', 'dts_modified', 'global_id', 'product_category', 'product_group', 'product_subgroup', 'product_subsubgroup', 'unit'] with a sample of 10000 values 166 | [INFO] [2017-09-15 10:59:22,285] (MainThread) Best column to do a GROUP BY is rowkey (occurrences of most frequent value: 1 / the 50 most frequentvalues sum up 50 occurrences) 167 | [INFO] [2017-09-15 10:59:22,286] (MainThread) Executing the 'Group By' Count queries for sluangsay.hive_compared_bq_table (hive) and bidwh2.hive_compared_bq_table (bigQuery) to do first comparison 168 | No differences were found when doing a Count on the tables sluangsay.hive_compared_bq_table and bidwh2.hive_compared_bq_table and grouping by on the column rowkey 169 | [INFO] [2017-09-15 10:59:48,727] (MainThread) Executing the 'shas' queries for hive_sluangsay.hive_compared_bq_table and bigQuery_bidwh2.hive_compared_bq_table to do final comparison 170 | Sha queries were done and no differences were found: the tables hive_sluangsay.hive_compared_bq_table and bigQuery_bidwh2.hive_compared_bq_table are equal! 171 | ``` 172 | 173 | The first 2 lines describe the first step of the algorithm (see the explanation of the algorithm later), and we see that `rowkey` is the column that is here used to perform the `Group By` operations.
174 | Then, the next 2 lines show the second step: we count the number of rows for each value of `rowkey`. In this case, the counts match for the 2 tables.
175 | Finally, we do the extensive computation of the SHAs for all the columns and rows of the 2 tables. At the end, the script tells us that our 2 tables are identical.
176 | We can also check the return value of the script: it is 0, which means no differences. 177 | 178 | #### Case of number of rows not matching 179 | 180 | The first difference that can be observed is that the number of rows associated with (at least) one GroupBy value does not match between the 2 tables.
181 | In such case, the standard output would be similar to: 182 | 183 | ``` 184 | [INFO] [2017-09-18 08:09:06,947] (MainThread) Analyzing the columns ['rowkey', 'calc_timestamp', 'categorization_step', 'dts_modified', 'global_id', 'product_category', 'product_group', 'product_subgroup', 'product_subsubgroup', 'unit'] with a sample of 10000 values 185 | [INFO] [2017-09-18 08:09:07,739] (MainThread) Best column to do a GROUP BY is rowkey (occurrences of most frequent value: 1 / the 50 most frequentvalues sum up 50 occurrences) 186 | [INFO] [2017-09-18 08:09:07,739] (MainThread) Executing the 'Group By' Count queries for sluangsay.hive_compared_bq_table2 (hive) and bidwh2.hive_compared_bq_table2 (bigQuery) to do first comparison 187 | [INFO] [2017-09-18 08:09:35,392] (MainThread) We found at least 3 differences in Group By count 188 | ``` 189 | 190 | And the return value of the script would be 1. 191 | 192 | Some of the differences are also shown in a HTML file that is automatically opened in your browser: 193 | 194 | ![alt text](docs/images/differences_count.png?raw=true "Differences in GroupBy numbers") 195 | 196 | This shows a table, with the rows of 1 table on the left side, and the ones of the other table on the right side.
197 | The names of the 2 tables appear at the top of that table.
198 | There we can also see the names of the columns that are shown.
199 | For performance reasons and for the sake of brevity, only 7 columns are shown.
200 | The first 2 columns are "special columns": the GroupBy column (2nd column), preceded by its SHA1 value (1st column).
201 | In this example, we can see that only rows on the left side appear: this is because the table on the right does not contain rows that contain the rowkeys 21411000029, 65900009 and 6560009. 202 | 203 | #### Case of differences inside the rows 204 | 205 | If the numbers of rows match it is still possible to observe some differences in the last (SHA1) step. In which case, we would get some message similar to: 206 | 207 | ``` 208 | [INFO] [2017-09-28 05:53:55,890] (MainThread) Analyzing the columns ['rowkey', 'calc_timestamp', 'categorization_step', 'dts_modified', 'global_id', 'product_category', 'product_group', 'product_subgroup', 'product_subsubgroup', 'unit'] with a sample of 10000 values 209 | [INFO] [2017-09-28 05:53:56,897] (MainThread) Best column to do a GROUP BY is rowkey (occurrences of most frequent value: 1 / the 50 most frequentvalues sum up 50 occurrences) 210 | [INFO] [2017-09-28 05:53:56,897] (MainThread) Executing the 'shas' queries for hive_sluangsay.hive_compared_bq_table3 and bigQuery_bidwh2.hive_compared_bq_table3 to do final comparison 211 | [INFO] [2017-09-28 05:54:26,961] (MainThread) We found 2 differences in sha verification 212 | Showing differences for columns product_category ,product_group ,product_subgroup ,product_subsubgroup ,unit 213 | ``` 214 | 215 | The return value of the script would be 1. 216 | 217 | Some of the differences are also shown in a HTML file that is automatically opened in your browser: 218 | 219 | ![alt text](docs/images/differences_sha.png?raw=true "Differences in SHA") 220 | 221 | We can see some similar information as the one exposed for differences in the [Count validation](#case-of-number-of-rows-not-matching).
222 | The 7 columns are: first the 2 "special columns", then the 5 columns of a "column block" (see [algorithm](#algorithm) for explanations) which contains some differences. That means that at least 1 of those 5 columns has at least 1 differing value.
223 | In our example, we can see that the 2 rows have some differences in the column `product_subgroup`. Those differences are highlighted in yellow. 224 | 225 | If there are several "column blocks" that have some differences, then the program will first show the column block that contains the most "row blocks" with differences (take care: that does not mean that it is the column block with the most differences. Indeed, we could have a column block with just 1 row block with differences, but that single row block could contain thousands of rows with differences. On the other hand, another column block could have 2 row blocks containing differences, with each row block containing a single differing row).
226 | Then after, the program will ask you if you wish to see another column block with differences. 227 | 228 | ### Advanced executions 229 | 230 | #### Faster executions 231 | 232 | By default, the script executes 3 steps (see the explanation of how the algorithm executes). 233 | You can run the script faster by removing some steps that you think are unnecessary: 234 | 235 | * with the `--group-by-column` option, you can indicate directly which column you want to use for the Group By. 236 | That means that the first step of analyzing some sample of 10 columns won't be needed. 237 | This step won't usually make a huge difference in the execution time (it usually takes 1-2 seconds), but by doing this you avoid launching a query that might cost you some money (in the case of BigQuery for example). 238 | With this option, you might also be able to provide a better column than the one the script would have discovered by itself, which might speed up the following queries (by avoiding skew for instance, see notes later). 239 | 240 | * with the `--just-count` option, you say that you just want to do a 'count rows validation'. 241 | This is some kind of "basic validation" because you won't be sure that the contents of the rows have identical values. 242 | But that allows you to avoid the final "full SHAs validation" step, which is more expensive and time-consuming to compute. 243 | And maybe this 'count' validation is enough for you now, because you are at an early stage of development and you don't need to be 100% sure of your data 244 | (checking the count of rows is also a good idea to double check whether some JOINs or some Filter conditions work properly). 245 | Don't forget to eventually run a full SHAs validation when you finish developing. 246 | 247 | * with `--just-sha`, you specify that you don't need the 'count' validation. If you know from previous executions that the counts are correct, then you might indeed decide to skip that previous step. 248 | However, it is a bit at your own risk, because if the counts are not correct, the script will fail but you will have executed a more complex/costly query for that (the 'count' validation uses faster/cheaper queries). 249 | 250 | Another way to have your validation executed faster is to limit the scope of your validations. If you decide to validate less data, then you need to process less data, meaning that your queries will be faster/cheaper: 251 | 252 | * for instance, you might be interested in just validating some specific critical columns (maybe because you know that your ETL process does not make any changes on some columns, so why "validate" them?). 253 | In such case, you can use the `--column-range`, `--columns` or `--ignore-columns` options. 254 | 255 | * another example would be just validating some specific partitions. If your data is partitioned by days for instance, then you might decide to only validate 1 day of data. 256 | In such case you have to use the `--source-where` and `--destination-where` options to specify a Where condition that tells which partition you want to consider. 257 | 258 | #### Skewing problem 259 | 260 | The program does several queries with some GroupBy operations. As for every GroupBy operation on a huge volume of data, skew can be a performance killer, or can even make the query fail because of a lack of resources. 261 | 262 | The "count comparison" step (see the description of the algorithm to know more about the steps) does not do any complex computation, so skew is annoying but should not be a very big issue.
263 | However, the "SHA1 comparison" step launches some heavy queries, and there skew can make a huge difference.
264 | In our algorithm, skew is determined by the distribution of the GroupBy column. This is why the selection of this column in the first step of the program is crucial, and why specifying yourself a column with a good distribution can make a huge difference.
265 | For the above reasons, a skew detection is performed during the "count comparison" step. If a "groupBy value" appears in more than 40 000 rows (you may change this default value with the '--skew-threshold' option), then this groupBy value is registered and a warning message like this appears: 266 | ``` 267 | Some important skew (threshold: %i) was detected in the Group By column x. The top values are: 268 | ``` 269 | Following that message, the list of the top 10 skewed values (and their number of occurrences) is printed, so that developers know they should avoid selecting that GroupBy column the next time they run the program. 270 | It also gives them the opportunity to wonder whether it is normal for this dataset to contain such skewed values. 271 | 272 | If the "count comparison" step has not encountered any error but some skew has been discovered, then the program also stops, with the following message: 273 | ``` 274 | No difference in Group By count was detected but we saw some important skew that could make the next step (comparison of the shas) very slow or failing. So better stopping now. You should consider choosing another Group By column with the '--group-by-column' option 275 | ``` 276 | As explained before, stopping at this stage is a protection against launching some heavy/costly queries that have a high probability of failing.
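If you want to check by hand how skewed a candidate GroupBy column is before re-running the script, a simple query along these lines can help (just a sketch: the table and column names are hypothetical):
```
SELECT customer_id, COUNT(*) AS cnt FROM my_db.my_table GROUP BY customer_id ORDER BY cnt DESC LIMIT 10
```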
277 | Should you face this situation, your best option is to specify a GroupBy column with a better distribution via the `--group-by-column` option. Another possibility is to raise the threshold with `--skew-threshold`: in that case you accept and understand the risk of launching the SHA1 computations with these skewed values. 278 | 279 | #### Schema not matching 280 | 281 | To do the comparison, the program first needs to discover the schemas of the tables. What is actually done is fetching the schema of the "source table" and assuming that the "destination table" has the same schema. 282 | 283 | If the schemas of the 2 tables don't match, then by default it is not possible to compare them.
284 | This can easily happen for instance with partitioned tables in BigQuery, where the partition column is called `_PARTITIONTIME` and may not match the name of the corresponding column in Hive.
285 | To overcome this, there are 2 possibilities: 286 | * create some views (in Hive or BigQuery) to rename or remove some columns. Take care not to make such a view too complex, otherwise that could lower the validity of the comparison. 287 | * limit the columns you want to compare, using the options `--column-range`, `--columns`, `--ignore-columns`. The problem with this approach is that you cannot rename columns. 288 | 289 | #### HBase tables 290 | 291 | It is also possible to do the comparison with HBase tables.
292 | To do so, you need to create a Hive table with an HBase backend (see the documentation: https://cwiki.apache.org/confluence/display/Hive/HBaseIntegration ).
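Such a mapping could look like the following (a sketch only: the table, column family and column names are hypothetical, and the `hbase.columns.mapping` must be adapted to your own HBase schema):
```
CREATE EXTERNAL TABLE my_db.hbase_orders (key STRING, amount DOUBLE, status STRING)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,d:amount,d:status")
TBLPROPERTIES ("hbase.table.name" = "orders");
```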
293 | Be aware that Hive on top of HBase has some limitations, so you won't be able to check all the data in your HBase tables (for instance: you cannot see the timestamps or older versions of a cell, and all the columns must be properly defined in the Hive schema). 294 | 295 | #### Encoding differences between Hive and BigQuery 296 | 297 | All the data that is internally stored by BigQuery is automatically saved with a UTF8 encoding. That can give some problems when some strings in Hive use another encoding: the byte representations of a column may not match even if the 'texts' are the same.
298 | To overcome this, a UDF has been developed in Hive in order to transform some specific columns from a CP1252 encoding into Google's UTF8 encoding.
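If you want to experiment with this UDF by hand in Hive (the script itself can be pointed at the jar through the 'jar' entry of its Hive table options), the registration would look roughly like this; the function name and jar path below are hypothetical choices:
```
ADD JAR hdfs:///some/path/hcbq.jar;
CREATE TEMPORARY FUNCTION decode_cp1252 AS 'org.apache.hadoop.hive.ql.udf.generic.GenericUDFDecodeCP1252';
SELECT decode_cp1252(description) FROM my_db.my_table LIMIT 10;
```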
299 | Use the `--decodeCP1252-columns` option to specify those columns. Take care: this code is quite experimental and does not guarantee that the "translation of encoding" will always work. 300 | 301 | #### Problems in the selection of the GroupBy column 302 | 303 | The first step of the algorithm tries to estimate the best "GroupBy" column (see the explanations about the Algorithm below). 304 | However, this estimation can go wrong, and in that case the following message appears: 305 | ``` 306 | Error: we could not find a suitable column to do a Group By. Either relax the selection condition with the '--max-gb-percent' option or directly select the column with '--group-by-column' 307 | ``` 308 | 309 | If you face this problem, then just follow one of the 2 solutions proposed in the message above.
310 | The best solution is obviously to have some knowledge about the data and to tell the script directly which column is best suited for the GroupBy, using the `--group-by-column` option.
311 | The other possibility is to use the `--max-gb-percent` option and make it higher than the default value (1%), in order to accept a less homogeneous distribution in the data of the GroupBy column and thus increase the probability of finding a suitable GroupBy column. 312 | 313 | ## Algorithm 314 | 315 | The goal of hive_compared_bq was to avoid the shortcomings of previous approaches that tried to solve the same comparison problem. 316 | Some of those approaches are: 317 | * Manual check, comparing a few rows and counting the total number of rows.
318 | The problem with this approach is obviously that it does not work beyond a certain amount of data, or with tables that have a lot of columns. So the check cannot be complete. 319 | * Doing some "JOIN" comparisons between the 2 tables.
320 | The drawback of this is that Joins are quite slow operations in general. What is more, it is very difficult to do a fair comparison when there are multiple rows with the same "Join column" value. 321 | The last inconvenience is that you have to ensure that the 2 tables are located in the same place, which might force you to transfer the data from 1 database to another (e.g. from a Hive database to a BigQuery database), which takes a long time if the table is huge. 322 | * Sorting all the rows and doing a Diff between them (just like described here: https://community.hortonworks.com/articles/1283/hive-script-to-validate-tables-compare-one-with-an.html ).
323 | The inconvenience with this is also that the 2 tables have to be located in the same place. What is more, the "Diff" operation occurs in memory, which means that this approach does not work with huge tables. 324 | 325 | To solve the above problems, it was decided to: 326 | * develop an algorithm that leverages the BigData backends (Hive, BigQuery or others): all the complicated computations must be performed on those engines, which naturally scale.
327 | That means that the goal of the algorithm is to generate some complex SQL queries that are sent to those backends, and then to compare the results of those queries. 328 | Those results must be small, otherwise the Python program would not be able to do the final comparison in memory. 329 | * compare all the rows and all the columns of the tables.
330 | To ensure that we get a relatively small amount of results (see the reason just above) out of a huge amount of data, we need to "reduce" the results. 331 | Using a checksum (SHA1) algorithm seems appropriate because it helps in doing that reduction while giving a high confidence in the validity of the comparison. 332 | 333 | In summary, the idea of the program is to compute, locally to each table, the checksums of all the rows/columns, to "Group By" those checksums and to generate some global checksums on top of them.
334 | And then, all those general checksums are transferred back to the program, where the final comparison is made and where potential differences are shown to the user. 335 | The following simplified pseudo SQL query summarizes this idea a bit: 336 | 337 | WITH blocks AS ( 338 | SELECT MOD( hash2( column), 100000) as gb, sha1(concat( col0, col1, col2, col3, col4)) as block_0, 339 | sha1(concat( col5, col6, col7, col8, col9)) as block_1, ... as block_N FROM table 340 | ), 341 | full_lines AS ( 342 | SELECT gb, sha1(concat( block_0, '|', block_1...)) as row_sha, block_0, block_1 ... FROM blocks 343 | ) 344 | SELECT gb, sha1(concat(list)) as sline, sha1(concat(list)) as sblock_1, 345 | sha1(concat(list)) as sblock_2 ... as sblock_N FROM full_lines GROUP BY gb 346 | 347 | The real query is obviously a bit more complex, because we need to take care of type conversions, NULL values and the ordering of the rows for the same GroupBy values.
348 | If you are interested in seeing the real query, launch the script with the `--verbose` option. 349 | 350 | The result of the above query is stored in a temporary table. The size of this table is small (because we take a modulo of the hash of the groupBy column, and because the number of columns is limited).
351 | Therefore, because this amount of data is reasonable, we can allow ourselves to download those results locally, where our Python program is executed. 352 | 353 | In the 2 temporary tables, the first 2 columns (gb and sline) are the most important.
354 | "gb" is used as a kind of "index". "sline" is the total checksum that 355 | corresponds to all the columns of all the rows whose GroupBy value modulo 100000 matches the "gb" value.
356 | If "sline" differs between the 2 tables for a same "gb" value, then we know that some rows in that "GroupBy-modulo block" don't match between the 2 tables.
357 | Then, the program queries the temporary tables again and asks for all the columns of the specific "gb" values that showed a difference.
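Conceptually, that second pass is just a query like the following against each temporary table (the table name and the bucket values below are placeholders):
```
SELECT * FROM tmp_sha_table WHERE gb IN (42, 1337)
```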
358 | The idea is to discover which "blocks of columns" (called "sblock_N" in the pseudo query shown earlier) are different.
359 | By doing so, we are able to know not only which rows are different, but also in which columns those differences appear.
360 | This allows us to launch some final simple SELECT queries against the original tables, fetching those rows and those specific columns.
361 | Finally, those 2 small datasets are sorted and a diff of them is shown in a web browser. 362 | 363 | The pseudo SQL query shown above is quite heavy to compute, and has to access all the data of each table. 364 | In order to avoid launching such a heavy query in case of "stupid mistakes" (maybe your ETL flow failed and 1 table is empty), a quick comparison step has been introduced before it. 365 | In general, we can divide the program into 3 steps: 366 | 367 | 1. **Determination of the GroupBy column**
368 | In this first step, we try to assess which column is the most suitable to be used as the GroupBy column.
369 | It is important to find a column that has many different values (otherwise, the next queries will be poorly parallelized, and we can have some OOM problems).
370 | We also want to avoid skewed columns.
371 | To do so, the program fetches a sample of data from 1 table: some 10 000 rows and some 10 columns. We only look at 10 columns for 3 reasons: 1) we suppose that it is enough to find a good column there 2) to limit the amount of data transferred over the network 3) to avoid being billed too much (for instance, the price in BigQuery grows with the number of columns being read).
372 | Finally, an analysis is done on those columns: we discard all the columns that don't have enough diversity or that are too skewed, and from the remaining columns we keep the one that seems the least skewed. 373 | Working on a sample has its limits, so it is possible that the chosen column is not the best one. It is also possible that all 10 columns get discarded.
374 | In both situations you can use the `--group-by-column` option to overcome that problem. 375 | 376 | 2. **Quick count check**
377 | The program does a first validation to quickly check, in a lightweight way, whether there are some big differences between the 2 tables.
378 | This is why we just try to count the number of rows in the tables. Rather than only counting the total number of rows, we check the number of rows for each "row-bucket", that is: each group of rows whose hashed GroupBy value modulo 100000 is the same.
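In the same pseudo SQL spirit as the checksum query shown earlier, this count check boils down to something like the following (here `id` stands for the chosen GroupBy column and `hash2` for the hash function of the backend):
```
SELECT MOD( hash2( id), 100000) as gb, count(*) as cnt FROM table GROUP BY gb
```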
379 | This verification is fast because it only needs to do some small computation on just 1 column. There is indeed no point in doing a very complex computation on all the columns if the numbers of rows do not even match.
380 | In that case, the differences are shown to the user and the program stops, without executing the 3rd step.
381 | During this step, a skew detection is also performed, which is a protection for the 3rd step. 382 | 383 | 3. **Full SHA1 validation**
384 | This step, described earlier in this section, is the only one that can guarantee that the 2 tables are identical. 385 | 386 | ### Imprecision due to "float" or "double" types 387 | 388 | The representations of decimal numbers are not always exact, and small differences in their representations can sometimes be observed between Hive and BigQuery, meaning that the corresponding SHA1 values will be totally different.
389 | For instance, the following query in BigQuery returns `8.5400000000000009`: 390 | ``` 391 | select cast( cast( 8.540000000000001 as FLOAT64 ) as STRING) 392 | ``` 393 | And in Hive we would get `8.540000000000001`. 394 | 395 | To overcome those problems, it was decided to: 396 | * multiply any float or double number by 10000 (most of the decimal numbers managed in Bol.com are 'price numbers' with just 2 digits, so multiplying by 10000 should give enough precision). 397 | * round the resulting number with floor() in order to get rid of any "decimal imprecision" 398 | * cast the result into an integer (otherwise the resulting number may or may not end up with a '.0', depending on whether it is Hive or BigQuery) 399 | 400 | The above approach works in many cases, but we understand that it is not a general solution (for instance, no distinction would be made between '0.0000001' and '0').
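In SQL terms, the normalization described in the list above boils down to something like this (a sketch only: `price` is a hypothetical double column, and the exact functions and cast types generated by the script may differ per backend, e.g. BIGINT in Hive versus INT64 in BigQuery):
```
SELECT CAST( FLOOR( price * 10000) AS BIGINT) FROM my_table
```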
401 | We may thus need in the future to add other possibilities to solve those imprecision problems. 402 | -------------------------------------------------------------------------------- /hive_compared_bq/hive_compared_bq.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Copyright 2017 bol.com. All Rights Reserved 4 | 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | """ 18 | 19 | import argparse 20 | import ast 21 | import logging 22 | import threading 23 | import difflib 24 | import re 25 | import sys 26 | import webbrowser 27 | from abc import ABCMeta, abstractmethod 28 | 29 | if sys.version_info[0:2] == (2, 6): 30 | # noinspection PyUnresolvedReferences 31 | from backport_collections import Counter 32 | else: 33 | from collections import Counter 34 | 35 | ABC = ABCMeta('ABC', (object,), {}) # compatible with Python 2 *and* 3 36 | 37 | 38 | class _Table(ABC): 39 | """Represent an abstract table that contains database connection and the related SQL executions 40 | 41 | :type database: str 42 | :param database: Name of the database 43 | 44 | :type table: str 45 | :param table: Name of the table 46 | 47 | :type tc: :class:`TableComparator` 48 | :param tc: reference to the parent object, in order to access the configuration 49 | 50 | :type column_range: str 51 | :param column_range: Python array range that represents the range of columns in the DDL that we want to compare 52 | 53 | :type chosen_columns: str 54 | :param chosen_columns: list of the (str) columns we want to focus on 55 | 56 | :type ignore_columns: str 57 | :param ignore_columns: list of the (str) columns we want to ignore 58 | 59 | :type decodeCP1252_columns: str 60 | :param decodeCP1252_columns: list of the (str) columns that are encoded in CP1252 and we want to transform into 61 | Google's UTF8 encoding 62 | 63 | :type where_condition: str 64 | :param where_condition: boolean SQL condition that will be added to the WHERE condition of all the queries for that 65 | table, so that we can restrict the analysis to a specific scope. Also quite useful when 66 | we want to work on specific partitions. 67 | 68 | :type full_name: str 69 | :param full_name: the full name of the table: . 70 | """ 71 | 72 | __metaclass__ = ABCMeta 73 | 74 | def __init__(self, database, table, parent, *args, **kwargs): 75 | self.database = database 76 | self.table = table 77 | self.tc = parent 78 | self.column_range = ":" 79 | self.chosen_columns = None 80 | self.ignore_columns = None 81 | self.decodeCP1252_columns = [] 82 | self.where_condition = None 83 | self.full_name = self.database + '.' 
+ self.table 84 | self._ddl_columns = [] # array instead of dictionary because we want to maintain the order of the columns 85 | self._ddl_partitions = [] # take care, those rows also appear in the columns array 86 | self._group_by_column = None # the column that is used to "bucket" the rows 87 | 88 | @staticmethod 89 | def check_stdin_options(typedb, stdin_options, allowed_options, compulsory_options): 90 | """Validate the options entered, given those allowed and those compulsory 91 | 92 | :type typedb: str 93 | :param typedb: the type of the database ({hive,bq}) 94 | 95 | :type stdin_options: str 96 | :param stdin_options: the options entered on command line for this table, given in a Python dictionary format 97 | 98 | :type allowed_options: list of str 99 | :param allowed_options: the list of options that can be used for this table 100 | 101 | :type compulsory_options: dict 102 | :param compulsory_options: dictionary where the keys are the name of the compulsory options, and the value 103 | is a small description of it 104 | 105 | :rtype: dict 106 | :return: a dictionary of the options entered by the user 107 | 108 | :raises: ValueError if the option entered is invalid 109 | """ 110 | hash_options = {} 111 | if stdin_options is not None: 112 | try: 113 | hash_options = ast.literal_eval(stdin_options) 114 | except: 115 | raise ValueError("The option must be in a Python dictionary format (just like: {'jar': " 116 | "'hdfs://hdp/user/sluangsay/lib/hcbq.jar', 'hs2': 'master-003.bol.net'}\nThe value " 117 | "received was: %s" % stdin_options) 118 | 119 | for key in hash_options: 120 | if key not in allowed_options: 121 | raise ValueError("The following option for %s is not supported: %s\nThe only supported options" 122 | " are: %s" % (typedb, key, allowed_options)) 123 | 124 | # Needs to be checked even if the user has given no option 125 | for key in compulsory_options: 126 | if key not in hash_options: 127 | raise ValueError("%s option (%s) must be defined for %s tables" % (key, compulsory_options[key], 128 | typedb)) 129 | 130 | return hash_options 131 | 132 | @staticmethod 133 | def create_table_from_string(argument, options, table_comparator): 134 | """Parse an argument (usually received on the command line) and return the corresponding Table object 135 | 136 | :type argument: str 137 | :param argument: description of the table to connect to. Must have the format /.
138 | type can be {hive,bq} 139 | 140 | :type options: str 141 | :param options: the dictionary of all the options for this table connection. Could be for instance: 142 | "{'jar': 'hdfs://hdp/user/sluangsay/lib/hcbq.jar', 'hs2': 'master-003.bol.net'}" 143 | 144 | :type table_comparator: :class:`TableComparator` 145 | :param table_comparator: the TableComparator parent object, to have a reference to the configuration properties 146 | 147 | :rtype: :class:`_Table` 148 | :returns: the _Table object that corresponds to the argument 149 | 150 | :raises: ValueError if the argument is void or does not match the format 151 | """ 152 | match = re.match(r'(\w+)/(\w+)\.(\w+)', argument) 153 | if match is None: 154 | raise ValueError("Table description must follow the following format: '/.
'") 155 | 156 | typedb = match.group(1) 157 | database = match.group(2) 158 | table = match.group(3) 159 | 160 | if typedb == "bq": 161 | hash_options = _Table.check_stdin_options(typedb, options, ["project"], {}) 162 | from bq import TBigQuery 163 | return TBigQuery(database, table, table_comparator, hash_options.get('project')) 164 | elif typedb == "hive": 165 | hash_options = _Table.check_stdin_options(typedb, options, ["jar", "hs2"], {'hs2': 'Hive Server2 hostname'}) 166 | from hive import THive 167 | return THive(database, table, table_comparator, hash_options['hs2'], hash_options.get('jar')) 168 | else: 169 | raise ValueError("The database type %s is currently not supported" % typedb) 170 | 171 | @abstractmethod 172 | def get_type(self): 173 | """Return the (string) type of the database (Hive, BigQuery)""" 174 | pass 175 | 176 | def get_id_string(self): 177 | """Return a string that fully identifies the table""" 178 | return self.get_type() + "_" + self.full_name 179 | 180 | def set_where_condition(self, where): 181 | """the WHERE condition we want to apply for the table. Could be useful in case of partitioned tables 182 | 183 | :type where: str 184 | :param where: the WHERE condition. Example: datedir="2017-05-01" 185 | """ 186 | self.where_condition = where 187 | 188 | def set_column_range(self, column_range): 189 | self.column_range = column_range 190 | 191 | def set_chosen_columns(self, cols): 192 | self.chosen_columns = cols.split(",") 193 | 194 | def set_ignore_columns(self, cols): 195 | self.ignore_columns = cols.split(",") 196 | 197 | def set_decode_cp1252_columns(self, cols): 198 | self.decodeCP1252_columns = cols.split(",") 199 | 200 | def set_group_by_column(self, col): 201 | self._group_by_column = col 202 | 203 | @abstractmethod 204 | def get_ddl_columns(self): 205 | """ Return the columns of this table 206 | 207 | The list of the column is an attribute of the class. If it already exists, then it is directly returned. 208 | Otherwise, a connection is made to the database to get the schema of the table and at the same time the 209 | attribute (list) partition is filled. 210 | 211 | :rtype: list of dict 212 | :returns: list of {"name": "type"} dictionaries that represent the columns of this table 213 | """ 214 | pass 215 | 216 | def get_groupby_column(self): 217 | """Return a column that seems to have a good distribution in order to do interesting GROUP BY queries with it 218 | 219 | This _group_by_column is an attribute of the class. If it already exists, then it is directly returned. 220 | Otherwise, a small query to get some sample rows on some few columns is performed, in order to evaluate 221 | which of those columns present the best distribution (we want a column that will have as many Group By values 222 | as possible, and that avoid a bit the skew, so that when we detect a specific difference on a bucket, we will 223 | be able to show a number of lines for this bucket not too big). The found column is then saved as the attribute 224 | and returned. 
225 | 226 | :rtype: str 227 | :returns: the column that will be used in the Group By 228 | """ 229 | if self._group_by_column is not None: 230 | return self._group_by_column 231 | 232 | query, selected_columns = self.get_sample_query() 233 | 234 | # Get a sample from the table and fill Counters to each column 235 | logging.info("Analyzing the columns %s with a sample of %i values", str([x["name"] for x in selected_columns]), 236 | self.tc.sample_rows_number) 237 | for col in selected_columns: 238 | col["Counter"] = Counter() # col: {"name","type"} dictionary. New "counter" key is to track distribution 239 | 240 | self.get_column_statistics(query, selected_columns) 241 | self.find_best_distributed_column(selected_columns) 242 | 243 | return self._group_by_column 244 | 245 | @abstractmethod 246 | def get_column_statistics(self, query, selected_columns): 247 | """Launch the sample query and register the distribution of the selected_columns in the "Counters" 248 | 249 | :type query: str 250 | :param query: SQL query that gets a sample of rows with only the selected columns 251 | 252 | :type selected_columns: list of dict 253 | :param selected_columns: list of the few columns selected in the sample query. It has the format: 254 | {"name": col_name, "type": col_type, "Counter": frequency_of_values} 255 | """ 256 | pass 257 | 258 | def find_best_distributed_column(self, selected_columns): 259 | """Look at the statistics from the sample to estimate which column has the best distribution to do a GROUP BY 260 | 261 | The best column is automatically saved in the attribute _group_by_column. If all the selected columns have a 262 | poor distribution, then we exit with error. 263 | We say that we have a poor distribution if the most frequent value of a column is superior than 264 | max_frequent_number. 265 | Then, the best column is the one that has not a poor distribution, and whose sum of occurrences of the 50 most 266 | frequent values is minimal. 267 | 268 | :type selected_columns: list of dict 269 | :param selected_columns: list of the few columns selected in the sample query. 
It has the format: 270 | {"name": col_name, "type": col_type, "Counter": frequency_of_values} 271 | """ 272 | max_frequent_number = self.tc.sample_rows_number * self.tc.max_percent_most_frequent_value_in_column // 100 273 | current_lowest_weight = sys.maxint 274 | highest_first = max_frequent_number 275 | 276 | for col in selected_columns: 277 | highest = col["Counter"].most_common(1)[0] 278 | value_most_popular = highest[0] 279 | frequency_most_popular_value = highest[1] 280 | if frequency_most_popular_value > max_frequent_number: 281 | logging.debug( 282 | "Discarding column '%s' because '%s' was found in sample %i times (higher than limit of %i)", 283 | col["name"], value_most_popular, frequency_most_popular_value, max_frequent_number) 284 | continue 285 | # The biggest value is not too high, so let's see how big are the 50 biggest values 286 | weight_of_most_frequent_values = sum([x[1] for x in col["Counter"] 287 | .most_common(self.tc.number_of_most_frequent_values_to_weight)]) 288 | logging.debug("%s: sum up of the %i most frequent occurrence: %i", col["name"], 289 | self.tc.number_of_most_frequent_values_to_weight, weight_of_most_frequent_values) 290 | if weight_of_most_frequent_values < current_lowest_weight: 291 | self._group_by_column = col["name"] # we save here this potential "best group-by" column 292 | current_lowest_weight = weight_of_most_frequent_values 293 | highest_first = frequency_most_popular_value 294 | 295 | if self._group_by_column is None: 296 | sys.exit("Error: we could not find a suitable column to do a Group By. Either relax the selection condition" 297 | " with the '--max-gb-percent' option or directly select the column with '--group-by-column' ") 298 | 299 | logging.info("Best column to do a GROUP BY is %s (occurrences of most frequent value: %i / the %i most frequent" 300 | "values sum up %i occurrences)", self._group_by_column, highest_first, 301 | self.tc.number_of_most_frequent_values_to_weight, current_lowest_weight) 302 | 303 | def filter_columns_from_cli(self, all_columns): 304 | """Filter the columns received from the table schema with the options given by the user, and save this result 305 | 306 | :type all_columns: list of dict 307 | :param all_columns: each column in the table is represented by an element in the list, with the following 308 | format: {"name": col_name, "type": col_type} 309 | """ 310 | if self.column_range == ":": 311 | if self.chosen_columns is not None: # user has declared the columns he wants to analyze 312 | leftover = list(self.chosen_columns) 313 | for col in all_columns: 314 | if col['name'] in leftover: 315 | leftover.remove(col['name']) 316 | self._ddl_columns.append(col) 317 | if len(leftover) > 0: 318 | sys.exit("Error: you asked to analyze the columns %s but we could not find them in the table %s" 319 | % (str(leftover), self.get_id_string())) 320 | else: 321 | self._ddl_columns = all_columns 322 | else: # user has requested a specific range of columns 323 | match = re.match(r'(\d*):(\d*)', self.column_range) 324 | if match is None: 325 | raise ValueError("The column range must follow the Python style '1:9'. 
You gave: %s", self.column_range) 326 | 327 | start = 0 328 | if len(match.group(1)) > 0: 329 | start = int(match.group(1)) 330 | 331 | end = len(all_columns) 332 | if len(match.group(2)) > 0: 333 | end = int(match.group(2)) 334 | 335 | self._ddl_columns = all_columns[start:end] 336 | logging.debug("The range of columns has been reduced to: %s", self._ddl_columns) 337 | 338 | # Even if the user requested just a specific range of columns, he still has the possibility to remove some 339 | # columns in that range: 340 | if self.ignore_columns is not None: 341 | all_columns = [] 342 | for col in self._ddl_columns: 343 | if not col['name'] in self.ignore_columns: 344 | all_columns.append(col) 345 | self._ddl_columns = list(all_columns) 346 | 347 | @abstractmethod 348 | def create_sql_groupby_count(self): 349 | """ Return a SQL query where we count the number of rows for each Group of hash() on the groupby_column 350 | 351 | The column found in get_groupby_column() is used to do a Group By and count the number of rows for each group. 352 | But in order to reduce the number of Groups, we hash the value of this column, and we take the modulo 353 | "number_of_group_by" of this hash value. 354 | 355 | :rtype: str 356 | :returns: SQL query to do the Group By Count 357 | """ 358 | pass 359 | 360 | @abstractmethod 361 | def create_sql_show_bucket_columns(self, extra_columns_str, buckets_values): 362 | """ Return a SQL query that shows the rows that match some specific buckets and some given columns 363 | 364 | :type extra_columns_str: str 365 | :param extra_columns_str: the list of extra columns (separated by ",") we want to show to help debugging 366 | 367 | :type buckets_values: str 368 | :param buckets_values: the list of values (separated by ",") of the buckets we want to fetch 369 | 370 | :rtype: str 371 | :returns: SQL query to do the Group By Buckets 372 | """ 373 | pass 374 | 375 | @abstractmethod 376 | def create_sql_intermediate_checksums(self): 377 | """Build and return the query that generates all the checksums to make the final comparison 378 | 379 | The query will have the following schema: 380 | 381 | WITH blocks AS ( 382 | SELECT MOD( hash2( column), 100000) as gb, sha1(concat( col0, col1, col2, col3, col4)) as block_0, 383 | sha1(concat( col5, col6, col7, col8, col9)) as block_1, ... as block_N FROM table 384 | ), 385 | full_lines AS ( 386 | SELECT gb, sha1(concat( block_0, |, block_1...) as row_sha, block_0, block_1 ... FROM blocks 387 | ) 388 | SELECT gb, sha1(concat(list)) as sline, sha1(concat(list)) as sblock_1, 389 | sha1(concat(list)) as sblock_2 ... as sblock_N FROM GROUP BY gb 390 | 391 | :rtype: str 392 | :returns: the SQL query with the Group By and the shas 393 | """ 394 | pass 395 | 396 | @abstractmethod 397 | def delete_temporary_table(self, table_name): 398 | """Drop the temporary table if needed (if it is not automatically deleted by the system) 399 | 400 | :type table_name: str 401 | :param table_name: name of the table to delete 402 | """ 403 | pass 404 | 405 | @abstractmethod 406 | def launch_query_dict_result(self, query, result_dic, all_columns_from_2=False): 407 | """Launch the SQL query and stores the results of the 1st and 2nd columns in the dictionary 408 | 409 | The 1st column of each row is stored as the key of the dictionary, the 2nd column is for the value. This method 410 | is meant to catch the result of a Group By query, so that we are sure that the keys fetched are unique. 
411 | 412 | :type query: str 413 | :param query: query to execute 414 | 415 | :type result_dic: dict 416 | :param result_dic: dictionary to store the result 417 | 418 | :type all_columns_from_2: bool 419 | :param all_columns_from_2: True if we want to fetch all the columns of the row, starting from the 2nd one. False 420 | if we want the value of the dictionary to only have the 1st column (take care: 421 | columns start counting with 0). 422 | (default: False) 423 | """ 424 | pass 425 | 426 | @abstractmethod 427 | def launch_query_csv_compare_result(self, query, rows): 428 | """Launch the SQL query and stores the rows in an array with some kind of CSV formatting 429 | 430 | The only reason for the "CSV formatting" (separation of columns with "|") is to help in comparing the rows with 431 | the difflib library. This is also the reason why all the lines start with "^" and end with "$" 432 | 433 | :type query: str 434 | :param query: query to execute 435 | 436 | :type rows: list of str 437 | :param rows: the (void) array that will store the rows 438 | """ 439 | pass 440 | 441 | @abstractmethod 442 | def launch_query_with_intermediate_table(self, query, result): 443 | """Launch the query, stores the results in a temporary table and put the first 2 columns in a dictionary 444 | 445 | This method is used to computes a lot of checksums and thus is a bit heavy to compute. This is why we store 446 | all those detailed results in a temporary table. Then present a summary of the returned rows by storing the 447 | first 2 columns in a dictionary under the ``result`` dictionary. 448 | 449 | :type query: str 450 | :param query: query to execute 451 | 452 | :type result: dict 453 | :param result: dictionary to store the result 454 | """ 455 | 456 | def get_sample_query(self): 457 | """ Build a SQL query to get some sample lines with limited amount of columns 458 | 459 | We limit the number of columns to a small number (ex: 10) because it is usually unnecessary to look at all 460 | the columns in order to find one with a good distribution (usually, the "index" will be in the first columns). 461 | What is more, in BigQuery we are billed by the number of columns we read so we need to avoid reading 462 | hundreds of columns just like what we have for big tables. 
463 | 464 | :rtype: tuple 465 | :returns: ``(query, selected_columns)``, where ``query`` is the sample SQL query; ``selected_columns`` is the 466 | list of columns that are fetched 467 | """ 468 | query = "SELECT" 469 | selected_columns = self.get_ddl_columns()[:self.tc.sample_column_number] 470 | for col in selected_columns: 471 | query += " %s," % col["name"] # for the last column we'll remove that trailing "," 472 | where_condition = "" 473 | if self.where_condition is not None: 474 | where_condition = "WHERE " + self.where_condition 475 | query = query[:-1] + " FROM %s %s LIMIT %i" % (self.full_name, where_condition, self.tc.sample_rows_number) 476 | return query, selected_columns 477 | 478 | def get_column_blocks(self, ddl): 479 | """Returns the list of a column blocks for a specific DDL (see function create_sql_intermediate_checksums) 480 | 481 | :type ddl: list of dict 482 | :param ddl: the ddl of the tables, containing dictionaries with keys (name, type) to describe each column 483 | 484 | :rtype: list of list 485 | :returns: list of each block, each one containing the (5) columns dictionary ({name, type}) that describe it 486 | """ 487 | column_blocks = [] 488 | for idx, col in enumerate(ddl): 489 | block_id = idx // self.tc.block_size 490 | if idx % self.tc.block_size == 0: 491 | column_blocks.append([]) 492 | column_blocks[block_id].append({"name": col["name"], "type": col["type"]}) 493 | return column_blocks 494 | 495 | 496 | class TableComparator(object): 497 | """Represent the general configuration of the program (tables names, number of rows to scan...) """ 498 | 499 | def __init__(self): 500 | self.tsrc = None 501 | self.tdst = None 502 | 503 | # 10 000 rows should be good enough to use as a sample 504 | # (Google uses some sample of 1000 or 3000 rows) 505 | # Estimation of memory for the Counters: 10000 * 10 * ( 10 * 4 + 4) * 4 = 16.8 MB 506 | # (based on estimation : rows * columns * ( size string + int) * overhead Counter ) 507 | self.sample_rows_number = 10000 508 | self.sample_column_number = 10 509 | self.max_percent_most_frequent_value_in_column = None 510 | self.number_of_most_frequent_values_to_weight = 50 511 | 512 | self.number_of_group_by = 100000 # 7999 is the limit if you want to manually download the data from BQ. This 513 | # limit does not apply in this script because we fetch the data with the Python API instead. 514 | # TODO ideally this limit would be dynamic (counting the total rows), in order to get an average of 7 515 | # lines per bucket 516 | self.skew_threshold = 40000 # if we detect that a Group By has more than this amount of rows 517 | # (compare_groupby_count() method), then we raise an exception, because the computation of the shas (which is 518 | # computationally expensive) might suffer a lot from this skew, and might also trigger some OOM exception. 519 | # 40 000 seems a safe number: 520 | # Let's consider a table with many column: 1000 columns. That means 201 sha blocks (1000 / 5 + 1) 521 | # a sha is 29 characters 522 | # so the max memory with a skew of 40 000 would be: 523 | # 201 * 40000 * 29 / 1024 /1024 = 222 MB, which should fit into the Heap of a task process 524 | self.block_size = 5 # 5 columns means that when we want to debug we have enough context. 
But it small enough to 525 | # avoid being charged too much by Google when querying on it 526 | reload(sys) 527 | # below method really exists (don't know why PyCharm cannot see it) and is really needed 528 | # noinspection PyUnresolvedReferences 529 | sys.setdefaultencoding('utf-8') 530 | 531 | def set_tsrc(self, table): 532 | """Set the source table to be compared 533 | 534 | :type table: :class:`_Table` 535 | :param table: the _Table object 536 | """ 537 | self.tsrc = table 538 | 539 | def set_tdst(self, table): 540 | """Set the destination table to be compared 541 | 542 | :type table: :class:`_Table` 543 | :param table: the _Table object 544 | """ 545 | self.tdst = table 546 | 547 | def set_skew_threshold(self, threshold): 548 | """Set the threshold value for the skew 549 | 550 | :type threshold: int 551 | :param threshold: the threshold value (default: 40 000) 552 | """ 553 | self.skew_threshold = threshold 554 | 555 | def set_max_percent_most_frequent_value_in_column(self, percent): 556 | """Set the max_percent_most_frequent_value_in_column value 557 | 558 | If in one sample a column has a value whose frequency is higher than this percentage, then this column is not 559 | considered as a suitable column to do the Group By 560 | 561 | :type percent: float 562 | :param percent: the percentage value 563 | """ 564 | self.max_percent_most_frequent_value_in_column = percent 565 | 566 | def compare_groupby_count(self): 567 | """Runs a light query on Hive and BigQuery to check if the counts match, using the ideal column estimated before 568 | 569 | Some skew detection is also performed here. And the program stops if we detect some important skew and no 570 | difference was detected (meaning that the sha computation would have been performed with that skew). 
571 | 572 | :rtype: tuple 573 | :returns: ``(summary_differences, big_small_bucket)``, where ``summary_differences`` is a list of tuples, 574 | one per difference containing (groupByValue, number of differences for this bucket, count of rows 575 | for this bucket for the "biggest table"); ``big_small_bucket`` is a tuple containing the table that 576 | has the biggest distribution (according to the Group By column) and then the other table 577 | """ 578 | logging.info("Executing the 'Group By' Count queries for %s (%s) and %s (%s) to do first comparison", 579 | self.tsrc.full_name, self.tsrc.get_type(), self.tdst.full_name, self.tdst.get_type()) 580 | src_query = self.tsrc.create_sql_groupby_count() 581 | dst_query = self.tdst.create_sql_groupby_count() 582 | 583 | result = {"src_count_dict": {}, "dst_count_dict": {}} 584 | t_src = threading.Thread(name='srcGroupBy-' + self.tsrc.get_type(), target=self.tsrc.launch_query_dict_result, 585 | args=(src_query, result["src_count_dict"])) 586 | t_dst = threading.Thread(name='dstGroupBy-' + self.tdst.get_type(), target=self.tdst.launch_query_dict_result, 587 | args=(dst_query, result["dst_count_dict"])) 588 | t_src.start() 589 | t_dst.start() 590 | t_src.join() 591 | t_dst.join() 592 | 593 | for k in result: 594 | if 'error' in result[k]: 595 | sys.exit(result[k]["error"]) 596 | 597 | # #### Let's compare the count between the 2 Group By queries 598 | # iterate on biggest dictionary so that we're sure to se a difference if there is one 599 | logging.debug("Searching differences in Group By") 600 | if len(result["src_count_dict"]) > len(result["dst_count_dict"]): 601 | big_dict = result["src_count_dict"] 602 | small_dict = result["dst_count_dict"] 603 | big_small_bucket = (self.tsrc, self.tdst) 604 | else: 605 | big_dict = result["dst_count_dict"] 606 | small_dict = result["src_count_dict"] 607 | big_small_bucket = (self.tdst, self.tsrc) 608 | 609 | differences = Counter() 610 | skew = Counter() 611 | for (k, v) in big_dict.iteritems(): 612 | if k not in small_dict: 613 | differences[k] = -v # we want to see the differences where we have less lines to compare 614 | elif v != small_dict[k]: 615 | differences[k] = - abs(v - small_dict[k]) 616 | # we check the skew even if some differences were found above and we will never enter the sha computation, 617 | # so that the developer can fix at early stage 618 | max_value = max(v, small_dict.get(k)) 619 | if max_value > self.skew_threshold: 620 | skew[k] = max_value 621 | summary_differences = [(k, -v, big_dict[k]) for (k, v) in differences.most_common()] 622 | if len(skew) > 0: 623 | logging.warning("Some important skew (threshold: %i) was detected in the Group By column %s. The top values" 624 | " are: %s", self.skew_threshold, self.tsrc.get_groupby_column(), str(skew.most_common(10))) 625 | if len(summary_differences) == 0: 626 | sys.exit("No difference in Group By count was detected but we saw some important skew that could make " 627 | "the next step (comparison of the shas) very slow or failing. So better stopping now. 
You " 628 | "should consider choosing another Group By column with the '--group-by-column' option") 629 | 630 | if len(summary_differences) != 0: 631 | logging.info("We found at least %i differences in Group By count", len(summary_differences)) 632 | logging.debug("Differences in Group By count are: %s", summary_differences[:300]) 633 | 634 | return summary_differences, big_small_bucket 635 | 636 | def show_results_count(self, summary_differences, big_small_bucket): 637 | """If any differences found in the Count Group By step, then show them in a webpage 638 | 639 | :type summary_differences: list of tuple 640 | :param summary_differences: list of all the differences where each difference is described as: (groupByValue, 641 | number of differences for this bucket, count of rows for this bucket for the "biggest table") 642 | 643 | :type big_small_bucket: tuple 644 | :param big_small_bucket: tuple containing the table that has the biggest distribution (according to the Group By 645 | column) and then the other table 646 | """ 647 | # TODO break/refactor a bit this function in the same way it has been done for the sha part 648 | 649 | # We want to return at most 6 blocks of lines corresponding to different group by values. For the sake of 650 | # brevity, each block should not show more than 70 lines. Blocks that show rows that appear in only on 1 table 651 | # should be limited to 3 (so that we can see "context" when debugging). To also give context, we will show some 652 | # few other columns. 653 | number_buckets_only_one_table = 0 654 | number_buckets_found = 0 655 | buckets_bigtable = [] 656 | buckets_smalltable = [] 657 | bigtable = big_small_bucket[0] 658 | smalltable = big_small_bucket[1] 659 | for (bucket, difference_num, biggest_num) in summary_differences: 660 | if difference_num > 70: 661 | break # since the Counter was ordered from small differences to biggest, we know that this difference 662 | # number can only increase. 
So let's go out of the loop 663 | if biggest_num > 70: 664 | continue 665 | if difference_num == biggest_num: 666 | if number_buckets_only_one_table == 3: 667 | continue 668 | else: 669 | number_buckets_only_one_table += 1 670 | number_buckets_found += 1 671 | if number_buckets_found == 6: 672 | break 673 | buckets_bigtable.append(bucket) 674 | else: 675 | buckets_bigtable.append(bucket) 676 | buckets_smalltable.append(bucket) 677 | number_buckets_found += 1 678 | if number_buckets_found == 6: 679 | break 680 | if len(buckets_bigtable) == 0: # let's ensure that we have at least 1 value to show 681 | (bucket, difference_num, biggest_num) = summary_differences[0] 682 | buckets_bigtable.append(bucket) 683 | if difference_num != biggest_num: 684 | buckets_smalltable.append(bucket) 685 | logging.debug("Buckets for %s: %s \t\tBuckets for %s: %s", bigtable.full_name, str(buckets_bigtable), 686 | smalltable.full_name, str(buckets_smalltable)) 687 | 688 | gb_column = self.tsrc.get_groupby_column() 689 | extra_columns = [x["name"] for x in self.tsrc.get_ddl_columns()[:6]] # add 5 extra columns to see some context 690 | if gb_column in extra_columns: 691 | extra_columns.remove(gb_column) 692 | elif len(extra_columns) == 6: 693 | extra_columns = extra_columns[:-1] # limit to 5 columns 694 | extra_columns_str = str(extra_columns)[1:-1].replace("'", "") 695 | bigtable_query = bigtable.create_sql_show_bucket_columns(extra_columns_str, str(buckets_bigtable)[1:-1]) 696 | 697 | result = {"big_rows": [], "small_rows": []} 698 | t_big = threading.Thread(name='bigShowCountDifferences-' + bigtable.get_type(), 699 | target=bigtable.launch_query_csv_compare_result, 700 | args=(bigtable_query, result["big_rows"])) 701 | t_big.start() 702 | 703 | if len(buckets_smalltable) > 0: # in case 0, then it means that the "smalltable" does not contain any of 704 | # the rows that appear in the "bigtable". In such case, there is no need to launch the query 705 | smalltable_query = smalltable.create_sql_show_bucket_columns(extra_columns_str, 706 | str(buckets_smalltable)[1:-1]) 707 | t_small = threading.Thread(name='smallShowCountDifferences-' + smalltable.get_type(), 708 | target=smalltable.launch_query_csv_compare_result, 709 | args=(smalltable_query, result["small_rows"])) 710 | t_small.start() 711 | t_small.join() 712 | t_big.join() 713 | 714 | sorted_file = {} 715 | for instance in ("big_rows", "small_rows"): 716 | result[instance].sort() 717 | sorted_file[instance] = "/tmp/count_diff_" + instance 718 | with open(sorted_file[instance], "w") as f: 719 | f.write("\n".join(result[instance])) 720 | 721 | column_description = "
hash(%s) , %s , %s" \ 722 | % (bigtable.get_groupby_column(), bigtable.get_groupby_column(), extra_columns_str) 723 | diff_string = difflib.HtmlDiff().make_file(result["big_rows"], result["small_rows"], bigtable.get_id_string() + 724 | column_description, smalltable.get_id_string() + column_description, 725 | context=False, numlines=30) 726 | html_file = "/tmp/count_diff.html" 727 | with open(html_file, "w") as f: 728 | f.write(diff_string) 729 | logging.debug("Sorted results of the queries are in the files %s and %s. HTML differences are in %s", 730 | sorted_file["big_rows"], sorted_file["small_rows"], html_file) 731 | webbrowser.open("file://" + html_file, new=2) 732 | 733 | def compare_shas(self): 734 | """Runs the final queries on Hive and BigQuery to check if the checksum match and return the list of differences 735 | 736 | :rtype: tuple 737 | :returns: ``(list_differences, names_sha_tables, tables_to_clean)``, where ``list_differences`` is the list of 738 | Group By values which presents different row checksums; ``names_sha_tables`` is a dictionary that 739 | contains the names of the "temporary" result tables; ``tables_to_clean`` is a dictionary of the 740 | temporary tables we will want to remove at the end of the process 741 | """ 742 | logging.info("Executing the 'shas' queries for %s and %s to do final comparison", 743 | self.tsrc.get_id_string(), self.tdst.get_id_string()) 744 | 745 | tsrc_query = self.tsrc.create_sql_intermediate_checksums() 746 | tdst_query = self.tdst.create_sql_intermediate_checksums() 747 | 748 | # "cleaning" is for all the tables that will need to be eventually deleted. It must contain tuples (, corresponding _Table object). "names_sha_tables" contains all the temporary tables generated 750 | # even the BigQuery cached table that does not need to be deleted. "sha_dictionaries" contains the results. 751 | result = {"cleaning": [], "names_sha_tables": {}, "sha_dictionaries": { 752 | self.tsrc.get_id_string(): {}, 753 | self.tdst.get_id_string(): {} 754 | }} 755 | t_src = threading.Thread(name='shaBy-' + self.tsrc.get_id_string(), 756 | target=self.tsrc.launch_query_with_intermediate_table, 757 | args=(tsrc_query, result)) 758 | t_dst = threading.Thread(name='shaBy-' + self.tdst.get_id_string(), 759 | target=self.tdst.launch_query_with_intermediate_table, 760 | args=(tdst_query, result)) 761 | t_src.start() 762 | t_dst.start() 763 | t_src.join() 764 | t_dst.join() 765 | 766 | if "error" in result: 767 | for table_name, table_object in result["cleaning"]: 768 | table_object.delete_temporary_table(table_name) 769 | sys.exit(result["error"]) 770 | 771 | # Comparing the results of those dictionaries 772 | logging.debug("Searching differences in Shas") 773 | src_num_gb = len(result["sha_dictionaries"][self.tsrc.get_id_string()]) 774 | dst_num_gb = len(result["sha_dictionaries"][self.tdst.get_id_string()]) 775 | if not src_num_gb == dst_num_gb: 776 | sys.exit("The number of Group By values is not the same when doing the final sha queries (%s: %i - " 777 | "%s: %i).\nMake sure to first execute the 'count' verification step!" 778 | % (self.tsrc.get_id_string(), src_num_gb, self.tdst.get_id_string(), dst_num_gb)) 779 | 780 | list_differences = [] 781 | for (k, v) in result["sha_dictionaries"][self.tdst.get_id_string()].iteritems(): 782 | if k not in result["sha_dictionaries"][self.tsrc.get_id_string()]: 783 | sys.exit("The Group By value %s appears in %s but not in %s.\nMake sure to first execute the " 784 | "'count' verification step!" 
% (k, self.tdst.get_id_string(), self.tsrc.get_id_string())) 785 | elif v != result["sha_dictionaries"][self.tsrc.get_id_string()][k]: 786 | list_differences.append(k) 787 | 788 | if len(list_differences) != 0: 789 | logging.info("We found %i differences in sha verification", len(list_differences)) 790 | logging.debug("Differences in sha are: %s", list_differences[:300]) 791 | 792 | return list_differences, result["names_sha_tables"], result["cleaning"] 793 | 794 | def get_column_blocks_most_differences(self, differences, temp_tables): 795 | """Return the information of which columns contain most differences 796 | 797 | From the compare_shas step, we know all the rowBuckets that present some differences. The goal of this 798 | function is to identify which columnsBuckets have those differences. 799 | 800 | :type differences: list of str 801 | :param differences: the list of Group By values which present different row checksums 802 | 803 | :type temp_tables: dict 804 | :param temp_tables: contains the names of the temporary tables ["src_table", "dst_table"] 805 | 806 | :rtype: tuple 807 | :returns: ``(column_blocks_most_differences, map_colblocks_bucketrows)``, where 808 | ``column_blocks_most_differences`` is the Counter of the column blocks with most differences; 809 | ``map_colblocks_bucketrows`` is a list that "maps" a column block with the list of the hash of the 810 | bucket rows that contain a difference 811 | 812 | :raises: IOError if the query has some execution errors 813 | """ 814 | subset_differences = str(differences[:10000])[1:-1] # let's choose quite a big number (instead of just looking 815 | # at some few (5 for instance) differences for 2 reasons: 1) by fetching more rows we will find estimate 816 | # better which column blocks fail often 2) we have less possibilities to face some 'permutations' problems 817 | logging.debug("The sha differences that we consider are: %s", str(subset_differences)) 818 | 819 | src_query = "SELECT * FROM %s WHERE gb IN (%s)" % (temp_tables[self.tsrc.get_id_string()], subset_differences) 820 | dst_query = "SELECT * FROM %s WHERE gb IN (%s)" % (temp_tables[self.tdst.get_id_string()], subset_differences) 821 | logging.debug("queries to find differences in bucket_blocks are: \n%s\n%s", src_query, dst_query) 822 | 823 | src_sha_lines = {} # key=gb, values=list of shas from the blocks (not the one of the whole line) 824 | dst_sha_lines = {} 825 | t_src = threading.Thread(name='srcFetchShaDifferences', target=self.tsrc.launch_query_dict_result, 826 | args=(src_query, src_sha_lines, True)) 827 | t_dst = threading.Thread(name='dstFetchShaDifferences', target=self.tdst.launch_query_dict_result, 828 | args=(dst_query, dst_sha_lines, True)) 829 | t_src.start() 830 | t_dst.start() 831 | t_src.join() 832 | t_dst.join() 833 | 834 | # We want to find the column blocks that present most of the differences, and the bucket_rows associated to it 835 | column_blocks_most_differences = Counter() 836 | column_blocks = self.tsrc.get_column_blocks(self.tsrc.get_ddl_columns()) 837 | # noinspection PyUnusedLocal 838 | map_colblocks_bucketrows = [[] for x in range(len(column_blocks))] 839 | for bucket_row, dst_blocks in dst_sha_lines.iteritems(): 840 | src_blocks = src_sha_lines[bucket_row] 841 | for idx, sha in enumerate(dst_blocks): 842 | if sha != src_blocks[idx]: 843 | column_blocks_most_differences[idx] += 1 844 | map_colblocks_bucketrows[idx].append(bucket_row) 845 | logging.debug("Block columns with most differences are: %s. 
Which correspond to those bucket rows: %s", 846 | column_blocks_most_differences, map_colblocks_bucketrows) 847 | 848 | # collisions could happen for instance with those 2 "rows" (1st column is the Group BY value, 2nd column is the 849 | # value of a column in the data, 3rd column is the value of another column which belongs to another 'block 850 | # column' than the 2nd one): 851 | # ## SRC table: 852 | # bucket1 1 A 853 | # bucket1 0 B 854 | # ## DST table: 855 | # bucket1 0 A 856 | # bucket1 1 B 857 | # In such case, the 'grouped sha of each column' will be always the same. But the sha-lines will be different 858 | # That leads to a bad situation where the program says that there are some differences but it is not possible 859 | # to show them on the the web page (which is quite confusing for the user running the program). 860 | if len(column_blocks_most_differences) == 0: 861 | raise RuntimeError("Program faced some collisions when trying to assess which blocks of columns were not" 862 | "correct. Please contact the developer to ask for a fix") 863 | 864 | return column_blocks_most_differences, map_colblocks_bucketrows 865 | 866 | def get_sql_final_differences(self, column_blocks_most_differences, map_colblocks_bucketrows, index): 867 | """Return the queries to get the real data for the differences found in the last compare_shas() step 868 | 869 | Get the definition of the [index]th column block with most differences, and generate the SQL queries that will 870 | be triggered to show to the developer those differences. 871 | 872 | :type column_blocks_most_differences: :class:`Counter` 873 | :param column_blocks_most_differences: the Counter of the column blocks with most differences. The keys of this 874 | Counter is the index of the column block 875 | 876 | :type map_colblocks_bucketrows: list of list 877 | :param map_colblocks_bucketrows: list that "maps" a column block (the element N in the list corresponds to the 878 | column block N) with the list of the hash of the bucket rows that contain a difference. 879 | 880 | :type index: int 881 | :param index: the position of the block column we want to get 882 | 883 | :rtype: tuple of str 884 | :returns: ``(hive_final_sql, bq_final_sql, list_column_to_check)``, the queries to be executed to do the final 885 | debugging, and the name of the columns that are fetched. 
886 | """ 887 | column_block_most_different = column_blocks_most_differences.most_common(index)[index - 1][0] 888 | column_blocks = self.tsrc.get_column_blocks(self.tsrc.get_ddl_columns()) 889 | # buckets otherwise we might want to take a second block 890 | list_column_to_check = " ,".join([x["name"] for x in column_blocks[column_block_most_different]]) 891 | # let's display just 10 buckets in error max 892 | list_hashs = " ,".join(map(str, map_colblocks_bucketrows[column_block_most_different][:10])) 893 | 894 | src_final_sql = self.tsrc.create_sql_show_bucket_columns(list_column_to_check, list_hashs) 895 | dst_final_sql = self.tdst.create_sql_show_bucket_columns(list_column_to_check, list_hashs) 896 | logging.debug("Final source query is: %s \nFinal dest query is: %s", src_final_sql, dst_final_sql) 897 | 898 | return src_final_sql, dst_final_sql, list_column_to_check 899 | 900 | @staticmethod 901 | def display_html_diff(result, file_name, col_description): 902 | """Show the difference of the analysis in a graphical webpage 903 | 904 | :type result: dict 905 | :param result: dictionary that contains the hashMaps of some of the rows with differences between the 2 tables 906 | 907 | :type file_name: str 908 | :param file_name: prefix of the path where will be written the temporary files 909 | 910 | :type col_description: str 911 | :param col_description: "," separated list of the 5 extra columns from the column block we show in the diff 912 | """ 913 | sorted_file = {} 914 | keys = result.keys() 915 | for instance in keys: 916 | result[instance].sort() 917 | sorted_file[instance] = file_name + "_" + instance 918 | with open(sorted_file[instance], "w") as f: 919 | f.write("\n".join(result[instance])) 920 | 921 | diff_html = difflib.HtmlDiff().make_file(result[keys[0]], result[keys[1]], keys[0] + col_description, keys[1] 922 | + col_description, context=True, numlines=15) 923 | html_file = file_name + ".html" 924 | with open(html_file, "w") as f: 925 | f.write(diff_html) 926 | logging.debug("Sorted results of the queries are in the files %s and %s. HTML differences are in %s", 927 | sorted_file[keys[0]], sorted_file[keys[1]], html_file) 928 | webbrowser.open("file://" + html_file, new=2) 929 | 930 | def show_results_final_differences(self, src_sql, dst_sql, list_extra_columns): 931 | """If any differences found in the shas analysis step, then show them in a webpage 932 | 933 | :type src_sql: str 934 | :param src_sql: the query of the source table to launch to see the rows that are different 935 | 936 | :type dst_sql: str 937 | :param dst_sql: the query of the destination table to launch to see the rows that are different 938 | 939 | :type list_extra_columns: str 940 | :param list_extra_columns: list (',' separated) of the extra columns that will be shown in the differences. 941 | This parameter is only used to show the description in the web page. 942 | """ 943 | src_id = self.tsrc.get_id_string() 944 | dst_id = self.tdst.get_id_string() 945 | result = {src_id: [], dst_id: []} 946 | t_src = threading.Thread(name='srcShowShaFinalDifferences', target=self.tsrc.launch_query_csv_compare_result, 947 | args=(src_sql, result[src_id])) 948 | t_dst = threading.Thread(name='dstShowShaFinalDifferences', target=self.tdst.launch_query_csv_compare_result, 949 | args=(dst_sql, result[dst_id])) 950 | t_src.start() 951 | t_dst.start() 952 | t_src.join() 953 | t_dst.join() 954 | 955 | col_description = "
956 |                           % (self.tsrc.get_groupby_column(), self.tsrc.get_groupby_column(), list_extra_columns)
957 | 
958 |         self.display_html_diff(result, "/tmp/sha_diff", col_description)
959 | 
960 |         return False  # no need to execute the script further since errors have already been spotted
961 | 
962 |     def synchronise_tables(self):
963 |         """Ensure that some specific properties between the 2 tables have the same values, like the Group By column"""
964 |         self.tdst._ddl_columns = self.tsrc.get_ddl_columns()
965 |         # a check DDL comparison
966 |         self.tdst._group_by_column = self.tsrc.get_groupby_column()  # the Group By must use the same column for both
967 |         # tables
968 | 
969 |     def perform_step_count(self):
970 |         """Execute the Count comparison of the 2 tables
971 | 
972 |         :rtype: bool
973 |         :returns: True if we haven't found differences yet and further analysis is needed
974 |         """
975 |         self.synchronise_tables()
976 |         diff, big_small = self.compare_groupby_count()
977 | 
978 |         if len(diff) == 0:
979 |             print("No differences were found when doing a Count on the tables %s and %s and grouping by the "
980 |                   "column %s" % (self.tsrc.full_name, self.tdst.full_name, self.tsrc.get_groupby_column()))
981 |             return True  # means that we should continue executing the script
982 | 
983 |         self.show_results_count(diff, big_small)
984 |         return False  # no need to execute the script further since errors have already been spotted
985 | 
986 |     @staticmethod
987 |     def clean_step_sha(tables_to_clean):
988 |         """Delete the temporary tables if needed
989 | 
990 |         :type tables_to_clean: dict
991 |         :param tables_to_clean: maps the names of the tables that need to be deleted to their table objects
992 |         """
993 |         for table_name, table_object in tables_to_clean.items():
994 |             table_object.delete_temporary_table(table_name)
995 | 
996 |     def perform_step_sha(self):
997 |         """Execute the Sha comparison of the 2 tables"""
998 |         self.synchronise_tables()
999 |         sha_differences, temporary_tables, tables_to_clean = self.compare_shas()
1000 |         if len(sha_differences) == 0:
1001 |             print("Sha queries were done and no differences were found: the tables %s and %s are equal!"
1002 |                   % (self.tsrc.get_id_string(), self.tdst.get_id_string()))
1003 |             TableComparator.clean_step_sha(tables_to_clean)
1004 |             sys.exit(0)
1005 | 
1006 |         cb_most_diff, map_cb_bucketrows = self.get_column_blocks_most_differences(sha_differences, temporary_tables)
1007 | 
1008 |         for idx_cb in range(1, len(cb_most_diff) + 1):
1009 |             if idx_cb > 1:
1010 |                 answer = raw_input('Do you want to see more differences? [Y/n]: ')
1011 |                 # Yes being the default, we only exit if the user explicitly enters 'n'
1012 |                 if answer == 'n':
1013 |                     break
1014 | 
1015 |             queries = self.get_sql_final_differences(cb_most_diff, map_cb_bucketrows, idx_cb)
1016 |             print("Showing differences for columns " + queries[2])
1017 |             self.show_results_final_differences(queries[0], queries[1], queries[2])
1018 | 
1019 |         TableComparator.clean_step_sha(tables_to_clean)
1020 |         sys.exit(1)
1021 | 
1022 | 
1023 | def parse_arguments():
1024 |     """Parse the arguments received on the command line and return the args element of argparse
1025 | 
1026 |     :rtype: namespace
1027 |     :returns: The object that contains all the configuration of the command line
1028 |     """
1029 |     parser = argparse.ArgumentParser(description="Compare table <source> with table <destination>",
1030 |                                      formatter_class=argparse.RawTextHelpFormatter)
1031 |     parser.add_argument("source", help="the original (correct version) table\n"
1032 |                                        "It must have the following format: <type>/<database>.<table>\n"
\n" 1033 | " can be: bq or hive\n ") 1034 | parser.add_argument("destination", help="the destination table that needs to be compared\n" 1035 | "Format follows the one for the source table") 1036 | 1037 | parser.add_argument("-s", "--source-options", help="options for the source table\nFor Hive that could be: {'jar': " 1038 | "'hdfs://hdp/user/sluangsay/lib/hcbq.jar', 'hs2': " 1039 | "'master-003.bol.net'}\nExample for BigQuery: {'project': " 1040 | "'myGoogleCloudProject'}") 1041 | parser.add_argument("-d", "--destination-options", help="options for the destination table") 1042 | 1043 | parser.add_argument("--source-where", help="the WHERE condition we want to apply for the source table\n" 1044 | "Could be useful in case of partitioned tables\n" 1045 | "Example: \"datedir='2017-05-01'\"") 1046 | parser.add_argument("--destination-where", help="the WHERE condition we want to apply for the destination table") 1047 | 1048 | parser.add_argument("--max-gb-percent", type=float, default=1.0, 1049 | help="if in one sample a column has a value whose frequency is higher than this percentage, " 1050 | "then this column is discarded (default: 1.0)") 1051 | 1052 | parser.add_argument("--skew-threshold", type=int, 1053 | help="increase the threshold (default: 40 000) if you have some skew but if the amount of" 1054 | "columns is reduced so that you feel confident that all the shas will fit into memory") 1055 | 1056 | group_columns = parser.add_mutually_exclusive_group() 1057 | group_columns.add_argument("--column-range", default=":", 1058 | help="Instead of checking all the columns, you can define a range of columns to check\n" 1059 | "This works as a Python array-range. Meaning that if you want to only analyze the " 1060 | "first 20 columns, you need to give:\n :20") 1061 | group_columns.add_argument("--columns", 1062 | help="Instead of checking all the columns, you can give here the list of the columns you" 1063 | " want to check. Example: 'column1,column14,column23'") 1064 | 1065 | parser.add_argument("--ignore-columns", 1066 | help="the columns in argument will be ignored from analysis. Example: 'column1,column14," 1067 | "column23'") # not in the group_columns, because we want to be able to ask for a range 1068 | # of columns but removing some few at the same time 1069 | 1070 | parser.add_argument("--decodeCP1252-columns", 1071 | help="try to transform the CP1252 encoding from the (string) columns in argument into " 1072 | "Google's UTF8 encoding (Experimental). Example: 'column1,column14,column23'") 1073 | 1074 | parser.add_argument("--group-by-column", 1075 | help="the column in argument is enforced to be the Group By column. 
1076 |                              "query does not manage to find a good Group By column and we need to avoid some skew")
1077 | 
1078 |     group_step = parser.add_mutually_exclusive_group()
1079 |     group_step.add_argument("--just-count", help="only perform the Count check", action="store_true")
1080 |     group_step.add_argument("--just-sha", help="only perform the final sha check", action="store_true")
1081 | 
1082 |     group_log = parser.add_mutually_exclusive_group()
1083 |     group_log.add_argument("-v", "--verbose", help="show debug information", action="store_true")
1084 |     group_log.add_argument("-q", "--quiet", help="only show important information", action="store_true")
1085 | 
1086 |     return parser.parse_args()
1087 | 
1088 | 
1089 | def create_table_from_args(definition, options, where, args, tc):
1090 |     """Create and return a _Table object based on some arguments that were received on the command line
1091 | 
1092 |     :type definition: str
1093 |     :param definition: basic definition of the table, in the format: <type>/<database>.<table>
1094 | 1095 | :type options: str 1096 | :param options: JSON representation of options for this table. Ex: {'project': 'myGoogleCloudProject'} 1097 | 1098 | :type where: str 1099 | :param where: SQL filter that will be put in the WHERE clause, to limit the scope of the table analysed. 1100 | 1101 | :type args: :class:`ArgumentParser` 1102 | :param args: object containing all the arguments from the command line 1103 | 1104 | :type tc: :class:`TableComparator` 1105 | :param tc: current TableComparator instance, so that the table has a pointer to general configuration 1106 | 1107 | :rtype: :class:`_Table` 1108 | :returns: The _Table object, fully configured 1109 | """ 1110 | table = _Table.create_table_from_string(definition, options, tc) 1111 | table.set_where_condition(where) 1112 | table.set_column_range(args.column_range) 1113 | if args.columns is not None: 1114 | table.set_chosen_columns(args.columns) 1115 | if args.ignore_columns is not None: 1116 | table.set_ignore_columns(args.ignore_columns) 1117 | if args.decodeCP1252_columns is not None: 1118 | table.set_decode_cp1252_columns(args.decodeCP1252_columns) 1119 | table.set_group_by_column(args.group_by_column) # if not defined, then it's None and we'll compute it later 1120 | 1121 | return table 1122 | 1123 | 1124 | def main(): 1125 | args = parse_arguments() 1126 | 1127 | level_logging = logging.INFO 1128 | if args.verbose: 1129 | level_logging = logging.DEBUG 1130 | elif args.quiet: 1131 | level_logging = logging.WARNING 1132 | logging.basicConfig(level=level_logging, format='[%(levelname)s]\t[%(asctime)s] (%(threadName)-10s) %(message)s', ) 1133 | 1134 | logging.debug("Starting comparison program with arguments: %s", args) 1135 | 1136 | # Create the TableComparator that contains the definition of the 2 tables we want to compare 1137 | tc = TableComparator() 1138 | tc.set_max_percent_most_frequent_value_in_column(args.max_gb_percent) 1139 | source_table = create_table_from_args(args.source, args.source_options, args.source_where, args, tc) 1140 | destination_table = create_table_from_args(args.destination, args.destination_options, args.destination_where, 1141 | args, tc) 1142 | if args.skew_threshold is not None: 1143 | tc.set_skew_threshold(args.skew_threshold) 1144 | tc.set_tsrc(source_table) 1145 | tc.set_tdst(destination_table) 1146 | 1147 | # Step: count 1148 | if not args.just_sha: 1149 | do_we_continue = tc.perform_step_count() 1150 | if not do_we_continue: 1151 | sys.exit(1) 1152 | 1153 | # Step: sha 1154 | if not args.just_count: 1155 | tc.perform_step_sha() 1156 | 1157 | if __name__ == "__main__": 1158 | main() 1159 | --------------------------------------------------------------------------------