├── hive_compared_bq ├── __init__.py ├── hive.py ├── bq.py └── hive_compared_bq.py ├── .gitignore ├── udf ├── hcbq.jar └── README ├── docs └── images │ ├── differences_sha.png │ └── differences_count.png ├── src └── main │ └── java │ └── org │ └── apache │ └── hadoop │ └── hive │ └── ql │ └── udf │ └── generic │ └── GenericUDFDecodeCP1252.java ├── LICENSE └── README.md /hive_compared_bq/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | launch.sh 3 | *.swp 4 | *.pyc 5 | -------------------------------------------------------------------------------- /udf/hcbq.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bolcom/hive_compared_bq/HEAD/udf/hcbq.jar -------------------------------------------------------------------------------- /docs/images/differences_sha.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bolcom/hive_compared_bq/HEAD/docs/images/differences_sha.png -------------------------------------------------------------------------------- /docs/images/differences_count.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bolcom/hive_compared_bq/HEAD/docs/images/differences_count.png -------------------------------------------------------------------------------- /udf/README: -------------------------------------------------------------------------------- 1 | 2 | This jar contains 2 UDFs: 3 | - UDFSha1, which is the unmodified sha1 UDF that appeared with Hive 1.3.0 4 | - GenericUDFDecodeCP1252, whose source code is in src/main/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFDecodeCP1252.java 5 | 6 | -------------------------------------------------------------------------------- /src/main/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFDecodeCP1252.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * Copyright 2017 bol.com. All Rights Reserved 4 | * 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 
17 | */ 18 | 19 | package org.apache.hadoop.hive.ql.udf.generic; 20 | 21 | import org.apache.hadoop.hive.ql.exec.Description; 22 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException; 23 | import org.apache.hadoop.hive.ql.metadata.HiveException; 24 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; 25 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter; 26 | import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; 27 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; 28 | import org.apache.hadoop.io.Text; 29 | 30 | import java.nio.ByteBuffer; 31 | import java.nio.charset.Charset; 32 | 33 | import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils.PrimitiveGrouping.STRING_GROUP; 34 | 35 | /** 36 | * UDFDecodeCP1252. 37 | */ 38 | @Description( 39 | name = "UDFDecodeCP1252", 40 | value = "_FUNC_(str) - Returns str, which is the same string but in a Google's UTF8 encoding instead of the " 41 | + "initial CP1252 encoding") 42 | public class GenericUDFDecodeCP1252 extends GenericUDF { 43 | private transient PrimitiveCategory[] inputTypes = new PrimitiveCategory[1]; 44 | private transient Converter[] converters = new Converter[1]; 45 | private final Text output = new Text(); 46 | private final static Charset CP1252 = Charset.forName( "CP1252"); 47 | 48 | @Override 49 | public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { 50 | checkArgsSize(arguments, 1, 1); 51 | 52 | checkArgPrimitive(arguments, 0); 53 | 54 | checkArgGroups(arguments, 0, inputTypes, STRING_GROUP); 55 | 56 | obtainStringConverter(arguments, 0, inputTypes, converters); 57 | 58 | ObjectInspector outputOI = PrimitiveObjectInspectorFactory.writableStringObjectInspector; 59 | return outputOI; 60 | } 61 | 62 | @Override 63 | public Object evaluate(DeferredObject[] arguments) throws HiveException { 64 | final String stringToDecode = getStringValue(arguments, 0, converters); 65 | if (stringToDecode == null) { 66 | return null; 67 | } 68 | 69 | final char[] charactersToDecode = stringToDecode.toCharArray(); 70 | final byte[] bytesArray = new byte[1]; 71 | StringBuilder utf8_text = new StringBuilder(charactersToDecode.length); 72 | for (int i = 0; i < charactersToDecode.length; i++){ 73 | char character = charactersToDecode[i]; 74 | 75 | if( Character.isISOControl( character)){ // encoding problems happened only with control characters 76 | int asciiCode = (int)character; 77 | if( asciiCode >= 1 && asciiCode <= 9 ){ 78 | utf8_text.append(" "); // those control characters are transformed to spaces by BigQuery 79 | } 80 | else { 81 | int codePoint = Character.codePointAt( charactersToDecode,i); 82 | // First of all, let's handle the control characters that are transformed to space by BigQuery 83 | // (about those characters see: http://www.fileformat.info/info/unicode/char/0081/index.htm, 84 | // and http://www.ltg.ed.ac.uk/~richard/utf-8.cgi?input=129&mode=decimal 85 | // TODO add missing characters (so far I have just seen those 3) 86 | if( codePoint == 129 || codePoint == 144 || codePoint == 157 ){ 87 | utf8_text.append(" "); 88 | } 89 | else { 90 | // this part tries to solve the cases like: 91 | // C299 (hex) CP1252 character being transformed by BigQuery into E284A2 (hex) in UTF8 92 | 93 | // first we need to get rid of the control character (in our example, we get rid of 'C2' and 94 | // we just keep 
'99') 95 | String last_2_hexa_bytes = String.format("%02x", (int) character); 96 | // gets the byte array from above "hexa string representation" 97 | // (solution extracted from https://stackoverflow.com/a/140861/4064443) 98 | bytesArray[0] = (byte) ((Character.digit(last_2_hexa_bytes.charAt(0), 16) << 4) 99 | | Character.digit(last_2_hexa_bytes.charAt(1), 16)); 100 | ByteBuffer wrappedBytes = ByteBuffer.wrap(bytesArray); 101 | 102 | // now we can properly decode from CP1252 charset 103 | String decodedString = CP1252.decode(wrappedBytes).toString(); 104 | utf8_text.append(decodedString); 105 | } 106 | } 107 | } 108 | else{ // "standard" characters are OK. We just copied them "as is" 109 | utf8_text.append(character); 110 | } 111 | } 112 | output.set(utf8_text.toString()); 113 | return output; 114 | } 115 | 116 | @Override 117 | public String getDisplayString(String[] children) { 118 | return getStandardDisplayString(getFuncName(), children); 119 | } 120 | 121 | @Override 122 | protected String getFuncName() { 123 | return "decodeCP1252"; 124 | } 125 | } 126 | -------------------------------------------------------------------------------- /hive_compared_bq/hive.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Copyright 2017 bol.com. All Rights Reserved 4 | 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 
17 | """ 18 | 19 | import logging 20 | import sys 21 | import time 22 | # noinspection PyProtectedMember 23 | from hive_compared_bq import _Table 24 | import pyhs2 # TODO switch to another module since this one is deprecated and does not support Python 3 25 | # see notes in : https://github.com/BradRuderman/pyhs2 26 | 27 | 28 | class THive(_Table): 29 | """Hive implementation of the _Table object""" 30 | 31 | def __init__(self, database, table, parent, hs2_server, jar_path): 32 | _Table.__init__(self, database, table, parent) 33 | self.server = hs2_server 34 | self.connection = self._create_connection() 35 | self.jarPath = jar_path 36 | 37 | def get_type(self): 38 | return "hive" 39 | 40 | def _create_connection(self): 41 | """Connect to the table and return the connection object that we will use to launch queries""" 42 | return pyhs2.connect(host=self.server, port=10000, authMechanism="KERBEROS", database=self.database) 43 | 44 | def get_ddl_columns(self): 45 | if len(self._ddl_columns) > 0: 46 | return self._ddl_columns 47 | 48 | is_col_def = True 49 | cur = self.connection.cursor() 50 | cur.execute("describe " + self.full_name) 51 | all_columns = [] 52 | while cur.hasMoreRows: 53 | row = cur.fetchone() 54 | if row is None: 55 | continue 56 | col_name = row[0] 57 | col_type = row[1] 58 | 59 | if col_name == "" or col_name == "None": 60 | continue 61 | if col_name.startswith('#'): 62 | if "Partition Information" in col_name: 63 | is_col_def = False 64 | continue 65 | 66 | my_dic = {"name": col_name, "type": col_type} 67 | if is_col_def: 68 | all_columns.append(my_dic) 69 | else: 70 | self._ddl_partitions.append(my_dic) 71 | cur.close() 72 | 73 | self.filter_columns_from_cli(all_columns) 74 | 75 | return self._ddl_columns 76 | 77 | def get_column_statistics(self, query, selected_columns): 78 | cur = self.connection.cursor() 79 | cur.execute(query) 80 | while cur.hasMoreRows: 81 | fetched = cur.fetchone() 82 | if fetched is not None: 83 | for idx, col in enumerate(selected_columns): 84 | value_column = fetched[idx] 85 | col["Counter"][value_column] += 1 # TODO what happens with NULL? 
86 | cur.close() 87 | 88 | def create_sql_groupby_count(self): 89 | where_condition = "" 90 | if self.where_condition is not None: 91 | where_condition = "WHERE " + self.where_condition 92 | query = "SELECT hash( cast( %s as STRING)) %% %i AS gb, count(*) AS count FROM %s %s GROUP BY " \ 93 | "hash( cast( %s as STRING)) %% %i" \ 94 | % (self.get_groupby_column(), self.tc.number_of_group_by, self.full_name, where_condition, 95 | self.get_groupby_column(), self.tc.number_of_group_by) 96 | logging.debug("Hive query is: %s", query) 97 | 98 | return query 99 | 100 | def create_sql_show_bucket_columns(self, extra_columns_str, buckets_values): 101 | gb_column = self.get_groupby_column() 102 | where_condition = "" 103 | if self.where_condition is not None: 104 | where_condition = self.where_condition + " AND" 105 | hive_query = "SELECT hash( cast( %s as STRING)) %% %i as bucket, %s, %s FROM %s WHERE %s " \ 106 | "hash( cast( %s as STRING)) %% %i IN (%s)" \ 107 | % (gb_column, self.tc.number_of_group_by, gb_column, extra_columns_str, self.full_name, 108 | where_condition, gb_column, self.tc.number_of_group_by, buckets_values) 109 | logging.debug("Hive query to show the buckets and the extra columns is: %s", hive_query) 110 | 111 | return hive_query 112 | 113 | def create_sql_intermediate_checksums(self): 114 | column_blocks = self.get_column_blocks(self.get_ddl_columns()) 115 | number_of_blocks = len(column_blocks) 116 | logging.debug("%i column_blocks (with a size of %i columns) have been considered: %s", number_of_blocks, 117 | self.tc.block_size, str(column_blocks)) 118 | 119 | # Generate the concatenations for the column_blocks 120 | hive_basic_shas = "" 121 | for idx, block in enumerate(column_blocks): 122 | hive_basic_shas += "base64( unhex( SHA1( concat( " 123 | for col in block: 124 | name = col["name"] 125 | hive_value_name = name 126 | if col["type"] == 'date': 127 | hive_value_name = "cast( %s as STRING)" % name 128 | elif col["type"] == 'float' or col["type"] == 'double': 129 | hive_value_name = "cast( floor( %s * 10000 ) as bigint)" % name 130 | elif col["type"] == 'string' and name in self.decodeCP1252_columns: 131 | hive_value_name = "DecodeCP1252( %s)" % name 132 | hive_basic_shas += "CASE WHEN %s IS NULL THEN 'n_%s' ELSE %s END, '|'," % (name, name[:2], 133 | hive_value_name) 134 | hive_basic_shas = hive_basic_shas[:-6] + ")))) as block_%i,\n" % idx 135 | hive_basic_shas = hive_basic_shas[:-2] 136 | 137 | where_condition = "" 138 | if self.where_condition is not None: 139 | where_condition = "WHERE " + self.where_condition 140 | 141 | hive_query = "WITH blocks AS (\nSELECT hash( cast( %s as STRING)) %% %i as gb,\n%s\nFROM %s %s\n),\n" \ 142 | % (self.get_groupby_column(), self.tc.number_of_group_by, hive_basic_shas, self.full_name, 143 | where_condition) # 1st CTE with the basic block shas 144 | list_blocks = ", ".join(["block_%i" % i for i in range(number_of_blocks)]) 145 | hive_query += "full_lines AS(\nSELECT gb, base64( unhex( SHA1( concat( %s)))) as row_sha, %s FROM blocks\n)\n" \ 146 | % (list_blocks, list_blocks) # 2nd CTE to get all the info of a row 147 | hive_list_shas = ", ".join(["base64( unhex( SHA1( concat_ws( '|', sort_array( collect_list( block_%i)))))) as " 148 | "block_%i_gb " % (i, i) for i in range(number_of_blocks)]) 149 | hive_query += "SELECT gb, base64( unhex( SHA1( concat_ws( '|', sort_array( collect_list( row_sha)))))) as " \ 150 | "row_sha_gb, %s FROM full_lines GROUP BY gb" % hive_list_shas # final query where all the shas 151 | # are grouped by 
row-blocks 152 | logging.debug("##### Final Hive query is:\n%s\n", hive_query) 153 | 154 | return hive_query 155 | 156 | def delete_temporary_table(self, table_name): 157 | self.query("DROP TABLE " + table_name).close() 158 | 159 | def query(self, query): 160 | """Execute the received query in Hive and return the cursor which is ready to be fetched and MUST be closed after 161 | 162 | :type query: str 163 | :param query: query to execute in Hive 164 | 165 | :rtype: :class:`pyhs2.cursor.Cursor` 166 | :returns: the cursor for this query 167 | 168 | :raises: IOError if the query has some execution errors 169 | """ 170 | logging.debug("Launching Hive query") 171 | # TODO split number should be done in function of file format (ORC, Avro...) and number of columns 172 | # split_maxsize = 256000000 173 | # split_maxsize = 64000000 174 | split_maxsize = 8000000 175 | # split_maxsize = 16000000 176 | try: 177 | cur = self.connection.cursor() 178 | cur.execute("set mapreduce.input.fileinputformat.split.maxsize = %i" % split_maxsize) 179 | cur.execute("set hive.fetch.task.conversion=minimal") # force a MapReduce, because simple 'fetch' queries 180 | # on a large table may generate some timeout otherwise 181 | cur.execute(query) 182 | except: 183 | raise IOError("There was a problem in executing the query in Hive: %s", sys.exc_info()[1]) 184 | logging.debug("Fetching Hive results") 185 | return cur 186 | 187 | def launch_query_dict_result(self, query, result_dic, all_columns_from_2=False): 188 | try: 189 | cur = self.query(query) 190 | while cur.hasMoreRows: 191 | row = cur.fetchone() 192 | if row is not None: 193 | if not all_columns_from_2: 194 | result_dic[row[0]] = row[1] 195 | else: 196 | result_dic[row[0]] = row[2:] 197 | except: 198 | result_dic["error"] = sys.exc_info()[1] 199 | raise 200 | finally: 201 | cur.close() 202 | logging.debug("All %i Hive rows fetched", len(result_dic)) 203 | 204 | def launch_query_csv_compare_result(self, query, rows): 205 | cur = self.query(query) 206 | while cur.hasMoreRows: 207 | row = cur.fetchone() 208 | if row is not None: 209 | line = "^ " + " | ".join([str(col) for col in row]) + " $" 210 | rows.append(line) 211 | logging.debug("All %i Hive rows fetched", len(rows)) 212 | cur.close() 213 | 214 | def launch_query_with_intermediate_table(self, query, result): 215 | try: 216 | cur = self.query("add jar " + self.jarPath) # must be in a separated execution 217 | cur.execute("create temporary function SHA1 as 'org.apache.hadoop.hive.ql.udf.UDFSha1'") 218 | cur.execute("create temporary function DecodeCP1252 as " 219 | "'org.apache.hadoop.hive.ql.udf.generic.GenericUDFDecodeCP1252'") 220 | except: 221 | result["error"] = sys.exc_info()[1] 222 | raise 223 | 224 | if "error" in result: 225 | cur.close() 226 | return # let's stop the thread if some error popped up elsewhere 227 | 228 | tmp_table = "%s.temp_hiveCmpBq_%s_%s" % (self.database, self.full_name.replace('.', '_'), 229 | str(time.time()).replace('.', '_')) 230 | cur.execute("CREATE TABLE " + tmp_table + " AS\n" + query) 231 | cur.close() 232 | result["names_sha_tables"][self.get_id_string()] = tmp_table # we confirm this table has been created 233 | result["cleaning"].append((tmp_table, self)) 234 | 235 | logging.debug("The temporary table for Hive is " + tmp_table) 236 | 237 | if "error" in result: # A problem happened in the other query of the other table (usually BQ, since it is 238 | # faster than Hive) so there is no need to pursue or have the temp table 239 | return 240 | 241 | projection_hive_row_sha = 
"SELECT gb, row_sha_gb FROM %s" % tmp_table 242 | self.launch_query_dict_result(projection_hive_row_sha, result["sha_dictionaries"][self.get_id_string()]) 243 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2017 bol.com 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /hive_compared_bq/bq.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Copyright 2017 bol.com. All Rights Reserved 4 | 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 
8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | """ 18 | 19 | import logging 20 | import sys 21 | import time 22 | # noinspection PyProtectedMember 23 | from hive_compared_bq import _Table 24 | from google.cloud import bigquery 25 | 26 | 27 | class TBigQuery(_Table): 28 | """BigQuery implementation of the _Table object""" 29 | 30 | hash2_js_udf = '''create temp function hash2(text STRING) 31 | returns INT64 32 | LANGUAGE js AS """ 33 | if(text === null){ 34 | return 0 // same behaviour as in Hive 35 | } 36 | let myHash = 0 37 | for (let character of text){ 38 | myHash = myHash * 31 + character.charCodeAt(0) 39 | if (myHash >= 4294967296){ // because in Hive hash() is computed on integers range 40 | myHash = myHash % 4294967296 41 | } 42 | } 43 | if (myHash >= 2147483648){ 44 | myHash = myHash - 4294967296 45 | } 46 | return myHash 47 | """; 48 | ''' 49 | 50 | def __init__(self, database, table, parent, project): 51 | _Table.__init__(self, database, table, parent) 52 | 53 | self.project = project # the Google Cloud project where this dataset/table belongs.If Null, then the default 54 | # environment where this script is executed is used. 55 | self.connection = self._create_connection() 56 | 57 | # check that we can reach dataset and table 58 | dataset = self.connection.dataset(database) 59 | if not dataset.exists(): 60 | raise AttributeError("The dataset %s:%s does not seem to exist or is unreachable" % (project, database)) 61 | 62 | mytable = dataset.table(table) 63 | if not mytable.exists(): 64 | raise AttributeError("The table %s:%s.%s does not seem to exist or is unreachable" % 65 | (project, database, table)) 66 | 67 | def get_type(self): 68 | return "bigQuery" 69 | 70 | def _create_connection(self): 71 | """Connect to the table and return the connection object that we will use to launch queries""" 72 | if self.project is None: 73 | return bigquery.Client() 74 | else: 75 | return bigquery.Client(project=self.project) 76 | 77 | def get_ddl_columns(self): 78 | if len(self._ddl_columns) > 0: 79 | return self._ddl_columns 80 | else: 81 | dataset = self.connection.dataset(self.database) 82 | table = dataset.table(self.table) 83 | table.reload() 84 | schema = table.schema 85 | 86 | all_columns = [] 87 | for field in schema: 88 | col_name = str(field.name) 89 | col_type = str( 90 | field.field_type.lower()) # force 'str' to remove unicode notation and align it to Hive format 91 | # let's align the types with the ones in Hive 92 | if col_type == 'integer': 93 | col_type = 'bigint' 94 | my_dic = {"name": col_name, "type": col_type} 95 | all_columns.append(my_dic) 96 | 97 | self.filter_columns_from_cli(all_columns) 98 | 99 | return self._ddl_columns 100 | 101 | def get_column_statistics(self, query, selected_columns): 102 | for row in self.query(query): 103 | for idx, col in enumerate(selected_columns): 104 | value_column = row[idx] 105 | col["Counter"][value_column] += 1 106 | 107 | def create_sql_groupby_count(self): 108 | where_condition = "" 109 | if self.where_condition is not None: 110 | where_condition = "WHERE " + self.where_condition 111 | query = self.hash2_js_udf + "SELECT MOD( hash2( cast(%s as 
STRING)), %i) as gb, count(*) as count FROM %s %s " \ 112 | "GROUP BY gb ORDER BY gb" \ 113 | % (self.get_groupby_column(), self.tc.number_of_group_by, self.full_name, 114 | where_condition) 115 | logging.debug("BigQuery query is: %s", query) 116 | return query 117 | 118 | def create_sql_show_bucket_columns(self, extra_columns_str, buckets_values): 119 | where_condition = "" 120 | if self.where_condition is not None: 121 | where_condition = self.where_condition + " AND" 122 | gb_column = self.get_groupby_column() 123 | bq_query = self.hash2_js_udf + "SELECT MOD( hash2( cast(%s as STRING)), %i) as bucket, %s as gb, %s FROM %s " \ 124 | "WHERE %s MOD( hash2( cast(%s as STRING)), %i) IN (%s)" \ 125 | % (gb_column, self.tc.number_of_group_by, gb_column, extra_columns_str, 126 | self.full_name, where_condition, gb_column, self.tc.number_of_group_by, 127 | buckets_values) 128 | logging.debug("BQ query to show the buckets and the extra columns is: %s", bq_query) 129 | 130 | return bq_query 131 | 132 | def create_sql_intermediate_checksums(self): 133 | column_blocks = self.get_column_blocks(self.get_ddl_columns()) 134 | number_of_blocks = len(column_blocks) 135 | logging.debug("%i column_blocks (with a size of %i columns) have been considered: %s", number_of_blocks, 136 | self.tc.block_size, str(column_blocks)) 137 | 138 | # Generate the concatenations for the column_blocks 139 | bq_basic_shas = "" 140 | for idx, block in enumerate(column_blocks): 141 | bq_basic_shas += "TO_BASE64( sha1( concat( " 142 | for col in block: 143 | name = col["name"] 144 | bq_value_name = name 145 | if col["type"] == 'decimal': # removing trailing & unnecessary 'zero decimal' (*.0) 146 | bq_value_name = 'regexp_replace( %s, "\\.0$", "")' % name 147 | elif col["type"] == 'float' or col["type"] == 'double': 148 | bq_value_name = "cast( cast( FLOOR( %s * 10000) as INT64) as STRING)" % name 149 | elif not col["type"] == 'string': 150 | bq_value_name = "cast( %s as STRING)" % name 151 | bq_basic_shas += "CASE WHEN %s IS NULL THEN 'n_%s' ELSE %s END, '|'," % (name, name[:2], bq_value_name) 152 | bq_basic_shas = bq_basic_shas[:-6] + "))) as block_%i,\n" % idx 153 | bq_basic_shas = bq_basic_shas[:-2] 154 | 155 | where_condition = "" 156 | if self.where_condition is not None: 157 | where_condition = "WHERE " + self.where_condition 158 | 159 | bq_query = self.hash2_js_udf + "WITH blocks AS (\nSELECT MOD( hash2( cast(%s as STRING)), %i) as gb,\n%s\n" \ 160 | "FROM %s %s\n),\n" \ 161 | % (self.get_groupby_column(), self.tc.number_of_group_by, bq_basic_shas, 162 | self.full_name, where_condition) # 1st CTE with the basic block shas 163 | list_blocks = ", ".join(["block_%i" % i for i in range(number_of_blocks)]) 164 | bq_query += "full_lines AS(\nSELECT gb, TO_BASE64( sha1( concat( %s))) as row_sha, %s FROM blocks\n)\n" \ 165 | % (list_blocks, list_blocks) # 2nd CTE to get all the info of a row 166 | bq_list_shas = ", ".join(["TO_BASE64( sha1( STRING_AGG( block_%i, '|' ORDER BY block_%i))) as block_%i_gb " 167 | % (i, i, i) for i in range(number_of_blocks)]) 168 | bq_query += "SELECT gb, TO_BASE64( sha1( STRING_AGG( row_sha, '|' ORDER BY row_sha))) as row_sha_gb, %s FROM " \ 169 | "full_lines GROUP BY gb" % bq_list_shas # final query where all the shas are grouped by row-blocks 170 | logging.debug("##### Final BigQuery query is:\n%s\n", bq_query) 171 | 172 | return bq_query 173 | 174 | def delete_temporary_table(self, table_name): 175 | pass # The temporary (cached) tables in BigQuery are deleted after 24 hours 176 | 177 | def 
query(self, query): 178 | """Execute the received query in BigQuery and return an iterate Result object 179 | 180 | :type query: str 181 | :param query: query to execute in BigQuery 182 | 183 | :rtype: list of rows 184 | :returns: the QueryResults for this query 185 | """ 186 | logging.debug("Launching BigQuery query") 187 | q = self.connection.run_sync_query(query) 188 | q.timeout_ms = 600000 # 10 minutes to execute the BQ query should be more than enough. 1 minute was too short 189 | # TODO use maxResults https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query? : 190 | q.use_legacy_sql = False 191 | q.run() 192 | logging.debug("Fetching BigQuery results") 193 | return q.fetch_data() 194 | 195 | def query_ctas_bq(self, query): 196 | """Execute the received query in BigQuery and return the name of the cache results table 197 | 198 | This is the equivalent of a "Create Table As a Select" in Hive. The advantage is that BigQuery only keeps that 199 | table during 24 hours (we don't have to delete it just like in the case of Hive), and we're not charged for the 200 | space used. 201 | 202 | :type query: str 203 | :param query: query to execute in BigQuery 204 | 205 | :rtype: str 206 | :returns: the full name of the cache table (dataset.table) that stores those results 207 | 208 | :raises: IOError if the query has some execution errors 209 | """ 210 | logging.debug("Launching BigQuery CTAS query") 211 | job_name = "job_hive_compared_bq_%f" % time.time() # Job ID must be unique 212 | job = self.connection.run_async_query(job_name.replace('.', '_'), 213 | query) # replace(): Job IDs must be alphanumeric 214 | job.use_legacy_sql = False 215 | job.begin() 216 | time.sleep(3) # 3 second is the minimum latency we get in BQ in general. So no need to try fetching before 217 | retry_count = 300 # 10 minutes (because of below time sleep of 2 seconds). This should be enough 218 | while retry_count > 0 and job.state != 'DONE': 219 | retry_count -= 1 220 | time.sleep(2) 221 | job.reload() 222 | logging.debug("BigQuery CTAS query finished") 223 | 224 | if job.errors is not None: 225 | raise IOError("There was a problem in executing the query in BigQuery: %s" % str(job.errors)) 226 | 227 | cache_table = job.destination.dataset_name + '.' 
+ job.destination.name 228 | logging.debug("The cache table of the final comparison query in BigQuery is: " + cache_table) 229 | 230 | return cache_table 231 | 232 | def launch_query_dict_result(self, query, result_dic, all_columns_from_2=False): 233 | for row in self.query(query): 234 | if not all_columns_from_2: 235 | result_dic[row[0]] = row[1] 236 | else: 237 | result_dic[row[0]] = row[2:] 238 | logging.debug("All %i BigQuery rows fetched", len(result_dic)) 239 | 240 | def launch_query_csv_compare_result(self, query, rows): 241 | for row in self.query(query): 242 | line = "^ " + " | ".join([str(col) for col in row]) + " $" 243 | rows.append(line) 244 | logging.debug("All %i BigQuery rows fetched", len(rows)) 245 | 246 | def launch_query_with_intermediate_table(self, query, result): 247 | try: 248 | result["names_sha_tables"][self.get_id_string()] = self.query_ctas_bq(query) 249 | projection_gb_row_sha = "SELECT gb, row_sha_gb FROM %s" % result["names_sha_tables"][self.get_id_string()] 250 | self.launch_query_dict_result(projection_gb_row_sha, result["sha_dictionaries"][self.get_id_string()]) 251 | except: 252 | result["error"] = sys.exc_info()[1] 253 | raise 254 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # hive_compared_bq 2 | 3 | hive_compared_bq is a Python program that compares 2 (SQL like) tables, and graphically shows the rows/columns that are different if any are found. 4 | 5 | Comparing the full content of some tables in a scalable way has proved to be a difficult task. 6 | 7 | hive_compared_bq tackles this problem by: 8 | * Not moving the data, avoiding long data transfer typically needed for a "Join compared" approach. Instead, only the aggregated data is moved. 
9 | * Working on the full datasets and all the columns, so that we are 100% sure that the 2 tables are the same 10 | * Leveraging the tools (currently: Hive, BigQuery) to be as scalable as possible 11 | * Showing the developer in a graphical way the differences found, so that the developer can easily understands its mistakes and fix them 12 | 13 | # Table of contents 14 | * [Features](#features) 15 | * [Installation](#installation) 16 | + [Note Python version 2.6](#note-python-version-26) 17 | + [Installing Python 2.7](#installing-python-27) 18 | + [For Hive](#for-hive) 19 | - [Installation of the required UDF](#installation-of-the-required-udf) 20 | + [For Big Query](#for-big-query) 21 | * [Usage](#usage) 22 | + [Get Help](#get-help) 23 | + [Basic execution](#basic-execution) 24 | + [Explanation of results](#explanation-of-results) 25 | - [Case of identical tables](#case-of-identical-tables) 26 | - [Case of number of rows not matching](#case-of-number-of-rows-not-matching) 27 | - [Case of differences inside the rows](#case-of-differences-inside-the-rows) 28 | + [Advanced executions](#advanced-executions) 29 | - [Faster executions](#faster-executions) 30 | - [Skewing problem](#skewing-problem) 31 | - [Schema not matching](#schema-not-matching) 32 | - [HBase tables](#hbase-tables) 33 | - [Encoding differences between Hive and BigQuery](#encoding-differences-between-hive-and-bigquery) 34 | - [Problems in the selection of the GroupBy column](#problems-in-the-selection-of-the-groupby-column) 35 | * [Algorithm](#algorithm) 36 | + [Imprecision due to "float" of "double" types](#imprecision-due-to--float--of--double--types) 37 | 38 | ## Features 39 | 40 | * Engines supported: Hive, BigQuery (and HBase to some extent). In theory, it is easy to extend it to other SQL backends such as Spanner, CloudSQL, Oracle... Help is welcomed :) ! 41 | * Possibility to only select specific columns or to remove some of them (useful if the schema between the tables is not exactly the same, or if we know that some columns are different and we don't want them to "pollute" the results) 42 | * Possibility to just do a quick check (counting the rows in an advanced way) instead of complete checksum verification 43 | * Detection of skew 44 | 45 | ## Installation 46 | 47 | This software works with Mac (tested on 10.11 and 10.12) and on Linux (tested on Redhat 6). It is not expected to work on Windows. 48 | 49 | Make sure the following software is available: 50 | * Python = 2.7 or 2.6 (see [restrictions for 2.6](#note-python-version-26)) 51 | * pyhs2 (just for Hive) 52 | * google.cloud.bigquery (just for BigQuery) 53 | 54 | ### Note Python version 2.6 55 | BigQuery is not supported by this version of Python (Google Cloud SDK only supports Python 2.7) 56 | 57 | It is needed to install some backports for Python's collection. For instance, for a Redhat server it would be: 58 | ```bash 59 | yum install python-backport_collections.noarch 60 | ``` 61 | 62 | ### Installing Python 2.7 63 | On RHEL6, Python 2.6 was installed by default. 
And the RPM version in EPEL for Python2.7 had no RPM available, so I compiled it this way: 64 | ```bash 65 | su -c "yum install gcc gcc-c++ zlib-devel sqlite-devel openssl-devel" 66 | wget https://www.python.org/ftp/python/2.7.13/Python-2.7.13.tgz 67 | tar xvfz Python-2.7.13.tgz 68 | cd Python-2.7.13 69 | ./configure --with-ensurepip=install 70 | make 71 | su -c "make altinstall" 72 | su -c "echo 'export CLOUDSDK_PYTHON=/usr/local/bin/python2.7' >> /etc/bashrc" 73 | export CLOUDSDK_PYTHON=/usr/local/bin/python2.7 74 | ``` 75 | 76 | ### For Hive 77 | Execute this as "root": 78 | ```bash 79 | yum install cyrus-sasl-gssapi cyrus-sasl-devel 80 | pip install pyhs2 81 | ``` 82 | 83 | 84 | #### Installation of the required UDF 85 | 86 | On the Hadoop cluster where Hive executes, you must copy the library: `udf/hcbq.jar` (this jar contains 2 UDFs: one to compute the SHA1s, and the other to handle encoding translation). 87 | 88 | Place this Jar on a HDFS directory where you have read access (in the example below, we will consider that this jar has been copied in `/user/sluangsay/lib/hcbq.jar`). 89 | 90 | ### For Big Query 91 | 92 | (steps extracted from https://cloud.google.com/bigquery/docs/reference/libraries#client-libraries-install-python) 93 | 94 | To execute below commands, make sure that you already have a Google Cloud account with a BigQuery project created. 95 | 96 | ```bash 97 | mkdir -p ~/bin/googleSdk 98 | cd ~/bin/googleSdk 99 | su -c "pip install --upgrade google-cloud-bigquery" 100 | 101 | wget https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-170.0.0-linux-x86_64.tar.gz 102 | tar xvfz google-cloud-sdk-170.0.0-linux-x86_64.tar.gz 103 | 104 | ./google-cloud-sdk/bin/gcloud init 105 | ./google-cloud-sdk/bin/gcloud auth application-default login 106 | ./google-cloud-sdk/install.sh 107 | ``` 108 | 109 | This last command tells you to source 2 files in every session. So include them in your bash profile. For instance in my case it would be: 110 | ```bash 111 | echo "source ${HOME}/bin/googlesdk/google-cloud-sdk/path.bash.inc" >> ~/.bash_profile 112 | echo "source ${HOME}/bin/googlesdk/google-cloud-sdk/completion.bash.inc" >> ~/.bash_profile 113 | ``` 114 | Open a new bash session to activate those changes. 115 | 116 | ## Usage 117 | 118 | ### Get Help 119 | 120 | To see all the quick options of hive_compared_bq.py, just execute it without any arguments: 121 | ```bash 122 | python hive_compared_bq.py 123 | ``` 124 | 125 | When calling the help, you get more detailled on the different options and also some examples: 126 | ```bash 127 | python hive_compared_bq.py --help 128 | ``` 129 | 130 | ### Basic execution 131 | 132 | You must indicate the 2 tables to compare as arguments (the first table will be considered as the "source" table, and the second one as the "destination" table). 133 | 134 | Each table must have the following format: `type`/`database`.`table` 135 | where: 136 | * `type` is the technology of your database (currently, only 'hive' and 'bq' (BigQuery) are supported) 137 | * `database` is the name of your database (also called "dataset" in BigQuery) 138 | * `table` is of course the name of your table 139 | 140 | About the location of those databases: 141 | * In the case of BigQuery, the default Google Cloud project configured in your environment is selected.
142 | If you want to specify another project, you must indicate it through the `project` parameter of the `-s` or `-d` options. 143 | * In the case of Hive, you must specify the hostname of the HiveServer2, using the `hs2` parameter of the `-s` or `-d` options. 144 | 145 | Another note for Hive: you need to pass the HDFS location of the jar of the required UDF (see the Hive installation section above), using the 'jar' option. 146 | 147 | To clarify all the above, let's consider that we want to compare the following 2 tables: 148 | * A Hive table called `hive_compared_bq_table`, inside the database `sluangsay`.
149 | With those parameters, the argument to give is: `hive/sluangsay.hive_compared_bq_table`.
150 | Let's suppose also that the hostname of HiveServer2 is `master-003.bol.net`, and that we installed the required Jar in the HDFS path 'hdfs://hdp/user/sluangsay/lib/hcbq.jar'. Then, since this table is the first one on our command line, it is the source table and we need to define the option: `-s "{'jar': 'hdfs://hdp/user/sluangsay/lib/hcbq.jar', 'hs2': 'master-003.bol.net'}"` 151 | * A BigQuery table also alled `hive_compared_bq_table`, inside the dataset `bidwh2`. 152 | 153 | To compare above 2 tables, you need to execute: 154 | ```bash 155 | python hive_compared_bq.py -s "{'jar': 'hdfs://hdp/user/sluangsay/lib/hcbq.jar', 'hs2': 'master-003.bol.net'}" hive/sluangsay.hive_compared_bq_table bq/bidwh2.hive_compared_bq_table 156 | ``` 157 | 158 | ### Explanation of results 159 | 160 | #### Case of identical tables 161 | 162 | When executing above command, if the 2 tables have exactly the same data, then we would get an output similar to: 163 | 164 | ``` 165 | [INFO] [2017-09-15 10:59:20,851] (MainThread) Analyzing the columns ['rowkey', 'calc_timestamp', 'categorization_step', 'dts_modified', 'global_id', 'product_category', 'product_group', 'product_subgroup', 'product_subsubgroup', 'unit'] with a sample of 10000 values 166 | [INFO] [2017-09-15 10:59:22,285] (MainThread) Best column to do a GROUP BY is rowkey (occurrences of most frequent value: 1 / the 50 most frequentvalues sum up 50 occurrences) 167 | [INFO] [2017-09-15 10:59:22,286] (MainThread) Executing the 'Group By' Count queries for sluangsay.hive_compared_bq_table (hive) and bidwh2.hive_compared_bq_table (bigQuery) to do first comparison 168 | No differences were found when doing a Count on the tables sluangsay.hive_compared_bq_table and bidwh2.hive_compared_bq_table and grouping by on the column rowkey 169 | [INFO] [2017-09-15 10:59:48,727] (MainThread) Executing the 'shas' queries for hive_sluangsay.hive_compared_bq_table and bigQuery_bidwh2.hive_compared_bq_table to do final comparison 170 | Sha queries were done and no differences were found: the tables hive_sluangsay.hive_compared_bq_table and bigQuery_bidwh2.hive_compared_bq_table are equal! 171 | ``` 172 | 173 | The first 2 lines describe the first step of the algorithm (see the explanation of the algorithm later), and we see that `rowkey` is the column that is here used to perform the `Group By` operations.
174 | Then, the next 2 lines show the second step: we count the number of rows for each value of `rowkey`. In this case, the counts match for the 2 tables.
175 | Finally, we do the extensive computation of the SHAs for all the columns and rows of the 2 tables. At the end, the script tells us that our 2 tables are identical.
176 | We can also check the return value of the script: it is 0, which means no differences. 177 | 178 | #### Case of number of rows not matching 179 | 180 | The first difference that can be observed is that the number of rows associated with (at least) one GroupBy value does not match between the 2 tables.
181 | In such case, the standard output would be similar to: 182 | 183 | ``` 184 | [INFO] [2017-09-18 08:09:06,947] (MainThread) Analyzing the columns ['rowkey', 'calc_timestamp', 'categorization_step', 'dts_modified', 'global_id', 'product_category', 'product_group', 'product_subgroup', 'product_subsubgroup', 'unit'] with a sample of 10000 values 185 | [INFO] [2017-09-18 08:09:07,739] (MainThread) Best column to do a GROUP BY is rowkey (occurrences of most frequent value: 1 / the 50 most frequentvalues sum up 50 occurrences) 186 | [INFO] [2017-09-18 08:09:07,739] (MainThread) Executing the 'Group By' Count queries for sluangsay.hive_compared_bq_table2 (hive) and bidwh2.hive_compared_bq_table2 (bigQuery) to do first comparison 187 | [INFO] [2017-09-18 08:09:35,392] (MainThread) We found at least 3 differences in Group By count 188 | ``` 189 | 190 | And the return value of the script would be 1. 191 | 192 | Some of the differences are also shown in a HTML file that is automatically opened in your browser: 193 | 194 | ![alt text](docs/images/differences_count.png?raw=true "Differences in GroupBy numbers") 195 | 196 | This shows a table, with the rows of 1 table on the left side, and the ones of the other table on the right side.
197 | The names of the 2 tables appear at the top of that table.
198 | There we can also see the names of the columns that are shown.
199 | For performance reasons and for the sake of brevity, only 7 columns are shown.
200 | The first 2 columns are "special columns": the GroupBy column (2nd column), preceded by its SHA1 value (1st column).
201 | In this example, we can see that only rows on the left side appear: this is because the table on the right does not contain rows that contain the rowkeys 21411000029, 65900009 and 6560009. 202 | 203 | #### Case of differences inside the rows 204 | 205 | If the numbers of rows match it is still possible to observe some differences in the last (SHA1) step. In which case, we would get some message similar to: 206 | 207 | ``` 208 | [INFO] [2017-09-28 05:53:55,890] (MainThread) Analyzing the columns ['rowkey', 'calc_timestamp', 'categorization_step', 'dts_modified', 'global_id', 'product_category', 'product_group', 'product_subgroup', 'product_subsubgroup', 'unit'] with a sample of 10000 values 209 | [INFO] [2017-09-28 05:53:56,897] (MainThread) Best column to do a GROUP BY is rowkey (occurrences of most frequent value: 1 / the 50 most frequentvalues sum up 50 occurrences) 210 | [INFO] [2017-09-28 05:53:56,897] (MainThread) Executing the 'shas' queries for hive_sluangsay.hive_compared_bq_table3 and bigQuery_bidwh2.hive_compared_bq_table3 to do final comparison 211 | [INFO] [2017-09-28 05:54:26,961] (MainThread) We found 2 differences in sha verification 212 | Showing differences for columns product_category ,product_group ,product_subgroup ,product_subsubgroup ,unit 213 | ``` 214 | 215 | The return value of the script would be 1. 216 | 217 | Some of the differences are also shown in a HTML file that is automatically opened in your browser: 218 | 219 | ![alt text](docs/images/differences_sha.png?raw=true "Differences in SHA") 220 | 221 | We can see some similar information as the one exposed for differences in the [Count validation](#case-of-number-of-rows-not-matching).
222 | The 7 columns are: first the 2 "special columns", then the 5 columns of a "column block" (see [algorithm](#algorithm) for explanations) which contains some differences. That means that at least 1 of those 5 columns has at least 1 differing value.
223 | In our example, we can see that the 2 rows have some differences in the column `product_subgroup`. Those differences are highlighted in yellow. 224 | 225 | If there are several "column blocks" that have some differences, then the program will first show the column block that contains the most "row blocks" with differences (take care: that does not mean that it is the column block with the most differences. Indeed, we could have a column block with just 1 row block with differences, but that single row block could contain thousands of rows with differences. On the other hand, another column block could have 2 row blocks containing differences, with each row block containing a single differing row).
226 | Then after, the program will ask you if you wish to see another column block with differences. 227 | 228 | ### Advanced executions 229 | 230 | #### Faster executions 231 | 232 | By default, the script executes 3 steps (see the explanation of how the algorithm executes). 233 | You can run the script faster by removing some steps that you think are unnecessary: 234 | 235 | * with the `--group-by-column` option, you can indicate directly which column you want to use for the Group By. 236 | That means that the first step of analyzing some sample of 10 columns won't be needed. 237 | This step won't usually make a huge difference in the execution time (it usually takes 1-2 seconds), but by doing this you avoid launching a query that might cost you some money (in the case of BigQuery for example). 238 | With this option, you might also be able to provide a better column than the one the script would have discovered by itself, which might speed up the following queries (by avoiding skew for instance, see notes later). 239 | 240 | * with the `--just-count` option, you say that you just want to do a 'count rows validation'. 241 | This is some kind of "basic validation" because you won't be sure that the contents of the rows have identical values. 242 | But that allows you to avoid the final "full SHAs validation" step, which is more expensive and time-consuming to compute. 243 | And maybe this 'count' validation is enough for you now, because you are at an early stage of development and you don't need to be 100% sure of your data 244 | (checking the count of rows is also a good idea to double check whether some JOINs or some Filter conditions work properly). 245 | Don't forget to eventually run a full SHAs validation when you finish developing. 246 | 247 | * with `--just-sha`, you specify that you don't need the 'count' validation. If you know from previous executions that the counts are correct, then you might indeed decide to skip that previous step. 248 | However, it is a bit at your own risk, because if the counts are not correct, the script will fail but you will have executed a more complex/costly query for that (the 'count' validation uses faster/cheaper queries). 249 | 250 | Another way to have your validation executed faster is to limit the scope of your validations. If you decide to validate less data, then you need to process less data, meaning that your queries will be faster/cheaper: 251 | 252 | * for instance, you might be interested in just validating some specific critical columns (maybe because you know that your ETL process does not make any changes on some columns, so why "validate" them?). 253 | In such case, you can use the `--column-range`, `--columns` or `--ignore-columns` options. 254 | 255 | * another example would be just validating some specific partitions. If your data is partitioned by days for instance, then you might decide to only validate 1 day of data. 256 | In such case you have to use the `--source-where` and `--destination-where` options to specify a Where condition that tells which partition you want to consider. 257 | 258 | #### Skewing problem 259 | 260 | The program does several queries with some GroupBy operations. As for every GroupBy operation on a huge volume of data, skew can be a performance killer, or can even make the query fail because of a lack of resources. 261 | 262 | The "count comparison" step (see the description of the algorithm to know more about the steps) does not do any complex computation, so skew is annoying but should not be a very big issue.
263 | However, the "SHA1 comparison" step launches some heavy queries, and there skew can make a huge difference.
264 | In our algorithm, skew is determined by the distribution of the GroupBy column. This is why the selection of this column in the first step of the program is crucial, and why specifying yourself a column with a good distribution can make a huge difference.
265 | For the above reasons, a skew detection is performed during the "count comparison" step. If a "groupBy value" appears in more than 40 000 rows (you may change this default value with the '--skew-threshold' option), then this groupBy value is registered and a warning message like this appears: 266 | ``` 267 | Some important skew (threshold: %i) was detected in the Group By column x. The top values are: 268 | ``` 269 | Following that message, the list of the top 10 skewed values (and their number of occurrences) is printed, so that developers know they should avoid selecting that GroupBy column the next time they run the program. 270 | It also gives them the opportunity to wonder whether it is normal for this dataset to contain such skewed values. 271 | 272 | If the "count comparison" step has not encountered any error but some skew has been discovered, then the program also stops, with the following message: 273 | ``` 274 | No difference in Group By count was detected but we saw some important skew that could make the next step (comparison of the shas) very slow or failing. So better stopping now. You should consider choosing another Group By column with the '--group-by-column' option 275 | ``` 276 | As explained before, stopping at this stage is a protection against launching some heavy/costly queries that have a high probability of failing.
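If you want to check by hand how skewed a candidate GroupBy column is before re-running the script, a simple query along these lines can help (just a sketch: the table and column names are hypothetical):
```
SELECT customer_id, COUNT(*) AS cnt FROM my_db.my_table GROUP BY customer_id ORDER BY cnt DESC LIMIT 10
```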
277 | Should you face this situation, your best option is to specify a GroupBy column with a better distribution via the `--group-by-column` option. Another possibility is to raise the threshold with `--skew-threshold`: in that case you accept and understand the risk of launching the SHA1 computations with these skewed values. 278 | 279 | #### Schema not matching 280 | 281 | To do the comparison, the program first needs to discover the schemas of the tables. What is actually done is fetching the schema of the "source table" and assuming that the "destination table" has the same schema. 282 | 283 | If the schemas of the 2 tables don't match, then by default it is not possible to compare them.
284 | This can easily happen for instance with partitioned tables in BigQuery, where the partition column is called `_PARTITIONTIME` and may not match the name of the corresponding column in Hive.
285 | To overcome this, there are 2 possibilities: 286 | * create some views (in Hive or BigQuery) to rename or remove some columns. Take care not to make such a view too complex, otherwise that could lower the validity of the comparison. 287 | * limit the columns you want to compare, using the options `--column-range`, `--columns`, `--ignore-columns`. The problem with this approach is that you cannot rename columns. 288 | 289 | #### HBase tables 290 | 291 | It is also possible to do the comparison with HBase tables.
292 | To do so, you need to create a Hive table with an HBase backend (see the documentation: https://cwiki.apache.org/confluence/display/Hive/HBaseIntegration ).
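Such a mapping could look like the following (a sketch only: the table, column family and column names are hypothetical, and the `hbase.columns.mapping` must be adapted to your own HBase schema):
```
CREATE EXTERNAL TABLE my_db.hbase_orders (key STRING, amount DOUBLE, status STRING)
STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler'
WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key,d:amount,d:status")
TBLPROPERTIES ("hbase.table.name" = "orders");
```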
293 | Be aware that Hive on top of HBase has some limitations, so you won't be able to check all the data in your HBase tables (for instance: you cannot see the timestamps or older versions of a cell, and all the columns must be properly defined in the Hive schema). 294 | 295 | #### Encoding differences between Hive and BigQuery 296 | 297 | All the data that is internally stored by BigQuery is automatically saved with a UTF8 encoding. That can give some problems when some strings in Hive use another encoding: the byte representations of a column may not match even if the 'texts' are the same.
298 | To overcome this, a UDF has been developed in Hive in order to transform some specific columns from a CP1252 encoding into Google's UTF8 encoding.
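If you want to experiment with this UDF by hand in Hive (the script itself can be pointed at the jar through the 'jar' entry of its Hive table options), the registration would look roughly like this; the function name and jar path below are hypothetical choices:
```
ADD JAR hdfs:///some/path/hcbq.jar;
CREATE TEMPORARY FUNCTION decode_cp1252 AS 'org.apache.hadoop.hive.ql.udf.generic.GenericUDFDecodeCP1252';
SELECT decode_cp1252(description) FROM my_db.my_table LIMIT 10;
```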
299 | Use the `--decodeCP1252-columns` option to specify those columns. Take care: this code is quite experimental and does not guarantee that the "translation of encoding" will always work. 300 | 301 | #### Problems in the selection of the GroupBy column 302 | 303 | The first step of the algorithm tries to estimate the best "GroupBy" column (see the explanations about the Algorithm below). 304 | However, this estimation can go wrong, and in that case the following message appears: 305 | ``` 306 | Error: we could not find a suitable column to do a Group By. Either relax the selection condition with the '--max-gb-percent' option or directly select the column with '--group-by-column' 307 | ``` 308 | 309 | If you face this problem, then just follow one of the 2 solutions proposed in the message above.
310 | The best solution is obviously to have some knowledge about the data and to tell the script directly which column is best suited for the GroupBy, using the `--group-by-column` option.
311 | The other possibility is to use the `--max-gb-percent` option and make it higher than the default value (1%), in order to accept a less homogeneous distribution in the data of the GroupBy column and thus increase the probability of finding a suitable GroupBy column. 312 | 313 | ## Algorithm 314 | 315 | The goal of hive_compared_bq was to avoid the shortcomings of previous approaches that tried to solve the same comparison problem. 316 | Some of those approaches are: 317 | * Manual check, comparing a few rows and counting the total number of rows.
318 | The problem with this approach is obviously that it does not work beyond a certain amount of data, or with tables that have a lot of columns. So the check cannot be complete. 319 | * Doing some "JOIN" comparisons between the 2 tables.
320 | The drawback of this is that Joins are quite slow operations in general. What is more, it is very difficult to do a fair comparison when there are multiple rows with the same "Join column" value. 321 | The last inconvenience is that you have to ensure that the 2 tables are located in the same place, which might force you to transfer the data from 1 database to another (e.g. from a Hive database to a BigQuery database), which takes a long time if the table is huge. 322 | * Sorting all the rows and doing a Diff between them (just like described here: https://community.hortonworks.com/articles/1283/hive-script-to-validate-tables-compare-one-with-an.html ).
323 | The inconvenience with this is also that the 2 tables have to be located in the same place. What is more, the "Diff" operation occurs in memory, which means that this approach does not work with huge tables. 324 | 325 | To solve the above problems, it was decided to: 326 | * develop an algorithm that leverages the BigData backends (Hive, BigQuery or others): all the complicated computations must be performed on those engines, which naturally scale.
327 | That means that the goal of the algorithm is to generate some complex SQL queries that are sent to those backends, and then to compare the results of those queries. 328 | Those results must be small, otherwise the Python program would not be able to do the final comparison in memory. 329 | * compare all the rows and all the columns of the tables.
330 | To ensure that we get a relatively small amount of results (see the reason just above) out of a huge amount of data, we need to "reduce" the results. 331 | Using a checksum (SHA1) algorithm seems appropriate because it helps in doing that reduction while giving a high confidence in the validity of the comparison. 332 | 333 | In summary, the idea of the program is to compute, locally to each table, the checksums of all the rows/columns, to "Group By" those checksums and to generate some global checksums on top of them.
334 | And then, all those general checksums are transferred back to the program, where the final comparison is made and where potential differences are shown to the user. 335 | The following simplified pseudo SQL query summarizes this idea a bit: 336 | 337 | WITH blocks AS ( 338 | SELECT MOD( hash2( column), 100000) as gb, sha1(concat( col0, col1, col2, col3, col4)) as block_0, 339 | sha1(concat( col5, col6, col7, col8, col9)) as block_1, ... as block_N FROM table 340 | ), 341 | full_lines AS ( 342 | SELECT gb, sha1(concat( block_0, '|', block_1...)) as row_sha, block_0, block_1 ... FROM blocks 343 | ) 344 | SELECT gb, sha1(concat(list)) as sline, sha1(concat(list)) as sblock_1, 345 | sha1(concat(list)) as sblock_2 ... as sblock_N FROM full_lines GROUP BY gb 346 | 347 | The real query is obviously a bit more complex, because we need to take care of type conversions, NULL values and the ordering of the rows for the same GroupBy values.
348 | If you are interested in seeing the real query, launch the script with the `--verbose` option. 349 | 350 | The result of the above query is stored in a temporary table. The size of this table is small (because we take a modulo of the hash of the groupBy column, and because the number of columns is limited).
351 | Therefore, because this amount of data is reasonable, we can allow ourselves to download those results locally, where our Python program is executed. 352 | 353 | In the 2 temporary tables, the first 2 columns (gb and sline) are the most important.
354 | "gb" is used as a kind of "index". "sline" is the total checksum that 355 | corresponds to all the columns of all the rows whose GroupBy value modulo 100000 matches the "gb" value.
356 | If "sline" differs between the 2 tables for a same "gb" value, then we know that some rows in that "GroupBy-modulo block" don't match between the 2 tables.
357 | Then, the program queries the temporary tables again and asks for all the columns of the specific "gb" values that showed a difference.
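Conceptually, that second pass is just a query like the following against each temporary table (the table name and the bucket values below are placeholders):
```
SELECT * FROM tmp_sha_table WHERE gb IN (42, 1337)
```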
358 | The idea is to discover which "blocks of columns" (called "sblock_N" in the pseudo query shown earlier) are different.
359 | By doing so, we are able to know not only which rows are different, but also in which columns those differences appear.
360 | This allows us to launch some final simple SELECT queries against the original tables, fetching those rows and those specific columns.
361 | Finally, those 2 small datasets are sorted and a diff of them is shown in a web browser. 362 | 363 | The pseudo SQL query shown above is quite heavy to compute, and has to access all the data of each table. 364 | In order to avoid launching such a heavy query in case of "stupid mistakes" (maybe your ETL flow failed and 1 table is empty), a quick comparison step has been introduced before it. 365 | In general, we can divide the program into 3 steps: 366 | 367 | 1. **Determination of the GroupBy column**
368 | In this first step, we try to assess which column is the most suitable to be used as the GroupBy column.
369 | It is important to find a column that has many different values (otherwise, the next queries will be poorly parallelized, and we can have some OOM problems).
370 | We also want to avoid skewed columns.
371 | To do so, the program fetches a sample of data from 1 table: some 10 000 rows and some 10 columns. We only look at 10 columns for 3 reasons: 1) we suppose that it is enough to find a good column there 2) to limit the amount of data transferred over the network 3) to avoid being billed too much (for instance, the price in BigQuery grows with the number of columns being read).
372 | Finally, an analysis is done on those columns: we discard all the columns that don't have enough diversity or that are too skewed, and from the remaining columns we keep the one that seems the least skewed. 373 | Working on a sample has its limits, so it is possible that the chosen column is not the best one. It is also possible that all 10 columns get discarded.
374 | In both situations you can use the `--group-by-column` option to overcome that problem. 375 | 376 | 2. **Quick count check**
377 | The program does a first validation to quickly check, in a lightweight way, whether there are some big differences between the 2 tables.
378 | This is why we just try to count the number of rows in the tables. Rather than only counting the total number of rows, we check the number of rows for each "row-bucket", that is: each group of rows whose hashed GroupBy value modulo 100000 is the same.
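In the same pseudo SQL spirit as the checksum query shown earlier, this count check boils down to something like the following (here `id` stands for the chosen GroupBy column and `hash2` for the hash function of the backend):
```
SELECT MOD( hash2( id), 100000) as gb, count(*) as cnt FROM table GROUP BY gb
```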
379 | This verification is fast because it only needs to do some small computation on just 1 column. There is indeed no point in doing a very complex computation on all the columns if the numbers of rows do not even match.
380 | In that case, the differences are shown to the user and the program stops, without executing the 3rd step.
381 | During this step, a skew detection is also performed, which is a protection for the 3rd step. 382 | 383 | 3. **Full SHA1 validation**
384 | This step, described earlier in this section, is the only one that can guarantee that the 2 tables are identical. 385 | 386 | ### Imprecision due to "float" or "double" types 387 | 388 | The representations of decimal numbers are not always exact, and small differences in their representations can sometimes be observed between Hive and BigQuery, meaning that the corresponding SHA1 values will be totally different.
389 | For instance, the following query in BigQuery returns `8.5400000000000009`: 390 | ``` 391 | select cast( cast( 8.540000000000001 as FLOAT64 ) as STRING) 392 | ``` 393 | And in Hive we would get `8.540000000000001`. 394 | 395 | To overcome those problems, it was decided to: 396 | * multiply any float or double number by 10000 (most of the decimal numbers managed in Bol.com are 'price numbers' with just 2 digits, so multiplying by 10000 should give enough precision). 397 | * round the resulting number with floor() in order to get rid of any "decimal imprecision" 398 | * cast the result into an integer (otherwise the resulting number may or may not end up with a '.0', depending on whether it is Hive or BigQuery) 399 | 400 | The above approach works in many cases, but we understand that it is not a general solution (for instance, no distinction would be made between '0.0000001' and '0').
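In SQL terms, the normalization described in the list above boils down to something like this (a sketch only: `price` is a hypothetical double column, and the exact functions and cast types generated by the script may differ per backend, e.g. BIGINT in Hive versus INT64 in BigQuery):
```
SELECT CAST( FLOOR( price * 10000) AS BIGINT) FROM my_table
```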
401 | We may thus need in the future to add other possibilities to solve those imprecision problems. 402 | -------------------------------------------------------------------------------- /hive_compared_bq/hive_compared_bq.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Copyright 2017 bol.com. All Rights Reserved 4 | 5 | 6 | Licensed under the Apache License, Version 2.0 (the "License"); 7 | you may not use this file except in compliance with the License. 8 | You may obtain a copy of the License at 9 | 10 | http://www.apache.org/licenses/LICENSE-2.0 11 | 12 | Unless required by applicable law or agreed to in writing, software 13 | distributed under the License is distributed on an "AS IS" BASIS, 14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | See the License for the specific language governing permissions and 16 | limitations under the License. 17 | """ 18 | 19 | import argparse 20 | import ast 21 | import logging 22 | import threading 23 | import difflib 24 | import re 25 | import sys 26 | import webbrowser 27 | from abc import ABCMeta, abstractmethod 28 | 29 | if sys.version_info[0:2] == (2, 6): 30 | # noinspection PyUnresolvedReferences 31 | from backport_collections import Counter 32 | else: 33 | from collections import Counter 34 | 35 | ABC = ABCMeta('ABC', (object,), {}) # compatible with Python 2 *and* 3 36 | 37 | 38 | class _Table(ABC): 39 | """Represent an abstract table that contains database connection and the related SQL executions 40 | 41 | :type database: str 42 | :param database: Name of the database 43 | 44 | :type table: str 45 | :param table: Name of the table 46 | 47 | :type tc: :class:`TableComparator` 48 | :param tc: reference to the parent object, in order to access the configuration 49 | 50 | :type column_range: str 51 | :param column_range: Python array range that represents the range of columns in the DDL that we want to compare 52 | 53 | :type chosen_columns: str 54 | :param chosen_columns: list of the (str) columns we want to focus on 55 | 56 | :type ignore_columns: str 57 | :param ignore_columns: list of the (str) columns we want to ignore 58 | 59 | :type decodeCP1252_columns: str 60 | :param decodeCP1252_columns: list of the (str) columns that are encoded in CP1252 and we want to transform into 61 | Google's UTF8 encoding 62 | 63 | :type where_condition: str 64 | :param where_condition: boolean SQL condition that will be added to the WHERE condition of all the queries for that 65 | table, so that we can restrict the analysis to a specific scope. Also quite useful when 66 | we want to work on specific partitions. 67 | 68 | :type full_name: str 69 | :param full_name: the full name of the table: . 70 | """ 71 | 72 | __metaclass__ = ABCMeta 73 | 74 | def __init__(self, database, table, parent, *args, **kwargs): 75 | self.database = database 76 | self.table = table 77 | self.tc = parent 78 | self.column_range = ":" 79 | self.chosen_columns = None 80 | self.ignore_columns = None 81 | self.decodeCP1252_columns = [] 82 | self.where_condition = None 83 | self.full_name = self.database + '.' 
+ self.table 84 | self._ddl_columns = [] # array instead of dictionary because we want to maintain the order of the columns 85 | self._ddl_partitions = [] # take care, those rows also appear in the columns array 86 | self._group_by_column = None # the column that is used to "bucket" the rows 87 | 88 | @staticmethod 89 | def check_stdin_options(typedb, stdin_options, allowed_options, compulsory_options): 90 | """Validate the options entered, given those allowed and those compulsory 91 | 92 | :type typedb: str 93 | :param typedb: the type of the database ({hive,bq}) 94 | 95 | :type stdin_options: str 96 | :param stdin_options: the options entered on command line for this table, given in a Python dictionary format 97 | 98 | :type allowed_options: list of str 99 | :param allowed_options: the list of options that can be used for this table 100 | 101 | :type compulsory_options: dict 102 | :param compulsory_options: dictionary where the keys are the name of the compulsory options, and the value 103 | is a small description of it 104 | 105 | :rtype: dict 106 | :return: a dictionary of the options entered by the user 107 | 108 | :raises: ValueError if the option entered is invalid 109 | """ 110 | hash_options = {} 111 | if stdin_options is not None: 112 | try: 113 | hash_options = ast.literal_eval(stdin_options) 114 | except: 115 | raise ValueError("The option must be in a Python dictionary format (just like: {'jar': " 116 | "'hdfs://hdp/user/sluangsay/lib/hcbq.jar', 'hs2': 'master-003.bol.net'}\nThe value " 117 | "received was: %s" % stdin_options) 118 | 119 | for key in hash_options: 120 | if key not in allowed_options: 121 | raise ValueError("The following option for %s is not supported: %s\nThe only supported options" 122 | " are: %s" % (typedb, key, allowed_options)) 123 | 124 | # Needs to be checked even if the user has given no option 125 | for key in compulsory_options: 126 | if key not in hash_options: 127 | raise ValueError("%s option (%s) must be defined for %s tables" % (key, compulsory_options[key], 128 | typedb)) 129 | 130 | return hash_options 131 | 132 | @staticmethod 133 | def create_table_from_string(argument, options, table_comparator): 134 | """Parse an argument (usually received on the command line) and return the corresponding Table object 135 | 136 | :type argument: str 137 | :param argument: description of the table to connect to. Must have the format /.
138 | type can be {hive,bq} 139 | 140 | :type options: str 141 | :param options: the dictionary of all the options for this table connection. Could be for instance: 142 | "{'jar': 'hdfs://hdp/user/sluangsay/lib/hcbq.jar', 'hs2': 'master-003.bol.net'}" 143 | 144 | :type table_comparator: :class:`TableComparator` 145 | :param table_comparator: the TableComparator parent object, to have a reference to the configuration properties 146 | 147 | :rtype: :class:`_Table` 148 | :returns: the _Table object that corresponds to the argument 149 | 150 | :raises: ValueError if the argument is void or does not match the format 151 | """ 152 | match = re.match(r'(\w+)/(\w+)\.(\w+)', argument) 153 | if match is None: 154 | raise ValueError("Table description must follow the following format: '/.
'") 155 | 156 | typedb = match.group(1) 157 | database = match.group(2) 158 | table = match.group(3) 159 | 160 | if typedb == "bq": 161 | hash_options = _Table.check_stdin_options(typedb, options, ["project"], {}) 162 | from bq import TBigQuery 163 | return TBigQuery(database, table, table_comparator, hash_options.get('project')) 164 | elif typedb == "hive": 165 | hash_options = _Table.check_stdin_options(typedb, options, ["jar", "hs2"], {'hs2': 'Hive Server2 hostname'}) 166 | from hive import THive 167 | return THive(database, table, table_comparator, hash_options['hs2'], hash_options.get('jar')) 168 | else: 169 | raise ValueError("The database type %s is currently not supported" % typedb) 170 | 171 | @abstractmethod 172 | def get_type(self): 173 | """Return the (string) type of the database (Hive, BigQuery)""" 174 | pass 175 | 176 | def get_id_string(self): 177 | """Return a string that fully identifies the table""" 178 | return self.get_type() + "_" + self.full_name 179 | 180 | def set_where_condition(self, where): 181 | """the WHERE condition we want to apply for the table. Could be useful in case of partitioned tables 182 | 183 | :type where: str 184 | :param where: the WHERE condition. Example: datedir="2017-05-01" 185 | """ 186 | self.where_condition = where 187 | 188 | def set_column_range(self, column_range): 189 | self.column_range = column_range 190 | 191 | def set_chosen_columns(self, cols): 192 | self.chosen_columns = cols.split(",") 193 | 194 | def set_ignore_columns(self, cols): 195 | self.ignore_columns = cols.split(",") 196 | 197 | def set_decode_cp1252_columns(self, cols): 198 | self.decodeCP1252_columns = cols.split(",") 199 | 200 | def set_group_by_column(self, col): 201 | self._group_by_column = col 202 | 203 | @abstractmethod 204 | def get_ddl_columns(self): 205 | """ Return the columns of this table 206 | 207 | The list of the column is an attribute of the class. If it already exists, then it is directly returned. 208 | Otherwise, a connection is made to the database to get the schema of the table and at the same time the 209 | attribute (list) partition is filled. 210 | 211 | :rtype: list of dict 212 | :returns: list of {"name": "type"} dictionaries that represent the columns of this table 213 | """ 214 | pass 215 | 216 | def get_groupby_column(self): 217 | """Return a column that seems to have a good distribution in order to do interesting GROUP BY queries with it 218 | 219 | This _group_by_column is an attribute of the class. If it already exists, then it is directly returned. 220 | Otherwise, a small query to get some sample rows on some few columns is performed, in order to evaluate 221 | which of those columns present the best distribution (we want a column that will have as many Group By values 222 | as possible, and that avoid a bit the skew, so that when we detect a specific difference on a bucket, we will 223 | be able to show a number of lines for this bucket not too big). The found column is then saved as the attribute 224 | and returned. 
225 | 226 | :rtype: str 227 | :returns: the column that will be used in the Group By 228 | """ 229 | if self._group_by_column is not None: 230 | return self._group_by_column 231 | 232 | query, selected_columns = self.get_sample_query() 233 | 234 | # Get a sample from the table and fill Counters to each column 235 | logging.info("Analyzing the columns %s with a sample of %i values", str([x["name"] for x in selected_columns]), 236 | self.tc.sample_rows_number) 237 | for col in selected_columns: 238 | col["Counter"] = Counter() # col: {"name","type"} dictionary. New "counter" key is to track distribution 239 | 240 | self.get_column_statistics(query, selected_columns) 241 | self.find_best_distributed_column(selected_columns) 242 | 243 | return self._group_by_column 244 | 245 | @abstractmethod 246 | def get_column_statistics(self, query, selected_columns): 247 | """Launch the sample query and register the distribution of the selected_columns in the "Counters" 248 | 249 | :type query: str 250 | :param query: SQL query that gets a sample of rows with only the selected columns 251 | 252 | :type selected_columns: list of dict 253 | :param selected_columns: list of the few columns selected in the sample query. It has the format: 254 | {"name": col_name, "type": col_type, "Counter": frequency_of_values} 255 | """ 256 | pass 257 | 258 | def find_best_distributed_column(self, selected_columns): 259 | """Look at the statistics from the sample to estimate which column has the best distribution to do a GROUP BY 260 | 261 | The best column is automatically saved in the attribute _group_by_column. If all the selected columns have a 262 | poor distribution, then we exit with error. 263 | We say that we have a poor distribution if the most frequent value of a column is superior than 264 | max_frequent_number. 265 | Then, the best column is the one that has not a poor distribution, and whose sum of occurrences of the 50 most 266 | frequent values is minimal. 267 | 268 | :type selected_columns: list of dict 269 | :param selected_columns: list of the few columns selected in the sample query. 
It has the format: 270 | {"name": col_name, "type": col_type, "Counter": frequency_of_values} 271 | """ 272 | max_frequent_number = self.tc.sample_rows_number * self.tc.max_percent_most_frequent_value_in_column // 100 273 | current_lowest_weight = sys.maxint 274 | highest_first = max_frequent_number 275 | 276 | for col in selected_columns: 277 | highest = col["Counter"].most_common(1)[0] 278 | value_most_popular = highest[0] 279 | frequency_most_popular_value = highest[1] 280 | if frequency_most_popular_value > max_frequent_number: 281 | logging.debug( 282 | "Discarding column '%s' because '%s' was found in sample %i times (higher than limit of %i)", 283 | col["name"], value_most_popular, frequency_most_popular_value, max_frequent_number) 284 | continue 285 | # The biggest value is not too high, so let's see how big are the 50 biggest values 286 | weight_of_most_frequent_values = sum([x[1] for x in col["Counter"] 287 | .most_common(self.tc.number_of_most_frequent_values_to_weight)]) 288 | logging.debug("%s: sum up of the %i most frequent occurrence: %i", col["name"], 289 | self.tc.number_of_most_frequent_values_to_weight, weight_of_most_frequent_values) 290 | if weight_of_most_frequent_values < current_lowest_weight: 291 | self._group_by_column = col["name"] # we save here this potential "best group-by" column 292 | current_lowest_weight = weight_of_most_frequent_values 293 | highest_first = frequency_most_popular_value 294 | 295 | if self._group_by_column is None: 296 | sys.exit("Error: we could not find a suitable column to do a Group By. Either relax the selection condition" 297 | " with the '--max-gb-percent' option or directly select the column with '--group-by-column' ") 298 | 299 | logging.info("Best column to do a GROUP BY is %s (occurrences of most frequent value: %i / the %i most frequent" 300 | "values sum up %i occurrences)", self._group_by_column, highest_first, 301 | self.tc.number_of_most_frequent_values_to_weight, current_lowest_weight) 302 | 303 | def filter_columns_from_cli(self, all_columns): 304 | """Filter the columns received from the table schema with the options given by the user, and save this result 305 | 306 | :type all_columns: list of dict 307 | :param all_columns: each column in the table is represented by an element in the list, with the following 308 | format: {"name": col_name, "type": col_type} 309 | """ 310 | if self.column_range == ":": 311 | if self.chosen_columns is not None: # user has declared the columns he wants to analyze 312 | leftover = list(self.chosen_columns) 313 | for col in all_columns: 314 | if col['name'] in leftover: 315 | leftover.remove(col['name']) 316 | self._ddl_columns.append(col) 317 | if len(leftover) > 0: 318 | sys.exit("Error: you asked to analyze the columns %s but we could not find them in the table %s" 319 | % (str(leftover), self.get_id_string())) 320 | else: 321 | self._ddl_columns = all_columns 322 | else: # user has requested a specific range of columns 323 | match = re.match(r'(\d*):(\d*)', self.column_range) 324 | if match is None: 325 | raise ValueError("The column range must follow the Python style '1:9'. 
You gave: %s", self.column_range) 326 | 327 | start = 0 328 | if len(match.group(1)) > 0: 329 | start = int(match.group(1)) 330 | 331 | end = len(all_columns) 332 | if len(match.group(2)) > 0: 333 | end = int(match.group(2)) 334 | 335 | self._ddl_columns = all_columns[start:end] 336 | logging.debug("The range of columns has been reduced to: %s", self._ddl_columns) 337 | 338 | # Even if the user requested just a specific range of columns, he still has the possibility to remove some 339 | # columns in that range: 340 | if self.ignore_columns is not None: 341 | all_columns = [] 342 | for col in self._ddl_columns: 343 | if not col['name'] in self.ignore_columns: 344 | all_columns.append(col) 345 | self._ddl_columns = list(all_columns) 346 | 347 | @abstractmethod 348 | def create_sql_groupby_count(self): 349 | """ Return a SQL query where we count the number of rows for each Group of hash() on the groupby_column 350 | 351 | The column found in get_groupby_column() is used to do a Group By and count the number of rows for each group. 352 | But in order to reduce the number of Groups, we hash the value of this column, and we take the modulo 353 | "number_of_group_by" of this hash value. 354 | 355 | :rtype: str 356 | :returns: SQL query to do the Group By Count 357 | """ 358 | pass 359 | 360 | @abstractmethod 361 | def create_sql_show_bucket_columns(self, extra_columns_str, buckets_values): 362 | """ Return a SQL query that shows the rows that match some specific buckets and some given columns 363 | 364 | :type extra_columns_str: str 365 | :param extra_columns_str: the list of extra columns (separated by ",") we want to show to help debugging 366 | 367 | :type buckets_values: str 368 | :param buckets_values: the list of values (separated by ",") of the buckets we want to fetch 369 | 370 | :rtype: str 371 | :returns: SQL query to do the Group By Buckets 372 | """ 373 | pass 374 | 375 | @abstractmethod 376 | def create_sql_intermediate_checksums(self): 377 | """Build and return the query that generates all the checksums to make the final comparison 378 | 379 | The query will have the following schema: 380 | 381 | WITH blocks AS ( 382 | SELECT MOD( hash2( column), 100000) as gb, sha1(concat( col0, col1, col2, col3, col4)) as block_0, 383 | sha1(concat( col5, col6, col7, col8, col9)) as block_1, ... as block_N FROM table 384 | ), 385 | full_lines AS ( 386 | SELECT gb, sha1(concat( block_0, |, block_1...) as row_sha, block_0, block_1 ... FROM blocks 387 | ) 388 | SELECT gb, sha1(concat(list)) as sline, sha1(concat(list)) as sblock_1, 389 | sha1(concat(list)) as sblock_2 ... as sblock_N FROM GROUP BY gb 390 | 391 | :rtype: str 392 | :returns: the SQL query with the Group By and the shas 393 | """ 394 | pass 395 | 396 | @abstractmethod 397 | def delete_temporary_table(self, table_name): 398 | """Drop the temporary table if needed (if it is not automatically deleted by the system) 399 | 400 | :type table_name: str 401 | :param table_name: name of the table to delete 402 | """ 403 | pass 404 | 405 | @abstractmethod 406 | def launch_query_dict_result(self, query, result_dic, all_columns_from_2=False): 407 | """Launch the SQL query and stores the results of the 1st and 2nd columns in the dictionary 408 | 409 | The 1st column of each row is stored as the key of the dictionary, the 2nd column is for the value. This method 410 | is meant to catch the result of a Group By query, so that we are sure that the keys fetched are unique. 
411 | 412 | :type query: str 413 | :param query: query to execute 414 | 415 | :type result_dic: dict 416 | :param result_dic: dictionary to store the result 417 | 418 | :type all_columns_from_2: bool 419 | :param all_columns_from_2: True if we want to fetch all the columns of the row, starting from the 2nd one. False 420 | if we want the value of the dictionary to only have the 1st column (take care: 421 | columns start counting with 0). 422 | (default: False) 423 | """ 424 | pass 425 | 426 | @abstractmethod 427 | def launch_query_csv_compare_result(self, query, rows): 428 | """Launch the SQL query and stores the rows in an array with some kind of CSV formatting 429 | 430 | The only reason for the "CSV formatting" (separation of columns with "|") is to help in comparing the rows with 431 | the difflib library. This is also the reason why all the lines start with "^" and end with "$" 432 | 433 | :type query: str 434 | :param query: query to execute 435 | 436 | :type rows: list of str 437 | :param rows: the (void) array that will store the rows 438 | """ 439 | pass 440 | 441 | @abstractmethod 442 | def launch_query_with_intermediate_table(self, query, result): 443 | """Launch the query, stores the results in a temporary table and put the first 2 columns in a dictionary 444 | 445 | This method is used to computes a lot of checksums and thus is a bit heavy to compute. This is why we store 446 | all those detailed results in a temporary table. Then present a summary of the returned rows by storing the 447 | first 2 columns in a dictionary under the ``result`` dictionary. 448 | 449 | :type query: str 450 | :param query: query to execute 451 | 452 | :type result: dict 453 | :param result: dictionary to store the result 454 | """ 455 | 456 | def get_sample_query(self): 457 | """ Build a SQL query to get some sample lines with limited amount of columns 458 | 459 | We limit the number of columns to a small number (ex: 10) because it is usually unnecessary to look at all 460 | the columns in order to find one with a good distribution (usually, the "index" will be in the first columns). 461 | What is more, in BigQuery we are billed by the number of columns we read so we need to avoid reading 462 | hundreds of columns just like what we have for big tables. 
463 | 464 | :rtype: tuple 465 | :returns: ``(query, selected_columns)``, where ``query`` is the sample SQL query; ``selected_columns`` is the 466 | list of columns that are fetched 467 | """ 468 | query = "SELECT" 469 | selected_columns = self.get_ddl_columns()[:self.tc.sample_column_number] 470 | for col in selected_columns: 471 | query += " %s," % col["name"] # for the last column we'll remove that trailing "," 472 | where_condition = "" 473 | if self.where_condition is not None: 474 | where_condition = "WHERE " + self.where_condition 475 | query = query[:-1] + " FROM %s %s LIMIT %i" % (self.full_name, where_condition, self.tc.sample_rows_number) 476 | return query, selected_columns 477 | 478 | def get_column_blocks(self, ddl): 479 | """Returns the list of a column blocks for a specific DDL (see function create_sql_intermediate_checksums) 480 | 481 | :type ddl: list of dict 482 | :param ddl: the ddl of the tables, containing dictionaries with keys (name, type) to describe each column 483 | 484 | :rtype: list of list 485 | :returns: list of each block, each one containing the (5) columns dictionary ({name, type}) that describe it 486 | """ 487 | column_blocks = [] 488 | for idx, col in enumerate(ddl): 489 | block_id = idx // self.tc.block_size 490 | if idx % self.tc.block_size == 0: 491 | column_blocks.append([]) 492 | column_blocks[block_id].append({"name": col["name"], "type": col["type"]}) 493 | return column_blocks 494 | 495 | 496 | class TableComparator(object): 497 | """Represent the general configuration of the program (tables names, number of rows to scan...) """ 498 | 499 | def __init__(self): 500 | self.tsrc = None 501 | self.tdst = None 502 | 503 | # 10 000 rows should be good enough to use as a sample 504 | # (Google uses some sample of 1000 or 3000 rows) 505 | # Estimation of memory for the Counters: 10000 * 10 * ( 10 * 4 + 4) * 4 = 16.8 MB 506 | # (based on estimation : rows * columns * ( size string + int) * overhead Counter ) 507 | self.sample_rows_number = 10000 508 | self.sample_column_number = 10 509 | self.max_percent_most_frequent_value_in_column = None 510 | self.number_of_most_frequent_values_to_weight = 50 511 | 512 | self.number_of_group_by = 100000 # 7999 is the limit if you want to manually download the data from BQ. This 513 | # limit does not apply in this script because we fetch the data with the Python API instead. 514 | # TODO ideally this limit would be dynamic (counting the total rows), in order to get an average of 7 515 | # lines per bucket 516 | self.skew_threshold = 40000 # if we detect that a Group By has more than this amount of rows 517 | # (compare_groupby_count() method), then we raise an exception, because the computation of the shas (which is 518 | # computationally expensive) might suffer a lot from this skew, and might also trigger some OOM exception. 519 | # 40 000 seems a safe number: 520 | # Let's consider a table with many column: 1000 columns. That means 201 sha blocks (1000 / 5 + 1) 521 | # a sha is 29 characters 522 | # so the max memory with a skew of 40 000 would be: 523 | # 201 * 40000 * 29 / 1024 /1024 = 222 MB, which should fit into the Heap of a task process 524 | self.block_size = 5 # 5 columns means that when we want to debug we have enough context. 
But it small enough to 525 | # avoid being charged too much by Google when querying on it 526 | reload(sys) 527 | # below method really exists (don't know why PyCharm cannot see it) and is really needed 528 | # noinspection PyUnresolvedReferences 529 | sys.setdefaultencoding('utf-8') 530 | 531 | def set_tsrc(self, table): 532 | """Set the source table to be compared 533 | 534 | :type table: :class:`_Table` 535 | :param table: the _Table object 536 | """ 537 | self.tsrc = table 538 | 539 | def set_tdst(self, table): 540 | """Set the destination table to be compared 541 | 542 | :type table: :class:`_Table` 543 | :param table: the _Table object 544 | """ 545 | self.tdst = table 546 | 547 | def set_skew_threshold(self, threshold): 548 | """Set the threshold value for the skew 549 | 550 | :type threshold: int 551 | :param threshold: the threshold value (default: 40 000) 552 | """ 553 | self.skew_threshold = threshold 554 | 555 | def set_max_percent_most_frequent_value_in_column(self, percent): 556 | """Set the max_percent_most_frequent_value_in_column value 557 | 558 | If in one sample a column has a value whose frequency is higher than this percentage, then this column is not 559 | considered as a suitable column to do the Group By 560 | 561 | :type percent: float 562 | :param percent: the percentage value 563 | """ 564 | self.max_percent_most_frequent_value_in_column = percent 565 | 566 | def compare_groupby_count(self): 567 | """Runs a light query on Hive and BigQuery to check if the counts match, using the ideal column estimated before 568 | 569 | Some skew detection is also performed here. And the program stops if we detect some important skew and no 570 | difference was detected (meaning that the sha computation would have been performed with that skew). 
571 | 572 | :rtype: tuple 573 | :returns: ``(summary_differences, big_small_bucket)``, where ``summary_differences`` is a list of tuples, 574 | one per difference containing (groupByValue, number of differences for this bucket, count of rows 575 | for this bucket for the "biggest table"); ``big_small_bucket`` is a tuple containing the table that 576 | has the biggest distribution (according to the Group By column) and then the other table 577 | """ 578 | logging.info("Executing the 'Group By' Count queries for %s (%s) and %s (%s) to do first comparison", 579 | self.tsrc.full_name, self.tsrc.get_type(), self.tdst.full_name, self.tdst.get_type()) 580 | src_query = self.tsrc.create_sql_groupby_count() 581 | dst_query = self.tdst.create_sql_groupby_count() 582 | 583 | result = {"src_count_dict": {}, "dst_count_dict": {}} 584 | t_src = threading.Thread(name='srcGroupBy-' + self.tsrc.get_type(), target=self.tsrc.launch_query_dict_result, 585 | args=(src_query, result["src_count_dict"])) 586 | t_dst = threading.Thread(name='dstGroupBy-' + self.tdst.get_type(), target=self.tdst.launch_query_dict_result, 587 | args=(dst_query, result["dst_count_dict"])) 588 | t_src.start() 589 | t_dst.start() 590 | t_src.join() 591 | t_dst.join() 592 | 593 | for k in result: 594 | if 'error' in result[k]: 595 | sys.exit(result[k]["error"]) 596 | 597 | # #### Let's compare the count between the 2 Group By queries 598 | # iterate on biggest dictionary so that we're sure to se a difference if there is one 599 | logging.debug("Searching differences in Group By") 600 | if len(result["src_count_dict"]) > len(result["dst_count_dict"]): 601 | big_dict = result["src_count_dict"] 602 | small_dict = result["dst_count_dict"] 603 | big_small_bucket = (self.tsrc, self.tdst) 604 | else: 605 | big_dict = result["dst_count_dict"] 606 | small_dict = result["src_count_dict"] 607 | big_small_bucket = (self.tdst, self.tsrc) 608 | 609 | differences = Counter() 610 | skew = Counter() 611 | for (k, v) in big_dict.iteritems(): 612 | if k not in small_dict: 613 | differences[k] = -v # we want to see the differences where we have less lines to compare 614 | elif v != small_dict[k]: 615 | differences[k] = - abs(v - small_dict[k]) 616 | # we check the skew even if some differences were found above and we will never enter the sha computation, 617 | # so that the developer can fix at early stage 618 | max_value = max(v, small_dict.get(k)) 619 | if max_value > self.skew_threshold: 620 | skew[k] = max_value 621 | summary_differences = [(k, -v, big_dict[k]) for (k, v) in differences.most_common()] 622 | if len(skew) > 0: 623 | logging.warning("Some important skew (threshold: %i) was detected in the Group By column %s. The top values" 624 | " are: %s", self.skew_threshold, self.tsrc.get_groupby_column(), str(skew.most_common(10))) 625 | if len(summary_differences) == 0: 626 | sys.exit("No difference in Group By count was detected but we saw some important skew that could make " 627 | "the next step (comparison of the shas) very slow or failing. So better stopping now. 
You " 628 | "should consider choosing another Group By column with the '--group-by-column' option") 629 | 630 | if len(summary_differences) != 0: 631 | logging.info("We found at least %i differences in Group By count", len(summary_differences)) 632 | logging.debug("Differences in Group By count are: %s", summary_differences[:300]) 633 | 634 | return summary_differences, big_small_bucket 635 | 636 | def show_results_count(self, summary_differences, big_small_bucket): 637 | """If any differences found in the Count Group By step, then show them in a webpage 638 | 639 | :type summary_differences: list of tuple 640 | :param summary_differences: list of all the differences where each difference is described as: (groupByValue, 641 | number of differences for this bucket, count of rows for this bucket for the "biggest table") 642 | 643 | :type big_small_bucket: tuple 644 | :param big_small_bucket: tuple containing the table that has the biggest distribution (according to the Group By 645 | column) and then the other table 646 | """ 647 | # TODO break/refactor a bit this function in the same way it has been done for the sha part 648 | 649 | # We want to return at most 6 blocks of lines corresponding to different group by values. For the sake of 650 | # brevity, each block should not show more than 70 lines. Blocks that show rows that appear in only on 1 table 651 | # should be limited to 3 (so that we can see "context" when debugging). To also give context, we will show some 652 | # few other columns. 653 | number_buckets_only_one_table = 0 654 | number_buckets_found = 0 655 | buckets_bigtable = [] 656 | buckets_smalltable = [] 657 | bigtable = big_small_bucket[0] 658 | smalltable = big_small_bucket[1] 659 | for (bucket, difference_num, biggest_num) in summary_differences: 660 | if difference_num > 70: 661 | break # since the Counter was ordered from small differences to biggest, we know that this difference 662 | # number can only increase. 
So let's go out of the loop 663 | if biggest_num > 70: 664 | continue 665 | if difference_num == biggest_num: 666 | if number_buckets_only_one_table == 3: 667 | continue 668 | else: 669 | number_buckets_only_one_table += 1 670 | number_buckets_found += 1 671 | if number_buckets_found == 6: 672 | break 673 | buckets_bigtable.append(bucket) 674 | else: 675 | buckets_bigtable.append(bucket) 676 | buckets_smalltable.append(bucket) 677 | number_buckets_found += 1 678 | if number_buckets_found == 6: 679 | break 680 | if len(buckets_bigtable) == 0: # let's ensure that we have at least 1 value to show 681 | (bucket, difference_num, biggest_num) = summary_differences[0] 682 | buckets_bigtable.append(bucket) 683 | if difference_num != biggest_num: 684 | buckets_smalltable.append(bucket) 685 | logging.debug("Buckets for %s: %s \t\tBuckets for %s: %s", bigtable.full_name, str(buckets_bigtable), 686 | smalltable.full_name, str(buckets_smalltable)) 687 | 688 | gb_column = self.tsrc.get_groupby_column() 689 | extra_columns = [x["name"] for x in self.tsrc.get_ddl_columns()[:6]] # add 5 extra columns to see some context 690 | if gb_column in extra_columns: 691 | extra_columns.remove(gb_column) 692 | elif len(extra_columns) == 6: 693 | extra_columns = extra_columns[:-1] # limit to 5 columns 694 | extra_columns_str = str(extra_columns)[1:-1].replace("'", "") 695 | bigtable_query = bigtable.create_sql_show_bucket_columns(extra_columns_str, str(buckets_bigtable)[1:-1]) 696 | 697 | result = {"big_rows": [], "small_rows": []} 698 | t_big = threading.Thread(name='bigShowCountDifferences-' + bigtable.get_type(), 699 | target=bigtable.launch_query_csv_compare_result, 700 | args=(bigtable_query, result["big_rows"])) 701 | t_big.start() 702 | 703 | if len(buckets_smalltable) > 0: # in case 0, then it means that the "smalltable" does not contain any of 704 | # the rows that appear in the "bigtable". In such case, there is no need to launch the query 705 | smalltable_query = smalltable.create_sql_show_bucket_columns(extra_columns_str, 706 | str(buckets_smalltable)[1:-1]) 707 | t_small = threading.Thread(name='smallShowCountDifferences-' + smalltable.get_type(), 708 | target=smalltable.launch_query_csv_compare_result, 709 | args=(smalltable_query, result["small_rows"])) 710 | t_small.start() 711 | t_small.join() 712 | t_big.join() 713 | 714 | sorted_file = {} 715 | for instance in ("big_rows", "small_rows"): 716 | result[instance].sort() 717 | sorted_file[instance] = "/tmp/count_diff_" + instance 718 | with open(sorted_file[instance], "w") as f: 719 | f.write("\n".join(result[instance])) 720 | 721 | column_description = "
hash(%s) , %s , %s" \ 722 | % (bigtable.get_groupby_column(), bigtable.get_groupby_column(), extra_columns_str) 723 | diff_string = difflib.HtmlDiff().make_file(result["big_rows"], result["small_rows"], bigtable.get_id_string() + 724 | column_description, smalltable.get_id_string() + column_description, 725 | context=False, numlines=30) 726 | html_file = "/tmp/count_diff.html" 727 | with open(html_file, "w") as f: 728 | f.write(diff_string) 729 | logging.debug("Sorted results of the queries are in the files %s and %s. HTML differences are in %s", 730 | sorted_file["big_rows"], sorted_file["small_rows"], html_file) 731 | webbrowser.open("file://" + html_file, new=2) 732 | 733 | def compare_shas(self): 734 | """Runs the final queries on Hive and BigQuery to check if the checksum match and return the list of differences 735 | 736 | :rtype: tuple 737 | :returns: ``(list_differences, names_sha_tables, tables_to_clean)``, where ``list_differences`` is the list of 738 | Group By values which presents different row checksums; ``names_sha_tables`` is a dictionary that 739 | contains the names of the "temporary" result tables; ``tables_to_clean`` is a dictionary of the 740 | temporary tables we will want to remove at the end of the process 741 | """ 742 | logging.info("Executing the 'shas' queries for %s and %s to do final comparison", 743 | self.tsrc.get_id_string(), self.tdst.get_id_string()) 744 | 745 | tsrc_query = self.tsrc.create_sql_intermediate_checksums() 746 | tdst_query = self.tdst.create_sql_intermediate_checksums() 747 | 748 | # "cleaning" is for all the tables that will need to be eventually deleted. It must contain tuples (, corresponding _Table object). "names_sha_tables" contains all the temporary tables generated 750 | # even the BigQuery cached table that does not need to be deleted. "sha_dictionaries" contains the results. 751 | result = {"cleaning": [], "names_sha_tables": {}, "sha_dictionaries": { 752 | self.tsrc.get_id_string(): {}, 753 | self.tdst.get_id_string(): {} 754 | }} 755 | t_src = threading.Thread(name='shaBy-' + self.tsrc.get_id_string(), 756 | target=self.tsrc.launch_query_with_intermediate_table, 757 | args=(tsrc_query, result)) 758 | t_dst = threading.Thread(name='shaBy-' + self.tdst.get_id_string(), 759 | target=self.tdst.launch_query_with_intermediate_table, 760 | args=(tdst_query, result)) 761 | t_src.start() 762 | t_dst.start() 763 | t_src.join() 764 | t_dst.join() 765 | 766 | if "error" in result: 767 | for table_name, table_object in result["cleaning"]: 768 | table_object.delete_temporary_table(table_name) 769 | sys.exit(result["error"]) 770 | 771 | # Comparing the results of those dictionaries 772 | logging.debug("Searching differences in Shas") 773 | src_num_gb = len(result["sha_dictionaries"][self.tsrc.get_id_string()]) 774 | dst_num_gb = len(result["sha_dictionaries"][self.tdst.get_id_string()]) 775 | if not src_num_gb == dst_num_gb: 776 | sys.exit("The number of Group By values is not the same when doing the final sha queries (%s: %i - " 777 | "%s: %i).\nMake sure to first execute the 'count' verification step!" 778 | % (self.tsrc.get_id_string(), src_num_gb, self.tdst.get_id_string(), dst_num_gb)) 779 | 780 | list_differences = [] 781 | for (k, v) in result["sha_dictionaries"][self.tdst.get_id_string()].iteritems(): 782 | if k not in result["sha_dictionaries"][self.tsrc.get_id_string()]: 783 | sys.exit("The Group By value %s appears in %s but not in %s.\nMake sure to first execute the " 784 | "'count' verification step!" 
% (k, self.tdst.get_id_string(), self.tsrc.get_id_string())) 785 | elif v != result["sha_dictionaries"][self.tsrc.get_id_string()][k]: 786 | list_differences.append(k) 787 | 788 | if len(list_differences) != 0: 789 | logging.info("We found %i differences in sha verification", len(list_differences)) 790 | logging.debug("Differences in sha are: %s", list_differences[:300]) 791 | 792 | return list_differences, result["names_sha_tables"], result["cleaning"] 793 | 794 | def get_column_blocks_most_differences(self, differences, temp_tables): 795 | """Return the information of which columns contain most differences 796 | 797 | From the compare_shas step, we know all the rowBuckets that present some differences. The goal of this 798 | function is to identify which columnsBuckets have those differences. 799 | 800 | :type differences: list of str 801 | :param differences: the list of Group By values which present different row checksums 802 | 803 | :type temp_tables: dict 804 | :param temp_tables: contains the names of the temporary tables ["src_table", "dst_table"] 805 | 806 | :rtype: tuple 807 | :returns: ``(column_blocks_most_differences, map_colblocks_bucketrows)``, where 808 | ``column_blocks_most_differences`` is the Counter of the column blocks with most differences; 809 | ``map_colblocks_bucketrows`` is a list that "maps" a column block with the list of the hash of the 810 | bucket rows that contain a difference 811 | 812 | :raises: IOError if the query has some execution errors 813 | """ 814 | subset_differences = str(differences[:10000])[1:-1] # let's choose quite a big number (instead of just looking 815 | # at some few (5 for instance) differences for 2 reasons: 1) by fetching more rows we will find estimate 816 | # better which column blocks fail often 2) we have less possibilities to face some 'permutations' problems 817 | logging.debug("The sha differences that we consider are: %s", str(subset_differences)) 818 | 819 | src_query = "SELECT * FROM %s WHERE gb IN (%s)" % (temp_tables[self.tsrc.get_id_string()], subset_differences) 820 | dst_query = "SELECT * FROM %s WHERE gb IN (%s)" % (temp_tables[self.tdst.get_id_string()], subset_differences) 821 | logging.debug("queries to find differences in bucket_blocks are: \n%s\n%s", src_query, dst_query) 822 | 823 | src_sha_lines = {} # key=gb, values=list of shas from the blocks (not the one of the whole line) 824 | dst_sha_lines = {} 825 | t_src = threading.Thread(name='srcFetchShaDifferences', target=self.tsrc.launch_query_dict_result, 826 | args=(src_query, src_sha_lines, True)) 827 | t_dst = threading.Thread(name='dstFetchShaDifferences', target=self.tdst.launch_query_dict_result, 828 | args=(dst_query, dst_sha_lines, True)) 829 | t_src.start() 830 | t_dst.start() 831 | t_src.join() 832 | t_dst.join() 833 | 834 | # We want to find the column blocks that present most of the differences, and the bucket_rows associated to it 835 | column_blocks_most_differences = Counter() 836 | column_blocks = self.tsrc.get_column_blocks(self.tsrc.get_ddl_columns()) 837 | # noinspection PyUnusedLocal 838 | map_colblocks_bucketrows = [[] for x in range(len(column_blocks))] 839 | for bucket_row, dst_blocks in dst_sha_lines.iteritems(): 840 | src_blocks = src_sha_lines[bucket_row] 841 | for idx, sha in enumerate(dst_blocks): 842 | if sha != src_blocks[idx]: 843 | column_blocks_most_differences[idx] += 1 844 | map_colblocks_bucketrows[idx].append(bucket_row) 845 | logging.debug("Block columns with most differences are: %s. 
Which correspond to those bucket rows: %s", 846 | column_blocks_most_differences, map_colblocks_bucketrows) 847 | 848 | # collisions could happen for instance with those 2 "rows" (1st column is the Group BY value, 2nd column is the 849 | # value of a column in the data, 3rd column is the value of another column which belongs to another 'block 850 | # column' than the 2nd one): 851 | # ## SRC table: 852 | # bucket1 1 A 853 | # bucket1 0 B 854 | # ## DST table: 855 | # bucket1 0 A 856 | # bucket1 1 B 857 | # In such case, the 'grouped sha of each column' will be always the same. But the sha-lines will be different 858 | # That leads to a bad situation where the program says that there are some differences but it is not possible 859 | # to show them on the the web page (which is quite confusing for the user running the program). 860 | if len(column_blocks_most_differences) == 0: 861 | raise RuntimeError("Program faced some collisions when trying to assess which blocks of columns were not" 862 | "correct. Please contact the developer to ask for a fix") 863 | 864 | return column_blocks_most_differences, map_colblocks_bucketrows 865 | 866 | def get_sql_final_differences(self, column_blocks_most_differences, map_colblocks_bucketrows, index): 867 | """Return the queries to get the real data for the differences found in the last compare_shas() step 868 | 869 | Get the definition of the [index]th column block with most differences, and generate the SQL queries that will 870 | be triggered to show to the developer those differences. 871 | 872 | :type column_blocks_most_differences: :class:`Counter` 873 | :param column_blocks_most_differences: the Counter of the column blocks with most differences. The keys of this 874 | Counter is the index of the column block 875 | 876 | :type map_colblocks_bucketrows: list of list 877 | :param map_colblocks_bucketrows: list that "maps" a column block (the element N in the list corresponds to the 878 | column block N) with the list of the hash of the bucket rows that contain a difference. 879 | 880 | :type index: int 881 | :param index: the position of the block column we want to get 882 | 883 | :rtype: tuple of str 884 | :returns: ``(hive_final_sql, bq_final_sql, list_column_to_check)``, the queries to be executed to do the final 885 | debugging, and the name of the columns that are fetched. 
886 | """ 887 | column_block_most_different = column_blocks_most_differences.most_common(index)[index - 1][0] 888 | column_blocks = self.tsrc.get_column_blocks(self.tsrc.get_ddl_columns()) 889 | # buckets otherwise we might want to take a second block 890 | list_column_to_check = " ,".join([x["name"] for x in column_blocks[column_block_most_different]]) 891 | # let's display just 10 buckets in error max 892 | list_hashs = " ,".join(map(str, map_colblocks_bucketrows[column_block_most_different][:10])) 893 | 894 | src_final_sql = self.tsrc.create_sql_show_bucket_columns(list_column_to_check, list_hashs) 895 | dst_final_sql = self.tdst.create_sql_show_bucket_columns(list_column_to_check, list_hashs) 896 | logging.debug("Final source query is: %s \nFinal dest query is: %s", src_final_sql, dst_final_sql) 897 | 898 | return src_final_sql, dst_final_sql, list_column_to_check 899 | 900 | @staticmethod 901 | def display_html_diff(result, file_name, col_description): 902 | """Show the difference of the analysis in a graphical webpage 903 | 904 | :type result: dict 905 | :param result: dictionary that contains the hashMaps of some of the rows with differences between the 2 tables 906 | 907 | :type file_name: str 908 | :param file_name: prefix of the path where will be written the temporary files 909 | 910 | :type col_description: str 911 | :param col_description: "," separated list of the 5 extra columns from the column block we show in the diff 912 | """ 913 | sorted_file = {} 914 | keys = result.keys() 915 | for instance in keys: 916 | result[instance].sort() 917 | sorted_file[instance] = file_name + "_" + instance 918 | with open(sorted_file[instance], "w") as f: 919 | f.write("\n".join(result[instance])) 920 | 921 | diff_html = difflib.HtmlDiff().make_file(result[keys[0]], result[keys[1]], keys[0] + col_description, keys[1] 922 | + col_description, context=True, numlines=15) 923 | html_file = file_name + ".html" 924 | with open(html_file, "w") as f: 925 | f.write(diff_html) 926 | logging.debug("Sorted results of the queries are in the files %s and %s. HTML differences are in %s", 927 | sorted_file[keys[0]], sorted_file[keys[1]], html_file) 928 | webbrowser.open("file://" + html_file, new=2) 929 | 930 | def show_results_final_differences(self, src_sql, dst_sql, list_extra_columns): 931 | """If any differences found in the shas analysis step, then show them in a webpage 932 | 933 | :type src_sql: str 934 | :param src_sql: the query of the source table to launch to see the rows that are different 935 | 936 | :type dst_sql: str 937 | :param dst_sql: the query of the destination table to launch to see the rows that are different 938 | 939 | :type list_extra_columns: str 940 | :param list_extra_columns: list (',' separated) of the extra columns that will be shown in the differences. 941 | This parameter is only used to show the description in the web page. 942 | """ 943 | src_id = self.tsrc.get_id_string() 944 | dst_id = self.tdst.get_id_string() 945 | result = {src_id: [], dst_id: []} 946 | t_src = threading.Thread(name='srcShowShaFinalDifferences', target=self.tsrc.launch_query_csv_compare_result, 947 | args=(src_sql, result[src_id])) 948 | t_dst = threading.Thread(name='dstShowShaFinalDifferences', target=self.tdst.launch_query_csv_compare_result, 949 | args=(dst_sql, result[dst_id])) 950 | t_src.start() 951 | t_dst.start() 952 | t_src.join() 953 | t_dst.join() 954 | 955 | col_description = "
956 |                           % (self.tsrc.get_groupby_column(), self.tsrc.get_groupby_column(), list_extra_columns)
957 | 
958 |         self.display_html_diff(result, "/tmp/sha_diff", col_description)
959 | 
960 |         return False  # no need to execute the script further since errors have already been spotted
961 | 
962 |     def synchronise_tables(self):
963 |         """Ensure that some specific properties between the 2 tables have the same values, like the Group By column"""
964 |         self.tdst._ddl_columns = self.tsrc.get_ddl_columns()
965 |         # a check DDL comparison
966 |         self.tdst._group_by_column = self.tsrc.get_groupby_column()  # the Group By must use the same column for both
967 |         # tables
968 | 
969 |     def perform_step_count(self):
970 |         """Execute the Count comparison of the 2 tables
971 | 
972 |         :rtype: bool
973 |         :returns: True if we haven't found differences yet and further analysis is needed
974 |         """
975 |         self.synchronise_tables()
976 |         diff, big_small = self.compare_groupby_count()
977 | 
978 |         if len(diff) == 0:
979 |             print("No differences were found when doing a Count on the tables %s and %s and grouping by the "
980 |                   "column %s" % (self.tsrc.full_name, self.tdst.full_name, self.tsrc.get_groupby_column()))
981 |             return True  # means that we should continue executing the script
982 | 
983 |         self.show_results_count(diff, big_small)
984 |         return False  # no need to execute the script further since errors have already been spotted
985 | 
986 |     @staticmethod
987 |     def clean_step_sha(tables_to_clean):
988 |         """Delete the temporary tables if needed
989 | 
990 |         :type tables_to_clean: dict
991 |         :param tables_to_clean: maps the names of the tables that need to be deleted to their table objects
992 |         """
993 |         for table_name, table_object in tables_to_clean.items():
994 |             table_object.delete_temporary_table(table_name)
995 | 
996 |     def perform_step_sha(self):
997 |         """Execute the Sha comparison of the 2 tables"""
998 |         self.synchronise_tables()
999 |         sha_differences, temporary_tables, tables_to_clean = self.compare_shas()
1000 |         if len(sha_differences) == 0:
1001 |             print("Sha queries were done and no differences were found: the tables %s and %s are equal!"
1002 |                   % (self.tsrc.get_id_string(), self.tdst.get_id_string()))
1003 |             TableComparator.clean_step_sha(tables_to_clean)
1004 |             sys.exit(0)
1005 | 
1006 |         cb_most_diff, map_cb_bucketrows = self.get_column_blocks_most_differences(sha_differences, temporary_tables)
1007 | 
1008 |         for idx_cb in range(1, len(cb_most_diff) + 1):
1009 |             if idx_cb > 1:
1010 |                 answer = raw_input('Do you want to see more differences? [Y/n]: ')
1011 |                 # Yes being the default, we only exit if the user explicitly enters 'n'
1012 |                 if answer == 'n':
1013 |                     break
1014 | 
1015 |             queries = self.get_sql_final_differences(cb_most_diff, map_cb_bucketrows, idx_cb)
1016 |             print("Showing differences for columns " + queries[2])
1017 |             self.show_results_final_differences(queries[0], queries[1], queries[2])
1018 | 
1019 |         TableComparator.clean_step_sha(tables_to_clean)
1020 |         sys.exit(1)
1021 | 
1022 | 
1023 | def parse_arguments():
1024 |     """Parse the arguments received on the command line and return the args element of argparse
1025 | 
1026 |     :rtype: namespace
1027 |     :returns: The object that contains all the configuration of the command line
1028 |     """
1029 |     parser = argparse.ArgumentParser(description="Compare table <source> with table <destination>",
1030 |                                      formatter_class=argparse.RawTextHelpFormatter)
1031 |     parser.add_argument("source", help="the original (correct version) table\n"
1032 |                                        "It must have the following format: <type>/<database>.<table>\n"
\n" 1033 | " can be: bq or hive\n ") 1034 | parser.add_argument("destination", help="the destination table that needs to be compared\n" 1035 | "Format follows the one for the source table") 1036 | 1037 | parser.add_argument("-s", "--source-options", help="options for the source table\nFor Hive that could be: {'jar': " 1038 | "'hdfs://hdp/user/sluangsay/lib/hcbq.jar', 'hs2': " 1039 | "'master-003.bol.net'}\nExample for BigQuery: {'project': " 1040 | "'myGoogleCloudProject'}") 1041 | parser.add_argument("-d", "--destination-options", help="options for the destination table") 1042 | 1043 | parser.add_argument("--source-where", help="the WHERE condition we want to apply for the source table\n" 1044 | "Could be useful in case of partitioned tables\n" 1045 | "Example: \"datedir='2017-05-01'\"") 1046 | parser.add_argument("--destination-where", help="the WHERE condition we want to apply for the destination table") 1047 | 1048 | parser.add_argument("--max-gb-percent", type=float, default=1.0, 1049 | help="if in one sample a column has a value whose frequency is higher than this percentage, " 1050 | "then this column is discarded (default: 1.0)") 1051 | 1052 | parser.add_argument("--skew-threshold", type=int, 1053 | help="increase the threshold (default: 40 000) if you have some skew but if the amount of" 1054 | "columns is reduced so that you feel confident that all the shas will fit into memory") 1055 | 1056 | group_columns = parser.add_mutually_exclusive_group() 1057 | group_columns.add_argument("--column-range", default=":", 1058 | help="Instead of checking all the columns, you can define a range of columns to check\n" 1059 | "This works as a Python array-range. Meaning that if you want to only analyze the " 1060 | "first 20 columns, you need to give:\n :20") 1061 | group_columns.add_argument("--columns", 1062 | help="Instead of checking all the columns, you can give here the list of the columns you" 1063 | " want to check. Example: 'column1,column14,column23'") 1064 | 1065 | parser.add_argument("--ignore-columns", 1066 | help="the columns in argument will be ignored from analysis. Example: 'column1,column14," 1067 | "column23'") # not in the group_columns, because we want to be able to ask for a range 1068 | # of columns but removing some few at the same time 1069 | 1070 | parser.add_argument("--decodeCP1252-columns", 1071 | help="try to transform the CP1252 encoding from the (string) columns in argument into " 1072 | "Google's UTF8 encoding (Experimental). Example: 'column1,column14,column23'") 1073 | 1074 | parser.add_argument("--group-by-column", 1075 | help="the column in argument is enforced to be the Group By column. 
1076 |                              "query does not manage to find a good Group By column and we need to avoid some skew")
1077 | 
1078 |     group_step = parser.add_mutually_exclusive_group()
1079 |     group_step.add_argument("--just-count", help="only perform the Count check", action="store_true")
1080 |     group_step.add_argument("--just-sha", help="only perform the final sha check", action="store_true")
1081 | 
1082 |     group_log = parser.add_mutually_exclusive_group()
1083 |     group_log.add_argument("-v", "--verbose", help="show debug information", action="store_true")
1084 |     group_log.add_argument("-q", "--quiet", help="only show important information", action="store_true")
1085 | 
1086 |     return parser.parse_args()
1087 | 
1088 | 
1089 | def create_table_from_args(definition, options, where, args, tc):
1090 |     """Create and return a _Table object based on some arguments that were received on the command line
1091 | 
1092 |     :type definition: str
1093 |     :param definition: basic definition of the table, in the format: <type>/<database>.<table>
1094 | 1095 | :type options: str 1096 | :param options: JSON representation of options for this table. Ex: {'project': 'myGoogleCloudProject'} 1097 | 1098 | :type where: str 1099 | :param where: SQL filter that will be put in the WHERE clause, to limit the scope of the table analysed. 1100 | 1101 | :type args: :class:`ArgumentParser` 1102 | :param args: object containing all the arguments from the command line 1103 | 1104 | :type tc: :class:`TableComparator` 1105 | :param tc: current TableComparator instance, so that the table has a pointer to general configuration 1106 | 1107 | :rtype: :class:`_Table` 1108 | :returns: The _Table object, fully configured 1109 | """ 1110 | table = _Table.create_table_from_string(definition, options, tc) 1111 | table.set_where_condition(where) 1112 | table.set_column_range(args.column_range) 1113 | if args.columns is not None: 1114 | table.set_chosen_columns(args.columns) 1115 | if args.ignore_columns is not None: 1116 | table.set_ignore_columns(args.ignore_columns) 1117 | if args.decodeCP1252_columns is not None: 1118 | table.set_decode_cp1252_columns(args.decodeCP1252_columns) 1119 | table.set_group_by_column(args.group_by_column) # if not defined, then it's None and we'll compute it later 1120 | 1121 | return table 1122 | 1123 | 1124 | def main(): 1125 | args = parse_arguments() 1126 | 1127 | level_logging = logging.INFO 1128 | if args.verbose: 1129 | level_logging = logging.DEBUG 1130 | elif args.quiet: 1131 | level_logging = logging.WARNING 1132 | logging.basicConfig(level=level_logging, format='[%(levelname)s]\t[%(asctime)s] (%(threadName)-10s) %(message)s', ) 1133 | 1134 | logging.debug("Starting comparison program with arguments: %s", args) 1135 | 1136 | # Create the TableComparator that contains the definition of the 2 tables we want to compare 1137 | tc = TableComparator() 1138 | tc.set_max_percent_most_frequent_value_in_column(args.max_gb_percent) 1139 | source_table = create_table_from_args(args.source, args.source_options, args.source_where, args, tc) 1140 | destination_table = create_table_from_args(args.destination, args.destination_options, args.destination_where, 1141 | args, tc) 1142 | if args.skew_threshold is not None: 1143 | tc.set_skew_threshold(args.skew_threshold) 1144 | tc.set_tsrc(source_table) 1145 | tc.set_tdst(destination_table) 1146 | 1147 | # Step: count 1148 | if not args.just_sha: 1149 | do_we_continue = tc.perform_step_count() 1150 | if not do_we_continue: 1151 | sys.exit(1) 1152 | 1153 | # Step: sha 1154 | if not args.just_count: 1155 | tc.perform_step_sha() 1156 | 1157 | if __name__ == "__main__": 1158 | main() 1159 | --------------------------------------------------------------------------------