├── hive_compared_bq
│   ├── __init__.py
│   ├── hive.py
│   ├── bq.py
│   └── hive_compared_bq.py
├── .gitignore
├── udf
│   ├── hcbq.jar
│   └── README
├── docs
│   └── images
│       ├── differences_sha.png
│       └── differences_count.png
├── src
│   └── main
│       └── java
│           └── org
│               └── apache
│                   └── hadoop
│                       └── hive
│                           └── ql
│                               └── udf
│                                   └── generic
│                                       └── GenericUDFDecodeCP1252.java
├── LICENSE
└── README.md
/hive_compared_bq/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | launch.sh
3 | *.swp
4 | *.pyc
5 |
--------------------------------------------------------------------------------
/udf/hcbq.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bolcom/hive_compared_bq/HEAD/udf/hcbq.jar
--------------------------------------------------------------------------------
/docs/images/differences_sha.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bolcom/hive_compared_bq/HEAD/docs/images/differences_sha.png
--------------------------------------------------------------------------------
/docs/images/differences_count.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bolcom/hive_compared_bq/HEAD/docs/images/differences_count.png
--------------------------------------------------------------------------------
/udf/README:
--------------------------------------------------------------------------------
1 |
2 | This jar contains 2 UDFs:
3 | - UDFSha1, which is the unmodified sha1 UDF that appeared with Hive 1.3.0
4 | - GenericUDFDecodeCP1252, whose source code is in src/main/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFDecodeCP1252.java
5 |
6 |
--------------------------------------------------------------------------------
/src/main/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFDecodeCP1252.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | * Copyright 2017 bol.com. All Rights Reserved
4 | *
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 |
19 | package org.apache.hadoop.hive.ql.udf.generic;
20 |
21 | import org.apache.hadoop.hive.ql.exec.Description;
22 | import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
23 | import org.apache.hadoop.hive.ql.metadata.HiveException;
24 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
25 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.Converter;
26 | import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
27 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
28 | import org.apache.hadoop.io.Text;
29 |
30 | import java.nio.ByteBuffer;
31 | import java.nio.charset.Charset;
32 |
33 | import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils.PrimitiveGrouping.STRING_GROUP;
34 |
35 | /**
36 | * UDFDecodeCP1252.
37 | */
38 | @Description(
39 | name = "UDFDecodeCP1252",
40 |     value = "_FUNC_(str) - Returns str, the same string but re-encoded in UTF-8 (as BigQuery stores text) instead "
41 |         + "of the initial CP1252 encoding")
42 | public class GenericUDFDecodeCP1252 extends GenericUDF {
43 | private transient PrimitiveCategory[] inputTypes = new PrimitiveCategory[1];
44 | private transient Converter[] converters = new Converter[1];
45 | private final Text output = new Text();
46 | private final static Charset CP1252 = Charset.forName( "CP1252");
47 |
48 | @Override
49 | public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
50 | checkArgsSize(arguments, 1, 1);
51 |
52 | checkArgPrimitive(arguments, 0);
53 |
54 | checkArgGroups(arguments, 0, inputTypes, STRING_GROUP);
55 |
56 | obtainStringConverter(arguments, 0, inputTypes, converters);
57 |
58 | ObjectInspector outputOI = PrimitiveObjectInspectorFactory.writableStringObjectInspector;
59 | return outputOI;
60 | }
61 |
62 | @Override
63 | public Object evaluate(DeferredObject[] arguments) throws HiveException {
64 | final String stringToDecode = getStringValue(arguments, 0, converters);
65 | if (stringToDecode == null) {
66 | return null;
67 | }
68 |
69 | final char[] charactersToDecode = stringToDecode.toCharArray();
70 | final byte[] bytesArray = new byte[1];
71 | StringBuilder utf8_text = new StringBuilder(charactersToDecode.length);
72 | for (int i = 0; i < charactersToDecode.length; i++){
73 | char character = charactersToDecode[i];
74 |
75 | if( Character.isISOControl( character)){ // encoding problems happened only with control characters
76 | int asciiCode = (int)character;
77 | if( asciiCode >= 1 && asciiCode <= 9 ){
78 | utf8_text.append(" "); // those control characters are transformed to spaces by BigQuery
79 | }
80 | else {
81 | int codePoint = Character.codePointAt( charactersToDecode,i);
82 | // First of all, let's handle the control characters that are transformed to space by BigQuery
83 | // (about those characters see: http://www.fileformat.info/info/unicode/char/0081/index.htm,
84 | // and http://www.ltg.ed.ac.uk/~richard/utf-8.cgi?input=129&mode=decimal
85 | // TODO add missing characters (so far I have just seen those 3)
86 | if( codePoint == 129 || codePoint == 144 || codePoint == 157 ){
87 | utf8_text.append(" ");
88 | }
89 | else {
90 | // this part tries to solve the cases like:
91 | // C299 (hex) CP1252 character being transformed by BigQuery into E284A2 (hex) in UTF8
92 |
93 | // first we need to get rid of the control character (in our example, we get rid of 'C2' and
94 | // we just keep '99')
95 | String last_2_hexa_bytes = String.format("%02x", (int) character);
96 | // gets the byte array from above "hexa string representation"
97 | // (solution extracted from https://stackoverflow.com/a/140861/4064443)
98 | bytesArray[0] = (byte) ((Character.digit(last_2_hexa_bytes.charAt(0), 16) << 4)
99 | | Character.digit(last_2_hexa_bytes.charAt(1), 16));
100 | ByteBuffer wrappedBytes = ByteBuffer.wrap(bytesArray);
101 |
102 | // now we can properly decode from CP1252 charset
103 | String decodedString = CP1252.decode(wrappedBytes).toString();
104 | utf8_text.append(decodedString);
105 | }
106 | }
107 | }
108 |       else{ // "standard" characters are OK. We just copy them "as is"
109 | utf8_text.append(character);
110 | }
111 | }
112 | output.set(utf8_text.toString());
113 | return output;
114 | }
115 |
116 | @Override
117 | public String getDisplayString(String[] children) {
118 | return getStandardDisplayString(getFuncName(), children);
119 | }
120 |
121 | @Override
122 | protected String getFuncName() {
123 | return "decodeCP1252";
124 | }
125 | }
126 |
--------------------------------------------------------------------------------
/hive_compared_bq/hive.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | Copyright 2017 bol.com. All Rights Reserved
4 |
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | """
18 |
19 | import logging
20 | import sys
21 | import time
22 | # noinspection PyProtectedMember
23 | from hive_compared_bq import _Table
24 | import pyhs2 # TODO switch to another module since this one is deprecated and does not support Python 3
25 | # see notes in : https://github.com/BradRuderman/pyhs2
26 |
27 |
28 | class THive(_Table):
29 | """Hive implementation of the _Table object"""
30 |
31 | def __init__(self, database, table, parent, hs2_server, jar_path):
32 | _Table.__init__(self, database, table, parent)
33 | self.server = hs2_server
34 | self.connection = self._create_connection()
35 | self.jarPath = jar_path
36 |
37 | def get_type(self):
38 | return "hive"
39 |
40 | def _create_connection(self):
41 | """Connect to the table and return the connection object that we will use to launch queries"""
42 | return pyhs2.connect(host=self.server, port=10000, authMechanism="KERBEROS", database=self.database)
43 |
44 | def get_ddl_columns(self):
45 | if len(self._ddl_columns) > 0:
46 | return self._ddl_columns
47 |
48 | is_col_def = True
49 | cur = self.connection.cursor()
50 | cur.execute("describe " + self.full_name)
51 | all_columns = []
52 | while cur.hasMoreRows:
53 | row = cur.fetchone()
54 | if row is None:
55 | continue
56 | col_name = row[0]
57 | col_type = row[1]
58 |
59 | if col_name == "" or col_name == "None":
60 | continue
61 | if col_name.startswith('#'):
62 | if "Partition Information" in col_name:
63 | is_col_def = False
64 | continue
65 |
66 | my_dic = {"name": col_name, "type": col_type}
67 | if is_col_def:
68 | all_columns.append(my_dic)
69 | else:
70 | self._ddl_partitions.append(my_dic)
71 | cur.close()
72 |
73 | self.filter_columns_from_cli(all_columns)
74 |
75 | return self._ddl_columns
76 |
77 | def get_column_statistics(self, query, selected_columns):
78 | cur = self.connection.cursor()
79 | cur.execute(query)
80 | while cur.hasMoreRows:
81 | fetched = cur.fetchone()
82 | if fetched is not None:
83 | for idx, col in enumerate(selected_columns):
84 | value_column = fetched[idx]
85 | col["Counter"][value_column] += 1 # TODO what happens with NULL?
86 | cur.close()
87 |
88 | def create_sql_groupby_count(self):
89 | where_condition = ""
90 | if self.where_condition is not None:
91 | where_condition = "WHERE " + self.where_condition
92 | query = "SELECT hash( cast( %s as STRING)) %% %i AS gb, count(*) AS count FROM %s %s GROUP BY " \
93 | "hash( cast( %s as STRING)) %% %i" \
94 | % (self.get_groupby_column(), self.tc.number_of_group_by, self.full_name, where_condition,
95 | self.get_groupby_column(), self.tc.number_of_group_by)
96 | logging.debug("Hive query is: %s", query)
97 |
98 | return query
99 |
100 | def create_sql_show_bucket_columns(self, extra_columns_str, buckets_values):
101 | gb_column = self.get_groupby_column()
102 | where_condition = ""
103 | if self.where_condition is not None:
104 | where_condition = self.where_condition + " AND"
105 | hive_query = "SELECT hash( cast( %s as STRING)) %% %i as bucket, %s, %s FROM %s WHERE %s " \
106 | "hash( cast( %s as STRING)) %% %i IN (%s)" \
107 | % (gb_column, self.tc.number_of_group_by, gb_column, extra_columns_str, self.full_name,
108 | where_condition, gb_column, self.tc.number_of_group_by, buckets_values)
109 | logging.debug("Hive query to show the buckets and the extra columns is: %s", hive_query)
110 |
111 | return hive_query
112 |
113 | def create_sql_intermediate_checksums(self):
114 | column_blocks = self.get_column_blocks(self.get_ddl_columns())
115 | number_of_blocks = len(column_blocks)
116 | logging.debug("%i column_blocks (with a size of %i columns) have been considered: %s", number_of_blocks,
117 | self.tc.block_size, str(column_blocks))
118 |
119 | # Generate the concatenations for the column_blocks
120 | hive_basic_shas = ""
121 | for idx, block in enumerate(column_blocks):
122 | hive_basic_shas += "base64( unhex( SHA1( concat( "
123 | for col in block:
124 | name = col["name"]
125 | hive_value_name = name
126 | if col["type"] == 'date':
127 | hive_value_name = "cast( %s as STRING)" % name
128 | elif col["type"] == 'float' or col["type"] == 'double':
129 | hive_value_name = "cast( floor( %s * 10000 ) as bigint)" % name
130 | elif col["type"] == 'string' and name in self.decodeCP1252_columns:
131 | hive_value_name = "DecodeCP1252( %s)" % name
132 | hive_basic_shas += "CASE WHEN %s IS NULL THEN 'n_%s' ELSE %s END, '|'," % (name, name[:2],
133 | hive_value_name)
134 | hive_basic_shas = hive_basic_shas[:-6] + ")))) as block_%i,\n" % idx
135 | hive_basic_shas = hive_basic_shas[:-2]
136 |
137 | where_condition = ""
138 | if self.where_condition is not None:
139 | where_condition = "WHERE " + self.where_condition
140 |
141 | hive_query = "WITH blocks AS (\nSELECT hash( cast( %s as STRING)) %% %i as gb,\n%s\nFROM %s %s\n),\n" \
142 | % (self.get_groupby_column(), self.tc.number_of_group_by, hive_basic_shas, self.full_name,
143 | where_condition) # 1st CTE with the basic block shas
144 | list_blocks = ", ".join(["block_%i" % i for i in range(number_of_blocks)])
145 | hive_query += "full_lines AS(\nSELECT gb, base64( unhex( SHA1( concat( %s)))) as row_sha, %s FROM blocks\n)\n" \
146 | % (list_blocks, list_blocks) # 2nd CTE to get all the info of a row
147 | hive_list_shas = ", ".join(["base64( unhex( SHA1( concat_ws( '|', sort_array( collect_list( block_%i)))))) as "
148 | "block_%i_gb " % (i, i) for i in range(number_of_blocks)])
149 | hive_query += "SELECT gb, base64( unhex( SHA1( concat_ws( '|', sort_array( collect_list( row_sha)))))) as " \
150 | "row_sha_gb, %s FROM full_lines GROUP BY gb" % hive_list_shas # final query where all the shas
151 | # are grouped by row-blocks
152 | logging.debug("##### Final Hive query is:\n%s\n", hive_query)
153 |
154 | return hive_query
155 |
156 | def delete_temporary_table(self, table_name):
157 | self.query("DROP TABLE " + table_name).close()
158 |
159 | def query(self, query):
160 | """Execute the received query in Hive and return the cursor which is ready to be fetched and MUST be closed after
161 |
162 | :type query: str
163 | :param query: query to execute in Hive
164 |
165 | :rtype: :class:`pyhs2.cursor.Cursor`
166 | :returns: the cursor for this query
167 |
168 | :raises: IOError if the query has some execution errors
169 | """
170 | logging.debug("Launching Hive query")
171 |         # TODO the split size should depend on the file format (ORC, Avro...) and the number of columns
172 | # split_maxsize = 256000000
173 | # split_maxsize = 64000000
174 | split_maxsize = 8000000
175 | # split_maxsize = 16000000
176 | try:
177 | cur = self.connection.cursor()
178 | cur.execute("set mapreduce.input.fileinputformat.split.maxsize = %i" % split_maxsize)
179 | cur.execute("set hive.fetch.task.conversion=minimal") # force a MapReduce, because simple 'fetch' queries
180 | # on a large table may generate some timeout otherwise
181 | cur.execute(query)
182 | except:
183 |             raise IOError("There was a problem in executing the query in Hive: %s" % sys.exc_info()[1])
184 | logging.debug("Fetching Hive results")
185 | return cur
186 |
187 | def launch_query_dict_result(self, query, result_dic, all_columns_from_2=False):
188 | try:
189 | cur = self.query(query)
190 | while cur.hasMoreRows:
191 | row = cur.fetchone()
192 | if row is not None:
193 | if not all_columns_from_2:
194 | result_dic[row[0]] = row[1]
195 | else:
196 | result_dic[row[0]] = row[2:]
197 | except:
198 | result_dic["error"] = sys.exc_info()[1]
199 | raise
200 | finally:
201 | cur.close()
202 | logging.debug("All %i Hive rows fetched", len(result_dic))
203 |
204 | def launch_query_csv_compare_result(self, query, rows):
205 | cur = self.query(query)
206 | while cur.hasMoreRows:
207 | row = cur.fetchone()
208 | if row is not None:
209 | line = "^ " + " | ".join([str(col) for col in row]) + " $"
210 | rows.append(line)
211 | logging.debug("All %i Hive rows fetched", len(rows))
212 | cur.close()
213 |
214 | def launch_query_with_intermediate_table(self, query, result):
215 | try:
216 | cur = self.query("add jar " + self.jarPath) # must be in a separated execution
217 | cur.execute("create temporary function SHA1 as 'org.apache.hadoop.hive.ql.udf.UDFSha1'")
218 | cur.execute("create temporary function DecodeCP1252 as "
219 | "'org.apache.hadoop.hive.ql.udf.generic.GenericUDFDecodeCP1252'")
220 | except:
221 | result["error"] = sys.exc_info()[1]
222 | raise
223 |
224 | if "error" in result:
225 | cur.close()
226 | return # let's stop the thread if some error popped up elsewhere
227 |
228 | tmp_table = "%s.temp_hiveCmpBq_%s_%s" % (self.database, self.full_name.replace('.', '_'),
229 | str(time.time()).replace('.', '_'))
230 | cur.execute("CREATE TABLE " + tmp_table + " AS\n" + query)
231 | cur.close()
232 | result["names_sha_tables"][self.get_id_string()] = tmp_table # we confirm this table has been created
233 | result["cleaning"].append((tmp_table, self))
234 |
235 | logging.debug("The temporary table for Hive is " + tmp_table)
236 |
237 | if "error" in result: # A problem happened in the other query of the other table (usually BQ, since it is
238 | # faster than Hive) so there is no need to pursue or have the temp table
239 | return
240 |
241 | projection_hive_row_sha = "SELECT gb, row_sha_gb FROM %s" % tmp_table
242 | self.launch_query_dict_result(projection_hive_row_sha, result["sha_dictionaries"][self.get_id_string()])
243 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "{}"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2017 bol.com
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/hive_compared_bq/bq.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | Copyright 2017 bol.com. All Rights Reserved
4 |
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | """
18 |
19 | import logging
20 | import sys
21 | import time
22 | # noinspection PyProtectedMember
23 | from hive_compared_bq import _Table
24 | from google.cloud import bigquery
25 |
26 |
27 | class TBigQuery(_Table):
28 | """BigQuery implementation of the _Table object"""
29 |
30 | hash2_js_udf = '''create temp function hash2(text STRING)
31 | returns INT64
32 | LANGUAGE js AS """
33 | if(text === null){
34 | return 0 // same behaviour as in Hive
35 | }
36 | let myHash = 0
37 | for (let character of text){
38 | myHash = myHash * 31 + character.charCodeAt(0)
39 | if (myHash >= 4294967296){ // because in Hive hash() is computed on integers range
40 | myHash = myHash % 4294967296
41 | }
42 | }
43 | if (myHash >= 2147483648){
44 | myHash = myHash - 4294967296
45 | }
46 | return myHash
47 | """;
48 | '''
49 |
50 | def __init__(self, database, table, parent, project):
51 | _Table.__init__(self, database, table, parent)
52 |
53 |         self.project = project  # the Google Cloud project where this dataset/table belongs. If None, then the
54 |         # default project of the environment where this script is executed is used.
55 | self.connection = self._create_connection()
56 |
57 | # check that we can reach dataset and table
58 | dataset = self.connection.dataset(database)
59 | if not dataset.exists():
60 | raise AttributeError("The dataset %s:%s does not seem to exist or is unreachable" % (project, database))
61 |
62 | mytable = dataset.table(table)
63 | if not mytable.exists():
64 | raise AttributeError("The table %s:%s.%s does not seem to exist or is unreachable" %
65 | (project, database, table))
66 |
67 | def get_type(self):
68 | return "bigQuery"
69 |
70 | def _create_connection(self):
71 | """Connect to the table and return the connection object that we will use to launch queries"""
72 | if self.project is None:
73 | return bigquery.Client()
74 | else:
75 | return bigquery.Client(project=self.project)
76 |
77 | def get_ddl_columns(self):
78 | if len(self._ddl_columns) > 0:
79 | return self._ddl_columns
80 | else:
81 | dataset = self.connection.dataset(self.database)
82 | table = dataset.table(self.table)
83 | table.reload()
84 | schema = table.schema
85 |
86 | all_columns = []
87 | for field in schema:
88 | col_name = str(field.name)
89 | col_type = str(
90 | field.field_type.lower()) # force 'str' to remove unicode notation and align it to Hive format
91 | # let's align the types with the ones in Hive
92 | if col_type == 'integer':
93 | col_type = 'bigint'
94 | my_dic = {"name": col_name, "type": col_type}
95 | all_columns.append(my_dic)
96 |
97 | self.filter_columns_from_cli(all_columns)
98 |
99 | return self._ddl_columns
100 |
101 | def get_column_statistics(self, query, selected_columns):
102 | for row in self.query(query):
103 | for idx, col in enumerate(selected_columns):
104 | value_column = row[idx]
105 | col["Counter"][value_column] += 1
106 |
107 | def create_sql_groupby_count(self):
108 | where_condition = ""
109 | if self.where_condition is not None:
110 | where_condition = "WHERE " + self.where_condition
111 | query = self.hash2_js_udf + "SELECT MOD( hash2( cast(%s as STRING)), %i) as gb, count(*) as count FROM %s %s " \
112 | "GROUP BY gb ORDER BY gb" \
113 | % (self.get_groupby_column(), self.tc.number_of_group_by, self.full_name,
114 | where_condition)
115 | logging.debug("BigQuery query is: %s", query)
116 | return query
117 |
118 | def create_sql_show_bucket_columns(self, extra_columns_str, buckets_values):
119 | where_condition = ""
120 | if self.where_condition is not None:
121 | where_condition = self.where_condition + " AND"
122 | gb_column = self.get_groupby_column()
123 | bq_query = self.hash2_js_udf + "SELECT MOD( hash2( cast(%s as STRING)), %i) as bucket, %s as gb, %s FROM %s " \
124 | "WHERE %s MOD( hash2( cast(%s as STRING)), %i) IN (%s)" \
125 | % (gb_column, self.tc.number_of_group_by, gb_column, extra_columns_str,
126 | self.full_name, where_condition, gb_column, self.tc.number_of_group_by,
127 | buckets_values)
128 | logging.debug("BQ query to show the buckets and the extra columns is: %s", bq_query)
129 |
130 | return bq_query
131 |
132 | def create_sql_intermediate_checksums(self):
133 | column_blocks = self.get_column_blocks(self.get_ddl_columns())
134 | number_of_blocks = len(column_blocks)
135 | logging.debug("%i column_blocks (with a size of %i columns) have been considered: %s", number_of_blocks,
136 | self.tc.block_size, str(column_blocks))
137 |
138 | # Generate the concatenations for the column_blocks
139 | bq_basic_shas = ""
140 | for idx, block in enumerate(column_blocks):
141 | bq_basic_shas += "TO_BASE64( sha1( concat( "
142 | for col in block:
143 | name = col["name"]
144 | bq_value_name = name
145 | if col["type"] == 'decimal': # removing trailing & unnecessary 'zero decimal' (*.0)
146 | bq_value_name = 'regexp_replace( %s, "\\.0$", "")' % name
147 | elif col["type"] == 'float' or col["type"] == 'double':
148 | bq_value_name = "cast( cast( FLOOR( %s * 10000) as INT64) as STRING)" % name
149 | elif not col["type"] == 'string':
150 | bq_value_name = "cast( %s as STRING)" % name
151 | bq_basic_shas += "CASE WHEN %s IS NULL THEN 'n_%s' ELSE %s END, '|'," % (name, name[:2], bq_value_name)
152 | bq_basic_shas = bq_basic_shas[:-6] + "))) as block_%i,\n" % idx
153 | bq_basic_shas = bq_basic_shas[:-2]
154 |
155 | where_condition = ""
156 | if self.where_condition is not None:
157 | where_condition = "WHERE " + self.where_condition
158 |
159 | bq_query = self.hash2_js_udf + "WITH blocks AS (\nSELECT MOD( hash2( cast(%s as STRING)), %i) as gb,\n%s\n" \
160 | "FROM %s %s\n),\n" \
161 | % (self.get_groupby_column(), self.tc.number_of_group_by, bq_basic_shas,
162 | self.full_name, where_condition) # 1st CTE with the basic block shas
163 | list_blocks = ", ".join(["block_%i" % i for i in range(number_of_blocks)])
164 | bq_query += "full_lines AS(\nSELECT gb, TO_BASE64( sha1( concat( %s))) as row_sha, %s FROM blocks\n)\n" \
165 | % (list_blocks, list_blocks) # 2nd CTE to get all the info of a row
166 | bq_list_shas = ", ".join(["TO_BASE64( sha1( STRING_AGG( block_%i, '|' ORDER BY block_%i))) as block_%i_gb "
167 | % (i, i, i) for i in range(number_of_blocks)])
168 | bq_query += "SELECT gb, TO_BASE64( sha1( STRING_AGG( row_sha, '|' ORDER BY row_sha))) as row_sha_gb, %s FROM " \
169 | "full_lines GROUP BY gb" % bq_list_shas # final query where all the shas are grouped by row-blocks
170 | logging.debug("##### Final BigQuery query is:\n%s\n", bq_query)
171 |
172 | return bq_query
173 |
174 | def delete_temporary_table(self, table_name):
175 | pass # The temporary (cached) tables in BigQuery are deleted after 24 hours
176 |
177 | def query(self, query):
178 | """Execute the received query in BigQuery and return an iterate Result object
179 |
180 | :type query: str
181 | :param query: query to execute in BigQuery
182 |
183 | :rtype: list of rows
184 | :returns: the QueryResults for this query
185 | """
186 | logging.debug("Launching BigQuery query")
187 | q = self.connection.run_sync_query(query)
188 | q.timeout_ms = 600000 # 10 minutes to execute the BQ query should be more than enough. 1 minute was too short
189 | # TODO use maxResults https://cloud.google.com/bigquery/docs/reference/rest/v2/jobs/query? :
190 | q.use_legacy_sql = False
191 | q.run()
192 | logging.debug("Fetching BigQuery results")
193 | return q.fetch_data()
194 |
195 | def query_ctas_bq(self, query):
196 | """Execute the received query in BigQuery and return the name of the cache results table
197 |
198 |         This is the equivalent of a "Create Table As Select" in Hive. The advantage is that BigQuery only keeps that
199 |         table for 24 hours (we don't have to delete it ourselves, as we do for Hive), and we're not charged for the
200 |         space used.
201 |
202 | :type query: str
203 | :param query: query to execute in BigQuery
204 |
205 | :rtype: str
206 | :returns: the full name of the cache table (dataset.table) that stores those results
207 |
208 | :raises: IOError if the query has some execution errors
209 | """
210 | logging.debug("Launching BigQuery CTAS query")
211 | job_name = "job_hive_compared_bq_%f" % time.time() # Job ID must be unique
212 | job = self.connection.run_async_query(job_name.replace('.', '_'),
213 | query) # replace(): Job IDs must be alphanumeric
214 | job.use_legacy_sql = False
215 | job.begin()
216 |         time.sleep(3)  # 3 seconds is the minimum latency we get in BQ in general. So no need to try fetching before
217 | retry_count = 300 # 10 minutes (because of below time sleep of 2 seconds). This should be enough
218 | while retry_count > 0 and job.state != 'DONE':
219 | retry_count -= 1
220 | time.sleep(2)
221 | job.reload()
222 | logging.debug("BigQuery CTAS query finished")
223 |
224 | if job.errors is not None:
225 | raise IOError("There was a problem in executing the query in BigQuery: %s" % str(job.errors))
226 |
227 | cache_table = job.destination.dataset_name + '.' + job.destination.name
228 | logging.debug("The cache table of the final comparison query in BigQuery is: " + cache_table)
229 |
230 | return cache_table
231 |
232 | def launch_query_dict_result(self, query, result_dic, all_columns_from_2=False):
233 | for row in self.query(query):
234 | if not all_columns_from_2:
235 | result_dic[row[0]] = row[1]
236 | else:
237 | result_dic[row[0]] = row[2:]
238 | logging.debug("All %i BigQuery rows fetched", len(result_dic))
239 |
240 | def launch_query_csv_compare_result(self, query, rows):
241 | for row in self.query(query):
242 | line = "^ " + " | ".join([str(col) for col in row]) + " $"
243 | rows.append(line)
244 | logging.debug("All %i BigQuery rows fetched", len(rows))
245 |
246 | def launch_query_with_intermediate_table(self, query, result):
247 | try:
248 | result["names_sha_tables"][self.get_id_string()] = self.query_ctas_bq(query)
249 | projection_gb_row_sha = "SELECT gb, row_sha_gb FROM %s" % result["names_sha_tables"][self.get_id_string()]
250 | self.launch_query_dict_result(projection_gb_row_sha, result["sha_dictionaries"][self.get_id_string()])
251 | except:
252 | result["error"] = sys.exc_info()[1]
253 | raise
254 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # hive_compared_bq
2 |
3 | hive_compared_bq is a Python program that compares 2 (SQL-like) tables and graphically shows the rows/columns that differ, if any are found.
4 |
5 | Comparing the full content of some tables in a scalable way has proved to be a difficult task.
6 |
7 | hive_compared_bq tackles this problem by:
8 | * Not moving the data, avoiding the long data transfers typically needed for a "Join compared" approach. Instead, only the aggregated data is moved.
9 | * Working on the full datasets and all the columns, so that we are 100% sure that the 2 tables are the same
10 | * Leveraging the tools (currently: Hive, BigQuery) to be as scalable as possible
11 | * Showing the differences found to the developer in a graphical way, so that the developer can easily understand the mistakes and fix them
12 |
13 | # Table of contents
14 | * [Features](#features)
15 | * [Installation](#installation)
16 | + [Note Python version 2.6](#note-python-version-26)
17 | + [Installing Python 2.7](#installing-python-27)
18 | + [For Hive](#for-hive)
19 | - [Installation of the required UDF](#installation-of-the-required-udf)
20 | + [For Big Query](#for-big-query)
21 | * [Usage](#usage)
22 | + [Get Help](#get-help)
23 | + [Basic execution](#basic-execution)
24 | + [Explanation of results](#explanation-of-results)
25 | - [Case of identical tables](#case-of-identical-tables)
26 | - [Case of number of rows not matching](#case-of-number-of-rows-not-matching)
27 | - [Case of differences inside the rows](#case-of-differences-inside-the-rows)
28 | + [Advanced executions](#advanced-executions)
29 | - [Faster executions](#faster-executions)
30 | - [Skewing problem](#skewing-problem)
31 | - [Schema not matching](#schema-not-matching)
32 | - [HBase tables](#hbase-tables)
33 | - [Encoding differences between Hive and BigQuery](#encoding-differences-between-hive-and-bigquery)
34 | - [Problems in the selection of the GroupBy column](#problems-in-the-selection-of-the-groupby-column)
35 | * [Algorithm](#algorithm)
36 | + [Imprecision due to "float" of "double" types](#imprecision-due-to--float--of--double--types)
37 |
38 | ## Features
39 |
40 | * Engines supported: Hive, BigQuery (and HBase to some extent). In theory, it is easy to extend it to other SQL backends such as Spanner, CloudSQL, Oracle... Help is welcomed :) !
41 | * Possibility to only select specific columns or to remove some of them (useful if the schema between the tables is not exactly the same, or if we know that some columns are different and we don't want them to "pollute" the results)
42 | * Possibility to just do a quick check (counting the rows in an advanced way) instead of complete checksum verification
43 | * Detection of skew
44 |
45 | ## Installation
46 |
47 | This software works on Mac (tested on 10.11 and 10.12) and on Linux (tested on Red Hat 6). It is not expected to work on Windows.
48 |
49 | Make sure the following software is available:
50 | * Python = 2.7 or 2.6 (see [restrictions for 2.6](#note-python-version-26))
51 | * pyhs2 (just for Hive)
52 | * google.cloud.bigquery (just for BigQuery)
53 |
54 | ### Note Python version 2.6
55 | BigQuery is not supported by this version of Python (Google Cloud SDK only supports Python 2.7)
56 |
57 | You need to install a backport of Python's collections module. For instance, on a Red Hat server it would be:
58 | ```bash
59 | yum install python-backport_collections.noarch
60 | ```
61 |
62 | ### Installing Python 2.7
63 | On RHEL 6, Python 2.6 is installed by default and no Python 2.7 RPM was available in EPEL, so I compiled it this way:
64 | ```bash
65 | su -c "yum install gcc gcc-c++ zlib-devel sqlite-devel openssl-devel"
66 | wget https://www.python.org/ftp/python/2.7.13/Python-2.7.13.tgz
67 | tar xvfz Python-2.7.13.tgz
68 | cd Python-2.7.13
69 | ./configure --with-ensurepip=install
70 | make
71 | su -c "make altinstall"
72 | su -c "echo 'export CLOUDSDK_PYTHON=/usr/local/bin/python2.7' >> /etc/bashrc"
73 | export CLOUDSDK_PYTHON=/usr/local/bin/python2.7
74 | ```
75 |
76 | ### For Hive
77 | Execute this as "root":
78 | ```bash
79 | yum install cyrus-sasl-gssapi cyrus-sasl-devel
80 | pip install pyhs2
81 | ```
82 |
83 |
84 | #### Installation of the required UDF
85 |
86 | On the Hadoop cluster where Hive executes, you must copy the library: `udf/hcbq.jar` (this jar contains 2 UDFs: one to compute the SHA1s, and the other to handle encoding translation).
87 |
88 | Place this jar in an HDFS directory where you have read access (in the example below, we will consider that this jar has been copied to `/user/sluangsay/lib/hcbq.jar`).
89 |
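A minimal sketch of that copy step (the destination directory is just the one used in the examples of this README; adjust it to a path of your own):

```bash
# upload the UDF jar to HDFS (example path, adjust to your cluster)
hdfs dfs -mkdir -p /user/sluangsay/lib
hdfs dfs -put udf/hcbq.jar /user/sluangsay/lib/hcbq.jar
```

Note that hive_compared_bq registers the two temporary functions itself when it runs its queries (see `hive_compared_bq/hive.py`), so you only need to make the jar readable on HDFS.
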
90 | ### For Big Query
91 |
92 | (steps extracted from https://cloud.google.com/bigquery/docs/reference/libraries#client-libraries-install-python)
93 |
94 | To execute the commands below, make sure that you already have a Google Cloud account with a BigQuery project created.
95 |
96 | ```bash
97 | mkdir -p ~/bin/googleSdk
98 | cd ~/bin/googleSdk
99 | su -c "pip install --upgrade google-cloud-bigquery"
100 |
101 | wget https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-170.0.0-linux-x86_64.tar.gz
102 | tar xvfz google-cloud-sdk-170.0.0-linux-x86_64.tar.gz
103 |
104 | ./google-cloud-sdk/bin/gcloud init
105 | ./google-cloud-sdk/bin/gcloud auth application-default login
106 | ./google-cloud-sdk/install.sh
107 | ```
108 |
109 | This last command tells you to source 2 files in every session, so include them in your bash profile. For instance, in my case it would be:
110 | ```bash
111 | echo "source ${HOME}/bin/googleSdk/google-cloud-sdk/path.bash.inc" >> ~/.bash_profile
112 | echo "source ${HOME}/bin/googleSdk/google-cloud-sdk/completion.bash.inc" >> ~/.bash_profile
113 | ```
114 | Open a new bash session to activate those changes.
115 |
116 | ## Usage
117 |
118 | ### Get Help
119 |
120 | To see a quick overview of the options of hive_compared_bq.py, just execute it without any arguments:
121 | ```bash
122 | python hive_compared_bq.py
123 | ```
124 |
125 | When calling the help, you get more details on the different options and also some examples:
126 | ```bash
127 | python hive_compared_bq.py --help
128 | ```
129 |
130 | ### Basic execution
131 |
132 | You must indicate the 2 tables to compare as arguments (the first table will be considered as the "source" table, and the second one as the "destination" table).
133 |
134 | Each table must have the following format: `type`/`database`.`table`
135 | where:
136 | * `type` is the technology of your database (currently, only 'hive' and 'bq' (BigQuery) are supported)
137 | * `database` is the name of your database (also called "dataset" in BigQuery)
138 | * `table` is of course the name of your table
139 |
140 | About the location of those databases:
141 | * In the case of BigQuery, the default Google Cloud project configured in your environment is selected.
142 | If you want to specify another project, you must indicate it with the `project` parameter inside the `-s` or `-d` options.
143 | * In the case of Hive, you must specify the hostname of the HiveServer2, using the `hs2` parameter inside the `-s` or `-d` options.
144 |
145 | Another note for Hive: you need to pass the HDFS location of the jar of the required UDFs (see the Hive installation above), using the 'jar' option.
146 |
147 | To clarify all the above, let's consider that we want to compare the following 2 tables:
148 | * A Hive table called `hive_compared_bq_table`, inside the database `sluangsay`.
149 | With those parameters, the argument to give is: `hive/sluangsay.hive_compared_bq_table`.
150 | Let's suppose also that the hostname of HiveServer2 is `master-003.bol.net`, and that we installed the required Jar in the HDFS path 'hdfs://hdp/user/sluangsay/lib/hcbq.jar'. Then, since this table is the first one on our command line, it is the source table and we need to define the option: `-s "{'jar': 'hdfs://hdp/user/sluangsay/lib/hcbq.jar', 'hs2': 'master-003.bol.net'}"`
151 | * A BigQuery table also called `hive_compared_bq_table`, inside the dataset `bidwh2`.
152 |
153 | To compare the above 2 tables, you need to execute:
154 | ```bash
155 | python hive_compared_bq.py -s "{'jar': 'hdfs://hdp/user/sluangsay/lib/hcbq.jar', 'hs2': 'master-003.bol.net'}" hive/sluangsay.hive_compared_bq_table bq/bidwh2.hive_compared_bq_table
156 | ```
157 |
158 | ### Explanation of results
159 |
160 | #### Case of identical tables
161 |
162 | When executing the above command, if the 2 tables have exactly the same data, then we get an output similar to:
163 |
164 | ```
165 | [INFO] [2017-09-15 10:59:20,851] (MainThread) Analyzing the columns ['rowkey', 'calc_timestamp', 'categorization_step', 'dts_modified', 'global_id', 'product_category', 'product_group', 'product_subgroup', 'product_subsubgroup', 'unit'] with a sample of 10000 values
166 | [INFO] [2017-09-15 10:59:22,285] (MainThread) Best column to do a GROUP BY is rowkey (occurrences of most frequent value: 1 / the 50 most frequentvalues sum up 50 occurrences)
167 | [INFO] [2017-09-15 10:59:22,286] (MainThread) Executing the 'Group By' Count queries for sluangsay.hive_compared_bq_table (hive) and bidwh2.hive_compared_bq_table (bigQuery) to do first comparison
168 | No differences were found when doing a Count on the tables sluangsay.hive_compared_bq_table and bidwh2.hive_compared_bq_table and grouping by on the column rowkey
169 | [INFO] [2017-09-15 10:59:48,727] (MainThread) Executing the 'shas' queries for hive_sluangsay.hive_compared_bq_table and bigQuery_bidwh2.hive_compared_bq_table to do final comparison
170 | Sha queries were done and no differences were found: the tables hive_sluangsay.hive_compared_bq_table and bigQuery_bidwh2.hive_compared_bq_table are equal!
171 | ```
172 |
173 | The first 2 lines describe the first step of the algorithm (see the explanation of the algorithm later), and we see that `rowkey` is the column used here to perform the `Group By` operations.
174 | Then, the next 2 lines show the second step: we count the number of rows for each value of `rowkey`. In this case, those counts match for the 2 tables.
175 | Finally, we do the extensive computation of the SHAs for all the columns and rows of the 2 tables. At the end, the script tells us that our 2 tables are identical.
176 | We can also check the return value of the script: it is 0, which means no differences.
177 |
178 | #### Case of number of rows not matching
179 |
180 | The first difference that can be observed is that the number of rows associated with (at least) one GroupBy value does not match between the 2 tables.
181 | In such a case, the standard output would be similar to:
182 |
183 | ```
184 | [INFO] [2017-09-18 08:09:06,947] (MainThread) Analyzing the columns ['rowkey', 'calc_timestamp', 'categorization_step', 'dts_modified', 'global_id', 'product_category', 'product_group', 'product_subgroup', 'product_subsubgroup', 'unit'] with a sample of 10000 values
185 | [INFO] [2017-09-18 08:09:07,739] (MainThread) Best column to do a GROUP BY is rowkey (occurrences of most frequent value: 1 / the 50 most frequentvalues sum up 50 occurrences)
186 | [INFO] [2017-09-18 08:09:07,739] (MainThread) Executing the 'Group By' Count queries for sluangsay.hive_compared_bq_table2 (hive) and bidwh2.hive_compared_bq_table2 (bigQuery) to do first comparison
187 | [INFO] [2017-09-18 08:09:35,392] (MainThread) We found at least 3 differences in Group By count
188 | ```
189 |
190 | And the return value of the script would be 1.
191 |
192 | Some of the differences are also shown in an HTML file that is automatically opened in your browser:
193 |
194 | 
195 |
196 | This shows a table, with the rows of one compared table on the left side and the rows of the other on the right side.
197 | The names of the tables appear at the top.
198 | There we can also see the names of the columns that are shown.
199 | For performance reasons and for the sake of brevity, only 7 columns are shown.
200 | The first 2 columns are "special columns": the GroupBy column (2nd column) and, just before it, its SHA1 value (1st column).
201 | In this example, we can see that only rows on the left side appear: this is because the table on the right does not contain any rows with the rowkeys 21411000029, 65900009 and 6560009.
202 |
203 | #### Case of differences inside the rows
204 |
205 | If the numbers of rows match, it is still possible to observe some differences in the last (SHA1) step. In that case, we would get a message similar to:
206 |
207 | ```
208 | [INFO] [2017-09-28 05:53:55,890] (MainThread) Analyzing the columns ['rowkey', 'calc_timestamp', 'categorization_step', 'dts_modified', 'global_id', 'product_category', 'product_group', 'product_subgroup', 'product_subsubgroup', 'unit'] with a sample of 10000 values
209 | [INFO] [2017-09-28 05:53:56,897] (MainThread) Best column to do a GROUP BY is rowkey (occurrences of most frequent value: 1 / the 50 most frequentvalues sum up 50 occurrences)
210 | [INFO] [2017-09-28 05:53:56,897] (MainThread) Executing the 'shas' queries for hive_sluangsay.hive_compared_bq_table3 and bigQuery_bidwh2.hive_compared_bq_table3 to do final comparison
211 | [INFO] [2017-09-28 05:54:26,961] (MainThread) We found 2 differences in sha verification
212 | Showing differences for columns product_category ,product_group ,product_subgroup ,product_subsubgroup ,unit
213 | ```
214 |
215 | The return value of the script would be 1.
216 |
217 | Some of the differences are also shown in an HTML file that is automatically opened in your browser:
218 |
219 | 
220 |
221 | We can see information similar to what was shown for differences in the [Count validation](#case-of-number-of-rows-not-matching).
222 | The 7 columns are: first the 2 "special columns", then the 5 columns of a "column block" (see [algorithm](#algorithm) for explanations) which contains some differences. That means that at least 1 of those 5 columns has at least 1 differing value.
223 | In our example, we can see that the 2 rows have some differences in the column `product_subgroup`. Those differences are highlighted in yellow.
224 |
225 | If there are several "column blocks" that have some differences, then the program will first show the column block that contains the most "row blocks" with differences (note: that does not mean that it is the column block that contains the most differences. We could indeed have a column block with just 1 row block with differences, but that 1 row block could contain thousands of rows with differences. On the other hand, we could imagine another column block with 2 row blocks containing differences, where each row block contains a single row with differences).
226 | Afterwards, the program will ask you whether you wish to see another column block with differences.
227 |
228 | ### Advanced executions
229 |
230 | #### Faster executions
231 |
232 | By default, the script executes 3 steps (see the explanation of how the algorithm works).
233 | You can run the script faster by skipping steps that you think are unnecessary:
234 |
235 | * with the `--group-by-column` option, you can indicate directly which column you want to use for the Group By.
236 | That means that the first step, which analyzes a sample of 10 columns, won't be needed.
237 | This step won't usually make a huge difference in the execution time (it usually takes 1-2 seconds), but by skipping it you avoid launching a query that might cost you some money (in BigQuery, for example).
238 | With this option, you might also be able to provide a better column than the one the script would have discovered by itself, which might speed up the following queries (by avoiding skew for instance, see notes later).
239 |
240 | * with the `--just-count` option, you say that you just want to do a 'count rows validation'.
241 | This is some kind of "basic validation" because you won't be sure that the contents of the rows have identical values.
242 | But that allows you to avoid the final "full SHAs validation" step, which is more expensive and time-consuming to compute.
243 | And maybe this 'count' validation is enough for you now, because you are at an early stage of development, and you don't need to be 100% sure of your data
244 | (checking the count of rows is also a good way to double check that some JOINs or some filter conditions work properly).
245 | Don't forget to eventually run a full SHAs validation when you finish developing.
246 |
247 | * with `--just-sha`, you specify that you don't need the 'count' validation. If you know from previous executions that the counts are correct, then you might indeed decide to skip that previous step.
248 | However, it is a bit at your own risk, because if the counts are not correct, the script will fail but you will have executed a more complex/costly query for that (the 'count' validation uses faster/cheaper queries).
249 |
250 | Another way to make your validation run faster is to limit its scope. If you decide to validate less data, then you need to process less data, meaning that your queries will be faster/cheaper:
251 |
252 | * for instance, you might be interested in just validating some specific critical columns (maybe because you know that your ETL process does not make any changes on some columns, so why "validate" them?).
253 | In such a case, you can use the `--column-range`, `--columns` or `--ignore-columns` options (see the sketch at the end of this section).
254 |
255 | * another example would be just validating some specific partitions. If your data is partitioned by days for instance, then you might decide to only validate 1 day of data.
256 | In such a case you have to use the `--source-where` and `--destination-where` options to specify a WHERE condition that tells which partition you want to consider.
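
The following is a small, hypothetical Python sketch (not part of the tool itself) that only illustrates how a `--column-range` value such as `2:7` is interpreted: the script applies it as a Python slice on the list of columns of the table DDL (see `filter_columns_from_cli` in `hive_compared_bq.py`). The column names below are made up for the example.

```
import re

def apply_column_range(all_columns, column_range):
    """Mimic the slice semantics of --column-range (Python style 'start:end')."""
    match = re.match(r'(\d*):(\d*)', column_range)
    start = int(match.group(1)) if match.group(1) else 0
    end = int(match.group(2)) if match.group(2) else len(all_columns)
    return all_columns[start:end]

# made-up DDL, for illustration only
columns = ["id", "name", "price", "currency", "category", "subgroup", "unit"]
print(apply_column_range(columns, "2:7"))  # ['price', 'currency', 'category', 'subgroup', 'unit']
```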
257 |
258 | #### Skewing problem
259 |
260 | The program runs several queries with GroupBy operations. As with every GroupBy operation on a huge volume of data, skew can be a performance killer, or can even make the query fail because of lack of resources.
261 |
262 | The "count comparison" step (see description of the algorithm to know more about the steps) does not do any complex computation, so skew is annoying but should not be a very big issue.
263 | However, the "SHA1 comparison" step launches some heavy queries, and skew can mean a huge difference.
264 | In our algorithm, skew is determined by the skew in the distribution of the GroupBy column. This is why the selection of this column in the first step of the program is crucial, and why specifying yourself a column with a good distribution can make a huge difference.
265 | For the above reasons, a skew detection is done during the "count comparison" step. If a "groupBy value" appears in more than 40 000 rows (you may change this default value with the '--skew-threshold' option), then this groupBy value is registered and a warning message like this appears:
266 | ```
267 | Some important skew (threshold: %i) was detected in the Group By column x. The top values are:
268 | ```
269 | Following that message, the list of the top 10 skewed values (and their numbers of occurrences) is shown, so that a developer knows to avoid selecting that GroupBy column the next time he runs the program.
270 | It also gives the opportunity to check whether it is normal for this dataset to contain such skewed values.
271 |
272 | If the "count comparison" step has not detected any difference, but some skew has been discovered, then the program will also stop, with the following message:
273 | ```
274 | No difference in Group By count was detected but we saw some important skew that could make the next step (comparison of the shas) very slow or failing. So better stopping now. You should consider choosing another Group By column with the '--group-by-column' option
275 | ```
276 | As explained before, stopping at this stage is a protection to avoid launching some heavy/costly queries that have a high probability of failing.
277 | Should you face this situation, your best option is to specify a GroupBy column with a better distribution via the `--group-by-column` option. Another possibility is to raise the threshold with `--skew-threshold`: in that case you accept and understand the risk of launching the SHA1 computations with these skewed values.
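
As an illustration only (a sketch, not the actual implementation), the skew check performed during the "count comparison" step boils down to the following Python logic: given the number of rows counted for each GroupBy value, report the values that exceed the threshold.

```
from collections import Counter

def detect_skew(rows_per_groupby_value, skew_threshold=40000):
    """Return the top 10 skewed GroupBy values, i.e. those that appear in more rows than the threshold."""
    skew = Counter({value: count for value, count in rows_per_groupby_value.items()
                    if count > skew_threshold})
    return skew.most_common(10)

# toy example: the value 'UNKNOWN' is heavily skewed
print(detect_skew({"UNKNOWN": 1200000, "B00123": 35, "B00124": 40}))
```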
278 |
279 | #### Schema not matching
280 |
281 | To do the comparison, the program needs to first discover the schemas of the tables. What is actually done is fetching the schema of the "source table", and assuming that the "destination table" has the same schema.
282 |
283 | If the schemas between the 2 tables don't match, then by default it is not possible to compare them.
284 | This can easily happen for instance with partitioned tables in BigQuery, where the partition column is called `_PARTITIONTIME` and may not match the name of the corresponding column in Hive.
285 | To overcome this, there are 2 possibilities:
286 | * creating some views (in Hive or BigQuery) to rename or remove some columns. Take care not to make such a view too complex, otherwise it could lower the validity of the comparison.
287 | * limiting the columns you want to compare, using the options `--column-range`, `--columns`, `--ignore-columns`. The drawback of this approach is that you cannot rename columns.
288 |
289 | #### HBase tables
290 |
291 | It is also possible to run the comparison against HBase tables.
292 | To do so, you need to create a Hive table with a HBase backend (see documentation: https://cwiki.apache.org/confluence/display/Hive/HBaseIntegration ).
293 | Be aware that Hive on top of HBase has some limitations, so you won't be able to check all the data in your HBase tables (for instance: you cannot see the timestamps, or older versions of a cell, and all the columns must be properly defined in the Hive schema).
294 |
295 | #### Encoding differences between Hive and BigQuery
296 |
297 | All the data that is internally stored by BigQuery is automatically saved with a UTF-8 encoding. That can cause problems when some strings in Hive use another encoding: the byte representations of the columns may not match even if the 'texts' are the same.
298 | To overcome this, a Hive UDF has been developed in order to transform some specific columns with a CP1252 encoding into Google's UTF-8 encoding.
299 | Use the `--decodeCP1252-columns` option to specify those columns. Take care: this code is quite experimental and does not guarantee that the "translation of encoding" will always work.
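
Conceptually (this is only a sketch, not the actual Java code of the UDF), the transformation amounts to reinterpreting the raw bytes as CP1252 text and re-encoding them as UTF-8, so that the byte representations on both sides can match:

```
def decode_cp1252_to_utf8(raw_bytes):
    """Reinterpret CP1252-encoded bytes and return their UTF-8 representation (illustrative sketch)."""
    return raw_bytes.decode('cp1252').encode('utf-8')

print(decode_cp1252_to_utf8(b'b\xe9b\xe9'))  # b'b\xc3\xa9b\xc3\xa9', i.e. "bebe" with accents re-encoded in UTF-8
```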
300 |
301 | #### Problems in the selection of the GroupBy column
302 |
303 | The first step of the algorithm tries to estimate the best "GroupBy" column (see the explanations about the [algorithm](#algorithm) below).
304 | However, this estimation can go wrong, and in such a case the following message appears:
305 | ```
306 | Error: we could not find a suitable column to do a Group By. Either relax the selection condition with the '--max-gb-percent' option or directly select the column with '--group-by-column'
307 | ```
308 |
309 | If you face this problem, then just follow one of the 2 solutions proposed in the message above.
310 | The best solution is obviously to have some knowledge about the data and to directly indicate to the script which column is the best one to do some GroupBy, using the `--group-by-column` option.
311 | The other possibility is to use the `--max-gb-percent` option and to make it higher than the default value (1%), in order to allow a less homogeneous distribution in the data of the GroupBy column and thus increase the probability of finding a suitable GroupBy column.
312 |
313 | ## Algorithm
314 |
315 | The goal of hive_compared_bq was to avoid all the shortcomings of previous approaches that tried to solve the same comparison problem.
316 | Some of those approaches are:
317 | * Manual check, comparing some few rows and counting the total number of rows.
318 | The problem with this approach is obviously that it does not scale beyond a certain amount of data or with tables with a lot of columns. So the check cannot be complete.
319 | * Doing some "JOINs" comparisons between the 2 tables.
320 | The drawback of this is that Joins are quite slow operations in general. What is more, it is very difficult to do a fair comparison when there are multiple rows with the same "Join column".
321 | The last inconvenience is that you have to ensure that the 2 tables are located in the same place, which might force you to transfer the data from 1 database to another one (ex: from a Hive database to a BigQuery database), which takes a long time if the table is huge.
322 | * Sorting all the rows and doing a Diff between them (just like described here: https://community.hortonworks.com/articles/1283/hive-script-to-validate-tables-compare-one-with-an.html ).
323 | The inconvenience with this is also that the 2 tables have to be located in the same place. What is more, the "Diff" operation occurs in memory, which means that this approach does not work with huge tables.
324 |
325 | To solve the above problems, it was decided to:
326 | * develop an algorithm that would leverage the BigData backends (Hive, BigQuery or others): all the complicated computations must be performed on those engines that naturally scale.
327 | That means that the goal of the algorithm is to generate some complex SQL queries that will be sent to those backends, and then that will compare the results of those queries.
328 | Those results must be small, otherwise the Python program would not be able to do the final comparison in memory.
329 | * compare all the rows and all the columns of the tables.
330 | To ensure that we get a relatively small amount of results (see the reason just above) from a huge amount of data, we need to "reduce" the results.
331 | Using a checksum algorithm (SHA1) is appropriate because it performs that reduction while also giving a high confidence in the validity of the comparison.
332 |
333 | In summary, the idea of the program is to compute, locally to each table, the checksums of all the rows/columns, to "Group By" those checksums and to generate some global checksums on top of them.
334 | All those global checksums are then transferred back to the program, where the final comparison is made and where potential differences are shown to the user.
335 | The following simplified pseudo SQL query summarizes this idea:
336 |
337 | WITH blocks AS (
338 | SELECT MOD( hash2( column), 100000) as gb, sha1(concat( col0, col1, col2, col3, col4)) as block_0,
339 | sha1(concat( col5, col6, col7, col8, col9)) as block_1, ... as block_N FROM table
340 | ),
341 | full_lines AS (
342 |         SELECT gb, sha1(concat( block_0, '|', block_1 ...)) as row_sha, block_0, block_1 ... FROM blocks
343 | )
344 | SELECT gb, sha1(concat(list)) as sline, sha1(concat(list)) as sblock_1,
345 |         sha1(concat(list)) as sblock_2 ... as sblock_N FROM full_lines GROUP BY gb
346 |
347 | The real query is obviously a bit more complex, because we need to take care of type conversions, NULL values and the ordering of the rows for the same GroupBy values.
348 | If you feel interested in seeing the real query, launch the script with the `--verbose` option.
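
To make the reduction idea more concrete, here is a toy Python sketch (purely illustrative: the real implementation generates SQL that is executed by the backends, and `hash()` below just stands in for the hash/fingerprint functions of those engines). It computes, for every bucket, a global row checksum and one checksum per column block:

```
import hashlib
from collections import defaultdict

def sha1(text):
    return hashlib.sha1(text.encode('utf-8')).hexdigest()

def bucket_checksums(rows, gb_index, block_size=5, number_of_group_by=100000):
    """Reduce all the rows/columns into 1 'sline' checksum and N 'sblock' checksums per bucket."""
    buckets = defaultdict(list)
    for row in rows:  # row = list of column values, already cast to strings
        gb = hash(row[gb_index]) % number_of_group_by  # the "gb" bucket of this row
        blocks = [sha1("|".join(row[i:i + block_size]))  # block_0, block_1, ...
                  for i in range(0, len(row), block_size)]
        buckets[gb].append((sha1("|".join(blocks)), blocks))  # (row_sha, list of block shas)
    result = {}
    for gb, lines in buckets.items():
        lines.sort()  # fixed ordering of the rows inside a bucket
        sline = sha1("|".join(row_sha for row_sha, _ in lines))
        sblocks = [sha1("|".join(line_blocks[i] for _, line_blocks in lines))
                   for i in range(len(lines[0][1]))]
        result[gb] = (sline, sblocks)
    return result
```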
349 |
350 | The result of the above query is stored in a temporary table. The size of this table is small (because we take a modulo of the hash of the GroupBy column, and because the number of columns is limited).
351 | Because this amount of data is reasonable, we can afford to download those results locally, where our Python program is executed.
352 |
353 | In the 2 temporary tables, the first 2 columns (gb and sline) are the most important.
354 | "gb" is used as a kind of "index". "sline" is the total checksum that
355 | corresponds to all the columns of all the rows whose hashed GroupBy value modulo 100000 matches the "gb" value.
356 | If "sline" differs between the 2 tables for the same "gb" value, then we know that some rows in that "GroupBy-modulo block" don't match between the 2 tables.
357 | Then, the program queries the temporary tables again and asks for all the columns of the specific "gb" values that have a difference.
358 | The idea is to discover which "blocks of columns" (called "sblock_N" in the previous pseudo query) are different.
359 | By doing so, we are able to know not only which rows are different, but also in which columns those differences appear.
360 | This allows us to launch some final simple SELECT queries against the original tables, fetching those rows and those specific columns.
361 | Finally, those 2 small datasets are sorted and a diff between them is displayed in a web browser.
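
Continuing the toy sketch above (again just an illustration of the logic, not the actual code), finding the buckets and the column blocks in error is then a simple in-memory comparison of the 2 downloaded result dictionaries:

```
def find_differences(result_src, result_dst):
    """Return {bucket: [indexes of the column blocks that differ]} for the buckets whose 'sline' differs."""
    differences = {}
    for gb, (sline_src, sblocks_src) in result_src.items():
        sline_dst, sblocks_dst = result_dst[gb]  # the 'count' step already ensured that the buckets match
        if sline_src != sline_dst:
            differences[gb] = [i for i, (a, b) in enumerate(zip(sblocks_src, sblocks_dst)) if a != b]
    return differences
```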
362 |
363 | The pseudo SQL query shown above is quite heavy to compute, and has to access all the data of each table.
364 | In order to avoid launching such a heavy query in case of "stupid mistakes" (maybe your ETL flow failed and 1 table is empty), a quick comparison step has been introduced before it.
365 | In general, we can divide the program into 3 steps:
366 |
367 | 1. **Determination of the GroupBy column**
368 | In this first step, we try to assess which column is the most suitable to be used as a GroupBy column.
369 | It is important to find a column that has many different values (otherwise, the next queries will be poorly parallelized, and we can have some OOM problems).
370 | We also want to avoid skewed columns.
371 | To do so, the program fetches a sample of data from 1 table: some 10 000 rows and some 10 columns. We only look at 10 columns for 3 reasons: 1) we suppose that it is enough to find a good one there, 2) to limit the amount of data transferred over the network, 3) to avoid being billed too much (for instance, the price in BigQuery grows with the number of columns being read).
372 | Finally, an analysis is done on those columns: we discard all the columns that don't have enough diversity or that are too skewed, and from the remaining columns we keep the one that seems the least skewed (see the sketch after this list).
373 | Working on a sample has some limits and it is possible that the chosen column is not the best one. It is also possible that all the 10 columns are discarded.
374 | In both situations you can use the `--group-by-column` option to overcome that problem.
375 |
376 | 2. **Quick count check**
377 | The program does a first validation to quickly check, in a lightweight way, if there are some big differences between the 2 tables.
378 | This is why we just try to count the number of rows in the tables. Rather than only counting the total number of rows, we check the number of rows for each "row-bucket", that is: the group of rows whose GroupBy value has the same hash modulo 100000.
379 | This verification is fast because it only needs to do some small computation on just 1 column. There is indeed no point in doing some very complex computation on all the columns if the numbers of rows do not match.
380 | In such a case, the differences are shown to the user and the program stops, without executing the 3rd step.
381 | During this step, a skew detection is also performed, which is a protection for the 3rd step.
382 |
383 | 3. **Full SHA1 validation**
384 | This step, described earlier in this paragraph, is the only one that can guarantee that the 2 tables are identical.
385 |
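The sketch below (illustrative Python only, with simplified criteria compared to `find_best_distributed_column` in the source) shows the spirit of step 1: count the frequencies of the sampled values of each candidate column, discard the columns whose most frequent value exceeds a threshold, and keep the column whose top values weigh the least:

```
from collections import Counter

def choose_groupby_column(sample_columns, max_frequent_number, top_n=50):
    """sample_columns: {column_name: list of sampled values}. Return the chosen column, or None."""
    best_column, best_weight = None, None
    for name, values in sample_columns.items():
        counter = Counter(values)
        if counter.most_common(1)[0][1] > max_frequent_number:
            continue  # too skewed: a single value dominates the sample
        weight = sum(count for _, count in counter.most_common(top_n))
        if best_weight is None or weight < best_weight:
            best_column, best_weight = name, weight
    return best_column
```
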
386 | ### Imprecision due to "float" or "double" types
387 |
388 | The representations of decimal numbers are not always exact, and small differences in their representations can sometimes be observed between Hive and BigQuery, meaning that the corresponding SHA1 values will be totally different.
389 | For instance, the following query in BigQuery returns `8.5400000000000009`:
390 | ```
391 | select cast( cast( 8.540000000000001 as FLOAT64 ) as STRING)
392 | ```
393 | And in Hive we would get `8.540000000000001`.
394 |
395 | To overcome those problems, it was decided to:
396 | * multiply any float or double number by 10000 (most of the decimal numbers managed in Bol.com are 'price numbers' with just 2 digits, so multiplying by 10000 should give enough precision).
397 | * truncate the resulting number with floor() in order to get rid of any "decimal imprecision"
398 | * cast the result into an integer (otherwise the previous number may or may not end up with a '.0', depending on whether it is Hive or BigQuery), as sketched below
399 |
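As a quick illustration (a Python sketch of the arithmetic only; the real transformation is done inside the generated SQL queries), both values shown above are normalized to the same integer:

```
import math

def normalize_decimal(value, precision=10000):
    """Multiply by 10000, apply floor() and cast to an integer to neutralize small representation differences."""
    return int(math.floor(value * precision))

print(normalize_decimal(8.540000000000001))   # 85400
print(normalize_decimal(8.5400000000000009))  # 85400
```
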
400 | The above approach works in many cases but we understand that it is not a general solution (for instance, no distinction would be made between '0.0000001' and '0').
401 | We may thus need in the future to add other possibilities to solve those imprecision problems.
402 |
--------------------------------------------------------------------------------
/hive_compared_bq/hive_compared_bq.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | Copyright 2017 bol.com. All Rights Reserved
4 |
5 |
6 | Licensed under the Apache License, Version 2.0 (the "License");
7 | you may not use this file except in compliance with the License.
8 | You may obtain a copy of the License at
9 |
10 | http://www.apache.org/licenses/LICENSE-2.0
11 |
12 | Unless required by applicable law or agreed to in writing, software
13 | distributed under the License is distributed on an "AS IS" BASIS,
14 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | See the License for the specific language governing permissions and
16 | limitations under the License.
17 | """
18 |
19 | import argparse
20 | import ast
21 | import logging
22 | import threading
23 | import difflib
24 | import re
25 | import sys
26 | import webbrowser
27 | from abc import ABCMeta, abstractmethod
28 |
29 | if sys.version_info[0:2] == (2, 6):
30 | # noinspection PyUnresolvedReferences
31 | from backport_collections import Counter
32 | else:
33 | from collections import Counter
34 |
35 | ABC = ABCMeta('ABC', (object,), {}) # compatible with Python 2 *and* 3
36 |
37 |
38 | class _Table(ABC):
39 | """Represent an abstract table that contains database connection and the related SQL executions
40 |
41 | :type database: str
42 | :param database: Name of the database
43 |
44 | :type table: str
45 | :param table: Name of the table
46 |
47 | :type tc: :class:`TableComparator`
48 | :param tc: reference to the parent object, in order to access the configuration
49 |
50 | :type column_range: str
51 | :param column_range: Python array range that represents the range of columns in the DDL that we want to compare
52 |
53 | :type chosen_columns: str
54 | :param chosen_columns: list of the (str) columns we want to focus on
55 |
56 | :type ignore_columns: str
57 | :param ignore_columns: list of the (str) columns we want to ignore
58 |
59 | :type decodeCP1252_columns: str
60 | :param decodeCP1252_columns: list of the (str) columns that are encoded in CP1252 and we want to transform into
61 | Google's UTF8 encoding
62 |
63 | :type where_condition: str
64 | :param where_condition: boolean SQL condition that will be added to the WHERE condition of all the queries for that
65 | table, so that we can restrict the analysis to a specific scope. Also quite useful when
66 | we want to work on specific partitions.
67 |
68 | :type full_name: str
69 |     :param full_name: the full name of the table: <database>.<table>
70 | """
71 |
72 | __metaclass__ = ABCMeta
73 |
74 | def __init__(self, database, table, parent, *args, **kwargs):
75 | self.database = database
76 | self.table = table
77 | self.tc = parent
78 | self.column_range = ":"
79 | self.chosen_columns = None
80 | self.ignore_columns = None
81 | self.decodeCP1252_columns = []
82 | self.where_condition = None
83 | self.full_name = self.database + '.' + self.table
84 | self._ddl_columns = [] # array instead of dictionary because we want to maintain the order of the columns
85 | self._ddl_partitions = [] # take care, those rows also appear in the columns array
86 | self._group_by_column = None # the column that is used to "bucket" the rows
87 |
88 | @staticmethod
89 | def check_stdin_options(typedb, stdin_options, allowed_options, compulsory_options):
90 | """Validate the options entered, given those allowed and those compulsory
91 |
92 | :type typedb: str
93 | :param typedb: the type of the database ({hive,bq})
94 |
95 | :type stdin_options: str
96 | :param stdin_options: the options entered on command line for this table, given in a Python dictionary format
97 |
98 | :type allowed_options: list of str
99 | :param allowed_options: the list of options that can be used for this table
100 |
101 | :type compulsory_options: dict
102 | :param compulsory_options: dictionary where the keys are the name of the compulsory options, and the value
103 | is a small description of it
104 |
105 | :rtype: dict
106 | :return: a dictionary of the options entered by the user
107 |
108 | :raises: ValueError if the option entered is invalid
109 | """
110 | hash_options = {}
111 | if stdin_options is not None:
112 | try:
113 | hash_options = ast.literal_eval(stdin_options)
114 | except:
115 | raise ValueError("The option must be in a Python dictionary format (just like: {'jar': "
116 | "'hdfs://hdp/user/sluangsay/lib/hcbq.jar', 'hs2': 'master-003.bol.net'}\nThe value "
117 | "received was: %s" % stdin_options)
118 |
119 | for key in hash_options:
120 | if key not in allowed_options:
121 | raise ValueError("The following option for %s is not supported: %s\nThe only supported options"
122 | " are: %s" % (typedb, key, allowed_options))
123 |
124 | # Needs to be checked even if the user has given no option
125 | for key in compulsory_options:
126 | if key not in hash_options:
127 | raise ValueError("%s option (%s) must be defined for %s tables" % (key, compulsory_options[key],
128 | typedb))
129 |
130 | return hash_options
131 |
132 | @staticmethod
133 | def create_table_from_string(argument, options, table_comparator):
134 | """Parse an argument (usually received on the command line) and return the corresponding Table object
135 |
136 | :type argument: str
137 |         :param argument: description of the table to connect to. Must have the format <type>/<database>.<table>.
138 | type can be {hive,bq}
139 |
140 | :type options: str
141 | :param options: the dictionary of all the options for this table connection. Could be for instance:
142 | "{'jar': 'hdfs://hdp/user/sluangsay/lib/hcbq.jar', 'hs2': 'master-003.bol.net'}"
143 |
144 | :type table_comparator: :class:`TableComparator`
145 | :param table_comparator: the TableComparator parent object, to have a reference to the configuration properties
146 |
147 | :rtype: :class:`_Table`
148 | :returns: the _Table object that corresponds to the argument
149 |
150 | :raises: ValueError if the argument is void or does not match the format
151 | """
152 | match = re.match(r'(\w+)/(\w+)\.(\w+)', argument)
153 | if match is None:
154 |             raise ValueError("Table description must follow the following format: '<type>/<database>.<table>'")
155 |
156 | typedb = match.group(1)
157 | database = match.group(2)
158 | table = match.group(3)
159 |
160 | if typedb == "bq":
161 | hash_options = _Table.check_stdin_options(typedb, options, ["project"], {})
162 | from bq import TBigQuery
163 | return TBigQuery(database, table, table_comparator, hash_options.get('project'))
164 | elif typedb == "hive":
165 | hash_options = _Table.check_stdin_options(typedb, options, ["jar", "hs2"], {'hs2': 'Hive Server2 hostname'})
166 | from hive import THive
167 | return THive(database, table, table_comparator, hash_options['hs2'], hash_options.get('jar'))
168 | else:
169 | raise ValueError("The database type %s is currently not supported" % typedb)
170 |
171 | @abstractmethod
172 | def get_type(self):
173 | """Return the (string) type of the database (Hive, BigQuery)"""
174 | pass
175 |
176 | def get_id_string(self):
177 | """Return a string that fully identifies the table"""
178 | return self.get_type() + "_" + self.full_name
179 |
180 | def set_where_condition(self, where):
181 | """the WHERE condition we want to apply for the table. Could be useful in case of partitioned tables
182 |
183 | :type where: str
184 | :param where: the WHERE condition. Example: datedir="2017-05-01"
185 | """
186 | self.where_condition = where
187 |
188 | def set_column_range(self, column_range):
189 | self.column_range = column_range
190 |
191 | def set_chosen_columns(self, cols):
192 | self.chosen_columns = cols.split(",")
193 |
194 | def set_ignore_columns(self, cols):
195 | self.ignore_columns = cols.split(",")
196 |
197 | def set_decode_cp1252_columns(self, cols):
198 | self.decodeCP1252_columns = cols.split(",")
199 |
200 | def set_group_by_column(self, col):
201 | self._group_by_column = col
202 |
203 | @abstractmethod
204 | def get_ddl_columns(self):
205 | """ Return the columns of this table
206 |
207 | The list of the column is an attribute of the class. If it already exists, then it is directly returned.
208 | Otherwise, a connection is made to the database to get the schema of the table and at the same time the
209 | attribute (list) partition is filled.
210 |
211 | :rtype: list of dict
212 | :returns: list of {"name": "type"} dictionaries that represent the columns of this table
213 | """
214 | pass
215 |
216 | def get_groupby_column(self):
217 | """Return a column that seems to have a good distribution in order to do interesting GROUP BY queries with it
218 |
219 | This _group_by_column is an attribute of the class. If it already exists, then it is directly returned.
220 | Otherwise, a small query to get some sample rows on some few columns is performed, in order to evaluate
221 | which of those columns present the best distribution (we want a column that will have as many Group By values
222 |         as possible, and that avoids skew a bit, so that when we detect a specific difference on a bucket, we will
223 |         be able to show a reasonably small number of lines for this bucket). The found column is then saved as the attribute
224 | and returned.
225 |
226 | :rtype: str
227 | :returns: the column that will be used in the Group By
228 | """
229 | if self._group_by_column is not None:
230 | return self._group_by_column
231 |
232 | query, selected_columns = self.get_sample_query()
233 |
234 | # Get a sample from the table and fill Counters to each column
235 | logging.info("Analyzing the columns %s with a sample of %i values", str([x["name"] for x in selected_columns]),
236 | self.tc.sample_rows_number)
237 | for col in selected_columns:
238 | col["Counter"] = Counter() # col: {"name","type"} dictionary. New "counter" key is to track distribution
239 |
240 | self.get_column_statistics(query, selected_columns)
241 | self.find_best_distributed_column(selected_columns)
242 |
243 | return self._group_by_column
244 |
245 | @abstractmethod
246 | def get_column_statistics(self, query, selected_columns):
247 | """Launch the sample query and register the distribution of the selected_columns in the "Counters"
248 |
249 | :type query: str
250 | :param query: SQL query that gets a sample of rows with only the selected columns
251 |
252 | :type selected_columns: list of dict
253 | :param selected_columns: list of the few columns selected in the sample query. It has the format:
254 | {"name": col_name, "type": col_type, "Counter": frequency_of_values}
255 | """
256 | pass
257 |
258 | def find_best_distributed_column(self, selected_columns):
259 | """Look at the statistics from the sample to estimate which column has the best distribution to do a GROUP BY
260 |
261 | The best column is automatically saved in the attribute _group_by_column. If all the selected columns have a
262 | poor distribution, then we exit with error.
263 |         We say that we have a poor distribution if the frequency of the most frequent value of a column is higher than
264 | max_frequent_number.
265 |         Then, the best column is the one that does not have a poor distribution, and whose sum of occurrences of the 50 most
266 | frequent values is minimal.
267 |
268 | :type selected_columns: list of dict
269 | :param selected_columns: list of the few columns selected in the sample query. It has the format:
270 | {"name": col_name, "type": col_type, "Counter": frequency_of_values}
271 | """
272 | max_frequent_number = self.tc.sample_rows_number * self.tc.max_percent_most_frequent_value_in_column // 100
273 | current_lowest_weight = sys.maxint
274 | highest_first = max_frequent_number
275 |
276 | for col in selected_columns:
277 | highest = col["Counter"].most_common(1)[0]
278 | value_most_popular = highest[0]
279 | frequency_most_popular_value = highest[1]
280 | if frequency_most_popular_value > max_frequent_number:
281 | logging.debug(
282 | "Discarding column '%s' because '%s' was found in sample %i times (higher than limit of %i)",
283 | col["name"], value_most_popular, frequency_most_popular_value, max_frequent_number)
284 | continue
285 | # The biggest value is not too high, so let's see how big are the 50 biggest values
286 | weight_of_most_frequent_values = sum([x[1] for x in col["Counter"]
287 | .most_common(self.tc.number_of_most_frequent_values_to_weight)])
288 | logging.debug("%s: sum up of the %i most frequent occurrence: %i", col["name"],
289 | self.tc.number_of_most_frequent_values_to_weight, weight_of_most_frequent_values)
290 | if weight_of_most_frequent_values < current_lowest_weight:
291 | self._group_by_column = col["name"] # we save here this potential "best group-by" column
292 | current_lowest_weight = weight_of_most_frequent_values
293 | highest_first = frequency_most_popular_value
294 |
295 | if self._group_by_column is None:
296 | sys.exit("Error: we could not find a suitable column to do a Group By. Either relax the selection condition"
297 | " with the '--max-gb-percent' option or directly select the column with '--group-by-column' ")
298 |
299 | logging.info("Best column to do a GROUP BY is %s (occurrences of most frequent value: %i / the %i most frequent"
300 | "values sum up %i occurrences)", self._group_by_column, highest_first,
301 | self.tc.number_of_most_frequent_values_to_weight, current_lowest_weight)
302 |
303 | def filter_columns_from_cli(self, all_columns):
304 | """Filter the columns received from the table schema with the options given by the user, and save this result
305 |
306 | :type all_columns: list of dict
307 | :param all_columns: each column in the table is represented by an element in the list, with the following
308 | format: {"name": col_name, "type": col_type}
309 | """
310 | if self.column_range == ":":
311 | if self.chosen_columns is not None: # user has declared the columns he wants to analyze
312 | leftover = list(self.chosen_columns)
313 | for col in all_columns:
314 | if col['name'] in leftover:
315 | leftover.remove(col['name'])
316 | self._ddl_columns.append(col)
317 | if len(leftover) > 0:
318 | sys.exit("Error: you asked to analyze the columns %s but we could not find them in the table %s"
319 | % (str(leftover), self.get_id_string()))
320 | else:
321 | self._ddl_columns = all_columns
322 | else: # user has requested a specific range of columns
323 | match = re.match(r'(\d*):(\d*)', self.column_range)
324 | if match is None:
325 |                 raise ValueError("The column range must follow the Python style '1:9'. You gave: %s" % self.column_range)
326 |
327 | start = 0
328 | if len(match.group(1)) > 0:
329 | start = int(match.group(1))
330 |
331 | end = len(all_columns)
332 | if len(match.group(2)) > 0:
333 | end = int(match.group(2))
334 |
335 | self._ddl_columns = all_columns[start:end]
336 | logging.debug("The range of columns has been reduced to: %s", self._ddl_columns)
337 |
338 | # Even if the user requested just a specific range of columns, he still has the possibility to remove some
339 | # columns in that range:
340 | if self.ignore_columns is not None:
341 | all_columns = []
342 | for col in self._ddl_columns:
343 | if not col['name'] in self.ignore_columns:
344 | all_columns.append(col)
345 | self._ddl_columns = list(all_columns)
346 |
347 | @abstractmethod
348 | def create_sql_groupby_count(self):
349 | """ Return a SQL query where we count the number of rows for each Group of hash() on the groupby_column
350 |
351 | The column found in get_groupby_column() is used to do a Group By and count the number of rows for each group.
352 | But in order to reduce the number of Groups, we hash the value of this column, and we take the modulo
353 | "number_of_group_by" of this hash value.
354 |
355 | :rtype: str
356 | :returns: SQL query to do the Group By Count
357 | """
358 | pass
359 |
360 | @abstractmethod
361 | def create_sql_show_bucket_columns(self, extra_columns_str, buckets_values):
362 | """ Return a SQL query that shows the rows that match some specific buckets and some given columns
363 |
364 | :type extra_columns_str: str
365 | :param extra_columns_str: the list of extra columns (separated by ",") we want to show to help debugging
366 |
367 | :type buckets_values: str
368 | :param buckets_values: the list of values (separated by ",") of the buckets we want to fetch
369 |
370 | :rtype: str
371 | :returns: SQL query to do the Group By Buckets
372 | """
373 | pass
374 |
375 | @abstractmethod
376 | def create_sql_intermediate_checksums(self):
377 | """Build and return the query that generates all the checksums to make the final comparison
378 |
379 | The query will have the following schema:
380 |
381 | WITH blocks AS (
382 | SELECT MOD( hash2( column), 100000) as gb, sha1(concat( col0, col1, col2, col3, col4)) as block_0,
383 | sha1(concat( col5, col6, col7, col8, col9)) as block_1, ... as block_N FROM table
384 | ),
385 | full_lines AS (
386 | SELECT gb, sha1(concat( block_0, |, block_1...) as row_sha, block_0, block_1 ... FROM blocks
387 | )
388 | SELECT gb, sha1(concat(list)) as sline, sha1(concat(list)) as sblock_1,
389 | sha1(concat(list)) as sblock_2 ... as sblock_N FROM GROUP BY gb
390 |
391 | :rtype: str
392 | :returns: the SQL query with the Group By and the shas
393 | """
394 | pass
395 |
396 | @abstractmethod
397 | def delete_temporary_table(self, table_name):
398 | """Drop the temporary table if needed (if it is not automatically deleted by the system)
399 |
400 | :type table_name: str
401 | :param table_name: name of the table to delete
402 | """
403 | pass
404 |
405 | @abstractmethod
406 | def launch_query_dict_result(self, query, result_dic, all_columns_from_2=False):
407 | """Launch the SQL query and stores the results of the 1st and 2nd columns in the dictionary
408 |
409 | The 1st column of each row is stored as the key of the dictionary, the 2nd column is for the value. This method
410 | is meant to catch the result of a Group By query, so that we are sure that the keys fetched are unique.
411 |
412 | :type query: str
413 | :param query: query to execute
414 |
415 | :type result_dic: dict
416 | :param result_dic: dictionary to store the result
417 |
418 | :type all_columns_from_2: bool
419 | :param all_columns_from_2: True if we want to fetch all the columns of the row, starting from the 2nd one. False
420 | if we want the value of the dictionary to only have the 1st column (take care:
421 | columns start counting with 0).
422 | (default: False)
423 | """
424 | pass
425 |
426 | @abstractmethod
427 | def launch_query_csv_compare_result(self, query, rows):
428 | """Launch the SQL query and stores the rows in an array with some kind of CSV formatting
429 |
430 | The only reason for the "CSV formatting" (separation of columns with "|") is to help in comparing the rows with
431 | the difflib library. This is also the reason why all the lines start with "^" and end with "$"
432 |
433 | :type query: str
434 | :param query: query to execute
435 |
436 | :type rows: list of str
437 | :param rows: the (void) array that will store the rows
438 | """
439 | pass
440 |
441 | @abstractmethod
442 | def launch_query_with_intermediate_table(self, query, result):
443 | """Launch the query, stores the results in a temporary table and put the first 2 columns in a dictionary
444 |
445 |         This method is used to compute a lot of checksums and thus is a bit heavy. This is why we store
446 |         all those detailed results in a temporary table. We then present a summary of the returned rows by storing the
447 | first 2 columns in a dictionary under the ``result`` dictionary.
448 |
449 | :type query: str
450 | :param query: query to execute
451 |
452 | :type result: dict
453 | :param result: dictionary to store the result
454 | """
455 |
456 | def get_sample_query(self):
457 | """ Build a SQL query to get some sample lines with limited amount of columns
458 |
459 | We limit the number of columns to a small number (ex: 10) because it is usually unnecessary to look at all
460 | the columns in order to find one with a good distribution (usually, the "index" will be in the first columns).
461 | What is more, in BigQuery we are billed by the number of columns we read so we need to avoid reading
462 |         hundreds of columns, as can happen with big tables.
463 |
464 | :rtype: tuple
465 | :returns: ``(query, selected_columns)``, where ``query`` is the sample SQL query; ``selected_columns`` is the
466 | list of columns that are fetched
467 | """
468 | query = "SELECT"
469 | selected_columns = self.get_ddl_columns()[:self.tc.sample_column_number]
470 | for col in selected_columns:
471 | query += " %s," % col["name"] # for the last column we'll remove that trailing ","
472 | where_condition = ""
473 | if self.where_condition is not None:
474 | where_condition = "WHERE " + self.where_condition
475 | query = query[:-1] + " FROM %s %s LIMIT %i" % (self.full_name, where_condition, self.tc.sample_rows_number)
476 | return query, selected_columns
477 |
478 | def get_column_blocks(self, ddl):
479 | """Returns the list of a column blocks for a specific DDL (see function create_sql_intermediate_checksums)
480 |
481 | :type ddl: list of dict
482 | :param ddl: the ddl of the tables, containing dictionaries with keys (name, type) to describe each column
483 |
484 | :rtype: list of list
485 | :returns: list of each block, each one containing the (5) columns dictionary ({name, type}) that describe it
486 | """
487 | column_blocks = []
488 | for idx, col in enumerate(ddl):
489 | block_id = idx // self.tc.block_size
490 | if idx % self.tc.block_size == 0:
491 | column_blocks.append([])
492 | column_blocks[block_id].append({"name": col["name"], "type": col["type"]})
493 | return column_blocks
494 |
495 |
496 | class TableComparator(object):
497 | """Represent the general configuration of the program (tables names, number of rows to scan...) """
498 |
499 | def __init__(self):
500 | self.tsrc = None
501 | self.tdst = None
502 |
503 | # 10 000 rows should be good enough to use as a sample
504 | # (Google uses some sample of 1000 or 3000 rows)
505 | # Estimation of memory for the Counters: 10000 * 10 * ( 10 * 4 + 4) * 4 = 16.8 MB
506 | # (based on estimation : rows * columns * ( size string + int) * overhead Counter )
507 | self.sample_rows_number = 10000
508 | self.sample_column_number = 10
509 | self.max_percent_most_frequent_value_in_column = None
510 | self.number_of_most_frequent_values_to_weight = 50
511 |
512 | self.number_of_group_by = 100000 # 7999 is the limit if you want to manually download the data from BQ. This
513 | # limit does not apply in this script because we fetch the data with the Python API instead.
514 | # TODO ideally this limit would be dynamic (counting the total rows), in order to get an average of 7
515 | # lines per bucket
516 | self.skew_threshold = 40000 # if we detect that a Group By has more than this amount of rows
517 | # (compare_groupby_count() method), then we raise an exception, because the computation of the shas (which is
518 | # computationally expensive) might suffer a lot from this skew, and might also trigger some OOM exception.
519 | # 40 000 seems a safe number:
520 | # Let's consider a table with many column: 1000 columns. That means 201 sha blocks (1000 / 5 + 1)
521 | # a sha is 29 characters
522 | # so the max memory with a skew of 40 000 would be:
523 | # 201 * 40000 * 29 / 1024 /1024 = 222 MB, which should fit into the Heap of a task process
524 |         self.block_size = 5  # 5 columns means that when we want to debug we have enough context. But it is small enough to
525 | # avoid being charged too much by Google when querying on it
526 | reload(sys)
527 | # below method really exists (don't know why PyCharm cannot see it) and is really needed
528 | # noinspection PyUnresolvedReferences
529 | sys.setdefaultencoding('utf-8')
530 |
531 | def set_tsrc(self, table):
532 | """Set the source table to be compared
533 |
534 | :type table: :class:`_Table`
535 | :param table: the _Table object
536 | """
537 | self.tsrc = table
538 |
539 | def set_tdst(self, table):
540 | """Set the destination table to be compared
541 |
542 | :type table: :class:`_Table`
543 | :param table: the _Table object
544 | """
545 | self.tdst = table
546 |
547 | def set_skew_threshold(self, threshold):
548 | """Set the threshold value for the skew
549 |
550 | :type threshold: int
551 | :param threshold: the threshold value (default: 40 000)
552 | """
553 | self.skew_threshold = threshold
554 |
555 | def set_max_percent_most_frequent_value_in_column(self, percent):
556 | """Set the max_percent_most_frequent_value_in_column value
557 |
558 | If in one sample a column has a value whose frequency is higher than this percentage, then this column is not
559 | considered as a suitable column to do the Group By
560 |
561 | :type percent: float
562 | :param percent: the percentage value
563 | """
564 | self.max_percent_most_frequent_value_in_column = percent
565 |
566 | def compare_groupby_count(self):
567 | """Runs a light query on Hive and BigQuery to check if the counts match, using the ideal column estimated before
568 |
569 | Some skew detection is also performed here. And the program stops if we detect some important skew and no
570 | difference was detected (meaning that the sha computation would have been performed with that skew).
571 |
572 | :rtype: tuple
573 | :returns: ``(summary_differences, big_small_bucket)``, where ``summary_differences`` is a list of tuples,
574 | one per difference containing (groupByValue, number of differences for this bucket, count of rows
575 | for this bucket for the "biggest table"); ``big_small_bucket`` is a tuple containing the table that
576 | has the biggest distribution (according to the Group By column) and then the other table
577 | """
578 | logging.info("Executing the 'Group By' Count queries for %s (%s) and %s (%s) to do first comparison",
579 | self.tsrc.full_name, self.tsrc.get_type(), self.tdst.full_name, self.tdst.get_type())
580 | src_query = self.tsrc.create_sql_groupby_count()
581 | dst_query = self.tdst.create_sql_groupby_count()
582 |
583 | result = {"src_count_dict": {}, "dst_count_dict": {}}
584 | t_src = threading.Thread(name='srcGroupBy-' + self.tsrc.get_type(), target=self.tsrc.launch_query_dict_result,
585 | args=(src_query, result["src_count_dict"]))
586 | t_dst = threading.Thread(name='dstGroupBy-' + self.tdst.get_type(), target=self.tdst.launch_query_dict_result,
587 | args=(dst_query, result["dst_count_dict"]))
588 | t_src.start()
589 | t_dst.start()
590 | t_src.join()
591 | t_dst.join()
592 |
593 | for k in result:
594 | if 'error' in result[k]:
595 | sys.exit(result[k]["error"])
596 |
597 | # #### Let's compare the count between the 2 Group By queries
598 |         # iterate on biggest dictionary so that we're sure to see a difference if there is one
599 | logging.debug("Searching differences in Group By")
600 | if len(result["src_count_dict"]) > len(result["dst_count_dict"]):
601 | big_dict = result["src_count_dict"]
602 | small_dict = result["dst_count_dict"]
603 | big_small_bucket = (self.tsrc, self.tdst)
604 | else:
605 | big_dict = result["dst_count_dict"]
606 | small_dict = result["src_count_dict"]
607 | big_small_bucket = (self.tdst, self.tsrc)
608 |
609 | differences = Counter()
610 | skew = Counter()
611 | for (k, v) in big_dict.iteritems():
612 | if k not in small_dict:
613 | differences[k] = -v # we want to see the differences where we have less lines to compare
614 | elif v != small_dict[k]:
615 | differences[k] = - abs(v - small_dict[k])
616 | # we check the skew even if some differences were found above and we will never enter the sha computation,
617 | # so that the developer can fix at early stage
618 | max_value = max(v, small_dict.get(k))
619 | if max_value > self.skew_threshold:
620 | skew[k] = max_value
621 | summary_differences = [(k, -v, big_dict[k]) for (k, v) in differences.most_common()]
622 | if len(skew) > 0:
623 | logging.warning("Some important skew (threshold: %i) was detected in the Group By column %s. The top values"
624 | " are: %s", self.skew_threshold, self.tsrc.get_groupby_column(), str(skew.most_common(10)))
625 | if len(summary_differences) == 0:
626 | sys.exit("No difference in Group By count was detected but we saw some important skew that could make "
627 | "the next step (comparison of the shas) very slow or failing. So better stopping now. You "
628 | "should consider choosing another Group By column with the '--group-by-column' option")
629 |
630 | if len(summary_differences) != 0:
631 | logging.info("We found at least %i differences in Group By count", len(summary_differences))
632 | logging.debug("Differences in Group By count are: %s", summary_differences[:300])
633 |
634 | return summary_differences, big_small_bucket
635 |
636 | def show_results_count(self, summary_differences, big_small_bucket):
637 | """If any differences found in the Count Group By step, then show them in a webpage
638 |
639 | :type summary_differences: list of tuple
640 | :param summary_differences: list of all the differences where each difference is described as: (groupByValue,
641 | number of differences for this bucket, count of rows for this bucket for the "biggest table")
642 |
643 | :type big_small_bucket: tuple
644 | :param big_small_bucket: tuple containing the table that has the biggest distribution (according to the Group By
645 | column) and then the other table
646 | """
647 | # TODO break/refactor a bit this function in the same way it has been done for the sha part
648 |
649 | # We want to return at most 6 blocks of lines corresponding to different group by values. For the sake of
650 |         # brevity, each block should not show more than 70 lines. Blocks that show rows that appear in only 1 table
651 | # should be limited to 3 (so that we can see "context" when debugging). To also give context, we will show some
652 | # few other columns.
653 | number_buckets_only_one_table = 0
654 | number_buckets_found = 0
655 | buckets_bigtable = []
656 | buckets_smalltable = []
657 | bigtable = big_small_bucket[0]
658 | smalltable = big_small_bucket[1]
659 | for (bucket, difference_num, biggest_num) in summary_differences:
660 | if difference_num > 70:
661 | break # since the Counter was ordered from small differences to biggest, we know that this difference
662 | # number can only increase. So let's go out of the loop
663 | if biggest_num > 70:
664 | continue
665 | if difference_num == biggest_num:
666 | if number_buckets_only_one_table == 3:
667 | continue
668 | else:
669 | number_buckets_only_one_table += 1
670 | number_buckets_found += 1
671 | if number_buckets_found == 6:
672 | break
673 | buckets_bigtable.append(bucket)
674 | else:
675 | buckets_bigtable.append(bucket)
676 | buckets_smalltable.append(bucket)
677 | number_buckets_found += 1
678 | if number_buckets_found == 6:
679 | break
680 | if len(buckets_bigtable) == 0: # let's ensure that we have at least 1 value to show
681 | (bucket, difference_num, biggest_num) = summary_differences[0]
682 | buckets_bigtable.append(bucket)
683 | if difference_num != biggest_num:
684 | buckets_smalltable.append(bucket)
685 | logging.debug("Buckets for %s: %s \t\tBuckets for %s: %s", bigtable.full_name, str(buckets_bigtable),
686 | smalltable.full_name, str(buckets_smalltable))
687 |
688 | gb_column = self.tsrc.get_groupby_column()
689 | extra_columns = [x["name"] for x in self.tsrc.get_ddl_columns()[:6]] # add 5 extra columns to see some context
690 | if gb_column in extra_columns:
691 | extra_columns.remove(gb_column)
692 | elif len(extra_columns) == 6:
693 | extra_columns = extra_columns[:-1] # limit to 5 columns
694 | extra_columns_str = str(extra_columns)[1:-1].replace("'", "")
695 | bigtable_query = bigtable.create_sql_show_bucket_columns(extra_columns_str, str(buckets_bigtable)[1:-1])
696 |
697 | result = {"big_rows": [], "small_rows": []}
698 | t_big = threading.Thread(name='bigShowCountDifferences-' + bigtable.get_type(),
699 | target=bigtable.launch_query_csv_compare_result,
700 | args=(bigtable_query, result["big_rows"]))
701 | t_big.start()
702 |
703 | if len(buckets_smalltable) > 0: # in case 0, then it means that the "smalltable" does not contain any of
704 | # the rows that appear in the "bigtable". In such case, there is no need to launch the query
705 | smalltable_query = smalltable.create_sql_show_bucket_columns(extra_columns_str,
706 | str(buckets_smalltable)[1:-1])
707 | t_small = threading.Thread(name='smallShowCountDifferences-' + smalltable.get_type(),
708 | target=smalltable.launch_query_csv_compare_result,
709 | args=(smalltable_query, result["small_rows"]))
710 | t_small.start()
711 | t_small.join()
712 | t_big.join()
713 |
714 | sorted_file = {}
715 | for instance in ("big_rows", "small_rows"):
716 | result[instance].sort()
717 | sorted_file[instance] = "/tmp/count_diff_" + instance
718 | with open(sorted_file[instance], "w") as f:
719 | f.write("\n".join(result[instance]))
720 |
721 | column_description = "hash(%s) , %s , %s" \
722 | % (bigtable.get_groupby_column(), bigtable.get_groupby_column(), extra_columns_str)
723 | diff_string = difflib.HtmlDiff().make_file(result["big_rows"], result["small_rows"], bigtable.get_id_string() +
724 | column_description, smalltable.get_id_string() + column_description,
725 | context=False, numlines=30)
726 | html_file = "/tmp/count_diff.html"
727 | with open(html_file, "w") as f:
728 | f.write(diff_string)
729 | logging.debug("Sorted results of the queries are in the files %s and %s. HTML differences are in %s",
730 | sorted_file["big_rows"], sorted_file["small_rows"], html_file)
731 | webbrowser.open("file://" + html_file, new=2)
732 |
733 | def compare_shas(self):
734 | """Runs the final queries on Hive and BigQuery to check if the checksum match and return the list of differences
735 |
736 | :rtype: tuple
737 | :returns: ``(list_differences, names_sha_tables, tables_to_clean)``, where ``list_differences`` is the list of
738 | Group By values which presents different row checksums; ``names_sha_tables`` is a dictionary that
739 | contains the names of the "temporary" result tables; ``tables_to_clean`` is a dictionary of the
740 | temporary tables we will want to remove at the end of the process
741 | """
742 | logging.info("Executing the 'shas' queries for %s and %s to do final comparison",
743 | self.tsrc.get_id_string(), self.tdst.get_id_string())
744 |
745 | tsrc_query = self.tsrc.create_sql_intermediate_checksums()
746 | tdst_query = self.tdst.create_sql_intermediate_checksums()
747 |
748 |         # "cleaning" is for all the tables that will need to be eventually deleted. It must contain tuples (<name
749 |         # of the table>, corresponding _Table object). "names_sha_tables" contains all the temporary tables generated
750 | # even the BigQuery cached table that does not need to be deleted. "sha_dictionaries" contains the results.
751 | result = {"cleaning": [], "names_sha_tables": {}, "sha_dictionaries": {
752 | self.tsrc.get_id_string(): {},
753 | self.tdst.get_id_string(): {}
754 | }}
755 | t_src = threading.Thread(name='shaBy-' + self.tsrc.get_id_string(),
756 | target=self.tsrc.launch_query_with_intermediate_table,
757 | args=(tsrc_query, result))
758 | t_dst = threading.Thread(name='shaBy-' + self.tdst.get_id_string(),
759 | target=self.tdst.launch_query_with_intermediate_table,
760 | args=(tdst_query, result))
761 | t_src.start()
762 | t_dst.start()
763 | t_src.join()
764 | t_dst.join()
765 |
766 | if "error" in result:
767 | for table_name, table_object in result["cleaning"]:
768 | table_object.delete_temporary_table(table_name)
769 | sys.exit(result["error"])
770 |
771 | # Comparing the results of those dictionaries
772 | logging.debug("Searching differences in Shas")
773 | src_num_gb = len(result["sha_dictionaries"][self.tsrc.get_id_string()])
774 | dst_num_gb = len(result["sha_dictionaries"][self.tdst.get_id_string()])
775 | if not src_num_gb == dst_num_gb:
776 | sys.exit("The number of Group By values is not the same when doing the final sha queries (%s: %i - "
777 | "%s: %i).\nMake sure to first execute the 'count' verification step!"
778 | % (self.tsrc.get_id_string(), src_num_gb, self.tdst.get_id_string(), dst_num_gb))
779 |
780 | list_differences = []
781 | for (k, v) in result["sha_dictionaries"][self.tdst.get_id_string()].iteritems():
782 | if k not in result["sha_dictionaries"][self.tsrc.get_id_string()]:
783 | sys.exit("The Group By value %s appears in %s but not in %s.\nMake sure to first execute the "
784 | "'count' verification step!" % (k, self.tdst.get_id_string(), self.tsrc.get_id_string()))
785 | elif v != result["sha_dictionaries"][self.tsrc.get_id_string()][k]:
786 | list_differences.append(k)
787 |
788 | if len(list_differences) != 0:
789 | logging.info("We found %i differences in sha verification", len(list_differences))
790 | logging.debug("Differences in sha are: %s", list_differences[:300])
791 |
792 | return list_differences, result["names_sha_tables"], result["cleaning"]
793 |
794 | def get_column_blocks_most_differences(self, differences, temp_tables):
795 | """Return the information of which columns contain most differences
796 |
797 | From the compare_shas step, we know all the rowBuckets that present some differences. The goal of this
798 | function is to identify which columnsBuckets have those differences.
799 |
800 | :type differences: list of str
801 | :param differences: the list of Group By values which present different row checksums
802 |
803 | :type temp_tables: dict
804 | :param temp_tables: contains the names of the temporary tables ["src_table", "dst_table"]
805 |
806 | :rtype: tuple
807 | :returns: ``(column_blocks_most_differences, map_colblocks_bucketrows)``, where
808 | ``column_blocks_most_differences`` is the Counter of the column blocks with most differences;
809 | ``map_colblocks_bucketrows`` is a list that "maps" a column block with the list of the hash of the
810 | bucket rows that contain a difference
811 |
812 | :raises: IOError if the query has some execution errors
813 | """
814 | subset_differences = str(differences[:10000])[1:-1] # let's choose quite a big number (instead of just looking
815 |         # at some few (5 for instance) differences) for 2 reasons: 1) by fetching more rows we will estimate
816 | # better which column blocks fail often 2) we have less possibilities to face some 'permutations' problems
817 | logging.debug("The sha differences that we consider are: %s", str(subset_differences))
818 |
819 | src_query = "SELECT * FROM %s WHERE gb IN (%s)" % (temp_tables[self.tsrc.get_id_string()], subset_differences)
820 | dst_query = "SELECT * FROM %s WHERE gb IN (%s)" % (temp_tables[self.tdst.get_id_string()], subset_differences)
821 | logging.debug("queries to find differences in bucket_blocks are: \n%s\n%s", src_query, dst_query)
822 |
823 | src_sha_lines = {} # key=gb, values=list of shas from the blocks (not the one of the whole line)
824 | dst_sha_lines = {}
825 | t_src = threading.Thread(name='srcFetchShaDifferences', target=self.tsrc.launch_query_dict_result,
826 | args=(src_query, src_sha_lines, True))
827 | t_dst = threading.Thread(name='dstFetchShaDifferences', target=self.tdst.launch_query_dict_result,
828 | args=(dst_query, dst_sha_lines, True))
829 | t_src.start()
830 | t_dst.start()
831 | t_src.join()
832 | t_dst.join()
833 |
834 | # We want to find the column blocks that present most of the differences, and the bucket_rows associated to it
835 | column_blocks_most_differences = Counter()
836 | column_blocks = self.tsrc.get_column_blocks(self.tsrc.get_ddl_columns())
837 | # noinspection PyUnusedLocal
838 | map_colblocks_bucketrows = [[] for x in range(len(column_blocks))]
839 | for bucket_row, dst_blocks in dst_sha_lines.iteritems():
840 | src_blocks = src_sha_lines[bucket_row]
841 | for idx, sha in enumerate(dst_blocks):
842 | if sha != src_blocks[idx]:
843 | column_blocks_most_differences[idx] += 1
844 | map_colblocks_bucketrows[idx].append(bucket_row)
845 | logging.debug("Block columns with most differences are: %s. Which correspond to those bucket rows: %s",
846 | column_blocks_most_differences, map_colblocks_bucketrows)
847 |
848 | # collisions could happen for instance with those 2 "rows" (1st column is the Group BY value, 2nd column is the
849 | # value of a column in the data, 3rd column is the value of another column which belongs to another 'block
850 | # column' than the 2nd one):
851 | # ## SRC table:
852 | # bucket1 1 A
853 | # bucket1 0 B
854 | # ## DST table:
855 | # bucket1 0 A
856 | # bucket1 1 B
857 | # In such case, the 'grouped sha of each column' will be always the same. But the sha-lines will be different
858 | # That leads to a bad situation where the program says that there are some differences but it is not possible
859 |         # to show them on the web page (which is quite confusing for the user running the program).
860 | if len(column_blocks_most_differences) == 0:
861 |             raise RuntimeError("Program faced some collisions when trying to assess which blocks of columns were not"
862 |                                " correct. Please contact the developer to ask for a fix")
863 |
864 | return column_blocks_most_differences, map_colblocks_bucketrows
865 |
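     |     # Illustrative sketch (not from the original source): with 3 column blocks, where block 0 differs for the
     |     # bucket rows hashed as 1234 and 5678 and block 2 differs only for 5678, the values returned above would
     |     # look roughly like:
     |     #     column_blocks_most_differences = Counter({0: 2, 2: 1})
     |     #     map_colblocks_bucketrows = [[1234, 5678], [], [5678]]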
866 | def get_sql_final_differences(self, column_blocks_most_differences, map_colblocks_bucketrows, index):
867 | """Return the queries to get the real data for the differences found in the last compare_shas() step
868 |
869 | Get the definition of the [index]th column block with most differences, and generate the SQL queries that will
870 | be triggered to show to the developer those differences.
871 |
872 | :type column_blocks_most_differences: :class:`Counter`
873 |         :param column_blocks_most_differences: the Counter of the column blocks with most differences. The keys of
874 |             this Counter are the indexes of the column blocks
875 |
876 | :type map_colblocks_bucketrows: list of list
877 |         :param map_colblocks_bucketrows: list that "maps" each column block (the element N in the list corresponds
878 |             to the column block N) to the list of hashes of the bucket rows that contain a difference.
879 |
880 | :type index: int
881 | :param index: the position of the block column we want to get
882 |
883 | :rtype: tuple of str
884 | :returns: ``(hive_final_sql, bq_final_sql, list_column_to_check)``, the queries to be executed to do the final
885 | debugging, and the name of the columns that are fetched.
886 | """
887 | column_block_most_different = column_blocks_most_differences.most_common(index)[index - 1][0]
888 | column_blocks = self.tsrc.get_column_blocks(self.tsrc.get_ddl_columns())
889 | # buckets otherwise we might want to take a second block
890 | list_column_to_check = " ,".join([x["name"] for x in column_blocks[column_block_most_different]])
891 | # let's display just 10 buckets in error max
892 | list_hashs = " ,".join(map(str, map_colblocks_bucketrows[column_block_most_different][:10]))
893 |
894 | src_final_sql = self.tsrc.create_sql_show_bucket_columns(list_column_to_check, list_hashs)
895 | dst_final_sql = self.tdst.create_sql_show_bucket_columns(list_column_to_check, list_hashs)
896 | logging.debug("Final source query is: %s \nFinal dest query is: %s", src_final_sql, dst_final_sql)
897 |
898 | return src_final_sql, dst_final_sql, list_column_to_check
899 |
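     |     # Illustrative note (not from the original source): Counter.most_common(n) returns the n most frequent
     |     # entries as (key, count) pairs, so most_common(index)[index - 1][0] above yields the index of the
     |     # [index]th most different column block, e.g. (assuming two blocks with 7 and 3 differences):
     |     #     Counter({2: 7, 0: 3}).most_common(2)[1][0]  ->  0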
900 | @staticmethod
901 | def display_html_diff(result, file_name, col_description):
902 | """Show the difference of the analysis in a graphical webpage
903 |
904 | :type result: dict
905 | :param result: dictionary that contains the hashMaps of some of the rows with differences between the 2 tables
906 |
907 | :type file_name: str
908 |         :param file_name: prefix of the path where the temporary files will be written
909 |
910 | :type col_description: str
911 | :param col_description: "," separated list of the 5 extra columns from the column block we show in the diff
912 | """
913 | sorted_file = {}
914 | keys = result.keys()
915 | for instance in keys:
916 | result[instance].sort()
917 | sorted_file[instance] = file_name + "_" + instance
918 | with open(sorted_file[instance], "w") as f:
919 | f.write("\n".join(result[instance]))
920 |
921 | diff_html = difflib.HtmlDiff().make_file(result[keys[0]], result[keys[1]], keys[0] + col_description, keys[1]
922 | + col_description, context=True, numlines=15)
923 | html_file = file_name + ".html"
924 | with open(html_file, "w") as f:
925 | f.write(diff_html)
926 | logging.debug("Sorted results of the queries are in the files %s and %s. HTML differences are in %s",
927 | sorted_file[keys[0]], sorted_file[keys[1]], html_file)
928 | webbrowser.open("file://" + html_file, new=2)
929 |
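     |     # Usage sketch (hypothetical table ids and values, not from the original source): 'result' maps each table
     |     # id to its rows, e.g.:
     |     #     result = {"hive_db.tbl": ["1234,a,b", "5678,c,d"], "bq_ds.tbl": ["1234,a,B", "5678,c,d"]}
     |     #     TableComparator.display_html_diff(result, "/tmp/sha_diff", "hash(id) , id , col1 , col2")
     |     # writes /tmp/sha_diff_<table id> text files plus /tmp/sha_diff.html and opens the latter in a browser.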
930 | def show_results_final_differences(self, src_sql, dst_sql, list_extra_columns):
931 |         """If any differences were found in the sha analysis step, show them in a webpage
932 |
933 | :type src_sql: str
934 | :param src_sql: the query of the source table to launch to see the rows that are different
935 |
936 | :type dst_sql: str
937 | :param dst_sql: the query of the destination table to launch to see the rows that are different
938 |
939 | :type list_extra_columns: str
940 | :param list_extra_columns: list (',' separated) of the extra columns that will be shown in the differences.
941 | This parameter is only used to show the description in the web page.
942 | """
943 | src_id = self.tsrc.get_id_string()
944 | dst_id = self.tdst.get_id_string()
945 | result = {src_id: [], dst_id: []}
946 | t_src = threading.Thread(name='srcShowShaFinalDifferences', target=self.tsrc.launch_query_csv_compare_result,
947 | args=(src_sql, result[src_id]))
948 | t_dst = threading.Thread(name='dstShowShaFinalDifferences', target=self.tdst.launch_query_csv_compare_result,
949 | args=(dst_sql, result[dst_id]))
950 | t_src.start()
951 | t_dst.start()
952 | t_src.join()
953 | t_dst.join()
954 |
955 | col_description = "hash(%s) , %s , %s" \
956 | % (self.tsrc.get_groupby_column(), self.tsrc.get_groupby_column(), list_extra_columns)
957 |
958 | self.display_html_diff(result, "/tmp/sha_diff", col_description)
959 |
960 | return False # no need to execute the script further since errors have already been spotted
961 |
962 | def synchronise_tables(self):
963 | """Ensure that some specific properties between the 2 tables have the same values, like the Group By column"""
964 |         self.tdst._ddl_columns = self.tsrc.get_ddl_columns()
965 |         # (the source DDL is copied as-is instead of doing a DDL comparison check)
966 | self.tdst._group_by_column = self.tsrc.get_groupby_column() # the Group By must use the same column for both
967 | # tables
968 |
969 | def perform_step_count(self):
970 | """Execute the Count comparison of the 2 tables
971 |
972 | :rtype: bool
973 | :returns: True if we haven't found differences yet and further analysis is needed
974 | """
975 | self.synchronise_tables()
976 | diff, big_small = self.compare_groupby_count()
977 |
978 | if len(diff) == 0:
979 |             print("No differences were found when doing a Count on the tables %s and %s and grouping by the "
980 |                   "column %s" % (self.tsrc.full_name, self.tdst.full_name, self.tsrc.get_groupby_column()))
981 | return True # means that we should continue executing the script
982 |
983 | self.show_results_count(diff, big_small)
984 | return False # no need to execute the script further since errors have already been spotted
985 |
986 | @staticmethod
987 | def clean_step_sha(tables_to_clean):
988 | """Delete temporary table if needed
989 |
990 | :type tables_to_clean: dict
991 | :param tables_to_clean: contains the name of the tables that need to be deleted, and the table object
992 | """
993 |         for table_name, table_object in tables_to_clean.iteritems():
994 | table_object.delete_temporary_table(table_name)
995 |
996 | def perform_step_sha(self):
997 | """Execute the Sha comparison of the 2 tables"""
998 | self.synchronise_tables()
999 | sha_differences, temporary_tables, tables_to_clean = self.compare_shas()
1000 | if len(sha_differences) == 0:
1001 | print("Sha queries were done and no differences were found: the tables %s and %s are equal!"
1002 | % (self.tsrc.get_id_string(), self.tdst.get_id_string()))
1003 | TableComparator.clean_step_sha(tables_to_clean)
1004 | sys.exit(0)
1005 |
1006 | cb_most_diff, map_cb_bucketrows = self.get_column_blocks_most_differences(sha_differences, temporary_tables)
1007 |
1008 | for idx_cb in range(1, len(cb_most_diff) + 1):
1009 | if idx_cb > 1:
1010 | answer = raw_input('Do you want to see more differences? [Y/n]: ')
1011 |                 # Yes being the default, we only stop if the user explicitly enters 'n'
1012 | if answer == 'n':
1013 | break
1014 |
1015 | queries = self.get_sql_final_differences(cb_most_diff, map_cb_bucketrows, idx_cb)
1016 | print("Showing differences for columns " + queries[2])
1017 | self.show_results_final_differences(queries[0], queries[1], queries[2])
1018 |
1019 | TableComparator.clean_step_sha(tables_to_clean)
1020 | sys.exit(1)
1021 |
1022 |
1023 | def parse_arguments():
1024 | """Parse the arguments received on the command line and returns the args element of argparse
1025 |
1026 | :rtype: namespace
1027 | :returns: The object that contains all the configuration of the command line
1028 | """
1029 |     parser = argparse.ArgumentParser(description="Compare two tables (types: hive or bq)",
1030 | formatter_class=argparse.RawTextHelpFormatter)
1031 | parser.add_argument("source", help="the original (correct version) table\n"
1032 | "The format must have the following format: /.
\n"
1033 | " can be: bq or hive\n ")
1034 | parser.add_argument("destination", help="the destination table that needs to be compared\n"
1035 | "Format follows the one for the source table")
1036 |
1037 | parser.add_argument("-s", "--source-options", help="options for the source table\nFor Hive that could be: {'jar': "
1038 | "'hdfs://hdp/user/sluangsay/lib/hcbq.jar', 'hs2': "
1039 | "'master-003.bol.net'}\nExample for BigQuery: {'project': "
1040 | "'myGoogleCloudProject'}")
1041 | parser.add_argument("-d", "--destination-options", help="options for the destination table")
1042 |
1043 | parser.add_argument("--source-where", help="the WHERE condition we want to apply for the source table\n"
1044 | "Could be useful in case of partitioned tables\n"
1045 | "Example: \"datedir='2017-05-01'\"")
1046 | parser.add_argument("--destination-where", help="the WHERE condition we want to apply for the destination table")
1047 |
1048 | parser.add_argument("--max-gb-percent", type=float, default=1.0,
1049 | help="if in one sample a column has a value whose frequency is higher than this percentage, "
1050 | "then this column is discarded (default: 1.0)")
1051 |
1052 | parser.add_argument("--skew-threshold", type=int,
1053 |                         help="increase the threshold (default: 40 000) if you have some skew but the number of "
1054 |                              "columns is small enough that you feel confident all the shas will fit into memory")
1055 |
1056 | group_columns = parser.add_mutually_exclusive_group()
1057 | group_columns.add_argument("--column-range", default=":",
1058 | help="Instead of checking all the columns, you can define a range of columns to check\n"
1059 |                                     "This works like a Python slice, meaning that if you want to only analyze the "
1060 | "first 20 columns, you need to give:\n :20")
1061 | group_columns.add_argument("--columns",
1062 | help="Instead of checking all the columns, you can give here the list of the columns you"
1063 | " want to check. Example: 'column1,column14,column23'")
1064 |
1065 | parser.add_argument("--ignore-columns",
1066 |                         help="the columns given in argument will be excluded from the analysis. Example: 'column1,"
1067 |                              "column14,column23'")  # not in group_columns, because we want to be able to ask for a
1068 |                                                     # range of columns while removing a few of them at the same time
1069 |
1070 | parser.add_argument("--decodeCP1252-columns",
1071 | help="try to transform the CP1252 encoding from the (string) columns in argument into "
1072 | "Google's UTF8 encoding (Experimental). Example: 'column1,column14,column23'")
1073 |
1074 | parser.add_argument("--group-by-column",
1075 |                         help="force the given column to be the Group By column. Can be useful if the sample "
1076 |                              "query does not manage to find a good Group By column and we need to avoid some skew")
1077 |
1078 | group_step = parser.add_mutually_exclusive_group()
1079 | group_step.add_argument("--just-count", help="only perform the Count check", action="store_true")
1080 | group_step.add_argument("--just-sha", help="only perform the final sha check", action="store_true")
1081 |
1082 | group_log = parser.add_mutually_exclusive_group()
1083 | group_log.add_argument("-v", "--verbose", help="show debug information", action="store_true")
1084 | group_log.add_argument("-q", "--quiet", help="only show important information", action="store_true")
1085 |
1086 | return parser.parse_args()
1087 |
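     | # Illustrative command lines (hypothetical host and project names, not from the original source):
     | #   python hive_compared_bq.py hive/mydb.mytable bq/mydataset.mytable \
     | #       -s "{'jar': 'hdfs://hdp/user/me/lib/hcbq.jar', 'hs2': 'hive-server.example.com'}" \
     | #       -d "{'project': 'my-gcp-project'}" --just-count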
1088 |
1089 | def create_table_from_args(definition, options, where, args, tc):
1090 | """Create and returns a _Table object based on some arguments that were received on the command line
1091 |
1092 | :type definition: str
1093 |     :param definition: basic definition of the table, in the format: <type>/<database>.<table>
1094 |
1095 | :type options: str
1096 | :param options: JSON representation of options for this table. Ex: {'project': 'myGoogleCloudProject'}
1097 |
1098 | :type where: str
1099 | :param where: SQL filter that will be put in the WHERE clause, to limit the scope of the table analysed.
1100 |
1101 | :type args: :class:`ArgumentParser`
1102 | :param args: object containing all the arguments from the command line
1103 |
1104 | :type tc: :class:`TableComparator`
1105 | :param tc: current TableComparator instance, so that the table has a pointer to general configuration
1106 |
1107 | :rtype: :class:`_Table`
1108 | :returns: The _Table object, fully configured
1109 | """
1110 | table = _Table.create_table_from_string(definition, options, tc)
1111 | table.set_where_condition(where)
1112 | table.set_column_range(args.column_range)
1113 | if args.columns is not None:
1114 | table.set_chosen_columns(args.columns)
1115 | if args.ignore_columns is not None:
1116 | table.set_ignore_columns(args.ignore_columns)
1117 | if args.decodeCP1252_columns is not None:
1118 | table.set_decode_cp1252_columns(args.decodeCP1252_columns)
1119 | table.set_group_by_column(args.group_by_column) # if not defined, then it's None and we'll compute it later
1120 |
1121 | return table
1122 |
1123 |
1124 | def main():
1125 | args = parse_arguments()
1126 |
1127 | level_logging = logging.INFO
1128 | if args.verbose:
1129 | level_logging = logging.DEBUG
1130 | elif args.quiet:
1131 | level_logging = logging.WARNING
1132 | logging.basicConfig(level=level_logging, format='[%(levelname)s]\t[%(asctime)s] (%(threadName)-10s) %(message)s', )
1133 |
1134 | logging.debug("Starting comparison program with arguments: %s", args)
1135 |
1136 | # Create the TableComparator that contains the definition of the 2 tables we want to compare
1137 | tc = TableComparator()
1138 | tc.set_max_percent_most_frequent_value_in_column(args.max_gb_percent)
1139 | source_table = create_table_from_args(args.source, args.source_options, args.source_where, args, tc)
1140 | destination_table = create_table_from_args(args.destination, args.destination_options, args.destination_where,
1141 | args, tc)
1142 | if args.skew_threshold is not None:
1143 | tc.set_skew_threshold(args.skew_threshold)
1144 | tc.set_tsrc(source_table)
1145 | tc.set_tdst(destination_table)
1146 |
1147 | # Step: count
1148 | if not args.just_sha:
1149 | do_we_continue = tc.perform_step_count()
1150 | if not do_we_continue:
1151 | sys.exit(1)
1152 |
1153 | # Step: sha
1154 | if not args.just_count:
1155 | tc.perform_step_sha()
1156 |
1157 | if __name__ == "__main__":
1158 | main()
1159 |
--------------------------------------------------------------------------------