├── .gitignore
├── README.md
├── config
│   └── how_to_use_hive_to_es.ini
├── hive_to_es.py
└── sql
    ├── hql_test1.sql
    └── hql_test2.sql

/.gitignore:
--------------------------------------------------------------------------------
.idea/
test/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
A tool for syncing Hive data to Elasticsearch
==============================================

- Full import (the default) or incremental import;
- You can also write SQL to build an intermediate result table and import that into ES;

#### Importing data through Impala is now supported, which greatly speeds up transfers

- Queries are paginated, so a large result set will not exhaust memory;
- At the company where I interned, the data analysts, product managers and operations team constantly needed reports, most of them statistical. Elasticsearch is a good fit for that kind of aggregation, and together with Kibana it can produce the reports directly!
  For these recurring statistical needs, my usual approach is to export tables from the Hive data warehouse to ES, or to first build a result table with HQL or Impala SQL and let ES do the aggregation, e.g. a Date Histogram per day, per week, per month or per person.
- Kibana then renders the visualisations, so the data is presented at a glance!
- The configuration aims to stay concise and easy to use.

***Elastic already ships a Hive integration for this kind of sync, but our Hive version was too old while our ES was already on the latest release, so the integration kept throwing errors. This tool was hand-rolled to meet the immediate need.***

Running the script:
- Environment: Python 2 or Python 3
- Command: `python hive_to_es.py config=<path to .ini config file> [optional, tables to import: tables=table1,table2...]`


Configuration file: a file with the .ini suffix

```ini
;Elasticsearch hosts (separate multiple nodes with ','), username and password
[es]
hosts = 192.168.3.100:9200
username = elastic
password = 888888

;The target ES index defaults to the Hive/Impala database name.
;Configure a custom global index here and all exported tables will go to it by default.
;default_index = tqc_ttt

;Data platform, defaults to hive
;by = impala

;Hive host, port, database, user, etc.
[hive]
host = 127.0.0.1
port = 10000
user = hiveuser
auth_mechanism = PLAIN
database = dbname

;Impala host, port and database
[impala]
host = 127.0.0.1
port = 21050
database = dbname


;Names of the tables to export to ES; each name is also used as the ES type (configurable).
;If a table is a new result set produced by SQL, its name can be anything you like,
;but you must then provide the SQL file path in a section below.
[table]
tables = student,score,teacher,my_result_a,my_result_b

;SQL-derived result table my_result_a
[my_result_a]
;Path of the HQL/Impala SQL file that builds the result set to import; SQL containing comments is not supported yet
sql_path = ./sql/hql_test1.sql

;Another result table to export to ES
[my_result_b]
sql_path = ./sql/hql_test2.sql


# For finer control over an exported table or result table, the following optional settings are available

;The section header is the name of the table or result table being exported
;[student]

;Target index, if you do not want the default index
;es_index = tqc_test
;Target type, if you do not want the default; the type defaults to the table name
;es_type = tqc_test_type

;Restrict the exported columns
;columns = date,name,age,address,sex

;Use one column as the id of the ES documents
;id_column = student_id

;Column name mapping: here the Hive column date maps to @timestamp in ES, name maps to name_in_es, sex maps to sex_in_es...
;column_mapping = date=@timestamp,name=name_in_es,sex=sex_in_es

;WHERE clause used to filter rows during export
;where = age>20 AND name LIKE 'abc%'

;Path of the HQL/Impala SQL file that builds the result set to import; SQL containing comments is not supported yet
;sql_path = ./sql/hql_test1.sql

;Page size for paginated queries; keeps a single query from pulling the whole result set into memory. Defaults to 30000 when unset.
;page_size = 1000

;Full vs incremental: whether to delete all data under the type before importing.
;Default = true: clear the existing data in the type, then import the new data (full refresh).
;overwrite = false
```


***TODO: use multiple threads***
--------------------------------------------------------------------------------
/config/how_to_use_hive_to_es.ini:
--------------------------------------------------------------------------------
;Elasticsearch hosts (separate multiple nodes with ','), username and password
[es]
hosts = 192.168.3.100:9200
username = elastic
password = 888888

;The target ES index defaults to the Hive/Impala database name.
;Configure a custom global index here and all exported tables will go to it by default.
;default_index = tqc_ttt

;Data platform, defaults to hive
;by = impala

;Hive host, port, database, user, etc.
[hive]
host = 127.0.0.1
port = 10000
user = hiveuser
auth_mechanism = PLAIN
database = dbname

;Impala host, port and database
[impala]
host = 127.0.0.1
port = 21050
database = dbname


;Names of the tables to export to ES; each name is also used as the ES type (configurable).
;If a table is a new result set produced by SQL, its name can be anything you like,
;but you must then provide the SQL file path in a section below.
[table]
tables = student,score,teacher,my_result_a,my_result_b

;SQL-derived result table my_result_a
[my_result_a]
;Path of the HQL/Impala SQL file that builds the result set to import; SQL containing comments is not supported yet
sql_path = ./sql/hql_test1.sql

;Another result table to export to ES
[my_result_b]
sql_path = ./sql/hql_test2.sql


# For finer control over an exported table or result table, the following optional settings are available

;The section header is the name of the table or result table being exported
;[student]

;Target index, if you do not want the default index
;es_index = tqc_test
;Target type, if you do not want the default; the type defaults to the table name
;es_type = tqc_test_type

;Restrict the exported columns
;columns = date,name,age,address,sex

;Use one column as the id of the ES documents
;id_column = student_id

;Column name mapping: here the Hive column date maps to @timestamp in ES, name maps to name_in_es, sex maps to sex_in_es...
;column_mapping = date=@timestamp,name=name_in_es,sex=sex_in_es

;WHERE clause used to filter rows during export
;where = age>20 AND name LIKE 'abc%'

;Path of the HQL/Impala SQL file that builds the result set to import; SQL containing comments is not supported yet
;sql_path = ./sql/hql_test1.sql

;Page size for paginated queries; keeps a single query from pulling the whole result set into memory. Defaults to 30000 when unset.
;page_size = 1000

;Full vs incremental: whether to delete all data under the type before importing.
;Default = true: clear the existing data in the type, then import the new data (full refresh).
;overwrite = false

--------------------------------------------------------------------------------
/hive_to_es.py:
--------------------------------------------------------------------------------
#! /usr/bin/env python
# -*- coding:utf-8 -*-
import codecs
import logging
import time
import sys

import re
from impala.dbapi import connect as big_data_connection
from elasticsearch import Elasticsearch
from elasticsearch import helpers as elasticsearch_helper

try:
    # Python 3
    import configparser as ConfigParser
except ImportError:
    # Python 2
    import ConfigParser

if sys.version_info[0] == 2:
    # Python 2 only: reset the default str encoding to UTF-8
    # (reload() is a builtin on Python 2; Python 3 strings are Unicode already)
    reload(sys)
    sys.setdefaultencoding('utf8')

"""
Created by tangqingchang on 2017-09-02
python hive_to_es.py config=<path to .ini config file> [optional, tables to import: tables=table1,table2...]
"""

# TODO: use multiple threads

def get_map(param_list):
    """
    Parse a list of key=value parameters and return a dict.
    :param param_list: list of parameters, e.g. sys.argv
    """
    param_dict = {}
    try:
        for pair in param_list:
            ls = pair.split('=')
            param_dict[ls[0]] = ls[1]
    except:
        return {}
    return param_dict


def get_list(s, f=','):
    """
    Split a string into a list.
    :param s: the string
    :param f: separator, ',' by default
    """
    if not s or not s.strip():
        return []
    return s.split(f)


logging.basicConfig(level=logging.INFO)


def log(*content):
    """
    Write a timestamped log line.
    """
    log_content = "[{t}]".format(t=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
    for c in content:
        log_content += str(c)
    logging.info(log_content)


def s2t(seconds):
    """
    Convert a number of seconds into an hh:mm:ss string.
    """
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    return "%02d:%02d:%02d" % (h, m, s)


def get_file_content(path):
    """
    Read a text file and return its content.
    """
    file = codecs.open(path, 'r+', 'utf-8', 'ignore')
    data = file.read()
    file.close()
    return data


def run_query(sql):
    """
    Run an Impala SQL or HiveQL query and return the result rows.
    """
    cur = big_data_conn.cursor()
    cur.execute(sql)
    des = cur.description
    res = cur.fetchall()
    res_data = []

    # turn each row into a dict keyed by column name (dropping the table prefix if present)
    for r in res:
        d = dict()
        for i, v in enumerate(r):
            if '.' in des[i][0]:
                d[des[i][0].split('.')[1]] = v
            else:
                d[des[i][0]] = v
        res_data.append(d)
    return res_data


def add_paging_and_where_info_into_hql(**kwargs):
    """
    Wrap the HQL so that it supports paging and add the paging range.
    """
    hql = kwargs['hql']
    start_row = kwargs['start_row']
    to_row = kwargs['to_row']
    where = kwargs.get("where", "")

    ql = hql.lstrip()
    start_pos = ql.upper().find(re.findall(r"FROM\s", ql.upper())[0])
    left = ql[:start_pos]
    right = ql[start_pos:]
    left = left + ", ROW_NUMBER() OVER () AS row_number_flag "
    with_row_number_hql = "SELECT * FROM(" + left + right + ")t_paging"

    if len(where) > 0:
        return with_row_number_hql + " WHERE " + where + " AND row_number_flag BETWEEN " + str(
            start_row) + " AND " + str(
            to_row) + " ORDER BY row_number_flag"
    else:
        return with_row_number_hql + " WHERE row_number_flag BETWEEN " + str(start_row) + " AND " + str(
            to_row) + " ORDER BY row_number_flag"


def add_paging_and_where_info_into_impala_sql(**kwargs):
    """
    Wrap the Impala SQL so that it supports paging and add the paging range.
    """
    impala_sql = kwargs['impala_sql']
    start_row = kwargs['start_row']
    to_row = kwargs['to_row']
    where = kwargs.get("where", "")

    ql = impala_sql.lstrip()
    start_pos = ql.upper().find(re.findall(r"FROM\s", ql.upper())[0])
    left = ql[:start_pos]
    right = ql[start_pos:]
    left = left + ", 0 AS `row_number_flag` "
    with_flag_sql = "SELECT * FROM(" + left + right + ")t_paging"

    page_size = to_row - start_row + 1

    if len(where) > 0:
        return with_flag_sql + " WHERE " + where + " ORDER BY `row_number_flag` LIMIT " + str(
            page_size) + " OFFSET " + str(start_row - 1)
    else:
        return with_flag_sql + " ORDER BY `row_number_flag` LIMIT " + str(page_size) + " OFFSET " + str(start_row - 1)


def get_paging_and_where_supported_sql(sql, start_row, to_row, where, platform):
    """
    Build the paging-aware SQL for the given platform.
    :param sql: the source query
    :param start_row: first row of the page, starting from 1
    :param to_row: last row of the page
    :param platform: hive or impala
    """
    if platform == "hive":
        return add_paging_and_where_info_into_hql(hql=sql, start_row=start_row, to_row=to_row, where=where)
    elif platform == "impala":
        return add_paging_and_where_info_into_impala_sql(impala_sql=sql, start_row=start_row, to_row=to_row,
                                                         where=where)
    else:
        return ""


def config(k, v, fallback=None):
    """
    Read option v from section k, returning fallback when it cannot be read.
    """
    try:
        return main_config.get(k, v)
    except:
        return fallback


if len(sys.argv) < 2:
    log("Not enough arguments")
    print("Example:")
    print("python hive_to_es.py config=<path to .ini config file> [optional, tables to import: tables=table1,table2...]")
    exit(1)

params_dict = get_map(sys.argv[1:])

main_config = ConfigParser.ConfigParser()
main_config.readfp(codecs.open(params_dict['config'], mode='r+', encoding='utf-8'))
es = Elasticsearch(hosts=get_list(config("es", "hosts")),
                   http_auth=(config("es", "username"),
                              config("es", "password")))

# data platform, hive by default
BY = config("es", "by", fallback="hive")
log("Data platform: ", BY)
big_data_conn = big_data_connection(host=config(BY, "host"),
                                    port=int(config(BY, "port")),
                                    database=config(BY, "database"),
                                    user=config(BY, "user", fallback=""),
                                    auth_mechanism=config(BY, "auth_mechanism", fallback=""),
                                    )
# the ES index defaults to the database name
DEFAULT_ES_INDEX = config("es", "default_index", fallback=config(BY, "database"))
MAX_PAGE_SIZE = 30000


def run_job(job_config):
    """
    Run one import job.
    """
    log("*************************", job_config['table'], " started *************************")
    PAGE_SIZE = job_config["page_size"]
    ES_INDEX = job_config["es_index"]
    ES_TYPE = job_config["es_type"]
    COLUMNS = job_config['columns']
    ID_COLUMN = job_config['id_column']
    WHERE = job_config['where']
    COLUMN_MAPPING = job_config['column_mapping']
    OVERWRITE = job_config["overwrite"]
    SQL_PATH = job_config["sql_path"]

    if len(SQL_PATH) > 0:
        log("SQL file: ", SQL_PATH)
        try:
            USER_SQL = get_file_content(SQL_PATH).strip()
            if len(COLUMNS) > 0:
                USER_SQL = "SELECT " + COLUMNS + " FROM (" + USER_SQL + ") AS columns_chosen"
        except Exception as e:
            log("Failed to read the SQL file, aborting: ", e)
            return
    else:
        log("No SQL file, exporting the table directly")
        if len(COLUMNS) > 0:
            USER_SQL = "SELECT " + COLUMNS + " FROM " + job_config['table']
        else:
            USER_SQL = "SELECT * FROM " + job_config['table']

    log("ES_INDEX: ", ES_INDEX)
    log("ES_TYPE: ", ES_TYPE)
    log("Page size: ", PAGE_SIZE)
    log("Full import (overwrite): ", OVERWRITE)
    log("Selected columns: ", COLUMNS)
    log("ID_COLUMN: ", ID_COLUMN)
    log("Custom WHERE clause: ", WHERE)
    log("Column name mapping: ", COLUMN_MAPPING)
    log("SQL: ", USER_SQL)
    if not (USER_SQL.startswith("select") or USER_SQL.startswith("SELECT")):
        log("Only SELECT statements are allowed, skipping this job")
        return

    log(">>>>>>>>>>>>>>> initialisation finished >>>>>>>>>>>>>>>")

    # start timing
    start_time = time.time()

    current_row_num = 1
    result_size = PAGE_SIZE
    p = 1

    # query page by page; stop once a page comes back smaller than PAGE_SIZE
    while result_size == PAGE_SIZE:
        log("================== page %s started ===================" % p)
        s = time.time()
        log("Current row: ", current_row_num)

        start_row = current_row_num
        to_row = current_row_num + PAGE_SIZE - 1
        log("Start row: ", start_row)
        log("End row: ", to_row)

        final_sql = get_paging_and_where_supported_sql(USER_SQL, start_row, to_row, where=WHERE, platform=BY)

        try:
            log("Executing: ")
            log(final_sql)
            result_data = run_query(final_sql)
        except Exception as e:
            log(">>>>>>>>>>>>>>> SQL execution failed, aborting this job: ", e, " >>>>>>>>>>>>>>>>>>")
            return

        if p == 1:
            # prepare ES
            if es.indices.exists(index=ES_INDEX):
                if OVERWRITE == "true":
                    log("Full import: replacing the existing result set")
                    # delete all documents under this type
                    es.delete_by_query(index=ES_INDEX,
                                       body={"query": {"match_all": {}}},
                                       doc_type=ES_TYPE,
                                       params={"conflicts": "proceed"})
                else:
                    log("Incremental import: appending to the existing result set")
            else:
                es.indices.create(index=ES_INDEX)
                log("Created new index: ", ES_INDEX)

        actions = []
        for r in result_data:
            _source = dict()
            obj = dict()
            # build the target document according to the column name mapping
            for k in r:
                if k == 'row_number_flag' or k == ID_COLUMN:
                    continue
                if COLUMN_MAPPING.get(k) is not None:
                    _source[COLUMN_MAPPING.get(k)] = r[k]
                else:
                    _source[k] = r[k]
            obj['_index'] = ES_INDEX
            obj['_type'] = ES_TYPE
            obj['_source'] = _source

            try:
                if len(ID_COLUMN) > 0:
                    obj['_id'] = r[ID_COLUMN]
            except:
                pass

            actions.append(obj)

        log("Inserting results into ES...")
        if len(actions) > 0:
            elasticsearch_helper.bulk(es, actions)
        log("Finished inserting into ES...")
        e = time.time()
        log("Time for this page: ", s2t(e - s))

        current_row_num = current_row_num + PAGE_SIZE
        result_size = len(result_data)
        p = p + 1

    end_time = time.time()
    log("************************", job_config['table'], ": all done, time taken: ", s2t(end_time - start_time),
        "************************")


if params_dict.get('tables') is not None:
    result_tables = get_list(params_dict['tables'])
else:
    result_tables = get_list(config("table", "tables", fallback=""))

for result in result_tables:
    job_conf = dict()

    job_conf['table'] = result
    job_conf['columns'] = config(result, "columns", fallback="")
    job_conf['id_column'] = config(result, "id_column", fallback="")
    job_conf['column_mapping'] = get_map(get_list(config(result, "column_mapping", fallback="")))
    job_conf['es_index'] = config(result, "es_index", fallback=DEFAULT_ES_INDEX)
    job_conf['es_type'] = config(result, "es_type", fallback=result)

    job_conf['page_size'] = min(int(config(result, "page_size", fallback=MAX_PAGE_SIZE)),
                                MAX_PAGE_SIZE)
    # full import by default
    job_conf['overwrite'] = config(result, "overwrite", fallback="true")

    job_conf['sql_path'] = config(result, "sql_path", fallback="")

    job_conf['where'] = config(result, "where", fallback="")
    try:
        run_job(job_conf)
    except Exception as e:
        log(result, " job failed: ", job_conf, ": ", e)

big_data_conn.close()
--------------------------------------------------------------------------------
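
For a quick sense of what run_job() sends to Elasticsearch, here is a minimal sketch (illustrative only, not a file in this repository) of the bulk action built for one row of the student table, assuming the optional settings shown in the sample config (id_column = student_id and column_mapping = date=@timestamp,name=name_in_es,sex=sex_in_es); the row values themselves are made up:

```python
# One row as returned by run_query(); row_number_flag comes from the paging wrapper.
row = {"student_id": 42, "date": "2017-09-02", "name": "Tom", "sex": "M", "age": 21, "row_number_flag": 1}

action = {
    "_index": "dbname",        # default_index if configured, otherwise the Hive/Impala database name
    "_type": "student",        # es_type, which defaults to the table name
    "_id": row["student_id"],  # taken from id_column; omitted when id_column is not configured
    "_source": {
        "@timestamp": row["date"],  # renamed via column_mapping
        "name_in_es": row["name"],
        "sex_in_es": row["sex"],
        "age": row["age"],          # unmapped columns keep their original names
        # row_number_flag and the id_column itself are not copied into _source
    },
}
# elasticsearch.helpers.bulk(es, actions) is then called once per page with a list of such actions.
```
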
/sql/hql_test1.sql:
--------------------------------------------------------------------------------
SELECT
  `id`,
  `age`,
  `address`
FROM staff
WHERE `age` > 30
--------------------------------------------------------------------------------
/sql/hql_test2.sql:
--------------------------------------------------------------------------------
SELECT
  `student`.`id`,
  `student`.`name`,
  `student`.`age`
FROM student
LEFT JOIN score ON score.id = student.id
WHERE score.math_score > 90 AND score.english_score > 90
--------------------------------------------------------------------------------
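
To make the paging mechanism concrete, this is roughly the first-page query that hive_to_es.py builds around sql/hql_test1.sql (the my_result_a job) with the default page_size of 30000. It is a sketch of the strings produced by get_paging_and_where_supported_sql(); the real output differs only in whitespace:

```python
# Hive: a ROW_NUMBER() flag is added inside a subquery and the page is selected with BETWEEN.
hive_first_page = (
    "SELECT * FROM("
    "SELECT `id`, `age`, `address`, ROW_NUMBER() OVER () AS row_number_flag "
    "FROM staff WHERE `age` > 30"
    ")t_paging WHERE row_number_flag BETWEEN 1 AND 30000 ORDER BY row_number_flag"
)

# Impala: a constant flag column is added instead, and the page is selected with LIMIT/OFFSET.
impala_first_page = (
    "SELECT * FROM("
    "SELECT `id`, `age`, `address`, 0 AS `row_number_flag` "
    "FROM staff WHERE `age` > 30"
    ")t_paging ORDER BY `row_number_flag` LIMIT 30000 OFFSET 0"
)
```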