├── __init__.py ├── cluster_task ├── __init__.py ├── clean_db.py ├── clean_db_sync.py ├── clean_table.py ├── dw_sql.py ├── hive_metadata.py ├── hive_tb_store.py ├── indicator_system.py ├── indicator_system_run.py ├── minireport_job.py └── scheduler_job.py ├── core ├── .DS_Store ├── __init__.py ├── conf │ ├── __init__.py │ ├── conf.py │ ├── extract.conf │ ├── system.conf │ ├── task.conf │ ├── template.conf │ └── uba_log.conf ├── core.py ├── model │ ├── __init__.py │ ├── bi_db_model.py │ ├── conf_model.py │ ├── date_model.py │ ├── hadoop_model.py │ ├── hive_db_model.py │ ├── hive_model.py │ ├── mail_model.py │ ├── produce_db_model.py │ ├── spark_model.py │ └── sqoop_model.py ├── shell │ ├── mysql_dump_file_bak.sh │ └── sqoop_import_mysql.sh └── util │ ├── .DS_Store │ ├── __init__.py │ ├── base │ ├── .DS_Store │ ├── __init__.py │ ├── args_format.py │ ├── camel.py │ ├── date.py │ ├── file.py │ ├── process.py │ └── read_conf.py │ ├── hive │ ├── __init__.py │ ├── hive_interface.py │ └── hive_server2.py │ ├── log │ ├── __init__.py │ └── logger.py │ ├── mail │ ├── __init__.py │ └── mail.py │ └── mysql │ ├── __init__.py │ ├── mysql.py │ ├── mysql_interface.py │ └── mysql_interface_20160907.py ├── dw_service ├── .project ├── .pydevproject ├── core │ ├── __init__.pyc │ ├── conf │ │ ├── __init__.pyc │ │ └── conf.pyc │ ├── core.pyc │ ├── model │ │ ├── __init__.pyc │ │ ├── date_model.pyc │ │ └── hive_model.pyc │ └── util │ │ ├── __init__.pyc │ │ ├── base │ │ ├── __init__.pyc │ │ ├── args_format.pyc │ │ ├── camel.pyc │ │ └── date.pyc │ │ └── hive │ │ ├── __init__.pyc │ │ └── hive_server2.pyc └── dw_service │ ├── __init__.pyc │ ├── base_service.pyc │ ├── load_service.pyc │ └── uba_log │ ├── __init__.pyc │ ├── run.pyc │ ├── uba_base.pyc │ └── uba_web_visit_log.pyc ├── dw_service_core.py ├── extract ├── __init__.py ├── extract_mysql.py ├── extract_queue_run.py ├── extract_run.py ├── extract_run.sh ├── gather_run.py ├── gather_table.py └── snapshot_run.py ├── index.py ├── template 
├── __init__.py └── template_run.py └── uba_log ├── __init__.py ├── uba_ods_table_run.py ├── uba_run.py └── uba_sql ├── app ├── dw_app_access_log.sql ├── dw_app_access_log_ddl.sql ├── dw_app_action_detail_log.sql └── dw_app_action_detail_log_ddl.sql ├── ods ├── access_log.sql ├── dw_access_log.sql ├── uba_app_action_log.sql ├── uba_web_action_log.sql └── uba_web_visit_log.sql └── web ├── dw_web_action_detail_log.sql ├── dw_web_action_detail_log_ddl.sql ├── dw_web_visit_traffic_log.sql └── dw_web_visit_traffic_log_ddl.sql /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonWiki/dw_etl/a679c0462006cd7c35e3b3f3d00e25c49a55a983/__init__.py -------------------------------------------------------------------------------- /cluster_task/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonWiki/dw_etl/a679c0462006cd7c35e3b3f3d00e25c49a55a983/cluster_task/__init__.py -------------------------------------------------------------------------------- /cluster_task/clean_db.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | u''' 4 | 清理无用数据库,请谨慎操作! 5 | 6 | 适用方法: 7 | ./index.py --service cluster_task --module clean_db --parameter 'dbName' (数据库名称) 8 | ''' 9 | 10 | from dw_service_core import DwServiceCore 11 | 12 | 13 | class CleanDb(DwServiceCore) : 14 | 15 | __dbName = None 16 | 17 | __tables = None 18 | 19 | def process(self): 20 | self.__dbName = self.getParameter() 21 | 22 | self.dropDbTables() 23 | 24 | 25 | u'删除数据库的所有表' 26 | def dropDbTables(self): 27 | 28 | self.__tables = self.getRegisterInstance('hiveModel').getDbTables(self.__dbName) 29 | 30 | for curTable in self.__tables: 31 | dbTable = self.__dbName + '.' 
#coding=utf-8

u'''
Scan the db_sync tables registered in dw_service.extract_table and report
the Hive tables that nothing depends on any more (referenced neither by
dev-task definitions nor by mini-reports).

Usage: ./index.py --service cluster_task --module clean_db_sync --parameter ''
'''

from dw_service_core import DwServiceCore


class CleanDbSync(DwServiceCore) :

    def process(self):

        self.cleanDbSync()


    u' Print every synced table with no remaining dependency'
    def cleanDbSync(self):
        tableData = self.getTableData()

        for curTb in tableData:
            tbName = curTb['hive_tb_name']

            # dependencies declared by project dev tasks
            devTaskRs = self.devTask(tbName)

            # dependencies declared by mini-reports
            minireportRs = self.monireport(tbName)

            # no dependency on either side -> candidate for cleanup
            if len(devTaskRs) == 0 and len(minireportRs) == 0:
                print(tbName)  # parentheses: valid on both Python 2 and 3


    u' Fetch the candidate tables (hive name is <db_name>__<tb_name>)'
    def getTableData(self):
        querySql = """
        SELECT
            bs.*
        FROM (
            SELECT
                -- 拼接 db_sync
                CONCAT(db_name,'__',tb_name) AS hive_tb_name
            FROM dw_service.extract_table AS bs
            WHERE is_delete=0 AND db_server='product'
        ) AS bs
        ;
        """
        return self.getRegisterInstance('biDbModel').queryAll(querySql)


    # Dev-task rows whose details mention the table.
    # NOTE(review): tbName is concatenated into the LIKE pattern; the values
    # come from our own metadata table, but a parameterized query would be
    # safer if this model interface supports placeholders — verify.
    def devTask(self,tbName):
        querySql = "SELECT * FROM test.dev_task WHERE details like '%" + tbName + "%';"
        return self.getRegisterInstance('biDbModel').queryAll(querySql)

    # Mini-report rows whose sp column mentions the table (sic: 'monireport'
    # is kept to preserve the public method name).
    def monireport(self,tbName):
        querySql = "SELECT * FROM test.mini_report WHERE sp LIKE '%"+tbName+"%';"
        return self.getRegisterInstance('biDbModel').queryAll(querySql)
#coding=utf-8

u'''
Drop the unused tables recorded in dw_service.clean_table, for one date.

Usage: ./index.py --service cluster_task --module clean_table --parameter '20151123'
       (the date whose pending rows should be cleaned; empty = yesterday)
'''

from dw_service_core import DwServiceCore

class CleanTable(DwServiceCore) :

    # operating date (yyyymmdd), resolved lazily by getDate()
    __date = None

    # NOTE(review): appears unused in this file — candidate for removal
    __dwServiceDb = None

    def process(self):

        self.cleanDwTable()


    u' Resolve the operating date: the --parameter value, or yesterday when empty'
    def getDate(self):
        date = self.getParameter()
        if (date == ''):
            date = self.getRegisterInstance('dateModel').getYesterday()

        self.__date = date

        return self.__date


    u' Drop every pending table registered for the date'
    def cleanDwTable(self):
        tableData = self.getTableData()
        for curTb in tableData:
            self.dropTable(curTb['db_name'] + '.' + curTb['tb_name'], curTb['id'])


    u' Rows (status = 0) describing tables waiting to be cleaned'
    def getTableData(self):
        # Quote the date value: if the column is a string type, an unquoted
        # yyyymmdd literal would force a numeric comparison on every row.
        queryDataSql = "SELECT * FROM dw_service.clean_table WHERE status = 0 AND date = '" + self.getDate() + "'"
        return self.getRegisterInstance('biDbModel').queryAll(queryDataSql)


    u' Drop the table from both Hive and MySQL; mark the row done when both succeed'
    def dropTable(self,db_tb_name,id):

        idDelHive = self.getRegisterInstance('hiveModel').dropTable(db_tb_name)
        idDelMysql = self.getRegisterInstance('biDbModel').dropTable(db_tb_name)

        if (idDelHive == True and idDelMysql == True):
            upStausSql = "UPDATE dw_service.clean_table SET status = 2 WHERE id = " + str(id)
            return self.getRegisterInstance('biDbModel').updataData(upStausSql)
class DwSql(DwServiceCore) :
    u'''
    Run a hive or spark SQL file from the dw_sql repository.

    ./index.py --service cluster_task --module dw_sql --parameter '{"serverType":"spark","sql":"property/dw_property_inventory_recommend_d.sql","date":"yesterday","isDwSql":"yes"}'
    '''

    def init(self):
        DwServiceCore.init(self)


    def process(self):
        # parameters arrive as a JSON map on the command line
        parsMap = self.getFormatParameter()

        self.runDwSqlProcess(parsMap)


    u'''
    SQL execution flow control.
    serverType: computing framework [hive | spark]
    runEnv:     execution environment [hiveserver2 | local]
    sql:        sql file path, e.g. property/dw_property_inventory_recommend_d.sql
    isDwSql:    whether the path is relative to the dw_sql repository [yes | no], default yes
    date:       run date [yesterday | today | tomorrow | None] or a literal date, e.g. 2016-11-12
                (original docstring listed "yesterday" twice; "tomorrow" was meant)
    Returns True/False execution status; False when any step failed.
    '''
    def runDwSqlProcess(self, parsMap):
        status = False

        try :
            Logger.init()

            sqlFile = parsMap.get('sql')
            if (sqlFile == None):
                Logger.info("sql 仓库文件不存在")
                exit(1)

            # resolve symbolic dates to a concrete yyyymmdd value
            date = self._resolveDate(parsMap.get('date'))

            serverType = parsMap.get('serverType')
            if (serverType == None):
                Logger.info("serverType : hive or spark")
                exit(1)

            # read the sql file and substitute the date placeholders
            isDwSql = parsMap.get("isDwSql")
            if (isDwSql == None or isDwSql == "yes"):
                sqlContent = self.getDwSqlContent(sqlFile, date)
            elif (isDwSql == "no"):
                sqlContent = self.getSqlContent(sqlFile, date)
            else:
                Logger.info("isDwSql 参数: [yes|no]")
                exit(1)

            if (serverType == 'hive'):
                status = self.runSqlByHive(sqlContent, parsMap.get('runEnv'))
            elif (serverType == 'spark'):
                status = self.runSqlBySpark(sqlContent)

        # 'except ... as' works on Python 2.6+ and 3; the original
        # 'except Exception,ex' is Python-2-only syntax
        except Exception as ex:
            log = "异常存储过程: "
            log += " -> " + str(Exception) + ":" + str(ex)
            Logger.info(log)

        return status


    u' Map today/tomorrow/yesterday/None to a yyyymmdd date; pass literals through'
    def _resolveDate(self, parsDate):
        dateModel = self.getRegisterInstance('dateModel')
        if (parsDate == "today"):
            return dateModel.getToday()
        if (parsDate == "tomorrow"):
            return dateModel.getTomorrow()
        if (parsDate == "yesterday" or parsDate == None):
            # default: yesterday, format 20151010
            return dateModel.getYesterday()
        return parsDate


    # Run the SQL through Hive; runEnv 'local' uses the Hive CLI,
    # anything else (including None) goes through HiveServer2 JDBC.
    def runSqlByHive(self, sqlContent, runEnv):
        Logger.info(sqlContent)
        Logger.info("执行中.....")

        if (runEnv == "local"):
            u'提交到 Hive 本地 执行'
            status = self.getRegisterInstance('hiveModel').runHiveScript(sqlContent)
        else:
            u'提交到 Hive jdbc 执行'
            status = self.getRegisterInstance('hiveModel').batchExecuteSql(sqlContent)

        u'打印日志'
        Logger.info(self.runLog("运行结果",status))

        return status


    # Run the SQL through the Spark thrift server
    def runSqlBySpark(self, sqlContent):
        Logger.info(sqlContent)
        Logger.info("执行中.....")

        u'提交到 spark jdbc 执行'
        status = self.getRegisterInstance('sparkModel').batchExecuteSql(sqlContent)

        u'打印日志'
        Logger.info(self.runLog("运行结果",status))

        return status


    u'''
    Read a sql file and substitute the date placeholders.
    file : absolute path
    date : yyyymmdd date; ${dealDate} becomes the quoted formatted date,
           ${baseDealDate} the raw value
    '''
    def getSqlContent(self, file, date):
        dwSqlContent = File.redeAll(file)  # project helper (sic: "redeAll")

        formatDate = self.getRegisterInstance('dateModel').dateToFormatYmd(date)

        result = dwSqlContent.replace('${dealDate}',"'" + formatDate + "'")
        result = result.replace('${baseDealDate}',date)

        return result
158 | ''' 159 | def getDwSqlContent(self, file, date): 160 | # dw_sql 仓库的目录 161 | dwSqlDir = self.getRegisterInstance('confModel').getTaskConf()['dw_sql']['dir'] 162 | 163 | return self.getSqlContent(dwSqlDir + "/" + file, date) 164 | 165 | 166 | 167 | u''' 168 | 日志 echo 模板 169 | ''' 170 | def runLog(self,log,result): 171 | 172 | run_log = u'''>>> start : %s 173 | 174 | %s 175 | 176 | result : %s 177 | 178 | --- end 179 | 180 | '''%(self.getRegisterInstance('dateModel').getCurrentTime(),log.decode('utf-8'),result) 181 | 182 | return run_log 183 | 184 | -------------------------------------------------------------------------------- /cluster_task/hive_metadata.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | from dw_service_core import DwServiceCore 4 | from cluster_task.dw_sql import DwSql 5 | 6 | u''' 7 | hive 元数据监控 8 | ''' 9 | class HiveMetadata(DwServiceCore) : 10 | 11 | def __init__(self): 12 | DwServiceCore.init(self) 13 | self.setRegisterInstance('hiveDbModel', self.getDwCoreInstance().getModelInterface('HiveDb') ) 14 | 15 | # 执行存储过程 16 | def storedProcedure(self, date): 17 | # 存储构成的 sql 文件 18 | sqlFile = 'monitor/monitor_hive_table.sql' 19 | 20 | sqlContent = DwSql().getDwSqlContent(sqlFile, date) 21 | 22 | return self.getRegisterInstance('hiveDbModel').batchExecuteSql(sqlContent) 23 | 24 | 25 | # 获取 hive 总条数 26 | def getHiveTableCount(self, date): 27 | formatDate = self.getRegisterInstance('dateModel').dateToFormatYmd(date) 28 | querySql = "SELECT COUNT(*) AS c FROM test.hive_table_history WHERE p_dt='"+formatDate+"';" 29 | return self.getRegisterInstance('hiveDbModel').count(querySql) 30 | 31 | -------------------------------------------------------------------------------- /cluster_task/hive_tb_store.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | u''' 4 | 处理修改 hive 表的存储格式 5 | 6 | 适用方法: 7 | ./index.py --service cluster_task --module 
hive_tb_store --parameter '' 8 | ''' 9 | 10 | from dw_service_core import DwServiceCore 11 | 12 | class HiveTbStore(DwServiceCore) : 13 | 14 | def process(self): 15 | self.hiveTbCompress('db_gather', 'angejia__article_quiz', "p_dt", 'ORC') 16 | 17 | 18 | u'获取表字段' 19 | def getHiveTbFields(self, dbName, tbName, filterFields = []): 20 | # 获取源数据表字段 21 | sourceTableFields = self.getRegisterInstance('hiveModel').getFileds(dbName, tbName) 22 | formatTableFieldsList = [] 23 | for curField in sourceTableFields: 24 | if (curField in filterFields) : 25 | continue 26 | formatTableFieldsList.append('`' + curField + '`') 27 | formatSourceTableFieldsStr = ' String,'.join(formatTableFieldsList) + " String" 28 | return formatSourceTableFieldsStr 29 | 30 | u''' 31 | hive 表的压缩格式处理 32 | storedType: 表格式, ORC 33 | partition: 分区字段 34 | ''' 35 | def hiveTbCompress(self, dbName, tbName, partition, storedType): 36 | u''' 37 | 处理思路 38 | 使用动态分区: 39 | set hive.exec.dynamic.partition=true; 40 | set hive.exec.dynamic.partition.mode=nonstrict; 41 | set hive.exec.max.dynamic.partitions=100000; 42 | set hive.exec.max.dynamic.partitions.pernode=100000; 43 | 1. 复制原始表到 dw_history_db 中, 做一个备份 44 | CREATE TABLE dw_history_db.angejia__broker LIKE db_gather.angejia__broker; 45 | INSERT OVERWRITE TABLE dw_history_db.angejia__broker PARTITION(p_dt) SELECT * FROM db_gather.angejia__broker; 46 | 2. 删除原始表, 并创建新的存储格式的数据表 47 | DROP TABLE db_gather.angejia__broker; 48 | CREATE TABLE db_gather.angejia__broker( 49 | xxx.xxx 这里的字段使用 dw_history_db.angejia__broker 表中的字段 50 | ) PARTITIONED BY ( 51 | `p_dt` String 52 | ) 53 | STORED AS ORC; 54 | 3. 
把备份数据写入到新格式的表中 55 | INSERT OVERWRITE TABLE db_gather.angejia__broker PARTITION(p_dt) SELECT * FROM dw_history_db.angejia__broker; 56 | ''' 57 | historyDb = "test" 58 | 59 | # 获取表字段 60 | sourceTbFields = self.getHiveTbFields(dbName, tbName, [partition]) 61 | 62 | eltSql = ''' 63 | set hive.exec.dynamic.partition=true; 64 | set hive.exec.dynamic.partition.mode=nonstrict; 65 | set hive.exec.max.dynamic.partitions=100000; 66 | set hive.exec.max.dynamic.partitions.pernode=100000; 67 | 68 | set hive.exec.parallel=false; 69 | set mapred.child.java.opts=-Xmx16384M; 70 | set mapreduce.map.java.opts=-Xmx8192M; 71 | set mapreduce.reduce.java.opts=-Xmx16384M; 72 | set mapreduce.map.memory.mb=8192; 73 | set mapreduce.reduce.memory.mb=16384; 74 | 75 | -- 复制阶段 76 | CREATE TABLE IF NOT EXISTS %(targetTb)s LIKE %(sourceTb)s; 77 | INSERT OVERWRITE TABLE %(targetTb)s PARTITION(%(partition)s) SELECT * FROM %(sourceTb)s; 78 | 79 | -- 删除原始表, 并创建新的存储格式的数据表 80 | DROP TABLE IF EXISTS %(sourceTb)s; 81 | CREATE TABLE %(sourceTb)s( 82 | %(sourceTbFields)s 83 | ) PARTITIONED BY ( 84 | %(partition)s String 85 | ) 86 | STORED AS %(storedType)s; 87 | 88 | -- 把数据写入原始表 89 | INSERT OVERWRITE TABLE %(sourceTb)s PARTITION(%(partition)s) SELECT * FROM %(targetTb)s; 90 | 91 | '''% {'sourceTb': dbName + '.' + tbName, 92 | 'targetTb': historyDb + '.' 
+ tbName, 93 | 'partition': partition, 94 | 'storedType' : storedType, 95 | 'sourceTbFields' : sourceTbFields 96 | } 97 | 98 | print eltSql 99 | #result = self.getRegisterInstance('hiveModel').runHiveScript(eltSql) 100 | #if (result['code'] == 0) : print result['code'] 101 | 102 | -------------------------------------------------------------------------------- /cluster_task/indicator_system.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | u''' 4 | dw 指标体系 5 | ''' 6 | 7 | from dw_service_core import DwServiceCore 8 | 9 | class IndicatorSystem(DwServiceCore) : 10 | u''' 11 | 获取数据表记录条数 12 | tableName 数据表 13 | date 日期 yyyymmdd 14 | ''' 15 | def getTableCountForDate(self, tableName, date): 16 | formatDate = self.getRegisterInstance('dateModel').dateToFormatYmd(date) 17 | querySql = "SELECT COUNT(*) AS c FROM " + tableName + " WHERE p_dt='" + formatDate + "'" 18 | 19 | return self.getRegisterInstance('sparkModel').queryCount(querySql) 20 | 21 | 22 | u''' 23 | 获取当前 HDFS MB 已使用的容量 24 | ''' 25 | def getHdfsMB(self): 26 | hdfsSize = self.getRegisterInstance('hadoopModel').getHdfsDirSizeForSSH("/") 27 | 28 | dataMb = hdfsSize.get('dataSize') / 1024 / 1024 29 | hdfsMb = hdfsSize.get('hdfsSize') / 1024 / 1024 30 | 31 | return {'dataMb' : dataMb, 'hdfsMb' : hdfsMb} 32 | 33 | 34 | u''' 35 | 修改指标体系的数据 36 | date 日期 yyyymmdd 37 | field 修改的字段 38 | value 值 39 | ''' 40 | def modifyIndicatorSystem(self, date, field, value): 41 | formatDate = self.getRegisterInstance('dateModel').dateToFormatYmd(date) 42 | 43 | # 查询当天数据是否存在 44 | queryCurDateDateSql = "SELECT id FROM dw_service.indicator_system_sd WHERE p_dt='" + formatDate + "';" 45 | data = self.getRegisterInstance('biDbModel').queryOne(queryCurDateDateSql) 46 | 47 | if (data == None): 48 | curRunSql = "INSERT INTO dw_service.indicator_system_sd(p_dt," + str(field) + ") values('" + str(formatDate) + "','" + str(value) + "')" 49 | else: 50 | curRunSql = "UPDATE 
dw_service.indicator_system_sd SET " + str(field) + " = '" + str(value) + "' WHERE id = " + str(data.get('id')) 51 | 52 | return self.getRegisterInstance('biDbModel').updataData(curRunSql) -------------------------------------------------------------------------------- /cluster_task/indicator_system_run.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | u''' 4 | dw 指标体系 5 | 6 | 调用方式 7 | 指定日期调用: 8 | ./index.py --service cluster_task --module indicator_system_run --parameter '{"date":"20151010"}' 9 | 10 | 以当前日期为准调用,进行偏移值调用 yesterday ,today,tomorrow 11 | ./index.py --service cluster_task --module indicator_system_run --parameter '{"date":"today"}' 12 | 13 | ''' 14 | 15 | from core.util.log.logger import Logger 16 | 17 | from cluster_task.indicator_system import IndicatorSystem 18 | from cluster_task.scheduler_job import SchedulerJob 19 | from cluster_task.minireport_job import MinireportJob 20 | from cluster_task.hive_metadata import HiveMetadata 21 | 22 | class IndicatorSystemRun(IndicatorSystem) : 23 | 24 | 25 | def process(self): 26 | Logger.init() 27 | 28 | pars = self.getFormatParameter() 29 | 30 | u'日期' 31 | parsDate = pars.get('date') 32 | 33 | if (parsDate != None ): 34 | 35 | if (parsDate == "today") : 36 | date = self.getRegisterInstance('dateModel').getToday() 37 | elif (parsDate == "tomorrow") : 38 | date = self.getRegisterInstance('dateModel').getTomorrow() 39 | elif (parsDate == "yesterday") : 40 | date = self.getRegisterInstance('dateModel').getYesterday() 41 | else : 42 | date = parsDate 43 | else : 44 | u'默认是昨天日期 ,格式: 20151010' 45 | date = self.getRegisterInstance('dateModel').getYesterday() 46 | 47 | print date 48 | 49 | self.schedulerJob(date) 50 | self.minireportJob(date) 51 | self.hiveMetadata(date) 52 | self.dataWarehouse(date) 53 | self.hdfsMonitor(date) 54 | 55 | 56 | u' 调度指标体系任务' 57 | def schedulerJob(self, date): 58 | sc = SchedulerJob() 59 | 60 | # 跑存储过程 61 | 
sc.storedProcedure(date) 62 | 63 | # job 总数量 64 | jobCount = sc.getJobCount(date) 65 | etlJobCnStatus = self.modifyIndicatorSystem(date, 'etl_job_cn', jobCount) 66 | 67 | # job 总平均运行时间 68 | jobAvg = sc.getJobTotalAvgTime(date) 69 | etlJobRunAvgStatus = self.modifyIndicatorSystem(date, 'etl_job_run_avg', jobAvg) 70 | 71 | # 输出日志 72 | Logger.info("------------------------------") 73 | Logger.info("etl_job_cn: " + str(jobCount) + " ," + str(etlJobCnStatus) ) 74 | Logger.info("etl_job_run_avg: " + str(jobAvg) + " ," + str(etlJobRunAvgStatus) ) 75 | Logger.info("------------------------------") 76 | 77 | 78 | u' minireport 指标体系任务' 79 | def minireportJob(self, date): 80 | 81 | mr = MinireportJob() 82 | 83 | # 跑存储过程 84 | mr.storedProcedure(date) 85 | 86 | # minireprot 总数量 87 | minireportCount = mr.getMinireportCount(date) 88 | minireportCnStatus = self.modifyIndicatorSystem(date, 'minireport_cn', minireportCount) 89 | 90 | # minireprot总平均运行时间 91 | minireportAvg = mr.getMinireportTotalAvgTime(date) 92 | minireportRunAvgStatus = self.modifyIndicatorSystem(date, 'minireport_run_avg', minireportAvg) 93 | 94 | # 输出日志 95 | Logger.info("------------------------------") 96 | Logger.info("minireport_cn: " + str(minireportCount) + " ," + str(minireportCnStatus) ) 97 | Logger.info("minireport_run_avg: " + str(minireportAvg) + " ," + str(minireportRunAvgStatus) ) 98 | Logger.info("------------------------------") 99 | 100 | 101 | u' hive 元数据监控' 102 | def hiveMetadata(self, date): 103 | hm = HiveMetadata() 104 | 105 | # 跑存储过程 106 | hm.storedProcedure(date) 107 | 108 | # hive 数据表总条数 109 | hive_table_cn = hm.getHiveTableCount(date) 110 | hiveTableCnStatus = self.modifyIndicatorSystem(date, 'hive_table_cn', hive_table_cn) 111 | 112 | # 输出日志 113 | Logger.info("------------------------------") 114 | Logger.info("hive_table_cn: " + str(hive_table_cn) + " ," + str(hiveTableCnStatus) ) 115 | Logger.info("------------------------------") 116 | 117 | 118 | u' 数据仓库监控' 119 | def 
dataWarehouse(self, date): 120 | # 获取昨天日期,因为每次统一的是当前天数昨天的日期 121 | offsetDay = self.getRegisterInstance('dateModel').getOffsetDateDay(date, -1) 122 | 123 | # dw_web_visit_traffic_log 数据表统计 124 | dwWebVisitTrafficLogCn = self.getTableCountForDate('dw_db.dw_web_visit_traffic_log', offsetDay); 125 | dwWebVisitTrafficLogCnStatus = self.modifyIndicatorSystem(date, 'dw_web_visit_traffic_log', dwWebVisitTrafficLogCn) 126 | 127 | # dw_web_action_detail_log 数据表统计 128 | dwWebActionDetailLogCn = self.getTableCountForDate('dw_db.dw_web_action_detail_log', offsetDay); 129 | dwWebActionDetailLogCnStatus = self.modifyIndicatorSystem(date, 'dw_web_action_detail_log', dwWebActionDetailLogCn) 130 | 131 | # dw_app_access_log 数据表统计 132 | dwAppAccessLogCn = self.getTableCountForDate('dw_db.dw_app_access_log', offsetDay); 133 | dwAppAccessLogCnStatus = self.modifyIndicatorSystem(date, 'dw_app_access_log', dwAppAccessLogCn) 134 | 135 | # dw_app_action_detail_log 数据表统计 136 | dwAppActionDetailLogCn = self.getTableCountForDate('dw_db.dw_app_action_detail_log', offsetDay); 137 | dwAppActionDetailLogCnStatus = self.modifyIndicatorSystem(date, 'dw_app_action_detail_log', dwAppActionDetailLogCn) 138 | 139 | # dw_property_inventory_sd 数据表统计 140 | dwPropertyInventorySdCn = self.getTableCountForDate('dw_db.dw_property_inventory_sd', offsetDay); 141 | dwPropertyInventorySdCnStatus = self.modifyIndicatorSystem(date, 'dw_property_inventory_sd', dwPropertyInventorySdCn) 142 | 143 | # 输出日志 144 | Logger.info("------------------------------") 145 | Logger.info(str(date) + " " + str(offsetDay)) 146 | Logger.info("dw_web_visit_traffic_log: " + str(dwWebVisitTrafficLogCn) + " ," + str(dwWebVisitTrafficLogCnStatus) ) 147 | Logger.info("dw_web_action_detail_log: " + str(dwWebActionDetailLogCn) + " ," + str(dwWebActionDetailLogCnStatus) ) 148 | Logger.info("dw_app_access_log: " + str(dwAppAccessLogCn) + " ," + str(dwAppAccessLogCnStatus) ) 149 | Logger.info("dw_app_action_detail_log: " + 
str(dwAppActionDetailLogCn) + " ," + str(dwAppActionDetailLogCnStatus) ) 150 | Logger.info("dw_property_inventory_sd: " + str(dwPropertyInventorySdCn) + " ," + str(dwPropertyInventorySdCnStatus) ) 151 | Logger.info("------------------------------") 152 | 153 | 154 | 155 | u' 监控 hdfs 指标' 156 | def hdfsMonitor(self, date): 157 | rs = self.getHdfsMB() 158 | self.modifyIndicatorSystem(date, 'hadoop_data', rs.get('dataMb')) 159 | self.modifyIndicatorSystem(date, 'hadoop_hdfs', rs.get('hdfsMb')) 160 | 161 | # 输出日志 162 | Logger.info("------------------------------") 163 | Logger.info("hadoop_data: " + str(rs.get('dataMb')) ) 164 | Logger.info("hadoop_hdfs: " + str(rs.get('hdfsMb')) ) 165 | Logger.info("------------------------------") 166 | 167 | -------------------------------------------------------------------------------- /cluster_task/minireport_job.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | from dw_service_core import DwServiceCore 4 | from cluster_task.dw_sql import DwSql 5 | 6 | u''' 7 | MinireportJob 监控 8 | ''' 9 | 10 | class MinireportJob(DwServiceCore) : 11 | 12 | # 执行存储过程 13 | def storedProcedure(self, date): 14 | # 存储构成的 sql 文件 15 | sqlFile = 'monitor/monitor_minireport.sql' 16 | 17 | sqlContent = DwSql().getDwSqlContent(sqlFile, date) 18 | 19 | return self.getRegisterInstance('biDbModel').batchExecuteSql(sqlContent) 20 | 21 | 22 | # 获取 Minireport 数量 23 | def getMinireportCount(self, date): 24 | formatDate = self.getRegisterInstance('dateModel').dateToFormatYmd(date) 25 | querySql = "SELECT COUNT(*) AS c FROM dw_service.minireport_job_sd WHERE p_dt='"+formatDate+"';" 26 | return self.getRegisterInstance('biDbModel').count(querySql) 27 | 28 | 29 | # 获取所有 Minireport 总的运行时间 30 | def getMinireportTotalRunTime(self, date): 31 | formatDate = self.getRegisterInstance('dateModel').dateToFormatYmd(date) 32 | querySql = "SELECT SUM(last_second) AS c FROM dw_service.minireport_job_sd WHERE 
#coding=utf-8

from dw_service_core import DwServiceCore
from cluster_task.dw_sql import DwSql

u'''
SchedulerJob monitoring: populates dw_service.scheduler_job_sd through a
stored procedure and exposes aggregate job counters for the indicator system.
'''

class SchedulerJob(DwServiceCore) :


    # Run the stored procedure that (re)builds scheduler_job_sd for the date
    def storedProcedure(self, date):
        sqlFile = 'monitor/monitor_scheduler_job.sql'

        sqlContent = DwSql().getDwSqlContent(sqlFile, date)

        return self.getRegisterInstance('biDbModel').batchExecuteSql(sqlContent)


    # Number of jobs recorded for the date (date is yyyymmdd)
    def getJobCount(self, date):
        formatDate = self.getRegisterInstance('dateModel').dateToFormatYmd(date)
        querySql = "SELECT COUNT(*) AS c FROM dw_service.scheduler_job_sd WHERE p_dt='"+formatDate+"';"
        return self.getRegisterInstance('biDbModel').count(querySql)


    # Total runtime (seconds) of all jobs for the date.
    # SUM() yields NULL when there are no rows, so this can return None.
    def getJobTotalRunTime(self, date):
        formatDate = self.getRegisterInstance('dateModel').dateToFormatYmd(date)
        querySql = "SELECT SUM(last_second) AS c FROM dw_service.scheduler_job_sd WHERE p_dt='"+formatDate+"';"
        return self.getRegisterInstance('biDbModel').count(querySql)


    # Average job runtime for the date.
    # Guards the empty-day case: the original raised ZeroDivisionError when
    # no jobs existed, and TypeError when SUM() returned NULL.
    def getJobTotalAvgTime(self,date):
        jobCount = self.getJobCount(date)
        totalTime = self.getJobTotalRunTime(date)
        if (not jobCount or totalTime == None):
            return 0
        return int(totalTime / jobCount)
#coding=utf-8
import ConfigParser

from core.core import Core

class Conf:

    # lazily-created ConfigParser; populated on first construction
    cf = None

    def __init__(self,file_name = 'template'):
        u'''Load <confPath>/<file_name>.conf into a ConfigParser.'''
        if self.cf is None:
            self.cf = ConfigParser.ConfigParser()
            self.cf.read(Core.SystemPath('confPath') + "/" + file_name +".conf");

    u'''
    Value stored under section *category*, option *key*.
    '''
    def getConf(self,category,key):
        return self.cf.get(category, key)

    def setConf(self,category,key,val):
        self.cf.set(category,key,val)

    u'''
    All section names present in the configuration file.
    '''
    def getConfSections(self):
        return self.cf.sections()

    u'''
    All option keys defined under the given section.
    '''
    def getConfOptions(self,category):
        return self.cf.options(category)
hive_db_dir = /user/hive/db_sync 11 | 12 | 13 | # 聚合配置 14 | [gather_table] 15 | # hive 源数据库 16 | hive_source_db = db_sync 17 | # hive 目标数据库 18 | hive_target_db = db_gather 19 | 20 | # 镜像表配置 21 | [snapshot_table] 22 | # hive 源数据库 23 | hive_source_db = db_sync 24 | # hive 目标数据库 25 | hive_target_db = db_snapshot 26 | 27 | # 监控配置 28 | [monitor] 29 | -------------------------------------------------------------------------------- /core/conf/system.conf: -------------------------------------------------------------------------------- 1 | # mysql 业务 db 2 | [mysql_produce_db] 3 | host = 10.10.39.153 4 | user = angejia_dw 5 | password = Th872havAyaxEmEB 6 | port = 3306 7 | 8 | 9 | # mysql dw 本部门 db 10 | [mysql_bi_db] 11 | host = 10.10.64.146 12 | user = hadoop 13 | password = angejia888 14 | port = 3306 15 | 16 | 17 | # mysql hive 元数据库 db 18 | [mysql_hive_db] 19 | host = uhadoop-ociicy-master1 20 | user = hadoop 21 | password = angejia888 22 | port = 3306 23 | 24 | 25 | [hadoop] 26 | hadoop_user = user 27 | hadoop_namenode_account = hadoop 28 | hadoop_namenode_host = uhadoop-ociicy-master1 29 | 30 | 31 | [hive_server2] 32 | host = uhadoop-ociicy-master2 33 | port = 10000 34 | user = dwadmin 35 | password = dwadmin 36 | hive_home = /usr/local/hive 37 | 38 | 39 | [spark_server] 40 | host = uhadoop-ociicy-task4 41 | port = 10002 42 | user = hadoop 43 | password = hadoop 44 | spark_home = /usr/local/spark 45 | 46 | 47 | [sqoop] 48 | sqoop_home=/usr/local/sqoop 49 | 50 | 51 | #邮件 52 | [bi_mail] 53 | mutt_bin = /usr/local/mutt/bin 54 | smtp_server = smtp.angejia.com 55 | username = bi02@angejia.com 56 | password = Angejia2015 57 | sender = bi02@angejia.com 58 | receiver = jason@angejia.com 59 | 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /core/conf/task.conf: -------------------------------------------------------------------------------- 1 | [dw_sql] 2 | dir = /home/dwadmin/app/dw_sql 3 | #dir = 
/home/dwadmin/develop/jason/dw_sql 4 | 5 | -------------------------------------------------------------------------------- /core/conf/template.conf: -------------------------------------------------------------------------------- 1 | [portal] 2 | host = localhost123 3 | port = 80804 4 | url = http://%(host)s:%(port)s/Portal 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /core/conf/uba_log.conf: -------------------------------------------------------------------------------- 1 | [uba_log_server] 2 | server_user = dwadmin 3 | server_host = 10.10.2.91 4 | server_port = 22 5 | 6 | 7 | [dw_hive_udf] 8 | udf_jar=123123123 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /core/core.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import sys,os 3 | 4 | from util.base.camel import Camel 5 | 6 | class Core: 7 | 8 | u'静态环境变量' 9 | SYSTEM_PATH = {} 10 | 11 | @staticmethod 12 | def SystemPath(key = None): 13 | 14 | if (len(Core.SYSTEM_PATH) == 0): 15 | basePath = sys.path[0]; 16 | Core.SYSTEM_PATH.update({ 17 | 'basePath':basePath, 18 | 'corePath':basePath + "/core", 19 | 'confPath':basePath + "/core/conf", 20 | 'modelPath':basePath + "/core/model", 21 | 'utilPath':basePath + "/core/util", 22 | 'shellPath':basePath + "/core/shell", 23 | 'tmpPath':basePath + "/tmp", 24 | 25 | }); 26 | 27 | if (key == None): 28 | return Core.SYSTEM_PATH 29 | else: 30 | return Core.SYSTEM_PATH[key] 31 | 32 | 33 | #获取 配置文件对象接口 34 | CONF_OBJ = {} 35 | @staticmethod 36 | def getConfInterface(fileName = 'coreConf'): 37 | # 单例 38 | if (Core.CONF_OBJ.has_key(fileName) == False): 39 | from conf.conf import Conf 40 | #默认返回 core 配置 41 | if (fileName == 'coreConf'): 42 | Core.CONF_OBJ[fileName] = Conf() 43 | else: 44 | Core.CONF_OBJ[fileName] = Conf(fileName) 45 | 46 | return Core.CONF_OBJ[fileName] 47 | 48 | 49 | # 获取模型接口对象 50 | MODEL_OBJ = {} 51 | 
@staticmethod 52 | def getModelInterface(modelName): 53 | # 单例 54 | if (Core.MODEL_OBJ.has_key(modelName) == False): 55 | # 实例化指定模型对象 56 | str = 'from model.' + Camel.camelToUnderline(modelName) + '_model import ' + modelName + 'Model'; 57 | str += "\n"; 58 | str += 'Core.MODEL_OBJ[modelName] = ' + modelName + 'Model()' 59 | exec(str) 60 | 61 | return Core.MODEL_OBJ[modelName] 62 | -------------------------------------------------------------------------------- /core/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonWiki/dw_etl/a679c0462006cd7c35e3b3f3d00e25c49a55a983/core/model/__init__.py -------------------------------------------------------------------------------- /core/model/bi_db_model.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | from core.model.conf_model import ConfModel 3 | from core.util.mysql.mysql_interface import MysqlInterface 4 | 5 | u''' 6 | 数据部 mysql 服务器模型 7 | ''' 8 | class BiDbModel(MysqlInterface): 9 | 10 | def __init__(self): 11 | systemConf = ConfModel.getSystemConf() 12 | dict = { 13 | 'host':systemConf['mysql_bi_db']['host'], 14 | 'port':int(systemConf['mysql_bi_db']['port']), 15 | 'user':systemConf['mysql_bi_db']['user'], 16 | 'passwd':systemConf['mysql_bi_db']['password'], 17 | 'db':'dw_service', 18 | 'charset':'utf8' 19 | } 20 | MysqlInterface.__init__(self,dict) 21 | 22 | 23 | # 获取数据抽取的数据表信息 24 | def getExtractMysqlTables(self, extractType): 25 | sql = "SELECT * FROM dw_service.extract_table WHERE is_delete=0 AND extract_type='" + str(extractType) + "' ORDER BY extract_type ASC,id ASC" 26 | return self.queryAll(sql) 27 | 28 | 29 | # 获取抽取数据表的扩展信息 30 | def getExtractMysqlTableExt(self, tbId): 31 | sql = "SELECT * FROM dw_service.extract_table_ext WHERE tb_id = " + str(tbId) + " AND is_del = 0" 32 | return self.queryOne(sql) 33 | 34 | 35 | # 获取 gather 数据表信息 36 | def getGatherTables(self): 37 
| sql = "SELECT * FROM dw_service.extract_table WHERE is_delete = 0 AND is_gather = 1 ORDER BY id ASC" 38 | return self.queryAll(sql) 39 | 40 | 41 | # 获取 Extract 日志数据 42 | def getExtractLogForDate(self,date): 43 | sql = "SELECT * FROM dw_service.extract_log WHERE date_format(created_at,'%Y-%m-%d') = '" + date + "' ORDER BY id DESC;" 44 | return self.queryAll(sql) -------------------------------------------------------------------------------- /core/model/conf_model.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | u''' 4 | 配置对象模型实例, 5 | 所有的读取配置,都要通过此类来处理 6 | ''' 7 | 8 | from core.core import Core 9 | 10 | class ConfModel: 11 | 12 | u''' 13 | 私有方法 14 | 组成 Conf 成字典 15 | ''' 16 | @staticmethod 17 | def _mergerConfToDict(conf): 18 | systemInterface = Core.getConfInterface(conf) 19 | 20 | result = {} 21 | 22 | u'获取所有大类的 key ' 23 | sections = systemInterface.getConfSections() 24 | 25 | for curSection in sections: 26 | 27 | u'大类下的小类' 28 | curOptions = systemInterface.getConfOptions(curSection) 29 | 30 | sectionResult = {} 31 | for curKey in curOptions : 32 | sectionResult[curKey] = systemInterface.getConf(curSection,curKey) 33 | 34 | result[curSection] = sectionResult 35 | 36 | return result 37 | 38 | u'获取 Core 系统环境变量' 39 | @staticmethod 40 | def getCoreSystemPath(): 41 | return Core.SystemPath() 42 | 43 | 44 | u'系统配置' 45 | SYSTEM_CONF = None 46 | @staticmethod 47 | def getSystemConf(): 48 | if (ConfModel.SYSTEM_CONF == None): 49 | ConfModel.SYSTEM_CONF = ConfModel._mergerConfToDict('system') 50 | 51 | return ConfModel.SYSTEM_CONF 52 | 53 | 54 | u'uba 配置' 55 | UBA_LOG_CONF = None 56 | @staticmethod 57 | def getUbaLogConf(): 58 | if (ConfModel.UBA_LOG_CONF == None): 59 | ConfModel.UBA_LOG_CONF = ConfModel._mergerConfToDict('uba_log') 60 | return ConfModel.UBA_LOG_CONF 61 | 62 | 63 | u'task 配置' 64 | TASK_CONF = None 65 | @staticmethod 66 | def getTaskConf(): 67 | if (ConfModel.TASK_CONF == None): 68 | 
ConfModel.TASK_CONF = ConfModel._mergerConfToDict('task') 69 | return ConfModel.TASK_CONF 70 | 71 | 72 | u'抽取 配置' 73 | EXTRACT_CONF = None 74 | @staticmethod 75 | def getExtractConf(): 76 | if (ConfModel.EXTRACT_CONF == None): 77 | ConfModel.EXTRACT_CONF = ConfModel._mergerConfToDict('extract') 78 | return ConfModel.EXTRACT_CONF 79 | -------------------------------------------------------------------------------- /core/model/date_model.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | u''' 3 | 日期模型对象,所有日期对象都从此类出 4 | ''' 5 | 6 | from core.util.base.date import Date 7 | 8 | class DateModel: 9 | 10 | u''' 11 | 获取昨天日期: 格式,20151029 12 | ''' 13 | @staticmethod 14 | def getYesterday(): 15 | timestamp = Date.getOffsetDate(-1) 16 | return Date.timestampToFormatDate(timestamp,"%Y%m%d") 17 | 18 | u''' 19 | 获取昨天日期: 格式,2015-10-29 20 | ''' 21 | @staticmethod 22 | def getYesterdayByYmd(): 23 | timestamp = Date.getOffsetDate(-1) 24 | return Date.timestampToFormatDate(timestamp,"%Y-%m-%d") 25 | 26 | u''' 27 | 获取今天日期 : 格式,20151030 28 | ''' 29 | @staticmethod 30 | def getToday(): 31 | timestamp = Date.getOffsetDate(0) 32 | return Date.timestampToFormatDate(timestamp,"%Y%m%d") 33 | 34 | u''' 35 | 获取今天日期 : 格式,2015-10-30 36 | ''' 37 | @staticmethod 38 | def getTodayByYmd(): 39 | timestamp = Date.getOffsetDate(0) 40 | return Date.timestampToFormatDate(timestamp,"%Y-%m-%d") 41 | 42 | u''' 43 | 获取明天日期 : 格式,20151031 44 | ''' 45 | @staticmethod 46 | def getTomorrow(): 47 | timestamp = Date.getOffsetDate(1) 48 | return Date.timestampToFormatDate(timestamp,"%Y%m%d") 49 | 50 | u''' 51 | 获取明天日期 : 格式,2015-10-31 52 | ''' 53 | @staticmethod 54 | def getTomorrowByYmd(): 55 | timestamp = Date.getOffsetDate(1) 56 | return Date.timestampToFormatDate(timestamp,"%Y-%m-%d") 57 | 58 | u''' 59 | 转换日期格式 : 20151031 -> 2015-10-31 60 | ''' 61 | @staticmethod 62 | def dateToFormatYmd(date): 63 | return Date.dateToFormatDate(date,"%Y%m%d","%Y-%m-%d") 64 | 
65 | u''' 66 | 获取当前时间 67 | ''' 68 | @staticmethod 69 | def getCurrentTime(): 70 | return Date.timestampToFormatDate(Date.getTimestamp(),"%Y-%m-%d %H:%M:%S") 71 | 72 | u'获取当前时间戳' 73 | @staticmethod 74 | def getTimestamp(): 75 | return Date.getTimestamp() 76 | 77 | u'根据时间戳,格式化日期' 78 | @staticmethod 79 | def timestampToFormatDate(timestamp,dateFormat = "%Y-%m-%d %H:%M:%S"): 80 | return Date.timestampToFormatDate(timestamp,dateFormat) 81 | 82 | u''' 83 | 获取指定日期偏移天数 84 | ''' 85 | @staticmethod 86 | def getOffsetDateDay(date, offsetDay, dateFormat = "%Y%m%d"): 87 | offsetDate = Date.offsetDateDay(date, offsetDay ,dateFormat) 88 | 89 | return Date.dateToFormatDate(offsetDate,"%Y-%m-%d",dateFormat) 90 | -------------------------------------------------------------------------------- /core/model/hadoop_model.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | from core.model.conf_model import ConfModel 3 | from core.util.base.process import Process 4 | 5 | class HadoopModel: 6 | 7 | u''' 8 | hive 执行脚本命令接口 9 | ''' 10 | def hadoopCommand(self,command): 11 | return Process.runScriptSync(command) 12 | 13 | 14 | u'获取 hdfs 目录容量' 15 | def getHdfsDirSize(self, hdfsDir): 16 | hdfsDir = self.hadoopCommand('hdfs dfs -du '+ hdfsDir) 17 | result = {} 18 | if (hdfsDir['code'] == 0): 19 | strToList = hdfsDir['stdoutPut'].split('\n') 20 | filterList = strToList[3:-1] 21 | for curDir in filterList: 22 | lineDetail = curDir.split() 23 | result[lineDetail[2]] = { 24 | 'fileSize': lineDetail[0], 25 | 'blockSize':lineDetail[1] 26 | } 27 | return result 28 | 29 | 30 | u''' 31 | ssh 连接到 hadoop namenode 服务器远程执行命令 32 | 本机 ~/.ssh/id_rsa.pub 公钥,需要添加 namenode 服务器中 33 | return 字节 34 | 转为 KB 为 x / 1024 35 | 转为 MB 为 x / 1024 / 1024 36 | 转为 GB 为 x / 1024 / 1024 / 1024 37 | ''' 38 | def getHdfsDirSizeForSSH(self, hdfsDir): 39 | sysConf = ConfModel.getSystemConf() 40 | nameNodeAccount = sysConf.get('hadoop').get('hadoop_namenode_account') 41 | 
nameNodeHost = sysConf.get('hadoop').get('hadoop_namenode_host') 42 | 43 | command = "hdfs dfs -du -s " + hdfsDir 44 | commandRs = Process.sshCommand(nameNodeAccount, nameNodeHost, command) 45 | 46 | rs = {} 47 | if (commandRs.get('code') == 0): 48 | strToList = commandRs.get('stdoutPut').split('\n') 49 | # 过滤多余的行 50 | line = "" 51 | for curLine in strToList : 52 | if (len(curLine) == 0): 53 | continue 54 | elif ( "bash" in curLine) : 55 | continue 56 | else : 57 | line = curLine 58 | filterList = line.split() 59 | rs['dir'] = filterList[2] 60 | rs['dataSize'] = int(filterList[0]) 61 | rs['hdfsSize'] = int(filterList[1]) 62 | else: 63 | print commandRs.get('erroutPut') 64 | return rs 65 | -------------------------------------------------------------------------------- /core/model/hive_db_model.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | from core.model.conf_model import ConfModel 3 | from core.util.mysql.mysql_interface import MysqlInterface 4 | 5 | u''' 6 | hive 元数据库 7 | ''' 8 | class HiveDbModel(MysqlInterface): 9 | 10 | def __init__(self): 11 | systemConf = ConfModel.getSystemConf() 12 | dict = { 13 | 'host':systemConf['mysql_hive_db']['host'], 14 | 'port':int(systemConf['mysql_hive_db']['port']), 15 | 'user':systemConf['mysql_hive_db']['user'], 16 | 'passwd':systemConf['mysql_hive_db']['password'], 17 | 'db':'hive', 18 | 'charset':'utf8' 19 | } 20 | MysqlInterface.__init__(self,dict) 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /core/model/hive_model.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | from core.model.conf_model import ConfModel 3 | from core.util.hive.hive_interface import HiveInterface 4 | 5 | 6 | class HiveModel(HiveInterface): 7 | 8 | def __init__(self): 9 | systemConf = ConfModel.getSystemConf() 10 | dict = { 11 | 'host':systemConf['hive_server2']['host'], 12 | 
'port':systemConf['hive_server2']['port'], 13 | 'user':systemConf['hive_server2']['user'], 14 | 'password':systemConf['hive_server2']['password'], 15 | 'hiveHome' : systemConf['hive_server2']['hive_home'] 16 | 17 | } 18 | HiveInterface.__init__(self,dict) 19 | -------------------------------------------------------------------------------- /core/model/mail_model.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | u''' 3 | 日期模型对象,所有日期对象都从此类出 4 | ''' 5 | from core.model.conf_model import ConfModel 6 | from core.util.mail.mail import Mail 7 | 8 | class MailModel: 9 | 10 | @staticmethod 11 | def DefautlSendMail(subject,content): 12 | systemConf = ConfModel.getSystemConf() 13 | 14 | mailBaseInfo = { 15 | 'smtpServer' : systemConf['bi_mail']['smtp_server'], 16 | 'username' : systemConf['bi_mail']['username'], 17 | 'password' : systemConf['bi_mail']['password'], 18 | 'sender' : systemConf['bi_mail']['sender'], 19 | 'receiver' : systemConf['bi_mail']['receiver'].split(','), 20 | 'subject' : subject, 21 | 'content' : content 22 | } 23 | 24 | Mail.SendMail(mailBaseInfo) 25 | 26 | -------------------------------------------------------------------------------- /core/model/produce_db_model.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | from core.model.conf_model import ConfModel 3 | from core.util.mysql.mysql_interface import MysqlInterface 4 | 5 | u''' 6 | 生产环境数据库模型 7 | ''' 8 | class ProduceDbModel(MysqlInterface): 9 | 10 | def __init__(self): 11 | systemConf = ConfModel.getSystemConf() 12 | dict = { 13 | 'host':systemConf['mysql_produce_db']['host'], 14 | 'port':int(systemConf['mysql_produce_db']['port']), 15 | 'user':systemConf['mysql_produce_db']['user'], 16 | 'passwd':systemConf['mysql_produce_db']['password'], 17 | 'db':'angejia', 18 | 'charset':'utf8' 19 | } 20 | MysqlInterface.__init__(self,dict) 21 | 22 | 23 | 24 | 
-------------------------------------------------------------------------------- /core/model/spark_model.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | u''' 3 | spark-sql 模型 4 | 5 | 底层通讯,基于 HiveServer2 的实现 6 | ''' 7 | 8 | from core.model.conf_model import ConfModel 9 | from core.util.hive.hive_server2 import HiveServer2 10 | from core.util.hive.hive_interface import HiveInterface 11 | 12 | 13 | class SparkModel(HiveInterface): 14 | 15 | def __init__(self): 16 | systemConf = ConfModel.getSystemConf() 17 | dict = { 18 | 'host':systemConf['spark_server']['host'], 19 | 'port':systemConf['spark_server']['port'], 20 | 'user':systemConf['spark_server']['user'], 21 | 'password':systemConf['spark_server']['password'] 22 | } 23 | HiveInterface.__init__(self,dict) 24 | 25 | 26 | 27 | class SparkModelBak: 28 | 29 | sparkInterface = None 30 | def _getSparkInterface(self): 31 | systemConf = ConfModel.getSystemConf() 32 | 33 | u'避免多次创建连接实例' 34 | if (self.sparkInterface != None): 35 | return self.sparkInterface 36 | 37 | dict = { 38 | 'host':systemConf['spark_server']['host'], 39 | 'port':systemConf['spark_server']['port'], 40 | 'user':systemConf['spark_server']['user'], 41 | 'password':systemConf['spark_server']['password'], 42 | } 43 | 44 | self.sparkInterface = HiveServer2(dict) 45 | return self.sparkInterface 46 | 47 | 48 | u''' 49 | 创建数据表 50 | 返回 bool 值 51 | ''' 52 | def createTable(self,createTableSql): 53 | return self._getSparkInterface().execute([createTableSql]) 54 | 55 | u''' 56 | 删除表 57 | 返回 bool 值 58 | ''' 59 | def dropTable(self,dbTbName): 60 | dropTableSql = "DROP TABLE IF EXISTS " + dbTbName; 61 | return self._getSparkInterface().execute([dropTableSql]) 62 | 63 | 64 | u''' 65 | 执行 spark-sql 语句 66 | ''' 67 | def runSparkSqlBak(self,sqlContent): 68 | sqlContentList = sqlContent.split(';'); 69 | formatSqlContentList = sqlContentList[:-1]; 70 | 71 | return 
self._getSparkInterface().execute(formatSqlContentList) 72 | 73 | -------------------------------------------------------------------------------- /core/model/sqoop_model.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | from core.core import Core 3 | from core.model.conf_model import ConfModel 4 | from core.util.base.process import Process 5 | 6 | 7 | u''' 8 | Sqoop Model 9 | ''' 10 | class SqoopModel: 11 | 12 | # 业务 DB 服务配置 13 | DB_CONF = {} 14 | 15 | systemConf = {} 16 | systemCorePath = {} 17 | 18 | def __init__(self): 19 | self.systemConf = ConfModel.getSystemConf() 20 | self.systemCorePath = ConfModel.getCoreSystemPath() 21 | 22 | # 数据库服务器 23 | # 业务数据库 24 | PRODUCT_DB = 'product' 25 | # dw 数据仓库数据库 26 | DW_DB = 'dw' 27 | # 当前抽取的数据库服务器 28 | dbServer = None 29 | # 当前数据库指定 Model 30 | dbServerModel = None 31 | def setDbServer(self,dbServer): 32 | if (dbServer == SqoopModel.PRODUCT_DB): 33 | self.dbServer = { 34 | 'mysql_host' : self.systemConf['mysql_produce_db']['host'], 35 | 'mysql_user' : self.systemConf['mysql_produce_db']['user'], 36 | 'mysql_password' : self.systemConf['mysql_produce_db']['password'], 37 | 'mysql_port' : self.systemConf['mysql_produce_db']['port'], 38 | } 39 | self.dbServerModel = Core.getModelInterface('ProduceDb') 40 | elif (dbServer == SqoopModel.DW_DB): 41 | self.dbServer = { 42 | 'mysql_host' : self.systemConf['mysql_bi_db']['host'], 43 | 'mysql_user' : self.systemConf['mysql_bi_db']['user'], 44 | 'mysql_password' : self.systemConf['mysql_bi_db']['password'], 45 | 'mysql_port' : self.systemConf['mysql_bi_db']['port'], 46 | } 47 | self.dbServerModel = Core.getModelInterface('BiDb') 48 | def getDbServer(self): 49 | return self.dbServer 50 | 51 | 52 | # 源数据库 53 | sourceDb = None 54 | def setSourceDb(self,data): 55 | self.sourceDb = data 56 | def getSourceDb(self): 57 | return self.sourceDb 58 | 59 | 60 | # 源数据表 61 | sourceTable = None 62 | def setSourceTable(self,data): 63 | 
self.sourceTable = data 64 | def getSourceTable(self): 65 | return self.sourceTable 66 | 67 | 68 | # 目标数据库 69 | targetDb = None 70 | def setTargetDb(self,data): 71 | self.targetDb = data 72 | def getTargetDb(self): 73 | return self.targetDb 74 | 75 | 76 | # 目标数据表 77 | targetTable = None 78 | def setTargetTable(self,data): 79 | self.targetTable = data 80 | def getTargetTable(self): 81 | return self.targetTable 82 | 83 | 84 | # MapReduce 数量,默认 1 个 85 | mapReduceNum = 1 86 | def setMapReduceNum(self,data): 87 | self.mapReduceNum = data 88 | def getMapReduceNum(self): 89 | return self.mapReduceNum 90 | 91 | 92 | # 分割字符串,默认 \001 分割 93 | fieldsTerminated = "\\001" 94 | def setFieldsTerminated(self,data): 95 | self.fieldsTerminated = data 96 | def getFieldsTerminated(self): 97 | return self.fieldsTerminated 98 | 99 | 100 | # 导入 mysql 101 | def importMysqlToHive(self): 102 | 103 | # 当前选择数据库服务器 104 | dbServer = self.getDbServer() 105 | 106 | # 获取 带抽取的 Mysql 表结构 ,用来创建 Hive 数据表 107 | mysqlFileds = self.dbServerModel.getFileds(self.getSourceDb(),self.getSourceTable()) 108 | mysqlFiledsFormat = '=String,'.join(mysqlFileds) + "=String" 109 | 110 | # 执行脚本 111 | script = self.systemCorePath['shellPath'] + '/sqoop_import_mysql.sh ' 112 | # 参数 113 | script += '--sqoop_home \"'+self.systemConf['sqoop']['sqoop_home']+'\" ' 114 | script += '--local_tmp_dir \"'+self.systemCorePath['tmpPath']+'\" ' 115 | script += '--hdfs_sqoop_tmp_dir \"/tmp/sqoop\" ' 116 | script += '--mysql_host \"'+dbServer['mysql_host']+'\" ' 117 | script += '--mysql_port \"'+dbServer['mysql_port']+'\" ' 118 | script += '--mysql_user \"'+dbServer['mysql_user']+'\" ' 119 | script += '--mysql_password \"'+dbServer['mysql_password']+'\" ' 120 | script += '--mysql_database \"'+self.getSourceDb()+'\" ' 121 | script += '--mysql_table \"'+self.getSourceTable()+'\" ' 122 | script += '--hive_table \"'+self.getTargetDb()+'.'+self.getTargetTable()+'\" ' 123 | script += '--fields_terminated_by \"'+self.getFieldsTerminated()+'\" 
' 124 | script += '--map_column_hive_fields \"'+mysqlFiledsFormat+'\" ' 125 | script += '--mappers_num \"'+str(self.getMapReduceNum())+'\"' 126 | 127 | result = Process.runScriptSync(script) 128 | return result 129 | 130 | 131 | # hbase 指定 row_key 132 | hbaseRowKey = "row_key" 133 | def setHbaseRowKey(self,data): 134 | self.hbaseRowKey = data 135 | def getHbaseRowKey(self): 136 | return self.hbaseRowKey 137 | 138 | # hbase 指定 column_family 139 | hbaseColumnFamily = "row_key" 140 | def setHbaseColumnFamily(self,data): 141 | self.hbaseColumnFamily = data 142 | def getbaseColumnFamily(self): 143 | return self.hbaseColumnFamily 144 | 145 | # 导入 mysql 到 table 146 | def importMysqlToHbase(self, querySql = None): 147 | # 当前选择数据库服务器 148 | dbServer = self.getDbServer() 149 | 150 | tmpDir = self.systemCorePath['tmpPath'] + '/sqoop_outdir' 151 | targetDir = '/tmp/sqoop/' + self.getTargetTable() 152 | 153 | # 执行脚本 154 | script = self.systemConf['sqoop']['sqoop_home'] + '/bin/sqoop import ' 155 | script += '--connect \"jdbc:mysql://' + dbServer['mysql_host'] + ':' + dbServer['mysql_port'] + '/' + self.getSourceDb() + '?useUnicode=true&tinyInt1isBit=false&characterEncoding=utf-8\" ' 156 | script += '--username \"'+dbServer['mysql_user']+'\" ' 157 | script += '--password \"'+dbServer['mysql_password']+'\" ' 158 | script += '--hbase-create-table ' 159 | script += '--hbase-table \"'+self.getTargetTable()+'\" ' 160 | script += '--column-family \"'+self.getbaseColumnFamily()+'\" ' 161 | script += '--m \"'+str(self.getMapReduceNum())+'\" ' 162 | script += '--outdir \"'+ tmpDir +'\" ' 163 | script += '--target-dir \"' + targetDir + '\" ' 164 | script += '--delete-target-dir ' 165 | 166 | u' 表示整张表导入' 167 | if (querySql == None): 168 | rmTmpTable = 'rm ' + tmpDir + '/' + self.getSourceTable() + '.java' 169 | Process.runScriptSync(rmTmp) 170 | 171 | script += '--table \"'+self.getSourceTable()+'\" ' 172 | else : 173 | u' 清理临时文件' 174 | rmTmp = 'rm ' + tmpDir + '/QueryResult.java' 175 | 
Process.runScriptSync(rmTmp) 176 | 177 | u' 表示使用查询语句导入' 178 | script += '--hbase-row-key \"'+self.getHbaseRowKey()+'\" ' 179 | script += '--split-by \"'+self.getHbaseRowKey()+'\" ' 180 | script += '--query \"'+ querySql +'\"' 181 | 182 | #print script 183 | result = Process.runScriptSync(script) 184 | return result 185 | 186 | -------------------------------------------------------------------------------- /core/shell/mysql_dump_file_bak.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | fn_gmr_regexp+="s/[\n|\r\n]//g;"; 4 | 5 | # 处理 NULL 字符串 6 | 7 | fn_gmr_regexp+="s/NULL/\\\N/g;"; 8 | #处理分隔符 9 | fn_gmr_regexp+="s/\t/$(echo -e ${fields_terminated_by})/g;"; 10 | 11 | #格式化 12 | mysql "${source_db_type}" "${fn_gmr_mysql_sql}" | sed -e "${fn_gmr_regexp}" > ${now_result_file};" -------------------------------------------------------------------------------- /core/shell/sqoop_import_mysql.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 使用案例 3 | :< val 60 | def format(self): 61 | #self.setshortOption(""); 62 | #self.setlongOption(["service=", "mo="]) 63 | try: 64 | opts, args = getopt.getopt(self.getArgs()[1:],self.getshortOption(),self.getlongOption()) 65 | except getopt.GetoptError, err: 66 | print str(err) 67 | self.usage() 68 | sys.exit(2) 69 | 70 | for op, value in opts: 71 | self.setResult({op:value}); 72 | 73 | 74 | def run(self): 75 | self.format(); 76 | return self.getResult(); 77 | -------------------------------------------------------------------------------- /core/util/base/camel.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | u''' 3 | Created on 2015年10月20日 4 | 5 | @author: Jason 6 | ''' 7 | class Camel: 8 | 9 | @staticmethod 10 | def camelToUnderline(camelFormat): 11 | ''' 12 | 驼峰命名格式转下划线命名格式 13 | ''' 14 | underlineFormat='' 15 | if isinstance(camelFormat, str): 16 | for _s_ in 
camelFormat: 17 | underlineFormat += _s_ if _s_.islower() else '_'+_s_.lower() 18 | 19 | return underlineFormat[1:] 20 | 21 | @staticmethod 22 | def underlineToCamel(underlineFormat): 23 | ''' 24 | 下划线命名格式驼峰命名格式 25 | ''' 26 | camelFormat = '' 27 | if isinstance(underlineFormat, str): 28 | for _s_ in underlineFormat.split('_'): 29 | camelFormat += _s_.capitalize() 30 | return camelFormat 31 | 32 | -------------------------------------------------------------------------------- /core/util/base/date.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import time 3 | import datetime 4 | import calendar 5 | 6 | u''' 7 | Created on 2015年10月20日 8 | @author: Jason 9 | ''' 10 | class Date: 11 | 12 | #获取当前时间戳 13 | @staticmethod 14 | def getTimestamp(): 15 | return int(time.time()) 16 | 17 | u''' 18 | 时间戳转换为日期指定格式日期 19 | ''' 20 | @staticmethod 21 | def timestampToFormatDate(timestamp,formatDate = "%Y-%m-%d %H:%M:%S"): 22 | timeArray = time.localtime(timestamp) 23 | return time.strftime(formatDate, timeArray) 24 | 25 | u''' 26 | 日期转换为时间戳 27 | @date String 日期 : 2015-10-20 28 | @dateFormat String: 日期格式,必须根据 date 的格式一致 29 | ''' 30 | @staticmethod 31 | def dateToTimestamp (date,dateFormat = "%Y-%m-%d %H:%M:%S"): 32 | date = str(date) 33 | timeArray = time.strptime(date, dateFormat) 34 | #转换为时间戳: 35 | timeStamp = int(time.mktime(timeArray)) 36 | return timeStamp 37 | 38 | u''' 39 | 日期直接转换为指定格式的日期 40 | @date 日期 : 41 | @dateFormat 日期格式 : 日期格式,必须根据 date 的格式一致 42 | @toDateFormat 需要转换成为的日期格式 43 | ''' 44 | @staticmethod 45 | def dateToFormatDate(date,dateFormat,toDateFormat): 46 | date = str(date) 47 | timeArray = time.strptime(date, dateFormat) 48 | otherStyleTime = time.strftime(toDateFormat, timeArray) 49 | return otherStyleTime 50 | 51 | u''' 52 | 获取指定偏移日期 53 | * @param offset_day 偏移天数,-1昨天 0今天 1明天 54 | * @return 时间戳 55 | ''' 56 | @staticmethod 57 | def getOffsetDate(offset_day = 0): 58 | daySeconds = 24 * 60 * 60 59 | 
curTimestamp = Date.getTimestamp() 60 | 61 | return curTimestamp + (daySeconds * offset_day) 62 | 63 | 64 | u''' 65 | 获取指定日期的偏移值 66 | @param date 指定日期,格式 %Y-%m-%d 2016-04-16 67 | @param offsetDay 偏移天数,-1昨天 0今天 1明天 68 | @return 2016-04-16 69 | ''' 70 | @staticmethod 71 | def offsetDateDay (date, offsetDay, dateFormat = "%Y-%m-%d"): 72 | # 当前时间戳 73 | timestamp = Date.dateToTimestamp(date, dateFormat) 74 | # 一天的秒数 75 | daySeconds = 24 * 60 * 60 76 | # 天数 x 一天的秒数 77 | seconds = daySeconds * abs(offsetDay) 78 | 79 | rsTimestamp = 0 80 | if (offsetDay > 0): 81 | rsTimestamp = timestamp + seconds 82 | else: 83 | rsTimestamp = timestamp - seconds 84 | 85 | # 时间戳转换为日期 86 | return Date.timestampToFormatDate(rsTimestamp, "%Y-%m-%d") -------------------------------------------------------------------------------- /core/util/base/file.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | import sys,os 4 | 5 | u''' 6 | Created on 2015年10月20日 7 | 8 | @author: Jason 9 | ''' 10 | class File: 11 | 12 | u' 验证文件是否存在' 13 | @staticmethod 14 | def isExists(file): 15 | return os.path.exists(file) 16 | 17 | 18 | u''' 19 | 读取文件所有内容 20 | ''' 21 | @staticmethod 22 | def redeAll(file): 23 | f = open(file) 24 | try: 25 | allTheText = f.read( ) 26 | finally: 27 | f.close( ) 28 | 29 | return allTheText 30 | 31 | 32 | u''' 33 | 写文件 34 | 35 | ‘r’模式: 以读方式打开,不能进行写操作,文件必须是已经存在的 36 | ‘r+’模式:以读写方式打开,文件必须是已经存在的 37 | ‘w’模式: 以写方式打开,不能进行读操作,若文件存在,则先清空,然后重新创建;若不存在,则创建文件 38 | ‘w+’模式:以读写方式打开,若文件存在,则先清空,然后重新创建;若不存在,则创建文件 39 | ‘a’模式: 以追加方式打开,不能进行读操作,把数据追加到文件的末尾;若不存在,则创建文件 40 | ‘a+’模式:以读写方式打开,把数据追加到文件的末尾;若不存在,则创建文件 41 | ‘b’模式: 以二进制模式打开,不能作为第一个字符出现,需跟以上模式组合使用,如’rb’,’rb+’等, 42 | ‘u’模式: 表示通用换行符支持,文件必须是已经存在的 43 | ''' 44 | @staticmethod 45 | def write(file, string, model = 'w+'): 46 | f = open(file, model) 47 | try: 48 | f.write(string) 49 | finally: 50 | f.close() 51 | 52 | 53 | u' 读行' 54 | @staticmethod 55 | def readLines(file): 56 | f = open(file) 57 | try: 
58 | lineList = f.readlines() 59 | finally: 60 | f.close() 61 | 62 | 63 | u' 写行' 64 | @staticmethod 65 | def writeLines(file, lineList): 66 | f = open(file,'w') 67 | try: 68 | f.writelines(lineList) 69 | finally: 70 | f.close() 71 | -------------------------------------------------------------------------------- /core/util/base/process.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | u''' 3 | 进程通讯管理 4 | ''' 5 | 6 | import os 7 | import sys 8 | import subprocess 9 | import threading 10 | import logging 11 | from time import ctime,sleep 12 | 13 | 14 | u''' 15 | Created on 2015年10月20日 16 | 17 | @author: Jason 18 | ''' 19 | class Process: 20 | 21 | 22 | u''' 23 | subprocess 模块 fork 后执行脚本,返回进程对象 24 | @par isWait 是否等待子进程结束 25 | True : 等待子进程结束后,再退出父进程 26 | False : 提交子进程结束后,直接退出父进程 27 | 28 | 实时获取 shell 结果 29 | while (True): 30 | line = p.stdout.readline().strip() 31 | if (p.poll() == 0): 32 | break 33 | if line: 34 | print line 35 | ''' 36 | @staticmethod 37 | def subprocessOpen(command,isWait = True): 38 | p = subprocess.Popen( 39 | command, 40 | shell=True, 41 | preexec_fn = os.setsid, 42 | stdin=sys.stdin, 43 | stdout=subprocess.PIPE, 44 | stderr=subprocess.STDOUT) 45 | 46 | # 等待子进程结束 47 | if (isWait == True) : 48 | p.wait() 49 | 50 | return p 51 | 52 | 53 | u''' 54 | 运行脚本 - 同步模式 55 | @par command 命令 : 比如 ls / 56 | @return dict 57 | code : 运行状态 0 正常 58 | stdoutPut : 标准输出 59 | erroutPut : 标准错误输出(如果错误的话) 60 | ''' 61 | @staticmethod 62 | def runScriptSync(command): 63 | currentP = Process.subprocessOpen(command) 64 | 65 | u'标准输出和标准错误输出' 66 | stdoutPut,erroutPut = currentP.communicate() 67 | 68 | u'进程退出状态' 69 | code = currentP.poll() 70 | 71 | result = { 72 | 'code' : code, 73 | 'stdoutPut' : stdoutPut, 74 | 'erroutPut' : erroutPut 75 | } 76 | 77 | return result 78 | 79 | 80 | u''' 81 | 运行脚本 - 异步模式 82 | return 进程对象: 83 | currentP.pid : 获取 Pid 84 | currentP.kill() : 删除进程 85 | currentP.poll() : 获取结束状态 : 0 表示成功 86 | 
currentP.stdout.readline().strip() : 可通过循环,动态获取输出 87 | ''' 88 | @staticmethod 89 | def runScriptAsync(command): 90 | currentP = Process.subprocessOpen(command,False) 91 | 92 | return currentP 93 | 94 | 95 | 96 | u''' 97 | 多线程执行 98 | @par commands list : list = ['ls /','ls ~/'] 99 | @return subprocessOpen 当前提交进程 100 | 后续处理 101 | status = True 102 | while (status): 103 | sleep(1) 104 | for item in result: 105 | u'当前进程对象' 106 | print item 107 | u'当前进程是否是活动的' 108 | print item.isAlive() 109 | ''' 110 | @staticmethod 111 | def runThreadingScripts(commands): 112 | u'提交多线程任务' 113 | for current_command in commands: 114 | current_thread = threading.Thread(target=Process.work,args=(current_command,)) 115 | current_thread.setDaemon(True) 116 | current_thread.start() 117 | 118 | u'当前运行中的Thread对象列表' 119 | return threading.enumerate() 120 | 121 | 122 | @staticmethod 123 | def work(args): 124 | p = Process.subprocessOpen(args,True) 125 | #p = Process.runScriptAsync(args) 126 | 127 | 128 | u' ssh 远程执行脚本' 129 | @staticmethod 130 | def sshCommand(server_user, server_host, command): 131 | script = 'ssh -q -t ' + server_user + '@'+ server_host + ' ' 132 | script += '"bash -i ' 133 | script += command 134 | script += '"' 135 | return Process.runScriptSync(script) 136 | -------------------------------------------------------------------------------- /core/util/base/read_conf.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | u''' 4 | ConfigParser 模块的 3 个类,其中任何一个都可处理 5 | 6 | RawConfigParser、ConfigParser、SafeConfigParser 7 | ''' 8 | 9 | class ReadConf: 10 | 11 | def init(self): 12 | print "ReadConf" -------------------------------------------------------------------------------- /core/util/hive/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonWiki/dw_etl/a679c0462006cd7c35e3b3f3d00e25c49a55a983/core/util/hive/__init__.py 
class HiveInterface:
    u'''Facade over the HiveServer2 client: table DDL, queries and
    command-line hive execution helpers.

    A new HiveServer2 connection is built for every call (see
    __getHiveConnection), so no connection state is shared between calls.
    '''

    # Connection settings (host/port/user/password, plus 'hiveHome' for
    # the command-line helpers).
    hiveConf = {}

    def __init__(self, data):
        self.hiveConf = data

    u'''
    Private HiveServer2 connection handling - do not use outside this class.
    '''
    __hiveConnection = None

    def __setHiveConnection(self, conf):
        self.__hiveConnection = HiveServer2(conf)

    def __getHiveConnection(self):
        # Deliberately rebuilds the connection from the stored
        # configuration on every access.
        self.__setHiveConnection(self.hiveConf)
        return self.__hiveConnection

    def createTable(self, createTableSql):
        u'Create a table. Returns bool (True on success).'
        return self.__getHiveConnection().execute([createTableSql])

    def dropTable(self, dbTbName):
        u'Drop the table "db.table". Returns bool (True on success).'
        # execute() already returns a bool; the old if/else wrapper was
        # redundant.
        return self.__getHiveConnection().execute(["DROP TABLE " + dbTbName])

    def dropDb(self, dbName):
        u'Drop a whole database - use with extreme care! Returns bool.'
        return self.__getHiveConnection().execute(
            ["DROP DATABASE IF EXISTS " + dbName])

    def queryCount(self, sql):
        u'Run a COUNT-style query and return the single scalar result.'
        return self.__getHiveConnection().query([sql])[0][0]

    def queryMax(self, sql):
        u'Run a MAX-style query and return the single scalar result.'
        return self.__getHiveConnection().query([sql])[0][0]

    def query(self, sql, isShowField=True):
        u'''Run a query.

        isShowField=True  -> rows as dicts keyed by column name
        isShowField=False -> raw row tuples
        '''
        if isShowField == True:
            return self.__getHiveConnection().queryReturnField([sql])
        return self.__getHiveConnection().query([sql])

    def getDbTables(self, dbName):
        u'Return the list of table names in the given database.'
        queryData = self.__getHiveConnection().query(
            ["USE " + dbName, "SHOW TABLES"])
        return [curTable[0] for curTable in queryData]

    def isExistsTable(self, dbName, tbName):
        u'Return True if db.table exists (probed with DESC).'
        try:
            self.query('DESC ' + dbName + '.' + tbName, False)
            return True
        except Exception:
            # DESC raises when the table is missing.
            return False

    def getFileds(self, dbName, tbName):
        u'''Return the table column names.

        hive.display.partition.cols.separately=false keeps partition
        columns from being listed in a separate section of DESC output.
        '''
        sqlList = [
            'SET hive.display.partition.cols.separately=false',
            "DESC " + dbName + "." + tbName
        ]
        data = self.__getHiveConnection().queryReturnField(sqlList)
        return [i.get('col_name') for i in data]

    def batchExecuteSql(self, sqlContent):
        u'''Split a ";"-separated script and execute each statement.

        The final split() element (text after the last ";", usually empty)
        is dropped, matching the original behavior.
        '''
        return self.__getHiveConnection().execute(sqlContent.split(';')[:-1])

    def runHiveScript(self, sql):
        u'Run sql through the local hive CLI ($hiveHome/bin/hive -e "...").'
        runCommand = self.hiveConf['hiveHome'] + "/bin/hive -e " + '\"' + sql + '\"'
        return Process.runScriptSync(runCommand)

    u'''
    test: hive CLI execution interface
    '''
    def hiveBinInterface(self):
        list = ['hive -e "select count(*) from dw_db.dw_broker_summary_basis_info_daily;"',
                'hive -e "select count(*) from dw_db.dw_cal;"']
        result = Process.runThreadingScripts(list)

        status = True
        while (status):
            sleep(1)
            for item in result:
                print(item)
                # number of currently alive threads
                print(item.isAlive())
class HiveServer2:
    u'''HiveServer2 client built on pyhs2; see
    https://cwiki.apache.org/confluence/display/Hive/Setting+Up+HiveServer2#SettingUpHiveServer2-PythonClientDriver

    conf dict, e.g.:
        {
            'host': 'uhadoop-ociicy-master2',
            'port': 10000,
            'user': 'dwadmin',
            'password': 'dwadmin',
        }
    'authMechanism' (default "PLAIN") and 'database' (default 'default')
    are optional and filled in by setConnectionBase.
    '''

    def __init__(self, dict=None):
        # Fixed mutable-default-argument bug: the shared {} default was
        # mutated by setConnectionBase's setdefault() calls.
        if dict is None:
            dict = {}
        self.setConnectionBase(dict)
        self.setConnection()

    u'''
    Connection properties.
    '''
    connectionBase = {}

    def setConnectionBase(self, dict):
        # NOTE(review): mutates the caller's dict via setdefault - kept
        # as-is for compatibility.
        dict.setdefault('authMechanism', "PLAIN")
        dict.setdefault('database', 'default')
        self.connectionBase = dict

    def getConnectionBase(self, key=None):
        u'Return the whole settings dict, or one value when key is given.'
        if key == None:
            return self.connectionBase
        else:
            return self.connectionBase[key]

    u'''
    Open the connection.
    '''
    connection = None

    def setConnection(self):
        try:
            self.connection = pyhs2.connect(
                host=self.getConnectionBase('host'),
                port=self.getConnectionBase('port'),
                authMechanism=self.getConnectionBase('authMechanism'),
                user=self.getConnectionBase('user'),
                password=self.getConnectionBase('password'),
                database=self.getConnectionBase('database')
            )
        except Exception as ex:
            # Fixed: the original "print '...' + Exception,ex" raised a
            # TypeError (str + type) at the error site; report the real
            # error instead.
            print('hiveServer2 创建连接失败: %s' % ex)

    u'''
    Return the live pyhs2 connection object.
    '''
    def getConnection(self):
        return self.connection

    def execute(self, sqls=None):
        u'''Execute each statement in sqls.
        @par sqls list of SQL strings
        @result bool (True when all statements succeeded)
        '''
        if sqls is None:
            sqls = []
        cursor = self.getConnection().cursor()
        try:
            for cur_sql in sqls:
                cursor.execute(cur_sql)
            status = True
        except Exception as ex:
            print('%s : %s' % (Exception, ex))
            status = False

        return status

    def queryReturnField(self, sqls=None):
        u'''Run the statements and return rows as dicts keyed by column name.'''
        if sqls is None:
            sqls = []
        cursor = self.getConnection().cursor()

        for cur_sql in sqls:
            cursor.execute(cur_sql)

        # Column names from the result schema.
        fields = []
        for cur_field_info in cursor.getSchema():
            fields.append(cur_field_info['columnName'])

        # Zip each row with the field names.
        result = []
        for cur_row in cursor.fetch():
            result.append(dict(zip(fields, cur_row)))

        return result

    def query(self, sqls=None):
        u'''Run the statements and return raw row tuples (no field names).'''
        if sqls is None:
            sqls = []
        cursor = self.getConnection().cursor()

        for cur_sql in sqls:
            cursor.execute(cur_sql)

        return cursor.fetch()

    def getDatabases(self):
        u'SHOW DATABASES.'
        return self.getConnection().cursor().getDatabases()
class Logger:
    u'''Static convenience facade over the stdlib logging module.

    Level order: CRITICAL > ERROR > WARNING > INFO > DEBUG > NOTSET.
    '''

    @staticmethod
    def init(level=logging.DEBUG):
        u'Configure the root logger: threshold, line format and date format.'
        logging.basicConfig(
            level=level,
            format='%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S')

    @staticmethod
    def debug(message):
        u'Debug-level message.'
        logging.debug(message)

    @staticmethod
    def info(message):
        u'Info-level message.'
        logging.info(message)

    @staticmethod
    def warning(message):
        u'Warning-level message.'
        logging.warning(message)

    @staticmethod
    def error(message):
        u'Error-level message.'
        logging.error(message)

    @staticmethod
    def critical(message):
        u'Critical-level message.'
        logging.critical(message)
class Mail:
    u'''SMTP mail sending helper.

    Created on 2015-10-20
    @author: Jason
    '''

    @staticmethod
    def SendMail(mailInfo, mailType='html'):
        u'''Build and send one mail.

        mailInfo keys: smtpServer, username, password, sender, receiver,
        subject, content.

        Fixed: any mailType other than 'html' used to fall through with
        msg unbound and crash with UnboundLocalError; it now raises
        ValueError explicitly.
        '''
        if mailType == 'html':
            msg = Mail.Html(mailInfo['subject'], mailInfo['content'])
        else:
            raise ValueError('unsupported mail type: %s' % mailType)

        Mail.BaseMail(
            mailInfo['smtpServer'],
            mailInfo['username'],
            mailInfo['password'],
            mailInfo['sender'],
            mailInfo['receiver'],
            msg.as_string()
        )

    u'''
    Send a mail.
    @par smtpServer SMTP server address
    @par username   login user
    @par password   login password
    @par sender     from address
    @par receiver   to address, or a list ['a@x', 'b@x', ...] for bulk send
    @par msg        payload, already rendered via msg.as_string()
    '''
    @staticmethod
    def BaseMail(smtpServer, username, password, sender, receiver, msg):
        smtp = smtplib.SMTP()
        smtp.connect(smtpServer)
        smtp.login(username, password)
        smtp.sendmail(sender, receiver, msg)
        smtp.quit()

    @staticmethod
    def Html(subject, content):
        u'Wrap content as a UTF-8 HTML MIME message with the given subject.'
        msg = MIMEText(content, 'html', 'utf-8')
        msg['Subject'] = subject
        return msg
class MySql:
    u'''Thin wrapper around the common MySQLdb operations.'''

    error_code = ''      # last MySQL error number

    _instance = None     # MySQLdb module handle
    _conn = None         # connection
    _cur = None          # cursor

    _TIMEOUT = 30        # total connect-retry budget, seconds
    _timecount = 0       # time already spent retrying

    def __init__(self, dbconfig):
        u'''Open a connection from dbconfig (host/port/user/passwd/db/charset).

        On connect failure, retries every 5 seconds until _TIMEOUT is
        exhausted, then raises.
        '''
        try:
            self._conn = MySQLdb.connect(host=dbconfig['host'],
                                         # port must be passed as an int
                                         port=dbconfig['port'],
                                         user=dbconfig['user'],
                                         passwd=dbconfig['passwd'],
                                         db=dbconfig['db'],
                                         charset=dbconfig['charset'],
                                         cursorclass=MySQLdb.cursors.DictCursor)
        except MySQLdb.Error as e:
            self.error_code = e.args[0]
            # Fixed: error_msg was a tuple ('MySQL error! ', code, text);
            # build a real message string instead.
            error_msg = 'MySQL error! %s %s' % (e.args[0], e.args[1])
            print(error_msg)

            # Retry while the time budget is not exhausted.
            if self._timecount < self._TIMEOUT:
                interval = 5
                self._timecount += interval
                time.sleep(interval)
                return self.__init__(dbconfig)
            else:
                raise Exception(error_msg)

        self._cur = self._conn.cursor()
        self._instance = MySQLdb

    def query(self, sql):
        u'Run a SELECT; returns the affected row count, or False on error.'
        try:
            self._cur.execute("SET NAMES utf8")
            result = self._cur.execute(sql)
        except MySQLdb.Error as e:
            self.error_code = e.args[0]
            print('数据库错误代码: %s %s' % (e.args[0], e.args[1]))
            result = False
        return result

    def update(self, sql):
        u'Run an UPDATE or DELETE; returns bool.'
        try:
            self._cur.execute("SET NAMES utf8")
            self._cur.execute(sql)
            self._conn.commit()
            result = True
        except MySQLdb.Error as e:
            self.error_code = e.args[0]
            print('数据库错误代码: %s %s' % (e.args[0], e.args[1]))
            result = False

        return result

    def insert(self, sql):
        u'''Run an INSERT; returns the new auto-increment id (when the
        primary key is auto-increment), or False on error.'''
        try:
            self._cur.execute("SET NAMES utf8")
            self._cur.execute(sql)
            self._conn.commit()
            return self._conn.insert_id()
        except MySQLdb.Error as e:
            self.error_code = e.args[0]
            return False

    def executeSqls(self, sqls=None):
        u'''Execute a batch of statements in one transaction.
        @par sqls list of SQL strings
        @result bool
        '''
        # Fixed mutable-default-argument ([]) to a None sentinel.
        if sqls is None:
            sqls = []
        try:
            self._cur.execute("SET NAMES utf8")
            for cur_sql in sqls:
                self._cur.execute(cur_sql)
            self._conn.commit()
            status = True
        except MySQLdb.Error as e:
            self.error_code = e.args[0]
            print('数据库错误代码: %s %s' % (e.args[0], e.args[1]))
            status = False

        return status

    def fetchAllRows(self):
        u'Return all result rows.'
        return self._cur.fetchall()

    def fetchOneRow(self):
        u'Return one row and advance the cursor; None after the last row.'
        return self._cur.fetchone()

    def getRowCount(self):
        u'Number of rows in the current result.'
        return self._cur.rowcount

    def commit(self):
        u'Commit the current transaction.'
        self._conn.commit()

    def rollback(self):
        u'Roll back the current transaction.'
        self._conn.rollback()

    def __del__(self):
        u'Release cursor and connection (also invoked by GC).'
        try:
            self._cur.close()
            self._conn.close()
        except Exception:
            # Connection may never have been established; nothing to release.
            pass

    def close(self):
        u'Close the database connection.'
        self.__del__()
dropTable(self,dbTbName): 33 | mysqlConnection = self.__getMysqlConnection() 34 | 35 | dropTableSql = "DROP TABLE IF EXISTS " + dbTbName; 36 | 37 | if (mysqlConnection.update(dropTableSql) == True): 38 | return True 39 | else: 40 | return False 41 | 42 | u'获取数据表信息' 43 | def tableInfo(self,dbName,tbName): 44 | sql = "SELECT `table_schema`,`table_name`,`table_rows`, `data_length`,`row_format` FROM information_schema.tables WHERE `table_schema`='"+ dbName +"' AND `table_name`='"+ tbName +"' LIMIT 1;" 45 | 46 | mysqlConnection = self.__getMysqlConnection() 47 | 48 | mysqlConnection.query(sql) 49 | data = mysqlConnection.fetchOneRow() 50 | result = { 51 | 'dbName' : data['table_schema'], 52 | 'tbName' : data['table_name'], 53 | 'tbRows' : data['table_rows'], 54 | 'tbSize' : data['data_length'], 55 | } 56 | return result 57 | 58 | 59 | u'获取表字段' 60 | def getFileds(self,dbName,tbName): 61 | data = self.queryAll("DESC " + dbName + "." + tbName) 62 | fields = [] 63 | for i in data: 64 | fields.append(i.get('Field')) 65 | return fields 66 | 67 | 68 | u'插入数据' 69 | def insertData(self,sql): 70 | mysqlConnection = self.__getMysqlConnection() 71 | 72 | id = mysqlConnection.insert(sql) 73 | if (id == True): 74 | return id 75 | else: 76 | return False 77 | 78 | 79 | u'修改数据,删除数据' 80 | def updataData(self,upSql): 81 | mysqlConnection = self.__getMysqlConnection() 82 | 83 | if (mysqlConnection.update(upSql) == True): 84 | return True 85 | else: 86 | return False 87 | 88 | 89 | u'批量执行 sql' 90 | def batchExecuteSql(self,sqlContent): 91 | sqlContentList = sqlContent.split(';'); 92 | formatSqlContentList = sqlContentList[:-1]; 93 | 94 | mysqlConnection = self.__getMysqlConnection() 95 | 96 | status = mysqlConnection.executeSqls(formatSqlContentList) 97 | 98 | return status 99 | 100 | u''' 101 | 获取表行数 102 | 使用方法 103 | SELECT COUNT(*) AS c FROM xxx 104 | ''' 105 | def count(self,sql): 106 | mysqlConnection = self.__getMysqlConnection() 107 | 108 | mysqlConnection.query(sql) 109 | data = 
mysqlConnection.fetchOneRow() 110 | return data['c'] 111 | 112 | 113 | u'获取命令行 Mysql 脚本执行参数' 114 | def getMysqlCommand(self): 115 | return "mysql -h" + str(self.mysqlConf['host']) + " -u" + str(self.mysqlConf['user']) +" -p" + str(self.mysqlConf['passwd']) + " -P" + str(self.mysqlConf['port']) 116 | 117 | 118 | u'命令行 执行 mysql SQL' 119 | def runMysqlCommand(self,sql = ''): 120 | scriptBase = self.getMysqlCommand() + " -N -s -e " 121 | runScript = scriptBase + "\"" + sql + "\"" 122 | return Process.runScriptSync(runScript) 123 | 124 | # Mysql 查询语句 Dump 到本地文件中 125 | def mysqlDumpFile(self,sql,file): 126 | 127 | # sed 格式化规则 128 | # 处理行中的换行符号 129 | rowRegexp = 's/[\\n|\\r\\n]//g;' 130 | # 处理行中 NULL 字符串 131 | rowRegexp += 's/NULL/\\\\\N/g;' 132 | # 列分隔符 133 | rowRegexp += 's/\t/\001/g;' 134 | 135 | # 组合命令 136 | script = self.getMysqlCommand() + " -N -s -e ""\"" + sql + "\" | sed -e \"" + rowRegexp + "\" > " + file 137 | 138 | result = Process.runScriptSync(script) 139 | 140 | return result 141 | 142 | -------------------------------------------------------------------------------- /core/util/mysql/mysql_interface_20160907.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | from core.util.mysql.mysql import MySql 3 | from core.util.base.process import Process 4 | from time import ctime,sleep 5 | 6 | 7 | class MysqlInterface: 8 | 9 | mysqlConf = {} 10 | 11 | def __init__(self,data): 12 | self.mysqlConf = data 13 | self.__mysqlConnection = MySql(self.mysqlConf) 14 | 15 | def queryOne(self,sql): 16 | self.__mysqlConnection.query(sql) 17 | return self.__mysqlConnection.fetchOneRow(); 18 | 19 | u'查询返回数据' 20 | def queryAll(self,sql): 21 | self.__mysqlConnection.query(sql) 22 | return self.__mysqlConnection.fetchAllRows(); 23 | 24 | 25 | u'删除数据表' 26 | def dropTable(self,dbTbName): 27 | dropTableSql = "DROP TABLE IF EXISTS " + dbTbName; 28 | if (self.__mysqlConnection.update(dropTableSql) == True): 29 | return True 30 | else: 
31 | return False 32 | 33 | u'获取数据表信息' 34 | def tableInfo(self,dbName,tbName): 35 | sql = "SELECT `table_schema`,`table_name`,`table_rows`, `data_length`,`row_format` FROM information_schema.tables WHERE `table_schema`='"+ dbName +"' AND `table_name`='"+ tbName +"' LIMIT 1;" 36 | self.__mysqlConnection.query(sql) 37 | data = self.__mysqlConnection.fetchOneRow() 38 | result = { 39 | 'dbName' : data['table_schema'], 40 | 'tbName' : data['table_name'], 41 | 'tbRows' : data['table_rows'], 42 | 'tbSize' : data['data_length'], 43 | } 44 | return result 45 | 46 | 47 | u'获取表字段' 48 | def getFileds(self,dbName,tbName): 49 | data = self.queryAll("DESC " + dbName + "." + tbName) 50 | fields = [] 51 | for i in data: 52 | fields.append(i.get('Field')) 53 | return fields 54 | 55 | 56 | u'插入数据' 57 | def insertData(self,sql): 58 | id = self.__mysqlConnection.insert(sql) 59 | if (id == True): 60 | return id 61 | else: 62 | return False 63 | 64 | 65 | u'修改数据,删除数据' 66 | def updataData(self,upSql): 67 | if (self.__mysqlConnection.update(upSql) == True): 68 | return True 69 | else: 70 | return False 71 | 72 | 73 | u'批量执行 sql' 74 | def batchExecuteSql(self,sqlContent): 75 | sqlContentList = sqlContent.split(';'); 76 | formatSqlContentList = sqlContentList[:-1]; 77 | 78 | status = self.__mysqlConnection.executeSqls(formatSqlContentList) 79 | 80 | return status 81 | 82 | u''' 83 | 获取表行数 84 | 使用方法 85 | SELECT COUNT(*) AS c FROM xxx 86 | ''' 87 | def count(self,sql): 88 | self.__mysqlConnection.query(sql) 89 | data = self.__mysqlConnection.fetchOneRow() 90 | return data['c'] 91 | 92 | 93 | u'获取命令行 Mysql 脚本执行参数' 94 | def getMysqlCommand(self): 95 | return "mysql -h" + str(self.mysqlConf['host']) + " -u" + str(self.mysqlConf['user']) +" -p" + str(self.mysqlConf['passwd']) + " -P" + str(self.mysqlConf['port']) 96 | 97 | 98 | u'命令行 执行 mysql SQL' 99 | def runMysqlCommand(self,sql = ''): 100 | scriptBase = self.getMysqlCommand() + " -N -s -e " 101 | runScript = scriptBase + "\"" + sql + "\"" 102 | 
return Process.runScriptSync(runScript) 103 | 104 | # Mysql 查询语句 Dump 到本地文件中 105 | def mysqlDumpFile(self,sql,file): 106 | 107 | # sed 格式化规则 108 | # 处理行中的换行符号 109 | rowRegexp = 's/[\\n|\\r\\n]//g;' 110 | # 处理行中 NULL 字符串 111 | rowRegexp += 's/NULL/\\\\\N/g;' 112 | # 列分隔符 113 | rowRegexp += 's/\t/\001/g;' 114 | 115 | # 组合命令 116 | script = self.getMysqlCommand() + " -N -s -e ""\"" + sql + "\" | sed -e \"" + rowRegexp + "\" > " + file 117 | 118 | result = Process.runScriptSync(script) 119 | 120 | return result 121 | 122 | -------------------------------------------------------------------------------- /dw_service/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | dw_service 4 | 5 | 6 | 7 | 8 | 9 | org.python.pydev.PyDevBuilder 10 | 11 | 12 | 13 | 14 | 15 | org.python.pydev.pythonNature 16 | 17 | 18 | -------------------------------------------------------------------------------- /dw_service/.pydevproject: -------------------------------------------------------------------------------- 1 | 2 | 3 | Default 4 | python 2.7 5 | 6 | -------------------------------------------------------------------------------- /dw_service/core/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonWiki/dw_etl/a679c0462006cd7c35e3b3f3d00e25c49a55a983/dw_service/core/__init__.pyc -------------------------------------------------------------------------------- /dw_service/core/conf/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonWiki/dw_etl/a679c0462006cd7c35e3b3f3d00e25c49a55a983/dw_service/core/conf/__init__.pyc -------------------------------------------------------------------------------- /dw_service/core/conf/conf.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/JasonWiki/dw_etl/a679c0462006cd7c35e3b3f3d00e25c49a55a983/dw_service/core/conf/conf.pyc -------------------------------------------------------------------------------- /dw_service/core/core.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonWiki/dw_etl/a679c0462006cd7c35e3b3f3d00e25c49a55a983/dw_service/core/core.pyc -------------------------------------------------------------------------------- /dw_service/core/model/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonWiki/dw_etl/a679c0462006cd7c35e3b3f3d00e25c49a55a983/dw_service/core/model/__init__.pyc -------------------------------------------------------------------------------- /dw_service/core/model/date_model.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonWiki/dw_etl/a679c0462006cd7c35e3b3f3d00e25c49a55a983/dw_service/core/model/date_model.pyc -------------------------------------------------------------------------------- /dw_service/core/model/hive_model.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonWiki/dw_etl/a679c0462006cd7c35e3b3f3d00e25c49a55a983/dw_service/core/model/hive_model.pyc -------------------------------------------------------------------------------- /dw_service/core/util/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonWiki/dw_etl/a679c0462006cd7c35e3b3f3d00e25c49a55a983/dw_service/core/util/__init__.pyc -------------------------------------------------------------------------------- /dw_service/core/util/base/__init__.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/JasonWiki/dw_etl/a679c0462006cd7c35e3b3f3d00e25c49a55a983/dw_service/core/util/base/__init__.pyc -------------------------------------------------------------------------------- /dw_service/core/util/base/args_format.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonWiki/dw_etl/a679c0462006cd7c35e3b3f3d00e25c49a55a983/dw_service/core/util/base/args_format.pyc -------------------------------------------------------------------------------- /dw_service/core/util/base/camel.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonWiki/dw_etl/a679c0462006cd7c35e3b3f3d00e25c49a55a983/dw_service/core/util/base/camel.pyc -------------------------------------------------------------------------------- /dw_service/core/util/base/date.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonWiki/dw_etl/a679c0462006cd7c35e3b3f3d00e25c49a55a983/dw_service/core/util/base/date.pyc -------------------------------------------------------------------------------- /dw_service/core/util/hive/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonWiki/dw_etl/a679c0462006cd7c35e3b3f3d00e25c49a55a983/dw_service/core/util/hive/__init__.pyc -------------------------------------------------------------------------------- /dw_service/core/util/hive/hive_server2.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonWiki/dw_etl/a679c0462006cd7c35e3b3f3d00e25c49a55a983/dw_service/core/util/hive/hive_server2.pyc -------------------------------------------------------------------------------- /dw_service/dw_service/__init__.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/JasonWiki/dw_etl/a679c0462006cd7c35e3b3f3d00e25c49a55a983/dw_service/dw_service/__init__.pyc -------------------------------------------------------------------------------- /dw_service/dw_service/base_service.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonWiki/dw_etl/a679c0462006cd7c35e3b3f3d00e25c49a55a983/dw_service/dw_service/base_service.pyc -------------------------------------------------------------------------------- /dw_service/dw_service/load_service.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonWiki/dw_etl/a679c0462006cd7c35e3b3f3d00e25c49a55a983/dw_service/dw_service/load_service.pyc -------------------------------------------------------------------------------- /dw_service/dw_service/uba_log/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonWiki/dw_etl/a679c0462006cd7c35e3b3f3d00e25c49a55a983/dw_service/dw_service/uba_log/__init__.pyc -------------------------------------------------------------------------------- /dw_service/dw_service/uba_log/run.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonWiki/dw_etl/a679c0462006cd7c35e3b3f3d00e25c49a55a983/dw_service/dw_service/uba_log/run.pyc -------------------------------------------------------------------------------- /dw_service/dw_service/uba_log/uba_base.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonWiki/dw_etl/a679c0462006cd7c35e3b3f3d00e25c49a55a983/dw_service/dw_service/uba_log/uba_base.pyc -------------------------------------------------------------------------------- /dw_service/dw_service/uba_log/uba_web_visit_log.pyc: 
class DwServiceCore:
    u'''Base class for all ETL services: carries service/module/args/parameter
    state, a model-instance registry, and the dynamic dispatch in run().
    '''

    def getDwCoreInstance(self):
        u'Return the Core class (the class itself, not an instance).'
        return Core

    # Service name: the package the target module lives in.
    service = ""

    def setService(self, data):
        self.service = data

    def getService(self):
        return self.service

    # Module name: the snake_case file name inside the service package.
    module = ""

    def setModule(self, data):
        self.module = data

    def getModule(self):
        return self.module

    # Command-line args (populated from sys.argv by the caller).
    args = []

    def setArgs(self, args):
        self.args = args

    def getArgs(self):
        return self.args

    # Raw --parameter string from the command line.
    parameter = None

    def setParameter(self, data=None):
        self.parameter = data

    def getParameter(self):
        return self.parameter

    def getFormatParameter(self):
        u'''Evaluate the raw parameter string into a Python object.

        WARNING(review): eval() on a command-line string executes
        arbitrary code; acceptable only because callers are trusted
        operators - do not expose to untrusted input.
        '''
        return eval(self.getParameter())

    # Registry of shared model instances.
    # NOTE(review): class-level dict, so it is shared by ALL instances and
    # subclasses; init() re-registers on every run, which appears to rely
    # on this - confirm before changing.
    registerInstance = {}

    def setRegisterInstance(self, instanceKey, object):
        self.registerInstance.update({instanceKey: object})

    def getRegisterInstance(self, instanceKey):
        return self.registerInstance.get(instanceKey)

    def init(self):
        u'Register the standard model instances used by services.'
        self.setRegisterInstance('hadoopModel', self.getDwCoreInstance().getModelInterface('Hadoop'))
        self.setRegisterInstance('hiveModel', self.getDwCoreInstance().getModelInterface('Hive'))
        self.setRegisterInstance('sparkModel', self.getDwCoreInstance().getModelInterface('Spark'))

        self.setRegisterInstance('sqoopModel', self.getDwCoreInstance().getModelInterface('Sqoop'))
        self.setRegisterInstance('biDbModel', self.getDwCoreInstance().getModelInterface('BiDb'))
        self.setRegisterInstance('produceDbModel', self.getDwCoreInstance().getModelInterface('ProduceDb'))

        self.setRegisterInstance('mailModel', self.getDwCoreInstance().getModelInterface('Mail'))
        self.setRegisterInstance('dateModel', self.getDwCoreInstance().getModelInterface('Date'))
        self.setRegisterInstance('confModel', self.getDwCoreInstance().getModelInterface('Conf'))

    def process(self):
        u'Service entry point - overridden by subclasses.'
        pass

    def shutdown(self):
        u'Cleanup hook - overridden by subclasses.'
        pass

    def run(self):
        u'''Dynamically import <service>.<module>, instantiate the CamelCase
        class of the same name, copy this runner's state onto it, and drive
        its init/process/shutdown lifecycle.

        e.g. package 'test.test_run' -> class 'TestRun'.
        '''
        package = self.getService() + "." + self.getModule()
        module = Camel.underlineToCamel(self.getModule())

        # Import the package and resolve the class.
        importClass = __import__(package, fromlist=[module])
        className = getattr(importClass, module)

        # Instantiate and run the service lifecycle.
        serviceObject = className()
        serviceObject.setService(self.getService())
        serviceObject.setModule(self.getModule())
        serviceObject.setParameter(self.getParameter())
        serviceObject.setArgs(self.getArgs())
        serviceObject.init()
        serviceObject.process()
        serviceObject.shutdown()

    def runOther(self):
        u'''Alternative launcher that builds and exec()s a source snippet.

        WARNING(review): exec of an interpolated string - prefer run();
        kept for backwards compatibility.
        '''
        run_str = u'''
from %s.%s import %s
serviceObj = %s()
serviceObj.setService('%s')
serviceObj.setModule('%s')
serviceObj.setParameter('%s')
serviceObj.setArgs(%s)
serviceObj.init()
serviceObj.process()
serviceObj.shutdown()
''' % (self.getService(),
       self.getModule(),
       Camel.underlineToCamel(self.getModule()),
       Camel.underlineToCamel(self.getModule()),
       self.getService(),
       self.getModule(),
       self.getParameter(),
       self.getArgs()
       )

        exec(run_str)
#coding=utf-8
from dw_service_core import DwServiceCore
from core.util.log.logger import Logger


u'''
Mysql 数据抽取

'''

class ExtractMysql(DwServiceCore):
    """Extract a MySQL table into Hive, either fully (dump/Sqoop) or incrementally.

    State is configured through the set*() accessors (source/target db+table,
    extract type, extract tool, ...) and then driven by run().
    All external systems are reached through registered models:
    'produceDbModel'/'biDbModel' (MySQL), 'hiveModel', 'sparkModel',
    'sqoopModel', 'dateModel'.
    """

    # Max MYSQL DUMP file size boundary; above this value Sqoop import is used.
    # 500 MB
    #DUMP_SIZE_RANGE = 524288000

    # 300 MB
    #DUMP_SIZE_RANGE = 314572800

    # 200 MB
    DUMP_SIZE_RANGE = 209715200

    # 100MB
    #DUMP_SIZE_RANGE = 104857600

    # Max file size for the SQOOP path.
    SQOOP_SIZE_RANGE = 314572800
    #SQOOP_SIZE_RANGE = 1048576000


    # Database servers.
    # Production/business database.
    PRODUCE_DB = 'product'
    # dw data-warehouse database.
    DW_DB = 'dw'
    # Identifier of the database currently being extracted from.
    extractDb = None
    # Model object used for the current extraction source.
    extractDbServerModel = None

    # Select the source database server; unknown values leave
    # extractDbServerModel untouched but still record extractDb.
    def setExtractDb(self,dbServer):
        if (dbServer == ExtractMysql.PRODUCE_DB):
            self.extractDbServerModel = self.getRegisterInstance('produceDbModel')
        elif (dbServer == ExtractMysql.DW_DB):
            self.extractDbServerModel = self.getRegisterInstance('biDbModel')

        self.extractDb = dbServer
    def getExtractDb(self):
        return self.extractDb


    # Extraction type.
    # Full extraction.
    COMPLETE = 1
    # Incremental extraction.
    INCREMENTAL = 2
    # Currently selected type.
    extractType = None
    def setExtractType(self,data):
        self.extractType = data
    def getExtractType(self):
        return self.extractType


    # Extraction tool.
    # mysqldump-to-file approach.
    MYSQL_DUMP = 1
    # Sqoop import approach.
    SQOOP = 2
    # Currently selected tool (0 = let the system decide).
    extractTool = 0
    def setExtractTool(self,data):
        self.extractTool = data
    def getExtractTool(self):
        return self.extractTool


    # Table id (row id in the dw_service config tables).
    tbId = 0
    def setTbId(self, tbId):
        self.tbId = tbId
    def getTbId(self):
        return self.tbId


    # Source database name.
    sourceDb = None
    def setSourceDb(self,data):
        self.sourceDb = data
    def getSourceDb(self):
        return self.sourceDb

    # Source table name.
    sourceTable = None
    def setSourceTable(self,data):
        self.sourceTable = data
    def getSourceTable(self):
        return self.sourceTable


    # Target (Hive) database name.
    targetDb = None
    def setTargetDb(self,data):
        self.targetDb = data
    def getTargetDb(self):
        return self.targetDb

    # Target (Hive) table name.
    targetTable = None
    def setTargetTable(self,data):
        self.targetTable = data
    def getTargetTable(self):
        return self.targetTable


    # Local directory where dump files are written.
    dumpFileDir = '/data/log/mysql'
    def setDumpFileDir(self,data):
        self.dumpFileDir = data
    def getDumpFileDir(self):
        return self.dumpFileDir


    # Dump file name; defaults to "<sourceDb>.<sourceTable>" on first read.
    dumpFileName = None
    def setDumpFileName(self,data):
        self.dumpFileName = data
    def getDumpFileName(self):
        if (self.dumpFileName == None):
            self.dumpFileName = self.getSourceDb() + "." + self.getSourceTable()
        return self.dumpFileName


    # Number of MapReduce tasks for the Sqoop path (defaults to 1).
    mapReduceNum = None
    def setMapReduceNum(self,data):
        self.mapReduceNum = data
    def getMapReduceNum(self):
        if (self.mapReduceNum == None):
            self.mapReduceNum = 1
        return self.mapReduceNum


    # Source table size in bytes (defaults to 0).
    tbSize = None
    def setTbSize(self,data):
        self.tbSize = data
    def getTbSize(self):
        if (self.tbSize == None):
            self.tbSize = 0
        return self.tbSize


    # Source table row count (defaults to '').
    tbRows = None
    def setTbRows(self,data):
        self.tbRows = data
    def getTbRows(self):
        if (self.tbRows == None):
            self.tbRows = ''
        return self.tbRows



    u'''---------- 全量抽取处理 START ---------- '''

    # Fetch size and row count of the source table into this object's state.
    def getSourceTableInfo(self):
        # Table size.
        tbInfo = self.extractDbServerModel.tableInfo(self.getSourceDb(),self.getSourceTable())
        self.setTbSize(tbInfo['tbSize'])

        # Table row count.
        tbRows = self.extractDbServerModel.count("SELECT COUNT(*) AS c FROM " + self.getSourceDb() + '.' + self.getSourceTable())
        self.setTbRows(tbRows)


    # Full-extraction controller: pick a tool (if none was forced) and run.
    def extractCompleteAction(self):

        # When no extraction tool was explicitly chosen, fall back to the
        # system default (mysqldump). The size-based selection below is
        # intentionally disabled (kept as a string literal).
        if (self.getExtractTool() == 0):
            self.setExtractTool(ExtractMysql.MYSQL_DUMP)
        u'''
        self.getSourceTableInfo()

        # 如果小于边界值
        if (self.getTbSize() <= ExtractMysql.DUMP_SIZE_RANGE):
            # 设置抽取方式 : mysql dump 工具抽取
            self.setExtractTool(ExtractMysql.MYSQL_DUMP)

        # 大于边界值时 Sqoop 抽取
        elif (self.getTbSize() >= ExtractMysql.DUMP_SIZE_RANGE and self.getTbSize() <= ExtractMysql.SQOOP_SIZE_RANGE) :
            # 设置抽取方式 : sqoop 方式抽取
            self.setExtractTool(ExtractMysql.SQOOP)
            self.setMapReduceNum(5)

        # 超过 边界值 则加大 mpreduce 数量用 Sqoop 抽取
        elif (self.getTbSize() > ExtractMysql.SQOOP_SIZE_RANGE) :
            # 设置抽取方式 : sqoop 方式抽取
            self.setExtractTool(ExtractMysql.SQOOP)
            self.setMapReduceNum(10)
        '''
        # Perform the full extraction.
        self.extractComplete()


    # Dispatch full extraction to the selected tool.
    def extractComplete(self):
        # Mysql Dump extraction.
        if (self.getExtractTool() == ExtractMysql.MYSQL_DUMP):
            self.extractMysqlDump()
        elif (self.getExtractTool() == ExtractMysql.SQOOP):
            self.extractMysqlSqoop()


    # Full extraction via MySQL dump: drop the Hive table, dump MySQL data
    # to a local file, recreate the Hive table (all columns as String),
    # then LOAD DATA the file into it.
    def extractMysqlDump(self):
        Logger.info("---------- mysqlDump 开始 " + str(self.getTbId()) + ': ' + str(self.getSourceDb()) + "." + str(self.getSourceTable()) + " ---------- ")
        # Start timestamp.
        startTimestamp = self.getRegisterInstance('dateModel').getTimestamp()

        # 1. Drop the target Hive table.
        Logger.info("删除目标 Hive 对应表")
        self.getRegisterInstance('hiveModel').dropTable(self.getTargetDb() + "." + self.getTargetTable())

        # 2. Dump MySQL data to a local file.
        Logger.info("dump mysql 数据到文件中")
        dumpSql = "SELECT * FROM " + self.getSourceDb() + "." + self.getSourceTable()
        dumpFile = self.getDumpFileDir() + "/" + self.getDumpFileName()
        dumpResult = self.extractDbServerModel.mysqlDumpFile(dumpSql ,dumpFile)

        # 3. Create the Hive table from the MySQL column list;
        #    every column is typed String.
        Logger.info("根据 Mysql 表结构创建 Hive 表结构")
        sourceTableFields = self.getSourceTableFields()
        formatTableFieldsList = []
        for curField in sourceTableFields:
            formatTableFieldsList.append('`' + curField + '`')
        formatTableFieldsStr = ' String,'.join(formatTableFieldsList) + " String"

        createHiveTableSql = '''
        CREATE TABLE IF NOT EXISTS %s.%s (
            %s
        ) ROW FORMAT DELIMITED
        FIELDS TERMINATED BY '\\001'
        COLLECTION ITEMS TERMINATED BY '\\n'
        STORED AS TEXTFILE
        '''%(self.getTargetDb(),self.getTargetTable(),formatTableFieldsStr)

        # Execute the Hive table creation.
        Logger.info("执行 Hive 创建表")
        createHiveTableResult = self.getRegisterInstance('hiveModel').createTable(createHiveTableSql)

        # 4. Upload the dump file into the Hive table.
        Logger.info("上传 dump 文件到 HiveTable 中")
        hiveLoadSql = "LOAD DATA LOCAL INPATH '" + self.getDumpFileDir() + "/" + self.getDumpFileName() + "' OVERWRITE INTO TABLE " + self.getTargetDb() + "." + self.getTargetTable() + ";"
        hiveLoadResult = self.getRegisterInstance('hiveModel').runHiveScript(hiveLoadSql)

        # 5. Check the combined result.
        # NOTE(review): when the dump or table creation failed but the Hive
        # load reported code 0, resultCode ends up 0 (success) — looks
        # unintended; confirm before relying on extract_log codes.
        if (dumpResult['code'] == 0 and createHiveTableResult == True and hiveLoadResult['code'] == 0 ) :
            resultCode = 0
        else:
            resultCode = hiveLoadResult['code']

        # 6. Log the outcome to MySQL and to the application log.
        # Elapsed time.
        diffTimestamp = self.getRegisterInstance('dateModel').getTimestamp() - startTimestamp

        # Record the run in the MySQL extract log.
        self.extractLog(resultCode,diffTimestamp)

        # Print a summary line.
        logStr = "全量抽取 : (Dump : " + str(self.getTbId()) + ': ' + str(self.getSourceDb()) + "." + str(self.getSourceTable()) + " -> " + str(self.getTargetDb()) + "." + str(self.getTargetTable()) + " Time : " + str(diffTimestamp) + ")"
        Logger.info(logStr)



    # Full extraction via Sqoop: drop the Hive table, then run a Sqoop
    # MySQL-to-Hive import through the registered 'sqoopModel'.
    def extractMysqlSqoop(self):
        Logger.info("---------- sqoop 开始 " + str(self.getTbId()) + ': ' + str(self.getSourceDb()) + "." + str(self.getSourceTable()) + " ----------")

        # Start timestamp.
        startTimestamp = self.getRegisterInstance('dateModel').getTimestamp()

        # 1. Drop the target Hive table.
        Logger.info("删除目标 Hive 对应表")
        self.getRegisterInstance('hiveModel').dropTable(self.getTargetDb() + "." + self.getTargetTable())

        # 2. Run the Sqoop MySQL -> Hive import.
        Logger.info("执行 Sqoop Mysql 导入到 Hive")
        self.getRegisterInstance('sqoopModel').setDbServer(self.getExtractDb())
        self.getRegisterInstance('sqoopModel').setSourceDb(self.getSourceDb())
        self.getRegisterInstance('sqoopModel').setSourceTable(self.getSourceTable())
        self.getRegisterInstance('sqoopModel').setTargetDb(self.getTargetDb())
        self.getRegisterInstance('sqoopModel').setTargetTable(self.getTargetTable())
        self.getRegisterInstance('sqoopModel').setMapReduceNum(self.getMapReduceNum())
        result = self.getRegisterInstance('sqoopModel').importMysqlToHive()

        # 3. Check the result.
        if (result['code'] == 0):
            resultCode = 0
        else :
            resultCode = result['code']

        # 4. Log the outcome to MySQL and to the application log.
        # Elapsed time.
        diffTimestamp = self.getRegisterInstance('dateModel').getTimestamp() - startTimestamp

        # Record the run in the MySQL extract log.
        self.extractLog(resultCode,diffTimestamp)

        # Print a summary line.
        logStr = "全量抽取 : (Sqoop : " + str(self.getTbId()) + ': ' + str(self.getSourceDb()) + "." + str(self.getSourceTable()) + " -> " + str(self.getTargetDb()) + "." + str(self.getTargetTable()) + " Time : " + str(diffTimestamp) + ")"
        Logger.info(logStr)

    u'''---------- 全量抽取处理 END ---------- '''



    u'''---------- 增量抽取处理 START ---------- '''

    # Attributes of the incremental table (row from extract_table_ext).
    incrementalAttribute = {}
    def setIncrementalAttribute(self, data):
        self.incrementalAttribute = data
    def getIncrementalAttribute(self):
        return self.incrementalAttribute


    # Incremental-extraction controller: fall back to a full extraction when
    # the Hive target table does not yet exist or its schema changed.
    def extractIncrementalAction(self):

        # Check whether the Hive target table exists.
        if (self.isExistsHiveTable() == False):
            Logger.info("增量抽取控制: 初始化, 全量抽取... -> " + str(self.getTbId()) + ': ' + str(self.getSourceDb()) + "." + str(self.getSourceTable()) )
            # Does not exist: do a full extraction.
            self.extractCompleteAction()
        else:
            # Check for column changes.
            # Schema changed:
            if (self.checkStbAndTtbFields() == True):
                Logger.info("增量抽取控制: 结构发生变化, 初始化, 全量抽取... -> " + str(self.getTbId()) + ': ' + str(self.getSourceDb()) + "." + str(self.getSourceTable()) )
                # Full extraction.
                self.extractCompleteAction()
            # Unchanged:
            else:
                Logger.info("增量抽取控制: 增量抽取... -> " + str(self.getTbId()) + ': ' + str(self.getSourceDb()) + "." + str(self.getSourceTable()))
                # Incremental extraction.
                self.extractIncrementalTable()


    # Incremental extraction body: dump rows past the stored watermark into a
    # staging table "<target>__inc", then merge staging into the target with a
    # LEFT JOIN anti-join + UNION ALL (executed via Spark SQL).
    def extractIncrementalTable(self):
        Logger.info("---------- 增量抽取开始 " + str(self.getTbId()) + ': ' + str(self.getSourceDb()) + "." + str(self.getSourceTable()) + " ----------")

        # Start timestamp.
        startTimestamp = self.getRegisterInstance('dateModel').getTimestamp()

        # Load the incremental-table attributes from config.
        Logger.info("设置增量表属性")
        tableInfoExt = self.getRegisterInstance('biDbModel').getExtractMysqlTableExt(self.getTbId())
        self.setIncrementalAttribute(tableInfoExt)

        # Read the incremental-table attributes.
        Logger.info("获取增量表的属性")
        incTbAttr = self.getIncrementalAttribute()
        primaryKey = incTbAttr['primary_key']
        incrementalField = incTbAttr['incremental_field']
        incrementalVal = incTbAttr['incremental_val']
        conditions = incTbAttr['conditions']

        # Hive target table.
        targetTb = self.getTargetDb() + "." + self.getTargetTable()
        # Hive incremental staging table.
        incTb = targetTb + "__inc"


        # 1. Drop the incremental staging table.
        Logger.info("删除增量抽取表")
        self.getRegisterInstance('hiveModel').dropTable(incTb)

        # 2. Recreate the staging table with the target's schema.
        Logger.info("创建增量抽取表")
        createHiveTableSql = "CREATE TABLE " + incTb + " LIKE " + targetTb
        createHiveTableResult = self.getRegisterInstance('hiveModel').createTable(createHiveTableSql)

        incDumpSql = ""
        # 3. Dump the newest incremental rows to a local file.
        if (incrementalVal == ""):
            # No stored watermark: use the max value of the incremental
            # field in the target table as the extraction baseline.
            Logger.info("获取目标表,最大的字段数, 已这个作为基地抽取数据")
            targetTbMaxPointVal = self.getHiveTbMaxVal(targetTb, incrementalField)

            incDumpSql = "SELECT * FROM " + self.getSourceDb() + "." + self.getSourceTable() + " WHERE " + incrementalField + conditions + "'" + str(targetTbMaxPointVal) + "'"

            # Persist the new watermark.
            Logger.info("更新 point 点")
            self.updateTableExt(tableInfoExt['id'], targetTbMaxPointVal)
        else:
            incDumpSql = "SELECT * FROM " + self.getSourceDb() + "." + self.getSourceTable() + " WHERE " + incrementalField + conditions + "'" + incrementalVal + "'"

        Logger.info("dump 更新数据到本地")
        dumpIncFile = self.getDumpFileDir() + "/" + self.getDumpFileName() + ".inc"
        dumpIncResult = self.extractDbServerModel.mysqlDumpFile(incDumpSql ,dumpIncFile)


        # 4. Load the dump file into the staging table.
        Logger.info("上传 dump 文件到 incHiveTable 中")
        hiveLoadSql = "LOAD DATA LOCAL INPATH '" + dumpIncFile + "' OVERWRITE INTO TABLE " + incTb + ";"
        hiveLoadResult = self.getRegisterInstance('hiveModel').runHiveScript(hiveLoadSql)


        # Read the max incremental-field value of this batch.
        Logger.info("获取增量表本次最大一条增量的增量字段值")
        incTbMaxPointVal = self.getHiveTbMaxVal(incTb, incrementalField)
        # When the incremental batch is empty, do nothing and return.
        if (incTbMaxPointVal == None):
            Logger.info("增量数据为空...")
            return

        # 5. Merge: keep target rows whose primary key is absent from the
        #    staging table, then append all staging rows.
        incHiveSql = """
        INSERT OVERWRITE TABLE %(targetTb)s
        SELECT *
        FROM (
            SELECT a.*
            FROM %(targetTb)s AS a
            LEFT JOIN %(incTb)s AS b
            ON a.%(primaryKey)s = b.%(primaryKey)s
            WHERE b.%(primaryKey)s IS NULL
        ) AS bs

        UNION ALL
        SELECT * FROM %(incTb)s
        ;"""%{'targetTb': targetTb,
              'incTb': incTb,
              'primaryKey': primaryKey
        }

        Logger.info(incHiveSql)

        # 6. Execute the merge through Spark SQL.
        # NOTE(review): incSqlResult is unconditionally overwritten with True
        # on the next line, so a failed Spark job is not detected — confirm
        # whether this override is intentional.
        Logger.info("最终逻辑运算使用 spark sql")
        incSqlResult = self.getRegisterInstance('sparkModel').batchExecuteSql(incHiveSql)
        incSqlResult= True

        # 7. Check the combined result.
        if (dumpIncResult['code'] == 0
            and hiveLoadResult['code'] == 0
            and createHiveTableResult == True
            and incSqlResult == True ) :

            # Persist the new watermark.
            if (incTbMaxPointVal != None):

                self.updateTableExt(tableInfoExt['id'],incTbMaxPointVal )

            resultCode = 0
        else:
            resultCode = 1


        # Log the outcome to MySQL and to the application log.
        # Elapsed time.
        diffTimestamp = self.getRegisterInstance('dateModel').getTimestamp() - startTimestamp

        # Record the run in the MySQL extract log.
        self.extractLog(resultCode, diffTimestamp)

        # Print a summary line.
        logStr = "增量抽取 : (Dump : " + str(self.getTbId()) + ': ' + str(self.getSourceDb()) + "." + str(self.getSourceTable()) + " -> " + str(self.getTargetDb()) + "." + str(self.getTargetTable()) + " Time : " + str(diffTimestamp) + ")"
        Logger.info(logStr)


    u'''---------- 增量抽取处理 END ---------- '''



    u''' ---------- TOOLS ---------'''

    # Fetch the MySQL column names of the source table.
    def getSourceTableFields(self):
        return self.extractDbServerModel.getFileds(self.getSourceDb(),self.getSourceTable())

    # Record an extraction run (result code and elapsed time) in
    # dw_service.extract_log via the bi database model.
    def extractLog(self,code,time):
        logSql = "INSERT INTO dw_service.extract_log (`db_server`,`db_name`,`tb_name`,`extract_type`,`extract_tool`,`code`,`run_time`,`size`,`rows`) "
        logSql += "VALUES ('%s','%s','%s',%d, %d, %d, %d, '%s','%s')"
        #print self.getExtractDb(),self.getSourceDb(),self.getSourceTable(),self.getExtractType(),self.getExtractTool(),code,time,self.getTbSize(),self.getTbRows()
        logSql = logSql%(self.getExtractDb(),self.getSourceDb(),self.getSourceTable(),self.getExtractType(),self.getExtractTool(),code,time,self.getTbSize(),self.getTbRows())
        self.getRegisterInstance('biDbModel').insertData(logSql)


    # Check whether the Hive target table exists (via the Spark model).
    def isExistsHiveTable(self):
        return self.getRegisterInstance('sparkModel').isExistsTable(self.getTargetDb(),self.getTargetTable())


    # Compare source (MySQL) columns against target (Hive) columns.
    # Returns True when the schema changed, False otherwise.
    def checkStbAndTtbFields(self):
        # False: unchanged; True: changed.
        status = False

        # MySQL source-table columns.
        sourceTableFields = self.getSourceTableFields()
        # Hive table columns.
        gatherTableFields = self.getRegisterInstance('sparkModel').getFileds(self.getTargetDb(),self.getTargetTable())

        # Columns found in MySQL but missing in Hive.
        changeFileds = []

        # Different column counts means the schema changed.
        if (len(sourceTableFields) != len(gatherTableFields)):
            status = True
        else:
            # Same count: look for renamed/added columns.
            for sourceField in sourceTableFields:
                if (sourceField not in gatherTableFields):
                    changeFileds.append(sourceField)

            # Any difference counts as a change.
            if len(changeFileds) > 0:
                status = True

        return status



    # Read the max value of a Hive column; columns whose name contains "id"
    # are cast to int before MAX so the comparison is numeric.
    def getHiveTbMaxVal(self, tb, field = ""):

        if (field.find("id") >= 0):
            tbMaxSql = "SELECT MAX(int(" + field + ")) AS c FROM " + tb
        else:
            tbMaxSql = "SELECT MAX(" + field + ") AS c FROM " + tb

        tbMaxVal = self.getRegisterInstance('sparkModel').queryMax(tbMaxSql)

        return tbMaxVal



    # Persist an incremental watermark to dw_service.extract_table_ext.
    def updateTableExt(self, tbId, incVal):
        updateTableExtSql = "UPDATE dw_service.extract_table_ext SET incremental_val='" + str(incVal) +"' WHERE id='" + str(tbId) + "'"
        return self.getRegisterInstance('biDbModel').updataData(updateTableExtSql)

    u''' ---------- TOOLS ---------'''




    u''' 入口 '''

    # Entry point: dispatch to full or incremental extraction based on the
    # configured extract type.
    def run(self):
        Logger.init()

        # Extract type (full / incremental).
        curExtractType = self.getExtractType()

        # Full extraction.
        if (curExtractType == ExtractMysql.COMPLETE):
            self.extractCompleteAction()
        # Incremental extraction.
        elif(curExtractType == ExtractMysql.INCREMENTAL):
            self.extractIncrementalAction()
-------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | u''' 4 | 数据抽取执行入口 5 | ''' 6 | from dw_service_core import DwServiceCore 7 | 8 | from extract.extract_mysql import ExtractMysql 9 | from core.util.log.logger import Logger 10 | 11 | import threading 12 | from time import ctime,sleep 13 | import Queue 14 | 15 | 16 | u''' 17 | 调用方法: 18 | 1. 抽取 串行/并行 19 | runType : liste 串行抽取, thread 并行抽取 20 | ./index.py --service extract --module extract_queue_run --parameter '{"runType":"liste"}' 21 | 22 | ./index.py --service extract --module extract_queue_run --parameter '{"runType":"thread"}' 23 | 24 | 2. 抽取单个数据表 25 | dbServer 抽取服务器: product 业务数据库 ,dw 数据部服务器 26 | sourceDb 源数据库 27 | sourceTb 源数据表 28 | targetDb 目标数据库 29 | targetTb 目标数据表 30 | extractTool 抽取工具: 1 mysql dump , 2 sqoop 31 | mapReduceNum mapReduce 数量 ,抽取类型为 2 时有效 32 | ./index.py --service extract --mo extract_queue_run --par '{"dbServer":"product","sourceDb":"angejia","sourceTb":"call_relation_with_inventory","targetDb":"db_sync","targetTb":"angejia__call_relation_with_inventory","extractTool":"1","mapReduceNum":"1"}' 33 | ''' 34 | class ExtractQueueRun(DwServiceCore) : 35 | 36 | 37 | def process(self): 38 | Logger.init() 39 | 40 | # 解析参数 41 | parameter = self.getFormatParameter() 42 | 43 | # 运行类型 44 | runType = parameter.get('runType') 45 | 46 | # 串行抽取 47 | if (runType == 'liste'): 48 | self.extractMysqlTableListe() 49 | # 并行抽取 50 | elif (runType == 'thread'): 51 | self.extractMysqlTableThread() 52 | # 指定抽取数据表抽取 53 | elif (parameter.get('sourceDb') != None and parameter.get('sourceTb') != None): 54 | self.extractMysqlTableIndependent(parameter) 55 | # 测试 56 | else : 57 | self.extractMysqlTableTest() 58 | 59 | 60 | # 并发执行 61 | def extractMysqlTableThread(self): 62 | # 等待运行数据表 63 | sourceTableList = self.getRegisterInstance('biDbModel').getExtractMysqlTables() 64 | 65 | # 运行线程数 66 | numThreadPool = 2 67 | 68 | # 定义队列 69 | q = Queue.Queue() 70 | 71 | # 加入到队列中 72 | for 
curTableInfo in sourceTableList: 73 | q.put(curTableInfo) 74 | 75 | 76 | # 开指定数量线程消费 77 | for curThreadPoolNum in range(numThreadPool): 78 | currentThread = threading.Thread(target=self.runTable,args=(q, curThreadPoolNum)) 79 | # 父进程不等待子进程结束,继续执行 80 | currentThread.setDaemon(True) 81 | currentThread.start() 82 | sleep(5) 83 | 84 | 85 | # 等到队列为空,再向下执行 86 | q.join() 87 | 88 | Logger.info('执行完成~') 89 | 90 | 91 | def runTable(self, q, threadPoolNum): 92 | 93 | wNum = 1 94 | while(True): 95 | # 队列为空的时候退出 96 | if (q.empty() == True): 97 | break 98 | 99 | # 当前可消费的队列 100 | qTableInfo = q.get() 101 | 102 | sourceTb = qTableInfo['db_name'] + '.' + qTableInfo['tb_name'] + '.' + str(qTableInfo['id']) 103 | Logger.info('线程:' + str(threadPoolNum) + ', 第: ' + str(wNum) + ' 次. ' + str(sourceTb)) 104 | 105 | # 执行抽取任务 106 | self.extractMysqlTable(qTableInfo) 107 | q.task_done() 108 | wNum += 1 109 | 110 | 111 | # 串行 抽取 MYSQL 数据表 112 | def extractMysqlTableListe(self): 113 | extractMysqlTables = self.getRegisterInstance('biDbModel').getExtractMysqlTables() 114 | 115 | for curTableInfo in extractMysqlTables: 116 | self.extractMysqlTable(curTableInfo) 117 | 118 | 119 | 120 | # 抽取方法实体 121 | def extractMysqlTable(self, tableInfo): 122 | try : 123 | extractConf = self.getRegisterInstance('confModel').getExtractConf() 124 | 125 | # 默认表分隔符 126 | confSeparator = extractConf['core']['separator'] 127 | # 抽取到目标 hive 数据库名 128 | confTargetDb = extractConf['extract_mysql']['hive_target_db'] 129 | # dump 本地临时目录 130 | confDumpFileDir = extractConf['extract_mysql']['dump_file_dir'] 131 | 132 | # 数据库配置表信息 133 | 134 | # tb id 135 | tbId = tableInfo['id'] 136 | # mysql 数据库源信息 137 | dbServer = tableInfo['db_server'] 138 | # 数据源: 数据库名 139 | dbName = tableInfo['db_name'] 140 | # 数据源: 表名 141 | tbName = tableInfo['tb_name'] 142 | # 目标 hive 数据库名 143 | dbTargetDbName = tableInfo['target_db_name'] 144 | # 目标 hive 表名 145 | dbTargetTbName = tableInfo['target_tb_name'] 146 | # 抽取工具 147 | extractTool = 
tableInfo['extract_tool'] 148 | # 抽取类型 149 | extractType = tableInfo['extract_type'] 150 | 151 | # 设置 hive 表名的规则 152 | # 当指定了抽取的目标库, 使用指定的库和表名 153 | if (dbTargetDbName != "" and dbTargetTbName !=""): 154 | affirmTargetDb = dbTargetDbName 155 | affirmTargetTb = dbTargetTbName 156 | # 没有使用则用默认的规则 157 | else: 158 | affirmTargetDb = confTargetDb 159 | affirmTargetTb = dbName + confSeparator + tbName 160 | 161 | # 实例化抽取对象 162 | extractMysql = ExtractMysql() 163 | # Dump 方式时的保存目录 164 | extractMysql.setDumpFileDir(confDumpFileDir) 165 | 166 | # 设置抽取的数据库源 167 | if (dbServer == ExtractMysql.PRODUCE_DB): 168 | extractMysql.setExtractDb(ExtractMysql.PRODUCE_DB) 169 | elif (dbServer == ExtractMysql.DW_DB): 170 | extractMysql.setExtractDb(ExtractMysql.DW_DB) 171 | else: 172 | Logger.info("抽取的数据源不存在!" + dbServer) 173 | 174 | 175 | # 设置抽取类型 176 | # 全量抽取 177 | if (extractType == ExtractMysql.COMPLETE): 178 | extractMysql.setExtractType(ExtractMysql.COMPLETE) 179 | # 增量抽取 180 | elif (extractType == ExtractMysql.INCREMENTAL): 181 | extractMysql.setExtractType(ExtractMysql.INCREMENTAL) 182 | else : 183 | Logger.info("抽取数据类型不存在!" + extractType) 184 | 185 | 186 | # 配置指定抽取的工具 187 | if (extractTool == ExtractMysql.MYSQL_DUMP): 188 | extractMysql.setExtractTool(ExtractMysql.MYSQL_DUMP) 189 | elif (extractTool == ExtractMysql.SQOOP) : 190 | extractMysql.setExtractTool(ExtractMysql.SQOOP) 191 | extractMysql.setMapReduceNum(5) 192 | 193 | 194 | # 设置抽取表的信息 195 | sourceDb = dbName 196 | sourceTable = tbName 197 | targetDb = affirmTargetDb 198 | targetTable = affirmTargetTb 199 | 200 | extractMysql.setTbId(tbId) 201 | extractMysql.setSourceDb(sourceDb) 202 | extractMysql.setSourceTable(sourceTable) 203 | extractMysql.setTargetDb(targetDb) 204 | extractMysql.setTargetTable(targetTable) 205 | extractMysql.run() 206 | 207 | except Exception,ex: 208 | log = "异常-> 数据表: " + str(dbServer) + ": " + str(dbName) + "." 
+ str(tbName) 209 | log += " -> " + str(Exception) + ":" + str(ex) 210 | Logger.info(log) 211 | 212 | 213 | 214 | # 独立抽取数据表,使用默认的抽取规则算法抽取数据 215 | def extractMysqlTableIndependent(self, tableInfo): 216 | # 抽取 217 | extractMysql = ExtractMysql() 218 | 219 | # 源数据信息 220 | dbServer = tableInfo.get('dbServer') 221 | sourceDb = tableInfo.get('sourceDb') 222 | sourceTb = tableInfo.get('sourceTb') 223 | targetDb = tableInfo.get('targetDb') 224 | targetTb = tableInfo.get('targetTb') 225 | extractTool = tableInfo.get('extractTool') 226 | # 默认 1 ,三元表达式 227 | mapReduceNum = tableInfo.get('mapReduceNum') and tableInfo.get('mapReduceNum') or 1 228 | 229 | # 抽取工具 230 | extractTool = tableInfo['extractTool'] 231 | 232 | # 业务 数据库服务器 233 | # ExtractMysql.PRODUCE_DB 234 | extractMysql.setExtractDb(dbServer) 235 | 236 | # 抽取类型,全量 237 | extractMysql.setExtractType(ExtractMysql.COMPLETE) 238 | 239 | # 指定工具抽取 240 | extractMysql.setExtractTool(int(extractTool)) 241 | 242 | extractMysql.setMapReduceNum(int(mapReduceNum)) 243 | 244 | # Dump 方式时的保存目录 245 | extractMysql.setDumpFileDir('/data/log/mysql') 246 | 247 | extractMysql.setSourceDb(sourceDb) 248 | extractMysql.setSourceTable(sourceTb) 249 | extractMysql.setTargetDb(targetDb) 250 | extractMysql.setTargetTable(targetTb) 251 | extractMysql.run() 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | -------------------------------------------------------------------------------- /extract/extract_run.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | u''' 3 | 数据抽取执行入口 4 | ''' 5 | import threading 6 | from time import ctime,sleep 7 | import Queue 8 | 9 | from dw_service_core import DwServiceCore 10 | from extract.extract_mysql import ExtractMysql 11 | from core.util.log.logger import Logger 12 | 13 | 14 | u''' 15 | 抽取 mysql 数据到 hive 中 16 | 17 | 调用方法: 18 | 1. 
抽取 串行/并行 19 | runType : liste 串行抽取, thread 并行抽取 20 | ./index.py --service extract --module extract_run --parameter '{"runType":"liste"}' 21 | 22 | ./index.py --service extract --module extract_run --parameter '{"runType":"thread"}' 23 | 24 | 2. 抽取单个数据表 25 | dbServer 抽取服务器: product 业务数据库 ,dw 数据部服务器 26 | sourceDb 源数据库 27 | sourceTb 源数据表 28 | targetDb 目标数据库 29 | targetTb 目标数据表 30 | extractTool 抽取工具: 1 mysql dump , 2 sqoop 31 | extractType 抽取类型: 1 全量, 2 增量 32 | mapReduceNum mapReduce 数量 ,抽取类型为 2 时有效 33 | ../index.py \ 34 | --service extract \ 35 | --module extract_run \ 36 | --par '{"dbServer":"product","sourceDb":"angejia","sourceTb":"user_msg","targetDb":"db_sync","targetTb":"angejia__user_msg","extractType":"1","extractTool":"1","mapReduceNum":"1"}' 37 | 38 | ''' 39 | class ExtractRun(DwServiceCore) : 40 | 41 | def process(self): 42 | Logger.init() 43 | 44 | # 解析参数 45 | parameter = self.getFormatParameter() 46 | 47 | # 运行类型 48 | runType = parameter.get('runType') 49 | 50 | # 串行抽取 51 | if (runType == 'liste'): 52 | self.extractMysqlTableListe(ExtractMysql.COMPLETE) 53 | self.extractMysqlTableListe(ExtractMysql.INCREMENTAL) 54 | # 并行抽取 55 | elif (runType == 'thread'): 56 | self.extractMysqlTableThread() 57 | self.extractMysqlTableListe(ExtractMysql.INCREMENTAL) 58 | # 指定抽取数据表抽取 59 | elif (parameter.get('sourceDb') != None and parameter.get('sourceTb') != None): 60 | self.extractMysqlTableIndependent(parameter) 61 | # 测试 62 | else : 63 | self.extractMysqlTableTest() 64 | 65 | 66 | # 并发执行 67 | def extractMysqlTableThread(self): 68 | 69 | # 等待运行数据表 70 | sourceTableList = self.getRegisterInstance('biDbModel').getExtractMysqlTables(ExtractMysql.COMPLETE) 71 | #sourceTableList = self.getRegisterInstance('biDbModel').getExtractMysqlTables(ExtractMysql.INCREMENTAL) 72 | 73 | # 运行线程数 74 | numThreadPool = 4 75 | 76 | # 定义队列 77 | q = Queue.Queue() 78 | 79 | # 加入到队列中 80 | for curTableInfo in sourceTableList: 81 | q.put(curTableInfo) 82 | 83 | 84 | # 开指定数量线程消费 85 | for 
curThreadPoolNum in range(numThreadPool): 86 | currentThread = threading.Thread(target=self.runTable,args=(q, curThreadPoolNum)) 87 | # True:父进程不等待子进程结束, 继续执行 88 | # False:父进程会等待所有子进程执行完毕,父进程才会退出 89 | currentThread.setDaemon(True) 90 | currentThread.start() 91 | 92 | # 等到队列为空,再向下执行 93 | q.join() 94 | 95 | Logger.info('执行完成~') 96 | 97 | return True 98 | 99 | 100 | def runTable(self, q, threadPoolNum): 101 | 102 | wNum = 1 103 | while(True): 104 | # 队列为空的时候退出 105 | if (q.empty() == True): 106 | break 107 | 108 | # 当前可消费的队列 109 | qTableInfo = q.get() 110 | 111 | sourceTb = str(qTableInfo['id']) + ': ' + qTableInfo['db_name'] + '.' + qTableInfo['tb_name'] 112 | Logger.info('线程:' + str(threadPoolNum) + ', 第: ' + str(wNum) + ' 次-> ' + str(sourceTb)) 113 | 114 | # 执行抽取任务 115 | self.extractMysqlTable(qTableInfo) 116 | 117 | #hiveTbName = "test." + qTableInfo['db_name'] + "__" + qTableInfo['tb_name'] 118 | #print hiveTbName 119 | #print self.hiveModel.dropTable(hiveTbName) 120 | 121 | q.task_done() 122 | wNum += 1 123 | 124 | 125 | # 串行 抽取 MYSQL 数据表 126 | def extractMysqlTableListe(self, extractType): 127 | extractMysqlTables = self.getRegisterInstance("biDbModel").getExtractMysqlTables(extractType) 128 | 129 | for curTableInfo in extractMysqlTables: 130 | self.extractMysqlTable(curTableInfo) 131 | 132 | 133 | 134 | # 抽取方法实体 135 | def extractMysqlTable(self, tableInfo): 136 | try : 137 | extractConf = self.getRegisterInstance('confModel').getExtractConf() 138 | 139 | # 默认表分隔符 140 | confSeparator = extractConf['core']['separator'] 141 | # 抽取到目标 hive 数据库名 142 | confTargetDb = extractConf['extract_mysql']['hive_target_db'] 143 | # dump 本地临时目录 144 | confDumpFileDir = extractConf['extract_mysql']['dump_file_dir'] 145 | 146 | # 数据库配置表信息 147 | 148 | # tb id 149 | tbId = tableInfo['id'] 150 | # mysql 数据库源信息 151 | dbServer = tableInfo['db_server'] 152 | # 数据源: 数据库名 153 | dbName = tableInfo['db_name'] 154 | # 数据源: 表名 155 | tbName = tableInfo['tb_name'] 156 | # 目标 hive 数据库名 157 | 
dbTargetDbName = tableInfo['target_db_name'] 158 | # 目标 hive 表名 159 | dbTargetTbName = tableInfo['target_tb_name'] 160 | # 抽取工具 161 | extractTool = tableInfo['extract_tool'] 162 | # 抽取类型 163 | extractType = tableInfo['extract_type'] 164 | 165 | # 设置 hive 表名的规则 166 | # 当指定了抽取的目标库, 使用指定的库和表名 167 | if (dbTargetDbName != "" and dbTargetTbName !=""): 168 | affirmTargetDb = dbTargetDbName 169 | affirmTargetTb = dbTargetTbName 170 | # 没有使用则用默认的规则 171 | else: 172 | affirmTargetDb = confTargetDb 173 | affirmTargetTb = dbName + confSeparator + tbName 174 | 175 | # 实例化抽取对象 176 | extractMysql = ExtractMysql() 177 | # Dump 方式时的保存目录 178 | extractMysql.setDumpFileDir(confDumpFileDir) 179 | 180 | # 设置抽取的数据库源 181 | if (dbServer == ExtractMysql.PRODUCE_DB): 182 | extractMysql.setExtractDb(ExtractMysql.PRODUCE_DB) 183 | elif (dbServer == ExtractMysql.DW_DB): 184 | extractMysql.setExtractDb(ExtractMysql.DW_DB) 185 | else: 186 | Logger.info("抽取的数据源不存在!" + dbServer) 187 | 188 | 189 | # 设置抽取类型 190 | # 全量抽取 191 | if (extractType == ExtractMysql.COMPLETE): 192 | extractMysql.setExtractType(ExtractMysql.COMPLETE) 193 | # 增量抽取 194 | elif (extractType == ExtractMysql.INCREMENTAL): 195 | extractMysql.setExtractType(ExtractMysql.INCREMENTAL) 196 | else : 197 | Logger.info("抽取数据类型不存在!" 
+ extractType) 198 | 199 | 200 | # 配置指定抽取的工具 201 | if (extractTool == ExtractMysql.MYSQL_DUMP): 202 | extractMysql.setExtractTool(ExtractMysql.MYSQL_DUMP) 203 | elif (extractTool == ExtractMysql.SQOOP) : 204 | extractMysql.setExtractTool(ExtractMysql.SQOOP) 205 | extractMysql.setMapReduceNum(5) 206 | 207 | 208 | # 设置抽取表的信息 209 | sourceDb = dbName 210 | sourceTable = tbName 211 | targetDb = affirmTargetDb 212 | targetTable = affirmTargetTb 213 | 214 | extractMysql.setTbId(tbId) 215 | extractMysql.setSourceDb(sourceDb) 216 | extractMysql.setSourceTable(sourceTable) 217 | extractMysql.setTargetDb(targetDb) 218 | extractMysql.setTargetTable(targetTable) 219 | extractMysql.run() 220 | 221 | except Exception,ex: 222 | log = "异常-> 数据表: " + str(dbServer) + ": " + str(dbName) + "." + str(tbName) 223 | log += " -> " + str(Exception) + ":" + str(ex) 224 | Logger.info(log) 225 | 226 | 227 | 228 | # 独立抽取数据表,使用默认的抽取规则算法抽取数据 229 | def extractMysqlTableIndependent(self, tableInfo): 230 | # 抽取 231 | extractMysql = ExtractMysql() 232 | 233 | # 源数据信息 234 | dbServer = tableInfo.get('dbServer') 235 | sourceDb = tableInfo.get('sourceDb') 236 | sourceTb = tableInfo.get('sourceTb') 237 | targetDb = tableInfo.get('targetDb') 238 | targetTb = tableInfo.get('targetTb') 239 | # 抽取工具 240 | extractTool = tableInfo.get('extractTool') 241 | # 抽取类型 242 | extractType = tableInfo.get('extractType') 243 | # 默认 1 ,三元表达式 244 | mapReduceNum = tableInfo.get('mapReduceNum') and tableInfo.get('mapReduceNum') or 1 245 | 246 | # 业务 数据库服务器 247 | # ExtractMysql.PRODUCE_DB 248 | extractMysql.setExtractDb(dbServer) 249 | 250 | # 抽取类型,全量 251 | extractMysql.setExtractType(ExtractMysql.COMPLETE) 252 | 253 | # 指定工具抽取 254 | extractMysql.setExtractTool(int(extractTool)) 255 | 256 | # 抽取类型 257 | extractMysql.setExtractType(int(extractType)) 258 | 259 | # sqoop 抽取的时候 mapreduce 数量 260 | extractMysql.setMapReduceNum(int(mapReduceNum)) 261 | 262 | # Dump 方式时的保存目录 263 | extractMysql.setDumpFileDir('/data/log/mysql') 264 | 
265 | extractMysql.setSourceDb(sourceDb) 266 | extractMysql.setSourceTable(sourceTb) 267 | extractMysql.setTargetDb(targetDb) 268 | extractMysql.setTargetTable(targetTb) 269 | extractMysql.run() 270 | 271 | 272 | -------------------------------------------------------------------------------- /extract/extract_run.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # 目录: /home/dwadmin/app/dw_etl 4 | dwServiceHome=$1 5 | 6 | # 运行类型: liste thread 7 | runType=$2 8 | 9 | # 运行抽取脚本 10 | $dwServiceHome/index.py --service extract --module extract_run --parameter '{"runType":"'${runType}'"}' 11 | -------------------------------------------------------------------------------- /extract/gather_run.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | u''' 4 | 聚合 run 5 | ''' 6 | from dw_service_core import DwServiceCore 7 | 8 | from extract.gather_table import GatherTable 9 | from core.util.log.logger import Logger 10 | 11 | u''' 12 | 聚合快照表到一张 hive 分区表中 13 | 14 | 调用方法: 15 | 1. gather 所有表 16 | ./index.py --service extract --module gather_run --parameter '{}' 17 | 18 | 2. 
聚合指定表 19 | ./index.py --service extract --module gather_run --parameter '{"date":"2016-01-28","dbName":"angejia","tbName":"inventory_detail_survey"}' 20 | 21 | date 分区日期,默认昨天,格式 yyyy-mm-dd 22 | dbName 数据库名 23 | tbName 表名 24 | 25 | ''' 26 | class GatherRun(DwServiceCore) : 27 | 28 | def process(self): 29 | Logger.init() 30 | 31 | # 解析参数 32 | parameter = self.getFormatParameter() 33 | 34 | dbName = parameter.get('dbName') 35 | tbName = parameter.get('tbName') 36 | date = parameter.get('date') 37 | 38 | if (date == None): 39 | date = self.getRegisterInstance('dateModel').getYesterdayByYmd() 40 | 41 | # gather 指定数据表 42 | if (dbName != None and tbName != None): 43 | self.gatherTable(dbName,tbName,date) 44 | # gather 所有表 45 | else: 46 | self.gatherTableAll(date) 47 | 48 | #print dbServer , dbName , tbName , date 49 | 50 | 51 | # 聚合数据表 52 | def gatherTable(self,dbName,tbName,date): 53 | try : 54 | extractConf = self.getRegisterInstance('confModel').getExtractConf() 55 | 56 | confSeparator = extractConf['core']['separator'] 57 | confSourceDb = extractConf['gather_table']['hive_source_db'] 58 | confTargetDb = extractConf['gather_table']['hive_target_db'] 59 | 60 | # 设置 gather 表规则 61 | sourceDb = confSourceDb 62 | sourceTable = dbName + confSeparator + tbName 63 | targetDb = confTargetDb 64 | targetTable = dbName + confSeparator + tbName 65 | 66 | gatherTable = GatherTable() 67 | gatherTable.setSourceDb(sourceDb) 68 | gatherTable.setSourceTable(sourceTable) 69 | gatherTable.setTargetDb(targetDb) 70 | gatherTable.setTargetTable(targetTable) 71 | gatherTable.setPartitionDate(date) 72 | gatherTable.run() 73 | 74 | except Exception,ex: 75 | log = "异常-> " + str(sourceDb) + "." 
+ str(sourceTable) 76 | log += " -> " + str(Exception) + ":" + str(ex) 77 | Logger.info(log) 78 | 79 | # 聚合所有数据表 80 | def gatherTableAll (self,date): 81 | # 获取抽取列表 82 | gatherTables = self.getRegisterInstance('biDbModel').getGatherTables() 83 | 84 | for curTableInfo in gatherTables: 85 | # 源数据信息 86 | dbName = curTableInfo['db_name'] 87 | tbName = curTableInfo['tb_name'] 88 | 89 | self.gatherTable(dbName, tbName, date) 90 | 91 | 92 | 93 | # 聚合 table 94 | def gatherTableBak(self,date): 95 | extractConf = self.getRegisterInstance('confModel').getExtractConf() 96 | 97 | confSeparator = extractConf['core']['separator'] 98 | confSourceDb = extractConf['gather_table']['hive_source_db'] 99 | confTargetDb = extractConf['gather_table']['hive_target_db'] 100 | 101 | # 获取抽取列表 102 | gatherTables = self.getRegisterInstance('biDbModel').getGatherTables() 103 | for curTableInfo in gatherTables: 104 | # 源数据信息 105 | dbServer = curTableInfo['db_server'] 106 | dbName = curTableInfo['db_name'] 107 | tbName = curTableInfo['tb_name'] 108 | 109 | # 设置 gather 表规则 110 | sourceDb = confSourceDb 111 | sourceTable = dbName + confSeparator + tbName 112 | targetDb = confTargetDb 113 | targetTable = dbName + confSeparator + tbName 114 | 115 | gatherTable = GatherTable() 116 | gatherTable.setSourceDb(sourceDb) 117 | gatherTable.setSourceTable(sourceTable) 118 | gatherTable.setTargetDb(targetDb) 119 | gatherTable.setTargetTable(targetTable) 120 | gatherTable.setPartitionDate(date) 121 | gatherTable.run() 122 | 123 | -------------------------------------------------------------------------------- /extract/gather_table.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | from dw_service_core import DwServiceCore 3 | from core.util.log.logger import Logger 4 | 5 | class GatherTable(DwServiceCore): 6 | 7 | # 源数据库 8 | sourceDb = None 9 | def setSourceDb(self,data): 10 | self.sourceDb = data 11 | def getSourceDb(self): 12 | return self.sourceDb 13 | 

    # Source table name.
    sourceTable = None
    def setSourceTable(self,data):
        self.sourceTable = data
    def getSourceTable(self):
        return self.sourceTable


    # Target (gather) database name.
    targetDb = None
    def setTargetDb(self,data):
        self.targetDb = data
    def getTargetDb(self):
        return self.targetDb


    # Target (gather) table name.
    targetTable = None
    def setTargetTable(self,data):
        self.targetTable = data
    def getTargetTable(self):
        return self.targetTable


    # Partition date (yyyy-mm-dd).
    partitionDate = None
    def setPartitionDate(self,data):
        self.partitionDate = data
    def getPartitionDate(self):
        return self.partitionDate


    # Whether the gather (target) table already exists in Hive.
    def isExistsGatherTable(self):
        return self.getRegisterInstance('hiveModel').isExistsTable(self.getTargetDb(),self.getTargetTable())



    # First-time initialisation: create the gather table from the source
    # table's columns (all typed String) and load the current partition.
    def initGatherTable(self):
        # Start timestamp, for run-time accounting.
        startTimestamp = self.getRegisterInstance('dateModel').getTimestamp()

        # 1. Build the gather-table DDL from the source table's fields.
        # Every column is created as String; p_dt is the partition column.
        sourceTableFields = self.getRegisterInstance('hiveModel').getFileds(self.getSourceDb(),self.getSourceTable())
        formatTableFieldsList = []
        for curField in sourceTableFields:
            formatTableFieldsList.append('`' + curField + '`')
        formatTableFieldsStr = ' String,'.join(formatTableFieldsList) + " String"

        createHiveTableSql = '''
        CREATE TABLE IF NOT EXISTS %s.%s (
            %s
        ) PARTITIONED BY (
            `p_dt` String
        )
        STORED AS ORC
        '''%(self.getTargetDb(),self.getTargetTable(),formatTableFieldsStr)


        # Create the table.
        createHiveTableResult = self.getRegisterInstance('hiveModel').createTable(createHiveTableSql)

        # 2. Load the partition from the source table.
        insertSql = '''
        INSERT OVERWRITE TABLE `%(gatherTable)s` PARTITION (`p_dt` = '%(partitionDate)s') SELECT * FROM %(sourceTable)s;
        '''% {'gatherTable':self.getTargetDb() + '.' + self.getTargetTable(),
              'partitionDate' : self.getPartitionDate(),
              'sourceTable' : self.getSourceDb() + '.' + self.getSourceTable()
        }
        # Run the load.
        insertResult = self.getRegisterInstance('hiveModel').batchExecuteSql(insertSql)

        # 3. Result code: 0 = success, 1 = failure.
        if (createHiveTableResult == True and insertResult == True ) :
            resultCode = 0
        else:
            resultCode = 1

        # 4. Account elapsed time, record the MySQL log, print.
        diffTimestamp = self.getRegisterInstance('dateModel').getTimestamp() - startTimestamp

        # MySQL log record.
        self.extractLog(resultCode,diffTimestamp)

        # Console log.
        logStr = "(初始化 : " + str(self.getSourceDb()) + "." + str(self.getSourceTable()) + " -> " + str(self.getTargetDb()) + "." + str(self.getTargetTable()) + " Time : " + str(diffTimestamp) + ")"
        Logger.info(logStr)


    # Fields present in the source table but missing from the gather table.
    def getSourceTableNewFields(self):
        sourceTableFields = self.getRegisterInstance('hiveModel').getFileds(self.getSourceDb(),self.getSourceTable())
        gatherTableFields = self.getRegisterInstance('hiveModel').getFileds(self.getTargetDb(),self.getTargetTable())

        # Collect the newly added fields.
        newFileds = []
        for sourceField in sourceTableFields:
            if (sourceField not in gatherTableFields):
                newFileds.append(sourceField)

        return newFileds


    # Add the given fields to the gather table (ALTER TABLE ... ADD COLUMNS).
    # The partition date is recorded as the new column's COMMENT, marking
    # when the field first appeared.
    def alterGatherTableField(self,fields):
        alterTableSql = '';

        for field in fields:
            alterTableSql+='''ALTER TABLE `%(gatherTable)s` ADD COLUMNS(`%(field)s` String COMMENT '%(partitionDate)s');
            '''%{'gatherTable':self.getTargetDb() + '.' + self.getTargetTable(),
                 'field' : field,
                 'partitionDate' : self.getPartitionDate()
            };

        return self.getRegisterInstance('hiveModel').batchExecuteSql(alterTableSql)


    # Load one partition of the source table into the gather table.
    # Gather-table columns absent from the source are padded with ''.
    def sourceTableToGatherTable(self):
        # Start timestamp, for run-time accounting.
        startTimestamp = self.getRegisterInstance('dateModel').getTimestamp()

        # 1. Current field lists of both tables.
        sourceTableFields = self.getRegisterInstance('hiveModel').getFileds(self.getSourceDb(),self.getSourceTable())
        gatherTableFields = self.getRegisterInstance('hiveModel').getFileds(self.getTargetDb(),self.getTargetTable())

        # 2. Build the SELECT list in gather-table column order; the p_dt
        #    partition column is skipped, missing columns select ''.
        fieldSql = ''
        for curGatherField in gatherTableFields:
            if (curGatherField == 'p_dt') : continue

            if (curGatherField in sourceTableFields):
                fieldSql += '`' + curGatherField + '`,'
            else:
                fieldSql += "'' AS " + '`' + curGatherField + '`,'

        # Strip the trailing comma.
        formatFieldSql = fieldSql[:-1]

        # 3. Assemble the INSERT statement.
        gatherTableSql = '''
        INSERT OVERWRITE TABLE `%(gatherTable)s` PARTITION (`p_dt` = '%(partitionDate)s') SELECT %(fieldSql)s FROM %(sourceTable)s;
        '''% {'gatherTable':self.getTargetDb() + '.' + self.getTargetTable(),
              'partitionDate' : self.getPartitionDate(),
              'fieldSql' : formatFieldSql,
              'sourceTable' : self.getSourceDb() + '.' + self.getSourceTable()
        }
        # Run it.
        gatherTableResult = self.getRegisterInstance('hiveModel').batchExecuteSql(gatherTableSql)

        # 4. Result code: 0 = success, 1 = failure.
        if (gatherTableResult == True ) :
            resultCode = 0
        else:
            resultCode = 1

        # 5. Account elapsed time, record the MySQL log, print.
        diffTimestamp = self.getRegisterInstance('dateModel').getTimestamp() - startTimestamp

        # MySQL log record.
        self.extractLog(resultCode,diffTimestamp)

        # Console log.
        logStr = "(执行聚合 : " + str(self.getSourceDb()) + "." + str(self.getSourceTable()) + " -> " + str(self.getTargetDb()) + "." + str(self.getTargetTable()) + " Time : " + str(diffTimestamp) + ")"
        Logger.info(logStr)


    # Write a run record into dw_service.gather_log (code: 0 = ok, 1 = failed).
    def extractLog(self,code,time):
        logSql = "INSERT INTO dw_service.gather_log (`db_name`,`tb_name`,`code`,`run_time`) "
        logSql += "VALUES ('%s','%s',%d,%d)"
        logSql = logSql%(self.getSourceDb(),self.getSourceTable(),code,time)
        self.getRegisterInstance('biDbModel').insertData(logSql)


    # Entry point: create-and-load on first run; otherwise evolve the
    # schema if the source gained fields, then load the partition.
    def run(self):
        Logger.init()
        # Target table does not exist yet.
        if (self.isExistsGatherTable() == False):
            # Create and load.
            self.initGatherTable()
        else:
            sourceTableNewFields = self.getSourceTableNewFields()
            # The source table gained new fields.
            if (len(sourceTableNewFields) > 0):
                # Extend the gather table first.
                self.alterGatherTableField(sourceTableNewFields)
                # Then load.
                self.sourceTableToGatherTable()
            else:
                # Load directly.
                self.sourceTableToGatherTable()


--------------------------------------------------------------------------------
/extract/snapshot_run.py:
--------------------------------------------------------------------------------
#coding=utf-8

u'''
镜像run
'''
from dw_service_core import DwServiceCore
from core.util.log.logger import Logger

u'''
镜像拷贝一张表到hive带时间戳

调用方法:
1. snapshot所有表
./index.py --service extract --module snapshot_run --parameter '{}'

2. 
snapshot指定表 17 | ./index.py --service extract --module snapshot_run --parameter '{"date":"2016-12-01","dbName":"xinfang","tbName":"loupan_basic"}' 18 | 19 | date 生成镜像日期,默认昨天,格式 yyyy-mm-dd 20 | dbName 数据库名 21 | tbName 表名 22 | 23 | ''' 24 | class SnapshotRun(DwServiceCore) : 25 | 26 | def process(self): 27 | Logger.init() 28 | 29 | # 解析参数 30 | parameter = self.getFormatParameter() 31 | 32 | dbName = parameter.get('dbName') 33 | tbName = parameter.get('tbName') 34 | date = parameter.get('date') 35 | 36 | if (date == None): 37 | date = self.getRegisterInstance('dateModel').getYesterdayByYmd() 38 | # snapshot 指定数据表 39 | if (dbName != None and tbName != None): 40 | self.snapshotTable(dbName,tbName,date) 41 | # snapshot 所有表 42 | else: 43 | self.snapshotTableAll(date) 44 | 45 | #snapshot指定表 46 | def snapshotTable(self,dbName,tbName,date): 47 | try : 48 | extractConf = self.getRegisterInstance('confModel').getExtractConf() 49 | 50 | confSeparator = extractConf['core']['separator'] 51 | confSourceDb = extractConf['snapshot_table']['hive_source_db'] 52 | confTargetDb = extractConf['snapshot_table']['hive_target_db'] 53 | 54 | # 设置 snapshot表规则 55 | sourceDb = confSourceDb 56 | sourceTable = dbName + confSeparator + tbName 57 | targetDb = confTargetDb 58 | targetTable = dbName + confSeparator + tbName + '_' + date.replace('-','') 59 | 60 | # 开始时间 61 | startTimestamp = self.getRegisterInstance('dateModel').getTimestamp() 62 | 63 | # 1.生成sql 64 | createHiveTableSql = ''' 65 | DROP TABLE IF EXISTS %(snapshotTbl)s; 66 | CREATE TABLE IF NOT EXISTS %(snapshotTbl)s AS 67 | SELECT * FROM %(srcTbl)s;'''%{'srcTbl':sourceDb + '.' + sourceTable, 68 | 'snapshotTbl': targetDb + '.' 
+ targetTable} 69 | 70 | Result = self.getRegisterInstance('hiveModel').batchExecuteSql(createHiveTableSql) 71 | # 2.计算执行结果写日志和打印 72 | diffTimestamp = self.getRegisterInstance('dateModel').getTimestamp() - startTimestamp 73 | logSql = "INSERT INTO dw_service.snapshot_log (`source_db`,`source_table`,`target_db`,`target_table`,`code`,`run_time`) " 74 | logSql += "VALUES ('%s','%s','%s','%s',%d,%d)" 75 | logSql = logSql%(sourceDb,sourceTable,targetDb,targetTable,int(Result),diffTimestamp) 76 | self.getRegisterInstance('biDbModel').insertData(logSql) 77 | 78 | # 打印日志 79 | logStr = str(sourceDb) + "." + str(sourceTable) + " -> " + str(targetDb) + "." + str(targetTable) + " Time : " + str(diffTimestamp) + ")" 80 | Logger.info(logStr) 81 | 82 | except Exception,ex: 83 | log = "异常-> " + str(sourceDb) + "." + str(sourceTable) 84 | log += " -> " + str(Exception) + ":" + str(ex) 85 | Logger.info(log) 86 | 87 | #snapshot所有表 88 | def snapshotTableAll(self,date): 89 | # 获取抽取列表 90 | snapshotTables = self.getRegisterInstance('biDbModel').getGatherTables() 91 | for curTableInfo in snapshotTables: 92 | # 源数据信息 93 | dbName = curTableInfo['db_name'] 94 | tbName = curTableInfo['tb_name'] 95 | self.snapshotTable(dbName, tbName, date) 96 | -------------------------------------------------------------------------------- /index.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #coding=utf-8 3 | import sys,os 4 | 5 | from core.util.base.args_format import ArgsFormat 6 | from dw_service_core import DwServiceCore 7 | 8 | argsFormat = ArgsFormat(sys.argv) 9 | #需要解析的长参数 10 | argsFormat.setlongOption(["service=", "module=","parameter="]) 11 | #需要解析的段参数 12 | #argsFormat.setshortOption("m:f:") 13 | argsMap = argsFormat.run() 14 | 15 | # 初始化 16 | dwServiceCore = DwServiceCore() 17 | dwServiceCore.setService(argsMap.get('--service')) 18 | dwServiceCore.setModule(argsMap.get('--module')) 19 | 
dwServiceCore.setParameter(argsMap.get('--parameter')) 20 | dwServiceCore.run() 21 | -------------------------------------------------------------------------------- /template/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonWiki/dw_etl/a679c0462006cd7c35e3b3f3d00e25c49a55a983/template/__init__.py -------------------------------------------------------------------------------- /template/template_run.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | from dw_service_core import DwServiceCore 4 | 5 | u''' 6 | 模板: 7 | ./index.py --service template --module template_run --parameter '{"date":"20151010"}' 8 | ''' 9 | 10 | class TemplateRun(DwServiceCore): 11 | 12 | # 初始化 13 | def init(self): 14 | #super(DwServiceCore,self).init() 15 | DwServiceCore.init(self) 16 | print "init" 17 | 18 | # 处理流程 19 | def process(self): 20 | print "process" 21 | 22 | # 关闭 23 | def shutdown(self): 24 | print "shutdown" -------------------------------------------------------------------------------- /uba_log/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JasonWiki/dw_etl/a679c0462006cd7c35e3b3f3d00e25c49a55a983/uba_log/__init__.py -------------------------------------------------------------------------------- /uba_log/uba_ods_table_run.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import time 3 | import datetime 4 | 5 | from dw_service_core import DwServiceCore 6 | from cluster_task.dw_sql import DwSql 7 | from core.util.log.logger import Logger 8 | 9 | u''' 10 | uba ods table log 处理 11 | 调用方法: 12 | 单独跑一张 dw_web_action_detail_log 13 | ./index.py --service uba_log --module uba_ods_table_run --parameter '{"serverType":"spark","date":"yesterday","logType":"dw_app_access_log","isDwSql":"no"}' 14 | ./index.py 
--service uba_log --module uba_ods_table_run --parameter '{"serverType":"spark","date":"yesterday","logType":"dw_app_action_detail_log","isDwSql":"no"}' 15 | ./index.py --service uba_log --module uba_ods_table_run --parameter '{"serverType":"spark","date":"yesterday","logType":"dw_web_visit_traffic_log","isDwSql":"no"}' 16 | ./index.py --service uba_log --module uba_ods_table_run --parameter '{"serverType":"spark","date":"yesterday","logType":"dw_web_action_detail_log","isDwSql":"no"}' 17 | 18 | OR 19 | 20 | # 跑所有的 log 21 | ./index.py --service uba_log --module uba_ods_table_run --parameter '{"serverType":"spark","date":"yesterday","logType":"","isDwSql":"no"}' 22 | ''' 23 | 24 | class UbaOdsTableRun(DwServiceCore) : 25 | 26 | dwSqlServcie = None 27 | 28 | # uba sql 所在目录 29 | ubaSqlPath = None 30 | 31 | # 最终状态: 32 | accessLogStatus = None 33 | dwAccessLogStatus = None 34 | ubaAppActionLogStatus = None 35 | ubaWebVisitLogStatus = None 36 | ubaWebActionLogStatus = None 37 | 38 | 39 | def init(self): 40 | DwServiceCore.init(self) 41 | 42 | Logger.init() 43 | 44 | self.ubaSqlPath = self.getDwCoreInstance().SystemPath('basePath') + '/uba_log/uba_sql' 45 | 46 | 47 | def process(self): 48 | # 录入参数: 49 | parsMap = self.getFormatParameter() 50 | 51 | self.createOdsTable(parsMap) 52 | 53 | 54 | def createOdsTable(self, parsMap): 55 | # 执行控制 56 | self.dwSqlServcie = DwSql() 57 | 58 | # 日志类型 59 | logType = parsMap.get('logType') 60 | if (logType == None or logType == '') : 61 | self.accessLogStatus = self.accessLog(parsMap) 62 | self.dwAccessLogStatus = self.dwAccessLog(parsMap) 63 | self.ubaAppActionLogStatus = self.ubaAppActionLog(parsMap) 64 | self.ubaWebVisitLogStatus = self.ubaWebVisitLog(parsMap) 65 | self.ubaWebActionLogStatus = self.ubaWebActionLog(parsMap) 66 | 67 | elif (logType == 'access_log' ): 68 | self.accessLogStatus = self.accessLog(parsMap) 69 | 70 | elif (logType == 'dw_access_log' ): 71 | self.dwAccessLogStatus = self.dwAccessLog(parsMap) 72 | 73 | elif 
(logType == 'uba_app_action_log' ): 74 | self.ubaAppActionLogStatus = self.ubaAppActionLog(parsMap) 75 | 76 | elif (logType == 'uba_web_visit_log' ): 77 | self.ubaWebVisitLogStatus = self.ubaWebVisitLog(parsMap) 78 | 79 | elif (logType == 'uba_web_action_log' ): 80 | self.ubaWebActionLogStatus = self.ubaWebActionLog(parsMap) 81 | 82 | 83 | def accessLog(self, parsMap): 84 | parsData = parsMap 85 | parsData.update( {"sql" : self.ubaSqlPath + "/ods/access_log.sql"} ) 86 | return self.dwSqlServcie.runDwSqlProcess(parsData) 87 | 88 | 89 | def dwAccessLog(self, parsMap): 90 | parsData = parsMap 91 | parsData.update( {"sql" : self.ubaSqlPath + "/ods/dw_access_log.sql"} ) 92 | return self.dwSqlServcie.runDwSqlProcess(parsData) 93 | 94 | 95 | def ubaAppActionLog(self, parsMap): 96 | parsData = parsMap 97 | parsData.update( {"sql" : self.ubaSqlPath + "/ods/uba_app_action_log.sql"} ) 98 | return self.dwSqlServcie.runDwSqlProcess(parsData) 99 | 100 | 101 | def ubaWebVisitLog(self, parsMap): 102 | parsData = parsMap 103 | parsData.update( {"sql" : self.ubaSqlPath + "/ods/uba_web_visit_log.sql"} ) 104 | return self.dwSqlServcie.runDwSqlProcess(parsData) 105 | 106 | 107 | def ubaWebActionLog(self, parsMap): 108 | parsData = parsMap 109 | parsData.update( {"sql" : self.ubaSqlPath + "/ods/uba_web_action_log.sql"} ) 110 | return self.dwSqlServcie.runDwSqlProcess(parsData) 111 | 112 | 113 | def shutdown(self): 114 | Logger.info("执行结果 access_log : " + str(self.accessLogStatus)) 115 | Logger.info("执行结果 dw_access_log : " + str(self.dwAccessLogStatus)) 116 | Logger.info("执行结果 uba_app_action_log : " + str(self.ubaAppActionLogStatus)) 117 | Logger.info("执行结果 uba_web_visit_log : " + str(self.ubaWebVisitLogStatus)) 118 | Logger.info("执行结果 uba_web_action_log : " + str(self.ubaWebActionLogStatus)) 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 
-------------------------------------------------------------------------------- /uba_log/uba_run.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import time 3 | import datetime 4 | 5 | from dw_service_core import DwServiceCore 6 | from cluster_task.dw_sql import DwSql 7 | from core.util.log.logger import Logger 8 | 9 | u''' 10 | uba log 处理 11 | 调用方法: 12 | 单独跑一张 dw_web_action_detail_log 13 | ./index.py --service uba_log --module uba_run --parameter '{"serverType":"spark","date":"yesterday","logType":"dw_app_access_log","isDwSql":"no"}' 14 | ./index.py --service uba_log --module uba_run --parameter '{"serverType":"spark","date":"yesterday","logType":"dw_app_action_detail_log","isDwSql":"no"}' 15 | ./index.py --service uba_log --module uba_run --parameter '{"serverType":"spark","date":"yesterday","logType":"dw_web_visit_traffic_log","isDwSql":"no"}' 16 | ./index.py --service uba_log --module uba_run --parameter '{"serverType":"spark","date":"yesterday","logType":"dw_web_action_detail_log","isDwSql":"no"}' 17 | 18 | OR 19 | 20 | # 跑所有的 log 21 | ./index.py --service uba_log --module uba_run --parameter '{"serverType":"spark","date":"yesterday","logType":"","isDwSql":"no"}' 22 | ''' 23 | 24 | class UbaRun(DwServiceCore) : 25 | 26 | dwSqlServcie = None 27 | 28 | # uba sql 所在目录 29 | ubaSqlPath = None 30 | 31 | # 最终状态: 32 | dwAppAccessLogStatus = None 33 | dwAppActionDetailLogStatus = None 34 | dwWebVisitTrafficLogStatus = None 35 | dwWebActionDetailLogStatus = None 36 | 37 | 38 | def init(self): 39 | DwServiceCore.init(self) 40 | 41 | self.ubaSqlPath = self.getDwCoreInstance().SystemPath('basePath') + '/uba_log/uba_sql' 42 | 43 | 44 | def process(self): 45 | Logger.init() 46 | # 录入参数: 47 | parsMap = self.getFormatParameter() 48 | 49 | # 执行控制 50 | self.dwSqlServcie = DwSql() 51 | 52 | # 日志类型 53 | logType = parsMap.get('logType') 54 | if (logType == None or logType == '') : 55 | # APP 主题 LOG 56 | 
self.dwAppAccessLogStatus = self.dwAppAccessLog(parsMap) 57 | self.dwAppActionDetailLogStatus = self.dwAppActionDetailLog(parsMap) 58 | 59 | # WEB 主题 Log 60 | self.dwWebVisitTrafficLogStatus = self.dwWebVisitTrafficLog(parsMap) 61 | self.dwWebActionDetailLogStatus = self.dwWebActionDetailLog(parsMap) 62 | 63 | elif (logType == 'dw_app_access_log' ): 64 | self.dwAppAccessLogStatus = self.dwAppAccessLog(parsMap) 65 | 66 | elif (logType == 'dw_app_action_detail_log' ): 67 | self.dwAppActionDetailLogStatus = self.dwAppActionDetailLog(parsMap) 68 | 69 | elif (logType == 'dw_web_visit_traffic_log' ): 70 | self.dwWebVisitTrafficLogStatus = sself.dwWebVisitTrafficLog(parsMap) 71 | 72 | elif (logType == 'dw_web_action_detail_log' ): 73 | self.dwWebActionDetailLogStatus = self.dwWebActionDetailLog(parsMap) 74 | 75 | 76 | # app 访问 access_log 77 | def dwAppAccessLog(self, parsMap): 78 | parsData = parsMap 79 | parsData.update( {"sql" : self.ubaSqlPath + "/app/dw_app_access_log.sql"} ) 80 | return self.dwSqlServcie.runDwSqlProcess(parsData) 81 | 82 | 83 | # app 用户行为log 84 | def dwAppActionDetailLog(self, parsMap): 85 | parsData = parsMap 86 | parsData.update( {"sql" : self.ubaSqlPath + "/app/dw_app_action_detail_log.sql"} ) 87 | return self.dwSqlServcie.runDwSqlProcess(parsData) 88 | 89 | 90 | # web 访问 log 91 | def dwWebVisitTrafficLog(self, parsMap): 92 | parsData = parsMap 93 | parsData.update( {"sql" : self.ubaSqlPath + "/web/dw_web_visit_traffic_log.sql"} ) 94 | return self.dwSqlServcie.runDwSqlProcess(parsData) 95 | 96 | 97 | # web 行为 log 98 | def dwWebActionDetailLog(self, parsMap): 99 | parsData = parsMap 100 | parsData.update( {"sql" : self.ubaSqlPath + "/web/dw_web_action_detail_log.sql"} ) 101 | return self.dwSqlServcie.runDwSqlProcess(parsData) 102 | 103 | 104 | def shutdown(self): 105 | Logger.info("执行结果 dw_app_access_log : " + str(self.dwAppAccessLogStatus)) 106 | Logger.info("执行结果 dw_app_action_detail_log : " + str(self.dwAppActionDetailLogStatus)) 107 | 
Logger.info("执行结果 dw_web_visit_traffic_log : " + str(self.dwWebVisitTrafficLogStatus)) 108 | Logger.info("执行结果 dw_web_action_detail_log : " + str(self.dwWebActionDetailLogStatus)) 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /uba_log/uba_sql/app/dw_app_access_log.sql: -------------------------------------------------------------------------------- 1 | ADD JAR /data/app/jars/dw_hive_udf-1.0-SNAPSHOT-hive.jar; 2 | 3 | CREATE TEMPORARY FUNCTION parse_mobile_token AS 'com.angejia.dw.hive.udf.parse.ParseMobileToken'; 4 | CREATE TEMPORARY FUNCTION get_page_info AS 'com.angejia.dw.hive.udf.pageinfo.CalculatePageInfo'; 5 | CREATE TEMPORARY FUNCTION parse_mobile_agent AS 'com.angejia.dw.hive.udf.parse.ParseMobileAgent'; 6 | 7 | -- 插入数据 8 | INSERT OVERWRITE TABLE dw_db.dw_app_access_log PARTITION(p_dt=${dealDate}) 9 | SELECT 10 | -- app name 11 | parse_mobile_agent(a.mobile_agent,'app') as app_name, 12 | parse_mobile_agent(a.mobile_agent,'av') as app_version, 13 | -- 选择城市 14 | parse_mobile_agent(a.mobile_agent,'ccid') as selection_city_id, 15 | -- 本地城市 16 | parse_mobile_agent(a.mobile_agent,'gcid') as location_city_id, 17 | -- 客户端 Ip 18 | remote_addr as client_ip, 19 | -- 用户 id 20 | coalesce(parse_mobile_token(auth,'user_id'),0) as user_id, 21 | -- 网络类型 22 | parse_mobile_agent(a.mobile_agent,'net') as network_type, 23 | -- 平台 24 | parse_mobile_agent(a.mobile_agent,'p') as platform, 25 | parse_mobile_agent(a.mobile_agent,'pm') as device_type, 26 | parse_mobile_agent(a.mobile_agent,'osv') as os_version, 27 | parse_mobile_agent(a.mobile_agent,'dvid') as device_id, 28 | -- 渠道包号 29 | parse_mobile_agent(a.mobile_agent,'ch') as delivery_channels, 30 | -- 渠道包名 31 | coalesce(c.channel_name,'') as channel_name, 32 | -- 域名 33 | hostname as hostname, 34 | -- 请求 uri 35 | request_uri as request_uri, 36 | -- 请求服务器时间 37 | 
to_date(server_date) as server_date, 38 | -- 请求服务器时间 39 | concat(server_date,' ',server_time) as server_time, 40 | -- page id 41 | get_page_info(concat('http://',concat(hostname,request_uri)),'page_id') as request_page_id, 42 | -- page name 43 | get_page_info(concat('http://',concat(hostname,request_uri)),'page_name') as request_page_name, 44 | -- 经纬度 45 | parse_mobile_agent(a.mobile_agent,'lng') as longitude, 46 | parse_mobile_agent(a.mobile_agent,'lat') as latitude 47 | 48 | FROM access_log.access_log_${baseDealDate} a 49 | 50 | -- 过滤 ip 51 | LEFT JOIN dw_db.dw_basis_dimension_filter_ip AS f1 52 | ON a.remote_addr = f1.client_ip 53 | AND f1.status = 1 54 | 55 | -- 过滤 ip 段 56 | LEFT JOIN dw_db.dw_basis_dimension_filter_ip AS f2 57 | ON (CONCAT(split(a.remote_addr,'\\.')[0], '.', split(a.remote_addr,'\\.')[1], '.', split(a.remote_addr,'\\.')[2])) 58 | = (CONCAT(split(f2.client_ip,'\\.')[0], '.', split(f2.client_ip,'\\.')[1], '.', split(f2.client_ip,'\\.')[2])) 59 | AND f2.status = 1 60 | 61 | -- 渠道包 62 | LEFT JOIN dw_db.dw_basis_dimension_delivery_channels_package c 63 | ON parse_mobile_agent(a.mobile_agent,'ch') = c.channel_package_code 64 | 65 | WHERE mobile_agent <> '-' 66 | 67 | -- 过滤 ip 和 ip 段 68 | AND f1.client_ip IS NULL 69 | AND f2.client_ip IS NULL 70 | ; 71 | -------------------------------------------------------------------------------- /uba_log/uba_sql/app/dw_app_access_log_ddl.sql: -------------------------------------------------------------------------------- 1 | -- 创建 app access log 2 | CREATE TABLE if NOT exists dw_db.dw_app_access_log ( 3 | app_name string, 4 | app_version string, 5 | selection_city_id string, 6 | location_city_id string, 7 | client_ip string, 8 | user_id string, 9 | network_type string, 10 | platform string, 11 | device_type string, 12 | os_version string, 13 | device_id string, 14 | delivery_channels string, 15 | channel_name string, 16 | hostname string, 17 | request_uri string, 18 | server_date string, 19 | server_time 
string, 20 | request_page_id string, 21 | request_page_name string, 22 | longitude string, 23 | latitude string 24 | ) partitioned by (p_dt string); 25 | -------------------------------------------------------------------------------- /uba_log/uba_sql/app/dw_app_action_detail_log.sql: -------------------------------------------------------------------------------- 1 | ADD JAR /data/app/jars/dw_hive_udf-1.0-SNAPSHOT-hive.jar; 2 | 3 | CREATE TEMPORARY FUNCTION parse_action_id_to_page_id AS 'com.angejia.dw.hive.udf.parse.ParseActionIdToPageId'; 4 | 5 | -- 插入数据 6 | INSERT OVERWRITE TABLE dw_db.dw_app_action_detail_log PARTITION(p_dt = ${dealDate}) 7 | SELECT 8 | -- mac 地址 9 | a.mac, 10 | -- 设备 唯一 id 11 | a.dvid, 12 | -- 机型 13 | a.model, 14 | -- 设备版本 15 | a.os, 16 | -- app 名称 17 | a.name, 18 | -- 渠道包 19 | a.ch AS channel, 20 | -- app 版本号 21 | a.ver AS version, 22 | -- user id 23 | if(length(a.uid)>0,uid,0) AS uid, 24 | -- 网络类型 25 | a.net, 26 | -- ip 27 | a.ip, 28 | -- 城市 id 29 | a.ccid, 30 | -- 定位城市 id 31 | a.gcid, 32 | -- 经纬度 33 | split(a.geo,'-')[0] AS longtitude, 34 | split(a.geo,'-')[1] AS latitude, 35 | -- 动作 id 36 | a.action AS action_id, 37 | -- 动作名称 38 | b.action_name, 39 | -- 动作英文标号 40 | b.action_cname, 41 | -- page id( pageId 匹配模式, 如 actionId 是: 1-5400000 , '最后 000' 表示指定 pageId ) 42 | parse_action_id_to_page_id(a.action) AS currnet_page_id, 43 | -- page name 44 | c.action_name AS current_page_name, 45 | -- page en name 46 | c.action_cname AS current_page_cname, 47 | -- 客户端时间 48 | a.click_time, 49 | -- 扩展 json 字段 50 | a.extend, 51 | -- 上一个 action id 52 | get_json_object(a.extend,'$.bp') AS bp_id, 53 | -- 上一个 action name 54 | d.action_name AS bp_name, 55 | -- 服务器时间 56 | a.server_time, 57 | -- 客户端 Ip 58 | a.client_ip 59 | 60 | -- app 基础表 61 | FROM uba_app_action_log.uba_app_action_log_${baseDealDate} a 62 | -- 解析 actionId 63 | LEFT JOIN dw_db.dw_basis_dimen_action_id_name_lkp b 64 | ON a.action=b.action_id 65 | AND b.flag IN (0,3,4) 66 | -- 解析 actionId -> pageId 
-- NOTE(review): flattened repo-dump chunk of several Hive SQL files; the embedded "N |" markers are the original files' line numbers. Code and COMMENT '...' literals left byte-identical; only `--` comments translated to English. 'longtitude' in dw_app_action_detail_log is presumably a typo for 'longitude', but it is the live column name — renaming would need a coordinated migration; verify against consumers.
67 | LEFT JOIN dw_db.dw_basis_dimen_action_id_name_lkp c 68 | ON parse_action_id_to_page_id(a.action) = c.action_id 69 | AND c.flag IN (0,3,4) 70 | -- Resolve the parent-level actionId 71 | LEFT JOIN dw_db.dw_basis_dimen_action_id_name_lkp d 72 | ON get_json_object(a.extend,'$.bp') = d.action_id 73 | 74 | -- Filter IPs 75 | LEFT JOIN dw_db.dw_basis_dimension_filter_ip AS f1 76 | ON a.client_ip = f1.client_ip 77 | AND f1.status = 1 78 | 79 | -- Filter IP ranges (first three octets) 80 | LEFT JOIN dw_db.dw_basis_dimension_filter_ip AS f2 81 | ON (CONCAT(split(a.client_ip,'\\.')[0], '.', split(a.client_ip,'\\.')[1], '.', split(a.client_ip,'\\.')[2])) 82 | = (CONCAT(split(f2.client_ip,'\\.')[0], '.', split(f2.client_ip,'\\.')[1], '.', split(f2.client_ip,'\\.')[2])) 83 | AND f2.status = 1 84 | 85 | WHERE 86 | -- Filter out IPs and IP ranges (anti-join: keep rows with no filter match) 87 | f1.client_ip IS NULL 88 | AND f2.client_ip IS NULL 89 | ; -------------------------------------------------------------------------------- /uba_log/uba_sql/app/dw_app_action_detail_log_ddl.sql: -------------------------------------------------------------------------------- 1 | -- Create the app action log detail table 2 | CREATE TABLE if not exists dw_db.dw_app_action_detail_log ( 3 | mac string, 4 | dvid string, 5 | model string, 6 | os string, 7 | name string, 8 | channel string, 9 | version string, 10 | uid string, 11 | net string, 12 | ip string, 13 | ccid string, 14 | gcid string, 15 | longtitude string, 16 | latitude string, 17 | action_id string, 18 | action_name string, 19 | action_cname string, 20 | current_page_id string, 21 | current_page_name string, 22 | current_page_cname string, 23 | click_time string, 24 | extend string, 25 | bp_id string, 26 | bp_name string, 27 | server_time string, 28 | client_ip string 29 | ) partitioned by (p_dt string); 30 | -------------------------------------------------------------------------------- /uba_log/uba_sql/ods/access_log.sql: -------------------------------------------------------------------------------- 1 | -- Angejia app access_log 2 | CREATE EXTERNAL TABLE IF
NOT EXISTS access_log.access_log_${baseDealDate} ( 3 | `request_time` string COMMENT '请求时间', 4 | `upstream_response_time` string COMMENT '响应时间', 5 | `remote_addr` string COMMENT '请求地址', 6 | `request_length` string COMMENT '请求大小', 7 | `upstream_addr` string COMMENT '', 8 | `server_date` string COMMENT '', 9 | `server_time` string COMMENT '', 10 | `hostname` string COMMENT '', 11 | `method` string COMMENT '', 12 | `request_uri` string COMMENT '', 13 | `http_code` string COMMENT '', 14 | `bytes_sent` string COMMENT '', 15 | `http_referer` string COMMENT '', 16 | `user_agent` string COMMENT '', 17 | `gzip_ratio` string COMMENT '', 18 | `http_x_forwarded_for` string COMMENT '', 19 | `auth` string COMMENT '', 20 | `mobile_agent` string COMMENT '', 21 | `http_angejia_payload` string COMMENT '', 22 | `http_trace_id` string COMMENT '', 23 | `server_protocol` string COMMENT '', 24 | `ssl_protocol` string COMMENT '' 25 | ) 26 | ROW FORMAT SERDE 27 | 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe' 28 | WITH SERDEPROPERTIES ( 29 | 'input.regex'='^([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t\\[(.+?)T(.+?)\\+.*?\\]\\t([^\\t]*)\\t([^\\s]*)\\s([^\\s]*)\\s[^\\t]*\\t([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)', 30 | 'output.format.string'='%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s %10$s %11$s %12$s %13$s %14$s %15$s %16$s %17$s %18$s %19$s %20$s %21$s %22$s' 31 | ) 32 | STORED AS TEXTFILE 33 | LOCATION '/flume/access_log/access_log_${baseDealDate}' 34 | ; 35 | -------------------------------------------------------------------------------- /uba_log/uba_sql/ods/dw_access_log.sql: -------------------------------------------------------------------------------- 1 | -- Angejia dw access_log 2 | CREATE EXTERNAL TABLE IF NOT EXISTS access_log.dw_access_log_${baseDealDate} ( 3 | `request_time` string COMMENT '请求时间', 4 | `upstream_response_time` string COMMENT '响应时间', 5 |
`remote_addr` string COMMENT '请求地址', 6 | `request_length` string COMMENT '请求大小', 7 | `upstream_addr` string COMMENT '', 8 | `server_date` string COMMENT '', 9 | `server_time` string COMMENT '', 10 | `hostname` string COMMENT '', 11 | `method` string COMMENT '', 12 | `request_uri` string COMMENT '', 13 | `http_code` string COMMENT '', 14 | `bytes_sent` string COMMENT '', 15 | `http_referer` string COMMENT '', 16 | `user_agent` string COMMENT '', 17 | `gzip_ratio` string COMMENT '', 18 | `http_x_forwarded_for` string COMMENT '', 19 | `auth` string COMMENT '', 20 | `mobile_agent` string COMMENT '', 21 | `http_angejia_payload` string COMMENT '', 22 | `http_trace_id` string COMMENT '', 23 | `server_protocol` string COMMENT '', 24 | `ssl_protocol` string COMMENT '' 25 | ) 26 | ROW FORMAT SERDE 27 | 'org.apache.hadoop.hive.contrib.serde2.RegexSerDe' 28 | WITH SERDEPROPERTIES ( 29 | 'input.regex'='^([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t\\[(.+?)T(.+?)\\+.*?\\]\\t([^\\t]*)\\t([^\\s]*)\\s([^\\s]*)\\s[^\\t]*\\t([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t([^\\t]*)\\t([^\\t]*).*', 30 | 'output.format.string'='%1$s %2$s %3$s %4$s %5$s %6$s %7$s %8$s %9$s %10$s %11$s %12$s %13$s %14$s %15$s %16$s %17$s %18$s %19$s %20$s %21$s %22$s' 31 | ) 32 | STORED AS TEXTFILE 33 | LOCATION '/flume/dw_access_log/dw_access_log_${baseDealDate}' 34 | ; 35 | -------------------------------------------------------------------------------- /uba_log/uba_sql/ods/uba_app_action_log.sql: -------------------------------------------------------------------------------- 1 | -- Angejia app action log 2 | CREATE EXTERNAL TABLE IF NOT EXISTS uba_app_action_log.uba_app_action_log_${baseDealDate} ( 3 | `mac` string, 4 | `dvid` string, 5 | `model` string, 6 | `os` string, 7 | `name` string, 8 | `ch` string, 9 | `ver` string, 10 | `uid` string, 11 | `net` string, 12 | `ip` string, 13 | `ccid` string, 14 | `gcid` string, 15 |
`geo` string, 16 | `action` string, 17 | `click_time` string, 18 | `extend` string, 19 | `server_time` string, 20 | `client_ip` string COMMENT '20150824 add' 21 | ) 22 | ROW FORMAT DELIMITED 23 | FIELDS TERMINATED BY '\t' 24 | COLLECTION ITEMS TERMINATED BY '\n' 25 | STORED AS TEXTFILE 26 | LOCATION '/flume/uba_app_action/uba_app_action_${baseDealDate}' 27 | ; 28 | -------------------------------------------------------------------------------- /uba_log/uba_sql/ods/uba_web_action_log.sql: -------------------------------------------------------------------------------- 1 | -- Angejia web action log 2 | CREATE EXTERNAL TABLE IF NOT EXISTS uba_web_action_log.uba_web_action_log_${baseDealDate} ( 3 | `uid` string COMMENT 'from deserializer', 4 | `ccid` string COMMENT 'from deserializer', 5 | `referer` string COMMENT 'from deserializer', 6 | `url` string COMMENT 'from deserializer', 7 | `guid` string COMMENT 'from deserializer', 8 | `client_time` string COMMENT 'from deserializer', 9 | `page_param` string COMMENT 'from deserializer', 10 | `action` string COMMENT 'from deserializer', 11 | `client_param` string COMMENT 'from deserializer', 12 | `server_time` string COMMENT 'from deserializer', 13 | `ip` string COMMENT 'from deserializer', 14 | `agent` string COMMENT 'from deserializer' 15 | ) 16 | ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.JsonSerde' 17 | STORED AS TEXTFILE 18 | LOCATION '/flume/uba_web_action/uba_web_action_${baseDealDate}' 19 | ; 20 | -------------------------------------------------------------------------------- /uba_log/uba_sql/ods/uba_web_visit_log.sql: -------------------------------------------------------------------------------- 1 | -- Angejia web visit log 2 | CREATE EXTERNAL TABLE IF NOT EXISTS uba_web_visit_log.uba_web_visit_log_${baseDealDate} ( 3 | `uid` string COMMENT 'from deserializer', 4 | `ccid` string COMMENT 'from deserializer', 5 | `referer` string COMMENT 'from deserializer', 6 | `url` string COMMENT 'from deserializer', 7 | `guid` string
COMMENT 'from deserializer', 8 | `client_time` string COMMENT 'from deserializer', 9 | `page_param` string COMMENT 'from deserializer', 10 | `client_param` string COMMENT 'from deserializer', 11 | `server_time` string COMMENT 'from deserializer', 12 | `ip` string COMMENT 'from deserializer', 13 | `agent` string COMMENT 'from deserializer' 14 | ) 15 | ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.JsonSerde' 16 | STORED AS TEXTFILE 17 | LOCATION '/flume/uba_web_visit/uba_web_visit_${baseDealDate}' 18 | ; 19 | -------------------------------------------------------------------------------- /uba_log/uba_sql/web/dw_web_action_detail_log.sql: -------------------------------------------------------------------------------- 1 | ADD JAR /data/app/jars/dw_hive_udf-1.0-SNAPSHOT-hive.jar; 2 | 3 | CREATE TEMPORARY FUNCTION parse_user_agent AS 'com.angejia.dw.hive.udf.useragent.ParseUserAgent'; 4 | CREATE TEMPORARY FUNCTION get_page_info AS 'com.angejia.dw.hive.udf.pageinfo.CalculatePageInfo'; 5 | 6 | -- Insert data 7 | INSERT OVERWRITE TABLE dw_db.dw_web_action_detail_log PARTITION (p_dt=${dealDate}) 8 | SELECT 9 | -- user_id 10 | if(length(a.uid)>0,uid,0) AS user_id, 11 | -- City id 12 | a.ccid AS selection_city_id, 13 | -- Previous page url 14 | if(length(a.referer)>0,referer,'') AS referer_full_url, 15 | -- Previous page page id 16 | get_page_info(a.referer,'page_id') AS referer_page_id, 17 | -- Previous page page uri 18 | coalesce(parse_url(a.referer,'PATH'),'') AS referer_page, 19 | -- Previous page page name 20 | get_page_info(a.referer,'page_name') AS referer_page_name, 21 | -- Current page url 22 | if(length(a.url)>0,url,'') AS current_full_url, 23 | -- Current page uri 24 | coalesce(parse_url(a.url,'PATH'),'') AS current_page, 25 | -- Current page page_id 26 | get_page_info(a.url,'page_id') AS current_page_id, 27 | -- Current page page_name 28 | get_page_info(a.url,'page_name') AS current_page_name, 29 | -- Unique identity identifier 30 | a.guid AS guid, 31 | -- Client time 32 | a.client_time AS client_time, 33 | -- Page parameters 34 | a.page_param AS page_param, 35 | -- Action
id 36 | b.action_id AS action_id, 37 | -- Action name 38 | b.action_name AS action_name, 39 | -- Original action, English identifier 40 | a.action AS action_cname, 41 | -- Client extended params 42 | a.client_param AS client_param, 43 | -- Server time 44 | a.server_time AS server_time, 45 | -- Client ip 46 | a.ip AS client_ip, 47 | -- Device/OS type 48 | parse_user_agent(a.agent,0) AS os_type, 49 | -- OS version 50 | parse_user_agent(a.agent,1) AS os_version, 51 | -- Browser type 52 | parse_user_agent(a.agent,2) AS brower_type, 53 | -- Browser version 54 | parse_user_agent(a.agent,3) AS brower_version, 55 | -- Device client type 56 | parse_user_agent(a.agent,4) AS phone_type, 57 | -- Previous page host name 58 | coalesce(parse_url(a.referer,'HOST'),'') AS referer_host, 59 | -- Previous page query params 60 | coalesce(parse_url(a.referer,'QUERY'),'') AS referer_query, 61 | -- Previous page anchor 62 | coalesce(parse_url(a.referer,'REF'),'') AS referer_ref, 63 | -- Current page host 64 | coalesce(parse_url(a.url,'HOST'),'') AS current_host, 65 | -- Current page query params 66 | coalesce(parse_url(a.url,'QUERY'),'') AS current_query, 67 | -- Current page anchor 68 | coalesce(parse_url(a.url,'REF'),'') AS current_ref, 69 | -- City id of the host 70 | coalesce(host_city.city_id,'') AS current_host_city_id 71 | 72 | FROM uba_web_action_log.uba_web_action_log_${baseDealDate} a 73 | -- Map host + uri to city id 74 | LEFT JOIN dim_db.dim_hostname_city AS host_city 75 | ON ( 76 | CASE 77 | -- Special-case m.angejia.com 78 | WHEN parse_url(a.url,'HOST') = 'm.angejia.com' 79 | -- Regex to extract the city slug 80 | THEN concat( 81 | parse_url(a.url,'HOST'), 82 | regexp_extract( parse_url(a.url,'PATH') ,'^(/[sale|broker]{1,}/[sh|bj|hz|xg]{1,})',1) 83 | ) 84 | ELSE 85 | parse_url(a.url,'HOST') 86 | END 87 | ) = host_city.hostname 88 | AND host_city.is_active = 1 89 | 90 | -- action id dimension table 91 | LEFT JOIN dw_db.dw_basis_dimen_action_id_name_lkp AS b 92 | ON a.action = b.action_cname 93 | AND b.flag IN (1,2) 94 | 95 | -- Filter IPs 96 | LEFT JOIN dw_db.dw_basis_dimension_filter_ip AS f1 97 | ON a.ip = f1.client_ip 98 | AND f1.status = 1 99 | 100 | -- Filter IP ranges 101 | LEFT JOIN
dw_db.dw_basis_dimension_filter_ip AS f2 102 | ON (CONCAT(split(a.ip,'\\.')[0], '.', split(a.ip,'\\.')[1], '.', split(a.ip,'\\.')[2])) 103 | = (CONCAT(split(f2.client_ip,'\\.')[0], '.', split(f2.client_ip,'\\.')[1], '.', split(f2.client_ip,'\\.')[2])) 104 | AND f2.status = 1 105 | 106 | WHERE 107 | -- Filter out IPs and IP ranges 108 | f1.client_ip IS NULL 109 | AND f2.client_ip IS NULL 110 | ; -------------------------------------------------------------------------------- /uba_log/uba_sql/web/dw_web_action_detail_log_ddl.sql: -------------------------------------------------------------------------------- 1 | -- Create web action table 2 | CREATE TABLE IF NOT EXISTS dw_db.dw_web_action_detail_log ( 3 | user_id string, 4 | ccid string, 5 | referer_full_url string, 6 | referer_page_id string, 7 | referer_page string, 8 | referer_page_name string, 9 | current_full_url string, 10 | current_page string, 11 | current_page_id string, 12 | current_page_name string, 13 | guid string, 14 | client_time string, 15 | page_param string, 16 | action_id string, 17 | action_name string, 18 | action_cname string, 19 | client_param string, 20 | server_time string, 21 | ip string, 22 | os_type string, 23 | os_version string, 24 | brower_type string, 25 | brower_version string, 26 | phone_type string, 27 | referer_host String, 28 | referer_query String, 29 | referer_ref STRING, 30 | current_host STRING, 31 | current_query STRING, 32 | current_ref STRING, 33 | current_host_city_id STRING 34 | ) partitioned by (p_dt string); 35 | -------------------------------------------------------------------------------- /uba_log/uba_sql/web/dw_web_visit_traffic_log.sql: -------------------------------------------------------------------------------- 1 | ADD JAR /data/app/jars/dw_hive_udf-1.0-SNAPSHOT-hive.jar; 2 | 3 | CREATE TEMPORARY FUNCTION parse_user_agent AS 'com.angejia.dw.hive.udf.useragent.ParseUserAgent'; 4 | CREATE TEMPORARY FUNCTION get_page_info AS 'com.angejia.dw.hive.udf.pageinfo.CalculatePageInfo'; 5 | 6
| -- Insert data 7 | INSERT OVERWRITE TABLE dw_db.dw_web_visit_traffic_log PARTITION(p_dt=${dealDate}) 8 | SELECT 9 | -- User ID 10 | if(length(a.uid)>0,uid,0) AS user_id, 11 | -- User-selected city ID 12 | a.ccid as selection_city_id, 13 | -- Client request time 14 | a.client_time AS client_time, 15 | -- Empty field 16 | '' AS user_based_city_id, 17 | -- Previous page url 18 | if(length(a.referer)>0,referer,'') AS referer_full_url, 19 | -- Previous page path 20 | coalesce(parse_url(a.referer,'PATH'),'') AS referer_page, 21 | -- Previous page page id 22 | get_page_info(a.referer,'page_id') AS referer_page_id, 23 | -- Previous page page name 24 | get_page_info(a.referer,'page_name') AS referer_page_name, 25 | -- Current page url 26 | if(length(a.url)>0,url,'') AS current_full_url, 27 | -- Current page path 28 | coalesce(parse_url(a.url,'PATH'),'') AS current_page, 29 | -- Current page page id 30 | get_page_info(a.url,'page_id') AS current_page_id, 31 | -- Current page page name 32 | get_page_info(a.url,'page_name') AS current_page_name, 33 | -- Current page platform code: 1-Touchweb, 2-PC, 3-APP, 4-API 34 | get_page_info(a.url,'platform_id') AS channel_code, 35 | -- Page extended params, JSON format 36 | a.page_param AS page_param, 37 | -- Request extended params, JSON format 38 | a.client_param AS client_param, 39 | -- guid 40 | a.guid AS guid, 41 | -- Client ip 42 | a.ip AS client_ip, 43 | -- Device/OS type 44 | parse_user_agent(a.agent,0) AS os_type, 45 | -- OS version 46 | parse_user_agent(a.agent,1) AS os_version, 47 | -- Browser type 48 | parse_user_agent(a.agent,2) AS brower_type, 49 | -- Browser version 50 | parse_user_agent(a.agent,3) AS brower_version, 51 | -- Device client type 52 | parse_user_agent(a.agent,4) AS phone_type, 53 | -- Server time 54 | a.server_time AS server_time, 55 | -- Previous page host name 56 | coalesce(parse_url(a.referer,'HOST'),'') AS referer_host, 57 | -- Previous page query params 58 | coalesce(parse_url(a.referer,'QUERY'),'') AS referer_query, 59 | -- Previous page anchor 60 | coalesce(parse_url(a.referer,'REF'),'') AS referer_ref, 61 | -- Current page host 62 | coalesce(parse_url(a.url,'HOST'),'') AS current_host, 63 | -- Current page query params 64 | coalesce(parse_url(a.url,'QUERY'),'') AS current_query, 65 | --
Current page anchor 66 | coalesce(parse_url(a.url,'REF'),'') AS current_ref, 67 | -- City id of the host 68 | coalesce(host_city.city_id,'') AS current_host_city_id 69 | 70 | FROM uba_web_visit_log.uba_web_visit_log_${baseDealDate} AS a 71 | -- Map host + uri to city id 72 | LEFT JOIN dim_db.dim_hostname_city AS host_city 73 | ON ( 74 | CASE 75 | -- Special-case m.angejia.com 76 | WHEN parse_url(a.url,'HOST') = 'm.angejia.com' 77 | -- Regex to extract the city slug 78 | THEN concat( 79 | parse_url(a.url,'HOST'), 80 | regexp_extract( parse_url(a.url,'PATH') ,'^(/[sale|broker]{1,}/[sh|bj|hz|xg]{1,})',1) 81 | ) 82 | ELSE 83 | parse_url(a.url,'HOST') 84 | END 85 | ) = host_city.hostname 86 | AND host_city.is_active = 1 87 | 88 | -- Filter IPs 89 | LEFT JOIN dw_db.dw_basis_dimension_filter_ip AS f1 90 | ON a.ip = f1.client_ip 91 | AND f1.status = 1 92 | 93 | -- Filter IP ranges 94 | LEFT JOIN dw_db.dw_basis_dimension_filter_ip AS f2 95 | ON (CONCAT(split(a.ip,'\\.')[0], '.', split(a.ip,'\\.')[1], '.', split(a.ip,'\\.')[2])) 96 | = (CONCAT(split(f2.client_ip,'\\.')[0], '.', split(f2.client_ip,'\\.')[1], '.', split(f2.client_ip,'\\.')[2])) 97 | AND f2.status = 1 98 | 99 | WHERE 100 | -- Filter agents (bots/crawlers) 101 | parse_user_agent(a.agent,2) <> 'Robot/Spider' 102 | AND a.agent NOT LIKE '%spider%' 103 | AND a.agent NOT LIKE '%-broker%' 104 | 105 | -- Filter out IPs and IP ranges 106 | AND f1.client_ip IS NULL 107 | AND f2.client_ip IS NULL 108 | ; -------------------------------------------------------------------------------- /uba_log/uba_sql/web/dw_web_visit_traffic_log_ddl.sql: -------------------------------------------------------------------------------- 1 | -- Create WEB visit log detail table 2 | CREATE TABLE IF NOT EXISTS dw_db.dw_web_visit_traffic_log ( 3 | user_id string, 4 | selection_city_id string, 5 | client_time string, 6 | user_based_city_id string, 7 | referer_full_url string, 8 | referer_page string, 9 | referer_page_id string, 10 | referer_page_name string, 11 | current_full_url string, 12 | current_page string, 13 | current_page_id string, 14 |
current_page_name string, 15 | channel_code string, 16 | page_param string, 17 | client_param string, 18 | guid string, 19 | client_ip string, 20 | os_type string, 21 | os_version string, 22 | brower_type string, 23 | brower_version string, 24 | phone_type string, 25 | server_time string, 26 | referer_host String, 27 | referer_query String, 28 | referer_ref String, 29 | current_host String, 30 | current_query String, 31 | current_ref String, 32 | current_host_city_id String 33 | ) partitioned by (p_dt string); 34 | --------------------------------------------------------------------------------