├── README.md ├── dxprof.py ├── datax.py └── perftrace.py /README.md: -------------------------------------------------------------------------------- 1 | # DataX_Python3 2 | 3 | Since my local Python is 3.7.0, I modified the three .py files under the [datax_home]/bin directory so that they conform to Python 3 syntax, and they run without reporting errors. 4 | 5 | If needed, simply download these three files and replace the originals with them. 6 | 7 | DataX study notes: https://blog.csdn.net/weixin_41287692/article/details/83620261 8 | -------------------------------------------------------------------------------- /dxprof.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # vim: set expandtab tabstop=4 shiftwidth=4 foldmethod=marker nu: 3 | 4 | import re 5 | import sys 6 | import time 7 | 8 | REG_SQL_WAKE = re.compile(r'Begin\s+to\s+read\s+record\s+by\s+Sql', re.IGNORECASE) 9 | REG_SQL_DONE = re.compile(r'Finished\s+read\s+record\s+by\s+Sql', re.IGNORECASE) 10 | REG_SQL_PATH = re.compile(r'from\s+(\w+)(\s+where|\s*$)', re.IGNORECASE) 11 | REG_SQL_JDBC = re.compile(r'jdbcUrl:\s*\[(.+?)\]', re.IGNORECASE) 12 | REG_SQL_UUID = re.compile(r'(\d+\-)+reader') 13 | REG_COMMIT_UUID = re.compile(r'(\d+\-)+writer') 14 | REG_COMMIT_WAKE = re.compile(r'begin\s+to\s+commit\s+blocks', re.IGNORECASE) 15 | REG_COMMIT_DONE = re.compile(r'commit\s+blocks\s+ok', re.IGNORECASE) 16 | 17 | # {{{ function parse_timestamp() # 18 | def parse_timestamp(line): 19 | try: 20 | ts = int(time.mktime(time.strptime(line[0:19], '%Y-%m-%d %H:%M:%S'))) 21 | except: 22 | ts = 0 23 | 24 | return ts 25 | 26 | # }}} # 27 | 28 | # {{{ function parse_query_host() # 29 | def parse_query_host(line): 30 | ori = REG_SQL_JDBC.search(line) 31 | if (not ori): 32 | return '' 33 | 34 | ori = ori.group(1).split('?')[0] 35 | off = ori.find('@') 36 | if (off > -1): 37 | ori = ori[off+1:len(ori)] 38 | else: 39 | off = ori.find('//') 40 | if (off > -1): 41 | ori = ori[off+2:len(ori)] 42 | 43 | return ori.lower() 44 | # }}} # 45 | 46 | # {{{ function parse_query_table() # 47 | def parse_query_table(line): 48 | ori = REG_SQL_PATH.search(line) 49 | return (ori and ori.group(1).lower()) or '' 50 | # }}} # 51 | 52 | # {{{ function parse_reader_task() # 53 | def parse_task(fname): 54 | global LAST_SQL_UUID 55 | global LAST_COMMIT_UUID 56 | global DATAX_JOBDICT 57 | global DATAX_JOBDICT_COMMIT 58 | global UNIXTIME 59 | LAST_SQL_UUID = '' 60 | DATAX_JOBDICT = {} 61 | LAST_COMMIT_UUID = '' 62 | DATAX_JOBDICT_COMMIT = {} 63 | 64 | UNIXTIME = int(time.time()) 65 | with open(fname, 'r') as f: 66 | for line in f.readlines(): 67 | line = line.strip() 68 | 69 | if (LAST_SQL_UUID and (LAST_SQL_UUID in DATAX_JOBDICT)): 70 | DATAX_JOBDICT[LAST_SQL_UUID]['host'] = parse_query_host(line) 71 | LAST_SQL_UUID = '' 72 | 73 | if line.find('CommonRdbmsReader$Task') > 0: 74 | parse_read_task(line) 75 | elif line.find('commit blocks') > 0: 76 | parse_write_task(line) 77 | else: 78 | continue 79 | # }}} # 80 | 81 | # {{{ function parse_read_task() # 82 | def parse_read_task(line): 83 | ser = REG_SQL_UUID.search(line) 84 | if not ser: 85 | return 86 | 87 | LAST_SQL_UUID = ser.group() 88 | if REG_SQL_WAKE.search(line): 89 | DATAX_JOBDICT[LAST_SQL_UUID] = { 90 | 'stat' : 'R', 91 | 'wake' : parse_timestamp(line), 92 | 'done' : UNIXTIME, 93 | 'host' : parse_query_host(line), 94 | 'path' : parse_query_table(line) 95 | } 96 | elif ((LAST_SQL_UUID in DATAX_JOBDICT) and REG_SQL_DONE.search(line)): 97 | DATAX_JOBDICT[LAST_SQL_UUID]['stat'] = 'D' 98 | DATAX_JOBDICT[LAST_SQL_UUID]['done'] = parse_timestamp(line) 99 | # }}} # 100 | 101 | # {{{ function parse_write_task() # 102 | def
parse_write_task(line): 103 | ser = REG_COMMIT_UUID.search(line) 104 | if not ser: 105 | return 106 | 107 | LAST_COMMIT_UUID = ser.group() 108 | if REG_COMMIT_WAKE.search(line): 109 | DATAX_JOBDICT_COMMIT[LAST_COMMIT_UUID] = { 110 | 'stat' : 'R', 111 | 'wake' : parse_timestamp(line), 112 | 'done' : UNIXTIME, 113 | } 114 | elif ((LAST_COMMIT_UUID in DATAX_JOBDICT_COMMIT) and REG_COMMIT_DONE.search(line)): 115 | DATAX_JOBDICT_COMMIT[LAST_COMMIT_UUID]['stat'] = 'D' 116 | DATAX_JOBDICT_COMMIT[LAST_COMMIT_UUID]['done'] = parse_timestamp(line) 117 | # }}} # 118 | 119 | # {{{ function result_analyse() # 120 | def result_analyse(): 121 | def sort_key(task): 122 | return task['cost'] 123 | 124 | tasklist = [] 125 | hostsmap = {} 126 | statvars = {'sum' : 0, 'cnt' : 0, 'svr' : 0, 'max' : 0, 'min' : int(time.time())} 127 | tasklist_commit = [] 128 | statvars_commit = {'sum' : 0, 'cnt' : 0} 129 | 130 | for idx in DATAX_JOBDICT: 131 | item = DATAX_JOBDICT[idx] 132 | item['uuid'] = idx; 133 | item['cost'] = item['done'] - item['wake'] 134 | tasklist.append(item); 135 | 136 | if (not (item['host'] in hostsmap)): 137 | hostsmap[item['host']] = 1 138 | statvars['svr'] += 1 139 | 140 | if (item['cost'] > -1 and item['cost'] < 864000): 141 | statvars['sum'] += item['cost'] 142 | statvars['cnt'] += 1 143 | statvars['max'] = max(statvars['max'], item['done']) 144 | statvars['min'] = min(statvars['min'], item['wake']) 145 | 146 | for idx in DATAX_JOBDICT_COMMIT: 147 | itemc = DATAX_JOBDICT_COMMIT[idx] 148 | itemc['uuid'] = idx 149 | itemc['cost'] = itemc['done'] - itemc['wake'] 150 | tasklist_commit.append(itemc) 151 | 152 | if (itemc['cost'] > -1 and itemc['cost'] < 864000): 153 | statvars_commit['sum'] += itemc['cost'] 154 | statvars_commit['cnt'] += 1 155 | 156 | ttl = (statvars['max'] - statvars['min']) or 1 157 | idx = float(statvars['cnt']) / (statvars['sum'] or ttl) 158 | 159 | tasklist.sort(key=sort_key, reverse=True) 160 | for item in tasklist: 161 | print('%s\t%s.%s\t%s\t%s\t% 4d\t% 2.1f%%\t% .2f' %(item['stat'], item['host'], item['path'], 162 | time.strftime('%H:%M:%S', time.localtime(item['wake'])), 163 | (('D' == item['stat']) and time.strftime('%H:%M:%S', time.localtime(item['done']))) or '--', 164 | item['cost'], 100 * item['cost'] / ttl, idx * item['cost'])) 165 | 166 | if (not len(tasklist) or not statvars['cnt']): 167 | return 168 | 169 | print('\n--- DataX Profiling Statistics ---') 170 | print('%d task(s) on %d server(s), Total elapsed %d second(s), %.2f second(s) per task on average' %(statvars['cnt'], 171 | statvars['svr'], statvars['sum'], float(statvars['sum']) / statvars['cnt'])) 172 | print('Actually cost %d second(s) (%s - %s), task concurrency: %.2f, tilt index: %.2f' %(ttl, 173 | time.strftime('%H:%M:%S', time.localtime(statvars['min'])), 174 | time.strftime('%H:%M:%S', time.localtime(statvars['max'])), 175 | float(statvars['sum']) / ttl, idx * tasklist[0]['cost'])) 176 | 177 | idx_commit = float(statvars_commit['cnt']) / (statvars_commit['sum'] or ttl) 178 | tasklist_commit.sort(key=sort_key, reverse=True) 179 | print('%d task(s) done odps commit, Total elapsed %d second(s), %.2f second(s) per task on average, tilt index: %.2f' % ( 180 | statvars_commit['cnt'], 181 | statvars_commit['sum'], float(statvars_commit['sum']) / statvars_commit['cnt'], 182 | idx_commit * tasklist_commit[0]['cost'])) 183 | 184 | # }}} # 185 | 186 | if __name__ == '__main__': 187 | if len(sys.argv) < 2: 188 | print("Usage: %s filename" % (sys.argv[0])) 189 | sys.exit(1) 190 | parse_task(sys.argv[1]) 191 | result_analyse() 192 |
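A quick sketch of how dxprof.py can be driven from another script or an interactive session (assuming dxprof.py is on the import path and, as above, its CLI code is guarded by an if __name__ == '__main__' block; the log line and log file name below are made-up examples, not real DataX output):

    import dxprof

    # A representative CommonRdbmsReader log line (values are illustrative).
    line = ('2018-11-01 10:00:00.123 [0-0-0-reader] INFO  CommonRdbmsReader$Task '
            '- Begin to read record by Sql: [select * from orders where id > 0] '
            'jdbcUrl:[jdbc:mysql://127.0.0.1:3306/demo].')

    print(dxprof.parse_timestamp(line))    # epoch seconds for '2018-11-01 10:00:00'
    print(dxprof.parse_query_host(line))   # '127.0.0.1:3306/demo'
    print(dxprof.parse_query_table(line))  # 'orders'

    # Or profile a whole DataX job log (hypothetical path):
    dxprof.parse_task('datax_job.log')
    dxprof.result_analyse()

Normal command-line usage is simply python3 dxprof.py <datax-job-log>; each output row shows the task state (R or D), host.table, wake and done times, the cost in seconds, its share of the total elapsed time, and a tilt score.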
-------------------------------------------------------------------------------- /datax.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | import sys 5 | import os 6 | import signal 7 | import subprocess 8 | import time 9 | import re 10 | import socket 11 | import json 12 | from optparse import OptionParser 13 | from optparse import OptionGroup 14 | from string import Template 15 | import codecs 16 | import platform 17 | 18 | def isWindows(): 19 | return platform.system() == 'Windows' 20 | 21 | DATAX_HOME = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 22 | 23 | DATAX_VERSION = 'DATAX-OPENSOURCE-3.0' 24 | if isWindows(): 25 | codecs.register(lambda name: name == 'cp65001' and codecs.lookup('utf-8') or None) 26 | CLASS_PATH = ("%s/lib/*") % (DATAX_HOME) 27 | else: 28 | CLASS_PATH = ("%s/lib/*:.") % (DATAX_HOME) 29 | LOGBACK_FILE = ("%s/conf/logback.xml") % (DATAX_HOME) 30 | DEFAULT_JVM = "-Xms1g -Xmx1g -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=%s/log" % (DATAX_HOME) 31 | DEFAULT_PROPERTY_CONF = "-Dfile.encoding=UTF-8 -Dlogback.statusListenerClass=ch.qos.logback.core.status.NopStatusListener -Djava.security.egd=file:///dev/urandom -Ddatax.home=%s -Dlogback.configurationFile=%s" % ( 32 | DATAX_HOME, LOGBACK_FILE) 33 | ENGINE_COMMAND = "java -server ${jvm} %s -classpath %s ${params} com.alibaba.datax.core.Engine -mode ${mode} -jobid ${jobid} -job ${job}" % ( 34 | DEFAULT_PROPERTY_CONF, CLASS_PATH) 35 | REMOTE_DEBUG_CONFIG = "-Xdebug -Xrunjdwp:transport=dt_socket,server=y,address=9999" 36 | 37 | RET_STATE = { 38 | "KILL": 143, 39 | "FAIL": -1, 40 | "OK": 0, 41 | "RUN": 1, 42 | "RETRY": 2 43 | } 44 | 45 | 46 | def getLocalIp(): 47 | try: 48 | return socket.gethostbyname(socket.getfqdn(socket.gethostname())) 49 | except: 50 | return "Unknown" 51 | 52 | 53 | def suicide(signum, e): 54 | global child_process 55 | print("[Error] DataX receive unexpected signal %d, starts to suicide." % (signum), file=sys.stderr) 56 | 57 | if child_process: 58 | child_process.send_signal(signal.SIGQUIT) 59 | time.sleep(1) 60 | child_process.kill() 61 | print("DataX Process was killed ! you did ?", file=sys.stderr) 62 | sys.exit(RET_STATE["KILL"]) 63 | 64 | 65 | def register_signal(): 66 | if not isWindows(): 67 | global child_process 68 | signal.signal(2, suicide) 69 | signal.signal(3, suicide) 70 | signal.signal(15, suicide) 71 | 72 | 73 | def getOptionParser(): 74 | usage = "usage: %prog [options] job-url-or-path" 75 | parser = OptionParser(usage=usage) 76 | 77 | prodEnvOptionGroup = OptionGroup(parser, "Product Env Options", 78 | "Normal users use these options to set jvm parameters, job runtime mode etc. " 79 | "Make sure these options can be used in Product Env.") 80 | prodEnvOptionGroup.add_option("-j", "--jvm", metavar="", dest="jvmParameters", action="store", 81 | default=DEFAULT_JVM, help="Set jvm parameters if necessary.") 82 | prodEnvOptionGroup.add_option("--jobid", metavar="", dest="jobid", action="store", default="-1", 83 | help="Set job unique id when running by Distribute/Local Mode.") 84 | prodEnvOptionGroup.add_option("-m", "--mode", metavar="", 85 | action="store", default="standalone", 86 | help="Set job runtime mode such as: standalone, local, distribute. 
" 87 | "Default mode is standalone.") 88 | prodEnvOptionGroup.add_option("-p", "--params", metavar="", 89 | action="store", dest="params", 90 | help='Set job parameter, eg: the source tableName you want to set it by command, ' 91 | 'then you can use like this: -p"-DtableName=your-table-name", ' 92 | 'if you have mutiple parameters: -p"-DtableName=your-table-name -DcolumnName=your-column-name".' 93 | 'Note: you should config in you job tableName with ${tableName}.') 94 | prodEnvOptionGroup.add_option("-r", "--reader", metavar="", 95 | action="store", dest="reader",type="string", 96 | help='View job config[reader] template, eg: mysqlreader,streamreader') 97 | prodEnvOptionGroup.add_option("-w", "--writer", metavar="", 98 | action="store", dest="writer",type="string", 99 | help='View job config[writer] template, eg: mysqlwriter,streamwriter') 100 | parser.add_option_group(prodEnvOptionGroup) 101 | 102 | devEnvOptionGroup = OptionGroup(parser, "Develop/Debug Options", 103 | "Developer use these options to trace more details of DataX.") 104 | devEnvOptionGroup.add_option("-d", "--debug", dest="remoteDebug", action="store_true", 105 | help="Set to remote debug mode.") 106 | devEnvOptionGroup.add_option("--loglevel", metavar="", dest="loglevel", action="store", 107 | default="info", help="Set log level such as: debug, info, all etc.") 108 | parser.add_option_group(devEnvOptionGroup) 109 | return parser 110 | 111 | def generateJobConfigTemplate(reader, writer): 112 | readerRef = "Please refer to the %s document:\n https://github.com/alibaba/DataX/blob/master/%s/doc/%s.md \n" % (reader,reader,reader) 113 | writerRef = "Please refer to the %s document:\n https://github.com/alibaba/DataX/blob/master/%s/doc/%s.md \n " % (writer,writer,writer) 114 | print(readerRef) 115 | print(writerRef) 116 | jobGuid = 'Please save the following configuration as a json file and use\n python {DATAX_HOME}/bin/datax.py {JSON_FILE_NAME}.json \nto run the job.\n' 117 | print(jobGuid) 118 | jobTemplate={ 119 | "job": { 120 | "setting": { 121 | "speed": { 122 | "channel": "" 123 | } 124 | }, 125 | "content": [ 126 | { 127 | "reader": {}, 128 | "writer": {} 129 | } 130 | ] 131 | } 132 | } 133 | readerTemplatePath = "%s/plugin/reader/%s/plugin_job_template.json" % (DATAX_HOME,reader) 134 | writerTemplatePath = "%s/plugin/writer/%s/plugin_job_template.json" % (DATAX_HOME,writer) 135 | try: 136 | readerPar = readPluginTemplate(readerTemplatePath); 137 | except Exception as e: 138 | print("Read reader[%s] template error: can\'t find file %s" % (reader,readerTemplatePath)) 139 | try: 140 | writerPar = readPluginTemplate(writerTemplatePath); 141 | except Exception as e: 142 | print("Read writer[%s] template error: : can\'t find file %s" % (writer,writerTemplatePath)) 143 | jobTemplate['job']['content'][0]['reader'] = readerPar; 144 | jobTemplate['job']['content'][0]['writer'] = writerPar; 145 | print(json.dumps(jobTemplate, indent=4, sort_keys=True)) 146 | 147 | def readPluginTemplate(plugin): 148 | with open(plugin, 'r') as f: 149 | return json.load(f) 150 | 151 | def isUrl(path): 152 | if not path: 153 | return False 154 | 155 | assert (isinstance(path, str)) 156 | m = re.match(r"^http[s]?://\S+\w*", path.lower()) 157 | if m: 158 | return True 159 | else: 160 | return False 161 | 162 | 163 | def buildStartCommand(options, args): 164 | commandMap = {} 165 | tempJVMCommand = DEFAULT_JVM 166 | if options.jvmParameters: 167 | tempJVMCommand = tempJVMCommand + " " + options.jvmParameters 168 | 169 | if options.remoteDebug: 170 | 
tempJVMCommand = tempJVMCommand + " " + REMOTE_DEBUG_CONFIG 171 | print('local ip: ', getLocalIp()) 172 | 173 | if options.loglevel: 174 | tempJVMCommand = tempJVMCommand + " " + ("-Dloglevel=%s" % (options.loglevel)) 175 | 176 | if options.mode: 177 | commandMap["mode"] = options.mode 178 | 179 | # jobResource may be a URL or a local file path (relative or absolute) 180 | jobResource = args[0] 181 | if not isUrl(jobResource): 182 | jobResource = os.path.abspath(jobResource) 183 | if jobResource.lower().startswith("file://"): 184 | jobResource = jobResource[len("file://"):] 185 | 186 | jobParams = ("-Dlog.file.name=%s") % (jobResource[-20:].replace('/', '_').replace('.', '_')) 187 | if options.params: 188 | jobParams = jobParams + " " + options.params 189 | 190 | if options.jobid: 191 | commandMap["jobid"] = options.jobid 192 | 193 | commandMap["jvm"] = tempJVMCommand 194 | commandMap["params"] = jobParams 195 | commandMap["job"] = jobResource 196 | 197 | return Template(ENGINE_COMMAND).substitute(**commandMap) 198 | 199 | 200 | def printCopyright(): 201 | print(''' 202 | DataX (%s), From Alibaba ! 203 | Copyright (C) 2010-2017, Alibaba Group. All Rights Reserved. 204 | 205 | ''' % DATAX_VERSION) 206 | sys.stdout.flush() 207 | 208 | 209 | if __name__ == "__main__": 210 | printCopyright() 211 | parser = getOptionParser() 212 | options, args = parser.parse_args(sys.argv[1:]) 213 | if options.reader is not None and options.writer is not None: 214 | generateJobConfigTemplate(options.reader,options.writer) 215 | sys.exit(RET_STATE['OK']) 216 | if len(args) != 1: 217 | parser.print_help() 218 | sys.exit(RET_STATE['FAIL']) 219 | 220 | startCommand = buildStartCommand(options, args) 221 | # print startCommand 222 | 223 | child_process = subprocess.Popen(startCommand, shell=True) 224 | register_signal() 225 | (stdout, stderr) = child_process.communicate() 226 | 227 | sys.exit(child_process.returncode) 228 | -------------------------------------------------------------------------------- /perftrace.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | 5 | """ 6 | Life's short, Python more. 7 | """ 8 | 9 | import re 10 | import os 11 | import sys 12 | import json 13 | import uuid 14 | import signal 15 | import time 16 | import subprocess 17 | from optparse import OptionParser 18 | 19 | 20 | 21 | ##begin cli & help logic 22 | def getOptionParser(): 23 | usage = getUsage() 24 | parser = OptionParser(usage = usage) 25 | #rdbms reader and writer 26 | parser.add_option('-r', '--reader', action='store', dest='reader', help='trace datasource read performance with specified !json! string') 27 | parser.add_option('-w', '--writer', action='store', dest='writer', help='trace datasource write performance with specified !json! 
string') 28 | 29 | parser.add_option('-c', '--channel', action='store', dest='channel', default='1', help='the number of concurrent sync threads, the default is 1') 30 | parser.add_option('-f', '--file', action='store', help='an existing datax configuration file, including reader and writer params') 31 | parser.add_option('-t', '--type', action='store', default='reader', help='which side\'s performance to trace, used together with -f --file, must be reader or writer') 32 | parser.add_option('-d', '--delete', action='store', default='true', help='delete temporary files, the default value is true') 33 | #parser.add_option('-h', '--help', action='store', default='true', help='print usage information') 34 | return parser 35 | 36 | def getUsage(): 37 | return ''' 38 | The following params are available for -r --reader: 39 | [these params are for the rdbms reader, used to trace rdbms read performance; they match datax's keys] 40 | *datasourceType: datasource type, may be mysql|drds|oracle|ads|sqlserver|postgresql|db2 etc... 41 | *jdbcUrl: datasource jdbc connection string, mysql as an example: jdbc:mysql://ip:port/database 42 | *username: username for datasource 43 | *password: password for datasource 44 | *table: table name to read data from 45 | column: columns to be read, the default value is ['*'] 46 | splitPk: the splitPk column of the rdbms table 47 | where: limit the scope of the performance data set 48 | fetchSize: how many rows are fetched in each round trip 49 | 50 | [these params are for the stream reader, used to trace rdbms write performance] 51 | reader-sliceRecordCount: how many test records to mock (per channel), the default value is 10000 52 | reader-column: columns the stream reader uses to generate test data (supported types: string|long|date|double|bool|bytes; constant values and the random function are supported), demo: [{"type":"string","value":"abc"},{"type":"string","random":"10,20"}] 53 | 54 | The following params are available for -w --writer: 55 | [these params are for the rdbms writer, used to trace rdbms write performance; they match datax's keys] 56 | datasourceType: datasource type, may be mysql|drds|oracle|ads|sqlserver|postgresql|db2 etc... 
57 | *jdbcUrl: datasource jdbc connection string, mysql as an example: jdbc:mysql://ip:port/database 58 | *username: username for datasource 59 | *password: password for datasource 60 | *table: table name to write data into 61 | column: columns to be written, the default value is ['*'] 62 | batchSize: how many rows are written in each batch, the default value is 512 63 | preSql: sql to be executed before writing data, the default value is '' 64 | postSql: sql to be executed after writing data, the default value is '' 65 | url: required for ads, pattern is ip:port 66 | schme: required for ads, ads database name 67 | 68 | [these params are for the stream writer, used to trace rdbms read performance] 69 | writer-print: true means print data read from the source datasource, the default value is false 70 | 71 | The following params are available for global control: 72 | -c --channel: the number of concurrent tasks, the default value is 1 73 | -f --file: path to an existing, complete dataX configuration file 74 | -t --type: test read or write performance for a datasource, must be reader or writer, used together with -f --file 75 | -h --help: print help message 76 | 77 | some demo: 78 | perftrace.py --channel=10 --reader='{"jdbcUrl":"jdbc:mysql://127.0.0.1:3306/database", "username":"", "password":"", "table": "", "where":"", "splitPk":"", "writer-print":"false"}' 79 | perftrace.py --channel=10 --writer='{"jdbcUrl":"jdbc:mysql://127.0.0.1:3306/database", "username":"", "password":"", "table": "", "reader-sliceRecordCount": "10000", "reader-column": [{"type":"string","value":"abc"},{"type":"string","random":"10,20"}]}' 80 | perftrace.py --file=/tmp/datax.job.json --type=reader --reader='{"writer-print": "false"}' 81 | perftrace.py --file=/tmp/datax.job.json --type=writer --writer='{"reader-sliceRecordCount": "10000", "reader-column": [{"type":"string","value":"abc"},{"type":"string","random":"10,20"}]}' 82 | 83 | some example jdbc url patterns that may help: 84 | jdbc:oracle:thin:@ip:port:database 85 | jdbc:mysql://ip:port/database 86 | jdbc:sqlserver://ip:port;DatabaseName=database 87 | jdbc:postgresql://ip:port/database 88 | warn: ads url pattern is ip:port 89 | warn: testing write performance will write data into your table; you can use a temporary table just for testing. 90 | ''' 91 | 92 | def printCopyright(): 93 | DATAX_VERSION = 'UNKNOWN_DATAX_VERSION' 94 | print(''' 95 | DataX Util Tools (%s), From Alibaba ! 96 | Copyright (C) 2010-2016, Alibaba Group. All Rights Reserved.''' % DATAX_VERSION) 97 | sys.stdout.flush() 98 | 99 | 100 | def yesNoChoice(): 101 | yes = set(['yes','y', 'ye', '']) 102 | no = set(['no','n']) 103 | choice = input().lower() 104 | if choice in yes: 105 | return True 106 | elif choice in no: 107 | return False 108 | else: 109 | sys.stdout.write("Please respond with 'yes' or 'no'") 110 | ##end cli & help logic 111 | 112 | 113 | ##begin process logic 114 | def suicide(signum, e): 115 | global childProcess 116 | print("[Error] Receive unexpected signal %d, starts to suicide." % (signum), file=sys.stderr) 117 | if childProcess: 118 | childProcess.send_signal(signal.SIGQUIT) 119 | time.sleep(1) 120 | childProcess.kill() 121 | print("DataX Process was killed ! you did ?", file=sys.stderr) 
122 | sys.exit(-1) 123 | 124 | 125 | def registerSignal(): 126 | global childProcess 127 | signal.signal(2, suicide) 128 | signal.signal(3, suicide) 129 | signal.signal(15, suicide) 130 | 131 | 132 | def fork(command, isShell=False): 133 | global childProcess 134 | childProcess = subprocess.Popen(command, shell = isShell) 135 | registerSignal() 136 | (stdout, stderr) = childProcess.communicate() 137 | #阻塞直到子进程结束 138 | childProcess.wait() 139 | return childProcess.returncode 140 | ##end process logic 141 | 142 | 143 | ##begin datax json generate logic 144 | #warn: if not '': -> true; if not None: -> true 145 | def notNone(obj, context): 146 | if not obj: 147 | raise Exception("Configuration property [%s] could not be blank!" % (context)) 148 | 149 | def attributeNotNone(obj, attributes): 150 | for key in attributes: 151 | notNone(obj.get(key), key) 152 | 153 | def isBlank(value): 154 | if value is None or len(value.strip()) == 0: 155 | return True 156 | return False 157 | 158 | def parsePluginName(jdbcUrl, pluginType): 159 | import re 160 | #warn: drds 161 | name = 'pluginName' 162 | mysqlRegex = re.compile('jdbc:(mysql)://.*') 163 | if (mysqlRegex.match(jdbcUrl)): 164 | name = 'mysql' 165 | postgresqlRegex = re.compile('jdbc:(postgresql)://.*') 166 | if (postgresqlRegex.match(jdbcUrl)): 167 | name = 'postgresql' 168 | oracleRegex = re.compile('jdbc:(oracle):.*') 169 | if (oracleRegex.match(jdbcUrl)): 170 | name = 'oracle' 171 | sqlserverRegex = re.compile('jdbc:(sqlserver)://.*') 172 | if (sqlserverRegex.match(jdbcUrl)): 173 | name = 'sqlserver' 174 | db2Regex = re.compile('jdbc:(db2)://.*') 175 | if (db2Regex.match(jdbcUrl)): 176 | name = 'db2' 177 | return "%s%s" % (name, pluginType) 178 | 179 | def renderDataXJson(paramsDict, readerOrWriter = 'reader', channel = 1): 180 | dataxTemplate = { 181 | "job": { 182 | "setting": { 183 | "speed": { 184 | "channel": 1 185 | } 186 | }, 187 | "content": [ 188 | { 189 | "reader": { 190 | "name": "", 191 | "parameter": { 192 | "username": "", 193 | "password": "", 194 | "sliceRecordCount": "10000", 195 | "column": [ 196 | "*" 197 | ], 198 | "connection": [ 199 | { 200 | "table": [], 201 | "jdbcUrl": [] 202 | } 203 | ] 204 | } 205 | }, 206 | "writer": { 207 | "name": "", 208 | "parameter": { 209 | "print": "false", 210 | "connection": [ 211 | { 212 | "table": [], 213 | "jdbcUrl": '' 214 | } 215 | ] 216 | } 217 | } 218 | } 219 | ] 220 | } 221 | } 222 | dataxTemplate['job']['setting']['speed']['channel'] = channel 223 | dataxTemplateContent = dataxTemplate['job']['content'][0] 224 | 225 | pluginName = '' 226 | if paramsDict.get('datasourceType'): 227 | pluginName = '%s%s' % (paramsDict['datasourceType'], readerOrWriter) 228 | elif paramsDict.get('jdbcUrl'): 229 | pluginName = parsePluginName(paramsDict['jdbcUrl'], readerOrWriter) 230 | elif paramsDict.get('url'): 231 | pluginName = 'adswriter' 232 | 233 | theOtherSide = 'writer' if readerOrWriter == 'reader' else 'reader' 234 | dataxPluginParamsContent = dataxTemplateContent.get(readerOrWriter).get('parameter') 235 | dataxPluginParamsContent.update(paramsDict) 236 | 237 | dataxPluginParamsContentOtherSide = dataxTemplateContent.get(theOtherSide).get('parameter') 238 | 239 | if readerOrWriter == 'reader': 240 | dataxTemplateContent.get('reader')['name'] = pluginName 241 | dataxTemplateContent.get('writer')['name'] = 'streamwriter' 242 | if paramsDict.get('writer-print'): 243 | dataxPluginParamsContentOtherSide['print'] = paramsDict['writer-print'] 244 | del dataxPluginParamsContent['writer-print'] 245 | 
del dataxPluginParamsContentOtherSide['connection'] 246 | if readerOrWriter == 'writer': 247 | dataxTemplateContent.get('reader')['name'] = 'streamreader' 248 | dataxTemplateContent.get('writer')['name'] = pluginName 249 | if paramsDict.get('reader-column'): 250 | dataxPluginParamsContentOtherSide['column'] = paramsDict['reader-column'] 251 | del dataxPluginParamsContent['reader-column'] 252 | if paramsDict.get('reader-sliceRecordCount'): 253 | dataxPluginParamsContentOtherSide['sliceRecordCount'] = paramsDict['reader-sliceRecordCount'] 254 | del dataxPluginParamsContent['reader-sliceRecordCount'] 255 | del dataxPluginParamsContentOtherSide['connection'] 256 | 257 | if paramsDict.get('jdbcUrl'): 258 | if readerOrWriter == 'reader': 259 | dataxPluginParamsContent['connection'][0]['jdbcUrl'].append(paramsDict['jdbcUrl']) 260 | else: 261 | dataxPluginParamsContent['connection'][0]['jdbcUrl'] = paramsDict['jdbcUrl'] 262 | if paramsDict.get('table'): 263 | dataxPluginParamsContent['connection'][0]['table'].append(paramsDict['table']) 264 | 265 | 266 | traceJobJson = json.dumps(dataxTemplate, indent = 4) 267 | return traceJobJson 268 | 269 | def isUrl(path): 270 | if not path: 271 | return False 272 | if not isinstance(path, str): 273 | raise Exception('Configuration file path must be a string, but you configured: %s' % path) 274 | m = re.match(r"^http[s]?://\S+\w*", path.lower()) 275 | if m: 276 | return True 277 | else: 278 | return False 279 | 280 | 281 | def readJobJsonFromLocal(jobConfigPath): 282 | jobConfigContent = None 283 | jobConfigPath = os.path.abspath(jobConfigPath) 284 | file = open(jobConfigPath) 285 | try: 286 | jobConfigContent = file.read() 287 | finally: 288 | file.close() 289 | if not jobConfigContent: 290 | raise Exception("Your job configuration file is empty, please check that the configuration is valid, path: [%s]\nconfiguration:\n%s" % (jobConfigPath, str(jobConfigContent))) 291 | return jobConfigContent 292 | 293 | 294 | def readJobJsonFromRemote(jobConfigPath): 295 | import urllib.request 296 | conn = urllib.request.urlopen(jobConfigPath) 297 | jobJson = conn.read().decode('utf-8') 298 | return jobJson 299 | 300 | def parseJson(strConfig, context): 301 | try: 302 | return json.loads(strConfig) 303 | except Exception as e: 304 | import traceback 305 | traceback.print_exc() 306 | sys.stdout.flush() 307 | print('%s %s must conform to json syntax' % (context, strConfig), file=sys.stderr) 308 | sys.exit(-1) 309 | 310 | def convert(options, args): 311 | traceJobJson = '' 312 | if options.file: 313 | if isUrl(options.file): 314 | traceJobJson = readJobJsonFromRemote(options.file) 315 | else: 316 | traceJobJson = readJobJsonFromLocal(options.file) 317 | traceJobDict = parseJson(traceJobJson, '%s content' % options.file) 318 | attributeNotNone(traceJobDict, ['job']) 319 | attributeNotNone(traceJobDict['job'], ['content']) 320 | attributeNotNone(traceJobDict['job']['content'][0], ['reader', 'writer']) 321 | attributeNotNone(traceJobDict['job']['content'][0]['reader'], ['name', 'parameter']) 322 | attributeNotNone(traceJobDict['job']['content'][0]['writer'], ['name', 'parameter']) 323 | if options.type == 'reader': 324 | traceJobDict['job']['content'][0]['writer']['name'] = 'streamwriter' 325 | if options.reader: 326 | traceReaderDict = parseJson(options.reader, 'reader config') 327 | if traceReaderDict.get('writer-print') is not None: 328 | traceJobDict['job']['content'][0]['writer']['parameter']['print'] = traceReaderDict.get('writer-print') 329 | else: 330 | 
traceJobDict['job']['content'][0]['writer']['parameter']['print'] = 'false' 331 | else: 332 | traceJobDict['job']['content'][0]['writer']['parameter']['print'] = 'false' 333 | elif options.type == 'writer': 334 | traceJobDict['job']['content'][0]['reader']['name'] = 'streamreader' 335 | if options.writer: 336 | traceWriterDict = parseJson(options.writer, 'writer config') 337 | if traceWriterDict.get('reader-column'): 338 | traceJobDict['job']['content'][0]['reader']['parameter']['column'] = traceWriterDict['reader-column'] 339 | if traceWriterDict.get('reader-sliceRecordCount'): 340 | traceJobDict['job']['content'][0]['reader']['parameter']['sliceRecordCount'] = traceWriterDict['reader-sliceRecordCount'] 341 | else: 342 | columnSize = len(traceJobDict['job']['content'][0]['writer']['parameter']['column']) 343 | streamReaderColumn = [] 344 | for i in range(columnSize): 345 | streamReaderColumn.append({"type": "long", "random": "2,10"}) 346 | traceJobDict['job']['content'][0]['reader']['parameter']['column'] = streamReaderColumn 347 | traceJobDict['job']['content'][0]['reader']['parameter']['sliceRecordCount'] = 10000 348 | else: 349 | pass#do nothing 350 | return json.dumps(traceJobDict, indent = 4) 351 | elif options.reader: 352 | traceReaderDict = parseJson(options.reader, 'reader config') 353 | return renderDataXJson(traceReaderDict, 'reader', options.channel) 354 | elif options.writer: 355 | traceWriterDict = parseJson(options.writer, 'writer config') 356 | return renderDataXJson(traceWriterDict, 'writer', options.channel) 357 | else: 358 | print(getUsage()) 359 | sys.exit(-1) 360 | #dataxParams = {} 361 | #for opt, value in options.__dict__.items(): 362 | # dataxParams[opt] = value 363 | ##end datax json generate logic 364 | 365 | 366 | if __name__ == "__main__": 367 | printCopyright() 368 | parser = getOptionParser() 369 | 370 | options, args = parser.parse_args(sys.argv[1:]) 371 | #print options, args 372 | dataxTraceJobJson = convert(options, args) 373 | 374 | #由MAC地址、当前时间戳、随机数生成,可以保证全球范围内的唯一性 375 | dataxJobPath = os.path.join(os.getcwd(), "perftrace-" + str(uuid.uuid1())) 376 | jobConfigOk = True 377 | if os.path.exists(dataxJobPath): 378 | print("file already exists, truncate and rewrite it? %s" % dataxJobPath) 379 | if yesNoChoice(): 380 | jobConfigOk = True 381 | else: 382 | print("exit failed, because of file conflict") 383 | sys.exit(-1) 384 | fileWriter = open(dataxJobPath, 'w') 385 | fileWriter.write(dataxTraceJobJson) 386 | fileWriter.close() 387 | 388 | 389 | print("trace environments:") 390 | print("dataxJobPath: %s" % dataxJobPath) 391 | dataxHomePath = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 392 | print("dataxHomePath: %s" % dataxHomePath) 393 | 394 | dataxCommand = "%s %s" % (os.path.join(dataxHomePath, "bin", "datax.py"), dataxJobPath) 395 | print("dataxCommand: %s" % dataxCommand) 396 | 397 | returncode = fork(dataxCommand, True) 398 | if options.delete == 'true': 399 | os.remove(dataxJobPath) 400 | sys.exit(returncode) 401 | --------------------------------------------------------------------------------
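For reference, here is a small sketch of how perftrace.py assembles a throwaway DataX job from a reader spec before handing it to datax.py (assuming perftrace.py is importable from the current directory; the connection values below are placeholders, not a real database):

    import perftrace

    readerSpec = {
        "jdbcUrl": "jdbc:mysql://127.0.0.1:3306/demo",   # placeholder URL
        "username": "tester",                            # placeholder credentials
        "password": "secret",
        "table": "orders",                               # placeholder table
        "writer-print": "false",
    }

    # Builds a job whose read side is mysqlreader (derived from the jdbcUrl)
    # and whose write side is streamwriter, with speed.channel set to 10.
    jobJson = perftrace.renderDataXJson(readerSpec, 'reader', channel=10)
    print(jobJson)

When run from the command line, perftrace.py writes this JSON to a perftrace-<uuid> file in the current directory, launches {datax_home}/bin/datax.py against it, and removes the file afterwards unless --delete is set to something other than true.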