├── README.md ├── dxprof.py ├── datax.py └── perftrace.py /README.md: -------------------------------------------------------------------------------- 1 | # DataX_Python3 2 | 3 | Since my local Python is 3.7.0, I modified the three .py files under the [datax_home]/bin directory so that they conform to Python 3 syntax, and they run without reporting errors. 4 | 5 | If needed, simply download these three files and replace the originals with them. 6 | 7 | DataX study notes: https://blog.csdn.net/weixin_41287692/article/details/83620261 8 | -------------------------------------------------------------------------------- /dxprof.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # vim: set expandtab tabstop=4 shiftwidth=4 foldmethod=marker nu: 3 | 4 | import re 5 | import sys 6 | import time 7 | 8 | REG_SQL_WAKE = re.compile(r'Begin\s+to\s+read\s+record\s+by\s+Sql', re.IGNORECASE) 9 | REG_SQL_DONE = re.compile(r'Finished\s+read\s+record\s+by\s+Sql', re.IGNORECASE) 10 | REG_SQL_PATH = re.compile(r'from\s+(\w+)(\s+where|\s*$)', re.IGNORECASE) 11 | REG_SQL_JDBC = re.compile(r'jdbcUrl:\s*\[(.+?)\]', re.IGNORECASE) 12 | REG_SQL_UUID = re.compile(r'(\d+\-)+reader') 13 | REG_COMMIT_UUID = re.compile(r'(\d+\-)+writer') 14 | REG_COMMIT_WAKE = re.compile(r'begin\s+to\s+commit\s+blocks', re.IGNORECASE) 15 | REG_COMMIT_DONE = re.compile(r'commit\s+blocks\s+ok', re.IGNORECASE) 16 | 17 | # {{{ function parse_timestamp() # 18 | def parse_timestamp(line): 19 | try: 20 | ts = int(time.mktime(time.strptime(line[0:19], '%Y-%m-%d %H:%M:%S'))) 21 | except: 22 | ts = 0 23 | 24 | return ts 25 | 26 | # }}} # 27 | 28 | # {{{ function parse_query_host() # 29 | def parse_query_host(line): 30 | ori = REG_SQL_JDBC.search(line) 31 | if (not ori): 32 | return '' 33 | 34 | ori = ori.group(1).split('?')[0] 35 | off = ori.find('@') 36 | if (off > -1): 37 | ori = ori[off+1:len(ori)] 38 | else: 39 | off = ori.find('//') 40 | if (off > -1): 41 | ori = ori[off+2:len(ori)] 42 | 43 | return ori.lower() 44 | # }}} # 45 | 46 | # {{{ function parse_query_table() # 47 | def parse_query_table(line): 48 | ori = REG_SQL_PATH.search(line) 49 | return (ori and ori.group(1).lower()) or '' 50 | # }}} # 51 | 52 | # {{{ function parse_reader_task() # 53 | def parse_task(fname): 54 | global LAST_SQL_UUID 55 | global LAST_COMMIT_UUID 56 | global DATAX_JOBDICT 57 | global DATAX_JOBDICT_COMMIT 58 | global UNIXTIME 59 | LAST_SQL_UUID = '' 60 | DATAX_JOBDICT = {} 61 | LAST_COMMIT_UUID = '' 62 | DATAX_JOBDICT_COMMIT = {} 63 | 64 | UNIXTIME = int(time.time()) 65 | with open(fname, 'r') as f: 66 | for line in f.readlines(): 67 | line = line.strip() 68 | 69 | if (LAST_SQL_UUID and (LAST_SQL_UUID in DATAX_JOBDICT)): 70 | DATAX_JOBDICT[LAST_SQL_UUID]['host'] = parse_query_host(line) 71 | LAST_SQL_UUID = '' 72 | 73 | if line.find('CommonRdbmsReader$Task') > 0: 74 | parse_read_task(line) 75 | elif line.find('commit blocks') > 0: 76 | parse_write_task(line) 77 | else: 78 | continue 79 | # }}} # 80 | 81 | # {{{ function parse_read_task() # 82 | def parse_read_task(line): 83 | ser = REG_SQL_UUID.search(line) 84 | if not ser: 85 | return 86 | 87 | LAST_SQL_UUID = ser.group() 88 | if REG_SQL_WAKE.search(line): 89 | DATAX_JOBDICT[LAST_SQL_UUID] = { 90 | 'stat' : 'R', 91 | 'wake' : parse_timestamp(line), 92 | 'done' : UNIXTIME, 93 | 'host' : parse_query_host(line), 94 | 'path' : parse_query_table(line) 95 | } 96 | elif ((LAST_SQL_UUID in DATAX_JOBDICT) and REG_SQL_DONE.search(line)): 97 | DATAX_JOBDICT[LAST_SQL_UUID]['stat'] = 'D' 98 | DATAX_JOBDICT[LAST_SQL_UUID]['done'] = parse_timestamp(line) 99 | # }}} # 100 | 101 | # {{{ function parse_write_task() # 102 | def
parse_write_task(line): 103 | ser = REG_COMMIT_UUID.search(line) 104 | if not ser: 105 | return 106 | 107 | LAST_COMMIT_UUID = ser.group() 108 | if REG_COMMIT_WAKE.search(line): 109 | DATAX_JOBDICT_COMMIT[LAST_COMMIT_UUID] = { 110 | 'stat' : 'R', 111 | 'wake' : parse_timestamp(line), 112 | 'done' : UNIXTIME, 113 | } 114 | elif ((LAST_COMMIT_UUID in DATAX_JOBDICT_COMMIT) and REG_COMMIT_DONE.search(line)): 115 | DATAX_JOBDICT_COMMIT[LAST_COMMIT_UUID]['stat'] = 'D' 116 | DATAX_JOBDICT_COMMIT[LAST_COMMIT_UUID]['done'] = parse_timestamp(line) 117 | # }}} # 118 | 119 | # {{{ function result_analyse() # 120 | def result_analyse(): 121 | def sort_key(task): 122 | return task['cost'] 123 | 124 | tasklist = [] 125 | hostsmap = {} 126 | statvars = {'sum' : 0, 'cnt' : 0, 'svr' : 0, 'max' : 0, 'min' : int(time.time())} 127 | tasklist_commit = [] 128 | statvars_commit = {'sum' : 0, 'cnt' : 0} 129 | 130 | for idx in DATAX_JOBDICT: 131 | item = DATAX_JOBDICT[idx] 132 | item['uuid'] = idx; 133 | item['cost'] = item['done'] - item['wake'] 134 | tasklist.append(item); 135 | 136 | if (not (item['host'] in hostsmap)): 137 | hostsmap[item['host']] = 1 138 | statvars['svr'] += 1 139 | 140 | if (item['cost'] > -1 and item['cost'] < 864000): 141 | statvars['sum'] += item['cost'] 142 | statvars['cnt'] += 1 143 | statvars['max'] = max(statvars['max'], item['done']) 144 | statvars['min'] = min(statvars['min'], item['wake']) 145 | 146 | for idx in DATAX_JOBDICT_COMMIT: 147 | itemc = DATAX_JOBDICT_COMMIT[idx] 148 | itemc['uuid'] = idx 149 | itemc['cost'] = itemc['done'] - itemc['wake'] 150 | tasklist_commit.append(itemc) 151 | 152 | if (itemc['cost'] > -1 and itemc['cost'] < 864000): 153 | statvars_commit['sum'] += itemc['cost'] 154 | statvars_commit['cnt'] += 1 155 | 156 | ttl = (statvars['max'] - statvars['min']) or 1 157 | idx = float(statvars['cnt']) / (statvars['sum'] or ttl) 158 | 159 | tasklist.sort(key=sort_key, reverse=True) 160 | for item in tasklist: 161 | print('%s\t%s.%s\t%s\t%s\t% 4d\t% 2.1f%%\t% .2f' %(item['stat'], item['host'], item['path'], 162 | time.strftime('%H:%M:%S', time.localtime(item['wake'])), 163 | (('D' == item['stat']) and time.strftime('%H:%M:%S', time.localtime(item['done']))) or '--', 164 | item['cost'], 100 * item['cost'] / ttl, idx * item['cost'])) 165 | 166 | if (not len(tasklist) or not statvars['cnt']): 167 | return 168 | 169 | print('\n--- DataX Profiling Statistics ---') 170 | print('%d task(s) on %d server(s), Total elapsed %d second(s), %.2f second(s) per task on average' %(statvars['cnt'], 171 | statvars['svr'], statvars['sum'], float(statvars['sum']) / statvars['cnt'])) 172 | print('Actually cost %d second(s) (%s - %s), task concurrency: %.2f, tilt index: %.2f' %(ttl, 173 | time.strftime('%H:%M:%S', time.localtime(statvars['min'])), 174 | time.strftime('%H:%M:%S', time.localtime(statvars['max'])), 175 | float(statvars['sum']) / ttl, idx * tasklist[0]['cost'])) 176 | 177 | idx_commit = float(statvars_commit['cnt']) / (statvars_commit['sum'] or ttl) 178 | tasklist_commit.sort(key=sort_key, reverse=True) 179 | print('%d task(s) done odps commit, Total elapsed %d second(s), %.2f second(s) per task on average, tilt index: %.2f' % ( 180 | statvars_commit['cnt'], 181 | statvars_commit['sum'], float(statvars_commit['sum']) / statvars_commit['cnt'], 182 | idx_commit * tasklist_commit[0]['cost'])) 183 | 184 | # }}} # 185 | 186 | if __name__ == '__main__': 187 | if len(sys.argv) < 2: 188 | print("Usage: %s filename" % (sys.argv[0])) 189 | sys.exit(1) 190 | parse_task(sys.argv[1]) 191 | result_analyse() 192 |
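A quick sketch of how dxprof.py can be driven from another script or an interactive session (assuming dxprof.py is on the import path and, as above, its CLI code is guarded by an if __name__ == '__main__' block; the log line and log file name below are made-up examples, not real DataX output):

    import dxprof

    # A representative CommonRdbmsReader log line (values are illustrative).
    line = ('2018-11-01 10:00:00.123 [0-0-0-reader] INFO  CommonRdbmsReader$Task '
            '- Begin to read record by Sql: [select * from orders where id > 0] '
            'jdbcUrl:[jdbc:mysql://127.0.0.1:3306/demo].')

    print(dxprof.parse_timestamp(line))    # epoch seconds for '2018-11-01 10:00:00'
    print(dxprof.parse_query_host(line))   # '127.0.0.1:3306/demo'
    print(dxprof.parse_query_table(line))  # 'orders'

    # Or profile a whole DataX job log (hypothetical path):
    dxprof.parse_task('datax_job.log')
    dxprof.result_analyse()

Normal command-line usage is simply python3 dxprof.py <datax-job-log>; each output row shows the task state (R or D), host.table, wake and done times, the cost in seconds, its share of the total elapsed time, and a tilt score.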
-------------------------------------------------------------------------------- /datax.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | import sys 5 | import os 6 | import signal 7 | import subprocess 8 | import time 9 | import re 10 | import socket 11 | import json 12 | from optparse import OptionParser 13 | from optparse import OptionGroup 14 | from string import Template 15 | import codecs 16 | import platform 17 | 18 | def isWindows(): 19 | return platform.system() == 'Windows' 20 | 21 | DATAX_HOME = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 22 | 23 | DATAX_VERSION = 'DATAX-OPENSOURCE-3.0' 24 | if isWindows(): 25 | codecs.register(lambda name: name == 'cp65001' and codecs.lookup('utf-8') or None) 26 | CLASS_PATH = ("%s/lib/*") % (DATAX_HOME) 27 | else: 28 | CLASS_PATH = ("%s/lib/*:.") % (DATAX_HOME) 29 | LOGBACK_FILE = ("%s/conf/logback.xml") % (DATAX_HOME) 30 | DEFAULT_JVM = "-Xms1g -Xmx1g -XX:+HeapDumpOnOutOfMemoryError -XX:HeapDumpPath=%s/log" % (DATAX_HOME) 31 | DEFAULT_PROPERTY_CONF = "-Dfile.encoding=UTF-8 -Dlogback.statusListenerClass=ch.qos.logback.core.status.NopStatusListener -Djava.security.egd=file:///dev/urandom -Ddatax.home=%s -Dlogback.configurationFile=%s" % ( 32 | DATAX_HOME, LOGBACK_FILE) 33 | ENGINE_COMMAND = "java -server ${jvm} %s -classpath %s ${params} com.alibaba.datax.core.Engine -mode ${mode} -jobid ${jobid} -job ${job}" % ( 34 | DEFAULT_PROPERTY_CONF, CLASS_PATH) 35 | REMOTE_DEBUG_CONFIG = "-Xdebug -Xrunjdwp:transport=dt_socket,server=y,address=9999" 36 | 37 | RET_STATE = { 38 | "KILL": 143, 39 | "FAIL": -1, 40 | "OK": 0, 41 | "RUN": 1, 42 | "RETRY": 2 43 | } 44 | 45 | 46 | def getLocalIp(): 47 | try: 48 | return socket.gethostbyname(socket.getfqdn(socket.gethostname())) 49 | except: 50 | return "Unknown" 51 | 52 | 53 | def suicide(signum, e): 54 | global child_process 55 | print("[Error] DataX receive unexpected signal %d, starts to suicide." % (signum), file=sys.stderr) 56 | 57 | if child_process: 58 | child_process.send_signal(signal.SIGQUIT) 59 | time.sleep(1) 60 | child_process.kill() 61 | print("DataX Process was killed ! you did ?", file=sys.stderr) 62 | sys.exit(RET_STATE["KILL"]) 63 | 64 | 65 | def register_signal(): 66 | if not isWindows(): 67 | global child_process 68 | signal.signal(2, suicide) 69 | signal.signal(3, suicide) 70 | signal.signal(15, suicide) 71 | 72 | 73 | def getOptionParser(): 74 | usage = "usage: %prog [options] job-url-or-path" 75 | parser = OptionParser(usage=usage) 76 | 77 | prodEnvOptionGroup = OptionGroup(parser, "Product Env Options", 78 | "Normal users use these options to set jvm parameters, job runtime mode etc. " 79 | "Make sure these options can be used in Product Env.") 80 | prodEnvOptionGroup.add_option("-j", "--jvm", metavar="", dest="jvmParameters", action="store", 81 | default=DEFAULT_JVM, help="Set jvm parameters if necessary.") 82 | prodEnvOptionGroup.add_option("--jobid", metavar="", dest="jobid", action="store", default="-1", 83 | help="Set job unique id when running by Distribute/Local Mode.") 84 | prodEnvOptionGroup.add_option("-m", "--mode", metavar="", 85 | action="store", default="standalone", 86 | help="Set job runtime mode such as: standalone, local, distribute. 
" 87 | "Default mode is standalone.") 88 | prodEnvOptionGroup.add_option("-p", "--params", metavar="", 89 | action="store", dest="params", 90 | help='Set job parameter, eg: the source tableName you want to set it by command, ' 91 | 'then you can use like this: -p"-DtableName=your-table-name", ' 92 | 'if you have mutiple parameters: -p"-DtableName=your-table-name -DcolumnName=your-column-name".' 93 | 'Note: you should config in you job tableName with ${tableName}.') 94 | prodEnvOptionGroup.add_option("-r", "--reader", metavar="", 95 | action="store", dest="reader",type="string", 96 | help='View job config[reader] template, eg: mysqlreader,streamreader') 97 | prodEnvOptionGroup.add_option("-w", "--writer", metavar="", 98 | action="store", dest="writer",type="string", 99 | help='View job config[writer] template, eg: mysqlwriter,streamwriter') 100 | parser.add_option_group(prodEnvOptionGroup) 101 | 102 | devEnvOptionGroup = OptionGroup(parser, "Develop/Debug Options", 103 | "Developer use these options to trace more details of DataX.") 104 | devEnvOptionGroup.add_option("-d", "--debug", dest="remoteDebug", action="store_true", 105 | help="Set to remote debug mode.") 106 | devEnvOptionGroup.add_option("--loglevel", metavar="", dest="loglevel", action="store", 107 | default="info", help="Set log level such as: debug, info, all etc.") 108 | parser.add_option_group(devEnvOptionGroup) 109 | return parser 110 | 111 | def generateJobConfigTemplate(reader, writer): 112 | readerRef = "Please refer to the %s document:\n https://github.com/alibaba/DataX/blob/master/%s/doc/%s.md \n" % (reader,reader,reader) 113 | writerRef = "Please refer to the %s document:\n https://github.com/alibaba/DataX/blob/master/%s/doc/%s.md \n " % (writer,writer,writer) 114 | print(readerRef) 115 | print(writerRef) 116 | jobGuid = 'Please save the following configuration as a json file and use\n python {DATAX_HOME}/bin/datax.py {JSON_FILE_NAME}.json \nto run the job.\n' 117 | print(jobGuid) 118 | jobTemplate={ 119 | "job": { 120 | "setting": { 121 | "speed": { 122 | "channel": "" 123 | } 124 | }, 125 | "content": [ 126 | { 127 | "reader": {}, 128 | "writer": {} 129 | } 130 | ] 131 | } 132 | } 133 | readerTemplatePath = "%s/plugin/reader/%s/plugin_job_template.json" % (DATAX_HOME,reader) 134 | writerTemplatePath = "%s/plugin/writer/%s/plugin_job_template.json" % (DATAX_HOME,writer) 135 | try: 136 | readerPar = readPluginTemplate(readerTemplatePath); 137 | except Exception as e: 138 | print("Read reader[%s] template error: can\'t find file %s" % (reader,readerTemplatePath)) 139 | try: 140 | writerPar = readPluginTemplate(writerTemplatePath); 141 | except Exception as e: 142 | print("Read writer[%s] template error: : can\'t find file %s" % (writer,writerTemplatePath)) 143 | jobTemplate['job']['content'][0]['reader'] = readerPar; 144 | jobTemplate['job']['content'][0]['writer'] = writerPar; 145 | print(json.dumps(jobTemplate, indent=4, sort_keys=True)) 146 | 147 | def readPluginTemplate(plugin): 148 | with open(plugin, 'r') as f: 149 | return json.load(f) 150 | 151 | def isUrl(path): 152 | if not path: 153 | return False 154 | 155 | assert (isinstance(path, str)) 156 | m = re.match(r"^http[s]?://\S+\w*", path.lower()) 157 | if m: 158 | return True 159 | else: 160 | return False 161 | 162 | 163 | def buildStartCommand(options, args): 164 | commandMap = {} 165 | tempJVMCommand = DEFAULT_JVM 166 | if options.jvmParameters: 167 | tempJVMCommand = tempJVMCommand + " " + options.jvmParameters 168 | 169 | if options.remoteDebug: 170 | 
tempJVMCommand = tempJVMCommand + " " + REMOTE_DEBUG_CONFIG 171 | print('local ip: ', getLocalIp()) 172 | 173 | if options.loglevel: 174 | tempJVMCommand = tempJVMCommand + " " + ("-Dloglevel=%s" % (options.loglevel)) 175 | 176 | if options.mode: 177 | commandMap["mode"] = options.mode 178 | 179 | # jobResource may be a URL or a local file path (relative or absolute) 180 | jobResource = args[0] 181 | if not isUrl(jobResource): 182 | jobResource = os.path.abspath(jobResource) 183 | if jobResource.lower().startswith("file://"): 184 | jobResource = jobResource[len("file://"):] 185 | 186 | jobParams = ("-Dlog.file.name=%s") % (jobResource[-20:].replace('/', '_').replace('.', '_')) 187 | if options.params: 188 | jobParams = jobParams + " " + options.params 189 | 190 | if options.jobid: 191 | commandMap["jobid"] = options.jobid 192 | 193 | commandMap["jvm"] = tempJVMCommand 194 | commandMap["params"] = jobParams 195 | commandMap["job"] = jobResource 196 | 197 | return Template(ENGINE_COMMAND).substitute(**commandMap) 198 | 199 | 200 | def printCopyright(): 201 | print(''' 202 | DataX (%s), From Alibaba ! 203 | Copyright (C) 2010-2017, Alibaba Group. All Rights Reserved. 204 | 205 | ''' % DATAX_VERSION) 206 | sys.stdout.flush() 207 | 208 | 209 | if __name__ == "__main__": 210 | printCopyright() 211 | parser = getOptionParser() 212 | options, args = parser.parse_args(sys.argv[1:]) 213 | if options.reader is not None and options.writer is not None: 214 | generateJobConfigTemplate(options.reader,options.writer) 215 | sys.exit(RET_STATE['OK']) 216 | if len(args) != 1: 217 | parser.print_help() 218 | sys.exit(RET_STATE['FAIL']) 219 | 220 | startCommand = buildStartCommand(options, args) 221 | # print startCommand 222 | 223 | child_process = subprocess.Popen(startCommand, shell=True) 224 | register_signal() 225 | (stdout, stderr) = child_process.communicate() 226 | 227 | sys.exit(child_process.returncode) 228 | -------------------------------------------------------------------------------- /perftrace.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | 5 | """ 6 | Life's short, Python more. 7 | """ 8 | 9 | import re 10 | import os 11 | import sys 12 | import json 13 | import uuid 14 | import signal 15 | import time 16 | import subprocess 17 | from optparse import OptionParser 18 | 19 | 20 | 21 | ##begin cli & help logic 22 | def getOptionParser(): 23 | usage = getUsage() 24 | parser = OptionParser(usage = usage) 25 | #rdbms reader and writer 26 | parser.add_option('-r', '--reader', action='store', dest='reader', help='trace datasource read performance with specified !json! string') 27 | parser.add_option('-w', '--writer', action='store', dest='writer', help='trace datasource write performance with specified !json! 
string') 28 | 29 | parser.add_option('-c', '--channel', action='store', dest='channel', default='1', help='the number of concurrent sync threads, the default is 1') 30 | parser.add_option('-f', '--file', action='store', help='an existing datax configuration file, including reader and writer params') 31 | parser.add_option('-t', '--type', action='store', default='reader', help='which side\'s performance to trace, used together with -f --file, must be reader or writer') 32 | parser.add_option('-d', '--delete', action='store', default='true', help='delete temporary files, the default value is true') 33 | #parser.add_option('-h', '--help', action='store', default='true', help='print usage information') 34 | return parser 35 | 36 | def getUsage(): 37 | return ''' 38 | The following params are available for -r --reader: 39 | [these params are for the rdbms reader, used to trace rdbms read performance; they match datax's keys] 40 | *datasourceType: datasource type, may be mysql|drds|oracle|ads|sqlserver|postgresql|db2 etc... 41 | *jdbcUrl: datasource jdbc connection string, mysql as an example: jdbc:mysql://ip:port/database 42 | *username: username for datasource 43 | *password: password for datasource 44 | *table: table name to read data from 45 | column: columns to be read, the default value is ['*'] 46 | splitPk: the splitPk column of the rdbms table 47 | where: limit the scope of the performance data set 48 | fetchSize: how many rows are fetched in each round trip 49 | 50 | [these params are for the stream reader, used to trace rdbms write performance] 51 | reader-sliceRecordCount: how many test records to mock (per channel), the default value is 10000 52 | reader-column: columns the stream reader uses to generate test data (supported types: string|long|date|double|bool|bytes; constant values and the random function are supported), demo: [{"type":"string","value":"abc"},{"type":"string","random":"10,20"}] 53 | 54 | The following params are available for -w --writer: 55 | [these params are for the rdbms writer, used to trace rdbms write performance; they match datax's keys] 56 | datasourceType: datasource type, may be mysql|drds|oracle|ads|sqlserver|postgresql|db2 etc... 
57 | *jdbcUrl: datasource jdbc connection string, mysql as an example: jdbc:mysql://ip:port/database 58 | *username: username for datasource 59 | *password: password for datasource 60 | *table: table name to write data into 61 | column: columns to be written, the default value is ['*'] 62 | batchSize: how many rows are written in each batch, the default value is 512 63 | preSql: sql to be executed before writing data, the default value is '' 64 | postSql: sql to be executed after writing data, the default value is '' 65 | url: required for ads, pattern is ip:port 66 | schme: required for ads, ads database name 67 | 68 | [these params are for the stream writer, used to trace rdbms read performance] 69 | writer-print: true means print data read from the source datasource, the default value is false 70 | 71 | The following params are available for global control: 72 | -c --channel: the number of concurrent tasks, the default value is 1 73 | -f --file: path to an existing, complete dataX configuration file 74 | -t --type: test read or write performance for a datasource, must be reader or writer, used together with -f --file 75 | -h --help: print help message 76 | 77 | some demo: 78 | perftrace.py --channel=10 --reader='{"jdbcUrl":"jdbc:mysql://127.0.0.1:3306/database", "username":"", "password":"", "table": "", "where":"", "splitPk":"", "writer-print":"false"}' 79 | perftrace.py --channel=10 --writer='{"jdbcUrl":"jdbc:mysql://127.0.0.1:3306/database", "username":"", "password":"", "table": "", "reader-sliceRecordCount": "10000", "reader-column": [{"type":"string","value":"abc"},{"type":"string","random":"10,20"}]}' 80 | perftrace.py --file=/tmp/datax.job.json --type=reader --reader='{"writer-print": "false"}' 81 | perftrace.py --file=/tmp/datax.job.json --type=writer --writer='{"reader-sliceRecordCount": "10000", "reader-column": [{"type":"string","value":"abc"},{"type":"string","random":"10,20"}]}' 82 | 83 | some example jdbc url patterns that may help: 84 | jdbc:oracle:thin:@ip:port:database 85 | jdbc:mysql://ip:port/database 86 | jdbc:sqlserver://ip:port;DatabaseName=database 87 | jdbc:postgresql://ip:port/database 88 | warn: ads url pattern is ip:port 89 | warn: testing write performance will write data into your table; you can use a temporary table just for testing. 90 | ''' 91 | 92 | def printCopyright(): 93 | DATAX_VERSION = 'UNKNOWN_DATAX_VERSION' 94 | print(''' 95 | DataX Util Tools (%s), From Alibaba ! 96 | Copyright (C) 2010-2016, Alibaba Group. All Rights Reserved.''' % DATAX_VERSION) 97 | sys.stdout.flush() 98 | 99 | 100 | def yesNoChoice(): 101 | yes = set(['yes','y', 'ye', '']) 102 | no = set(['no','n']) 103 | choice = input().lower() 104 | if choice in yes: 105 | return True 106 | elif choice in no: 107 | return False 108 | else: 109 | sys.stdout.write("Please respond with 'yes' or 'no'") 110 | ##end cli & help logic 111 | 112 | 113 | ##begin process logic 114 | def suicide(signum, e): 115 | global childProcess 116 | print("[Error] Receive unexpected signal %d, starts to suicide." % (signum), file=sys.stderr) 117 | if childProcess: 118 | childProcess.send_signal(signal.SIGQUIT) 119 | time.sleep(1) 120 | childProcess.kill() 121 | print("DataX Process was killed ! you did ?", file=sys.stderr) 
122 | sys.exit(-1) 123 | 124 | 125 | def registerSignal(): 126 | global childProcess 127 | signal.signal(2, suicide) 128 | signal.signal(3, suicide) 129 | signal.signal(15, suicide) 130 | 131 | 132 | def fork(command, isShell=False): 133 | global childProcess 134 | childProcess = subprocess.Popen(command, shell = isShell) 135 | registerSignal() 136 | (stdout, stderr) = childProcess.communicate() 137 | #阻塞直到子进程结束 138 | childProcess.wait() 139 | return childProcess.returncode 140 | ##end process logic 141 | 142 | 143 | ##begin datax json generate logic 144 | #warn: if not '': -> true; if not None: -> true 145 | def notNone(obj, context): 146 | if not obj: 147 | raise Exception("Configuration property [%s] could not be blank!" % (context)) 148 | 149 | def attributeNotNone(obj, attributes): 150 | for key in attributes: 151 | notNone(obj.get(key), key) 152 | 153 | def isBlank(value): 154 | if value is None or len(value.strip()) == 0: 155 | return True 156 | return False 157 | 158 | def parsePluginName(jdbcUrl, pluginType): 159 | import re 160 | #warn: drds 161 | name = 'pluginName' 162 | mysqlRegex = re.compile('jdbc:(mysql)://.*') 163 | if (mysqlRegex.match(jdbcUrl)): 164 | name = 'mysql' 165 | postgresqlRegex = re.compile('jdbc:(postgresql)://.*') 166 | if (postgresqlRegex.match(jdbcUrl)): 167 | name = 'postgresql' 168 | oracleRegex = re.compile('jdbc:(oracle):.*') 169 | if (oracleRegex.match(jdbcUrl)): 170 | name = 'oracle' 171 | sqlserverRegex = re.compile('jdbc:(sqlserver)://.*') 172 | if (sqlserverRegex.match(jdbcUrl)): 173 | name = 'sqlserver' 174 | db2Regex = re.compile('jdbc:(db2)://.*') 175 | if (db2Regex.match(jdbcUrl)): 176 | name = 'db2' 177 | return "%s%s" % (name, pluginType) 178 | 179 | def renderDataXJson(paramsDict, readerOrWriter = 'reader', channel = 1): 180 | dataxTemplate = { 181 | "job": { 182 | "setting": { 183 | "speed": { 184 | "channel": 1 185 | } 186 | }, 187 | "content": [ 188 | { 189 | "reader": { 190 | "name": "", 191 | "parameter": { 192 | "username": "", 193 | "password": "", 194 | "sliceRecordCount": "10000", 195 | "column": [ 196 | "*" 197 | ], 198 | "connection": [ 199 | { 200 | "table": [], 201 | "jdbcUrl": [] 202 | } 203 | ] 204 | } 205 | }, 206 | "writer": { 207 | "name": "", 208 | "parameter": { 209 | "print": "false", 210 | "connection": [ 211 | { 212 | "table": [], 213 | "jdbcUrl": '' 214 | } 215 | ] 216 | } 217 | } 218 | } 219 | ] 220 | } 221 | } 222 | dataxTemplate['job']['setting']['speed']['channel'] = channel 223 | dataxTemplateContent = dataxTemplate['job']['content'][0] 224 | 225 | pluginName = '' 226 | if paramsDict.get('datasourceType'): 227 | pluginName = '%s%s' % (paramsDict['datasourceType'], readerOrWriter) 228 | elif paramsDict.get('jdbcUrl'): 229 | pluginName = parsePluginName(paramsDict['jdbcUrl'], readerOrWriter) 230 | elif paramsDict.get('url'): 231 | pluginName = 'adswriter' 232 | 233 | theOtherSide = 'writer' if readerOrWriter == 'reader' else 'reader' 234 | dataxPluginParamsContent = dataxTemplateContent.get(readerOrWriter).get('parameter') 235 | dataxPluginParamsContent.update(paramsDict) 236 | 237 | dataxPluginParamsContentOtherSide = dataxTemplateContent.get(theOtherSide).get('parameter') 238 | 239 | if readerOrWriter == 'reader': 240 | dataxTemplateContent.get('reader')['name'] = pluginName 241 | dataxTemplateContent.get('writer')['name'] = 'streamwriter' 242 | if paramsDict.get('writer-print'): 243 | dataxPluginParamsContentOtherSide['print'] = paramsDict['writer-print'] 244 | del dataxPluginParamsContent['writer-print'] 245 | 
del dataxPluginParamsContentOtherSide['connection'] 246 | if readerOrWriter == 'writer': 247 | dataxTemplateContent.get('reader')['name'] = 'streamreader' 248 | dataxTemplateContent.get('writer')['name'] = pluginName 249 | if paramsDict.get('reader-column'): 250 | dataxPluginParamsContentOtherSide['column'] = paramsDict['reader-column'] 251 | del dataxPluginParamsContent['reader-column'] 252 | if paramsDict.get('reader-sliceRecordCount'): 253 | dataxPluginParamsContentOtherSide['sliceRecordCount'] = paramsDict['reader-sliceRecordCount'] 254 | del dataxPluginParamsContent['reader-sliceRecordCount'] 255 | del dataxPluginParamsContentOtherSide['connection'] 256 | 257 | if paramsDict.get('jdbcUrl'): 258 | if readerOrWriter == 'reader': 259 | dataxPluginParamsContent['connection'][0]['jdbcUrl'].append(paramsDict['jdbcUrl']) 260 | else: 261 | dataxPluginParamsContent['connection'][0]['jdbcUrl'] = paramsDict['jdbcUrl'] 262 | if paramsDict.get('table'): 263 | dataxPluginParamsContent['connection'][0]['table'].append(paramsDict['table']) 264 | 265 | 266 | traceJobJson = json.dumps(dataxTemplate, indent = 4) 267 | return traceJobJson 268 | 269 | def isUrl(path): 270 | if not path: 271 | return False 272 | if not isinstance(path, str): 273 | raise Exception('Configuration file path must be a string, but you configured: %s' % path) 274 | m = re.match(r"^http[s]?://\S+\w*", path.lower()) 275 | if m: 276 | return True 277 | else: 278 | return False 279 | 280 | 281 | def readJobJsonFromLocal(jobConfigPath): 282 | jobConfigContent = None 283 | jobConfigPath = os.path.abspath(jobConfigPath) 284 | file = open(jobConfigPath) 285 | try: 286 | jobConfigContent = file.read() 287 | finally: 288 | file.close() 289 | if not jobConfigContent: 290 | raise Exception("Your job configuration file is empty, please check that the configuration is valid, path: [%s]\nconfiguration:\n%s" % (jobConfigPath, str(jobConfigContent))) 291 | return jobConfigContent 292 | 293 | 294 | def readJobJsonFromRemote(jobConfigPath): 295 | import urllib.request 296 | conn = urllib.request.urlopen(jobConfigPath) 297 | jobJson = conn.read().decode('utf-8') 298 | return jobJson 299 | 300 | def parseJson(strConfig, context): 301 | try: 302 | return json.loads(strConfig) 303 | except Exception as e: 304 | import traceback 305 | traceback.print_exc() 306 | sys.stdout.flush() 307 | print('%s %s must conform to json syntax' % (context, strConfig), file=sys.stderr) 308 | sys.exit(-1) 309 | 310 | def convert(options, args): 311 | traceJobJson = '' 312 | if options.file: 313 | if isUrl(options.file): 314 | traceJobJson = readJobJsonFromRemote(options.file) 315 | else: 316 | traceJobJson = readJobJsonFromLocal(options.file) 317 | traceJobDict = parseJson(traceJobJson, '%s content' % options.file) 318 | attributeNotNone(traceJobDict, ['job']) 319 | attributeNotNone(traceJobDict['job'], ['content']) 320 | attributeNotNone(traceJobDict['job']['content'][0], ['reader', 'writer']) 321 | attributeNotNone(traceJobDict['job']['content'][0]['reader'], ['name', 'parameter']) 322 | attributeNotNone(traceJobDict['job']['content'][0]['writer'], ['name', 'parameter']) 323 | if options.type == 'reader': 324 | traceJobDict['job']['content'][0]['writer']['name'] = 'streamwriter' 325 | if options.reader: 326 | traceReaderDict = parseJson(options.reader, 'reader config') 327 | if traceReaderDict.get('writer-print') is not None: 328 | traceJobDict['job']['content'][0]['writer']['parameter']['print'] = traceReaderDict.get('writer-print') 329 | else: 330 | 
traceJobDict['job']['content'][0]['writer']['parameter']['print'] = 'false' 331 | else: 332 | traceJobDict['job']['content'][0]['writer']['parameter']['print'] = 'false' 333 | elif options.type == 'writer': 334 | traceJobDict['job']['content'][0]['reader']['name'] = 'streamreader' 335 | if options.writer: 336 | traceWriterDict = parseJson(options.writer, 'writer config') 337 | if traceWriterDict.get('reader-column'): 338 | traceJobDict['job']['content'][0]['reader']['parameter']['column'] = traceWriterDict['reader-column'] 339 | if traceWriterDict.get('reader-sliceRecordCount'): 340 | traceJobDict['job']['content'][0]['reader']['parameter']['sliceRecordCount'] = traceWriterDict['reader-sliceRecordCount'] 341 | else: 342 | columnSize = len(traceJobDict['job']['content'][0]['writer']['parameter']['column']) 343 | streamReaderColumn = [] 344 | for i in range(columnSize): 345 | streamReaderColumn.append({"type": "long", "random": "2,10"}) 346 | traceJobDict['job']['content'][0]['reader']['parameter']['column'] = streamReaderColumn 347 | traceJobDict['job']['content'][0]['reader']['parameter']['sliceRecordCount'] = 10000 348 | else: 349 | pass#do nothing 350 | return json.dumps(traceJobDict, indent = 4) 351 | elif options.reader: 352 | traceReaderDict = parseJson(options.reader, 'reader config') 353 | return renderDataXJson(traceReaderDict, 'reader', options.channel) 354 | elif options.writer: 355 | traceWriterDict = parseJson(options.writer, 'writer config') 356 | return renderDataXJson(traceWriterDict, 'writer', options.channel) 357 | else: 358 | print(getUsage()) 359 | sys.exit(-1) 360 | #dataxParams = {} 361 | #for opt, value in options.__dict__.items(): 362 | # dataxParams[opt] = value 363 | ##end datax json generate logic 364 | 365 | 366 | if __name__ == "__main__": 367 | printCopyright() 368 | parser = getOptionParser() 369 | 370 | options, args = parser.parse_args(sys.argv[1:]) 371 | #print options, args 372 | dataxTraceJobJson = convert(options, args) 373 | 374 | #由MAC地址、当前时间戳、随机数生成,可以保证全球范围内的唯一性 375 | dataxJobPath = os.path.join(os.getcwd(), "perftrace-" + str(uuid.uuid1())) 376 | jobConfigOk = True 377 | if os.path.exists(dataxJobPath): 378 | print("file already exists, truncate and rewrite it? %s" % dataxJobPath) 379 | if yesNoChoice(): 380 | jobConfigOk = True 381 | else: 382 | print("exit failed, because of file conflict") 383 | sys.exit(-1) 384 | fileWriter = open(dataxJobPath, 'w') 385 | fileWriter.write(dataxTraceJobJson) 386 | fileWriter.close() 387 | 388 | 389 | print("trace environments:") 390 | print("dataxJobPath: %s" % dataxJobPath) 391 | dataxHomePath = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 392 | print("dataxHomePath: %s" % dataxHomePath) 393 | 394 | dataxCommand = "%s %s" % (os.path.join(dataxHomePath, "bin", "datax.py"), dataxJobPath) 395 | print("dataxCommand: %s" % dataxCommand) 396 | 397 | returncode = fork(dataxCommand, True) 398 | if options.delete == 'true': 399 | os.remove(dataxJobPath) 400 | sys.exit(returncode) 401 | --------------------------------------------------------------------------------
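For reference, here is a small sketch of how perftrace.py assembles a throwaway DataX job from a reader spec before handing it to datax.py (assuming perftrace.py is importable from the current directory; the connection values below are placeholders, not a real database):

    import perftrace

    readerSpec = {
        "jdbcUrl": "jdbc:mysql://127.0.0.1:3306/demo",   # placeholder URL
        "username": "tester",                            # placeholder credentials
        "password": "secret",
        "table": "orders",                               # placeholder table
        "writer-print": "false",
    }

    # Builds a job whose read side is mysqlreader (derived from the jdbcUrl)
    # and whose write side is streamwriter, with speed.channel set to 10.
    jobJson = perftrace.renderDataXJson(readerSpec, 'reader', channel=10)
    print(jobJson)

When run from the command line, perftrace.py writes this JSON to a perftrace-<uuid> file in the current directory, launches {datax_home}/bin/datax.py against it, and removes the file afterwards unless --delete is set to something other than true.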