├── README.md ├── bin ├── hdata └── hdata.bat ├── conf ├── hdata.xml ├── log4j2.xml └── plugins.xml ├── job-examples ├── ftp-ftp.xml ├── hbase-console.xml ├── hdfs-hive.xml ├── hdfs-jdbc.xml ├── hive-jdbc.xml ├── jdbc-hbase.xml ├── jdbc-hdfs.xml ├── jdbc-hive.xml ├── jdbc-jdbc.xml ├── jdbc-mongodb.xml ├── job.xml └── mongodb-console.xml ├── pom.xml └── src └── main └── java └── opensource └── hdata ├── CliDriver.java ├── common ├── Constants.java └── HDataConfigConstants.java ├── config ├── Configuration.java ├── EngineConfig.java ├── JobConfig.java └── PluginConfig.java ├── core ├── DefaultRecord.java ├── Fields.java ├── HData.java ├── JobContext.java ├── Metric.java ├── OutputFieldsDeclarer.java ├── PluginLoader.java ├── ReaderWorker.java ├── RecordEvent.java ├── RecordWorkHandler.java ├── Storage.java ├── WaitStrategyFactory.java └── plugin │ ├── AbstractPlugin.java │ ├── Pluginable.java │ ├── Reader.java │ ├── ReaderPlugin.java │ ├── Record.java │ ├── RecordCollector.java │ ├── Splitter.java │ ├── Writer.java │ └── WriterPlugin.java ├── exception └── HDataException.java ├── plugin ├── reader │ ├── ftp │ │ ├── FTPReader.java │ │ ├── FTPReaderProperties.java │ │ └── FTPSplitter.java │ ├── hbase │ │ ├── HBaseReader.java │ │ ├── HBaseReaderProperties.java │ │ └── HBaseSplitter.java │ ├── hdfs │ │ ├── HDFSReader.java │ │ ├── HDFSReaderProperties.java │ │ └── HDFSSplitter.java │ ├── hive │ │ ├── HiveReader.java │ │ ├── HiveReaderProperties.java │ │ └── HiveSplitter.java │ ├── jdbc │ │ ├── JBDCReaderProperties.java │ │ ├── JDBCReader.java │ │ └── JDBCSplitter.java │ └── mongodb │ │ ├── MongoDBReader.java │ │ ├── MongoDBReaderProperties.java │ │ └── MongoDBSplitter.java └── writer │ ├── console │ └── ConsoleWriter.java │ ├── ftp │ ├── FTPWriter.java │ └── FTPWriterProperties.java │ ├── hbase │ ├── HBaseWriter.java │ └── HBaseWriterProperties.java │ ├── hdfs │ ├── HDFSWriter.java │ └── HDFSWriterProperties.java │ ├── hive │ ├── HiveRecordWritable.java │ ├── HiveWriter.java │ └── HiveWriterProperties.java │ ├── jdbc │ ├── JBDCWriterProperties.java │ └── JDBCWriter.java │ └── mongodb │ ├── MongoDBWriter.java │ └── MongoDBWriterProperties.java ├── tool └── SQLExecuteTool.java └── util ├── EscaperUtils.java ├── FTPUtils.java ├── HiveMetaStoreUtils.java ├── HiveTypeUtils.java ├── JDBCUtils.java ├── LoggerUtils.java ├── TypeConvertUtils.java ├── Utils.java └── XMLUtils.java /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DataSky/HData/50ff4568fec2538a6f2098311c9ab5ff6737471c/README.md -------------------------------------------------------------------------------- /bin/hdata: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | 4 | CDPATH="" 5 | SCRIPT="$0" 6 | 7 | while [ -h "$SCRIPT" ] ; do 8 | ls=`ls -ld "$SCRIPT"` 9 | link=`expr "$ls" : '.*-> \(.*\)$'` 10 | if expr "$link" : '/.*' > /dev/null; then 11 | SCRIPT="$link" 12 | else 13 | SCRIPT=`dirname "$SCRIPT"`/"$link" 14 | fi 15 | done 16 | 17 | HDATA_HOME=`dirname "$SCRIPT"`/.. 18 | HDATA_HOME=`cd "$HDATA_HOME"; pwd` 19 | HDATA_LIB_DIR=$HDATA_HOME/lib 20 | HDATA_CONF_DIR=$HDATA_HOME/conf 21 | 22 | if [ -x "$JAVA_HOME/bin/java" ]; then 23 | JAVA="$JAVA_HOME/bin/java" 24 | else 25 | JAVA=`which java` 26 | fi 27 | 28 | if [ ! -x "$JAVA" ]; then 29 | echo "Could not find any executable java binary. Please install java in your PATH or set JAVA_HOME" 30 | exit 1 31 | fi 32 | 33 | HDATA_CLASSPATH='.' 
34 | for f in $HDATA_LIB_DIR/*.jar; do
35 | HDATA_CLASSPATH=${HDATA_CLASSPATH}:$f;
36 | done
37 |
38 | JAVA_OPTS="$JAVA_OPTS -Dhdata.conf.dir=$HDATA_CONF_DIR"
39 | JAVA_OPTS="$JAVA_OPTS -Dlog4j.configurationFile=file:///$HDATA_CONF_DIR/log4j2.xml"
40 |
41 | MAIN_CLASS="opensource.hdata.CliDriver"
42 | if [ "$1" = "execute-sql" ]; then
43 | MAIN_CLASS="opensource.hdata.tool.SQLExecuteTool"
44 | fi
45 |
46 | exec "$JAVA" $JAVA_OPTS -cp "$HDATA_CLASSPATH" $MAIN_CLASS "$@"
47 |
-------------------------------------------------------------------------------- /bin/hdata.bat: --------------------------------------------------------------------------------
1 | @echo off
2 |
3 | SETLOCAL
4 |
5 | if NOT DEFINED JAVA_HOME goto err
6 |
7 | set SCRIPT_DIR=%~dp0
8 | for %%I in ("%SCRIPT_DIR%..") do set HDATA_HOME=%%~dpfI
9 |
10 | set MAIN_CLASSPATH=.;%HDATA_HOME%\lib\*
11 | set HDATA_CONF_DIR=%HDATA_HOME%\conf
12 |
13 | set JAVA_OPTS=%JAVA_OPTS% -Xss256k
14 | set JAVA_OPTS=%JAVA_OPTS% -XX:+UseParNewGC
15 | set JAVA_OPTS=%JAVA_OPTS% -XX:+UseConcMarkSweepGC
16 |
17 | set JAVA_OPTS=%JAVA_OPTS% -XX:CMSInitiatingOccupancyFraction=75
18 | set JAVA_OPTS=%JAVA_OPTS% -XX:+UseCMSInitiatingOccupancyOnly
19 | set JAVA_OPTS=%JAVA_OPTS% -XX:+HeapDumpOnOutOfMemoryError
20 | set JAVA_OPTS=%JAVA_OPTS% -Dhdata.conf.dir="%HDATA_CONF_DIR%"
21 | set JAVA_OPTS=%JAVA_OPTS% -Dlog4j.configurationFile="file:///%HDATA_CONF_DIR%/log4j2.xml"
22 |
23 | set FIRST_ARG=%1
24 | set MAIN_CLASS="opensource.hdata.CliDriver"
25 | if "%FIRST_ARG%"=="execute-sql" (set MAIN_CLASS="opensource.hdata.tool.SQLExecuteTool")
26 |
27 | "%JAVA_HOME%\bin\java" %JAVA_OPTS% -cp "%MAIN_CLASSPATH%" %MAIN_CLASS% %*
28 |
29 | goto finally
30 |
31 | :err
32 | echo JAVA_HOME environment variable must be set!
33 | pause
34 | 35 |
36 | :finally
37 |
38 | ENDLOCAL
-------------------------------------------------------------------------------- /conf/hdata.xml: --------------------------------------------------------------------------------
1 | 2 | 3 | 4 |
5 | hdata.storage.default.buffer.size
6 | 16384
7 | Default storage buffer size; the value must be a power of two (2^n)
8 | 9 |
10 | hdata.storage.disruptor.wait.strategy
11 | BlockingWaitStrategy
12 | Thread wait strategy; options: BlockingWaitStrategy, BusySpinWaitStrategy, SleepingWaitStrategy, YieldingWaitStrategy
13 | 14 |
15 | hdata.hive.writer.tmp.dir
16 | /tmp
17 | Temporary directory used by the Hive Writer when writing files to HDFS
18 | 19 | 20 |
-------------------------------------------------------------------------------- /conf/log4j2.xml: --------------------------------------------------------------------------------
1 | 2 | 3 | 4 | 5 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 |
-------------------------------------------------------------------------------- /conf/plugins.xml: --------------------------------------------------------------------------------
1 | 2 | 3 | 4 | 5 |
6 | jdbc
7 | opensource.hdata.plugin.reader.jdbc.JDBCReader
8 | opensource.hdata.plugin.reader.jdbc.JDBCSplitter
9 | 10 |
11 | hive
12 | opensource.hdata.plugin.reader.hive.HiveReader
13 | opensource.hdata.plugin.reader.hive.HiveSplitter
14 | 15 |
16 | hdfs
17 | opensource.hdata.plugin.reader.hdfs.HDFSReader
18 | opensource.hdata.plugin.reader.hdfs.HDFSSplitter
19 | 20 |
21 | ftp
22 | opensource.hdata.plugin.reader.ftp.FTPReader
23 | opensource.hdata.plugin.reader.ftp.FTPSplitter
24 | 25 |
26 | mongodb
27 | opensource.hdata.plugin.reader.mongodb.MongoDBReader
28 | opensource.hdata.plugin.reader.mongodb.MongoDBSplitter
29 | 30 |
31 | hbase
32 | opensource.hdata.plugin.reader.hbase.HBaseReader
33 |
opensource.hdata.plugin.reader.hbase.HBaseSplitter 34 | 35 | 36 | 37 | 38 | 39 | console 40 | opensource.hdata.plugin.writer.console.ConsoleWriter 41 | 42 | 43 | jdbc 44 | opensource.hdata.plugin.writer.jdbc.JDBCWriter 45 | 46 | 47 | hive 48 | opensource.hdata.plugin.writer.hive.HiveWriter 49 | 50 | 51 | hdfs 52 | opensource.hdata.plugin.writer.hdfs.HDFSWriter 53 | 54 | 55 | ftp 56 | opensource.hdata.plugin.writer.ftp.FTPWriter 57 | 58 | 59 | mongodb 60 | opensource.hdata.plugin.writer.mongodb.MongoDBWriter 61 | 62 | 63 | hbase 64 | opensource.hdata.plugin.writer.hbase.HBaseWriter 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /job-examples/ftp-ftp.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 192.168.130.161 6 | 1 7 | 1@1 8 | /etldata/input/sa_log/151_125 9 | 10 | serv11-saIntf-pageTime-access-20140407_00.0.log 11 | | 12 | 13 | 1 14 | 15 | 16 | 17 | localhost 18 | 1 19 | 1 20 | /ftp/tmp/1.txt 21 | 1 22 | 23 | 24 | -------------------------------------------------------------------------------- /job-examples/hbase-console.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 192.168.142.16,192.168.142.17,192.168.142.18 6 | 2181 7 | ip_address
8 | :rowkey,cf:start_ip,cf:end_ip,cf:start_ip_num,cf:end_ip_num,cf:country,cf:area,cf:province,cf:city,cf:isp 9 | id,start_ip,end_ip,start_ip_num,end_ip_num,country,area,province,city,isp 10 | 958200 11 | 12 | 2 13 |
14 | 15 | 16 | 1 17 | 18 |
19 | -------------------------------------------------------------------------------- /job-examples/hdfs-hive.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | hdfs://192.168.142.21:8020/tmp/hdata_test 6 | .*\.csv 7 | , 8 | gb18030 9 | bigdata 10 | 1 11 | 12 | 13 | 14 | thrift://192.168.142.21:9083 15 | default 16 | tmp_hdata_rcfile_test
17 | bigdata 18 | 1 19 |
20 |
21 | -------------------------------------------------------------------------------- /job-examples/hdfs-jdbc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | hdfs://192.168.142.21:8020/tmp/hdata_test 6 | hdfs.test 7 | bigdata 8 | 1 9 | 10 | 11 | 12 | org.postgresql.Driver 13 | jdbc:postgresql://localhost:5432/ip 14 | postgres 15 | toor 16 | tmp
17 | 3 18 |
19 |
20 | -------------------------------------------------------------------------------- /job-examples/hive-jdbc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | thrift://192.168.142.21:9083 6 | bi_td 7 | tdm_common_td
8 | 9 | 1 10 |
11 | 12 | 13 | org.postgresql.Driver 14 | jdbc:postgresql://localhost:5432/tmp 15 | postgres 16 | toor 17 | tdm_common_td
18 | 3 19 |
20 |
21 | -------------------------------------------------------------------------------- /job-examples/jdbc-hbase.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | org.postgresql.Driver 6 | jdbc:postgresql://10.22.8.140:5432/ip 7 | postgres 8 | toor 9 | ip_address
10 | 11 | 12 | 13 | 14 | 15 | 1 16 |
17 | 18 | 19 | 192.168.142.16,192.168.142.17,192.168.142.18,192.168.142.19,192.168.142.20,192.168.142.21,192.168.142.23,192.168.142.24,192.168.142.25,192.168.142.26,192.168.142.27 20 | 2181 21 | ip_address
22 | :rowkey,cf:start_ip,cf:end_ip,cf:start_ip_num,cf:end_ip_num,cf:country,cf:area,cf:province,cf:city,cf:isp 23 | 10000 24 | 1 25 |
26 |
27 | -------------------------------------------------------------------------------- /job-examples/jdbc-hdfs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | org.postgresql.Driver 6 | jdbc:postgresql://10.22.8.140:5432/ip 7 | postgres 8 | toor 9 | ip_address
10 | 11 | 12 | 13 | 14 | 15 | 3 16 |
17 | 18 | 19 | hdfs://192.168.142.21:8020/tmp/hdata_test/hdfs.test 20 | bigdata 21 | 1 22 | 23 |
24 | -------------------------------------------------------------------------------- /job-examples/jdbc-hive.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | org.postgresql.Driver 6 | jdbc:postgresql://localhost:5432/ip 7 | postgres 8 | toor 9 | ip_address
10 | 11 | 12 | 13 | 14 | 15 | 3 16 |
17 | 18 | 19 | thrift://192.168.142.21:9083 20 | default 21 | tmp_hdata_rcfile_test_p
22 | p=20140407 23 | bigdata 24 | 3 25 |
26 |
27 | -------------------------------------------------------------------------------- /job-examples/jdbc-jdbc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | org.postgresql.Driver 6 | jdbc:postgresql://localhost:5432/ip 7 | postgres 8 | toor 9 | ip_address
10 | 11 | 12 | 13 | 14 | 15 | 3 16 |
17 | 18 | 19 | org.postgresql.Driver 20 | jdbc:postgresql://localhost:5432/ip 21 | postgres 22 | toor 23 | tmp
24 | 10000 25 | 3 26 |
27 |
28 | -------------------------------------------------------------------------------- /job-examples/jdbc-mongodb.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | org.postgresql.Driver 6 | jdbc:postgresql://localhost:5432/ip 7 | postgres 8 | toor 9 | ip_address
10 | 11 | 12 | 13 | 14 | 15 | 3 16 |
17 | 18 | 19 | mongodb://localhost/test.ip 20 | 21 | 3 22 | 23 |
24 | -------------------------------------------------------------------------------- /job-examples/job.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | com.mysql.jdbc.Driver 6 | jdbc:mysql://localhost:3306/test 7 | root 8 | toor 9 | ip_address
10 | 11 | 12 | 13 | 14 | 15 | 7 16 |
17 | 18 | 19 | com.mysql.jdbc.Driver 20 | jdbc:mysql://localhost:3306/test?useUnicode=true&characterEncoding=UTF-8 21 | root 22 | toor 23 | tmp
24 | 10000 25 | 3 26 |
27 |
28 | -------------------------------------------------------------------------------- /job-examples/mongodb-console.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | mongodb://localhost/test.ip 6 | {"city":"南京市"} 7 | 1 8 | 9 | 10 | 11 | 1 12 | 13 | 14 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | opensource 5 | hdata 6 | hdata 7 | 0.1 8 | 9 | 10 | UTF-8 11 | 1.2.1 12 | 0.12.0 13 | 0.94.16 14 | 15 | 16 | 2014 17 | 18 | 19 | Jayer 20 | dczxxuhai@gmail.com 21 | 22 | 23 | 24 | 25 | 26 | org.apache.logging.log4j 27 | log4j-api 28 | 2.0-rc1 29 | 30 | 31 | org.apache.logging.log4j 32 | log4j-core 33 | 2.0-rc1 34 | 35 | 36 | com.google.guava 37 | guava 38 | 16.0.1 39 | 40 | 41 | com.lmax 42 | disruptor 43 | 3.2.1 44 | 45 | 46 | commons-cli 47 | commons-cli 48 | 1.2 49 | 50 | 51 | org.apache.commons 52 | commons-lang3 53 | 3.3.2 54 | 55 | 56 | commons-cli 57 | commons-cli 58 | 1.2 59 | 60 | 61 | org.jdom 62 | jdom2 63 | 2.0.5 64 | 65 | 66 | javassist 67 | javassist 68 | 3.18.1-GA 69 | 70 | 71 | org.antlr 72 | antlr-runtime 73 | 3.4 74 | 75 | 76 | commons-configuration 77 | commons-configuration 78 | 1.9 79 | 80 | 81 | commons-lang 82 | commons-lang 83 | 2.6 84 | 85 | 86 | commons-logging 87 | commons-logging 88 | 1.1.1 89 | 90 | 91 | commons-net 92 | commons-net 93 | 3.3 94 | 95 | 96 | log4j 97 | log4j 98 | 1.2.17 99 | 100 | 101 | org.slf4j 102 | slf4j-api 103 | 1.7.6 104 | 105 | 106 | org.slf4j 107 | slf4j-log4j12 108 | 1.7.6 109 | 110 | 111 | org.apache.hive 112 | hive-exec 113 | ${hiveVersion} 114 | 115 | 116 | org.apache.hive 117 | hive-metastore 118 | ${hiveVersion} 119 | 120 | 121 | org.apache.hadoop 122 | hadoop-core 123 | ${hadoopVersion} 124 | 125 | 126 | org.apache.hbase 127 | hbase 128 | ${hbaseVersion} 129 | 130 | 131 | org.apache.zookeeper 132 | zookeeper 133 | 3.4.6 134 | 135 | 136 | org.mongodb 137 | mongo-java-driver 138 | 2.12.0 139 | 140 | 141 | javax.jdo 142 | jdo-api 143 | 3.0.1 144 | 145 | 146 | org.apache.thrift 147 | libfb303 148 | 0.9.0 149 | 150 | 151 | org.datanucleus 152 | datanucleus-api-jdo 153 | 3.2.1 154 | 155 | 156 | org.datanucleus 157 | datanucleus-core 158 | 3.2.2 159 | 160 | 161 | org.datanucleus 162 | datanucleus-rdbms 163 | 3.2.1 164 | 165 | 166 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/CliDriver.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | import java.util.Map.Entry; 6 | import java.util.Properties; 7 | 8 | import opensource.hdata.config.JobConfig; 9 | import opensource.hdata.config.PluginConfig; 10 | import opensource.hdata.core.HData; 11 | 12 | import org.apache.commons.cli.CommandLine; 13 | import org.apache.commons.cli.CommandLineParser; 14 | import org.apache.commons.cli.HelpFormatter; 15 | import org.apache.commons.cli.OptionBuilder; 16 | import org.apache.commons.cli.Options; 17 | import org.apache.commons.cli.ParseException; 18 | import org.apache.commons.cli.PosixParser; 19 | 20 | public class CliDriver { 21 | 22 | private static final String XML_FILE = "f"; 23 | private static final String HDATA_VARS = "var"; 24 | 25 | /** 26 | * 创建命令行选项 27 | * 28 | * @return 29 | */ 30 | public Options createOptions() { 31 | Options options = new Options(); 32 | 
options.addOption(XML_FILE, null, true, "job xml path"); 33 | OptionBuilder.withValueSeparator(); 34 | OptionBuilder.hasArgs(2); 35 | OptionBuilder.withArgName("property=value"); 36 | OptionBuilder.withLongOpt(HDATA_VARS); 37 | options.addOption(OptionBuilder.create()); 38 | return options; 39 | } 40 | 41 | /** 42 | * 打印命令行帮助信息 43 | * 44 | * @param options 45 | */ 46 | public void printHelp(Options options) { 47 | HelpFormatter formatter = new HelpFormatter(); 48 | formatter.printHelp(" ", options); 49 | } 50 | 51 | /** 52 | * 替换命令行变量 53 | * 54 | * @param config 55 | * @param vars 56 | */ 57 | public void replaceConfigVars(PluginConfig config, Map vars) { 58 | for (Entry confEntry : config.entrySet()) { 59 | if (confEntry.getKey().getClass() == String.class && confEntry.getValue().getClass() == String.class) { 60 | for (Entry varEntry : vars.entrySet()) { 61 | String replaceVar = "${" + varEntry.getKey() + "}"; 62 | if (confEntry.getValue().toString().contains(replaceVar)) { 63 | config.put(confEntry.getKey(), confEntry.getValue().toString().replace(replaceVar, varEntry.getValue())); 64 | } 65 | } 66 | } 67 | } 68 | } 69 | 70 | /** 71 | * 主程序入口 72 | * 73 | * @param args 74 | */ 75 | public static void main(String[] args) { 76 | CliDriver cliDriver = new CliDriver(); 77 | Options options = cliDriver.createOptions(); 78 | if (args.length < 1) { 79 | cliDriver.printHelp(options); 80 | System.exit(-1); 81 | } 82 | 83 | CommandLineParser parser = new PosixParser(); 84 | CommandLine cmd = null; 85 | try { 86 | cmd = parser.parse(options, args); 87 | String jobXmlPath = cmd.getOptionValue(XML_FILE); 88 | JobConfig jobConfig = new JobConfig(jobXmlPath); 89 | Map vars = new HashMap(); 90 | Properties properties = cmd.getOptionProperties(HDATA_VARS); 91 | for (String key : properties.stringPropertyNames()) { 92 | vars.put(key, properties.getProperty(key)); 93 | } 94 | 95 | final PluginConfig readerConfig = jobConfig.getReaderConfig(); 96 | final PluginConfig writerConfig = jobConfig.getWriterConfig(); 97 | 98 | cliDriver.replaceConfigVars(readerConfig, vars); 99 | cliDriver.replaceConfigVars(writerConfig, vars); 100 | 101 | HData hData = new HData(); 102 | hData.start(jobConfig); 103 | } catch (ParseException e) { 104 | cliDriver.printHelp(options); 105 | System.exit(-1); 106 | } 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/common/Constants.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.common; 2 | 3 | public class Constants { 4 | 5 | public static final String HDATA_XML = "hdata.xml"; 6 | public static final String PLUGINS_XML = "plugins.xml"; 7 | public static final String LOG4J2_XML = "log4j2.xml"; 8 | public static final String DATE_FORMAT_STRING = "yyyy-MM-dd HH:mm:ss"; 9 | public static final String COLUMNS_SPLIT_REGEX = "\\s*,\\s*"; 10 | 11 | private Constants() { 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/common/HDataConfigConstants.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.common; 2 | 3 | public class HDataConfigConstants { 4 | 5 | public static final String STORAGE_BUFFER_SIZE = "hdata.storage.default.buffer.size"; 6 | public static final String HDATA_STORAGE_DISRUPTOR_WAIT_STRATEGY = "hdata.storage.disruptor.wait.strategy"; 7 | public static final String HDATA_SLEEP_MILLIS 
= "hdata.sleep.millis"; 8 | 9 | private HDataConfigConstants() { 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/config/Configuration.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.config; 2 | 3 | import java.util.Properties; 4 | 5 | public abstract class Configuration extends Properties { 6 | 7 | private static final long serialVersionUID = 8606831740240321865L; 8 | 9 | public String getString(String key, String defalutValue) { 10 | String value = getProperty(key); 11 | return value != null ? value : defalutValue; 12 | } 13 | 14 | public String getString(String key) { 15 | return getProperty(key); 16 | } 17 | 18 | public void setString(String key, String value) { 19 | setProperty(key, value); 20 | } 21 | 22 | public int getInt(String key, int defalutValue) { 23 | String value = getProperty(key); 24 | return value != null ? Integer.parseInt(value) : defalutValue; 25 | } 26 | 27 | public void setInt(String key, int value) { 28 | setString(key, Integer.toString(value)); 29 | } 30 | 31 | public long getLong(String key, long defalutValue) { 32 | String value = getProperty(key); 33 | return value != null ? Long.parseLong(value) : defalutValue; 34 | } 35 | 36 | public void setLong(String key, long value) { 37 | setString(key, Long.toString(value)); 38 | } 39 | 40 | public double getDouble(String key, double defalutValue) { 41 | String value = getProperty(key); 42 | return value != null ? Double.parseDouble(value) : defalutValue; 43 | } 44 | 45 | public void setDouble(String key, double value) { 46 | setString(key, Double.toString(value)); 47 | } 48 | 49 | public boolean getBoolean(String key, boolean defalutValue) { 50 | String value = getProperty(key); 51 | return value != null ? Boolean.parseBoolean(value) : defalutValue; 52 | } 53 | 54 | public void setBoolean(String key, boolean value) { 55 | setString(key, Boolean.toString(value)); 56 | } 57 | 58 | public float getFloat(String key, float defalutValue) { 59 | String value = getProperty(key); 60 | return value != null ? 
Float.parseFloat(value) : defalutValue; 61 | } 62 | 63 | public void setFloat(String key, float value) { 64 | setString(key, Float.toString(value)); 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/config/EngineConfig.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.config; 2 | 3 | import java.util.List; 4 | 5 | import opensource.hdata.common.Constants; 6 | import opensource.hdata.exception.HDataException; 7 | import opensource.hdata.util.Utils; 8 | import opensource.hdata.util.XMLUtils; 9 | 10 | import org.jdom2.Element; 11 | 12 | public class EngineConfig extends Configuration { 13 | 14 | private static final long serialVersionUID = -4751544524691015405L; 15 | 16 | private EngineConfig() { 17 | super(); 18 | } 19 | 20 | public static EngineConfig create() { 21 | EngineConfig conf = new EngineConfig(); 22 | Element root = null; 23 | try { 24 | root = XMLUtils.load(Utils.getConfigDir() + Constants.HDATA_XML); 25 | } catch (Exception e) { 26 | throw new HDataException("Init EngineConf error!", e); 27 | } 28 | List list = root.getChildren("property"); 29 | 30 | for (Element element : list) { 31 | conf.setString(element.getChildText("name"), element.getChildText("value")); 32 | } 33 | return conf; 34 | } 35 | 36 | } 37 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/config/JobConfig.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.config; 2 | 3 | import opensource.hdata.core.PluginLoader; 4 | import opensource.hdata.core.plugin.Reader; 5 | import opensource.hdata.core.plugin.Splitter; 6 | import opensource.hdata.core.plugin.Writer; 7 | import opensource.hdata.exception.HDataException; 8 | import opensource.hdata.util.XMLUtils; 9 | 10 | import org.jdom2.Element; 11 | 12 | public class JobConfig extends Configuration { 13 | 14 | private Element root; 15 | private PluginConfig readerConfig; 16 | private PluginConfig writerConfig; 17 | private static final long serialVersionUID = -106497323171420503L; 18 | 19 | public JobConfig() { 20 | super(); 21 | } 22 | 23 | public JobConfig(String jobXmlPath) { 24 | this(); 25 | try { 26 | root = XMLUtils.load(jobXmlPath); 27 | } catch (Exception e) { 28 | throw new HDataException("Can not load job xml file: " + jobXmlPath, e); 29 | } 30 | } 31 | 32 | public PluginConfig getReaderConfig() { 33 | if (readerConfig == null) { 34 | readerConfig = new PluginConfig(); 35 | for (Element e : root.getChild("reader").getChildren()) { 36 | if (!e.getValue().trim().isEmpty()) { 37 | readerConfig.setProperty(e.getName(), e.getValue()); 38 | } 39 | } 40 | } 41 | 42 | return readerConfig; 43 | } 44 | 45 | public PluginConfig getWriterConfig() { 46 | if (writerConfig == null) { 47 | writerConfig = new PluginConfig(); 48 | for (Element e : root.getChild("writer").getChildren()) { 49 | if (!e.getValue().trim().isEmpty()) { 50 | writerConfig.setProperty(e.getName(), e.getValue()); 51 | } 52 | } 53 | } 54 | return writerConfig; 55 | } 56 | 57 | public String getReaderName() { 58 | return root.getChild("reader").getAttributeValue("name"); 59 | } 60 | 61 | public String getReaderClassName() { 62 | return PluginLoader.getReaderPlugin(getReaderName()).getClassName(); 63 | } 64 | 65 | public Reader newReader() { 66 | String readerClassName = getReaderClassName(); 67 | if (readerClassName == null) { 68 | 
throw new HDataException("Can not find class for reader: " + getReaderName()); 69 | } 70 | 71 | try { 72 | return (Reader) Class.forName(readerClassName).newInstance(); 73 | } catch (Exception e) { 74 | throw new HDataException("Can not create new reader instance for: " + getReaderName(), e); 75 | } 76 | } 77 | 78 | public Splitter newSplitter() { 79 | String spliterClassName = PluginLoader.getReaderPlugin(getReaderName()).getSplitterClassName(); 80 | 81 | if (spliterClassName == null) { 82 | return null; 83 | } 84 | 85 | try { 86 | return (Splitter) Class.forName(spliterClassName.trim()).newInstance(); 87 | } catch (Exception e) { 88 | throw new HDataException("Can not find splitter for reader: " + getReaderName(), e); 89 | } 90 | } 91 | 92 | public String getWriterName() { 93 | return root.getChild("writer").getAttributeValue("name"); 94 | } 95 | 96 | public String getWriterClassName() { 97 | return PluginLoader.getWriterPlugin(getWriterName()).getClassName(); 98 | } 99 | 100 | public Writer newWriter() { 101 | String writerClassName = getWriterClassName(); 102 | if (writerClassName == null) { 103 | throw new HDataException("Can not find class for writer: " + getWriterName()); 104 | } 105 | 106 | try { 107 | return (Writer) Class.forName(getWriterClassName()).newInstance(); 108 | } catch (Exception e) { 109 | throw new HDataException("Can not create new writer instance for: " + getWriterName(), e); 110 | } 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/config/PluginConfig.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.config; 2 | 3 | public class PluginConfig extends Configuration { 4 | 5 | private static final String PARALLELISM_KEY = "parallelism"; 6 | private static final int DEFAULT_PARALLELISM = 1; 7 | private static final long serialVersionUID = 3311331304791946068L; 8 | 9 | public PluginConfig() { 10 | super(); 11 | } 12 | 13 | public int getParallelism() { 14 | int parallelism = getInt(PARALLELISM_KEY, DEFAULT_PARALLELISM); 15 | if (parallelism < 1) { 16 | throw new IllegalArgumentException("Reader and Writer parallelism must be >= 1."); 17 | } 18 | return parallelism; 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/core/DefaultRecord.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.core; 2 | 3 | import opensource.hdata.core.plugin.Record; 4 | 5 | public class DefaultRecord implements Record { 6 | 7 | private Object[] fields; 8 | private int cursor; 9 | 10 | public DefaultRecord(int fieldCount) { 11 | fields = new Object[fieldCount]; 12 | } 13 | 14 | public void addField(int index, Object field) { 15 | fields[index] = field; 16 | this.cursor++; 17 | } 18 | 19 | public void addField(Object field) { 20 | addField(cursor, field); 21 | } 22 | 23 | public Object getField(int index) { 24 | return fields[index]; 25 | } 26 | 27 | public int getFieldsCount() { 28 | return fields.length; 29 | } 30 | 31 | @Override 32 | public String toString() { 33 | StringBuilder sb = new StringBuilder(); 34 | sb.append("{"); 35 | for (int i = 0, len = fields.length; i < len; i++) { 36 | if (i > 0) { 37 | sb.append(", "); 38 | } 39 | sb.append(fields[i]); 40 | } 41 | sb.append("}"); 42 | return sb.toString(); 43 | } 44 | 45 | } 46 | 
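DefaultRecord above is the Record implementation that readers hand to the framework: a fixed-size Object array plus a cursor, so addField(Object) appends at the next free position. A minimal sketch of how a reader body typically fills one and pushes it downstream; the three-column layout and the values are invented for illustration, and recordCollector is the argument passed to Reader.execute (defined further below):

    // Hypothetical three-column row, e.g. parsed from one input line.
    Record record = new DefaultRecord(3);
    record.addField("192.168.0.1");   // stored at index 0 (the current cursor)
    record.addField(958200L);         // index 1
    record.addField("CN");            // index 2
    recordCollector.send(record);     // publishes to the Disruptor ring buffer and bumps the read counter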
-------------------------------------------------------------------------------- /src/main/java/opensource/hdata/core/Fields.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.core; 2 | 3 | import java.util.ArrayList; 4 | 5 | public class Fields extends ArrayList { 6 | 7 | private static final long serialVersionUID = -174064216143075549L; 8 | 9 | public Fields() { 10 | super(); 11 | } 12 | 13 | public Fields(String... fields) { 14 | super(); 15 | for (String field : fields) { 16 | this.add(field); 17 | } 18 | } 19 | 20 | } 21 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/core/HData.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.core; 2 | 3 | import java.text.DecimalFormat; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | import java.util.concurrent.ExecutorService; 7 | import java.util.concurrent.Executors; 8 | 9 | import opensource.hdata.common.HDataConfigConstants; 10 | import opensource.hdata.config.EngineConfig; 11 | import opensource.hdata.config.JobConfig; 12 | import opensource.hdata.config.PluginConfig; 13 | import opensource.hdata.core.plugin.Reader; 14 | import opensource.hdata.core.plugin.RecordCollector; 15 | import opensource.hdata.core.plugin.Splitter; 16 | import opensource.hdata.core.plugin.Writer; 17 | import opensource.hdata.exception.HDataException; 18 | import opensource.hdata.util.Utils; 19 | 20 | import org.apache.logging.log4j.LogManager; 21 | import org.apache.logging.log4j.Logger; 22 | 23 | import com.lmax.disruptor.WaitStrategy; 24 | import com.lmax.disruptor.dsl.Disruptor; 25 | import com.lmax.disruptor.dsl.ProducerType; 26 | 27 | public class HData { 28 | 29 | private DecimalFormat df = new DecimalFormat("#0.00"); 30 | private static final Logger LOG = LogManager.getLogger(HData.class); 31 | 32 | public void start(final JobConfig jobConfig) { 33 | final PluginConfig readerConfig = jobConfig.getReaderConfig(); 34 | final PluginConfig writerConfig = jobConfig.getWriterConfig(); 35 | 36 | LOG.info("Reader: {}, Writer: {}", jobConfig.getReaderName(), jobConfig.getWriterName()); 37 | int writerParallelism = writerConfig.getParallelism(); 38 | 39 | final JobContext context = new JobContext(); 40 | context.setJobConfig(jobConfig); 41 | final Metric metric = new Metric(); 42 | context.setMetric(metric); 43 | final OutputFieldsDeclarer outputFieldsDeclarer = new OutputFieldsDeclarer(context); 44 | context.setDeclarer(outputFieldsDeclarer); 45 | 46 | final EngineConfig engineConfig = EngineConfig.create(); 47 | context.setEngineConfig(engineConfig); 48 | 49 | long sleepMillis = engineConfig.getLong(HDataConfigConstants.HDATA_SLEEP_MILLIS, 3000); 50 | 51 | List readerConfigList = null; 52 | Splitter spliter = jobConfig.newSplitter(); 53 | if (spliter != null) { 54 | LOG.info("Executing spliter for reader."); 55 | readerConfigList = spliter.split(jobConfig); 56 | if (readerConfigList == null || readerConfigList.size() == 0) { 57 | LOG.info("Job Finished."); 58 | System.exit(0); 59 | } 60 | } else if (readerConfig.getParallelism() > 1) { 61 | throw new HDataException("Reader parallelism is " + readerConfig.getParallelism() + ", but can not find splitter."); 62 | } else { 63 | readerConfigList = new ArrayList(); 64 | readerConfigList.add(readerConfig); 65 | } 66 | 67 | Reader[] readers = new Reader[readerConfigList.size()]; 68 | for (int i = 0, len = 
readers.length; i < len; i++) { 69 | readers[i] = jobConfig.newReader(); 70 | } 71 | 72 | LOG.info("Reader parallelism: {}, Writer parallelism: {}", readers.length, writerParallelism); 73 | 74 | final Writer[] writers = new Writer[writerParallelism]; 75 | final RecordWorkHandler[] handlers = new RecordWorkHandler[writerParallelism]; 76 | for (int i = 0; i < writerParallelism; i++) { 77 | writers[i] = jobConfig.newWriter(); 78 | handlers[i] = new RecordWorkHandler(readers, writers[i], context, writerConfig); 79 | } 80 | 81 | int bufferSize = engineConfig.getInt(HDataConfigConstants.STORAGE_BUFFER_SIZE, 1024); 82 | String WaitStrategyName = engineConfig.getString(HDataConfigConstants.HDATA_STORAGE_DISRUPTOR_WAIT_STRATEGY, "BlockingWaitStrategy"); 83 | 84 | Storage storage = createStorage(bufferSize, WaitStrategyName, readers.length, handlers); 85 | context.setStorage(storage); 86 | RecordCollector rc = new RecordCollector(storage, metric); 87 | 88 | LOG.info("Transfering data from reader to writer..."); 89 | ExecutorService es = Executors.newFixedThreadPool(readers.length); 90 | for (int i = 0, len = readerConfigList.size(); i < len; i++) { 91 | es.submit(new ReaderWorker(readers[i], context, readerConfigList.get(i), rc)); 92 | } 93 | es.shutdown(); 94 | 95 | metric.setReaderStartTime(System.currentTimeMillis()); 96 | metric.setWriterStartTime(System.currentTimeMillis()); 97 | while (!es.isTerminated()) { 98 | Utils.sleep(sleepMillis); 99 | LOG.info("Read: {}\tWrite: {}", metric.getReadCount().get(), metric.getWriteCount().get()); 100 | } 101 | metric.setReaderEndTime(System.currentTimeMillis()); 102 | 103 | while (!storage.isEmpty()) { 104 | if (context.isWriterError()) { 105 | LOG.error("Write error."); 106 | break; 107 | } 108 | Utils.sleep(sleepMillis); 109 | LOG.info("Read Finished(total: {}), Write: {}", metric.getReadCount().get(), metric.getWriteCount().get()); 110 | } 111 | storage.close(); 112 | LOG.info("Read Finished(total: {}), Write Finished(total: {})", metric.getReadCount().get(), metric.getWriteCount().get()); 113 | 114 | metric.setWriterEndTime(System.currentTimeMillis()); 115 | for (Writer writer : writers) { 116 | writer.close(); 117 | } 118 | 119 | double readSeconds = (metric.getReaderEndTime() - metric.getReaderStartTime()) / 1000d; 120 | double writeSeconds = (metric.getWriterEndTime() - metric.getWriterStartTime()) / 1000d; 121 | String readSpeed = df.format(metric.getReadCount().get() / readSeconds); 122 | String writeSpeed = df.format(metric.getWriteCount().get() / writeSeconds); 123 | LOG.info("Read spent time: {}s, Write spent time: {}s", df.format(readSeconds), df.format(writeSeconds)); 124 | LOG.info("Read records: {}/s, Write records: {}/s", readSpeed, writeSpeed); 125 | } 126 | 127 | private Storage createStorage(int bufferSize, String WaitStrategyName, int producerCount, RecordWorkHandler[] handlers) { 128 | WaitStrategy waitStrategy = WaitStrategyFactory.build(WaitStrategyName); 129 | ExecutorService executorService = Executors.newCachedThreadPool(); 130 | ProducerType producerType; 131 | if (producerCount == 1) { 132 | producerType = ProducerType.SINGLE; 133 | } else { 134 | producerType = ProducerType.MULTI; 135 | } 136 | Disruptor disruptor = new Disruptor(RecordEvent.FACTORY, bufferSize, executorService, producerType, waitStrategy); 137 | Storage storage = new Storage(disruptor, handlers); 138 | executorService.shutdown(); 139 | return storage; 140 | } 141 | 142 | } 143 | -------------------------------------------------------------------------------- 
/src/main/java/opensource/hdata/core/JobContext.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.core; 2 | 3 | import opensource.hdata.config.Configuration; 4 | import opensource.hdata.config.EngineConfig; 5 | import opensource.hdata.config.JobConfig; 6 | 7 | public class JobContext { 8 | 9 | private Fields fields; 10 | private EngineConfig engineConfig; 11 | private JobConfig jobConfig; 12 | private OutputFieldsDeclarer declarer; 13 | private Storage storage; 14 | private Metric metric; 15 | private boolean isWriterError; 16 | 17 | public Fields getFields() { 18 | return fields; 19 | } 20 | 21 | protected void setFields(Fields fields) { 22 | this.fields = fields; 23 | } 24 | 25 | public Configuration getEngineConfig() { 26 | return engineConfig; 27 | } 28 | 29 | public void setEngineConfig(EngineConfig engineConfig) { 30 | this.engineConfig = engineConfig; 31 | } 32 | 33 | protected OutputFieldsDeclarer getDeclarer() { 34 | return declarer; 35 | } 36 | 37 | protected void setDeclarer(OutputFieldsDeclarer declarer) { 38 | this.declarer = declarer; 39 | } 40 | 41 | public Storage getStorage() { 42 | return storage; 43 | } 44 | 45 | public void setStorage(Storage storage) { 46 | this.storage = storage; 47 | } 48 | 49 | public Metric getMetric() { 50 | return metric; 51 | } 52 | 53 | public void setMetric(Metric metric) { 54 | this.metric = metric; 55 | } 56 | 57 | public JobConfig getJobConfig() { 58 | return jobConfig; 59 | } 60 | 61 | public void setJobConfig(JobConfig jobConfig) { 62 | this.jobConfig = jobConfig; 63 | } 64 | 65 | public boolean isWriterError() { 66 | return isWriterError; 67 | } 68 | 69 | public void setWriterError(boolean isWriterError) { 70 | this.isWriterError = isWriterError; 71 | } 72 | 73 | } 74 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/core/Metric.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.core; 2 | 3 | import java.util.concurrent.atomic.AtomicLong; 4 | 5 | public class Metric { 6 | 7 | private AtomicLong readCount = new AtomicLong(0); 8 | private AtomicLong writeCount = new AtomicLong(0); 9 | private long readerStartTime; 10 | private long readerEndTime; 11 | private long writerStartTime; 12 | private long writerEndTime; 13 | 14 | public AtomicLong getReadCount() { 15 | return readCount; 16 | } 17 | 18 | public void setReadCount(AtomicLong readCount) { 19 | this.readCount = readCount; 20 | } 21 | 22 | public AtomicLong getWriteCount() { 23 | return writeCount; 24 | } 25 | 26 | public void setWriteCount(AtomicLong writeCount) { 27 | this.writeCount = writeCount; 28 | } 29 | 30 | public long getReaderStartTime() { 31 | return readerStartTime; 32 | } 33 | 34 | public void setReaderStartTime(long readerStartTime) { 35 | this.readerStartTime = readerStartTime; 36 | } 37 | 38 | public long getReaderEndTime() { 39 | return readerEndTime; 40 | } 41 | 42 | public void setReaderEndTime(long readerEndTime) { 43 | this.readerEndTime = readerEndTime; 44 | } 45 | 46 | public long getWriterStartTime() { 47 | return writerStartTime; 48 | } 49 | 50 | public void setWriterStartTime(long writerStartTime) { 51 | this.writerStartTime = writerStartTime; 52 | } 53 | 54 | public long getWriterEndTime() { 55 | return writerEndTime; 56 | } 57 | 58 | public void setWriterEndTime(long writerEndTime) { 59 | this.writerEndTime = writerEndTime; 60 | } 61 | 62 | } 63 | 
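HData.start above is the whole engine; CliDriver does little more than parse the -f and --var command-line options, substitute ${var} placeholders into the reader and writer configs, and call it. A minimal programmatic launch, equivalent to bin/hdata -f <job.xml> (the job file path is illustrative):

    import opensource.hdata.config.JobConfig;
    import opensource.hdata.core.HData;

    public class LaunchExample {
        public static void main(String[] args) {
            JobConfig jobConfig = new JobConfig("job-examples/jdbc-jdbc.xml");
            new HData().start(jobConfig);   // runs the splitter, starts the Disruptor pipeline, reports metrics
        }
    }

The job files under job-examples appear in this dump with their XML markup stripped. Judging from JobConfig's parsing (root.getChild("reader") and root.getChild("writer"), a name attribute naming the plugin, every non-empty child element copied into a PluginConfig, and a parallelism key read by PluginConfig), a job file presumably has a shape like the sketch below; the root element name and the plugin-specific property names are guesses here, only the name attribute and parallelism are confirmed by the code:

    <job>
        <reader name="jdbc">
            <!-- plugin-specific properties go here as child elements -->
            <parallelism>3</parallelism>
        </reader>
        <writer name="console">
            <parallelism>1</parallelism>
        </writer>
    </job>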
-------------------------------------------------------------------------------- /src/main/java/opensource/hdata/core/OutputFieldsDeclarer.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.core; 2 | 3 | public class OutputFieldsDeclarer { 4 | 5 | private JobContext context; 6 | 7 | public OutputFieldsDeclarer(JobContext context) { 8 | this.context = context; 9 | } 10 | 11 | public void declare(Fields fields) { 12 | context.setFields(fields); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/core/PluginLoader.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.core; 2 | 3 | import java.util.HashMap; 4 | import java.util.List; 5 | import java.util.Map; 6 | 7 | import opensource.hdata.common.Constants; 8 | import opensource.hdata.core.plugin.ReaderPlugin; 9 | import opensource.hdata.core.plugin.WriterPlugin; 10 | import opensource.hdata.exception.HDataException; 11 | import opensource.hdata.util.Utils; 12 | import opensource.hdata.util.XMLUtils; 13 | 14 | import org.jdom2.Element; 15 | 16 | public class PluginLoader { 17 | 18 | private static Map readerMap; 19 | private static Map writerMap; 20 | 21 | public static ReaderPlugin getReaderPlugin(String name) { 22 | return readerMap.get(name); 23 | } 24 | 25 | public static WriterPlugin getWriterPlugin(String name) { 26 | return writerMap.get(name); 27 | } 28 | 29 | static { 30 | readerMap = new HashMap(); 31 | writerMap = new HashMap(); 32 | 33 | Element root; 34 | try { 35 | root = XMLUtils.load(Utils.getConfigDir() + Constants.PLUGINS_XML); 36 | } catch (Exception e) { 37 | throw new HDataException(e); 38 | } 39 | List readers = root.getChild("readers").getChildren("reader"); 40 | for (Element e : readers) { 41 | ReaderPlugin readerPlugin = new ReaderPlugin(); 42 | readerPlugin.setPluginName(e.getChildText("name")); 43 | readerPlugin.setClassName(e.getChildText("class")); 44 | readerPlugin.setSplitterClassName(e.getChildText("splitter")); 45 | readerMap.put(readerPlugin.getPluginName(), readerPlugin); 46 | } 47 | 48 | List writers = root.getChild("writers").getChildren("writer"); 49 | for (Element e : writers) { 50 | WriterPlugin writerPlugin = new WriterPlugin(); 51 | writerPlugin.setPluginName(e.getChildText("name")); 52 | writerPlugin.setClassName(e.getChildText("class")); 53 | writerMap.put(writerPlugin.getPluginName(), writerPlugin); 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/core/ReaderWorker.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.core; 2 | 3 | import opensource.hdata.config.PluginConfig; 4 | import opensource.hdata.core.plugin.Reader; 5 | import opensource.hdata.core.plugin.RecordCollector; 6 | 7 | public class ReaderWorker implements Runnable { 8 | 9 | private Reader reader; 10 | private JobContext context; 11 | private PluginConfig readerConfig; 12 | private RecordCollector rc; 13 | 14 | public ReaderWorker(Reader reader, JobContext context, PluginConfig readerConfig, RecordCollector rc) { 15 | this.reader = reader; 16 | this.context = context; 17 | this.readerConfig = readerConfig; 18 | this.rc = rc; 19 | } 20 | 21 | public void run() { 22 | reader.prepare(context, readerConfig); 23 | reader.execute(rc); 24 | reader.close(); 25 | } 26 | 27 | } 28 | 
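conf/plugins.xml appears near the top of this dump with its markup stripped, leaving only plugin names and class names. PluginLoader above resolves plugins by reading readers/reader and writers/writer elements with name, class and (for readers only) splitter children, so each entry presumably looked like the sketch below; the tag names are inferred from PluginLoader's getChild/getChildText calls, and the root element name is a guess:

    <plugins>
        <readers>
            <reader>
                <name>jdbc</name>
                <class>opensource.hdata.plugin.reader.jdbc.JDBCReader</class>
                <splitter>opensource.hdata.plugin.reader.jdbc.JDBCSplitter</splitter>
            </reader>
            <!-- one <reader> entry per reader plugin -->
        </readers>
        <writers>
            <writer>
                <name>console</name>
                <class>opensource.hdata.plugin.writer.console.ConsoleWriter</class>
            </writer>
            <!-- one <writer> entry per writer plugin -->
        </writers>
    </plugins>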
-------------------------------------------------------------------------------- /src/main/java/opensource/hdata/core/RecordEvent.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.core; 2 | 3 | import opensource.hdata.core.plugin.Record; 4 | 5 | import com.lmax.disruptor.EventFactory; 6 | 7 | public class RecordEvent { 8 | 9 | private Record record; 10 | 11 | public Record getRecord() { 12 | return record; 13 | } 14 | 15 | public void setRecord(Record record) { 16 | this.record = record; 17 | } 18 | 19 | public static final EventFactory FACTORY = new EventFactory() { 20 | 21 | public RecordEvent newInstance() { 22 | return new RecordEvent(); 23 | } 24 | }; 25 | 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/core/RecordWorkHandler.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.core; 2 | 3 | import opensource.hdata.config.PluginConfig; 4 | import opensource.hdata.core.plugin.Reader; 5 | import opensource.hdata.core.plugin.Writer; 6 | 7 | import com.lmax.disruptor.WorkHandler; 8 | 9 | public class RecordWorkHandler implements WorkHandler { 10 | 11 | private Reader[] readers; 12 | private Writer writer; 13 | private JobContext context; 14 | private PluginConfig writerConfig; 15 | private boolean writerPrepared; 16 | private boolean isWriterError; 17 | private Metric metric; 18 | 19 | public RecordWorkHandler(Reader[] readers, Writer writer, JobContext context, PluginConfig writerConfig) { 20 | this.readers = readers; 21 | this.writer = writer; 22 | this.context = context; 23 | this.writerConfig = writerConfig; 24 | this.metric = context.getMetric(); 25 | } 26 | 27 | public void onEvent(RecordEvent event) { 28 | if (!isWriterError) { 29 | try { 30 | if (!writerPrepared) { 31 | for (Reader reader : readers) { 32 | if (context.getFields() == null) { 33 | reader.declareOutputFields(context.getDeclarer()); 34 | } else { 35 | break; 36 | } 37 | } 38 | writer.prepare(context, writerConfig); 39 | writerPrepared = true; 40 | 41 | if (metric.getWriterStartTime() == 0) { 42 | metric.setWriterStartTime(System.currentTimeMillis()); 43 | } 44 | } 45 | 46 | writer.execute(event.getRecord()); 47 | metric.getWriteCount().incrementAndGet(); 48 | } catch (Exception e) { 49 | this.isWriterError = true; 50 | context.setWriterError(true); 51 | e.printStackTrace(); 52 | } 53 | } 54 | } 55 | 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/core/Storage.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.core; 2 | 3 | import opensource.hdata.core.plugin.Record; 4 | 5 | import com.lmax.disruptor.EventTranslatorOneArg; 6 | import com.lmax.disruptor.RingBuffer; 7 | import com.lmax.disruptor.dsl.Disruptor; 8 | 9 | public class Storage { 10 | 11 | private Disruptor disruptor; 12 | private RingBuffer ringBuffer; 13 | 14 | private static final EventTranslatorOneArg TRANSLATOR = new EventTranslatorOneArg() { 15 | 16 | public void translateTo(RecordEvent event, long sequence, Record record) { 17 | event.setRecord(record); 18 | } 19 | }; 20 | 21 | public Storage(Disruptor disruptor, RecordWorkHandler[] handlers) { 22 | this.disruptor = disruptor; 23 | disruptor.handleEventsWithWorkerPool(handlers); 24 | ringBuffer = disruptor.start(); 25 | } 26 | 27 | public void put(Record record) 
{ 28 | disruptor.publishEvent(TRANSLATOR, record); 29 | } 30 | 31 | public void put(Record[] records) { 32 | for (Record record : records) { 33 | put(record); 34 | } 35 | } 36 | 37 | public boolean isEmpty() { 38 | return ringBuffer.remainingCapacity() == ringBuffer.getBufferSize(); 39 | } 40 | 41 | public int size() { 42 | return ringBuffer.getBufferSize(); 43 | } 44 | 45 | public void close() { 46 | disruptor.shutdown(); 47 | } 48 | 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/core/WaitStrategyFactory.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.core; 2 | 3 | import opensource.hdata.exception.HDataException; 4 | 5 | import com.lmax.disruptor.BlockingWaitStrategy; 6 | import com.lmax.disruptor.BusySpinWaitStrategy; 7 | import com.lmax.disruptor.SleepingWaitStrategy; 8 | import com.lmax.disruptor.WaitStrategy; 9 | import com.lmax.disruptor.YieldingWaitStrategy; 10 | 11 | public class WaitStrategyFactory { 12 | 13 | /** 14 | * 构造线程等待策略 15 | * 16 | * @param name 17 | * @return 18 | */ 19 | public static WaitStrategy build(String name) { 20 | WaitStrategy waitStrategy = null; 21 | if ("BlockingWaitStrategy".equals(name)) { 22 | waitStrategy = new BlockingWaitStrategy(); 23 | } else if ("BusySpinWaitStrategy".equals(name)) { 24 | waitStrategy = new BusySpinWaitStrategy(); 25 | } else if ("SleepingWaitStrategy".equals(name)) { 26 | waitStrategy = new SleepingWaitStrategy(); 27 | } else if ("YieldingWaitStrategy".equals(name)) { 28 | waitStrategy = new YieldingWaitStrategy(); 29 | } else { 30 | throw new HDataException("Invalid wait strategy: " + name); 31 | } 32 | return waitStrategy; 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/core/plugin/AbstractPlugin.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.core.plugin; 2 | 3 | public abstract class AbstractPlugin implements Pluginable { 4 | 5 | private String pluginName; 6 | 7 | public String getPluginName() { 8 | return this.pluginName; 9 | } 10 | 11 | public void setPluginName(String name) { 12 | this.pluginName = name; 13 | } 14 | 15 | } 16 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/core/plugin/Pluginable.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.core.plugin; 2 | 3 | public interface Pluginable { 4 | 5 | public String getPluginName(); 6 | 7 | public void setPluginName(String name); 8 | } 9 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/core/plugin/Reader.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.core.plugin; 2 | 3 | import opensource.hdata.config.PluginConfig; 4 | import opensource.hdata.core.JobContext; 5 | import opensource.hdata.core.OutputFieldsDeclarer; 6 | 7 | public abstract class Reader extends AbstractPlugin { 8 | 9 | public void prepare(JobContext context, PluginConfig readerConfig) { 10 | } 11 | 12 | public void execute(RecordCollector recordCollector) { 13 | } 14 | 15 | public void close() { 16 | } 17 | 18 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 19 | } 20 | } 21 | 
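The Reader base class above, together with RecordCollector, DefaultRecord and OutputFieldsDeclarer, is the entire contract for a source plugin: ReaderWorker drives prepare/execute/close, and RecordWorkHandler calls declareOutputFields before the first write. A minimal hypothetical reader, not part of this repository, might look like the sketch below; to be usable it would also need a <reader> entry in conf/plugins.xml:

    package opensource.hdata.plugin.reader.demo;   // hypothetical package

    import opensource.hdata.config.PluginConfig;
    import opensource.hdata.core.DefaultRecord;
    import opensource.hdata.core.Fields;
    import opensource.hdata.core.JobContext;
    import opensource.hdata.core.OutputFieldsDeclarer;
    import opensource.hdata.core.plugin.Reader;
    import opensource.hdata.core.plugin.Record;
    import opensource.hdata.core.plugin.RecordCollector;

    public class DemoReader extends Reader {

        private int rows;

        @Override
        public void prepare(JobContext context, PluginConfig readerConfig) {
            // Properties arrive verbatim from the <reader> element of the job XML.
            rows = readerConfig.getInt("rows", 10);
        }

        @Override
        public void execute(RecordCollector recordCollector) {
            for (int i = 0; i < rows; i++) {
                Record record = new DefaultRecord(2);
                record.addField(i);              // column 0
                record.addField("row-" + i);     // column 1
                recordCollector.send(record);    // hands the record to the Disruptor-backed Storage
            }
        }

        @Override
        public void declareOutputFields(OutputFieldsDeclarer declarer) {
            // Column names exposed to writers via JobContext.getFields().
            declarer.declare(new Fields("id", "name"));
        }
    }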
-------------------------------------------------------------------------------- /src/main/java/opensource/hdata/core/plugin/ReaderPlugin.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.core.plugin; 2 | 3 | public class ReaderPlugin extends AbstractPlugin { 4 | 5 | private String className; 6 | private String splitterClassName; 7 | 8 | public String getClassName() { 9 | return className; 10 | } 11 | 12 | public void setClassName(String className) { 13 | this.className = className; 14 | } 15 | 16 | public String getSplitterClassName() { 17 | return splitterClassName; 18 | } 19 | 20 | public void setSplitterClassName(String splitterClassName) { 21 | this.splitterClassName = splitterClassName; 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/core/plugin/Record.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.core.plugin; 2 | 3 | public interface Record { 4 | 5 | public void addField(Object field); 6 | 7 | public void addField(int index, Object field); 8 | 9 | public Object getField(int index); 10 | 11 | public int getFieldsCount(); 12 | } 13 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/core/plugin/RecordCollector.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.core.plugin; 2 | 3 | import opensource.hdata.core.Metric; 4 | import opensource.hdata.core.Storage; 5 | 6 | public class RecordCollector { 7 | 8 | private Storage storage; 9 | private Metric metric; 10 | 11 | public RecordCollector(Storage storage, Metric metric) { 12 | this.storage = storage; 13 | this.metric = metric; 14 | } 15 | 16 | public void send(Record record) { 17 | storage.put(record); 18 | metric.getReadCount().incrementAndGet(); 19 | } 20 | 21 | public void send(Record[] records) { 22 | storage.put(records); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/core/plugin/Splitter.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.core.plugin; 2 | 3 | import java.util.List; 4 | 5 | import opensource.hdata.config.JobConfig; 6 | import opensource.hdata.config.PluginConfig; 7 | 8 | public abstract class Splitter extends AbstractPlugin { 9 | 10 | public abstract List split(JobConfig jobConfig); 11 | } 12 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/core/plugin/Writer.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.core.plugin; 2 | 3 | import opensource.hdata.config.PluginConfig; 4 | import opensource.hdata.core.JobContext; 5 | 6 | public abstract class Writer extends AbstractPlugin { 7 | 8 | public void prepare(JobContext context, PluginConfig writerConfig) { 9 | } 10 | 11 | public void execute(Record record) { 12 | } 13 | 14 | public void close() { 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/core/plugin/WriterPlugin.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.core.plugin; 2 | 3 | public class WriterPlugin extends AbstractPlugin { 4 | 5 | 
private String className; 6 | 7 | public String getClassName() { 8 | return className; 9 | } 10 | 11 | public void setClassName(String className) { 12 | this.className = className; 13 | } 14 | 15 | } 16 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/exception/HDataException.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.exception; 2 | 3 | public class HDataException extends RuntimeException { 4 | 5 | private static final long serialVersionUID = 2510267358921118998L; 6 | 7 | private String message; 8 | 9 | public HDataException() { 10 | super(); 11 | } 12 | 13 | public HDataException(final String message) { 14 | super(message); 15 | } 16 | 17 | public HDataException(final Exception e) { 18 | super(e); 19 | } 20 | 21 | public HDataException(Throwable cause) { 22 | super(cause); 23 | } 24 | 25 | public HDataException(final String message, final Throwable cause) { 26 | super(message, cause); 27 | } 28 | 29 | @Override 30 | public String getMessage() { 31 | return this.message == null ? super.getMessage() : this.message; 32 | } 33 | 34 | public void setMessage(String message) { 35 | this.message = message; 36 | } 37 | 38 | @Override 39 | public String toString() { 40 | return this.message; 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/plugin/reader/ftp/FTPReader.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.plugin.reader.ftp; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.InputStream; 5 | import java.io.InputStreamReader; 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | import java.util.zip.GZIPInputStream; 9 | 10 | import opensource.hdata.config.PluginConfig; 11 | import opensource.hdata.core.DefaultRecord; 12 | import opensource.hdata.core.Fields; 13 | import opensource.hdata.core.JobContext; 14 | import opensource.hdata.core.OutputFieldsDeclarer; 15 | import opensource.hdata.core.plugin.Reader; 16 | import opensource.hdata.core.plugin.Record; 17 | import opensource.hdata.core.plugin.RecordCollector; 18 | import opensource.hdata.exception.HDataException; 19 | import opensource.hdata.util.EscaperUtils; 20 | import opensource.hdata.util.FTPUtils; 21 | 22 | import org.apache.commons.lang3.StringUtils; 23 | import org.apache.commons.net.ftp.FTPClient; 24 | 25 | public class FTPReader extends Reader { 26 | 27 | private Fields fields; 28 | private String host; 29 | private int port; 30 | private String username; 31 | private String password; 32 | private String fieldsSeparator; 33 | private String encoding; 34 | private int fieldsCount; 35 | private List files = new ArrayList(); 36 | 37 | @SuppressWarnings("unchecked") 38 | @Override 39 | public void prepare(JobContext context, PluginConfig readerConfig) { 40 | host = readerConfig.getString(FTPReaderProperties.HOST); 41 | port = readerConfig.getInt(FTPReaderProperties.PORT, 21); 42 | username = readerConfig.getString(FTPReaderProperties.USERNAME, "anonymous"); 43 | password = readerConfig.getString(FTPReaderProperties.PASSWORD, ""); 44 | fieldsSeparator = EscaperUtils.parse(readerConfig.getString(FTPReaderProperties.FIELDS_SEPARATOR, "\t")); 45 | encoding = readerConfig.getString(FTPReaderProperties.ENCODING, "UTF-8"); 46 | files = (List) readerConfig.get(FTPReaderProperties.FILES); 47 | fieldsCount = 
readerConfig.getInt(FTPReaderProperties.FIELDS_COUNT_FILTER, 0); 48 | 49 | if (readerConfig.containsKey(FTPReaderProperties.SCHEMA)) { 50 | fields = new Fields(); 51 | String[] tokens = readerConfig.getString(FTPReaderProperties.SCHEMA).split("\\s*,\\s*"); 52 | for (String field : tokens) { 53 | fields.add(field); 54 | } 55 | } 56 | } 57 | 58 | @Override 59 | public void execute(RecordCollector recordCollector) { 60 | FTPClient ftpClient = null; 61 | try { 62 | ftpClient = FTPUtils.getFtpClient(host, port, username, password); 63 | for (String file : files) { 64 | InputStream is = ftpClient.retrieveFileStream(file); 65 | BufferedReader br = null; 66 | if (file.endsWith(".gz")) { 67 | GZIPInputStream gzin = new GZIPInputStream(is); 68 | br = new BufferedReader(new InputStreamReader(gzin, encoding)); 69 | } else { 70 | br = new BufferedReader(new InputStreamReader(is, encoding)); 71 | } 72 | 73 | String line = null; 74 | while ((line = br.readLine()) != null) { 75 | String[] tokens = StringUtils.splitByWholeSeparator(line, fieldsSeparator); 76 | if (tokens.length >= fieldsCount) { 77 | Record record = new DefaultRecord(tokens.length); 78 | for (String field : tokens) { 79 | record.addField(field); 80 | } 81 | recordCollector.send(record); 82 | } 83 | } 84 | ftpClient.completePendingCommand(); 85 | br.close(); 86 | is.close(); 87 | } 88 | } catch (Exception e) { 89 | throw new HDataException(e); 90 | } finally { 91 | FTPUtils.closeFtpClient(ftpClient); 92 | } 93 | } 94 | 95 | @Override 96 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 97 | declarer.declare(fields); 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/plugin/reader/ftp/FTPReaderProperties.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.plugin.reader.ftp; 2 | 3 | public class FTPReaderProperties { 4 | public static final String HOST = "host"; 5 | public static final String PORT = "port"; 6 | public static final String USERNAME = "username"; 7 | public static final String PASSWORD = "password"; 8 | public static final String DIR = "dir"; 9 | public static final String FILENAME = "filename"; 10 | public static final String RECURSIVE = "recursive"; 11 | public static final String ENCODING = "encoding"; 12 | public static final String FIELDS_SEPARATOR = "fieldsSeparator"; 13 | public static final String SCHEMA = "schema"; 14 | public static final String FIELDS_COUNT_FILTER = "fieldsCountFilter"; 15 | public static final String FILES = "reader.files"; 16 | } 17 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/plugin/reader/ftp/FTPSplitter.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.plugin.reader.ftp; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import opensource.hdata.config.JobConfig; 7 | import opensource.hdata.config.PluginConfig; 8 | import opensource.hdata.core.plugin.Splitter; 9 | import opensource.hdata.exception.HDataException; 10 | import opensource.hdata.util.FTPUtils; 11 | 12 | import org.apache.commons.net.ftp.FTPClient; 13 | 14 | public class FTPSplitter extends Splitter { 15 | 16 | @Override 17 | public List split(JobConfig jobConfig) { 18 | List list = new ArrayList(); 19 | PluginConfig readerConfig = jobConfig.getReaderConfig(); 20 | String host = 
readerConfig.getString(FTPReaderProperties.HOST); 21 | int port = readerConfig.getInt(FTPReaderProperties.PORT, 21); 22 | String username = readerConfig.getString(FTPReaderProperties.USERNAME, "anonymous"); 23 | String password = readerConfig.getString(FTPReaderProperties.PASSWORD, ""); 24 | String dir = readerConfig.getString(FTPReaderProperties.DIR); 25 | String filenameRegexp = readerConfig.getString(FTPReaderProperties.FILENAME); 26 | boolean recursive = readerConfig.getBoolean(FTPReaderProperties.RECURSIVE, false); 27 | int parallelism = readerConfig.getParallelism(); 28 | 29 | FTPClient ftpClient = null; 30 | try { 31 | ftpClient = FTPUtils.getFtpClient(host, port, username, password); 32 | List files = new ArrayList(); 33 | FTPUtils.listFile(files, ftpClient, dir, filenameRegexp, recursive); 34 | if (files.size() > 0) { 35 | if (parallelism == 1) { 36 | readerConfig.put(FTPReaderProperties.FILES, files); 37 | list.add(readerConfig); 38 | } else { 39 | double step = (double) files.size() / parallelism; 40 | for (int i = 0; i < parallelism; i++) { 41 | List splitedFiles = new ArrayList(); 42 | for (int start = (int) Math.ceil(step * i), end = (int) Math.ceil(step * (i + 1)); start < end; start++) { 43 | splitedFiles.add(files.get(start)); 44 | } 45 | PluginConfig pluginConfig = (PluginConfig) readerConfig.clone(); 46 | pluginConfig.put(FTPReaderProperties.FILES, splitedFiles); 47 | list.add(pluginConfig); 48 | } 49 | } 50 | } 51 | } catch (Exception e) { 52 | throw new HDataException(e); 53 | } finally { 54 | FTPUtils.closeFtpClient(ftpClient); 55 | } 56 | 57 | return list; 58 | } 59 | 60 | } 61 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/plugin/reader/hbase/HBaseReader.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.plugin.reader.hbase; 2 | 3 | import java.io.IOException; 4 | 5 | import opensource.hdata.config.PluginConfig; 6 | import opensource.hdata.core.DefaultRecord; 7 | import opensource.hdata.core.Fields; 8 | import opensource.hdata.core.JobContext; 9 | import opensource.hdata.core.OutputFieldsDeclarer; 10 | import opensource.hdata.core.plugin.Reader; 11 | import opensource.hdata.core.plugin.Record; 12 | import opensource.hdata.core.plugin.RecordCollector; 13 | import opensource.hdata.exception.HDataException; 14 | 15 | import org.apache.hadoop.conf.Configuration; 16 | import org.apache.hadoop.hbase.HBaseConfiguration; 17 | import org.apache.hadoop.hbase.client.HTable; 18 | import org.apache.hadoop.hbase.client.Result; 19 | import org.apache.hadoop.hbase.client.ResultScanner; 20 | import org.apache.hadoop.hbase.client.Scan; 21 | import org.apache.hadoop.hbase.util.Bytes; 22 | 23 | public class HBaseReader extends Reader { 24 | 25 | private Fields fields = new Fields(); 26 | private HTable table; 27 | private byte[] startRowkey; 28 | private byte[] endRowkey; 29 | private String[] columns; 30 | private int rowkeyIndex = -1; 31 | private static final String ROWKEY = ":rowkey"; 32 | 33 | @Override 34 | public void prepare(JobContext context, PluginConfig readerConfig) { 35 | startRowkey = (byte[]) readerConfig.get(HBaseReaderProperties.START_ROWKWY); 36 | endRowkey = (byte[]) readerConfig.get(HBaseReaderProperties.END_ROWKWY); 37 | 38 | String[] schema = readerConfig.getString(HBaseReaderProperties.SCHEMA).split(","); 39 | for (String field : schema) { 40 | fields.add(field); 41 | } 42 | 43 | Configuration conf = HBaseConfiguration.create(); 44 
| conf.set("hbase.zookeeper.quorum", readerConfig.getString(HBaseReaderProperties.ZOOKEEPER_QUORUM)); 45 | conf.set("hbase.zookeeper.property.clientPort", readerConfig.getString(HBaseReaderProperties.ZOOKEEPER_PROPERTY_CLIENTPORT, "2181")); 46 | columns = readerConfig.getString(HBaseReaderProperties.COLUMNS).split("\\s*,\\s*"); 47 | for (int i = 0, len = columns.length; i < len; i++) { 48 | if (ROWKEY.equalsIgnoreCase(columns[i])) { 49 | rowkeyIndex = i; 50 | break; 51 | } 52 | } 53 | 54 | try { 55 | table = new HTable(conf, readerConfig.getString(HBaseReaderProperties.TABLE)); 56 | } catch (IOException e) { 57 | e.printStackTrace(); 58 | throw new HDataException(e); 59 | } 60 | } 61 | 62 | @Override 63 | public void execute(RecordCollector recordCollector) { 64 | Scan scan = new Scan(); 65 | if (startRowkey.length > 0) { 66 | scan.setStartRow(startRowkey); 67 | } 68 | if (endRowkey.length > 0) { 69 | scan.setStopRow(endRowkey); 70 | } 71 | 72 | for (int i = 0, len = columns.length; i < len; i++) { 73 | if (i != rowkeyIndex) { 74 | String[] column = columns[i].split(":"); 75 | scan.addColumn(Bytes.toBytes(column[0]), Bytes.toBytes(column[1])); 76 | } 77 | } 78 | 79 | try { 80 | ResultScanner results = table.getScanner(scan); 81 | for (Result result : results) { 82 | Record record = new DefaultRecord(fields.size()); 83 | for (int i = 0, len = fields.size(); i < len; i++) { 84 | if (i == rowkeyIndex) { 85 | record.addField(Bytes.toString(result.getRow())); 86 | } else { 87 | String[] column = columns[i].split(":"); 88 | record.addField(Bytes.toString(result.getValue(Bytes.toBytes(column[0]), Bytes.toBytes(column[1])))); 89 | } 90 | } 91 | recordCollector.send(record); 92 | } 93 | 94 | if (table != null) { 95 | table.close(); 96 | } 97 | } catch (IOException e) { 98 | throw new HDataException(e); 99 | } 100 | } 101 | 102 | @Override 103 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 104 | declarer.declare(fields); 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/plugin/reader/hbase/HBaseReaderProperties.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.plugin.reader.hbase; 2 | 3 | public class HBaseReaderProperties { 4 | 5 | public static final String ZOOKEEPER_QUORUM = "zookeeperQuorum"; 6 | public static final String ZOOKEEPER_PROPERTY_CLIENTPORT = "zookeeperClientPort"; 7 | public static final String TABLE = "table"; 8 | public static final String START_ROWKWY = "startRowkey"; 9 | public static final String END_ROWKWY = "endRowkey"; 10 | public static final String COLUMNS = "columns"; 11 | public static final String SCHEMA = "schema"; 12 | } 13 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/plugin/reader/hbase/HBaseSplitter.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.plugin.reader.hbase; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | 7 | import opensource.hdata.config.JobConfig; 8 | import opensource.hdata.config.PluginConfig; 9 | import opensource.hdata.core.plugin.Splitter; 10 | import opensource.hdata.exception.HDataException; 11 | import opensource.hdata.plugin.writer.hbase.HBaseWriterProperties; 12 | 13 | import org.apache.hadoop.conf.Configuration; 14 | import org.apache.hadoop.hbase.HBaseConfiguration; 15 | import 
org.apache.hadoop.hbase.client.HTable; 16 | import org.apache.hadoop.hbase.util.Bytes; 17 | import org.apache.hadoop.hbase.util.Pair; 18 | import org.apache.logging.log4j.LogManager; 19 | import org.apache.logging.log4j.Logger; 20 | 21 | public class HBaseSplitter extends Splitter { 22 | 23 | private static final Logger LOG = LogManager.getLogger(HBaseSplitter.class); 24 | 25 | @Override 26 | public List split(JobConfig jobConfig) { 27 | List list = new ArrayList(); 28 | PluginConfig readerConfig = jobConfig.getReaderConfig(); 29 | int parallelism = readerConfig.getParallelism(); 30 | 31 | String startRowkey = readerConfig.getString(HBaseReaderProperties.START_ROWKWY, ""); 32 | String endRowkey = readerConfig.getString(HBaseReaderProperties.END_ROWKWY, ""); 33 | byte[] startRowkeyBytes = startRowkey.getBytes(); 34 | byte[] endRowkeyBytes = endRowkey.getBytes(); 35 | 36 | if (parallelism == 1) { 37 | readerConfig.put(HBaseReaderProperties.START_ROWKWY, startRowkeyBytes); 38 | readerConfig.put(HBaseReaderProperties.END_ROWKWY, endRowkeyBytes); 39 | list.add(readerConfig); 40 | return list; 41 | } else { 42 | Configuration conf = HBaseConfiguration.create(); 43 | conf.set("hbase.zookeeper.quorum", readerConfig.getString(HBaseReaderProperties.ZOOKEEPER_QUORUM)); 44 | conf.set("hbase.zookeeper.property.clientPort", readerConfig.getString(HBaseReaderProperties.ZOOKEEPER_PROPERTY_CLIENTPORT, "2181")); 45 | try { 46 | HTable table = new HTable(conf, readerConfig.getString(HBaseWriterProperties.TABLE)); 47 | Pair startEndKeysPair = table.getStartEndKeys(); 48 | table.close(); 49 | List> selectedPairList = new ArrayList>(); 50 | byte[][] startKeys = startEndKeysPair.getFirst(); 51 | byte[][] endKeys = startEndKeysPair.getSecond(); 52 | 53 | if (startKeys.length == 1) { 54 | Pair pair = new Pair(); 55 | pair.setFirst(startRowkeyBytes); 56 | pair.setSecond(endRowkeyBytes); 57 | selectedPairList.add(pair); 58 | } else { 59 | if (startRowkeyBytes.length == 0 && endRowkeyBytes.length == 0) { 60 | for (int i = 0, len = startKeys.length; i < len; i++) { 61 | Pair pair = new Pair(); 62 | pair.setFirst(startKeys[i]); 63 | pair.setSecond(endKeys[i]); 64 | selectedPairList.add(pair); 65 | } 66 | } else if (endRowkeyBytes.length == 0) { 67 | for (int i = 0, len = startKeys.length; i < len; i++) { 68 | if (Bytes.compareTo(endKeys[i], startRowkeyBytes) >= 0) { 69 | Pair pair = new Pair(); 70 | pair.setFirst(Bytes.compareTo(startKeys[i], startRowkeyBytes) >= 0 ? startKeys[i] : startRowkeyBytes); 71 | pair.setSecond(endKeys[i]); 72 | selectedPairList.add(pair); 73 | } 74 | } 75 | } else { 76 | for (int i = 0, len = startKeys.length; i < len; i++) { 77 | if (len == 1) { 78 | Pair pair = new Pair(); 79 | pair.setFirst(startRowkeyBytes); 80 | pair.setSecond(endRowkeyBytes); 81 | selectedPairList.add(pair); 82 | break; 83 | } else if (Bytes.compareTo(endKeys[i], startRowkeyBytes) >= 0 && Bytes.compareTo(endRowkeyBytes, startKeys[i]) >= 0) { 84 | Pair pair = new Pair(); 85 | pair.setFirst(Bytes.compareTo(startKeys[i], startRowkeyBytes) >= 0 ? startKeys[i] : startRowkeyBytes); 86 | pair.setSecond(Bytes.compareTo(endKeys[i], endRowkeyBytes) <= 0 ? 
endKeys[i] : endRowkeyBytes); 87 | selectedPairList.add(pair); 88 | } 89 | } 90 | } 91 | } 92 | 93 | if (parallelism > selectedPairList.size()) { 94 | LOG.info( 95 | "parallelism: {} is greater than the region count: {} in the currently open table: {}, so parallelism is set equal to region count.", 96 | parallelism, selectedPairList.size(), Bytes.toString(table.getTableName())); 97 | parallelism = selectedPairList.size(); 98 | } 99 | 100 | double step = (double) selectedPairList.size() / parallelism; 101 | for (int i = 0; i < parallelism; i++) { 102 | List> splitedPairs = new ArrayList>(); 103 | for (int start = (int) Math.ceil(step * i), end = (int) Math.ceil(step * (i + 1)); start < end; start++) { 104 | splitedPairs.add(selectedPairList.get(start)); 105 | } 106 | PluginConfig pluginConfig = (PluginConfig) readerConfig.clone(); 107 | pluginConfig.put(HBaseReaderProperties.START_ROWKWY, splitedPairs.get(0).getFirst()); 108 | pluginConfig.put(HBaseReaderProperties.END_ROWKWY, splitedPairs.get(splitedPairs.size() - 1).getSecond()); 109 | list.add(pluginConfig); 110 | } 111 | } catch (IOException e) { 112 | throw new HDataException(e); 113 | } 114 | 115 | return list; 116 | } 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/plugin/reader/hdfs/HDFSReader.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.plugin.reader.hdfs; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.IOException; 5 | import java.io.InputStreamReader; 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | 9 | import opensource.hdata.config.PluginConfig; 10 | import opensource.hdata.core.DefaultRecord; 11 | import opensource.hdata.core.Fields; 12 | import opensource.hdata.core.JobContext; 13 | import opensource.hdata.core.OutputFieldsDeclarer; 14 | import opensource.hdata.core.plugin.Reader; 15 | import opensource.hdata.core.plugin.Record; 16 | import opensource.hdata.core.plugin.RecordCollector; 17 | import opensource.hdata.exception.HDataException; 18 | import opensource.hdata.util.EscaperUtils; 19 | 20 | import org.apache.commons.lang3.StringUtils; 21 | import org.apache.hadoop.conf.Configuration; 22 | import org.apache.hadoop.fs.FSDataInputStream; 23 | import org.apache.hadoop.fs.FileSystem; 24 | import org.apache.hadoop.fs.Path; 25 | import org.apache.hadoop.io.compress.CompressionCodec; 26 | import org.apache.hadoop.io.compress.CompressionCodecFactory; 27 | 28 | public class HDFSReader extends Reader { 29 | 30 | private Fields fields; 31 | private String fieldsSeparator; 32 | private String encoding; 33 | private List files = new ArrayList(); 34 | 35 | @SuppressWarnings("unchecked") 36 | @Override 37 | public void prepare(JobContext context, PluginConfig readerConfig) { 38 | fieldsSeparator = EscaperUtils.parse(readerConfig.getString(HDFSReaderProperties.FIELDS_SEPARATOR, "\t")); 39 | files = (List) readerConfig.get(HDFSReaderProperties.FILES); 40 | encoding = readerConfig.getString(HDFSReaderProperties.ENCODING, "UTF-8"); 41 | if (readerConfig.containsKey(HDFSReaderProperties.SCHEMA)) { 42 | fields = new Fields(); 43 | String[] tokens = readerConfig.getString(HDFSReaderProperties.SCHEMA).split("\\s*,\\s*"); 44 | for (String field : tokens) { 45 | fields.add(field); 46 | } 47 | } 48 | } 49 | 50 | @Override 51 | public void execute(RecordCollector recordCollector) { 52 | Configuration conf = new Configuration(); 53 | CompressionCodecFactory codecFactory 
= new CompressionCodecFactory(conf); 54 | try { 55 | for (Path file : files) { 56 | FileSystem fs = file.getFileSystem(conf); 57 | CompressionCodec codec = codecFactory.getCodec(file); 58 | FSDataInputStream input = fs.open(file); 59 | BufferedReader br; 60 | String line = null; 61 | if (codec == null) { 62 | br = new BufferedReader(new InputStreamReader(input, encoding)); 63 | } else { 64 | br = new BufferedReader(new InputStreamReader(codec.createInputStream(input), encoding)); 65 | } 66 | while ((line = br.readLine()) != null) { 67 | String[] tokens = StringUtils.splitByWholeSeparator(line, fieldsSeparator); 68 | Record record = new DefaultRecord(tokens.length); 69 | for (String field : tokens) { 70 | record.addField(field); 71 | } 72 | recordCollector.send(record); 73 | } 74 | br.close(); 75 | } 76 | } catch (IOException e) { 77 | e.printStackTrace(); 78 | throw new HDataException(e); 79 | } 80 | } 81 | 82 | @Override 83 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 84 | declarer.declare(fields); 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/plugin/reader/hdfs/HDFSReaderProperties.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.plugin.reader.hdfs; 2 | 3 | public class HDFSReaderProperties { 4 | public static final String DIR = "dir"; 5 | public static final String FILENAME_REGEXP = "filename"; 6 | public static final String SCHEMA = "schema"; 7 | public static final String FIELDS_SEPARATOR = "fieldsSeparator"; 8 | public static final String ENCODING = "encoding"; 9 | public static final String HADOOP_USER = "hadoopUser"; 10 | public static final String FILES = "reader.files"; 11 | } 12 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/plugin/reader/hdfs/HDFSSplitter.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.plugin.reader.hdfs; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | import java.util.regex.Matcher; 7 | import java.util.regex.Pattern; 8 | 9 | import opensource.hdata.config.JobConfig; 10 | import opensource.hdata.config.PluginConfig; 11 | import opensource.hdata.core.plugin.Splitter; 12 | import opensource.hdata.exception.HDataException; 13 | 14 | import org.apache.hadoop.conf.Configuration; 15 | import org.apache.hadoop.fs.FileStatus; 16 | import org.apache.hadoop.fs.FileSystem; 17 | import org.apache.hadoop.fs.Path; 18 | 19 | public class HDFSSplitter extends Splitter { 20 | 21 | @Override 22 | public List split(JobConfig jobConfig) { 23 | List list = new ArrayList(); 24 | List matchedFiles = new ArrayList(); 25 | PluginConfig readerConfig = jobConfig.getReaderConfig(); 26 | Path dir = new Path(readerConfig.getString(HDFSReaderProperties.DIR)); 27 | int parallelism = readerConfig.getParallelism(); 28 | 29 | System.setProperty("HADOOP_USER_NAME", readerConfig.getString(HDFSReaderProperties.HADOOP_USER)); 30 | Configuration conf = new Configuration(); 31 | try { 32 | FileSystem fs = dir.getFileSystem(conf); 33 | Pattern filenamePattern = Pattern.compile(readerConfig.getString(HDFSReaderProperties.FILENAME_REGEXP)); 34 | if (fs.exists(dir)) { 35 | for (FileStatus fileStatus : fs.listStatus(dir)) { 36 | Matcher m = filenamePattern.matcher(fileStatus.getPath().getName()); 37 | if (m.matches()) { 38 | 
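// Keep only the files whose names match the configured filename regexp.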
matchedFiles.add(fileStatus.getPath()); 39 | } 40 | } 41 | 42 | if (matchedFiles.size() > 0) { 43 | if (parallelism == 1) { 44 | readerConfig.put(HDFSReaderProperties.FILES, matchedFiles); 45 | list.add(readerConfig); 46 | } else { 47 | double step = (double) matchedFiles.size() / parallelism; 48 | for (int i = 0; i < parallelism; i++) { 49 | List splitedFiles = new ArrayList(); 50 | for (int start = (int) Math.ceil(step * i), end = (int) Math.ceil(step * (i + 1)); start < end; start++) { 51 | splitedFiles.add(matchedFiles.get(start)); 52 | } 53 | PluginConfig pluginConfig = (PluginConfig) readerConfig.clone(); 54 | pluginConfig.put(HDFSReaderProperties.FILES, splitedFiles); 55 | list.add(pluginConfig); 56 | } 57 | } 58 | } 59 | 60 | } else { 61 | throw new HDataException(String.format("Path %s not found.", dir)); 62 | } 63 | } catch (IOException e) { 64 | throw new HDataException(e); 65 | } 66 | 67 | return list; 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/plugin/reader/hive/HiveReader.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.plugin.reader.hive; 2 | 3 | import java.util.List; 4 | 5 | import opensource.hdata.config.PluginConfig; 6 | import opensource.hdata.core.DefaultRecord; 7 | import opensource.hdata.core.Fields; 8 | import opensource.hdata.core.JobContext; 9 | import opensource.hdata.core.OutputFieldsDeclarer; 10 | import opensource.hdata.core.plugin.Reader; 11 | import opensource.hdata.core.plugin.Record; 12 | import opensource.hdata.core.plugin.RecordCollector; 13 | import opensource.hdata.exception.HDataException; 14 | import opensource.hdata.util.HiveTypeUtils; 15 | 16 | import org.apache.hadoop.fs.FileSystem; 17 | import org.apache.hadoop.fs.Path; 18 | import org.apache.hadoop.hive.metastore.api.FieldSchema; 19 | import org.apache.hadoop.hive.serde2.Deserializer; 20 | import org.apache.hadoop.hive.serde2.SerDeException; 21 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; 22 | import org.apache.hadoop.hive.serde2.objectinspector.StructField; 23 | import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; 24 | import org.apache.hadoop.io.Writable; 25 | import org.apache.hadoop.mapred.FileInputFormat; 26 | import org.apache.hadoop.mapred.FileSplit; 27 | import org.apache.hadoop.mapred.InputFormat; 28 | import org.apache.hadoop.mapred.JobConf; 29 | import org.apache.hadoop.mapred.RecordReader; 30 | import org.apache.hadoop.mapred.Reporter; 31 | 32 | @SuppressWarnings("deprecation") 33 | public class HiveReader extends Reader { 34 | 35 | private final Fields fields = new Fields(); 36 | private List files; 37 | private List partitionValues; 38 | private Class> inputFormat; 39 | private StructObjectInspector oi; 40 | private List structFields; 41 | 42 | private Deserializer deserializer; 43 | 44 | @SuppressWarnings("unchecked") 45 | @Override 46 | public void prepare(JobContext context, PluginConfig readerConfig) { 47 | inputFormat = (Class>) readerConfig.get(HiveReaderProperties.INPUT_FORMAT_CLASS); 48 | deserializer = (Deserializer) readerConfig.get(HiveReaderProperties.DESERIALIZER); 49 | files = (List) readerConfig.get(HiveReaderProperties.TABLE_FILES); 50 | partitionValues = (List) readerConfig.get(HiveReaderProperties.PARTITION_VALUES); 51 | List columns = (List) readerConfig.get(HiveReaderProperties.TABLE_COLUMNS); 52 | 53 | for (FieldSchema fs : columns) { 54 | 
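// Declare one output field per table column reported by the Hive metastore.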
fields.add(fs.getName()); 55 | } 56 | 57 | try { 58 | oi = (StructObjectInspector) deserializer.getObjectInspector(); 59 | } catch (SerDeException e) { 60 | throw new HDataException(e); 61 | } 62 | structFields = oi.getAllStructFieldRefs(); 63 | } 64 | 65 | @Override 66 | public void execute(RecordCollector recordCollector) { 67 | 68 | int columnsCount = fields.size(); 69 | int partitionValueCount = partitionValues == null ? 0 : partitionValues.size(); 70 | 71 | JobConf jobConf = new JobConf(); 72 | for (String file : files) { 73 | Path path = new Path(file); 74 | try { 75 | FileSystem fs = path.getFileSystem(jobConf); 76 | FileInputFormat fileInputFormat = (FileInputFormat) inputFormat.newInstance(); 77 | long filelen = fs.getFileStatus(path).getLen(); 78 | FileSplit split = new FileSplit(path, 0, filelen, (String[]) null); 79 | RecordReader reader = fileInputFormat.getRecordReader(split, jobConf, Reporter.NULL); 80 | Writable key = reader.createKey(); 81 | Writable value = reader.createValue(); 82 | while (reader.next(key, value)) { 83 | Object row = deserializer.deserialize(value); 84 | Record record = new DefaultRecord(columnsCount); 85 | for (int i = 0, len = structFields.size(); i < len; i++) { 86 | Object fieldData = oi.getStructFieldData(row, structFields.get(i)); 87 | Object standardData = ObjectInspectorUtils.copyToStandardJavaObject(fieldData, structFields.get(i).getFieldObjectInspector()); 88 | record.addField(HiveTypeUtils.toJavaObject(standardData)); 89 | } 90 | 91 | for (int i = 0, len = partitionValueCount; i < len; i++) { 92 | record.addField(partitionValues.get(i)); 93 | } 94 | recordCollector.send(record); 95 | } 96 | reader.close(); 97 | } catch (Exception e) { 98 | throw new HDataException(e); 99 | } 100 | } 101 | } 102 | 103 | @Override 104 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 105 | declarer.declare(fields); 106 | } 107 | 108 | } 109 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/plugin/reader/hive/HiveReaderProperties.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.plugin.reader.hive; 2 | 3 | public class HiveReaderProperties { 4 | public static final String METASTORE_URIS = "metastoreUris"; 5 | public static final String DATABASE = "database"; 6 | public static final String TABLE = "table"; 7 | public static final String SELECT_COLUMNS = "columns"; 8 | public static final String TABLE_COLUMNS = "reader.columns"; 9 | public static final String PARTITIONS = "partitions"; 10 | public static final String TABLE_FILES = "reader.table.files"; 11 | public static final String PARTITION_VALUES = "reader.partition.values"; 12 | public static final String INPUT_FORMAT_CLASS = "reader.input.format.class"; 13 | public static final String DESERIALIZER = "reader.deserializer"; 14 | } 15 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/plugin/reader/hive/HiveSplitter.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.plugin.reader.hive; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | 7 | import opensource.hdata.config.JobConfig; 8 | import opensource.hdata.config.PluginConfig; 9 | import opensource.hdata.core.plugin.Splitter; 10 | import opensource.hdata.exception.HDataException; 11 | import opensource.hdata.util.LoggerUtils; 12 | import 
opensource.hdata.util.Utils; 13 | 14 | import org.apache.hadoop.conf.Configuration; 15 | import org.apache.hadoop.fs.FileStatus; 16 | import org.apache.hadoop.fs.FileSystem; 17 | import org.apache.hadoop.fs.Path; 18 | import org.apache.hadoop.hive.conf.HiveConf; 19 | import org.apache.hadoop.hive.conf.HiveConf.ConfVars; 20 | import org.apache.hadoop.hive.ql.metadata.Hive; 21 | import org.apache.hadoop.hive.ql.metadata.HiveException; 22 | import org.apache.hadoop.hive.ql.metadata.Table; 23 | import org.apache.logging.log4j.LogManager; 24 | import org.apache.logging.log4j.Logger; 25 | 26 | public class HiveSplitter extends Splitter { 27 | 28 | private static final Logger LOG = LogManager.getLogger(HiveSplitter.class); 29 | 30 | @Override 31 | public List split(JobConfig jobConfig) { 32 | List list = new ArrayList(); 33 | PluginConfig readerConfig = jobConfig.getReaderConfig(); 34 | String metastoreUris = readerConfig.getString(HiveReaderProperties.METASTORE_URIS); 35 | String dbName = readerConfig.getString(HiveReaderProperties.DATABASE, "default"); 36 | String tableName = readerConfig.getString(HiveReaderProperties.TABLE); 37 | int parallelism = readerConfig.getParallelism(); 38 | List partitionValues = null; 39 | 40 | HiveConf conf = new HiveConf(); 41 | conf.set(ConfVars.METASTOREURIS.varname, metastoreUris); 42 | 43 | Hive hive; 44 | Table table; 45 | try { 46 | hive = Hive.get(conf, true); 47 | table = hive.getTable(dbName, tableName, false); 48 | } catch (HiveException e) { 49 | throw new HDataException(e); 50 | } 51 | 52 | if (table == null) { 53 | throw new HDataException(String.format("Table %s.%s is not exist.", dbName, tableName)); 54 | } 55 | 56 | readerConfig.put(HiveReaderProperties.TABLE_COLUMNS, table.getAllCols()); 57 | readerConfig.put(HiveReaderProperties.INPUT_FORMAT_CLASS, table.getInputFormatClass()); 58 | readerConfig.put(HiveReaderProperties.DESERIALIZER, table.getDeserializer()); 59 | 60 | String tableLocation = Utils.fixLocaltion(table.getDataLocation().toString(), metastoreUris); 61 | if (readerConfig.containsKey(HiveReaderProperties.PARTITIONS)) { 62 | String partitions = readerConfig.getString(HiveReaderProperties.PARTITIONS); 63 | tableLocation += "/" + partitions.replaceAll("\\s*,\\s*", "/"); 64 | partitionValues = Utils.parsePartitionValue(partitions); 65 | readerConfig.put(HiveReaderProperties.PARTITION_VALUES, partitionValues); 66 | } 67 | 68 | List files = getTableFiles(tableLocation); 69 | if (files == null || files.size() < 1) { 70 | LOG.info("Can not find files on path {}", tableLocation); 71 | return null; 72 | } 73 | 74 | if (parallelism > files.size()) { 75 | parallelism = files.size(); 76 | LOG.info("Reader parallelism is greater than file count, so parallelism is set to equal with file count."); 77 | } 78 | 79 | if (parallelism == 1) { 80 | readerConfig.put(HiveReaderProperties.TABLE_FILES, files); 81 | list.add(readerConfig); 82 | } else { 83 | double step = (double) files.size() / parallelism; 84 | for (int i = 0; i < parallelism; i++) { 85 | List splitedFiles = new ArrayList(); 86 | for (int start = (int) Math.ceil(step * i), end = (int) Math.ceil(step * (i + 1)); start < end; start++) { 87 | splitedFiles.add(files.get(start)); 88 | } 89 | PluginConfig pluginConfig = (PluginConfig) readerConfig.clone(); 90 | pluginConfig.put(HiveReaderProperties.TABLE_FILES, splitedFiles); 91 | list.add(pluginConfig); 92 | } 93 | } 94 | 95 | Hive.closeCurrent(); 96 | return list; 97 | } 98 | 99 | private List getTableFiles(String tableLocation) { 100 | try { 101 
| Configuration conf = new Configuration(); 102 | Path path = new Path(tableLocation); 103 | FileSystem hdfs = path.getFileSystem(conf); 104 | FileStatus[] fileStatus = hdfs.listStatus(path); 105 | List files = new ArrayList(); 106 | for (FileStatus fs : fileStatus) { 107 | if (!fs.isDir() && !fs.getPath().getName().startsWith("_")) { 108 | files.add(fs.getPath().toString()); 109 | } 110 | } 111 | return files; 112 | } catch (IOException e) { 113 | LoggerUtils.error(LOG, e); 114 | return null; 115 | } 116 | } 117 | 118 | } 119 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/plugin/reader/jdbc/JBDCReaderProperties.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.plugin.reader.jdbc; 2 | 3 | public class JBDCReaderProperties { 4 | 5 | public static final String DRIVER = "driver"; 6 | public static final String URL = "url"; 7 | public static final String USERNAME = "username"; 8 | public static final String PASSWORD = "password"; 9 | public static final String TABLE = "table"; 10 | public static final String COLUMNS = "columns"; 11 | public static final String EXCLUDE_COLUMNS = "excludeColumns"; 12 | public static final String WHERE = "where"; 13 | public static final String SQL = "sql"; 14 | public static final String SPLIT_BY = "splitBy"; 15 | } 16 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/plugin/reader/jdbc/JDBCReader.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.plugin.reader.jdbc; 2 | 3 | import java.sql.Connection; 4 | import java.sql.ResultSet; 5 | import java.sql.ResultSetMetaData; 6 | import java.sql.SQLException; 7 | import java.sql.Statement; 8 | 9 | import opensource.hdata.config.PluginConfig; 10 | import opensource.hdata.core.DefaultRecord; 11 | import opensource.hdata.core.Fields; 12 | import opensource.hdata.core.JobContext; 13 | import opensource.hdata.core.OutputFieldsDeclarer; 14 | import opensource.hdata.core.plugin.Reader; 15 | import opensource.hdata.core.plugin.Record; 16 | import opensource.hdata.core.plugin.RecordCollector; 17 | import opensource.hdata.exception.HDataException; 18 | import opensource.hdata.util.JDBCUtils; 19 | 20 | import org.apache.logging.log4j.LogManager; 21 | import org.apache.logging.log4j.Logger; 22 | 23 | public class JDBCReader extends Reader { 24 | 25 | private Connection connection; 26 | private String sql; 27 | private Fields fields; 28 | private static final Logger LOG = LogManager.getLogger(JDBCReader.class); 29 | 30 | @Override 31 | public void prepare(JobContext context, PluginConfig readerConfig) { 32 | String driver = readerConfig.getString(JBDCReaderProperties.DRIVER); 33 | String url = readerConfig.getString(JBDCReaderProperties.URL); 34 | String username = readerConfig.getString(JBDCReaderProperties.USERNAME); 35 | String password = readerConfig.getString(JBDCReaderProperties.PASSWORD); 36 | sql = readerConfig.getString(JBDCReaderProperties.SQL); 37 | LOG.debug(sql); 38 | 39 | try { 40 | connection = JDBCUtils.getConnection(driver, url, username, password); 41 | } catch (Exception e) { 42 | throw new HDataException(e); 43 | } 44 | } 45 | 46 | @Override 47 | public void execute(RecordCollector recordCollector) { 48 | try { 49 | Statement statement = connection.createStatement(); 50 | 51 | ResultSet rs = statement.executeQuery(sql); 52 | ResultSetMetaData metaData 
= rs.getMetaData(); 53 | int ColumnCount = metaData.getColumnCount(); 54 | 55 | if (fields == null) { 56 | fields = new Fields(); 57 | for (int i = 1; i <= ColumnCount; i++) { 58 | fields.add(metaData.getColumnName(i)); 59 | } 60 | } 61 | 62 | while (rs.next()) { 63 | Record r = new DefaultRecord(ColumnCount); 64 | for (int i = 1; i <= ColumnCount; i++) { 65 | r.addField(i - 1, rs.getObject(i)); 66 | } 67 | recordCollector.send(r); 68 | } 69 | rs.close(); 70 | statement.close(); 71 | } catch (SQLException e) { 72 | e.printStackTrace(); 73 | JDBCUtils.closeConnection(connection); 74 | throw new HDataException(e); 75 | } 76 | } 77 | 78 | @Override 79 | public void close() { 80 | JDBCUtils.closeConnection(connection); 81 | } 82 | 83 | @Override 84 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 85 | declarer.declare(fields); 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/plugin/reader/jdbc/JDBCSplitter.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.plugin.reader.jdbc; 2 | 3 | import java.sql.Connection; 4 | import java.sql.SQLException; 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | 8 | import opensource.hdata.common.Constants; 9 | import opensource.hdata.config.JobConfig; 10 | import opensource.hdata.config.PluginConfig; 11 | import opensource.hdata.core.plugin.Splitter; 12 | import opensource.hdata.exception.HDataException; 13 | import opensource.hdata.util.JDBCUtils; 14 | import opensource.hdata.util.Utils; 15 | 16 | import org.apache.logging.log4j.LogManager; 17 | import org.apache.logging.log4j.Logger; 18 | 19 | import com.google.common.base.Joiner; 20 | 21 | public class JDBCSplitter extends Splitter { 22 | 23 | private static final String CONDITIONS_REGEX = "\\$CONDITIONS"; 24 | private static final Logger LOG = LogManager.getLogger(JDBCSplitter.class); 25 | 26 | private void checkIfContainsConditionKey(String sql, String errorMessage) { 27 | if (!sql.contains("$CONDITIONS")) { 28 | throw new HDataException(errorMessage); 29 | } 30 | } 31 | 32 | private List buildPluginConfigs(Connection conn, String sql, String splitColumn, PluginConfig readerConfig) { 33 | List list = new ArrayList(); 34 | try { 35 | int parallelism = readerConfig.getParallelism(); 36 | double[] minAndMax = JDBCUtils.querySplitColumnRange(conn, sql.replaceAll(CONDITIONS_REGEX, "(1 = 1)"), splitColumn); 37 | double min = minAndMax[0]; 38 | double max = minAndMax[1] + 1; 39 | double step = (max - min) / parallelism; 40 | for (int i = 0, len = parallelism; i < len; i++) { 41 | PluginConfig otherReaderConfig = (PluginConfig) readerConfig.clone(); 42 | StringBuilder sb = new StringBuilder(); 43 | sb.append(splitColumn); 44 | sb.append(" >= "); 45 | sb.append((long) Math.ceil(min + step * i)); 46 | sb.append(" AND "); 47 | sb.append(splitColumn); 48 | 49 | if (i == (len - 1)) { 50 | sb.append(" <= "); 51 | } else { 52 | sb.append(" < "); 53 | } 54 | sb.append((long) Math.ceil(min + step * (i + 1))); 55 | 56 | otherReaderConfig.setProperty(JBDCReaderProperties.SQL, sql.toString().replaceAll(CONDITIONS_REGEX, sb.toString())); 57 | list.add(otherReaderConfig); 58 | } 59 | return list; 60 | } catch (SQLException e) { 61 | throw new HDataException(e); 62 | } finally { 63 | JDBCUtils.closeConnection(conn); 64 | } 65 | } 66 | 67 | @Override 68 | public List split(JobConfig jobConfig) { 69 | PluginConfig readerConfig = 
jobConfig.getReaderConfig(); 70 | String driver = readerConfig.getString(JBDCReaderProperties.DRIVER); 71 | String url = readerConfig.getString(JBDCReaderProperties.URL); 72 | String username = readerConfig.getString(JBDCReaderProperties.USERNAME); 73 | String password = readerConfig.getString(JBDCReaderProperties.PASSWORD); 74 | int parallelism = readerConfig.getParallelism(); 75 | 76 | StringBuilder sql = new StringBuilder(); 77 | if (readerConfig.containsKey(JBDCReaderProperties.SQL)) { 78 | if (parallelism > 1) { 79 | checkIfContainsConditionKey(readerConfig.getString(JBDCReaderProperties.SQL), 80 | "Reader must contains key word \"$CONDITIONS\" in sql property when parallelism > 1."); 81 | } 82 | sql.append(readerConfig.get(JBDCReaderProperties.SQL)); 83 | } else { 84 | String table = readerConfig.getString(JBDCReaderProperties.TABLE); 85 | sql.append("SELECT "); 86 | if (!readerConfig.containsKey(JBDCReaderProperties.COLUMNS) && !readerConfig.containsKey(JBDCReaderProperties.EXCLUDE_COLUMNS)) { 87 | sql.append("*"); 88 | } else if (readerConfig.containsKey(JBDCReaderProperties.COLUMNS)) { 89 | String columns = readerConfig.getString(JBDCReaderProperties.COLUMNS); 90 | sql.append(columns); 91 | } else if (readerConfig.containsKey(JBDCReaderProperties.EXCLUDE_COLUMNS)) { 92 | String[] excludeColumns = readerConfig.getString(JBDCReaderProperties.EXCLUDE_COLUMNS).trim().split(Constants.COLUMNS_SPLIT_REGEX); 93 | Connection conn = null; 94 | try { 95 | conn = JDBCUtils.getConnection(driver, url, username, password); 96 | String selectColumns = Joiner.on(", ").join(Utils.getColumns(JDBCUtils.getColumnNames(conn, table), excludeColumns)); 97 | sql.append(selectColumns); 98 | } catch (Exception e) { 99 | e.printStackTrace(); 100 | JDBCUtils.closeConnection(conn); 101 | throw new HDataException(e); 102 | } 103 | 104 | } 105 | sql.append(" FROM "); 106 | sql.append(table); 107 | 108 | if (readerConfig.containsKey(JBDCReaderProperties.WHERE)) { 109 | String where = readerConfig.getString(JBDCReaderProperties.WHERE); 110 | sql.append(" WHERE "); 111 | sql.append(where); 112 | sql.append(" AND $CONDITIONS"); 113 | } else { 114 | sql.append(" WHERE $CONDITIONS"); 115 | } 116 | } 117 | 118 | if (readerConfig.containsKey(JBDCReaderProperties.SPLIT_BY)) { 119 | String splitColumn = readerConfig.getString(JBDCReaderProperties.SPLIT_BY); 120 | LOG.debug("Get split-by column: {}", splitColumn); 121 | 122 | Connection conn = null; 123 | try { 124 | conn = JDBCUtils.getConnection(driver, url, username, password); 125 | return buildPluginConfigs(conn, sql.toString(), splitColumn, readerConfig); 126 | } catch (Exception e) { 127 | throw new HDataException(e); 128 | } finally { 129 | JDBCUtils.closeConnection(conn); 130 | } 131 | } else { 132 | if (readerConfig.containsKey(JBDCReaderProperties.TABLE)) { 133 | Connection conn = null; 134 | try { 135 | String table = readerConfig.getString(JBDCReaderProperties.TABLE); 136 | LOG.info("Attemp to query digital primary key for table: {}", table); 137 | conn = JDBCUtils.getConnection(driver, url, username, password); 138 | String splitColumn = JDBCUtils.getDigitalPrimaryKey(conn, conn.getCatalog(), null, table); 139 | if (splitColumn != null) { 140 | LOG.info("Table {} find digital primary key: {}", table, splitColumn); 141 | return buildPluginConfigs(conn, sql.toString(), splitColumn, readerConfig); 142 | } else { 143 | LOG.info("Table {} can not find digital primary key.", table); 144 | } 145 | } catch (Exception e) { 146 | throw new HDataException(e); 147 | } 
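// Always release the JDBC connection, whether or not a numeric primary key was detected.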
finally { 148 | JDBCUtils.closeConnection(conn); 149 | } 150 | } 151 | 152 | if (parallelism > 1) { 153 | LOG.warn( 154 | "Reader parallelism is set to {}, but the \"split-by\" config is not given, so reader parallelism is set to default value: 1.", 155 | parallelism); 156 | } 157 | 158 | List list = new ArrayList(); 159 | readerConfig.setProperty(JBDCReaderProperties.SQL, sql.toString().replaceAll(CONDITIONS_REGEX, "(1 = 1)")); 160 | list.add(readerConfig); 161 | return list; 162 | } 163 | } 164 | } 165 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/plugin/reader/mongodb/MongoDBReader.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.plugin.reader.mongodb; 2 | 3 | import java.net.UnknownHostException; 4 | import java.util.Set; 5 | 6 | import opensource.hdata.config.PluginConfig; 7 | import opensource.hdata.core.DefaultRecord; 8 | import opensource.hdata.core.Fields; 9 | import opensource.hdata.core.JobContext; 10 | import opensource.hdata.core.OutputFieldsDeclarer; 11 | import opensource.hdata.core.plugin.Reader; 12 | import opensource.hdata.core.plugin.Record; 13 | import opensource.hdata.core.plugin.RecordCollector; 14 | import opensource.hdata.exception.HDataException; 15 | 16 | import com.mongodb.BasicDBObject; 17 | import com.mongodb.DB; 18 | import com.mongodb.DBCollection; 19 | import com.mongodb.DBCursor; 20 | import com.mongodb.DBObject; 21 | import com.mongodb.MongoClient; 22 | import com.mongodb.MongoClientURI; 23 | 24 | public class MongoDBReader extends Reader { 25 | 26 | private Fields fields; 27 | private String uri; 28 | private BasicDBObject condition; 29 | private static final String OBJECT_ID_KEY = "_id"; 30 | 31 | @Override 32 | public void prepare(JobContext context, PluginConfig readerConfig) { 33 | uri = readerConfig.getString(MongoDBReaderProperties.URI); 34 | condition = (BasicDBObject) readerConfig.get(MongoDBReaderProperties.QUERY); 35 | } 36 | 37 | @Override 38 | public void execute(RecordCollector recordCollector) { 39 | MongoClientURI clientURI = new MongoClientURI(uri); 40 | MongoClient mongoClient = null; 41 | try { 42 | mongoClient = new MongoClient(clientURI); 43 | DB db = mongoClient.getDB(clientURI.getDatabase()); 44 | DBCollection coll = db.getCollection(clientURI.getCollection()); 45 | DBCursor cur = coll.find(condition); 46 | while (cur.hasNext()) { 47 | DBObject doc = cur.next(); 48 | Set keys = doc.keySet(); 49 | Record record = new DefaultRecord(keys.size() - 1); 50 | if (fields == null) { 51 | fields = new Fields(); 52 | for (String key : keys) { 53 | fields.add(key); 54 | } 55 | } 56 | 57 | for (String key : keys) { 58 | if (!OBJECT_ID_KEY.equals(key)) { 59 | record.addField(doc.get(key)); 60 | } 61 | } 62 | 63 | recordCollector.send(record); 64 | } 65 | } catch (UnknownHostException e) { 66 | throw new HDataException(e); 67 | } finally { 68 | if (mongoClient != null) { 69 | mongoClient.close(); 70 | } 71 | } 72 | } 73 | 74 | @Override 75 | public void declareOutputFields(OutputFieldsDeclarer declarer) { 76 | declarer.declare(fields); 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/plugin/reader/mongodb/MongoDBReaderProperties.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.plugin.reader.mongodb; 2 | 3 | public class MongoDBReaderProperties { 4 | 5 | public static 
final String URI = "uri"; 6 | public static final String QUERY = "query"; 7 | } 8 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/plugin/reader/mongodb/MongoDBSplitter.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.plugin.reader.mongodb; 2 | 3 | import java.math.BigInteger; 4 | import java.net.UnknownHostException; 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | 8 | import opensource.hdata.config.JobConfig; 9 | import opensource.hdata.config.PluginConfig; 10 | import opensource.hdata.core.plugin.Splitter; 11 | import opensource.hdata.exception.HDataException; 12 | 13 | import org.bson.types.ObjectId; 14 | 15 | import com.mongodb.BasicDBObject; 16 | import com.mongodb.DB; 17 | import com.mongodb.DBCollection; 18 | import com.mongodb.DBCursor; 19 | import com.mongodb.DBObject; 20 | import com.mongodb.MongoClient; 21 | import com.mongodb.MongoClientURI; 22 | import com.mongodb.util.JSON; 23 | 24 | public class MongoDBSplitter extends Splitter { 25 | 26 | private static final String OBJECT_ID_KEY = "_id"; 27 | private static final int HEXADECIMAL = 16; 28 | 29 | @Override 30 | public List split(JobConfig jobConfig) { 31 | List list = new ArrayList(); 32 | PluginConfig readerConfig = jobConfig.getReaderConfig(); 33 | String uri = readerConfig.getString(MongoDBReaderProperties.URI); 34 | int parallelism = readerConfig.getParallelism(); 35 | 36 | MongoClientURI clientURI = new MongoClientURI(uri); 37 | MongoClient mongoClient = null; 38 | try { 39 | mongoClient = new MongoClient(clientURI); 40 | DB db = mongoClient.getDB(clientURI.getDatabase()); 41 | DBCollection coll = db.getCollection(clientURI.getCollection()); 42 | 43 | String maxID = ""; 44 | String minID = ""; 45 | DBObject sort = new BasicDBObject(); 46 | sort.put(OBJECT_ID_KEY, -1); 47 | DBCursor cursor = coll.find().sort(sort).limit(1); 48 | while (cursor.hasNext()) { 49 | maxID = cursor.next().get(OBJECT_ID_KEY).toString(); 50 | } 51 | 52 | sort.put(OBJECT_ID_KEY, 1); 53 | cursor = coll.find().sort(sort).limit(1); 54 | while (cursor.hasNext()) { 55 | minID = cursor.next().get(OBJECT_ID_KEY).toString(); 56 | } 57 | 58 | if (!maxID.isEmpty() && !minID.isEmpty()) { 59 | BigInteger maxBigInteger = new BigInteger(maxID, HEXADECIMAL); 60 | BigInteger minBigInteger = new BigInteger(minID, HEXADECIMAL); 61 | BigInteger step = (maxBigInteger.subtract(minBigInteger).divide(BigInteger.valueOf(parallelism))); 62 | for (int i = 0, len = parallelism; i < len; i++) { 63 | BasicDBObject condition = null; 64 | if (readerConfig.containsKey(MongoDBReaderProperties.QUERY)) { 65 | condition = (BasicDBObject) JSON.parse(readerConfig.getString(MongoDBReaderProperties.QUERY)); 66 | } else { 67 | condition = new BasicDBObject(); 68 | } 69 | 70 | BasicDBObject idRange = new BasicDBObject("$gte", new ObjectId(minBigInteger.add(step.multiply(BigInteger.valueOf(i))).toString( 71 | HEXADECIMAL))); 72 | if (i == len - 1) { 73 | idRange.append("$lte", new ObjectId(maxBigInteger.toString(HEXADECIMAL))); 74 | } else { 75 | idRange.append("$lt", new ObjectId(minBigInteger.add(step.multiply(BigInteger.valueOf(i + 1))).toString(HEXADECIMAL))); 76 | } 77 | 78 | condition.put(OBJECT_ID_KEY, idRange); 79 | 80 | PluginConfig pluginConfig = (PluginConfig) readerConfig.clone(); 81 | pluginConfig.put(MongoDBReaderProperties.QUERY, condition); 82 | list.add(pluginConfig); 83 | } 84 | } 85 | } catch (UnknownHostException e) { 86 | 
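// Surface connection failures as the framework's unchecked HDataException.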
throw new HDataException(e); 87 | } finally { 88 | if (mongoClient != null) { 89 | mongoClient.close(); 90 | } 91 | } 92 | 93 | return list; 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/plugin/writer/console/ConsoleWriter.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.plugin.writer.console; 2 | 3 | import opensource.hdata.core.plugin.Record; 4 | import opensource.hdata.core.plugin.Writer; 5 | 6 | public class ConsoleWriter extends Writer { 7 | 8 | @Override 9 | public void execute(Record record) { 10 | System.out.println(record); 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/plugin/writer/ftp/FTPWriter.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.plugin.writer.ftp; 2 | 3 | import java.io.BufferedWriter; 4 | import java.io.IOException; 5 | import java.io.OutputStream; 6 | import java.io.OutputStreamWriter; 7 | import java.util.concurrent.atomic.AtomicInteger; 8 | import java.util.regex.Matcher; 9 | import java.util.regex.Pattern; 10 | import java.util.zip.GZIPOutputStream; 11 | 12 | import opensource.hdata.config.PluginConfig; 13 | import opensource.hdata.core.JobContext; 14 | import opensource.hdata.core.plugin.Record; 15 | import opensource.hdata.core.plugin.Writer; 16 | import opensource.hdata.exception.HDataException; 17 | import opensource.hdata.util.EscaperUtils; 18 | import opensource.hdata.util.FTPUtils; 19 | 20 | import org.apache.commons.net.ftp.FTPClient; 21 | 22 | import com.google.common.base.Joiner; 23 | 24 | public class FTPWriter extends Writer { 25 | 26 | private String host; 27 | private int port; 28 | private String username; 29 | private String password; 30 | private String fieldsSeparator; 31 | private String lineSeparator; 32 | private String encoding; 33 | private String path; 34 | private boolean gzipCompress; 35 | private FTPClient ftpClient; 36 | private BufferedWriter bw; 37 | private String[] strArray; 38 | private static AtomicInteger sequence = new AtomicInteger(0); 39 | private static final Pattern REG_FILE_PATH_WITHOUT_EXTENSION = Pattern.compile(".*?(?=\\.\\w+$)"); 40 | private static final Pattern REG_FILE_EXTENSION = Pattern.compile("(\\.\\w+)$"); 41 | 42 | @Override 43 | public void prepare(JobContext context, PluginConfig writerConfig) { 44 | host = writerConfig.getString(FTPWriterProperties.HOST); 45 | port = writerConfig.getInt(FTPWriterProperties.PORT, 21); 46 | username = writerConfig.getString(FTPWriterProperties.USERNAME, "anonymous"); 47 | password = writerConfig.getString(FTPWriterProperties.PASSWORD, ""); 48 | fieldsSeparator = EscaperUtils.parse(writerConfig.getString(FTPWriterProperties.FIELDS_SEPARATOR, "\t")); 49 | lineSeparator = EscaperUtils.parse(writerConfig.getString(FTPWriterProperties.LINE_SEPARATOR, "\n")); 50 | encoding = writerConfig.getString(FTPWriterProperties.ENCODING, "UTF-8"); 51 | path = writerConfig.getString(FTPWriterProperties.PATH); 52 | gzipCompress = writerConfig.getBoolean(FTPWriterProperties.GZIP_COMPRESS, false); 53 | 54 | int parallelism = writerConfig.getParallelism(); 55 | if (parallelism > 1) { 56 | String filePathWithoutExtension = ""; 57 | String fileExtension = ""; 58 | Matcher m1 = REG_FILE_PATH_WITHOUT_EXTENSION.matcher(path.trim()); 59 | if (m1.find()) { 60 | filePathWithoutExtension = m1.group(); 
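// Capture the path without its extension so a per-writer sequence number can be appended before the extension below.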
61 | } 62 | 63 | Matcher m2 = REG_FILE_EXTENSION.matcher(path.trim()); 64 | if (m2.find()) { 65 | fileExtension = m2.group(); 66 | } 67 | path = String.format("%s_%04d%s", filePathWithoutExtension, sequence.getAndIncrement(), fileExtension); 68 | } 69 | 70 | try { 71 | ftpClient = FTPUtils.getFtpClient(host, port, username, password); 72 | OutputStream outputStream = ftpClient.storeFileStream(path); 73 | if (gzipCompress) { 74 | bw = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(outputStream), encoding)); 75 | } else { 76 | bw = new BufferedWriter(new OutputStreamWriter(outputStream, encoding)); 77 | } 78 | } catch (Exception e) { 79 | throw new HDataException(e); 80 | } 81 | } 82 | 83 | @Override 84 | public void execute(Record record) { 85 | if (strArray == null) { 86 | strArray = new String[record.getFieldsCount()]; 87 | } 88 | 89 | for (int i = 0, len = record.getFieldsCount(); i < len; i++) { 90 | Object o = record.getField(i); 91 | if (o == null) { 92 | strArray[i] = "NULL"; 93 | } else { 94 | strArray[i] = o.toString(); 95 | } 96 | } 97 | try { 98 | bw.write(Joiner.on(fieldsSeparator).join(strArray)); 99 | bw.write(lineSeparator); 100 | } catch (IOException e) { 101 | throw new HDataException(e); 102 | } 103 | } 104 | 105 | @Override 106 | public void close() { 107 | if (bw != null) { 108 | try { 109 | bw.close(); 110 | } catch (IOException e) { 111 | throw new HDataException(e); 112 | } 113 | } 114 | FTPUtils.closeFtpClient(ftpClient); 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/plugin/writer/ftp/FTPWriterProperties.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.plugin.writer.ftp; 2 | 3 | public class FTPWriterProperties { 4 | public static final String HOST = "host"; 5 | public static final String PORT = "port"; 6 | public static final String USERNAME = "username"; 7 | public static final String PASSWORD = "password"; 8 | public static final String PATH = "path"; 9 | public static final String ENCODING = "encoding"; 10 | public static final String FIELDS_SEPARATOR = "fieldsSeparator"; 11 | public static final String LINE_SEPARATOR = "lineSeparator"; 12 | public static final String GZIP_COMPRESS = "gzipCompress"; 13 | } 14 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/plugin/writer/hbase/HBaseWriter.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.plugin.writer.hbase; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | 7 | import opensource.hdata.config.PluginConfig; 8 | import opensource.hdata.core.JobContext; 9 | import opensource.hdata.core.plugin.Record; 10 | import opensource.hdata.core.plugin.Writer; 11 | import opensource.hdata.exception.HDataException; 12 | 13 | import org.apache.hadoop.conf.Configuration; 14 | import org.apache.hadoop.hbase.HBaseConfiguration; 15 | import org.apache.hadoop.hbase.client.HTable; 16 | import org.apache.hadoop.hbase.client.Put; 17 | import org.apache.hadoop.hbase.util.Bytes; 18 | 19 | public class HBaseWriter extends Writer { 20 | 21 | private HTable table; 22 | private int batchSize; 23 | private int rowkeyIndex = -1; 24 | private List putList = new ArrayList(); 25 | private String[] columns; 26 | private static final String ROWKEY = ":rowkey"; 27 | 28 | @Override 29 | public void 
prepare(JobContext context, PluginConfig writerConfig) { 30 | Configuration conf = HBaseConfiguration.create(); 31 | conf.set("hbase.zookeeper.quorum", writerConfig.getString(HBaseWriterProperties.ZOOKEEPER_QUORUM)); 32 | conf.set("hbase.zookeeper.property.clientPort", writerConfig.getString(HBaseWriterProperties.ZOOKEEPER_PROPERTY_CLIENTPORT, "2181")); 33 | batchSize = writerConfig.getInt(HBaseWriterProperties.BATCH_INSERT_SIZE, 10000); 34 | columns = writerConfig.getString(HBaseWriterProperties.COLUMNS).split(","); 35 | for (int i = 0, len = columns.length; i < len; i++) { 36 | if (ROWKEY.equalsIgnoreCase(columns[i])) { 37 | rowkeyIndex = i; 38 | break; 39 | } 40 | } 41 | 42 | if (rowkeyIndex == -1) { 43 | throw new IllegalArgumentException("Can not find :rowkey in columnsMapping of HBase Writer!"); 44 | } 45 | 46 | try { 47 | table = new HTable(conf, writerConfig.getString(HBaseWriterProperties.TABLE)); 48 | } catch (IOException e) { 49 | throw new HDataException(e); 50 | } 51 | } 52 | 53 | @Override 54 | public void execute(Record record) { 55 | Object rowkeyValue = record.getField(rowkeyIndex); 56 | Put put = new Put(Bytes.toBytes(rowkeyValue == null ? "NULL" : rowkeyValue.toString())); 57 | for (int i = 0, len = record.getFieldsCount(); i < len; i++) { 58 | if (i != rowkeyIndex) { 59 | String[] tokens = columns[i].split(":"); 60 | put.add(Bytes.toBytes(tokens[0]), Bytes.toBytes(tokens[1]), 61 | record.getField(i) == null ? null : Bytes.toBytes(record.getField(i).toString())); 62 | } 63 | } 64 | 65 | putList.add(put); 66 | if (putList.size() == batchSize) { 67 | try { 68 | table.put(putList); 69 | } catch (IOException e) { 70 | throw new HDataException(e); 71 | } 72 | putList.clear(); 73 | } 74 | } 75 | 76 | @Override 77 | public void close() { 78 | if (table != null) { 79 | try { 80 | if (putList.size() > 0) { 81 | table.put(putList); 82 | } 83 | 84 | table.close(); 85 | } catch (IOException e) { 86 | throw new HDataException(e); 87 | } 88 | putList.clear(); 89 | } 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/plugin/writer/hbase/HBaseWriterProperties.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.plugin.writer.hbase; 2 | 3 | public class HBaseWriterProperties { 4 | public static final String ZOOKEEPER_QUORUM = "zookeeperQuorum"; 5 | public static final String ZOOKEEPER_PROPERTY_CLIENTPORT = "zookeeperClientPort"; 6 | public static final String TABLE = "table"; 7 | public static final String COLUMNS = "columns"; 8 | public static final String BATCH_INSERT_SIZE = "batchInsertSize"; 9 | } 10 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/plugin/writer/hdfs/HDFSWriter.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.plugin.writer.hdfs; 2 | 3 | import java.io.BufferedWriter; 4 | import java.io.IOException; 5 | import java.io.OutputStreamWriter; 6 | import java.util.concurrent.atomic.AtomicInteger; 7 | import java.util.regex.Matcher; 8 | import java.util.regex.Pattern; 9 | 10 | import opensource.hdata.config.PluginConfig; 11 | import opensource.hdata.core.JobContext; 12 | import opensource.hdata.core.plugin.Record; 13 | import opensource.hdata.core.plugin.Writer; 14 | import opensource.hdata.exception.HDataException; 15 | import opensource.hdata.util.EscaperUtils; 16 | 17 | import 
org.apache.hadoop.conf.Configuration; 18 | import org.apache.hadoop.fs.FSDataOutputStream; 19 | import org.apache.hadoop.fs.FileSystem; 20 | import org.apache.hadoop.fs.Path; 21 | import org.apache.hadoop.io.compress.CompressionCodec; 22 | import org.apache.hadoop.io.compress.CompressionCodecFactory; 23 | 24 | import com.google.common.base.Joiner; 25 | 26 | public class HDFSWriter extends Writer { 27 | 28 | private String path; 29 | private String fieldsSeparator; 30 | private String lineSeparator; 31 | private String encoding; 32 | private String compressCodec; 33 | private String hadoopUser; 34 | private BufferedWriter bw; 35 | private String[] strArray; 36 | private static AtomicInteger sequence = new AtomicInteger(0); 37 | private static final Pattern REG_FILE_PATH_WITHOUT_EXTENSION = Pattern.compile(".*?(?=\\.\\w+$)"); 38 | private static final Pattern REG_FILE_EXTENSION = Pattern.compile("(\\.\\w+)$"); 39 | 40 | @Override 41 | public void prepare(JobContext context, PluginConfig writerConfig) { 42 | path = writerConfig.getString(HDFSWriterProperties.PATH); 43 | fieldsSeparator = EscaperUtils.parse(writerConfig.getString(HDFSWriterProperties.FIELDS_SEPARATOR, "\t")); 44 | lineSeparator = EscaperUtils.parse(writerConfig.getString(HDFSWriterProperties.LINE_SEPARATOR, "\n")); 45 | encoding = writerConfig.getString(HDFSWriterProperties.ENCODING, "UTF-8"); 46 | compressCodec = writerConfig.getProperty(HDFSWriterProperties.COMPRESS_CODEC); 47 | hadoopUser = writerConfig.getString(HDFSWriterProperties.HADOOP_USER); 48 | System.setProperty("HADOOP_USER_NAME", hadoopUser); 49 | 50 | int parallelism = writerConfig.getParallelism(); 51 | if (parallelism > 1) { 52 | String filePathWithoutExtension = ""; 53 | String fileExtension = ""; 54 | Matcher m1 = REG_FILE_PATH_WITHOUT_EXTENSION.matcher(path.trim()); 55 | if (m1.find()) { 56 | filePathWithoutExtension = m1.group(); 57 | } 58 | 59 | Matcher m2 = REG_FILE_EXTENSION.matcher(path.trim()); 60 | if (m2.find()) { 61 | fileExtension = m2.group(); 62 | } 63 | path = String.format("%s_%04d%s", filePathWithoutExtension, sequence.getAndIncrement(), fileExtension); 64 | } 65 | 66 | Path hdfsPath = new Path(path); 67 | Configuration conf = new Configuration(); 68 | try { 69 | FileSystem fs = hdfsPath.getFileSystem(conf); 70 | FSDataOutputStream output = fs.create(hdfsPath); 71 | if (compressCodec == null) { 72 | bw = new BufferedWriter(new OutputStreamWriter(output, encoding)); 73 | } else { 74 | CompressionCodecFactory factory = new CompressionCodecFactory(conf); 75 | CompressionCodec codec = factory.getCodecByClassName(compressCodec); 76 | bw = new BufferedWriter(new OutputStreamWriter(codec.createOutputStream(output), encoding)); 77 | } 78 | } catch (IOException e) { 79 | throw new HDataException(e); 80 | } 81 | 82 | } 83 | 84 | @Override 85 | public void execute(Record record) { 86 | if (strArray == null) { 87 | strArray = new String[record.getFieldsCount()]; 88 | } 89 | 90 | for (int i = 0, len = record.getFieldsCount(); i < len; i++) { 91 | Object o = record.getField(i); 92 | if (o == null) { 93 | strArray[i] = "NULL"; 94 | } else { 95 | strArray[i] = o.toString(); 96 | } 97 | } 98 | try { 99 | bw.write(Joiner.on(fieldsSeparator).join(strArray)); 100 | bw.write(lineSeparator); 101 | } catch (IOException e) { 102 | throw new HDataException(e); 103 | } 104 | } 105 | 106 | @Override 107 | public void close() { 108 | if (bw != null) { 109 | try { 110 | bw.flush(); 111 | bw.close(); 112 | } catch (IOException e) { 113 | throw new HDataException(e); 114 | 
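/*
 * Worked example of the parallelism handling in prepare() above: with parallelism > 1 and
 * path=/user/hdata/output/result.txt, each writer instance rewrites its own path to
 * .../result_0000.txt, .../result_0001.txt, ... via the shared AtomicInteger, so concurrent
 * writers never open the same HDFS file. A path without a file extension matches neither
 * regular expression, so the suffixing only works as intended when the configured path ends
 * in ".<extension>".
 */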
} 115 | } 116 | } 117 | } 118 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/plugin/writer/hdfs/HDFSWriterProperties.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.plugin.writer.hdfs; 2 | 3 | public class HDFSWriterProperties { 4 | public static final String PATH = "path"; 5 | public static final String FIELDS_SEPARATOR = "fieldsSeparator"; 6 | public static final String LINE_SEPARATOR = "lineSeparator"; 7 | public static final String ENCODING = "encoding"; 8 | public static final String COMPRESS_CODEC = "compressCodec"; 9 | public static final String HADOOP_USER = "hadoopUser"; 10 | } 11 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/plugin/writer/hive/HiveRecordWritable.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.plugin.writer.hive; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.Writable; 8 | 9 | public class HiveRecordWritable implements Writable { 10 | 11 | public void write(DataOutput dataOutput) throws IOException { 12 | throw new UnsupportedOperationException("no write"); 13 | } 14 | 15 | public void readFields(DataInput dataInput) throws IOException { 16 | throw new UnsupportedOperationException("no read"); 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/plugin/writer/hive/HiveWriter.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.plugin.writer.hive; 2 | 3 | import java.io.IOException; 4 | import java.lang.reflect.Field; 5 | import java.util.ArrayList; 6 | import java.util.HashMap; 7 | import java.util.List; 8 | import java.util.Map; 9 | import java.util.UUID; 10 | import java.util.regex.Matcher; 11 | import java.util.regex.Pattern; 12 | 13 | import javassist.ClassPool; 14 | import javassist.CtClass; 15 | import javassist.CtField; 16 | import opensource.hdata.config.PluginConfig; 17 | import opensource.hdata.core.JobContext; 18 | import opensource.hdata.core.plugin.Record; 19 | import opensource.hdata.core.plugin.Writer; 20 | import opensource.hdata.exception.HDataException; 21 | import opensource.hdata.plugin.reader.hive.HiveReaderProperties; 22 | import opensource.hdata.util.HiveTypeUtils; 23 | import opensource.hdata.util.LoggerUtils; 24 | import opensource.hdata.util.TypeConvertUtils; 25 | import opensource.hdata.util.Utils; 26 | 27 | import org.apache.hadoop.fs.FileSystem; 28 | import org.apache.hadoop.fs.Path; 29 | import org.apache.hadoop.hive.conf.HiveConf; 30 | import org.apache.hadoop.hive.conf.HiveConf.ConfVars; 31 | import org.apache.hadoop.hive.metastore.api.FieldSchema; 32 | import org.apache.hadoop.hive.ql.exec.FileSinkOperator; 33 | import org.apache.hadoop.hive.ql.io.HiveOutputFormat; 34 | import org.apache.hadoop.hive.ql.metadata.Hive; 35 | import org.apache.hadoop.hive.ql.metadata.HiveException; 36 | import org.apache.hadoop.hive.ql.metadata.Partition; 37 | import org.apache.hadoop.hive.ql.metadata.Table; 38 | import org.apache.hadoop.hive.serde2.Serializer; 39 | import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; 40 | import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; 41 | import 
org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector; 42 | import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; 43 | import org.apache.hadoop.mapred.JobConf; 44 | import org.apache.hadoop.mapred.Reporter; 45 | import org.apache.logging.log4j.LogManager; 46 | import org.apache.logging.log4j.Logger; 47 | 48 | @SuppressWarnings("deprecation") 49 | public class HiveWriter extends Writer { 50 | 51 | private Serializer serializer; 52 | private HiveOutputFormat outputFormat; 53 | private StructObjectInspector inspector; 54 | private FileSinkOperator.RecordWriter writer; 55 | private Path path = null; 56 | private Map partitionSpecify = new HashMap(); 57 | private int partitionKeySize; 58 | private PluginConfig writerConfig; 59 | private Object hiveRecord; 60 | private String hdfsTmpDir; 61 | 62 | private static Class hiveRecordWritale; 63 | private static List classFields = new ArrayList(); 64 | private static List files = new ArrayList(); 65 | private static final Pattern HDFS_MASTER = Pattern.compile("hdfs://[\\w\\.]+:\\d+"); 66 | private static final Logger LOG = LogManager.getLogger(HiveWriter.class); 67 | 68 | private synchronized static void createHiveRecordClass(List columns) { 69 | if (hiveRecordWritale == null) { 70 | ClassPool pool = ClassPool.getDefault(); 71 | try { 72 | CtClass ctClass = pool.get("opensource.hdata.plugin.writer.hive.HiveRecordWritable"); 73 | for (FieldSchema fieldSchema : columns) { 74 | PrimitiveCategory primitiveCategory = HiveTypeUtils.getPrimitiveCategory(fieldSchema.getType().replaceAll("\\(.*\\)", "") 75 | .toUpperCase()); 76 | Class fieldTypeClazz = PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(primitiveCategory) 77 | .getJavaPrimitiveClass(); 78 | CtField ctField = new CtField(pool.get(fieldTypeClazz.getName()), fieldSchema.getName(), ctClass); 79 | ctClass.addField(ctField); 80 | } 81 | hiveRecordWritale = ctClass.toClass(); 82 | for (Field field : hiveRecordWritale.getDeclaredFields()) { 83 | field.setAccessible(true); 84 | classFields.add(field); 85 | } 86 | } catch (Exception e) { 87 | throw new HDataException(e); 88 | } 89 | } 90 | } 91 | 92 | @Override 93 | public void prepare(JobContext context, PluginConfig writerConfig) { 94 | hdfsTmpDir = context.getEngineConfig().getString("hdata.hive.writer.tmp.dir", "/tmp"); 95 | this.writerConfig = writerConfig; 96 | String metastoreUris = writerConfig.getString(HiveWriterProperties.METASTORE_URIS); 97 | String dbName = writerConfig.getString(HiveWriterProperties.DATABASE, "default"); 98 | String tableName = writerConfig.getString(HiveWriterProperties.TABLE); 99 | boolean isCompress = writerConfig.getBoolean(HiveWriterProperties.COMPRESS, true); 100 | 101 | System.setProperty("HADOOP_USER_NAME", writerConfig.getString(HiveWriterProperties.HADOOP_USER)); 102 | 103 | HiveConf conf = new HiveConf(); 104 | conf.set(ConfVars.METASTOREURIS.varname, metastoreUris); 105 | 106 | Hive hive; 107 | Table table; 108 | try { 109 | hive = Hive.get(conf, true); 110 | table = hive.getTable(dbName, tableName, false); 111 | 112 | partitionKeySize = table.getPartitionKeys().size(); 113 | serializer = (Serializer) table.getDeserializer(); 114 | outputFormat = (HiveOutputFormat) table.getOutputFormatClass().newInstance(); 115 | if (writerConfig.containsKey(HiveReaderProperties.PARTITIONS)) { 116 | String partitions = writerConfig.getString(HiveReaderProperties.PARTITIONS); 117 | String[] partKVs = partitions.split("\\s*,\\s*"); 118 | for (String kv : 
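/*
 * Example of the partition handling below: partitions="ds=2014-05-01,city=bj" is split into
 * ["ds=2014-05-01", "city=bj"] and collected into partitionSpecify as {ds=2014-05-01, city=bj};
 * any token that does not contain exactly one '=' is silently skipped by the
 * tokens.length == 2 check.
 */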
partKVs) { 119 | String[] tokens = kv.split("="); 120 | if (tokens.length == 2) { 121 | partitionSpecify.put(tokens[0], tokens[1]); 122 | } 123 | } 124 | } else if (partitionKeySize > 0) { 125 | throw new HDataException(String.format("Table %s.%s is partition table, but partition config is not given.", dbName, tableName)); 126 | } 127 | 128 | createHiveRecordClass(table.getCols()); 129 | hiveRecord = hiveRecordWritale.newInstance(); 130 | 131 | String tableLocation = Utils.fixLocaltion(table.getDataLocation().toString(), metastoreUris); 132 | Matcher m = HDFS_MASTER.matcher(tableLocation); 133 | if (m.find()) { 134 | path = new Path(String.format("%s/%s/%s-%s.tmp", m.group(), hdfsTmpDir, tableName, UUID.randomUUID().toString().replaceAll("-", ""))); 135 | files.add(path); 136 | } 137 | 138 | inspector = (StructObjectInspector) ObjectInspectorFactory.getReflectionObjectInspector(HiveRecordWritable.class, 139 | ObjectInspectorFactory.ObjectInspectorOptions.JAVA); 140 | JobConf jobConf = new JobConf(); 141 | writer = outputFormat.getHiveRecordWriter(jobConf, path, HiveRecordWritable.class, isCompress, table.getMetadata(), Reporter.NULL); 142 | } catch (Exception e) { 143 | throw new HDataException(e); 144 | } finally { 145 | Hive.closeCurrent(); 146 | } 147 | } 148 | 149 | @Override 150 | public void execute(Record record) { 151 | try { 152 | for (int i = 0, len = record.getFieldsCount() - partitionKeySize; i < len; i++) { 153 | classFields.get(i).set(hiveRecord, TypeConvertUtils.convert(record.getField(i), classFields.get(i).getType())); 154 | } 155 | writer.write(serializer.serialize(hiveRecord, inspector)); 156 | } catch (Exception e) { 157 | throw new HDataException(e); 158 | } 159 | } 160 | 161 | private synchronized static Partition createPartition(Hive hive, Table table, Map partSpec) { 162 | Partition p = null; 163 | try { 164 | p = hive.getPartition(table, partSpec, false); 165 | if (p == null) { 166 | p = hive.getPartition(table, partSpec, true); 167 | } 168 | } catch (HiveException e) { 169 | throw new HDataException(e); 170 | } 171 | return p; 172 | } 173 | 174 | @Override 175 | public void close() { 176 | if (writer != null) { 177 | try { 178 | writer.close(true); 179 | 180 | String metastoreUris = writerConfig.getString(HiveWriterProperties.METASTORE_URIS); 181 | String dbName = writerConfig.getString(HiveWriterProperties.DATABASE, "default"); 182 | String tableName = writerConfig.getString(HiveWriterProperties.TABLE); 183 | HiveConf conf = new HiveConf(); 184 | conf.set(ConfVars.METASTOREURIS.varname, metastoreUris); 185 | Path renamedPath = new Path(path.toString().replaceFirst("\\.tmp$", "")); 186 | FileSystem fs = renamedPath.getFileSystem(conf); 187 | fs.rename(path, renamedPath); 188 | 189 | Hive hive; 190 | try { 191 | hive = Hive.get(conf, true); 192 | if (partitionKeySize == 0) { 193 | LOG.info("Loading data {} into table {}.{}", renamedPath.toString(), dbName, tableName); 194 | hive.loadTable(renamedPath, dbName + "." + tableName, false, false); 195 | } else { 196 | Table table = hive.getTable(dbName, tableName, false); 197 | Partition p = createPartition(hive, table, partitionSpecify); 198 | LOG.info("Loading data {} into table {}.{} partition({})", renamedPath.toString(), dbName, tableName, p.getName()); 199 | hive.loadPartition(renamedPath, dbName + "." 
+ tableName, partitionSpecify, false, false, true, false); 200 | } 201 | } catch (Exception e) { 202 | throw new HDataException(e); 203 | } finally { 204 | Hive.closeCurrent(); 205 | } 206 | } catch (IOException e) { 207 | LoggerUtils.error(LOG, e); 208 | } 209 | } 210 | } 211 | } 212 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/plugin/writer/hive/HiveWriterProperties.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.plugin.writer.hive; 2 | 3 | public class HiveWriterProperties { 4 | 5 | public static final String METASTORE_URIS = "metastoreUris"; 6 | public static final String DATABASE = "database"; 7 | public static final String TABLE = "table"; 8 | public static final String PARTITIONS = "partitions"; 9 | public static final String COMPRESS = "compress"; 10 | public static final String HADOOP_USER = "hadoopUser"; 11 | } 12 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/plugin/writer/jdbc/JBDCWriterProperties.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.plugin.writer.jdbc; 2 | 3 | public class JBDCWriterProperties { 4 | 5 | public static final String DRIVER = "driver"; 6 | public static final String URL = "url"; 7 | public static final String USERNAME = "username"; 8 | public static final String PASSWORD = "password"; 9 | public static final String TABLE = "table"; 10 | public static final String BATCH_INSERT_SIZE = "batchInsertSize"; 11 | public static final String PARALLELISM = "parallelism"; 12 | 13 | } 14 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/plugin/writer/jdbc/JDBCWriter.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.plugin.writer.jdbc; 2 | 3 | import java.sql.Connection; 4 | import java.sql.PreparedStatement; 5 | import java.sql.SQLException; 6 | import java.sql.Timestamp; 7 | import java.sql.Types; 8 | import java.text.SimpleDateFormat; 9 | import java.util.Arrays; 10 | import java.util.Map; 11 | 12 | import opensource.hdata.common.Constants; 13 | import opensource.hdata.config.PluginConfig; 14 | import opensource.hdata.core.Fields; 15 | import opensource.hdata.core.JobContext; 16 | import opensource.hdata.core.plugin.Record; 17 | import opensource.hdata.core.plugin.Writer; 18 | import opensource.hdata.exception.HDataException; 19 | import opensource.hdata.util.JDBCUtils; 20 | 21 | import org.apache.logging.log4j.LogManager; 22 | import org.apache.logging.log4j.Logger; 23 | 24 | import com.google.common.base.Joiner; 25 | 26 | public class JDBCWriter extends Writer { 27 | 28 | private Connection connection = null; 29 | private PreparedStatement statement = null; 30 | private int count; 31 | private int batchInsertSize; 32 | private Fields columns; 33 | private String table; 34 | private Map columnTypes; 35 | private final SimpleDateFormat DATE_FORMAT = new SimpleDateFormat(Constants.DATE_FORMAT_STRING); 36 | private final int DEFAULT_BATCH_INSERT_SIZE = 10000; 37 | private static final Logger LOG = LogManager.getLogger(JDBCWriter.class); 38 | 39 | @Override 40 | public void prepare(JobContext context, PluginConfig writerConfig) { 41 | columns = context.getFields(); 42 | String driver = writerConfig.getString(JBDCWriterProperties.DRIVER); 43 | String url = 
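/*
 * Illustrative sketch only: with fields (id, name) and table=t_user, prepare() below builds
 * "INSERT INTO t_user(id, name) VALUES(?, ?)"; when the job provides no field list, the
 * statement is instead created lazily in execute() as "INSERT INTO t_user VALUES(?, ?)".
 */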
writerConfig.getString(JBDCWriterProperties.URL); 44 | String username = writerConfig.getString(JBDCWriterProperties.USERNAME); 45 | String password = writerConfig.getString(JBDCWriterProperties.PASSWORD); 46 | String table = writerConfig.getString(JBDCWriterProperties.TABLE); 47 | this.table = table; 48 | batchInsertSize = writerConfig.getInt(JBDCWriterProperties.BATCH_INSERT_SIZE, DEFAULT_BATCH_INSERT_SIZE); 49 | if (batchInsertSize < 1) { 50 | batchInsertSize = DEFAULT_BATCH_INSERT_SIZE; 51 | } 52 | 53 | try { 54 | connection = JDBCUtils.getConnection(driver, url, username, password); 55 | connection.setAutoCommit(false); 56 | columnTypes = JDBCUtils.getColumnTypes(connection, table); 57 | 58 | String sql = null; 59 | if (columns != null) { 60 | String[] placeholder = new String[columns.size()]; 61 | Arrays.fill(placeholder, "?"); 62 | sql = String.format("INSERT INTO %s(%s) VALUES(%s)", table, Joiner.on(", ").join(columns), Joiner.on(", ").join(placeholder)); 63 | LOG.debug(sql); 64 | statement = connection.prepareStatement(sql); 65 | } 66 | } catch (Exception e) { 67 | JDBCUtils.closeConnection(connection); 68 | throw new HDataException("Writer prepare failed.", e); 69 | } 70 | } 71 | 72 | @Override 73 | public void execute(Record record) { 74 | try { 75 | if (statement == null) { 76 | String[] placeholder = new String[record.getFieldsCount()]; 77 | Arrays.fill(placeholder, "?"); 78 | String sql = String.format("INSERT INTO %s VALUES(%s)", table, Joiner.on(", ").join(placeholder)); 79 | LOG.debug(sql); 80 | statement = connection.prepareStatement(sql); 81 | } 82 | 83 | for (int i = 0, len = record.getFieldsCount(); i < len; i++) { 84 | if (record.getField(i) instanceof Timestamp 85 | && !Integer.valueOf(Types.TIMESTAMP).equals(columnTypes.get(columns.get(i).toLowerCase()))) { 86 | statement.setObject(i + 1, DATE_FORMAT.format(record.getField(i))); 87 | } else { 88 | statement.setObject(i + 1, record.getField(i)); 89 | } 90 | } 91 | 92 | count++; 93 | statement.addBatch(); 94 | 95 | if (count % batchInsertSize == 0) { 96 | count = 0; 97 | statement.executeBatch(); 98 | connection.commit(); 99 | } 100 | } catch (SQLException e) { 101 | close(); 102 | throw new HDataException("Writer execute failed.", e); 103 | } 104 | } 105 | 106 | @Override 107 | public void close() { 108 | try { 109 | if (count > 0) { 110 | statement.executeBatch(); 111 | connection.commit(); 112 | } 113 | 114 | if (statement != null) { 115 | statement.close(); 116 | } 117 | 118 | } catch (SQLException e) { 119 | throw new HDataException(e); 120 | } finally { 121 | JDBCUtils.closeConnection(connection); 122 | } 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/plugin/writer/mongodb/MongoDBWriter.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.plugin.writer.mongodb; 2 | 3 | import java.net.UnknownHostException; 4 | 5 | import opensource.hdata.config.PluginConfig; 6 | import opensource.hdata.core.Fields; 7 | import opensource.hdata.core.JobContext; 8 | import opensource.hdata.core.plugin.Record; 9 | import opensource.hdata.core.plugin.Writer; 10 | import opensource.hdata.exception.HDataException; 11 | 12 | import org.apache.commons.lang3.ArrayUtils; 13 | 14 | import com.mongodb.BasicDBObject; 15 | import com.mongodb.DB; 16 | import com.mongodb.DBCollection; 17 | import com.mongodb.MongoClient; 18 | import com.mongodb.MongoClientURI; 19 | 20 | public class MongoDBWriter 
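/*
 * Illustrative sketch only: the "uri" property must name both the database and the collection,
 * e.g. mongodb://127.0.0.1:27017/mydb.mycollection, because prepare() resolves them through
 * MongoClientURI.getDatabase() and MongoClientURI.getCollection().
 */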
extends Writer { 21 | 22 | private Fields fields; 23 | private MongoClient mongoClient = null; 24 | private DBCollection coll; 25 | private BasicDBObject[] insertDocs; 26 | private int batchsize; 27 | private int count; 28 | 29 | @Override 30 | public void prepare(JobContext context, PluginConfig writerConfig) { 31 | fields = context.getFields(); 32 | batchsize = writerConfig.getInt(MongoDBWriterProperties.BATCH_INSERT_SIZE, 1000); 33 | insertDocs = new BasicDBObject[batchsize]; 34 | MongoClientURI clientURI = new MongoClientURI(writerConfig.getString(MongoDBWriterProperties.URI)); 35 | try { 36 | mongoClient = new MongoClient(clientURI); 37 | DB db = mongoClient.getDB(clientURI.getDatabase()); 38 | coll = db.getCollection(clientURI.getCollection()); 39 | } catch (UnknownHostException e) { 40 | throw new HDataException(e); 41 | } 42 | } 43 | 44 | @Override 45 | public void execute(Record record) { 46 | BasicDBObject doc = new BasicDBObject(); 47 | for (int i = 0, len = fields.size(); i < len; i++) { 48 | doc.put(fields.get(i), record.getField(i)); 49 | } 50 | 51 | insertDocs[count++] = doc; 52 | if (count == batchsize) { 53 | coll.insert(insertDocs); 54 | count = 0; 55 | } 56 | } 57 | 58 | @Override 59 | public void close() { 60 | if (mongoClient != null) { 61 | if (count > 0) { 62 | coll.insert(ArrayUtils.subarray(insertDocs, 0, count)); 63 | } 64 | mongoClient.close(); 65 | } 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/plugin/writer/mongodb/MongoDBWriterProperties.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.plugin.writer.mongodb; 2 | 3 | public class MongoDBWriterProperties { 4 | public static final String URI = "uri"; 5 | public static final String BATCH_INSERT_SIZE = "batchInsertSize"; 6 | } 7 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/tool/SQLExecuteTool.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.tool; 2 | 3 | import java.sql.Connection; 4 | import java.sql.Statement; 5 | 6 | import opensource.hdata.exception.HDataException; 7 | import opensource.hdata.util.JDBCUtils; 8 | 9 | import org.apache.commons.cli.CommandLine; 10 | import org.apache.commons.cli.CommandLineParser; 11 | import org.apache.commons.cli.HelpFormatter; 12 | import org.apache.commons.cli.Options; 13 | import org.apache.commons.cli.ParseException; 14 | import org.apache.commons.cli.PosixParser; 15 | import org.apache.logging.log4j.LogManager; 16 | import org.apache.logging.log4j.Logger; 17 | 18 | public class SQLExecuteTool { 19 | 20 | private static final String JDBC_DRIVER = "jdbc-driver"; 21 | private static final String JDBC_URL = "jdbc-url"; 22 | private static final String JDBC_USERNAME = "jdbc-username"; 23 | private static final String JDBC_PASSWORD = "jdbc-password"; 24 | private static final String SQL = "sql"; 25 | private static final Logger LOG = LogManager.getLogger(SQLExecuteTool.class); 26 | 27 | public Options createOptions() { 28 | Options options = new Options(); 29 | options.addOption(null, JDBC_DRIVER, true, "jdbc driver class name"); 30 | options.addOption(null, JDBC_URL, true, "jdbc url, e.g., jdbc:mysql://localhost:3306/database"); 31 | options.addOption(null, JDBC_USERNAME, true, "jdbc username"); 32 | options.addOption(null, JDBC_PASSWORD, true, "jdbc password"); 33 | options.addOption(null, 
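/*
 * Illustrative invocation of this tool (all values are placeholders):
 *
 *   --jdbc-driver com.mysql.jdbc.Driver
 *   --jdbc-url jdbc:mysql://localhost:3306/test
 *   --jdbc-username root
 *   --jdbc-password secret
 *   --sql "TRUNCATE TABLE t_result"
 */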
SQL, true, "sql"); 34 | return options; 35 | } 36 | 37 | public void printHelp(Options options) { 38 | HelpFormatter formatter = new HelpFormatter(); 39 | formatter.printHelp(" ", options); 40 | } 41 | 42 | public static void main(String[] args) { 43 | SQLExecuteTool tool = new SQLExecuteTool(); 44 | Options options = tool.createOptions(); 45 | if (args.length < 1) { 46 | tool.printHelp(options); 47 | System.exit(-1); 48 | } 49 | 50 | CommandLineParser parser = new PosixParser(); 51 | CommandLine cmd = null; 52 | Connection conn = null; 53 | try { 54 | cmd = parser.parse(options, args); 55 | String driver = cmd.getOptionValue(JDBC_DRIVER); 56 | String url = cmd.getOptionValue(JDBC_URL); 57 | String username = cmd.getOptionValue(JDBC_USERNAME); 58 | String password = cmd.getOptionValue(JDBC_PASSWORD); 59 | String sql = cmd.getOptionValue(SQL); 60 | conn = JDBCUtils.getConnection(driver, url, username, password); 61 | Statement statement = conn.createStatement(); 62 | 63 | LOG.info("Executing sql: {}", sql); 64 | statement.execute(sql); 65 | LOG.info("Execute successfully."); 66 | } catch (ParseException e) { 67 | tool.printHelp(options); 68 | System.exit(-1); 69 | } catch (Exception e) { 70 | throw new HDataException(e); 71 | } finally { 72 | JDBCUtils.closeConnection(conn); 73 | } 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/util/EscaperUtils.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.util; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | 6 | public class EscaperUtils { 7 | private static Map map = null; 8 | private static final char CHAR_SLASH = '\\'; 9 | 10 | /** 11 | * 特殊字符转义 12 | * 13 | * @param input 14 | * @return 15 | */ 16 | public static String parse(String input) { 17 | int cursor = 0; 18 | int index = input.indexOf(CHAR_SLASH, cursor); 19 | 20 | if (index < 0) { 21 | return input; 22 | } 23 | 24 | StringBuilder sb = new StringBuilder(); 25 | int len = input.length(); 26 | while ((index = input.indexOf('\\', cursor)) != -1) { 27 | if (index < len - 1) { 28 | if (map.containsKey(input.charAt(index + 1))) { 29 | sb.append(input.substring(cursor, index)); 30 | sb.append(map.get(input.charAt(index + 1))); 31 | } else { 32 | sb.append(input.substring(cursor, index + 2)); 33 | } 34 | cursor = index + 2; 35 | } else { 36 | break; 37 | } 38 | } 39 | sb.append(input.substring(cursor)); 40 | 41 | return sb.toString(); 42 | } 43 | 44 | static { 45 | map = new HashMap(); 46 | map.put('b', '\b'); 47 | map.put('t', '\t'); 48 | map.put('n', '\n'); 49 | map.put('f', '\f'); 50 | map.put('r', '\r'); 51 | map.put('"', '\"'); 52 | map.put('\'', '\''); 53 | map.put('\\', '\\'); 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/util/FTPUtils.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.util; 2 | 3 | import java.io.IOException; 4 | import java.net.SocketException; 5 | import java.util.List; 6 | import java.util.regex.Pattern; 7 | 8 | import org.apache.commons.net.ftp.FTP; 9 | import org.apache.commons.net.ftp.FTPClient; 10 | import org.apache.commons.net.ftp.FTPFile; 11 | import org.apache.commons.net.ftp.FTPReply; 12 | 13 | public class FTPUtils { 14 | 15 | public static FTPClient getFtpClient(String host, int port, String username, String password) throws SocketException, 
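/*
 * Illustrative usage of these helpers (host, credentials and paths are placeholders):
 *
 *   FTPClient client = FTPUtils.getFtpClient("ftp.example.com", 21, "user", "pass");
 *   List<String> files = new ArrayList<String>();
 *   FTPUtils.listFile(files, client, "/data", ".*\\.csv", true);
 *   FTPUtils.closeFtpClient(client);
 */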
IOException { 16 | String LOCAL_CHARSET = "GB18030"; 17 | FTPClient ftpClient = new FTPClient(); 18 | ftpClient.connect(host, port); 19 | // 检测服务器是否支持UTF-8编码,如果支持就用UTF-8编码,否则就使用本地编码GB18030 20 | if (FTPReply.isPositiveCompletion(ftpClient.sendCommand("OPTS UTF8", "ON"))) { 21 | LOCAL_CHARSET = "UTF-8"; 22 | } 23 | ftpClient.setControlEncoding(LOCAL_CHARSET); 24 | ftpClient.login(username, password); 25 | ftpClient.setBufferSize(1024 * 1024 * 16); 26 | ftpClient.enterLocalPassiveMode(); 27 | ftpClient.setFileType(FTP.BINARY_FILE_TYPE); 28 | ftpClient.setControlKeepAliveTimeout(60); 29 | return ftpClient; 30 | } 31 | 32 | /** 33 | * 获取FTP目录下的文件 34 | * 35 | * @param files 36 | * @param ftpClient 37 | * @param path 38 | * FTP目录 39 | * @param filenameRegexp 40 | * 文件名正则表达式 41 | * @param recursive 42 | * 是否递归搜索 43 | * @throws IOException 44 | */ 45 | public static void listFile(List files, FTPClient ftpClient, String path, String filenameRegexp, boolean recursive) throws IOException { 46 | for (FTPFile ftpFile : ftpClient.listFiles(path)) { 47 | if (ftpFile.isFile()) { 48 | if (Pattern.matches(filenameRegexp, ftpFile.getName())) { 49 | files.add(path + "/" + ftpFile.getName()); 50 | } 51 | } else if (recursive && ftpFile.isDirectory()) { 52 | listFile(files, ftpClient, path + "/" + ftpFile.getName(), filenameRegexp, recursive); 53 | } 54 | } 55 | } 56 | 57 | /** 58 | * 关闭FTP客户端连接 59 | * 60 | * @param ftpClient 61 | */ 62 | public static void closeFtpClient(FTPClient ftpClient) { 63 | if (ftpClient != null) { 64 | try { 65 | ftpClient.disconnect(); 66 | } catch (IOException e) { 67 | e.printStackTrace(); 68 | } 69 | } 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/util/HiveMetaStoreUtils.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.util; 2 | 3 | import org.apache.hadoop.hive.metastore.HiveMetaStoreClient; 4 | import org.apache.hadoop.hive.metastore.api.Partition; 5 | import org.apache.hadoop.hive.metastore.api.Table; 6 | 7 | public class HiveMetaStoreUtils { 8 | 9 | /** 10 | * 获取Hive表 11 | * 12 | * @param client 13 | * @param database 14 | * @param table 15 | * @return 16 | */ 17 | public static Table getTable(HiveMetaStoreClient client, String database, String table) { 18 | try { 19 | return client.getTable(database, table); 20 | } catch (Exception e) { 21 | return null; 22 | } 23 | } 24 | 25 | /** 26 | * 判断是否为托管表 27 | * 28 | * @param table 29 | * @return 30 | */ 31 | public static boolean isManagedTable(Table table) { 32 | return "MANAGED_TABLE".equals(table.getTableType()); 33 | } 34 | 35 | /** 36 | * 判断是否为分区表 37 | * 38 | * @param table 39 | * @return 40 | */ 41 | public static boolean isPartitionTable(Table table) { 42 | return table.getPartitionKeys().size() > 0 ? 
true : false; 43 | } 44 | 45 | /** 46 | * 获取Hive表的分区 47 | * 48 | * @param client 49 | * @param table 50 | * @param partitionValues 51 | * @return 52 | */ 53 | public static Partition getPartition(HiveMetaStoreClient client, Table table, String partitionValues) { 54 | try { 55 | return client.getPartition(table.getDbName(), table.getTableName(), partitionValues.replaceAll("\"", "").replaceAll("\\s+,\\s+", "")); 56 | } catch (Exception e) { 57 | return null; 58 | } 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/util/HiveTypeUtils.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.util; 2 | 3 | import org.apache.hadoop.hive.common.type.HiveBaseChar; 4 | import org.apache.hadoop.hive.common.type.HiveDecimal; 5 | import org.apache.hadoop.hive.common.type.HiveVarchar; 6 | import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; 7 | import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; 8 | 9 | public class HiveTypeUtils { 10 | 11 | /** 12 | * 将Hive Writable类型转为标准Java类型 13 | * 14 | * @param o 15 | * @return 16 | */ 17 | public static Object toJavaObject(Object o) { 18 | if (o instanceof HiveBaseChar) { 19 | return ((HiveVarchar) o).getValue(); 20 | } else if (o instanceof HiveDecimal) { 21 | return ((HiveDecimal) o).bigDecimalValue(); 22 | } 23 | 24 | return o; 25 | } 26 | 27 | /** 28 | * 获取Hive类型的PrimitiveCategory 29 | * 30 | * @param type 31 | * @return 32 | */ 33 | public static PrimitiveCategory getPrimitiveCategory(String type) { 34 | if ("TINYINT".equals(type)) { 35 | return PrimitiveObjectInspector.PrimitiveCategory.BYTE; 36 | } else if ("SMALLINT".equals(type)) { 37 | return PrimitiveObjectInspector.PrimitiveCategory.SHORT; 38 | } else if ("BIGINT".equals(type)) { 39 | return PrimitiveObjectInspector.PrimitiveCategory.LONG; 40 | } else { 41 | return PrimitiveObjectInspector.PrimitiveCategory.valueOf(type); 42 | } 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/util/JDBCUtils.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.util; 2 | 3 | import java.sql.Connection; 4 | import java.sql.DriverManager; 5 | import java.sql.PreparedStatement; 6 | import java.sql.ResultSet; 7 | import java.sql.ResultSetMetaData; 8 | import java.sql.SQLException; 9 | import java.sql.Statement; 10 | import java.sql.Types; 11 | import java.util.ArrayList; 12 | import java.util.HashMap; 13 | import java.util.List; 14 | import java.util.Map; 15 | import java.util.regex.Matcher; 16 | import java.util.regex.Pattern; 17 | 18 | import org.apache.logging.log4j.LogManager; 19 | import org.apache.logging.log4j.Logger; 20 | 21 | public class JDBCUtils { 22 | 23 | private static final Logger LOG = LogManager.getLogger(JDBCUtils.class); 24 | 25 | /** 26 | * 获取JDBC连接 27 | * 28 | * @param driver 29 | * @param url 30 | * @param username 31 | * @param password 32 | * @return 33 | * @throws ClassNotFoundException 34 | * @throws SQLException 35 | */ 36 | public static Connection getConnection(String driver, String url, String username, String password) throws ClassNotFoundException, SQLException { 37 | Class.forName(driver); 38 | Connection conn = DriverManager.getConnection(url, username, password); 39 | return conn; 40 | } 41 | 42 | /** 43 | * 关闭JDBC连接 44 | * 45 | * 
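 * Typical usage (sketch): obtain the connection with getConnection(driver, url, username, password),
 * do the work, and release it in a finally block via JDBCUtils.closeConnection(conn) so that a
 * failed job does not leak connections.
 *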
@param conn 46 | */ 47 | public static void closeConnection(Connection conn) { 48 | if (conn != null) { 49 | try { 50 | conn.close(); 51 | } catch (SQLException e) { 52 | LoggerUtils.error(LOG, e); 53 | } 54 | } 55 | } 56 | 57 | /** 58 | * 获取表的字段类型 59 | * 60 | * @param connection 61 | * @param table 62 | * @return 63 | * @throws SQLException 64 | */ 65 | public static Map getColumnTypes(Connection connection, String table) throws SQLException { 66 | Map map = new HashMap(); 67 | StringBuilder sql = new StringBuilder(); 68 | sql.append("SELECT * FROM "); 69 | sql.append(table); 70 | sql.append(" WHERE 1=2"); 71 | 72 | PreparedStatement ps = connection.prepareStatement(sql.toString()); 73 | ResultSetMetaData rsd = ps.executeQuery().getMetaData(); 74 | for (int i = 0; i < rsd.getColumnCount(); i++) { 75 | map.put(rsd.getColumnName(i + 1).toLowerCase(), rsd.getColumnType(i + 1)); 76 | } 77 | ps.close(); 78 | return map; 79 | } 80 | 81 | /** 82 | * 获取表的字段名称 83 | * 84 | * @param conn 85 | * @param table 86 | * @return 87 | * @throws SQLException 88 | */ 89 | public static List getColumnNames(Connection conn, String table) throws SQLException { 90 | List columnNames = new ArrayList(); 91 | StringBuilder sql = new StringBuilder(); 92 | sql.append("SELECT * FROM "); 93 | sql.append(table); 94 | sql.append(" WHERE 1=2"); 95 | 96 | PreparedStatement ps = conn.prepareStatement(sql.toString()); 97 | ResultSet rs = ps.executeQuery(); 98 | ResultSetMetaData rsd = rs.getMetaData(); 99 | 100 | for (int i = 0, len = rsd.getColumnCount(); i < len; i++) { 101 | columnNames.add(rsd.getColumnName(i + 1)); 102 | } 103 | rs.close(); 104 | ps.close(); 105 | 106 | return columnNames; 107 | } 108 | 109 | /** 110 | * 查询表中分割字段值的区域(最大值、最小值) 111 | * 112 | * @param conn 113 | * @param sql 114 | * @param splitColumn 115 | * @return 116 | * @throws SQLException 117 | */ 118 | public static double[] querySplitColumnRange(Connection conn, String sql, String splitColumn) throws SQLException { 119 | double[] minAndMax = new double[2]; 120 | Pattern p = Pattern.compile("\\s+FROM\\s+.*", Pattern.CASE_INSENSITIVE); 121 | Matcher m = p.matcher(sql); 122 | 123 | if (m.find() && splitColumn != null && !splitColumn.trim().isEmpty()) { 124 | StringBuilder sb = new StringBuilder(); 125 | sb.append("SELECT MIN("); 126 | sb.append(splitColumn); 127 | sb.append("), MAX("); 128 | sb.append(splitColumn); 129 | sb.append(")"); 130 | sb.append(m.group(0)); 131 | 132 | Statement statement = conn.createStatement(); 133 | ResultSet rs = statement.executeQuery(sb.toString()); 134 | while (rs.next()) { 135 | minAndMax[0] = rs.getDouble(1); 136 | minAndMax[1] = rs.getDouble(2); 137 | } 138 | 139 | rs.close(); 140 | statement.close(); 141 | } 142 | 143 | return minAndMax; 144 | } 145 | 146 | /** 147 | * 查询表数值类型的主键 148 | * 149 | * @param conn 150 | * @param catalog 151 | * @param schema 152 | * @param table 153 | * @return 154 | * @throws SQLException 155 | */ 156 | public static String getDigitalPrimaryKey(Connection conn, String catalog, String schema, String table) throws SQLException { 157 | List primaryKeys = new ArrayList(); 158 | ResultSet rs = conn.getMetaData().getPrimaryKeys(catalog, schema, table); 159 | while (rs.next()) { 160 | primaryKeys.add(rs.getString("COLUMN_NAME")); 161 | } 162 | rs.close(); 163 | 164 | if (primaryKeys.size() > 0) { 165 | Map map = getColumnTypes(conn, table); 166 | for (String pk : primaryKeys) { 167 | if (isDigitalType(map.get(pk))) { 168 | return pk; 169 | } 170 | } 171 | } 172 | 173 | return null; 174 | } 175 
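/*
 * Illustrative sketch only: the two helpers above are presumably combined when a JDBC read is
 * split into parallel ranges, along these lines:
 *
 *   String pk = JDBCUtils.getDigitalPrimaryKey(conn, null, null, "t_user");
 *   double[] range = JDBCUtils.querySplitColumnRange(conn, "SELECT * FROM t_user", pk);
 *   // carve [range[0], range[1]] into evenly sized "pk >= x AND pk < y" predicates
 */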
| 176 | /** 177 | * Checks whether the given java.sql.Types value is a numeric type. 178 | * 179 | * @param sqlType 180 | * @return 181 | */ 182 | public static boolean isDigitalType(int sqlType) { 183 | switch (sqlType) { 184 | case Types.NUMERIC: 185 | case Types.DECIMAL: 186 | case Types.SMALLINT: 187 | case Types.INTEGER: 188 | case Types.BIGINT: 189 | case Types.REAL: 190 | case Types.FLOAT: 191 | case Types.DOUBLE: 192 | return true; 193 | 194 | default: 195 | return false; 196 | } 197 | } 198 | 199 | } 200 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/util/LoggerUtils.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.util; 2 | 3 | import org.apache.logging.log4j.Logger; 4 | 5 | public class LoggerUtils { 6 | 7 | public static void error(Logger logger, Exception e) { 8 | // Pass the exception itself so log4j2 records the full stack trace (e.getStackTrace() would only log the array reference). 9 | logger.error(e.getMessage(), e); 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/util/TypeConvertUtils.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.util; 2 | 3 | import java.math.BigDecimal; 4 | import java.math.BigInteger; 5 | 6 | public class TypeConvertUtils { 7 | 8 | /** 9 | * Converts a value to the given target type. 10 | * 11 | * @param src 12 | * @param clazz 13 | * @return 14 | */ 15 | public static Object convert(Object src, Class clazz) { 16 | if (src == null) { 17 | return null; 18 | } else if (src instanceof String) { 19 | if (clazz == Integer.class) { 20 | return Integer.valueOf(src.toString()); 21 | } else if (clazz == Long.class) { 22 | return Long.valueOf(src.toString()); 23 | } else if (clazz == Double.class) { 24 | return Double.valueOf(src.toString()); 25 | } else if (clazz == Float.class) { 26 | return Float.valueOf(src.toString()); 27 | } else if (clazz == Boolean.class) { 28 | return Boolean.valueOf(src.toString()); 29 | } else if (clazz == Short.class) { 30 | return Short.valueOf(src.toString()); 31 | } else if (clazz == Byte.class) { 32 | return Byte.valueOf(src.toString()); 33 | } else if (clazz == BigInteger.class) { 34 | return BigInteger.valueOf(Long.valueOf(src.toString())); 35 | } else if (clazz == BigDecimal.class) { 36 | return new BigDecimal(src.toString()); 37 | } 38 | } else if (clazz == String.class) { 39 | return src.toString(); 40 | } 41 | return src; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/util/Utils.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.util; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Arrays; 5 | import java.util.List; 6 | import java.util.regex.Matcher; 7 | import java.util.regex.Pattern; 8 | 9 | import org.apache.commons.lang3.ArrayUtils; 10 | import org.apache.commons.lang3.StringUtils; 11 | import org.apache.logging.log4j.LogManager; 12 | import org.apache.logging.log4j.Logger; 13 | 14 | public class Utils { 15 | 16 | private static final Logger LOG = LogManager.getLogger(Utils.class); 17 | 18 | /** 19 | * Sleeps the current thread for the given number of milliseconds. 20 | * 21 | * @param millis 22 | */ 23 | public static void sleep(long millis) { 24 | try { 25 | Thread.sleep(millis); 26 | } catch (InterruptedException e) { 27 | LoggerUtils.error(LOG, e); 28 | } 29 | } 30 | 31 | public static List getColumns(String[] columns, String[] excludeColumns) { 32 | if (excludeColumns == null || excludeColumns.length < 1) { 
33 | return columns == null ? null : Arrays.asList(columns); 34 | } 35 | 36 | List list = new ArrayList(); 37 | for (String column : columns) { 38 | if (!ArrayUtils.contains(excludeColumns, column)) { 39 | list.add(column); 40 | } 41 | } 42 | return list; 43 | } 44 | 45 | public static List getColumns(List columns, String[] excludeColumns) { 46 | return getColumns(columns.toArray(new String[columns.size()]), excludeColumns); 47 | } 48 | 49 | /** 50 | * Fixes an HDFS location by replacing the hostname with the metastore host IP. 51 | * 52 | * @param srcLocaltion 53 | * @param metastoreUris 54 | * @return 55 | */ 56 | public static String fixLocaltion(String srcLocaltion, String metastoreUris) { 57 | Matcher ipMatcher = Pattern.compile("(\\d+\\.){3}\\d+").matcher(metastoreUris.split(",")[0].trim()); 58 | if (ipMatcher.find()) { 59 | String masterIP = ipMatcher.group(); 60 | return srcLocaltion.replaceFirst("^hdfs://\\w+:", "hdfs://" + masterIP + ":"); 61 | } 62 | return srcLocaltion; 63 | } 64 | 65 | /** 66 | * Parses the partition values from a comma-separated partition spec (key=value pairs). 67 | * 68 | * @param partitions 69 | * @return 70 | */ 71 | public static List parsePartitionValue(String partitions) { 72 | List partitionValues = new ArrayList(); 73 | String[] partitionKeyValue = partitions.split("\\s*,\\s*"); 74 | for (String kv : partitionKeyValue) { 75 | String[] tokens = StringUtils.splitPreserveAllTokens(kv, "="); 76 | partitionValues.add(tokens[1]); 77 | } 78 | return partitionValues; 79 | } 80 | 81 | /** 82 | * Returns the configuration directory (hdata.conf.dir) with a trailing file separator. 83 | * 84 | * @return 85 | */ 86 | public static String getConfigDir() { 87 | return System.getProperty("hdata.conf.dir") + System.getProperty("file.separator"); 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /src/main/java/opensource/hdata/util/XMLUtils.java: -------------------------------------------------------------------------------- 1 | package opensource.hdata.util; 2 | 3 | import java.io.FileInputStream; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | 7 | import javax.xml.parsers.DocumentBuilder; 8 | import javax.xml.parsers.DocumentBuilderFactory; 9 | import javax.xml.parsers.ParserConfigurationException; 10 | 11 | import org.jdom2.Document; 12 | import org.jdom2.Element; 13 | import org.jdom2.input.DOMBuilder; 14 | import org.xml.sax.SAXException; 15 | 16 | public class XMLUtils { 17 | 18 | /** 19 | * Loads an XML file and returns its root element. 20 | * 21 | * @param input 22 | * @return 23 | * @throws ParserConfigurationException 24 | * @throws SAXException 25 | * @throws IOException 26 | */ 27 | public static Element load(InputStream input) throws ParserConfigurationException, SAXException, IOException { 28 | DOMBuilder domBuilder = new DOMBuilder(); 29 | DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder(); 30 | Document doc = domBuilder.build(builder.parse(input)); 31 | Element root = doc.getRootElement(); 32 | return root; 33 | } 34 | 35 | public static Element load(String xmlpath) throws ParserConfigurationException, SAXException, IOException { 36 | FileInputStream fis = new FileInputStream(xmlpath); 37 | return load(fis); 38 | } 39 | } 40 | --------------------------------------------------------------------------------
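
The writer plugins above all follow the same prepare/execute/close contract. The sketch below is illustrative only (the class, package and "label" property are hypothetical, not part of this repository) and shows the smallest writer that satisfies that contract; a real plugin would presumably also need an entry in conf/plugins.xml before the plugin loader could pick it up.

package opensource.hdata.examples;

import opensource.hdata.config.PluginConfig;
import opensource.hdata.core.JobContext;
import opensource.hdata.core.plugin.Record;
import opensource.hdata.core.plugin.Writer;

/**
 * Minimal illustrative writer: counts the records it receives and prints the total on close.
 */
public class CountingWriter extends Writer {

    private long count;
    private String label;

    @Override
    public void prepare(JobContext context, PluginConfig writerConfig) {
        // "label" is an assumed, writer-specific property with a default value
        label = writerConfig.getString("label", "counting-writer");
    }

    @Override
    public void execute(Record record) {
        count++;
    }

    @Override
    public void close() {
        System.out.println(label + ": " + count + " records written");
    }
}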