├── README.md
├── pom.xml
├── src
│   ├── main
│   │   └── java
│   │       └── co
│   │           └── gridport
│   │               └── kafka
│   │                   └── hadoop
│   │                       ├── HadoopJob.java
│   │                       ├── HadoopJobMapper.java
│   │                       ├── KafkaInputFetcher.java
│   │                       ├── KafkaInputFormat.java
│   │                       ├── KafkaInputRecordReader.java
│   │                       ├── KafkaInputSplit.java
│   │                       ├── KafkaOutputFormat.java
│   │                       └── ZkUtils.java
│   └── test
│       └── java
│           └── co
│               └── gridport
│                   └── kafka
│                       └── hadoop
│                           ├── FetcherTest.java
│                           ├── TestKafkaInputFetcher.java
│                           ├── TestKafkaInputFormat.java
│                           ├── TestKafkaInputRecordReader.java
│                           ├── TestMapper.java
│                           ├── TestZkUtils.java
│                           └── TestZookeeper.java
└── target
    ├── classes
    │   └── co
    │       └── gridport
    │           └── kafka
    │               └── hadoop
    │                   ├── HadoopJob.class
    │                   ├── HadoopJobMapper.class
    │                   ├── KafkaInputFetcher.class
    │                   ├── KafkaInputFormat.class
    │                   ├── KafkaInputRecordReader.class
    │                   ├── KafkaInputSplit.class
    │                   ├── KafkaOutputFormat$1.class
    │                   ├── KafkaOutputFormat$LineRecordWriter.class
    │                   ├── KafkaOutputFormat.class
    │                   ├── ZkUtils$StringSerializer.class
    │                   └── ZkUtils.class
    ├── kafka-hadoop-loader.jar
    └── test-classes
        └── co
            └── gridport
                └── kafka
                    └── hadoop
                        ├── FetcherTest$1.class
                        ├── FetcherTest.class
                        ├── TestKafkaInputFetcher.class
                        ├── TestKafkaInputFormat.class
                        ├── TestKafkaInputRecordReader.class
                        ├── TestMapper.class
                        ├── TestZkUtils.class
                        └── TestZookeeper.class

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
kafka-hadoop-loader
=====================

This Hadoop loader creates an input split for each topic-broker-partition, which gives
ideal parallelism between Kafka streams and mapper tasks.

Further, it does not use the high-level consumer; it communicates with ZooKeeper directly
to manage the consumed offsets, which are committed at the end of each map task,
that is, once the output file has been moved from hdfs_temp to its final destination.

The actual consumer and its inner fetcher thread are wrapped as a KafkaInputContext, which
is created for each map task's record reader object.

The mapper then takes in (offset, message) pairs, parses the content for a date and emits (date, message) pairs,
which are in turn picked up by the output format and partitioned at the HDFS level into different locations.
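For illustration, a minimal sketch of such a date-partitioning mapper is shown below. This is not the shipped code: the HadoopJobMapper in this repository currently passes (offset, message) straight through, and the per-date record writers in KafkaOutputFormat are commented out. The sketch assumes messages are UTF-8 text lines whose first comma-separated field is an epoch-millis timestamp (as in the sample line used by TestMapper), and the class name DatePartitioningMapper is hypothetical.

    import java.io.IOException;
    import java.text.SimpleDateFormat;
    import java.util.Date;

    import org.apache.hadoop.io.BytesWritable;
    import org.apache.hadoop.io.LongWritable;
    import org.apache.hadoop.io.Text;
    import org.apache.hadoop.mapreduce.Mapper;

    public class DatePartitioningMapper extends Mapper<LongWritable, BytesWritable, Text, Text> {

        private final SimpleDateFormat dayFormat = new SimpleDateFormat("yyyy-MM-dd");

        @Override
        protected void map(LongWritable offset, BytesWritable value, Context context)
                throws IOException, InterruptedException {
            // assumption: the message is a UTF-8 text line whose first comma-separated
            // field is an epoch-millis timestamp, e.g. "1438913587232,www.example.com,..."
            String message = new String(value.copyBytes(), "UTF-8");
            long timestamp = Long.parseLong(message.substring(0, message.indexOf(',')));
            String day = dayFormat.format(new Date(timestamp));
            // key = date, value = raw message; the output format can then partition by key
            context.write(new Text(day), new Text(message));
        }
    }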


ANATOMY
-------

HadoopJob
  -> KafkaInputFormat
    -> zkUtils.getBrokerPartitions
    -> FOR EACH ( broker-topic-partition ) CREATE KafkaInputSplit
  -> FOR EACH ( KafkaInputSplit ) CREATE MapTask:
    -> KafkaInputRecordReader( KafkaInputSplit[i] )
      -> zkUtils.getLastConsumedOffset
      -> initialize simple kafka consumer
      -> reset watermark if given as option
      -> WHILE nextKeyValue()
        -> KafkaInputContext.getNext() -> (offset,message):newOffset
        -> KafkaInputRecordReader advances currentOffset += newOffset and numProcessedMessages++
        -> HadoopJobMapper(offset,message) -> (offset, message)
        -> KafkaOutputFormat.RecordWriter.write(offset, message)
          -> recordWriters[date].write( offset, message )
            -> LineRecordWriter.write( message ), gz compressed or not
      -> END WHILE
      -> close KafkaInputContext
      -> zkUtils.commitLastConsumedOffset


LAUNCH CONFIGURATIONS
=====================

TO RUN FROM ECLIPSE (NO JAR)
----------------------------
add run configuration arguments: -r <ip_address> [-t <topics>] [-z <zk>] [target_hdfs_path]


TO RUN REMOTELY
---------------
$ mvn assembly:single
$ java -jar kafka-hadoop-loader.jar -r <ip_address> [-t <topics>] [-z <zk>] [target_hdfs_path]
TODO: for -r, check whether the jar exists, otherwise use setJarByClass


TO RUN AS HADOOP JAR
--------------------
$ mvn assembly:single
$ hadoop jar kafka-hadoop-loader.jar [-z <zk>] [-t <topics>] [target_hdfs_path]


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>co.gridport.kafka</groupId>
  <artifactId>kafka-hadoop-loader</artifactId>
  <packaging>jar</packaging>
  <version>1.2.5</version>
  <name>Kafka Loader Configurable Hadoop Job</name>

  <dependencies>
    <dependency>
      <groupId>org.apache.kafka</groupId>
      <artifactId>kafka-clients</artifactId>
      <version>0.8.2.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.kafka</groupId>
      <artifactId>kafka_2.10</artifactId>
      <version>0.8.2.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.6.0</version>
      <scope>provided</scope>
      <type>jar</type>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-mapreduce</artifactId>
      <version>2.6.0</version>
      <scope>provided</scope>
      <type>pom</type>
    </dependency>
    <dependency>
      <groupId>commons-logging</groupId>
      <artifactId>commons-logging</artifactId>
      <version>1.1.1</version>
    </dependency>
    <dependency>
      <groupId>commons-cli</groupId>
      <artifactId>commons-cli</artifactId>
      <version>1.2</version>
    </dependency>
  </dependencies>

  <repositories>
    <repository>
      <id>gridport-snapshots</id>
      <name>GridPort Snapshots</name>
      <url>http://maven.gridport.co/content/repositories/snapshots</url>
      <snapshots>
        <updatePolicy>always</updatePolicy>
      </snapshots>
    </repository>
    <repository>
      <id>gridport-releases</id>
      <name>GridPort Releases</name>
      <url>http://maven.gridport.co/content/repositories/releases</url>
    </repository>
  </repositories>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <configuration>
          <source>1.6</source>
          <target>1.6</target>
        </configuration>
      </plugin>
      <plugin>
        <artifactId>maven-assembly-plugin</artifactId>
        <configuration>
          <finalName>kafka-hadoop-loader</finalName>
          <appendAssemblyId>false</appendAssemblyId>
          <archive>
            <manifest>
              <mainClass>co.gridport.kafka.hadoop.HadoopJob</mainClass>
            </manifest>
          </archive>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-deploy-plugin</artifactId>
        <version>2.6</version>
        <executions>
          <execution>
            <goals>
              <goal>deploy-file</goal>
            </goals>
          </execution>
        </executions>
        <configuration>
          <repositoryId>gridport-releases</repositoryId>
          <packaging>jar</packaging>
          <generatePom>true</generatePom>
          <url>http://maven.gridport.co/content/repositories/releases/</url>
          <groupId>co.gridport.kafka</groupId>
          <artifactId>kafka-hadoop-loader</artifactId>
          <version>${project.version}</version>
          <file>./target/kafka-hadoop-loader.jar</file>
        </configuration>
      </plugin>
    </plugins>
  </build>
</project>

--------------------------------------------------------------------------------
/src/main/java/co/gridport/kafka/hadoop/HadoopJob.java:
--------------------------------------------------------------------------------
1 | package co.gridport.kafka.hadoop;
2 |
3 | import java.io.File;
4 |
5 | import org.apache.commons.cli.CommandLine;
6 | import
org.apache.commons.cli.CommandLineParser; 7 | import org.apache.commons.cli.HelpFormatter; 8 | import org.apache.commons.cli.OptionBuilder; 9 | import org.apache.commons.cli.Options; 10 | import org.apache.commons.cli.PosixParser; 11 | import org.apache.hadoop.conf.Configuration; 12 | import org.apache.hadoop.conf.Configured; 13 | import org.apache.hadoop.fs.Path; 14 | import org.apache.hadoop.io.Text; 15 | import org.apache.hadoop.mapred.JobConf; 16 | import org.apache.hadoop.mapreduce.Job; 17 | import org.apache.hadoop.util.Tool; 18 | import org.apache.hadoop.util.ToolRunner; 19 | import org.apache.log4j.Logger; 20 | import org.mortbay.log.Log; 21 | 22 | public class HadoopJob extends Configured implements Tool { 23 | 24 | static { 25 | Configuration.addDefaultResource("core-site.xml"); 26 | } 27 | 28 | public int run(String[] args) throws Exception { 29 | 30 | CommandLineParser parser = new PosixParser(); 31 | Options options = buildOptions(); 32 | CommandLine cmd = parser.parse(options, args); 33 | if (cmd.hasOption("h") || cmd.getArgs().length == 0) 34 | { 35 | printHelpAndExit(options); 36 | } 37 | String hdfsPath = cmd.getArgs()[0]; 38 | 39 | Configuration conf = getConf(); 40 | conf.setBoolean("mapred.map.tasks.speculative.execution", false); 41 | 42 | if (cmd.hasOption("topics")) 43 | { 44 | conf.set("kafka.topics", cmd.getOptionValue("topics")); 45 | Logger.getRootLogger().info("Using topics: " + conf.get("kafka.topics")); 46 | } 47 | else 48 | { 49 | printHelpAndExit(options); 50 | } 51 | 52 | conf.set("kafka.groupid", cmd.getOptionValue("consumer-group", "dev-hadoop-loader")); 53 | Logger.getRootLogger().info("Registering under consumer group: " + conf.get("kafka.groupid")); 54 | 55 | conf.set("kafka.zk.connect", cmd.getOptionValue("zk-connect", "localhost:9092")); 56 | 57 | Logger.getRootLogger().info("Using ZooKepper connection: " + conf.get("kafka.zk.connect")); 58 | 59 | if (cmd.getOptionValue("autooffset-reset") != null) 60 | { 61 | conf.set("kafka.watermark.reset", cmd.getOptionValue("autooffset-reset")); 62 | Logger.getRootLogger().info("SHOULD RESET OFFSET TO: " + conf.get("kafka.watermark.reset")); 63 | } 64 | 65 | conf.set("input.format", cmd.getOptionValue("input-format", "json")); 66 | Log.info("input format======",cmd.getOptionValue("input-format", "json")); 67 | if (!conf.get("input.format").equals("json") && !conf.get("input.format").equals("protobuf")) 68 | { 69 | printHelpAndExit(options); 70 | } 71 | Logger.getRootLogger().info("EXPECTING MESSAGE FORMAT: " + conf.get("input.format")); 72 | 73 | JobConf jobConf = new JobConf(conf); 74 | if (cmd.hasOption("remote") ) 75 | { 76 | String ip = cmd.getOptionValue("remote"); 77 | Logger.getRootLogger().info("Default file system: hdfs://" + ip + ":8020/"); 78 | jobConf.set("fs.defaultFS", "hdfs://"+ip+":9000/"); 79 | Logger.getRootLogger().info("Remote jobtracker: " + ip + ":8021"); 80 | jobConf.set("mapred.job.tracker", ip+":8021"); 81 | } 82 | Path jarTarget = new Path( 83 | getClass().getProtectionDomain().getCodeSource().getLocation() 84 | + "../kafka-hadoop-loader.jar" 85 | ); 86 | if (new File(jarTarget.toUri() ).exists()) 87 | { 88 | //running from eclipse / as maven 89 | jobConf.setJar(jarTarget.toUri().getPath()); 90 | Logger.getRootLogger().info("Using target jar: " + jarTarget.toString()); 91 | } 92 | else 93 | { 94 | //running from jar remotely or locally 95 | jobConf.setJarByClass(getClass()); 96 | Logger.getRootLogger().info("Using parent jar: " + jobConf.getJar()); 97 | } 98 | 99 | //Ready to launch 100 
| Job job = Job.getInstance(jobConf, "kafka.hadoop.loader"); 101 | job.setMapperClass(HadoopJobMapper.class); 102 | job.setInputFormatClass(KafkaInputFormat.class); 103 | job.setOutputKeyClass(Text.class); 104 | job.setOutputValueClass(Text.class); 105 | job.setOutputFormatClass(KafkaOutputFormat.class); 106 | job.setNumReduceTasks(0); 107 | KafkaOutputFormat.setOutputPath(job, new Path(hdfsPath)); 108 | 109 | KafkaOutputFormat.setCompressOutput(job, cmd.getOptionValue("compress-output", "on").equals("on")); 110 | 111 | 112 | Logger.getRootLogger().info("Output hdfs location: " + hdfsPath); 113 | Logger.getRootLogger().info("Output hdfs compression: " + KafkaOutputFormat.getCompressOutput(job)); 114 | boolean success = job.waitForCompletion(true); 115 | return success ? 0: -1; 116 | } 117 | 118 | private void printHelpAndExit(Options options) { 119 | HelpFormatter formatter = new HelpFormatter(); 120 | formatter.printHelp( "kafka-hadoop-loader.jar", options ); 121 | System.exit(0); 122 | } 123 | 124 | @SuppressWarnings("static-access") 125 | private Options buildOptions() { 126 | Options options = new Options(); 127 | 128 | options.addOption(OptionBuilder.withArgName("topics") 129 | .withLongOpt("topics") 130 | .hasArg() 131 | .withDescription("kafka topics") 132 | .create("t")); 133 | options.addOption(OptionBuilder.withArgName("groupid") 134 | .withLongOpt("consumer-group") 135 | .hasArg() 136 | .withDescription("kafka consumer groupid") 137 | .create("g")); 138 | options.addOption(OptionBuilder.withArgName("zk") 139 | .withLongOpt("zk-connect") 140 | .hasArg() 141 | .withDescription("Initial zk connection string for discovery") 142 | .create("z")); 143 | 144 | options.addOption(OptionBuilder.withArgName("offset") 145 | .withLongOpt("offset-reset") 146 | .hasArg() 147 | .withDescription("Reset offset to start or end of the stream e.g. 
'smallest' or 'largest'") 148 | .create("o")); 149 | 150 | options.addOption(OptionBuilder.withArgName("compression") 151 | .withLongOpt("compress-output") 152 | .hasArg() 153 | .withDescription("GZip output compression on|off") 154 | .create("c")); 155 | 156 | options.addOption(OptionBuilder.withArgName("ip_address") 157 | .withLongOpt("remote") 158 | .hasArg() 159 | .withDescription("Running on a remote hadoop node") 160 | .create("r")); 161 | 162 | options.addOption(OptionBuilder 163 | .withLongOpt("help") 164 | .withDescription("Show this help") 165 | .create("h")); 166 | 167 | options.addOption(OptionBuilder.withArgName("json|protobuf|avro") 168 | .withLongOpt("input-format") 169 | .hasArg() 170 | .withDescription("How are the input messages formatted in the topic") 171 | .create("i")); 172 | return options; 173 | } 174 | 175 | public static void main(String[] args) throws Exception { 176 | int exitCode = ToolRunner.run(new HadoopJob(), args); 177 | System.exit(exitCode); 178 | } 179 | 180 | } 181 | -------------------------------------------------------------------------------- /src/main/java/co/gridport/kafka/hadoop/HadoopJobMapper.java: -------------------------------------------------------------------------------- 1 | package co.gridport.kafka.hadoop; 2 | 3 | import java.io.IOException; 4 | 5 | 6 | import org.apache.hadoop.io.BytesWritable; 7 | import org.apache.hadoop.io.LongWritable; 8 | import org.apache.hadoop.io.Text; 9 | import org.apache.hadoop.mapreduce.Mapper; 10 | 11 | public class HadoopJobMapper extends Mapper { 12 | 13 | @Override 14 | public void map(LongWritable key, BytesWritable value, Context context) throws IOException { 15 | 16 | try { 17 | context.write(new Text(String.valueOf(key)), new Text(value.copyBytes())); 18 | } catch (InterruptedException e) { 19 | // TODO Auto-generated catch block 20 | e.printStackTrace(); 21 | } 22 | 23 | 24 | } 25 | 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/co/gridport/kafka/hadoop/KafkaInputFetcher.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiuGuH/kafka-hadoop-loader-my/e439a3d0e3cfe67350ea2aa46c60c3f50fd8d417/src/main/java/co/gridport/kafka/hadoop/KafkaInputFetcher.java -------------------------------------------------------------------------------- /src/main/java/co/gridport/kafka/hadoop/KafkaInputFormat.java: -------------------------------------------------------------------------------- 1 | package co.gridport.kafka.hadoop; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.Arrays; 6 | import java.util.List; 7 | 8 | import kafka.javaapi.PartitionMetadata; 9 | import kafka.javaapi.TopicMetadata; 10 | import kafka.javaapi.TopicMetadataRequest; 11 | import kafka.javaapi.TopicMetadataResponse; 12 | import kafka.javaapi.consumer.SimpleConsumer; 13 | 14 | import org.apache.hadoop.conf.Configuration; 15 | import org.apache.hadoop.io.BytesWritable; 16 | import org.apache.hadoop.io.LongWritable; 17 | import org.apache.hadoop.mapreduce.InputFormat; 18 | import org.apache.hadoop.mapreduce.InputSplit; 19 | import org.apache.hadoop.mapreduce.JobContext; 20 | import org.apache.hadoop.mapreduce.RecordReader; 21 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 22 | 23 | 24 | public class KafkaInputFormat extends InputFormat { 25 | 26 | @Override 27 | public List getSplits(JobContext context) throws IOException, InterruptedException { 28 | 29 | List 
splits = new ArrayList(); 30 | 31 | Configuration conf = context.getConfiguration(); 32 | List topics = Arrays.asList(conf.get("kafka.topics").split(",")); 33 | ZkUtils zk = new ZkUtils( 34 | conf.get("kafka.zk.connect"), 35 | conf.getInt("kafka.zk.sessiontimeout.ms", 10000), 36 | conf.getInt("kafka.zk.connectiontimeout.ms", 10000) 37 | ); 38 | List seeds = zk.getSeedList(); 39 | 40 | String consumerGroup = conf.get("kafka.groupid"); 41 | 42 | for(final String seed: seeds) { 43 | SimpleConsumer consumer = new SimpleConsumer(seed, 9092, 10000, 65535, "PartitionsLookup"); 44 | TopicMetadataRequest request = new TopicMetadataRequest(topics); 45 | TopicMetadataResponse response = consumer.send(request); 46 | if (response != null && response.topicsMetadata() != null) { 47 | for(TopicMetadata tm: response.topicsMetadata()) { 48 | for(PartitionMetadata pm: tm.partitionsMetadata()) { 49 | long lastConsumedOffset = zk.getLastConsumedOffset(consumerGroup, tm.topic(), pm.partitionId()) ; 50 | InputSplit split = new KafkaInputSplit( 51 | seed, 52 | tm.topic(), 53 | pm.partitionId(), 54 | lastConsumedOffset 55 | ); 56 | splits.add(split); 57 | 58 | } 59 | 60 | } 61 | } 62 | if(splits.size()!=0) 63 | break; 64 | } 65 | zk.close(); 66 | return splits; 67 | } 68 | 69 | @Override 70 | public RecordReader createRecordReader( 71 | InputSplit arg0, TaskAttemptContext arg1) throws IOException, 72 | InterruptedException { 73 | return new KafkaInputRecordReader() ; 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /src/main/java/co/gridport/kafka/hadoop/KafkaInputRecordReader.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiuGuH/kafka-hadoop-loader-my/e439a3d0e3cfe67350ea2aa46c60c3f50fd8d417/src/main/java/co/gridport/kafka/hadoop/KafkaInputRecordReader.java -------------------------------------------------------------------------------- /src/main/java/co/gridport/kafka/hadoop/KafkaInputSplit.java: -------------------------------------------------------------------------------- 1 | package co.gridport.kafka.hadoop; 2 | 3 | import java.io.DataInput; 4 | import java.io.DataOutput; 5 | import java.io.IOException; 6 | 7 | import org.apache.hadoop.io.Text; 8 | import org.apache.hadoop.io.Writable; 9 | import org.apache.hadoop.mapreduce.InputSplit; 10 | 11 | public class KafkaInputSplit extends InputSplit implements Writable { 12 | 13 | private String broker; 14 | private int partition; 15 | private String topic; 16 | private long lastCommit; 17 | 18 | public KafkaInputSplit() {} 19 | 20 | public KafkaInputSplit(String broker, String topic, int partition, long lastCommit) { 21 | this.broker = broker; 22 | this.partition = partition; 23 | this.topic = topic; 24 | this.lastCommit = lastCommit; 25 | } 26 | 27 | public void readFields(DataInput in) throws IOException { 28 | broker = Text.readString(in); 29 | topic = Text.readString(in); 30 | partition = in.readInt(); 31 | lastCommit = in.readLong(); 32 | } 33 | 34 | public void write(DataOutput out) throws IOException { 35 | Text.writeString(out, broker); 36 | Text.writeString(out, topic); 37 | out.writeInt(partition); 38 | out.writeLong(lastCommit); 39 | } 40 | 41 | @Override 42 | public long getLength() throws IOException, InterruptedException { 43 | return Long.MAX_VALUE; 44 | } 45 | 46 | @Override 47 | public String[] getLocations() throws IOException, InterruptedException { 48 | return new String[] {broker}; 49 | } 50 | 51 | public int 
getPartition() { 52 | return partition; 53 | } 54 | 55 | public String getTopic() { 56 | return topic; 57 | } 58 | 59 | public long getWatermark() { 60 | return lastCommit; 61 | } 62 | 63 | @Override 64 | public String toString() { 65 | return "-" + topic + "-" + partition + "-" + lastCommit ; 66 | } 67 | } -------------------------------------------------------------------------------- /src/main/java/co/gridport/kafka/hadoop/KafkaOutputFormat.java: -------------------------------------------------------------------------------- 1 | package co.gridport.kafka.hadoop; 2 | 3 | import java.io.DataOutputStream; 4 | import java.io.IOException; 5 | import java.io.UnsupportedEncodingException; 6 | 7 | import org.apache.hadoop.conf.Configuration; 8 | import org.apache.hadoop.fs.FSDataOutputStream; 9 | import org.apache.hadoop.fs.FileSystem; 10 | import org.apache.hadoop.fs.Path; 11 | import org.apache.hadoop.io.NullWritable; 12 | import org.apache.hadoop.io.Text; 13 | import org.apache.hadoop.io.compress.CompressionCodec; 14 | import org.apache.hadoop.io.compress.GzipCodec; 15 | import org.apache.hadoop.mapred.FileAlreadyExistsException; 16 | import org.apache.hadoop.mapred.InvalidJobConfException; 17 | import org.apache.hadoop.mapreduce.JobContext; 18 | import org.apache.hadoop.mapreduce.RecordWriter; 19 | import org.apache.hadoop.mapreduce.TaskAttemptContext; 20 | import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter; 21 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 22 | import org.apache.hadoop.mapreduce.security.TokenCache; 23 | import org.apache.hadoop.util.ReflectionUtils; 24 | 25 | public class KafkaOutputFormat extends FileOutputFormat 26 | { 27 | public void checkOutputSpecs(JobContext job) throws FileAlreadyExistsException, IOException 28 | { 29 | // Ensure that the output directory is set and not already there 30 | Path outDir = getOutputPath(job); 31 | if (outDir == null) 32 | { 33 | throw new InvalidJobConfException("Output directory not set."); 34 | } 35 | 36 | // get delegation token for outDir's file system 37 | TokenCache.obtainTokensForNamenodes( 38 | job.getCredentials(), 39 | new Path[] {outDir}, 40 | job.getConfiguration() 41 | ); 42 | 43 | if (outDir.getFileSystem(job.getConfiguration()).exists(outDir)) { 44 | throw new FileAlreadyExistsException("Output directory " + outDir + 45 | " already exists"); 46 | } 47 | } 48 | 49 | /** 50 | * Create a composite record writer that can write key/value data to different 51 | * output files 52 | * 53 | * @param fs 54 | * the file system to use 55 | * @param job 56 | * the job conf for the job 57 | * @param name 58 | * the leaf file name for the output file (such as part-00000") 59 | * @param arg3 60 | * a progressable for reporting progress. 
61 | * @return a composite record writer 62 | * @throws IOException 63 | */ 64 | public RecordWriter getRecordWriter(TaskAttemptContext context) throws IOException { 65 | 66 | final TaskAttemptContext taskContext = context; 67 | final Configuration conf = context.getConfiguration(); 68 | final boolean isCompressed = getCompressOutput(context); 69 | String ext = ""; 70 | CompressionCodec gzipCodec = null; 71 | if (isCompressed) { 72 | Class codecClass = getOutputCompressorClass(context, GzipCodec.class); 73 | gzipCodec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf); 74 | ext = ".gz"; 75 | } 76 | final CompressionCodec codec = gzipCodec; 77 | final String extension = ext; 78 | 79 | return new RecordWriter() { 80 | // TreeMap> recordWriters = new TreeMap>(); 81 | 82 | RecordWriter rw; 83 | 84 | public void write(K key, V value) throws IOException, InterruptedException { 85 | 86 | if (rw == null) { 87 | Path file = new Path( 88 | ((FileOutputCommitter)getOutputCommitter(taskContext)).getWorkPath(), 89 | getUniqueFile( 90 | taskContext, taskContext.getJobID().toString().replace("job_", ""), 91 | extension 92 | ) 93 | ); 94 | FileSystem fs = file.getFileSystem(conf); 95 | FSDataOutputStream fileOut = fs.create(file, false); 96 | if (isCompressed) 97 | { 98 | rw = new LineRecordWriter(new DataOutputStream(codec.createOutputStream(fileOut))); 99 | } 100 | else 101 | { 102 | rw = new LineRecordWriter(fileOut); 103 | } 104 | 105 | } 106 | rw.write( key, value); 107 | 108 | /* String keyBasedPath = "d="+key.toString(); 109 | 110 | RecordWriter rw = this.recordWriters.get(keyBasedPath); 111 | try { 112 | if (rw == null) { 113 | Path file = new Path( 114 | ((FileOutputCommitter)getOutputCommitter(taskContext)).getWorkPath(), 115 | getUniqueFile( 116 | taskContext, keyBasedPath + "/" + taskContext.getJobID().toString().replace("job_", ""), 117 | extension 118 | ) 119 | ); 120 | FileSystem fs = file.getFileSystem(conf); 121 | FSDataOutputStream fileOut = fs.create(file, false); 122 | if (isCompressed) 123 | { 124 | rw = new LineRecordWriter(new DataOutputStream(codec.createOutputStream(fileOut))); 125 | } 126 | else 127 | { 128 | rw = new LineRecordWriter(fileOut); 129 | } 130 | this.recordWriters.put(keyBasedPath, rw); 131 | } 132 | rw.write( key, value); 133 | } catch (InterruptedException e) { 134 | e.printStackTrace(); 135 | }*/ 136 | 137 | }; 138 | 139 | /* @Override 140 | public void close(TaskAttemptContext context) throws IOException, InterruptedException 141 | { 142 | Iterator keys = this.recordWriters.keySet().iterator(); 143 | while (keys.hasNext()) { 144 | RecordWriter rw = this.recordWriters.get(keys.next()); 145 | rw.close(context); 146 | } 147 | this.recordWriters.clear(); 148 | };*/ 149 | 150 | @Override 151 | public void close(TaskAttemptContext context) throws IOException, InterruptedException 152 | { 153 | if (rw!=null) 154 | rw.close(context); 155 | }; 156 | 157 | }; 158 | } 159 | 160 | protected static class LineRecordWriter extends RecordWriter 161 | { 162 | private static final String utf8 = "UTF-8"; 163 | private static final byte[] newline; 164 | static { 165 | try { 166 | newline = "\n".getBytes(utf8); 167 | } catch (UnsupportedEncodingException uee) { 168 | throw new IllegalArgumentException("can't find " + utf8 + " encoding"); 169 | } 170 | } 171 | 172 | protected DataOutputStream out; 173 | 174 | public LineRecordWriter(DataOutputStream out) { 175 | this.out = out; 176 | } 177 | 178 | /** 179 | * Write the event value to the byte stream. 
180 | * 181 | * @param o the object to print 182 | * @throws IOException if the write throws, we pass it on 183 | */ 184 | private void writeObject(Object o) throws IOException { 185 | if (o instanceof Text) { 186 | Text to = (Text) o; 187 | out.write(((Text) o).getBytes(),0,((Text) o).getLength()); 188 | // out.write(to.toString().getBytes(utf8), 0, to.toString().getBytes(utf8).length); 189 | } else { 190 | out.write(o.toString().getBytes(utf8)); 191 | } 192 | } 193 | 194 | public synchronized void write(K key, V value) 195 | throws IOException { 196 | 197 | boolean nullValue = value == null || value instanceof NullWritable; 198 | if (nullValue) { 199 | return; 200 | } 201 | writeObject(key+"\t"); 202 | writeObject(value); 203 | out.write(newline); 204 | } 205 | 206 | public synchronized 207 | void close(TaskAttemptContext context) throws IOException { 208 | out.close(); 209 | } 210 | } 211 | } 212 | -------------------------------------------------------------------------------- /src/main/java/co/gridport/kafka/hadoop/ZkUtils.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiuGuH/kafka-hadoop-loader-my/e439a3d0e3cfe67350ea2aa46c60c3f50fd8d417/src/main/java/co/gridport/kafka/hadoop/ZkUtils.java -------------------------------------------------------------------------------- /src/test/java/co/gridport/kafka/hadoop/FetcherTest.java: -------------------------------------------------------------------------------- 1 | package co.gridport.kafka.hadoop; 2 | 3 | import java.nio.ByteBuffer; 4 | import java.util.Arrays; 5 | import java.util.LinkedList; 6 | import java.util.List; 7 | import java.util.concurrent.ExecutorService; 8 | import java.util.concurrent.Executors; 9 | 10 | import kafka.javaapi.PartitionMetadata; 11 | import kafka.javaapi.TopicMetadata; 12 | import kafka.javaapi.TopicMetadataRequest; 13 | import kafka.javaapi.TopicMetadataResponse; 14 | import kafka.javaapi.consumer.SimpleConsumer; 15 | import kafka.message.MessageAndOffset; 16 | 17 | public class FetcherTest { 18 | public static void main(String[] args) throws Exception { 19 | 20 | final List seeds = Arrays.asList("localhost"); 21 | final String topic = "test"; 22 | final List partitions = discoverPartitions(topic, seeds); 23 | ExecutorService executor = Executors.newFixedThreadPool(partitions.size()); 24 | for(final Integer partition: partitions) { 25 | executor.submit( 26 | new Runnable() { 27 | @Override 28 | public void run() { 29 | KafkaInputFetcher fetcher = new KafkaInputFetcher( 30 | "HadoopLoaderFetcher", 31 | topic, 32 | partition, 33 | seeds, 34 | 30000, 35 | 64 * 1024 36 | ); 37 | while(true) { 38 | MessageAndOffset messageAndOffset; 39 | try { 40 | messageAndOffset = fetcher.nextMessageAndOffset(65535 * 16,100); 41 | if (messageAndOffset == null) { 42 | //backoff sleep 43 | try {Thread.sleep(1000);} catch (InterruptedException ie) {} 44 | } else { 45 | ByteBuffer payload = messageAndOffset.message().payload(); 46 | byte[] bytes = new byte[payload.limit()]; 47 | payload.get(bytes); 48 | System.out.println(String.valueOf(messageAndOffset.offset()) + ": " + new String(bytes, "UTF-8")); 49 | } 50 | } catch (Exception e) { 51 | // TODO Auto-generated catch block 52 | e.printStackTrace(); 53 | } 54 | } 55 | } 56 | } 57 | ); 58 | } 59 | 60 | } 61 | 62 | private static List discoverPartitions(String topic,List seeds) { 63 | List result = new LinkedList(); 64 | for(String seed: seeds) { 65 | SimpleConsumer consumer = new SimpleConsumer(seed, 9092, 
10000, 65535, "PartitionsLookup"); 66 | TopicMetadataRequest request = new TopicMetadataRequest(Arrays.asList(topic)); 67 | TopicMetadataResponse response = consumer.send(request); 68 | if (response != null && response.topicsMetadata() != null) { 69 | for(TopicMetadata tm: response.topicsMetadata()) { 70 | for(PartitionMetadata partMd: tm.partitionsMetadata()) { 71 | result.add(partMd.partitionId()); 72 | } 73 | } 74 | } 75 | } 76 | return result; 77 | } 78 | 79 | } 80 | -------------------------------------------------------------------------------- /src/test/java/co/gridport/kafka/hadoop/TestKafkaInputFetcher.java: -------------------------------------------------------------------------------- 1 | package co.gridport.kafka.hadoop; 2 | 3 | import java.io.IOException; 4 | import java.nio.ByteBuffer; 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | 8 | import kafka.message.MessageAndOffset; 9 | 10 | public class TestKafkaInputFetcher { 11 | public static void main(String args[]) throws IOException{ 12 | List seeds = new ArrayList(); 13 | // seeds.add("10-140-130-101"); 14 | // seeds.add("10-140-130-135"); 15 | seeds.add("10-140-70-86"); 16 | // seeds.add("10-140-70-87"); 17 | 18 | KafkaInputFetcher fetcher = new KafkaInputFetcher("hadoop-loader", "test_hadoop", 0, seeds, 30000, 64*1024); 19 | 20 | long smallestOffset = fetcher.getOffset(kafka.api.OffsetRequest.EarliestTime()); 21 | long latestOffset = fetcher.getOffset(kafka.api.OffsetRequest.LatestTime()); 22 | fetcher.setOffset(smallestOffset); 23 | long readOffset=0; 24 | 25 | while(true){ 26 | MessageAndOffset msg= fetcher.nextMessageAndOffset(10000,latestOffset); 27 | if(msg==null){ 28 | System.out.println("current offset is "+readOffset); 29 | break; 30 | } 31 | 32 | 33 | readOffset= msg.nextOffset(); 34 | 35 | ByteBuffer payload = msg.message().payload(); 36 | 37 | byte[] bytes = new byte[payload.limit()]; 38 | payload.get(bytes); 39 | System.out.println(String.valueOf(msg.offset()) + ": " + new String(bytes, "UTF-8")); 40 | 41 | 42 | } 43 | 44 | 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/test/java/co/gridport/kafka/hadoop/TestKafkaInputFormat.java: -------------------------------------------------------------------------------- 1 | package co.gridport.kafka.hadoop; 2 | 3 | import java.io.IOException; 4 | import java.util.Arrays; 5 | import java.util.List; 6 | 7 | import org.apache.hadoop.mapreduce.InputSplit; 8 | 9 | import kafka.javaapi.PartitionMetadata; 10 | import kafka.javaapi.TopicMetadata; 11 | import kafka.javaapi.TopicMetadataRequest; 12 | import kafka.javaapi.TopicMetadataResponse; 13 | import kafka.javaapi.consumer.SimpleConsumer; 14 | 15 | public class TestKafkaInputFormat { 16 | public static void main(String args[]) throws IOException{ 17 | ZkUtils zk = new ZkUtils( 18 | "sdf-spark-master1:31818,sdf-spark-master2:31818,sdf-resourcemanager1:31818,sdf-resourcemanager2:31818,sdf-logserver:31818/test-kafka", 19 | 10000, 20 | 10000 21 | ); 22 | List seeds = zk.getSeedList(); 23 | 24 | String consumerGroup = "test_hadoop111"; 25 | 26 | List topics = Arrays.asList("test_single"); 27 | 28 | for(final String seed: seeds) { 29 | SimpleConsumer consumer = new SimpleConsumer(seed, 9092, 10000, 65535, "PartitionsLookup"); 30 | TopicMetadataRequest request = new TopicMetadataRequest(topics); 31 | TopicMetadataResponse response = consumer.send(request); 32 | if (response != null && response.topicsMetadata() != null) { 33 | for(TopicMetadata tm: 
response.topicsMetadata()) { 34 | for(PartitionMetadata pm: tm.partitionsMetadata()) { 35 | long lastConsumedOffset = zk.getLastConsumedOffset(consumerGroup, tm.topic(), pm.partitionId()) ; 36 | System.out.println(lastConsumedOffset); 37 | System.out.println(seed+"-----"+tm.topic()+"------"+pm.partitionId()+"------"+lastConsumedOffset); 38 | InputSplit split = new KafkaInputSplit( 39 | seed, 40 | tm.topic(), 41 | pm.partitionId(), 42 | lastConsumedOffset 43 | ); 44 | 45 | 46 | } 47 | } 48 | } 49 | break; 50 | } 51 | zk.close(); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/test/java/co/gridport/kafka/hadoop/TestKafkaInputRecordReader.java: -------------------------------------------------------------------------------- 1 | package co.gridport.kafka.hadoop; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | 7 | import kafka.message.MessageAndOffset; 8 | 9 | public class TestKafkaInputRecordReader { 10 | public static void main(String args[]) throws IOException, InterruptedException{ 11 | List seeds = new ArrayList(); 12 | seeds.add("10-140-130-101"); 13 | seeds.add("10-140-130-135"); 14 | seeds.add("10-140-70-86"); 15 | seeds.add("10-140-70-87"); 16 | 17 | KafkaInputFetcher fetcher = new KafkaInputFetcher("hadoop-loader", "test_single", 0, seeds, 30000, 64*1024); 18 | System.out.println(fetcher); 19 | long smallestOffset = fetcher.getOffset(kafka.api.OffsetRequest.EarliestTime()); 20 | System.out.println("smallestOffset====="+smallestOffset); 21 | long latestOffset = fetcher.getOffset(kafka.api.OffsetRequest.LatestTime()); 22 | System.out.println("latestOffset====="+latestOffset); 23 | 24 | MessageAndOffset msg= fetcher.nextMessageAndOffset(100,latestOffset); 25 | 26 | System.out.println(msg); 27 | 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/test/java/co/gridport/kafka/hadoop/TestMapper.java: -------------------------------------------------------------------------------- 1 | package co.gridport.kafka.hadoop; 2 | 3 | import org.apache.hadoop.io.BytesWritable; 4 | import org.apache.hadoop.io.LongWritable; 5 | import org.apache.hadoop.io.Text; 6 | 7 | public class TestMapper { 8 | 9 | public static void main(String args[]) { 10 | HadoopJobMapper maptest = new HadoopJobMapper(); 11 | LongWritable lo = new LongWritable(); 12 | lo.set(0l); 13 | BytesWritable value = new BytesWritable(); 14 | String s = "1438913587232,www.example.com,192.168.2.638"; 15 | // s="{key1:value1}"; 16 | value.set(s.getBytes(),0,s.length()); 17 | System.out.println(new String(s.getBytes())); 18 | System.out.println(s.getBytes().length); 19 | Text text = new Text(value.copyBytes()); 20 | 21 | 22 | System.out.println(text.getBytes().length); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/test/java/co/gridport/kafka/hadoop/TestZkUtils.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiuGuH/kafka-hadoop-loader-my/e439a3d0e3cfe67350ea2aa46c60c3f50fd8d417/src/test/java/co/gridport/kafka/hadoop/TestZkUtils.java -------------------------------------------------------------------------------- /src/test/java/co/gridport/kafka/hadoop/TestZookeeper.java: -------------------------------------------------------------------------------- 1 | package co.gridport.kafka.hadoop; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 
| import org.I0Itec.zkclient.ZkClient; 7 | 8 | import co.gridport.kafka.hadoop.ZkUtils.StringSerializer; 9 | 10 | public class TestZookeeper { 11 | 12 | static String BROKER_TOPICS_PATH ="/brokers/topics"; 13 | static String topic="test-10m"; 14 | static String s = "/brokers/topics/test-10m/partitions"; 15 | public static void main(String args[]){ 16 | ZkClient client = new ZkClient("sdf-spark-master1:31818,sdf-spark-master2:31818,sdf-resourcemanager1:31818,sdf-resourcemanager2:31818,sdf-logserver:31818/test-kafka",10000,10000,new StringSerializer()); 17 | 18 | 19 | 20 | 21 | List partitions = new ArrayList(); 22 | List brokersTopics = client.getChildren( BROKER_TOPICS_PATH + "/" + topic); 23 | 24 | // List brokersTopics = client.getChildren(s); 25 | 26 | // List children = client.getChildren(path); 27 | 28 | 29 | String test = client.readData("/brokers"); 30 | System.out.println(test); 31 | 32 | 33 | /* for(String broker: brokersTopics) { 34 | System.out.println(broker); 35 | String parts = client.readData(BROKER_TOPICS_PATH + "/" + topic + "/" +broker); 36 | for(int i =0; i< Integer.valueOf(parts); i++) { 37 | System.out.println(broker); 38 | partitions.add(broker + "-" + i); 39 | } 40 | }*/ 41 | 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /target/classes/co/gridport/kafka/hadoop/HadoopJob.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiuGuH/kafka-hadoop-loader-my/e439a3d0e3cfe67350ea2aa46c60c3f50fd8d417/target/classes/co/gridport/kafka/hadoop/HadoopJob.class -------------------------------------------------------------------------------- /target/classes/co/gridport/kafka/hadoop/HadoopJobMapper.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiuGuH/kafka-hadoop-loader-my/e439a3d0e3cfe67350ea2aa46c60c3f50fd8d417/target/classes/co/gridport/kafka/hadoop/HadoopJobMapper.class -------------------------------------------------------------------------------- /target/classes/co/gridport/kafka/hadoop/KafkaInputFetcher.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiuGuH/kafka-hadoop-loader-my/e439a3d0e3cfe67350ea2aa46c60c3f50fd8d417/target/classes/co/gridport/kafka/hadoop/KafkaInputFetcher.class -------------------------------------------------------------------------------- /target/classes/co/gridport/kafka/hadoop/KafkaInputFormat.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiuGuH/kafka-hadoop-loader-my/e439a3d0e3cfe67350ea2aa46c60c3f50fd8d417/target/classes/co/gridport/kafka/hadoop/KafkaInputFormat.class -------------------------------------------------------------------------------- /target/classes/co/gridport/kafka/hadoop/KafkaInputRecordReader.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiuGuH/kafka-hadoop-loader-my/e439a3d0e3cfe67350ea2aa46c60c3f50fd8d417/target/classes/co/gridport/kafka/hadoop/KafkaInputRecordReader.class -------------------------------------------------------------------------------- /target/classes/co/gridport/kafka/hadoop/KafkaInputSplit.class: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/LiuGuH/kafka-hadoop-loader-my/e439a3d0e3cfe67350ea2aa46c60c3f50fd8d417/target/classes/co/gridport/kafka/hadoop/KafkaInputSplit.class -------------------------------------------------------------------------------- /target/classes/co/gridport/kafka/hadoop/KafkaOutputFormat$1.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiuGuH/kafka-hadoop-loader-my/e439a3d0e3cfe67350ea2aa46c60c3f50fd8d417/target/classes/co/gridport/kafka/hadoop/KafkaOutputFormat$1.class -------------------------------------------------------------------------------- /target/classes/co/gridport/kafka/hadoop/KafkaOutputFormat$LineRecordWriter.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiuGuH/kafka-hadoop-loader-my/e439a3d0e3cfe67350ea2aa46c60c3f50fd8d417/target/classes/co/gridport/kafka/hadoop/KafkaOutputFormat$LineRecordWriter.class -------------------------------------------------------------------------------- /target/classes/co/gridport/kafka/hadoop/KafkaOutputFormat.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiuGuH/kafka-hadoop-loader-my/e439a3d0e3cfe67350ea2aa46c60c3f50fd8d417/target/classes/co/gridport/kafka/hadoop/KafkaOutputFormat.class -------------------------------------------------------------------------------- /target/classes/co/gridport/kafka/hadoop/ZkUtils$StringSerializer.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiuGuH/kafka-hadoop-loader-my/e439a3d0e3cfe67350ea2aa46c60c3f50fd8d417/target/classes/co/gridport/kafka/hadoop/ZkUtils$StringSerializer.class -------------------------------------------------------------------------------- /target/classes/co/gridport/kafka/hadoop/ZkUtils.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiuGuH/kafka-hadoop-loader-my/e439a3d0e3cfe67350ea2aa46c60c3f50fd8d417/target/classes/co/gridport/kafka/hadoop/ZkUtils.class -------------------------------------------------------------------------------- /target/kafka-hadoop-loader.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiuGuH/kafka-hadoop-loader-my/e439a3d0e3cfe67350ea2aa46c60c3f50fd8d417/target/kafka-hadoop-loader.jar -------------------------------------------------------------------------------- /target/test-classes/co/gridport/kafka/hadoop/FetcherTest$1.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiuGuH/kafka-hadoop-loader-my/e439a3d0e3cfe67350ea2aa46c60c3f50fd8d417/target/test-classes/co/gridport/kafka/hadoop/FetcherTest$1.class -------------------------------------------------------------------------------- /target/test-classes/co/gridport/kafka/hadoop/FetcherTest.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiuGuH/kafka-hadoop-loader-my/e439a3d0e3cfe67350ea2aa46c60c3f50fd8d417/target/test-classes/co/gridport/kafka/hadoop/FetcherTest.class -------------------------------------------------------------------------------- /target/test-classes/co/gridport/kafka/hadoop/TestKafkaInputFetcher.class: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/LiuGuH/kafka-hadoop-loader-my/e439a3d0e3cfe67350ea2aa46c60c3f50fd8d417/target/test-classes/co/gridport/kafka/hadoop/TestKafkaInputFetcher.class -------------------------------------------------------------------------------- /target/test-classes/co/gridport/kafka/hadoop/TestKafkaInputFormat.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiuGuH/kafka-hadoop-loader-my/e439a3d0e3cfe67350ea2aa46c60c3f50fd8d417/target/test-classes/co/gridport/kafka/hadoop/TestKafkaInputFormat.class -------------------------------------------------------------------------------- /target/test-classes/co/gridport/kafka/hadoop/TestKafkaInputRecordReader.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiuGuH/kafka-hadoop-loader-my/e439a3d0e3cfe67350ea2aa46c60c3f50fd8d417/target/test-classes/co/gridport/kafka/hadoop/TestKafkaInputRecordReader.class -------------------------------------------------------------------------------- /target/test-classes/co/gridport/kafka/hadoop/TestMapper.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiuGuH/kafka-hadoop-loader-my/e439a3d0e3cfe67350ea2aa46c60c3f50fd8d417/target/test-classes/co/gridport/kafka/hadoop/TestMapper.class -------------------------------------------------------------------------------- /target/test-classes/co/gridport/kafka/hadoop/TestZkUtils.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiuGuH/kafka-hadoop-loader-my/e439a3d0e3cfe67350ea2aa46c60c3f50fd8d417/target/test-classes/co/gridport/kafka/hadoop/TestZkUtils.class -------------------------------------------------------------------------------- /target/test-classes/co/gridport/kafka/hadoop/TestZookeeper.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LiuGuH/kafka-hadoop-loader-my/e439a3d0e3cfe67350ea2aa46c60c3f50fd8d417/target/test-classes/co/gridport/kafka/hadoop/TestZookeeper.class --------------------------------------------------------------------------------