├── README.md
├── pom.xml
└── src
    └── main
        └── java
            └── kafka
                └── consumer
                    ├── ConsumerUtils.java
                    ├── HadoopConsumer.java
                    ├── KafkaContext.java
                    ├── KafkaInputFormat.java
                    ├── KafkaOutputFormat.java
                    └── ZkUtils.java
/README.md:
--------------------------------------------------------------------------------
kafka-hadoop-consumer
=====================

Another kafka-hadoop-consumer

## Quick Start

    $ mvn package
    $ java -cp target/hadoop_consumer-0.1.0-SNAPSHOT.jar:`hadoop classpath` kafka.consumer.HadoopConsumer -z <zookeeper-connect> -t <topic> target_hdfs_path
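The job also takes a consumer group (`-g`), an offset reset policy (`-o`) and a message limit (`-l`), and because it runs through ToolRunner, generic `-D` options can override the tuning keys read by the record reader (`kafka.fetch.size`, `kafka.socket.timeout.ms`, `kafka.socket.buffersize`). A fuller, illustrative invocation; the ZooKeeper host, topic, group and output path below are placeholders:

    $ java -cp target/hadoop_consumer-0.1.0-SNAPSHOT.jar:`hadoop classpath` kafka.consumer.HadoopConsumer \
        -D kafka.fetch.size=2097152 \
        -z zkhost:2181 -t access_log -g hadoop_etl -o smallest -l 1000000 \
        /data/kafka/access_log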
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>kafka.consumer</groupId>
  <artifactId>hadoop_consumer</artifactId>
  <packaging>jar</packaging>
  <version>0.1.0-SNAPSHOT</version>
  <name>hadoop_consumer</name>
  <url>http://maven.apache.org</url>
  <dependencies>
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>2.8.2</version>
    </dependency>
    <dependency>
      <groupId>kafka</groupId>
      <artifactId>kafka</artifactId>
      <version>0.7.0</version>
    </dependency>
    <dependency>
      <groupId>org.I0Itec.zkclient</groupId>
      <artifactId>zkclient</artifactId>
      <version>0.1</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.8.2</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-api</artifactId>
      <version>1.5.11</version>
    </dependency>
    <dependency>
      <groupId>org.slf4j</groupId>
      <artifactId>slf4j-log4j12</artifactId>
      <version>1.5.11</version>
    </dependency>
    <dependency>
      <groupId>org.apache.zookeeper</groupId>
      <artifactId>zookeeper</artifactId>
      <version>3.3.4</version>
      <exclusions>
        <exclusion>
          <groupId>com.sun.jmx</groupId>
          <artifactId>jmxri</artifactId>
        </exclusion>
        <exclusion>
          <groupId>com.sun.jdmk</groupId>
          <artifactId>jmxtools</artifactId>
        </exclusion>
        <exclusion>
          <groupId>javax.jms</groupId>
          <artifactId>jms</artifactId>
        </exclusion>
      </exclusions>
    </dependency>
    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-core</artifactId>
      <version>0.20.2-cdh3u2</version>
    </dependency>
    <dependency>
      <groupId>commons-logging</groupId>
      <artifactId>commons-logging</artifactId>
      <version>1.1.1</version>
    </dependency>
    <dependency>
      <groupId>commons-cli</groupId>
      <artifactId>commons-cli</artifactId>
      <version>1.2</version>
    </dependency>
  </dependencies>
  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>
        <version>1.7</version>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>shade</goal>
            </goals>
            <configuration>
              <artifactSet>
                <excludes>
                  <exclude>junit:junit</exclude>
                  <exclude>org.apache.hadoop:*</exclude>
                </excludes>
              </artifactSet>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>
--------------------------------------------------------------------------------
/src/main/java/kafka/consumer/ConsumerUtils.java:
--------------------------------------------------------------------------------
package kafka.consumer;

import java.util.Properties;

import org.apache.hadoop.conf.Configuration;


public class ConsumerUtils {

    public static ConsumerConfig getConfiguration(Configuration conf) {
        Properties props = new Properties();
        props.put("zk.connect", conf.get("kafka.zk.connect", "localhost:2182"));
        props.put("zk.connectiontimeout.ms", conf.get("kafka.zk.connectiontimeout.ms", "1000000"));

        ConsumerConfig csConfig = new ConsumerConfig(props);
        return csConfig;
    }

}
--------------------------------------------------------------------------------
/src/main/java/kafka/consumer/HadoopConsumer.java:
--------------------------------------------------------------------------------
package kafka.consumer;

import java.io.IOException;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.PosixParser;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class HadoopConsumer extends Configured implements Tool {

    static {
        Configuration.addDefaultResource("core-site.xml");
        //Configuration.addDefaultResource("mapred-site.xml");
    }

    public static class KafkaMapper extends Mapper<LongWritable, BytesWritable, LongWritable, Text> {
        @Override
        public void map(LongWritable key, BytesWritable value, Context context) throws IOException {
            Text out = new Text();
            try {
                out.set(value.getBytes(), 0, value.getLength());
                context.write(key, out);
            } catch (InterruptedException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
        }

    }

    public int run(String[] args) throws Exception {

        //ToolRunner.printGenericCommandUsage(System.err);
        /*
        if (args.length < 2) {
            ToolRunner.printGenericCommandUsage(System.err);
            return -1;
        }
        */

        CommandLineParser parser = new PosixParser();
        Options options = buildOptions();

        CommandLine cmd = parser.parse(options, args);

        //HelpFormatter formatter = new HelpFormatter();
        //formatter.printHelp( "kafka.consumer.hadoop", options );

        Configuration conf = getConf();
        conf.set("kafka.topic", cmd.getOptionValue("topic", "test"));
        conf.set("kafka.groupid", cmd.getOptionValue("consumer-group", "test_group"));
        conf.set("kafka.zk.connect", cmd.getOptionValue("zk-connect", "localhost:2182"));
        if (cmd.getOptionValue("autooffset-reset") != null)
            conf.set("kafka.autooffset.reset", cmd.getOptionValue("autooffset-reset"));
        conf.setInt("kafka.limit", Integer.valueOf(cmd.getOptionValue("limit", "-1")));

        conf.setBoolean("mapred.map.tasks.speculative.execution", false);


        Job job = new Job(conf, "Kafka.Consumer");
        job.setJarByClass(getClass());
        job.setMapperClass(KafkaMapper.class);
        // input
        job.setInputFormatClass(KafkaInputFormat.class);
        // output
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        job.setOutputFormatClass(KafkaOutputFormat.class);

        job.setNumReduceTasks(0);

        KafkaOutputFormat.setOutputPath(job, new Path(cmd.getArgs()[0]));

        boolean success = job.waitForCompletion(true);
        if (success) {
            commit(conf);
        }
        return success ? 0 : -1;
    }

    private void commit(Configuration conf) throws IOException {
        ZkUtils zk = new ZkUtils(conf);
        try {
            String topic = conf.get("kafka.topic");
            String group = conf.get("kafka.groupid");
            zk.commit(group, topic);
        } catch (Exception e) {
            rollback();
        } finally {
            zk.close();
        }
    }

    private void rollback() {
    }

    @SuppressWarnings("static-access")
    private Options buildOptions() {
        Options options = new Options();

        options.addOption(OptionBuilder.withArgName("topic")
                .withLongOpt("topic")
                .hasArg()
                .withDescription("kafka topic")
                .create("t"));
        options.addOption(OptionBuilder.withArgName("groupid")
                .withLongOpt("consumer-group")
                .hasArg()
                .withDescription("kafka consumer groupid")
                .create("g"));
        options.addOption(OptionBuilder.withArgName("zk")
                .withLongOpt("zk-connect")
                .hasArg()
                .withDescription("ZooKeeper connection String")
                .create("z"));

        options.addOption(OptionBuilder.withArgName("offset")
                .withLongOpt("autooffset-reset")
                .hasArg()
                .withDescription("Offset reset")
                .create("o"));

        options.addOption(OptionBuilder.withArgName("limit")
                .withLongOpt("limit")
                .hasArg()
                .withDescription("kafka limit")
                .create("l"));


        return options;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new HadoopConsumer(), args);
        System.exit(exitCode);
    }

}
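Offset commits are two-phase: each KafkaRecordReader stages its final offset under the group's `offsets-temp` znode when it closes, and `commit()` above promotes the staged values to the real `offsets` path only after the whole job succeeds. A minimal sketch of the equivalent ZkUtils calls; the group, topic, partition id and offset are made-up values and exception handling is omitted:

    Configuration conf = new Configuration();
    conf.set("kafka.zk.connect", "zkhost:2181");                       // placeholder ZooKeeper quorum
    ZkUtils zk = new ZkUtils(conf);
    zk.setLastCommit("hadoop_etl", "access_log", "0-0", 4096L, true);  // reader side: stage offset in offsets-temp
    zk.commit("hadoop_etl", "access_log");                             // driver side: promote staged offsets after success
    zk.close();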
--------------------------------------------------------------------------------
/src/main/java/kafka/consumer/KafkaContext.java:
--------------------------------------------------------------------------------
package kafka.consumer;

import java.io.Closeable;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Iterator;
import java.util.Queue;
import java.util.concurrent.ArrayBlockingQueue;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

//import kafka.etl.FetchRequest;
import kafka.api.FetchRequest;
import kafka.common.ErrorMapping;
import kafka.javaapi.consumer.SimpleConsumer;
import kafka.javaapi.message.ByteBufferMessageSet;
import kafka.message.Message;
import kafka.message.MessageAndOffset;

public class KafkaContext implements Closeable {

    private static Logger LOG = LoggerFactory.getLogger(KafkaContext.class);

    SimpleConsumer consumer;
    String topic;
    int partition;
    long startOffset = -1L;
    long lastOffset = -1L;
    long curOffset;
    int fetchSize;
    ByteBufferMessageSet messages;
    Iterator<MessageAndOffset> iterator;
    final ArrayBlockingQueue<ByteBufferMessageSet> queue;
    final FetchThread fetcher;

    public KafkaContext(String broker, String topic,
            int partition, long lastCommit, int fetchSize, int timeout, int bufferSize,
            String reset) {

        String[] sp = broker.split(":"); // broker-id:host:port
        consumer = new SimpleConsumer(sp[1], Integer.valueOf(sp[2]), timeout, bufferSize);
        this.topic = topic;
        this.partition = partition;
        this.startOffset = lastCommit;
        this.curOffset = getStartOffset();
        this.lastOffset = getLastOffset();
        this.fetchSize = fetchSize;


        resetOffset(reset, sp[0], partition);


        queue = new ArrayBlockingQueue<ByteBufferMessageSet>(5);
        fetcher = new FetchThread(consumer, queue, topic, partition, curOffset, fetchSize);
        fetcher.start();
    }

    private void resetOffset(String reset, String brokerId, int partition) {
        if (reset == null) return;
        LOG.info("RESET {} {} {}", new Object[]{reset, brokerId, partition});
        if (reset.indexOf(":") > 0) {
            String[] sp = reset.split(":");
            if (!sp[0].equals(brokerId + "-" + partition)) {
                return;
            }
            reset = sp[1];
        }
        if ("smallest".equals(reset)) {
            setStartOffset(-1);
        } else if ("largest".equals(reset)) {
            setStartOffset(lastOffset);
        } else {
            try {
                setStartOffset(Long.valueOf(reset));
            } catch (NumberFormatException e) {
            }
        }
    }

    @Override
    public void close() throws IOException {
        fetcher.stop = true;
        //fetcher.interrupt();
        while (!fetcher.stopped);
        consumer.close();
    }

    private boolean hasMore() {
        if (iterator == null) {
            fetchMore();
            if (iterator == null) {
                return false;
            }
        }
        boolean hasNext = iterator.hasNext();
        if (hasNext) return hasNext;
        else if (curOffset >= lastOffset) return false;
        else {
            fetchMore();
            return iterator.hasNext();
        }
    }

    private void fetchMore() {

        while (!fetcher.stop || !queue.isEmpty()) {
            messages = queue.poll();
            if (messages != null) {
                int code = messages.getErrorCode();
                if (code != 0) {
                    ErrorMapping.maybeThrowException(code);
                }
                iterator = messages.iterator();
                break;
            }
        }
    }

    public long getNext(LongWritable key, BytesWritable value) throws IOException {
        if (!hasMore()) return -1L;

        MessageAndOffset messageOffset = iterator.next();
        Message message = messageOffset.message();

        key.set(curOffset);
        curOffset = messageOffset.offset();

        //byte[] bytes = new byte[message.payloadSize()];
        //message.payload().get(bytes);
        //value.set(bytes, 0, message.payloadSize());
        ByteBuffer buffer = message.payload();
        value.set(buffer.array(), buffer.arrayOffset(), message.payloadSize());

        return curOffset;
    }

    public long getStartOffset() {
        if (startOffset <= 0) {
            // -2L asks the broker for its earliest available offset
            startOffset = consumer.getOffsetsBefore(topic, partition, -2L, 1)[0];
        }
        return startOffset;
    }

    public void setStartOffset(long offset) {
        if (offset <= 0) {
            offset = consumer.getOffsetsBefore(topic, partition, -2L, 1)[0];
            LOG.info("Smallest Offset {}", offset);
        }
        curOffset = startOffset = offset;
    }

    public long getLastOffset() {
        if (lastOffset <= 0) {
            // -1L asks the broker for its latest offset
            lastOffset = consumer.getOffsetsBefore(topic, partition, -1L, 1)[0];
        }
        return lastOffset;
    }

    static class FetchThread extends Thread {

        String topic;
        int partition;
        long offset;
        int fetchSize;
        SimpleConsumer consumer;
        public volatile boolean stop = false;
        public volatile boolean stopped = false;
        ArrayBlockingQueue<ByteBufferMessageSet> queue;
        boolean hasData = false;
        ByteBufferMessageSet messages = null;

        public FetchThread(SimpleConsumer consumer, ArrayBlockingQueue<ByteBufferMessageSet> queue,
                String topic, int partition, long offset, int fetchSize) {
            this.topic = topic;
            this.partition = partition;
            this.offset = offset;
            this.fetchSize = fetchSize;
            this.consumer = consumer;
            this.queue = queue;
        }

        @Override
        public void run() {
            while (!stop) {
                if (messages == null) {
                    FetchRequest request =
                            new FetchRequest(topic, partition, offset, fetchSize);

                    LOG.info("fetching offset {}", offset);
                    messages = consumer.fetch(request);
                }

                int code = messages.getErrorCode();
                if (code == 0) {
                    if (!queue.offer(messages)) {
                        try {
                            Thread.sleep(100);
                        } catch (InterruptedException e) {
                        }
                        continue;
                    }
                    hasData = true;
                    offset += messages.validBytes(); // next offset to fetch
                    //LOG.info("Valid bytes {} {}", messages.validBytes(), stop);
                    messages = null;
                } else if (hasData && code == ErrorMapping.OffsetOutOfRangeCode()) {
                    // no more data
                    //queue.notify();
                    stop = true;
                    LOG.info("No More Data");
                } else {
                    while (!queue.offer(messages));
                    stop = true;
                }
            }
            stopped = true;
        }
    }

}
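KafkaContext wraps a SimpleConsumer plus the background FetchThread above and hands messages to the record reader one at a time through getNext(). A minimal usage sketch, assuming a broker registered as id 0 on broker1:9092 and a topic named access_log (both placeholders); IOException handling is omitted:

    KafkaContext ctx = new KafkaContext("0:broker1:9092",  // broker-id:host:port, as the constructor expects
            "access_log", 0,     // topic and partition
            -1L,                 // no committed offset yet, so start from the earliest available
            1024 * 1024,         // fetch size
            30000, 64 * 1024,    // socket timeout and buffer size
            null);               // no auto-offset reset
    LongWritable key = new LongWritable();
    BytesWritable value = new BytesWritable();
    while (ctx.getNext(key, value) >= 0) {
        // key holds the offset the message was read at, value holds the raw payload
    }
    ctx.close();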
--------------------------------------------------------------------------------
/src/main/java/kafka/consumer/KafkaInputFormat.java:
--------------------------------------------------------------------------------
package kafka.consumer;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class KafkaInputFormat extends InputFormat<LongWritable, BytesWritable> {

    static Logger LOG = LoggerFactory.getLogger(KafkaInputFormat.class);

    @Override
    public RecordReader<LongWritable, BytesWritable> createRecordReader(
            InputSplit arg0, TaskAttemptContext arg1) throws IOException,
            InterruptedException {
        return new KafkaRecordReader();
    }

    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException,
            InterruptedException {
        Configuration conf = context.getConfiguration();

        ZkUtils zk = new ZkUtils(conf);
        String topic = conf.get("kafka.topic");
        String group = conf.get("kafka.groupid");
        List<InputSplit> splits = new ArrayList<InputSplit>();
        List<String> partitions = zk.getPartitions(topic);

        // one split per <brokerId>-<partitionId>, resuming from the group's last committed offset
        for (String partition : partitions) {
            String[] sp = partition.split("-");

            long last = zk.getLastCommit(group, topic, partition);
            InputSplit split = new KafkaSplit(sp[0], zk.getBroker(sp[0]), topic, Integer.valueOf(sp[1]), last);
            splits.add(split);
        }
        zk.close();
        return splits;
    }

    public static class KafkaSplit extends InputSplit implements Writable {

        private String brokerId;
        private String broker;
        private int partition;
        private String topic;
        private long lastCommit;

        public KafkaSplit() {}

        public KafkaSplit(String brokerId, String broker, String topic, int partition, long lastCommit) {
            this.brokerId = brokerId;
            this.broker = broker;
            this.partition = partition;
            this.topic = topic;
            this.lastCommit = lastCommit;
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            brokerId = Text.readString(in);
            broker = Text.readString(in);
            topic = Text.readString(in);
            partition = in.readInt();
            lastCommit = in.readLong();
        }

        @Override
        public void write(DataOutput out) throws IOException {
            Text.writeString(out, brokerId);
            Text.writeString(out, broker);
            Text.writeString(out, topic);
            out.writeInt(partition);
            out.writeLong(lastCommit);
        }

        @Override
        public long getLength() throws IOException, InterruptedException {
            return Long.MAX_VALUE;
        }

        @Override
        public String[] getLocations() throws IOException, InterruptedException {
            return new String[] {broker};
        }

        public String getBrokerId() {
            return brokerId;
        }

        public String getBroker() {
            return broker;
        }

        public int getPartition() {
            return partition;
        }

        public String getTopic() {
            return topic;
        }

        public long getLastCommit() {
            return lastCommit;
        }

        @Override
        public String toString() {
            return broker + "-" + topic + "-" + partition + "-" + lastCommit;
        }
    }

    public static class KafkaRecordReader extends RecordReader<LongWritable, BytesWritable> {

        private KafkaContext kcontext;
        private KafkaSplit ksplit;
        private TaskAttemptContext context;
        private int limit;
        private LongWritable key;
        private BytesWritable value;
        private long start;
        private long end;
        private long pos;
        private long count = 0L;

        @Override
        public void initialize(InputSplit split, TaskAttemptContext context)
                throws IOException, InterruptedException {
            this.context = context;
            ksplit = (KafkaSplit) split;

            Configuration conf = context.getConfiguration();
            limit = conf.getInt("kafka.limit", -1);


            int timeout = conf.getInt("kafka.socket.timeout.ms", 30000);
            int bsize = conf.getInt("kafka.socket.buffersize", 64 * 1024);
            int fsize = conf.getInt("kafka.fetch.size", 1024 * 1024);
            String reset = conf.get("kafka.autooffset.reset");
            kcontext = new KafkaContext(ksplit.getBrokerId() + ":" + ksplit.getBroker(),
                    ksplit.getTopic(),
                    ksplit.getPartition(),
                    ksplit.getLastCommit(),
                    fsize, timeout, bsize, reset);

            start = kcontext.getStartOffset();
            end = kcontext.getLastOffset();

            LOG.info("JobId {} {} Start: {} End: {}",
                    new Object[]{context.getJobID(), ksplit, start, end});
        }

        @Override
        public void close() throws IOException {
            kcontext.close();
            commit();
        }

        private void commit() throws IOException {
            if (count == 0L) return;
            Configuration conf = context.getConfiguration();
            ZkUtils zk = new ZkUtils(conf);
            String group = conf.get("kafka.groupid");
            String partition = ksplit.getBrokerId() + "-" + ksplit.getPartition();
            zk.setLastCommit(group, ksplit.getTopic(), partition, pos, true);
            zk.close();
        }

        @Override
        public LongWritable getCurrentKey() throws IOException,
                InterruptedException {
            return key;
        }

        @Override
        public BytesWritable getCurrentValue() throws IOException,
                InterruptedException {
            return value;
        }

        @Override
        public float getProgress() throws IOException, InterruptedException {

            if (pos >= end || start == end) {
                return 1.0f;
            }

            if (limit < 0) {
                return Math.min(1.0f, (pos - start) / (float) (end - start));
            } else {
                return Math.min(1.0f, count / (float) limit);
            }
        }


        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {

            if (key == null) {
                key = new LongWritable();
            }
            if (value == null) {
                value = new BytesWritable();
            }
            if (limit < 0 || count < limit) {

                long next = kcontext.getNext(key, value);
                if (next >= 0) {
                    pos = next;
                    count++;
                    return true;
                }
            }

            LOG.info("Next Offset " + pos);

            return false;
        }

    }

}
--------------------------------------------------------------------------------
/src/main/java/kafka/consumer/KafkaOutputFormat.java:
--------------------------------------------------------------------------------
package kafka.consumer;

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileAlreadyExistsException;
import org.apache.hadoop.mapred.InvalidJobConfException;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.JobID;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.mapreduce.security.TokenCache;

public class KafkaOutputFormat<K, V> extends TextOutputFormat<K, V> {

    public Path getDefaultWorkFile(TaskAttemptContext context,
            String extension) throws IOException {
        FileOutputCommitter committer =
                (FileOutputCommitter) getOutputCommitter(context);
        JobID jobId = context.getJobID();
        // name output files part-<jobid>-m-<nnnnn> so repeated runs into the same directory do not collide
        return new Path(committer.getWorkPath(),
                getUniqueFile(context, "part-" + jobId.toString().replace("job_", ""),
                        extension));
    }

    public void checkOutputSpecs(JobContext job
            ) throws FileAlreadyExistsException, IOException {
        // Ensure that the output directory is set; unlike FileOutputFormat, an existing directory is allowed
        Path outDir = getOutputPath(job);
        if (outDir == null) {
            throw new InvalidJobConfException("Output directory not set.");
        }

        // get delegation token for outDir's file system
        TokenCache.obtainTokensForNamenodes(job.getCredentials(),
                new Path[] {outDir},
                job.getConfiguration());

    }
}
--------------------------------------------------------------------------------
/src/main/java/kafka/consumer/ZkUtils.java:
--------------------------------------------------------------------------------
package kafka.consumer;

import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.I0Itec.zkclient.ZkClient;
import org.I0Itec.zkclient.exception.ZkMarshallingError;
import org.I0Itec.zkclient.exception.ZkNoNodeException;
import org.I0Itec.zkclient.serialize.ZkSerializer;
import org.apache.hadoop.conf.Configuration;

public class ZkUtils implements Closeable {

    private static Logger LOG = LoggerFactory.getLogger(ZkUtils.class);

    private static final String CONSUMERS_PATH = "/consumers";
    private static final String BROKER_IDS_PATH = "/brokers/ids";
    private static final String BROKER_TOPICS_PATH = "/brokers/topics";

    /*
     * class ZKGroupDirs(val group: String) {
         def consumerDir = ZkUtils.ConsumersPath
         def consumerGroupDir = consumerDir + "/" + group
         def consumerRegistryDir = consumerGroupDir + "/ids"
       }

       class ZKGroupTopicDirs(group: String, topic: String) extends ZKGroupDirs(group) {
         def consumerOffsetDir = consumerGroupDir + "/offsets/" + topic
         def consumerOwnerDir = consumerGroupDir + "/owners/" + topic
       }
     */

    private ZkClient client;
    Map<String, String> brokers;

    public ZkUtils(Configuration config) {
        connect(config);
    }

    private void connect(Configuration config) {
        String zk = config.get("kafka.zk.connect");
        int stimeout = config.getInt("kafka.zk.sessiontimeout.ms", 10000);
        int ctimeout = config.getInt("kafka.zk.connectiontimeout.ms", 10000);
        client = new ZkClient(zk, stimeout, ctimeout, new StringSerializer());
    }

    public String getBroker(String id) {
        if (brokers == null) {
            brokers = new HashMap<String, String>();
            List<String> brokerIds = getChildrenParentMayNotExist(BROKER_IDS_PATH);
            for (String bid : brokerIds) {
                String data = client.readData(BROKER_IDS_PATH + "/" + bid);
                LOG.info("Broker " + bid + " " + data);
                brokers.put(bid, data.split(":", 2)[1]);
            }
        }
        return brokers.get(id);
    }

    public List<String> getPartitions(String topic) {
        List<String> partitions = new ArrayList<String>();
        List<String> brokersTopics = getChildrenParentMayNotExist(BROKER_TOPICS_PATH + "/" + topic);
        for (String broker : brokersTopics) {
            String parts = client.readData(BROKER_TOPICS_PATH + "/" + topic + "/" + broker);
            for (int i = 0; i < Integer.valueOf(parts); i++) {
                partitions.add(broker + "-" + i);
            }
        }
        return partitions;
    }

    // /consumers/<group>/offsets/<topic>/<brokerId>-<partitionId>
    private String getOffsetsPath(String group, String topic, String partition) {
        return CONSUMERS_PATH + "/" + group + "/offsets/" + topic + "/" + partition;
    }

    private String getTempOffsetsPath(String group, String topic, String partition) {
        return CONSUMERS_PATH + "/" + group + "/offsets-temp/" + topic + "/" + partition;
    }

    private String getTempOffsetsPath(String group, String topic) {
        return CONSUMERS_PATH + "/" + group + "/offsets-temp/" + topic;
    }


    public long getLastCommit(String group, String topic, String partition) {
        String znode = getOffsetsPath(group, topic, partition);
        String offset = client.readData(znode, true);

        if (offset == null) {
            return -1L;
        }
        return Long.valueOf(offset);
    }

    public void setLastCommit(String group, String topic, String partition, long commit, boolean temp) {
        String path = temp ? getTempOffsetsPath(group, topic, partition)
                           : getOffsetsPath(group, topic, partition);
        if (!client.exists(path)) {
            client.createPersistent(path, true);
        }
        client.writeData(path, commit);
    }

    public boolean commit(String group, String topic) {
        List<String> partitions = getChildrenParentMayNotExist(getTempOffsetsPath(group, topic));
        for (String partition : partitions) {
            String path = getTempOffsetsPath(group, topic, partition);
            String offset = client.readData(path);
            setLastCommit(group, topic, partition, Long.valueOf(offset), false);
            client.delete(path);
        }
        return true;
    }


    private List<String> getChildrenParentMayNotExist(String path) {
        try {
            List<String> children = client.getChildren(path);
            return children;
        } catch (ZkNoNodeException e) {
            return new ArrayList<String>();
        }
    }

    @Override
    public void close() throws IOException {
        if (client != null) {
            client.close();
        }
    }

    static class StringSerializer implements ZkSerializer {

        public StringSerializer() {}

        @Override
        public Object deserialize(byte[] data) throws ZkMarshallingError {
            if (data == null) return null;
            return new String(data);
        }

        @Override
        public byte[] serialize(Object data) throws ZkMarshallingError {
            return data.toString().getBytes();
        }

    }

}
--------------------------------------------------------------------------------
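For reference, ZkUtils reads broker and partition metadata from `/brokers/ids` and `/brokers/topics/<topic>`, and keeps commits under `/consumers/<group>/offsets[-temp]/<topic>/<brokerId>-<partitionId>`. A small standalone sketch for inspecting what a group has committed; the connection string, group and topic are placeholders and exception handling is omitted:

    Configuration conf = new Configuration();
    conf.set("kafka.zk.connect", "zkhost:2181");
    ZkUtils zk = new ZkUtils(conf);
    for (String partition : zk.getPartitions("access_log")) {      // entries look like "<brokerId>-<partitionId>"
        long offset = zk.getLastCommit("hadoop_etl", "access_log", partition);
        System.out.println(partition + " -> " + offset);           // -1 means nothing committed yet
    }
    zk.close();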