├── config.yaml
├── README.md
├── LICENSE
├── .gitignore
├── run-kafka.rb
└── server.properties.erb

/config.yaml:
--------------------------------------------------------------------------------
---
cluster: your-cluster
zk_hosts:
  - 10.0.0.1:2181
  - 10.0.0.2:2181
  - 10.0.0.3:2181
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
kafka-on-marathon
=================

Scripts for running Apache Kafka on Mesosphere's Marathon.

Alternatively, you may want to use the [Kafka framework](https://github.com/mesos/kafka).
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
                    Version 2, December 2004

 Copyright (C) 2004 Sam Hocevar

 Everyone is permitted to copy and distribute verbatim or modified
 copies of this license document, and changing it is allowed as long
 as the name is changed.

            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

  0. You just DO WHAT THE FUCK YOU WANT TO.
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.gem
*.rbc
/.config
/coverage/
/InstalledFiles
/pkg/
/spec/reports/
/test/tmp/
/test/version_tmp/
/tmp/

## Specific to RubyMotion:
.dat*
.repl_history
build/

## Documentation cache and generated files:
/.yardoc/
/_yardoc/
/doc/
/rdoc/

## Environment normalisation:
/.bundle/
/lib/bundler/man/

# for a library or gem, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# Gemfile.lock
# .ruby-version
# .ruby-gemset

# unless supporting rvm < 1.11.0 or doing something fancy, ignore this:
.rvmrc
--------------------------------------------------------------------------------
/run-kafka.rb:
--------------------------------------------------------------------------------
require 'zk'
require 'yaml'
require 'erb'
require 'set'

module Kafka
  class Broker
    def initialize(heap_size, broker_count, ports, config)
      @heap_size = heap_size.to_i
      @prng = Random.new((Time.now.to_f * 100000).to_i)

      @yaml = YAML.load_file(config)

      @zk_servers = @yaml['zk_hosts'].shuffle.join(',')
      @cluster = @yaml['cluster']

      zk_connect

      @ports = ports.split(/,/)

      @broker_count = broker_count.to_i

      @broker_set = get_missing_brokers
      if @broker_set.empty?
        $stderr.puts "No missing brokers found!"
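        # Every ID in 0..@broker_count-1 already has an owner in Zookeeper, so
        # there is nothing left for this instance to claim.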
        exit 1
      end

      @attempts = 0
      @max_attempts = 3 * @broker_count

      @hostname = `hostname`.chomp
    end

    def zk_connect
      @zk = ZK.new(@zk_servers, {
        :chroot => "/kafka-#{@cluster}",
        :thread => :single,
        :timeout => 5,
      })

      @zk.wait_until_connected
    end

    def zk_reconnect
      close
      sleep 10
      zk_connect
      become_broker
    end

    # Initially, populate the broker set with the list of broker IDs not yet
    # registered in Zookeeper, if possible.
    def get_missing_brokers
      broker_set = Set.new(0..@broker_count - 1)
      if @zk.exists?('/brokers/ids') && @zk.stat('/brokers/ids').num_children > 0
        ids = @zk.children('/brokers/ids').map { |x| x.to_i }.sort
        puts "Found these broker IDs in Zookeeper: #{ids}"
        broker_set = broker_set.subtract(ids)
      end
      puts "Missing broker IDs: #{broker_set.to_a.sort}"
      broker_set
    end

    def close
      @candidate.close unless @candidate.nil?
      @zk.close! unless @zk.nil?
    end

    # Run a Zookeeper leader election for the chosen broker ID so that only one
    # process ever claims it.
    def become_broker
      elected = false
      got_result = false

      @candidate = @zk.election_candidate(
        "kafka-#{@broker_id}", @hostname, :follow => :leader)

      @candidate.on_winning_election {
        puts "Won election for kafka-#{@broker_id}"
        elected = true
        got_result = true

        @zk.on_expired_session do
          puts "ZK session expired"
          zk_reconnect
        end
      }

      @candidate.on_losing_election {
        puts "Lost election for kafka-#{@broker_id}"

        elected = false
        got_result = true
      }

      while !got_result
        puts "Trying to get elected for kafka-#{@broker_id}..."
        @candidate.vote!
        # Random sleep to help avoid a thundering herd
        sleep @prng.rand(@broker_count)
        @attempts += 1
        if @attempts > @max_attempts
          break
        end
      end

      if !elected
        close
        $stderr.puts "Couldn't become a broker. Suiciding."
        exit 1
      end
    end

    def run
      @broker_id = @broker_set.to_a.sample(:random => @prng)
      become_broker

      # Render server.properties from the ERB template using this instance's
      # broker ID, ports and hostname.
      erb = ERB.new(File.read('server.properties.erb'))
      File.open('server.properties', 'w') do |f|
        f.puts erb.result(binding)
      end

      at_exit {
        close
      }

      env = {
        "KAFKA_HEAP_OPTS" => "-XX:+UseConcMarkSweepGC -XX:+CMSIncrementalMode -Xmx#{@heap_size}m -Xms#{@heap_size / 2}m -XX:NewSize=#{@heap_size / 3}m -XX:MaxNewSize=#{@heap_size / 3}m -Xss256k -XX:+UseTLAB -XX:+AlwaysPreTouch",
        "SCALA_VERSION" => "2.10.3",
        "KAFKA_LOG4J_OPTS" => "-Dlog4j.configuration=file:log4j.properties",
        "KAFKA_JVM_PERFORMANCE_OPTS" => "-server -XX:+UseCompressedOops -XX:+CMSClassUnloadingEnabled -XX:+CMSScavengeBeforeRemark -XX:+DisableExplicitGC",
        "JMX_PORT" => @ports[1],
      }

      %x(tar xf kafka-exec.tar.xz)
      cmd = "./kafka-exec/bin/kafka-run-class.sh -name kafkaServer -loggc kafka.Kafka server.properties".freeze
      last_finished = 0

      loop do
        puts "About to run:"
        puts env, cmd
        GC.start # clean up memory
        system env, cmd
        finished = Time.now.to_f
        if finished - last_finished < 120
          # If the process was running for less than 2 minutes, abort. We're
          # probably 'bouncing'. Occasional restarts are okay, but not
          # continuous restarting.
          $stderr.puts "Kafka exited too soon!"
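          # Bail out and let Marathon decide whether and where to restart the
          # task, rather than retrying in a tight loop here.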
          exit 1
        end
        last_finished = finished
      end
    end
  end
end

begin
  broker = Kafka::Broker.new ARGV[0], ARGV[1], ARGV[2], ARGV[3]
  broker.run
rescue => e
  $stdout.puts e.inspect, e.backtrace
  $stderr.puts e.inspect, e.backtrace
ensure
  broker.close if broker
end
--------------------------------------------------------------------------------
/server.properties.erb:
--------------------------------------------------------------------------------
############################# Server Basics #############################

# The id of the broker. This must be set to a unique integer for each broker.
broker.id=<%= @broker_id %>

############################# Socket Server Settings #############################

# The port the socket server listens on
port=<%= @ports[0] %>

# Hostname the broker will bind to and advertise to producers and consumers.
# If not set, the server will bind to all interfaces and advertise the value
# returned from java.net.InetAddress.getCanonicalHostName().
host.name=<%= @hostname %>

# The number of threads handling network requests
num.network.threads=16

# The number of threads doing disk I/O
num.io.threads=4

# The send buffer (SO_SNDBUF) used by the socket server
socket.send.buffer.bytes=8388608

# The receive buffer (SO_RCVBUF) used by the socket server
socket.receive.buffer.bytes=8388608

# The maximum size of a request that the socket server will accept (protection against OOM)
socket.request.max.bytes=104857600

############################# Log Basics #############################

# A comma separated list of directories under which to store log files
log.dirs=./kafka-logs

# The number of logical partitions per topic per server. More partitions allow greater parallelism
# for consumption, but also mean more files.
num.partitions=48

############################# Log Flush Policy #############################

# Messages are immediately written to the filesystem but by default we only fsync() to sync
# the OS cache lazily. The following configurations control the flush of data to disk.
# There are a few important trade-offs here:
# 1. Durability: Unflushed data may be lost if you are not using replication.
# 2. Latency: Very large flush intervals may lead to latency spikes when the flush does occur as there will be a lot of data to flush.
# 3. Throughput: The flush is generally the most expensive operation, and a small flush interval may lead to excessive seeks.
# The settings below allow one to configure the flush policy to flush data after a period of time or
# every N messages (or both). This can be done globally and overridden on a per-topic basis.

# The number of messages to accept before forcing a flush of data to disk
log.flush.interval.messages=100000

# The maximum amount of time a message can sit in a log before we force a flush
log.flush.interval.ms=10000
log.flush.scheduler.interval.ms=10000

############################# Log Retention Policy #############################

# The following configurations control the disposal of log segments. The policy can
# be set to delete segments after a period of time, or after a given size has accumulated.
# A segment will be deleted whenever *either* of these criteria is met. Deletion always happens
# from the end of the log.

# The minimum age of a log file to be eligible for deletion
log.retention.hours=36

# A size-based retention policy for logs. Segments are pruned from the log as long as the remaining
# segments don't drop below log.retention.bytes.
#log.retention.bytes=1073741824

# The maximum size of a log segment file. When this size is reached a new log segment will be created.
log.segment.bytes=268435456

# The interval at which log segments are checked to see if they can be deleted according
# to the retention policies
log.cleanup.interval.mins=1

############################# Zookeeper #############################

# Zookeeper connection string (see zookeeper docs for details).
# This is a comma-separated list of host:port pairs, each corresponding to a zk
# server. e.g. "127.0.0.1:3000,127.0.0.1:3001,127.0.0.1:3002".
# You can also append an optional chroot string to the urls to specify the
# root directory for all kafka znodes.
zookeeper.connect=<%= @zk_servers %>/kafka-<%= @yaml['cluster'] %>

# Timeout in ms for connecting to zookeeper
zookeeper.session.timeout.ms=6000
zookeeper.connection.timeout.ms=6000
zookeeper.sync.time.ms=2000

log.cleanup.policy=delete

kafka.graphite.metrics.host=graphite-di.musta.ch
kafka.graphite.metrics.prefix=kafka.<%= @yaml['cluster'] %>.<%= @broker_id %>
kafka.graphite.metrics.reporter.enabled=true
kafka.metrics.reporters=kafka.metrics.KafkaGraphiteMetricsReporter
kafka.metrics.polling.interval.secs=60

controlled.shutdown.enable=true

replica.fetch.wait.max.ms=5000
replica.fetch.max.bytes=8388608
replica.socket.receive.buffer.bytes=8388608
replica.lag.max.messages=10000
replica.lag.time.max.ms=30000
replica.high.watermark.checkpoint.interval.ms=5000
replica.socket.timeout.ms=20000
num.replica.fetchers=3

controller.socket.timeout.ms=20000
controller.message.queue.size=10

auto.leader.rebalance.enable=true
message.max.bytes=8000000

fetch.purgatory.purge.interval.requests=1000
producer.purgatory.purge.interval.requests=1000
--------------------------------------------------------------------------------