├── kafka
│   ├── request_type.py
│   ├── __init__.py
│   ├── README
│   ├── LICENSE.txt
│   ├── message.py
│   ├── producer.py
│   ├── io.py
│   └── consumer.py
├── .gitignore
├── README.md
├── LICENSE
└── tail2kafka
    └── tail2kafka

/kafka/request_type.py:
--------------------------------------------------------------------------------
""" Kafka Constants """

PRODUCE = 0
FETCH = 1
MULTIFETCH = 2
MULTIPRODUCE = 3
OFFSETS = 4
--------------------------------------------------------------------------------
/kafka/__init__.py:
--------------------------------------------------------------------------------
#
# This great package has been taken from dsully's pykafka - https://github.com/dsully/pykafka
#
# Thanks a lot, dsully.
#
import kafka.consumer
import kafka.io
import kafka.message
import kafka.producer
import kafka.request_type
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.py[co]

# Packages
*.egg
*.egg-info
dist
build
eggs
parts
bin
var
sdist
develop-eggs
.installed.cfg

# Installer logs
pip-log.txt

# Unit test / coverage reports
.coverage
.tox

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
--------------------------------------------------------------------------------
/kafka/README:
--------------------------------------------------------------------------------
This package is derived from Daniel Sully's great pykafka module (https://github.com/dsully/pykafka).

There is currently an issue with installing that package separately using pip, so it is embedded here.

Once the issue is solved, this folder will be removed and the original pip installation will be used.

2012
Harel Ben-Attia
https://github.com/harelba
@harelba
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# tail2kafka

## Overview
tail2kafka is a Linux tool for sending log lines to a Kafka topic.

It supports local and remote Kafka servers, log rotation, and batching of messages to Kafka.

Many thanks to Daniel Sully for his pykafka module (https://github.com/dsully/pykafka), on which this tool is based.

## Contact
Any feedback/suggestions/complaints regarding this tool would be much appreciated. Contributions are most welcome as well, of course.

Harel Ben-Attia, harelba@gmail.com, @harelba on Twitter
--------------------------------------------------------------------------------
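For orientation, here is a minimal example invocation of the tail2kafka script found at the bottom of this listing. The log path, topic name and host are placeholders; -s/-p default to localhost:9092 and -b sets the batch size (default 10):

    tail2kafka/tail2kafka -l /var/log/myapp/app.log -t myapp-logs -s kafka-host.example.com -p 9092 -b 50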
/LICENSE:
--------------------------------------------------------------------------------
Copyright (C) 2012 Harel Ben-Attia @harelba

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/kafka/LICENSE.txt:
--------------------------------------------------------------------------------
Copyright (c) 2011 Daniel Sully

Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/kafka/message.py:
--------------------------------------------------------------------------------
import struct
import zlib

def parse_from(binary):
    """ Turn a packed binary message as received from a Kafka broker into a :class:`Message`. """

    # A message. The format of an N byte message is the following:
    #   1 byte  "magic" identifier, to allow format changes
    #   4 byte  CRC32 of the payload
    #   N - 5 byte payload
    size = struct.unpack('>i', binary[0:4])[0]
    magic = struct.unpack('>B', binary[4:5])[0]
    checksum = struct.unpack('>i', binary[5:9])[0]
    payload = binary[9:9+size]

    return Message(payload, magic, checksum)

class Message(object):
    """ A Kafka Message object. """

    MAGIC_IDENTIFIER_DEFAULT = 0

    def __init__(self, payload=None, magic=MAGIC_IDENTIFIER_DEFAULT, checksum=None):
        self.magic = magic
        self.checksum = checksum
        self.payload = None

        if payload is not None:
            self.payload = str(payload)

        if self.payload is not None and self.checksum is None:
            self.checksum = self.calculate_checksum()

    def __str__(self):
        return self.payload

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.magic == other.magic and self.payload == other.payload and self.checksum == other.checksum

        return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def calculate_checksum(self):
        """ Returns the CRC32 checksum of the payload. """

        return zlib.crc32(self.payload)

    def is_valid(self):
        """ Returns True if the checksum for this message is valid. """
        return self.checksum == self.calculate_checksum()

    def encode(self):
        """ Encode a :class:`Message` to binary form. """

        # <MAGIC: 1 byte><CRC32: 4 bytes><PAYLOAD: N bytes>
        return struct.pack('>Bi%ds' % len(self.payload), self.magic, self.calculate_checksum(), self.payload)
--------------------------------------------------------------------------------
/kafka/producer.py:
--------------------------------------------------------------------------------
import contextlib
import itertools
import struct

import kafka.io
import kafka.message
import kafka.request_type

class Producer(kafka.io.IO):
    """ Class for sending data to a Kafka broker. """

    PRODUCE_REQUEST_ID = kafka.request_type.PRODUCE

    def __init__(self, topic, partition=0, host='localhost', port=9092):
        kafka.io.IO.__init__(self, host, port)

        self.topic = topic
        self.partition = partition

        self.connect()

    def encode_request(self, messages):
        """ Encode a sequence of :class:`Message <kafka.message.Message>` objects for sending to the broker. """

        # Encode each message as <MESSAGE_LENGTH: 4 bytes><MESSAGE: N bytes>.
        encoded = [message.encode() for message in messages]
        lengths = [len(em) for em in encoded]

        # Build up the struct format.
        mformat = '>' + ''.join(['i%ds' % l for l in lengths])

        # Flatten the two lists to match the format.
        message_set = struct.pack(mformat, *list(itertools.chain.from_iterable(zip(lengths, encoded))))

        topic_len = len(self.topic)
        mset_len = len(message_set)

        # Create the request as <REQUEST_ID: 2 bytes><TOPIC_LENGTH: 2 bytes><TOPIC><PARTITION: 4 bytes><MESSAGE_SET_LENGTH: 4 bytes><MESSAGE_SET>.
        pformat = '>HH%dsii%ds' % (topic_len, mset_len)
        payload = struct.pack(pformat, self.PRODUCE_REQUEST_ID, topic_len, self.topic, self.partition, mset_len, message_set)

        # Prepend the total request length as a 4 byte integer.
        return struct.pack('>i%ds' % len(payload), len(payload), payload)

    def send(self, messages):
        """ Send a :class:`Message <kafka.message.Message>` or a sequence of `Messages` to the Kafka server. """

        if isinstance(messages, kafka.message.Message):
            messages = [messages]

        return self.write(self.encode_request(messages))

    @contextlib.contextmanager
    def batch(self):
        """ Collect messages in a list and send them with an implicit `send` when the block exits. """

        messages = []
        yield(messages)
        self.send(messages)
--------------------------------------------------------------------------------
/kafka/io.py:
--------------------------------------------------------------------------------
import array
import errno
import socket

class IO(object):
    """ Base class for handling socket communication with the Kafka server. """

    def __init__(self, host='localhost', port=9092):
        self.socket = None

        #: Hostname to connect to.
        self.host = host

        #: Port to connect to.
        self.port = port

    def connect(self):
        """ Connect to the Kafka server. """

        self.socket = socket.socket()
        self.socket.connect((self.host, self.port))

    def reconnect(self):
        """ Reconnect to the Kafka server. """
        self.disconnect()
        self.connect()

    def disconnect(self):
        """ Disconnect from the remote server & close the socket. """
        try:
            self.socket.close()
        except IOError:
            pass
        finally:
            self.socket = None

    def read(self, length):
        """ Read `length` bytes from the remote Kafka server. """

        # Create a character array to act as the buffer.
        buf = array.array('c', ' ' * length)
        read_length = 0

        try:
            while read_length < length:
                read_length += self.socket.recv_into(buf, length)

        except socket.error, e:
            if e.errno == errno.EAGAIN:
                self.disconnect()
                raise IOError("Timeout reading from the socket.")
            raise

        return buf.tostring()

    def write(self, data):
        """ Write `data` to the remote Kafka server. """

        if self.socket is None:
            self.reconnect()

        try:
            return self.__write(data)

        except socket.error, e:
            # Retry once on a broken connection.
            if e.errno not in (errno.ECONNRESET, errno.EPIPE, errno.ECONNABORTED):
                raise

            self.reconnect()
            return self.__write(data)

    def __write(self, data):
        write_length = len(data)
        wrote_length = 0

        while write_length > wrote_length:
            wrote_length += self.socket.send(data)

        return wrote_length
--------------------------------------------------------------------------------
/kafka/consumer.py:
--------------------------------------------------------------------------------
import struct
import time

import kafka.io
import kafka.message
import kafka.request_type

class Consumer(kafka.io.IO):

    CONSUME_REQUEST_TYPE = kafka.request_type.FETCH

    MAX_SIZE = 1024 * 1024

    # Polling interval, in seconds.
    DEFAULT_POLLING_INTERVAL = 2

    def __init__(self, topic, partition=0, host='localhost', port=9092):
        kafka.io.IO.__init__(self, host, port)

        #: The topic queue to consume.
        self.topic = topic

        #: The partition the topic queue is on.
        self.partition = partition

        #: Offset in the Kafka queue, in bytes.
        self.offset = 0

        #: Maximum message size to consume.
        self.max_size = self.MAX_SIZE
        self.request_type = self.CONSUME_REQUEST_TYPE
        self.polling = self.DEFAULT_POLLING_INTERVAL

        self.connect()

    def consume(self):
        """ Consume data from the topic queue. """

        self.send_consume_request()

        return self.parse_message_set_from(self.read_data_response())

    def loop(self):
        """ Loop over incoming messages from the queue in a blocking fashion. Set `polling` to control the check interval in seconds. """

        while True:
            messages = self.consume()

            if messages and isinstance(messages, list) and len(messages) > 0:
                for message in messages:
                    yield message

            time.sleep(self.polling)

    # REQUEST TYPE ID + TOPIC LENGTH + TOPIC + PARTITION + OFFSET + MAX SIZE
    def request_size(self):
        return 2 + 2 + len(self.topic) + 4 + 8 + 4

    def encode_request_size(self):
        return struct.pack('>i', self.request_size())

    def encode_request(self):
        length = len(self.topic)

        return struct.pack('>HH%dsiQi' % length, self.request_type, length, self.topic, self.partition, self.offset, self.max_size)

    def send_consume_request(self):
        self.write(self.encode_request_size())
        self.write(self.encode_request())

    def read_data_response(self):
        buf_length = struct.unpack('>i', self.read(4))[0]

        # Skip the first 2 bytes of the response (the error code).
        return self.read(buf_length)[2:]

    def parse_message_set_from(self, data):
        messages = []
        processed = 0
        length = len(data) - 4

        while (processed <= length):
            message_size = struct.unpack('>i', data[processed:processed+4])[0]
            messages.append(kafka.message.parse_from(data[processed:processed + message_size + 4]))
            processed += 4 + message_size

        self.offset += processed

        return messages
--------------------------------------------------------------------------------
/tail2kafka/tail2kafka:
--------------------------------------------------------------------------------
#!/usr/bin/python

import os, sys

# Assumes the kafka module is in a folder parallel to the folder this script exists in.
# Once we can install pykafka as a separate module, this will be removed.
sys.path.append(os.path.join(os.path.split(sys.argv[0])[0], '..'))
import kafka as k

from optparse import OptionParser
import subprocess
import atexit


should_stop = False
pending_messages = []

def create_kafka_producer(host, port, topic):
    return k.producer.Producer(topic, host=host, port=port)

def create_message_data(metadata, data):
    if metadata is not None:
        return "%s::%s" % (metadata, data)
    else:
        return data

def flush_messages(producer):
    global pending_messages
    print "flushing %d messages" % len(pending_messages)
    producer.send(pending_messages)
    pending_messages = []

def send_to_kafka(producer, message_text, batch_size):
    global pending_messages
    pending_messages.append(k.message.Message(message_text))
    if len(pending_messages) == batch_size:
        flush_messages(producer)

def log_lines_generator(logfile, delay_between_iterations=None):
    global should_stop
    cmd = ['tail', '-n', '0', '-F']
    if delay_between_iterations is not None:
        cmd.append('-s')
        cmd.append(delay_between_iterations)
    cmd.append(logfile)
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=None)
    while not should_stop:
        line = process.stdout.readline().strip()
        yield line

def main():
    parser = OptionParser(usage="""
%prog -l <log-file> -t <topic> -s <kafka-host> -p <kafka-port> [other-options]

Tails a log file continuously, sending log lines to a Kafka topic as messages, with support for log rotation. Optionally,
prepend a "metadata" string to each log line (the Kafka message will then contain <metadata>::<log-line>).

Set -l to the log file to be tailed. The log tailing supports log rotation.
Set -s and -p to set the Kafka server host and port (ZooKeeper is not supported for now).
Set -t to set the Kafka topic to send to.

Simple batching is supported (use -b to choose the batch size, default is 10).

Advanced: If needed, use -d in order to control the tail delay - unneeded in almost all cases.

NOTE: Currently expects the kafka/ module to be in a folder parallel to this script.
""")
    parser.add_option("-s", "--host", dest="host", default="localhost",
                      help="kafka host")
    parser.add_option("-p", "--port", dest="port", default="9092",
                      help="kafka port")
    parser.add_option("-t", "--topic", dest="topic", default=None,
                      help="REQUIRED: Topic to send to")
    parser.add_option("-l", "--log-file", dest="logfile", default=None,
                      help="REQUIRED: Log file to tail")
    parser.add_option("-m", "--metadata", dest="metadata", default=None,
                      help="Optional: metadata tag to send along with the data")
    parser.add_option("-b", "--batch-size", dest="batch_size", default="10",
                      help="Size of message batches")
    parser.add_option("-d", "--delay", dest="delay", default=None,
                      help="tail delay between iterations")

    (options, args) = parser.parse_args()

    if options.topic is None or options.logfile is None:
        parser.print_help()
        sys.exit(1)

    producer = create_kafka_producer(options.host, int(options.port), options.topic)
    atexit.register(flush_messages, producer)
    try:
        for line in log_lines_generator(options.logfile, delay_between_iterations=options.delay):
            mt = create_message_data(options.metadata, line)
            send_to_kafka(producer, mt, int(options.batch_size))
    except KeyboardInterrupt:
        pass

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
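To check the consuming side - for example, to verify that lines shipped by tail2kafka actually arrive - here is a minimal consumer sketch along the same lines as the producer example above (host and topic are again placeholders, not part of the repository):

    import kafka

    # Consumer: starts at offset 0 and advances as message sets are parsed (see consumer.py above).
    consumer = kafka.consumer.Consumer('myapp-logs', host='localhost', port=9092)

    # One-shot fetch of whatever is currently available.
    for message in consumer.consume():
        print message.payload

    # Or poll the topic every `polling` seconds (2 by default) in a blocking loop.
    for message in consumer.loop():
        print message.payload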