├── src ├── model │ ├── __init__.py │ └── worker_dto.py ├── utility │ ├── __init__.py │ ├── common_util.py │ ├── config_manager.py │ └── logging_util.py ├── exceptions │ ├── __init__.py │ └── usi_exceptions.py ├── kafka_core │ ├── __init__.py │ ├── ser_des_util.py │ ├── kafka_stream_writer.py │ ├── kafka_util.py │ ├── sink_task.py │ └── consumer_manager.py ├── transformers │ ├── __init__.py │ ├── test_transformer.py │ └── transformer.py ├── stream_writers │ ├── __init__.py │ ├── console_stream_writer.py │ └── stream_writer.py ├── __init__.py └── event_consumer_app.py ├── setup.cfg ├── requirements.txt ├── Dockerfile ├── config └── consumer_config.json ├── LICENSE.txt ├── setup.py ├── .gitignore ├── k8 └── ray │ └── ray-cluster-config.yaml └── README.md /src/model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/utility/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/exceptions/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/kafka_core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/stream_writers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = venv 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi==0.65.1 2 | uvicorn==0.13.4 3 | cachetools~=4.2.2 4 | starlette~=0.14.2 5 | pydantic~=1.7.4 6 | ratelimit==2.2.1 7 | ray==1.8.0 8 | setuptools==58.4.0 9 | kafka-python==2.0.2 10 | -------------------------------------------------------------------------------- /src/utility/common_util.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | 3 | CLIENT_ID = str(uuid.uuid4()) 4 | 5 | 6 | def singleton(cls): 7 | instances = {} 8 | 9 | def get_instance(): 10 | if cls not in instances: 11 | instances[cls] = cls() 12 | return instances[cls] 13 | 14 | return get_instance 15 | -------------------------------------------------------------------------------- /src/exceptions/usi_exceptions.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | 4 | class GenericException(Exception): 5 | 6 | def __init__(self, message): 7 | super().__init__(message) 8 | self.message = message 9 | self.when = datetime.now() 10 | 11 | 12 | class BadConsumerConfigException(GenericException): 13 | 14 | def __init__(self, message): 15 | super().__init__(message) 16 | self.message = message 17 | self.when = datetime.now() 18 | 19 | 20 | class BadInput(GenericException): 21 | 22 | def __init__(self, 
message): 23 | super().__init__(message) 24 | self.message = message 25 | self.when = datetime.now() 26 | -------------------------------------------------------------------------------- /src/kafka_core/ser_des_util.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from src.exceptions.usi_exceptions import BadConsumerConfigException 4 | 5 | SER_DES_OPTIONS = { 6 | 'STRING_SER': lambda k: k.encode('utf-8') if k is not None else k, 7 | 'JSON_SER': lambda v: json.dumps(v).encode('utf-8') if v is not None else v, 8 | 'STRING_DES': lambda k: k.decode('utf-8') if k is not None else k, 9 | 'JSON_DES': lambda v: json.loads(v) if v is not None else v, 10 | 11 | } 12 | 13 | 14 | def get_ser_des(name: str): 15 | ser_des_cal = SER_DES_OPTIONS.get(name) 16 | if ser_des_cal is None: 17 | raise BadConsumerConfigException(f'No Serializer/Deserializer found with name {name}') 18 | return ser_des_cal 19 | -------------------------------------------------------------------------------- /src/stream_writers/console_stream_writer.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import List 3 | 4 | from src.model.worker_dto import SinkRecordDTO 5 | from src.stream_writers.stream_writer import StreamWriter 6 | 7 | 8 | class ConsoleStreamWriter(StreamWriter): 9 | 10 | def write(self, streams: List[SinkRecordDTO]) -> None: 11 | """ 12 | Writes processed records read from kafka to Elastic search 13 | :param streams: List of SinkRecordDTO - transformed data to be written to ES 14 | :return: None 15 | """ 16 | for sink_record_dto in streams: 17 | logging.info(f' Key: {sink_record_dto.key} - value: {sink_record_dto.message}') 18 | 19 | def close(self) -> None: 20 | pass 21 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | # set the default logging level to info 5 | logging.basicConfig(level=logging.INFO) 6 | 7 | ROOT_SRC_DIR = os.path.dirname(os.path.abspath(__file__)) 8 | USERNAME = os.environ.get('APP_USERNAME', 'admin') 9 | PASSWORD = os.environ.get('APP_PASSWORD', 'admin') 10 | 11 | WORKER_NUM_CPUS = os.environ.get('WORKER_NUM_CPUS', .25) 12 | 13 | SASL_USERNAME = os.environ.get('SASL_USERNAME', None) 14 | SASL_PASSWORD = os.environ.get('SASL_PASSWORD', None) 15 | SECURITY_PROTOCOL = os.environ.get('SECURITY_PROTOCOL', 'PLAINTEXT') 16 | SASL_MECHANISM = os.environ.get('SASL_MECHANISM') 17 | WORKER_CONFIG_PATH = os.environ.get('WORKER_CONFIG_PATH', '/../config/consumer_config.json') 18 | RAY_HEAD_ADDRESS = os.environ.get('RAY_HEAD_ADDRESS', 'auto') 19 | LOCAL_MODE = os.environ.get('LOCAL_MODE', 'Y') 20 | -------------------------------------------------------------------------------- /src/utility/config_manager.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import json 3 | import os 4 | 5 | from src import WORKER_CONFIG_PATH, ROOT_SRC_DIR 6 | from src.exceptions.usi_exceptions import BadInput 7 | from src.utility.common_util import singleton 8 | 9 | ENV = os.environ.get('env', 'dev') 10 | AWS_REGION = os.environ.get('aws_region', '') 11 | 12 | 13 | @singleton 14 | class ConfigManager: 15 | 16 | def __init__(self): 17 | self._worker_config = None 18 | self._load_consumer_config() 19 | 20 | def _load_consumer_config(self) -> None: 21 | with open(ROOT_SRC_DIR + 
WORKER_CONFIG_PATH, "r") as f: 22 | self._worker_config = json.load(f) 23 | 24 | def get_worker_config(self) -> list: 25 | return copy.deepcopy(self._worker_config) 26 | 27 | def get_worker_config_by_name(self, name: str) -> dict: 28 | 29 | for config in self._worker_config: 30 | if config['consumer_name'] == name: 31 | return config 32 | 33 | raise BadInput(f'Consumer name: {name}, is not configured.') 34 | -------------------------------------------------------------------------------- /src/model/worker_dto.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import Optional, Dict 3 | 4 | from pydantic import BaseModel 5 | 6 | 7 | class SinkOperationType(Enum): 8 | UPSERT = 1 9 | DELETE = 2 10 | INSERT = 4 11 | 12 | 13 | class SinkOperation(BaseModel): 14 | sink_operation_type: SinkOperationType 15 | 16 | # update query for update by query 17 | update_query: Dict = {} 18 | 19 | # source field name for script-based update 20 | source_field_name: Optional[str] 21 | 22 | # this can be a json or any primitive value 23 | new_val: object = None 24 | 25 | class Config: 26 | arbitrary_types_allowed = True 27 | 28 | 29 | class SinkRecordDTO(BaseModel): 30 | sink_operation: SinkOperation 31 | message: Dict 32 | key: Optional[str] 33 | offset: Optional[int] 34 | topic: Optional[str] 35 | partition: Optional[int] 36 | 37 | 38 | class DeadLetterDTO(BaseModel): 39 | key: str 40 | message: str 41 | topic: str 42 | partition: int 43 | failed_at: str 44 | error: str 45 | offset: int 46 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8.5-slim 2 | 3 | ### common steps start 4 | 5 | # username 6 | ARG APP_USER=app 7 | 8 | # Create user to run the application 9 | RUN groupadd -r ${APP_USER} && useradd --no-log-init -r -g ${APP_USER} ${APP_USER} 10 | 11 | # create application working directory 12 | WORKDIR /var/www-api 13 | RUN chown -R app:app . /usr/local; 14 | RUN mkdir -p .tmp .log; 15 | RUN chown -R app:app .tmp 16 | RUN chown -R app:app .log 17 | ### common steps end 18 | 19 | # switch to app user 20 | USER ${APP_USER}:${APP_USER} 21 | 22 | # bundle app source 23 | COPY --chown=app requirements.txt . 24 | COPY --chown=app ./src ./src 25 | COPY --chown=app ./config ./config 26 | COPY --chown=app setup.cfg . 27 | COPY --chown=app setup.py . 28 | COPY --chown=app README.md . 29 | 30 | # install dependency requirements 31 | RUN pip install -r requirements.txt 32 | 33 | RUN pip install -e . 
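# (editable install via setup.py: lets the app and the Ray head/worker processes locate and pickle/unpickle the src.* modules - see README Step 3)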
34 | 35 | # set application environment 36 | ENV APP_ENV="production" 37 | 38 | # expose applicable server port 39 | EXPOSE 8002 40 | 41 | CMD ["uvicorn", "--host", "0.0.0.0", "--port", "8002", "src.event_consumer_app:app"] 42 | -------------------------------------------------------------------------------- /config/consumer_config.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "consumer_name": "some_consumer_group_name", 4 | "topic_name": "test-topic", 5 | "number_of_workers": 2, 6 | "enable_auto_commit": false, 7 | "bootstrap_servers": "192.168.64.1:9092", 8 | "key_deserializer": "STRING_DES", 9 | "value_deserializer": "STRING_DES", 10 | "header_deserializer": null, 11 | "auto_offset_reset": "earliest", 12 | "max_poll_records": 20, 13 | "max_poll_interval_ms": 60000, 14 | "sink_configs": { 15 | "transformer_cls": "src.transformers.test_transformer.SampleTransformer", 16 | "num_retries": 3, 17 | "retry_delay_seconds": 1, 18 | "stream_writers": [ 19 | "src.stream_writers.console_stream_writer.ConsoleStreamWriter" 20 | ] 21 | }, 22 | "dlq_config": { 23 | "bootstrap_servers": "192.168.64.1:9092", 24 | "topic_name": "test-dlq", 25 | "key_serializer": "STRING_SER", 26 | "value_serializer": "STRING_SER", 27 | "acks": "all", 28 | "compression_type": "gzip", 29 | "retries": 3, 30 | "linger_ms": 10 31 | } 32 | } 33 | ] 34 | -------------------------------------------------------------------------------- /src/transformers/test_transformer.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from kafka.consumer.fetcher import ConsumerRecord 4 | 5 | from src.model.worker_dto import SinkRecordDTO, SinkOperation, SinkOperationType 6 | from src.transformers.transformer import StreamTransformer 7 | 8 | 9 | class SampleTransformer(StreamTransformer): 10 | def transform(self, consumer_record: ConsumerRecord) -> SinkRecordDTO: 11 | """ 12 | converts message to message dict 13 | :param consumer_record: kafka consumer record 14 | :return: SinkRecordDTO 15 | """ 16 | # do something here 17 | message_dict: dict = json.loads(consumer_record.value) 18 | sink_operation = SinkOperation( 19 | sink_operation_type=SinkOperationType.UPSERT 20 | ) 21 | 22 | return SinkRecordDTO(key=consumer_record.key, 23 | message=message_dict, 24 | topic=consumer_record.topic, 25 | offset=consumer_record.offset, 26 | sink_operation=sink_operation, 27 | partition=consumer_record.partition) 28 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) Bikas Katwal - bikas.katwal10@gmail.com 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/transformers/transformer.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | from abc import ABC, abstractmethod 3 | 4 | from kafka.consumer.fetcher import ConsumerRecord 5 | 6 | from src.exceptions.usi_exceptions import BadInput 7 | from src.model.worker_dto import SinkRecordDTO 8 | 9 | 10 | class StreamTransformer(ABC): 11 | 12 | def __init__(self, config: dict): 13 | self.config = config 14 | 15 | @abstractmethod 16 | def transform(self, consumer_record: ConsumerRecord) -> SinkRecordDTO: 17 | """ 18 | Transforms the JSON for the sink update and extracts the operation associated with the event 19 | :param consumer_record: kafka consumer record 20 | :return: the sink record that will be written to the sink datastore 21 | """ 22 | 23 | 24 | def get_transformer(cls_path: str, config: dict) -> StreamTransformer: 25 | module_name, class_name = cls_path.rsplit(".", 1) 26 | stream_transformer = getattr(importlib.import_module(module_name), class_name) 27 | 28 | if not issubclass(stream_transformer, StreamTransformer): 29 | raise BadInput(f'{cls_path} is not a subclass of StreamTransformer') 30 | 31 | return stream_transformer(config) 32 | -------------------------------------------------------------------------------- /src/stream_writers/stream_writer.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | from abc import abstractmethod, ABC 3 | from typing import List 4 | 5 | from src.exceptions.usi_exceptions import BadInput 6 | 7 | 8 | class StreamWriter(ABC): 9 | 10 | def __init__(self, config: dict): 11 | self.config = config 12 | 13 | @abstractmethod 14 | def write(self, streams: list) -> None: 15 | """ 16 | Implement this interface to create an instance of stream writer 17 | :param streams: key and message dictionary 18 | :return: None 19 | """ 20 | 21 | @abstractmethod 22 | def close(self) -> None: 23 | """ 24 | tear down 25 | :return: 26 | """ 27 | 28 | 29 | def get_stream_writers(cls_paths: List[str], config: dict) -> List[StreamWriter]: 30 | stream_writers: List[StreamWriter] = [] 31 | for cls_path in cls_paths: 32 | module_name, class_name = cls_path.rsplit(".", 1) 33 | stream_writer_cls = getattr(importlib.import_module(module_name), class_name) 34 | 35 | if not issubclass(stream_writer_cls, StreamWriter): 36 | raise BadInput(f'{cls_path} is not a subclass of StreamWriter') 37 | 38 | stream_writers.append(stream_writer_cls(config)) 39 | return stream_writers 40 | -------------------------------------------------------------------------------- /src/utility/logging_util.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | LOG_FILE_PATH = '/var/www-api/.log/consumer_app.log' 5 | LOG_DIR_PATH = '/var/www-api/.log' 6 | LOG_FORMATTER = '%(levelname)s - %(asctime)s - %(name)s - %(message)s' 7 | 8 | 9 | def 
is_valid_path(path: str): 10 | if os.path.exists(path) or os.access(os.path.dirname(path), os.W_OK): 11 | return True 12 | return False 13 | 14 | 15 | def get_logger(name=None): 16 | logger = logging.getLogger(name) 17 | logger.setLevel(logging.INFO) 18 | 19 | std_out_handler = StdOutHandler() 20 | std_out_handler.setLevel(logging.INFO) 21 | logger.addHandler(std_out_handler) 22 | 23 | if is_valid_path(LOG_DIR_PATH): 24 | file_out_handler = FileOutHandler() 25 | file_out_handler.setLevel(logging.INFO) 26 | logger.addHandler(file_out_handler) 27 | 28 | return logger 29 | 30 | 31 | class StdOutHandler(logging.StreamHandler): 32 | def __init__(self, stream=None): 33 | super(StdOutHandler, self).__init__() 34 | 35 | def format(self, record): 36 | self.formatter = logging.Formatter(LOG_FORMATTER) 37 | return super(StdOutHandler, self).format(record) 38 | 39 | 40 | class FileOutHandler(logging.FileHandler): 41 | def __init__(self): 42 | super(FileOutHandler, self).__init__(filename=LOG_FILE_PATH) 43 | 44 | def format(self, record): 45 | self.formatter = logging.Formatter(LOG_FORMATTER) 46 | return super(FileOutHandler, self).format(record) 47 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | setup( 7 | name='kafka-connect-dependency', 8 | version='0.1.1', 9 | license='MIT', 10 | url='https://github.com/bkatwal/distributed-kafka-consumer-python', 11 | author='Bikas Katwal', 12 | author_email='bikas.katwal10@gmail.com', 13 | description='Library to run distributed Kafka Consumers using Ray', 14 | long_description='This library need to be installed in ray nodes. So, ray head and worker ' 15 | 'nodes can find and pickle/unpickle Kafka Consumer modules.', 16 | keywords=['ray', 'kafka', 'consumer'], 17 | long_description_content_type="text/markdown", 18 | py_modules=['src.exceptions.usi_exceptions', 19 | 'src.kafka_core.consumer_manager', 20 | 'src.kafka_core.kafka_stream_writer', 21 | 'src.kafka_core.kafka_util', 22 | 'src.kafka_core.ser_des_util', 23 | 'src.kafka_core.sink_task', 24 | 'src.model.worker_dto', 25 | 'src.stream_writers.stream_writer', 26 | 'src.stream_writers.console_stream_writer', 27 | 'src.transformers.transformer', 28 | 'src.transformers.test_transformer', 29 | 'src.utility.common_util', 30 | 'src.utility.config_manager', 31 | 'src.utility.logging_util'], 32 | python_requires='>=3', 33 | install_requires=[ 34 | 'fastapi==0.65.1', 35 | 'uvicorn==0.13.4', 36 | 'cachetools~=4.2.2', 37 | 'starlette~=0.14.2', 38 | 'pydantic~=1.7.4', 39 | 'ratelimit==2.2.1', 40 | 'ray==1.8.0', 41 | 'kafka-python==2.0.2' 42 | ] 43 | ) 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # These are some examples of commonly ignored file patterns. 2 | # You should customize this list as applicable to your project. 
3 | # Learn more about .gitignore: 4 | # https://www.atlassian.com/git/tutorials/saving-changes/gitignore 5 | 6 | # Node artifact files 7 | node_modules/ 8 | dist/ 9 | 10 | # Compiled Java class files 11 | *.class 12 | 13 | # Compiled Python bytecode 14 | *.py[cod] 15 | 16 | # Log files 17 | *.log 18 | 19 | # Package files 20 | *.jar 21 | 22 | *.iml 23 | 24 | # Maven 25 | target/ 26 | dist/ 27 | 28 | # JetBrains IDE 29 | .idea/ 30 | 31 | # Unit test reports 32 | TEST*.xml 33 | 34 | # Generated by MacOS 35 | .DS_Store 36 | 37 | # Generated by Windows 38 | Thumbs.db 39 | 40 | # Applications 41 | *.app 42 | *.exe 43 | *.war 44 | 45 | # Large media files 46 | *.mp4 47 | *.tiff 48 | *.avi 49 | *.flv 50 | *.mov 51 | *.wmv 52 | 53 | 54 | .idea/ 55 | 56 | # dotenv 57 | event_consumer/.env 58 | 59 | # virtualenv 60 | .venv 61 | venv*/ 62 | ENV/ 63 | test-reports/ 64 | 65 | # Installer logs 66 | pip-log.txt 67 | pip-delete-this-directory.txt 68 | 69 | # Unit test / coverage reports 70 | htmlcov/ 71 | .tox/ 72 | .coverage 73 | .coverage.* 74 | .cache 75 | nosetests.xml 76 | coverage.xml 77 | *.cover 78 | .hypothesis/ 79 | 80 | # Byte-compiled / optimized / DLL files 81 | __pycache__/ 82 | *.py[cod] 83 | *$py.class 84 | 85 | # Ops Log files 86 | trie_rejected_words.txt 87 | 88 | # trained models 89 | event_consumer/trained_models/* 90 | !event_consumer/trained_models/__init__.py 91 | !event_consumer/trained_models/generated_dictionaries/ 92 | 93 | # C extensions 94 | *.so 95 | 96 | # Distribution / packaging 97 | .Python 98 | env/ 99 | build/ 100 | develop-eggs/ 101 | dist/ 102 | downloads/ 103 | eggs/ 104 | .eggs/ 105 | lib/ 106 | lib64/ 107 | parts/ 108 | sdist/ 109 | var/ 110 | wheels/ 111 | *.egg-info/ 112 | .installed.cfg 113 | *.egg 114 | -------------------------------------------------------------------------------- /src/kafka_core/kafka_stream_writer.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import TypeVar, Generic, List 3 | 4 | from kafka import KafkaProducer 5 | 6 | from src.exceptions.usi_exceptions import BadConsumerConfigException 7 | from src.kafka_core.ser_des_util import get_ser_des 8 | from src.stream_writers.stream_writer import StreamWriter 9 | from src.utility import logging_util 10 | from src.utility.common_util import CLIENT_ID 11 | 12 | logger = logging_util.get_logger(__name__) 13 | 14 | T = TypeVar('T') 15 | 16 | 17 | class KafkaStreamWriter(Generic[T], StreamWriter): 18 | 19 | def __init__(self, config: dict): 20 | self.config = config 21 | self.kafka_producer = self.__create_kafka_producer() 22 | if not config.get('topic_name'): 23 | raise BadConsumerConfigException('missing producer topic name.') 24 | self.topic = config.get('topic_name') 25 | 26 | def write(self, streams: List[T]) -> None: 27 | """ 28 | writes message of type T to a kafka topic 29 | :param streams: list of messages 30 | :return: None 31 | """ 32 | for event in streams: 33 | key = None 34 | if hasattr(event, 'key'): 35 | key = event.key 36 | 37 | self.kafka_producer.send(topic=self.topic, key=key, value=json.dumps(event.__dict__)) 38 | 39 | def close(self) -> None: 40 | """ 41 | closes the writer kafka producer object 42 | :return: None 43 | """ 44 | if self.kafka_producer: 45 | self.kafka_producer.close() 46 | 47 | def __create_kafka_producer(self) -> KafkaProducer: 48 | bootstrap_servers = self.config.get('bootstrap_servers') 49 | 50 | return KafkaProducer(bootstrap_servers=bootstrap_servers, 51 | 
key_serializer=get_ser_des(self.config.get( 52 | 'key_serializer', 'STRING_SER')), 53 | value_serializer=get_ser_des(self.config.get( 54 | 'value_serializer', 'STRING_SER')), 55 | acks=self.config.get('acks', 'all'), 56 | compression_type=self.config.get('compression_type', 57 | 'gzip'), 58 | retries=self.config.get('retries', 1), 59 | linger_ms=self.config.get('linger_ms', 10), 60 | client_id=CLIENT_ID) 61 | -------------------------------------------------------------------------------- /src/kafka_core/kafka_util.py: -------------------------------------------------------------------------------- 1 | from kafka import KafkaConsumer 2 | 3 | TWO_MINUTES = 2 4 | 5 | 6 | def is_end_offset_none(end_offsets: dict, start_offsets: dict) -> bool: 7 | """ 8 | Utility function to check if the partition that has start offset has end offset too. 9 | :param end_offsets: topic partition and end offsets 10 | :param start_offsets:topic partition and start offsets 11 | :return: True/False 12 | """ 13 | if len(end_offsets) == 0: 14 | return True 15 | 16 | for tp, offset in end_offsets.items(): 17 | if offset is None and start_offsets[tp] is not None: 18 | return True 19 | 20 | return False 21 | 22 | 23 | def is_all_end_offset_found(end_offsets: dict, start_offsets: dict) -> bool: 24 | """ 25 | Utility function to check if the partition that has start offset has end offset too. 26 | :param end_offsets: topic partition and end offsets 27 | :param start_offsets:topic partition and start offsets 28 | :return: True/False 29 | """ 30 | if len(end_offsets) == 0: 31 | return False 32 | 33 | for tp, offset in end_offsets.items(): 34 | if offset is None and start_offsets[tp] is not None: 35 | return False 36 | 37 | return True 38 | 39 | 40 | def get_start_end_offsets(start_timestamp: int, end_timestamp: int, 41 | topic_partitions: set, consumer: KafkaConsumer): 42 | """ 43 | Get start and end offset for all the partitions based on the given start and end timestamp 44 | :param start_timestamp: start timestamp in epoch time millis 45 | :param end_timestamp: end timestamp in epoch time millis 46 | :param topic_partitions: topic partition set 47 | :param consumer: kafka consumer 48 | :return: tuple of start offsets and end offsets for each partition 49 | """ 50 | tp_start_timestamps: dict = {} 51 | for tp in topic_partitions: 52 | tp_start_timestamps[tp] = start_timestamp 53 | 54 | start_offsets = consumer.offsets_for_times(tp_start_timestamps) 55 | end_offsets = {} 56 | # go back 2 minute and keep checking if there are end offsets in partition 57 | tp_end_timestamps: dict = {} 58 | while not is_all_end_offset_found(start_offsets=start_offsets, end_offsets=end_offsets): 59 | for tp in topic_partitions: 60 | # seek previous offset from a partition only if the offset is not found 61 | if len(end_offsets) == 0 or (end_offsets[tp] is None and start_offsets[tp] is not 62 | None): 63 | tp_end_timestamps[tp] = end_timestamp 64 | 65 | end_offsets = consumer.offsets_for_times(tp_end_timestamps) 66 | end_timestamp = end_timestamp - (TWO_MINUTES * 60 * 1000) 67 | 68 | return start_offsets, end_offsets 69 | -------------------------------------------------------------------------------- /src/event_consumer_app.py: -------------------------------------------------------------------------------- 1 | import secrets 2 | import time 3 | 4 | from fastapi import FastAPI, Depends, HTTPException, status 5 | from fastapi.security import HTTPBasic, HTTPBasicCredentials 6 | from starlette.requests import Request 7 | from starlette.responses 
import JSONResponse 8 | 9 | from src import USERNAME, PASSWORD 10 | from src.exceptions.usi_exceptions import BadInput, GenericException 11 | from src.kafka_core.consumer_manager import ConsumerWorkerManager 12 | from src.utility import logging_util 13 | 14 | logger = logging_util.get_logger(__name__) 15 | 16 | app = FastAPI(title="Distributed Kafka Consumer Using Ray - Manager") 17 | security = HTTPBasic() 18 | cwm = ConsumerWorkerManager() 19 | 20 | 21 | def authorize(credentials: HTTPBasicCredentials = Depends(security)): 22 | correct_username = secrets.compare_digest(credentials.username, USERNAME) 23 | correct_password = secrets.compare_digest(credentials.password, PASSWORD) 24 | if not (correct_username and correct_password): 25 | raise HTTPException( 26 | status_code=status.HTTP_401_UNAUTHORIZED, 27 | detail="Incorrect username or password", 28 | headers={"WWW-Authenticate": "Basic"}, 29 | ) 30 | return credentials.username 31 | 32 | 33 | @app.on_event("startup") 34 | def on_startup(): 35 | cwm.start_all_workers() 36 | 37 | 38 | @app.on_event("shutdown") 39 | def on_shutdown(): 40 | cwm.stop_all_workers() 41 | 42 | 43 | @app.post('/manager/health', include_in_schema=False) 44 | def health(): 45 | return {'message': 'App is up!'} 46 | 47 | 48 | @app.post('/manager/start-consumers', dependencies=[Depends(authorize)]) 49 | def start_consumers(): 50 | cwm.start_all_workers() 51 | return "Successfully started all workers!" 52 | 53 | 54 | @app.get('/manager/fetch-consumers', dependencies=[Depends(authorize)]) 55 | def get_consumers(): 56 | return cwm.get_all_running_consumer() 57 | 58 | 59 | @app.post('/manager/read-from-timestamp', dependencies=[Depends(authorize)]) 60 | def read_from_timestamp(consumer_name: str, start_timestamp: int, 61 | end_timestamp: int = int(time.time() * 1000), 62 | stop_running_consumer: bool = True): 63 | cwm.start_worker_with_timestamp(start_timestamp=start_timestamp, end_timestamp=end_timestamp, 64 | stop_regular=stop_running_consumer, name=consumer_name) 65 | return "Successfully started!" 66 | 67 | 68 | @app.post('/manager/start-consumer/{consumer_name}', dependencies=[Depends(authorize)]) 69 | def start_consumer(consumer_name): 70 | cwm.start_worker(consumer_name) 71 | return "Successfully started worker!" 72 | 73 | 74 | @app.post('/manager/stop-consumers', dependencies=[Depends(authorize)]) 75 | def stop_consumers(): 76 | cwm.stop_all_workers() 77 | return "Successfully Stopped all workers!" 78 | 79 | 80 | @app.post('/manager/stop-consumer/{consumer_name}', dependencies=[Depends(authorize)]) 81 | def stop_consumer(consumer_name): 82 | cwm.stop_worker(consumer_name) 83 | return "Successfully Stopped!" 84 | 85 | 86 | @app.exception_handler(Exception) 87 | def generic_exception_handler(request: Request, exc: Exception): 88 | logger.error(exc) 89 | return JSONResponse( 90 | status_code=500, 91 | content={"message": "Oops! 
I messed up!"}, 92 | ) 93 | 94 | 95 | @app.exception_handler(GenericException) 96 | def request_validation_exception_handler(request: Request, exc: GenericException): 97 | logger.error(exc) 98 | return JSONResponse( 99 | status_code=500, 100 | content={"message": exc.message}, 101 | ) 102 | 103 | 104 | @app.exception_handler(BadInput) 105 | def request_validation_exception_handler(request: Request, exc: BadInput): 106 | logger.error(exc) 107 | return JSONResponse( 108 | status_code=422, 109 | content={"message": exc.message}, 110 | ) 111 | -------------------------------------------------------------------------------- /src/kafka_core/sink_task.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | from abc import ABC 4 | from typing import List 5 | 6 | from kafka.consumer.fetcher import ConsumerRecord 7 | from ratelimit import limits, sleep_and_retry 8 | 9 | from src.exceptions.usi_exceptions import BadConsumerConfigException 10 | from src.kafka_core.kafka_stream_writer import KafkaStreamWriter 11 | from src.model.worker_dto import DeadLetterDTO, SinkRecordDTO 12 | from src.stream_writers.stream_writer import StreamWriter, get_stream_writers 13 | from src.transformers.transformer import get_transformer 14 | 15 | ONE_SECOND = 1 16 | CALLS = 20 17 | 18 | 19 | class SinkTask(ABC): 20 | 21 | def __init__(self, config: dict): 22 | self.sink_configs = config.get('sink_configs') 23 | if self.sink_configs is None: 24 | raise BadConsumerConfigException('Missing Sink Config.') 25 | self.config = config 26 | processor_cls_path = self.sink_configs.get('transformer_cls') 27 | if not processor_cls_path: 28 | raise BadConsumerConfigException('sink_configs.transformer_cls is a mandatory config') 29 | self.stream_transformer = get_transformer(processor_cls_path, self.sink_configs) 30 | self.operation_extractor = None 31 | stream_writer_cls_paths: List[str] = self.sink_configs.get('stream_writers') 32 | if not stream_writer_cls_paths or len(stream_writer_cls_paths) == 0: 33 | raise BadConsumerConfigException('sink_configs.stream_writers is a mandatory config') 34 | self.sink_stream_writers: List[StreamWriter] = get_stream_writers( 35 | stream_writer_cls_paths, self.sink_configs) 36 | if config.get('dlq_config') is not None: 37 | self.dlq_stream_writer: KafkaStreamWriter[DeadLetterDTO] = KafkaStreamWriter( 38 | config.get('dlq_config')) 39 | self.retries = self.sink_configs.get('num_retries', 3) 40 | self.retry_delay_seconds = self.sink_configs.get('retry_delay_seconds', 1) 41 | 42 | def write_to_sink(self, sink_record_dto_list: List[SinkRecordDTO]): 43 | for stream_writer in self.sink_stream_writers: 44 | retries = 0 45 | while retries <= self.retries: 46 | try: 47 | stream_writer.write(sink_record_dto_list) 48 | break 49 | except Exception as e: 50 | if retries == self.retries: 51 | raise e 52 | retries = retries + 1 53 | logging.error(f'{type(stream_writer)} - Failed with exception: {e}, retrying ' 54 | f'attempt' 55 | f' {retries}') 56 | time.sleep(self.retry_delay_seconds) 57 | 58 | @sleep_and_retry 59 | @limits(calls=CALLS, period=1) 60 | def process(self, consumer_records: List[ConsumerRecord]): 61 | 62 | for consumer_record in consumer_records: 63 | try: 64 | sink_record_dto: SinkRecordDTO = self.stream_transformer.transform(consumer_record) 65 | sink_record_dto_list: List[SinkRecordDTO] = [sink_record_dto] 66 | except Exception as e: 67 | self.handle_dlq_push(consumer_record.key, consumer_record.value, 68 | consumer_record.topic, 
consumer_record.partition, 69 | 'TRANSFORM', e, consumer_record.offset) 70 | continue 71 | 72 | try: 73 | self.write_to_sink(sink_record_dto_list) 74 | except Exception as e: 75 | self.handle_dlq_push(consumer_record.key, consumer_record.value, 76 | consumer_record.topic, consumer_record.partition, 77 | 'SINK_UPDATE', e, consumer_record.offset) 78 | 79 | def handle_dlq_push(self, key: str, message: str, topic: str, partition: int, 80 | failed_at: str, error: Exception, offset: int): 81 | logging.warning( 82 | f'failed to {failed_at} key: {key} and message: {message}, in topic {topic} ' 83 | f'having offset {offset}, with error: {error}') 84 | try: 85 | if self.dlq_stream_writer is not None: 86 | dead_letter = DeadLetterDTO(key=key, message=message, topic=topic, 87 | partition=partition, failed_at=failed_at, 88 | error=str(error) if error is not None else "", 89 | offset=offset) 90 | self.dlq_stream_writer.write([dead_letter]) 91 | except Exception as e: 92 | logging.error(f'Failed to write to DLQ: {e}') 93 | -------------------------------------------------------------------------------- /k8/ray/ray-cluster-config.yaml: -------------------------------------------------------------------------------- 1 | #### 2 | # We are creating one head pod and 2 worker replica pods. 3 | # Head node takes one core and 512 MB of memory — change this as you need 4 | # Worker node takes 0.5 CPU and 512 MB of memory — change this as you need 5 | # Expose dashboard, ray head node and Redis server for public access using K8 external service 6 | # using NodePort. 7 | # Install all code dependencies as a package in both head and worker nodes. This way Ray head 8 | # nodes and worker nodes can find these modules. 9 | #### 10 | --- 11 | # create namespace, where all ray components will be deployed 12 | apiVersion: v1 13 | kind: Namespace 14 | metadata: 15 | name: ray 16 | --- 17 | # create service to expose ray head, redis and ray dashboard. 18 | apiVersion: v1 19 | kind: Service 20 | metadata: 21 | namespace: ray 22 | name: ray-head-service 23 | spec: 24 | type: NodePort 25 | ports: 26 | - name: client 27 | protocol: TCP 28 | port: 10001 29 | targetPort: 10001 30 | nodePort: 30001 31 | - name: dashboard 32 | protocol: TCP 33 | port: 8265 34 | targetPort: 8265 35 | nodePort: 30002 36 | - name: redis 37 | protocol: TCP 38 | port: 6379 39 | targetPort: 6379 40 | nodePort: 30003 41 | selector: 42 | component: ray-head 43 | --- 44 | apiVersion: v1 45 | kind: ConfigMap 46 | metadata: 47 | name: ray-head-config 48 | namespace: ray 49 | data: 50 | ray-head-url: ray-head-service 51 | --- 52 | apiVersion: apps/v1 53 | kind: Deployment 54 | metadata: 55 | namespace: ray 56 | name: ray-head 57 | spec: 58 | # Do not change this - Ray currently only supports one head node per cluster. 59 | replicas: 1 60 | selector: 61 | matchLabels: 62 | component: ray-head 63 | type: ray 64 | template: 65 | metadata: 66 | labels: 67 | component: ray-head 68 | type: ray 69 | spec: 70 | # If the head node goes down, the entire cluster (including all worker 71 | # nodes) will go down as well. If you want Kubernetes to bring up a new 72 | # head node in this case, set this to "Always," else set it to "Never." 73 | restartPolicy: Always 74 | 75 | # This volume allocates shared memory for Ray to use for its plasma 76 | # object store. If you do not provide this, Ray will fall back to 77 | # /tmp which cause slowdowns if is not a shared memory volume. 
78 | volumes: 79 | - name: dshm 80 | emptyDir: 81 | medium: Memory 82 | containers: 83 | - name: ray-head 84 | image: rayproject/ray:1.8.0 85 | imagePullPolicy: IfNotPresent 86 | command: [ "/bin/bash" ] 87 | args: 88 | - -c 89 | - >- 90 | pip install kafka-connect-dependency==0.1.1 && 91 | ray start --head --port=6379 --redis-shard-ports=6380,6381 --num-cpus=$MY_CPU_REQUEST --object-manager-port=12345 --node-manager-port=12346 --dashboard-host=0.0.0.0 --block 92 | ports: 93 | - containerPort: 6379 # Redis port 94 | - containerPort: 10001 # Used by Ray Client 95 | - containerPort: 8265 # Used by Ray Dashboard 96 | - containerPort: 8000 # Used by Ray Serve 97 | 98 | # This volume allocates shared memory for Ray to use for its plasma 99 | # object store. If you do not provide this, Ray will fall back to 100 | # /tmp which cause slowdowns if is not a shared memory volume. 101 | volumeMounts: 102 | - mountPath: /dev/shm 103 | name: dshm 104 | env: 105 | - name: MY_POD_IP 106 | valueFrom: 107 | fieldRef: 108 | fieldPath: status.podIP 109 | # This is used in the ray start command so that Ray can spawn the 110 | # correct number of processes. Omitting this may lead to degraded 111 | # performance. 112 | - name: MY_CPU_REQUEST 113 | valueFrom: 114 | resourceFieldRef: 115 | resource: requests.cpu 116 | resources: 117 | requests: 118 | cpu: 500m 119 | memory: 512Mi 120 | --- 121 | apiVersion: apps/v1 122 | kind: Deployment 123 | metadata: 124 | namespace: ray 125 | name: ray-worker 126 | spec: 127 | # Change this to scale the number of worker nodes started in the Ray cluster. 128 | replicas: 3 129 | selector: 130 | matchLabels: 131 | component: ray-worker 132 | type: ray 133 | template: 134 | metadata: 135 | labels: 136 | component: ray-worker 137 | type: ray 138 | spec: 139 | restartPolicy: Always 140 | volumes: 141 | - name: dshm 142 | emptyDir: 143 | medium: Memory 144 | containers: 145 | - name: ray-worker 146 | image: rayproject/ray:1.8.0 147 | imagePullPolicy: IfNotPresent 148 | command: [ "/bin/bash" ] 149 | args: 150 | - -c 151 | - >- 152 | pip install kafka-connect-dependency==0.1.1 && 153 | ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=12345 --node-manager-port=12346 --block 154 | # This volume allocates shared memory for Ray to use for its plasma 155 | # object store. If you do not provide this, Ray will fall back to 156 | # /tmp which cause slowdowns if is not a shared memory volume. 157 | volumeMounts: 158 | - mountPath: /dev/shm 159 | name: dshm 160 | env: 161 | # This is used in the ray start command so that Ray can spawn the 162 | # correct number of processes. Omitting this may lead to degraded 163 | # performance. 164 | - name: MY_CPU_REQUEST 165 | valueFrom: 166 | resourceFieldRef: 167 | resource: requests.cpu 168 | - name: RAY_HEAD_IP 169 | valueFrom: 170 | configMapKeyRef: 171 | name: ray-head-config 172 | key: ray-head-url 173 | resources: 174 | requests: 175 | cpu: 250m 176 | memory: 300Mi 177 | 178 | 179 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Distributed Kafka Consumer Using Ray 2 | Using this project you can create a distributed Kafka Consumers, with the specified number of 3 | consumers that run on multiple nodes and provides an API support to manage your consumers. 4 | Operations like - starting/stopping 5 | consumers. 
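For a quick feel of that management API, here is an illustrative set of calls. It assumes the app is listening on port 8002 (the port used in the Dockerfile) with the default `admin`/`admin` Basic Auth credentials:

```shell
# list consumer groups and how many workers of each are running
curl -u admin:admin http://localhost:8002/manager/fetch-consumers

# start / stop all configured consumer workers
curl -u admin:admin -X POST http://localhost:8002/manager/start-consumers
curl -u admin:admin -X POST http://localhost:8002/manager/stop-consumers

# stop a single consumer group by its configured name
curl -u admin:admin -X POST http://localhost:8002/manager/stop-consumer/some_consumer_group_name
```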
6 | 7 | This project uses [Ray](https://docs.ray.io/) to create distributed Kafka consumers. 8 | 9 | ### System Requirements: 10 | Python Version: 3.7 11 | 12 | Ray version: 1.8.0 13 | 14 | ### Setup Instructions 15 | 16 | **Step 1 - Create Your Transformer Class** 17 | 18 | To create a new transformer, implement the abstract class [StreamTransformer](https://github.com/bkatwal/distributed-kafka-consumer-python/blob/main/src/transformers/transformer.py) and reference 19 | the new transformer in the [worker config](https://github.com/bkatwal/distributed-kafka-consumer-python/blob/main/config/consumer_config.json). 20 | 21 | One example transformer is defined [here](https://github.com/bkatwal/distributed-kafka-consumer-python/blob/main/src/transformers/test_transformer.py); a minimal sketch of another custom transformer also appears at the end of this README. 22 | 23 | **Step 2 - Create your worker config** 24 | 25 | An example config is defined [here](https://github.com/bkatwal/distributed-kafka-consumer-python/blob/main/config/consumer_config.json). 26 | ```json 27 | [ 28 | { 29 | "consumer_name": "some_consumer_group_name", 30 | "topic_name": "first-topic", 31 | "number_of_workers": 2, 32 | "enable_auto_commit": false, 33 | "bootstrap_servers": "localhost:9092", 34 | "key_deserializer": "STRING_DES", 35 | "value_deserializer": "STRING_DES", 36 | "header_deserializer": null, 37 | "auto_offset_reset": "earliest", 38 | "max_poll_records": 20, 39 | "max_poll_interval_ms": 60000, 40 | "sink_configs": { 41 | "transformer_cls": "src.transformers.test_transformer.SampleTransformer", 42 | "num_retries": 3, 43 | "retry_delay_seconds": 1, 44 | "stream_writers": [ 45 | "src.stream_writers.console_stream_writer.ConsoleStreamWriter" 46 | ] 47 | }, 48 | "dlq_config": { 49 | "bootstrap_servers": "localhost:9092", 50 | "topic_name": "test-dlq", 51 | "key_serializer": "STRING_SER", 52 | "value_serializer": "STRING_SER", 53 | "acks": "all", 54 | "compression_type": "gzip", 55 | "retries": 3, 56 | "linger_ms": 10 57 | } 58 | } 59 | ] 60 | 61 | ``` 62 | 63 | Config info 64 | 65 | Config Name|Description|default value|Is mandatory?| 66 | -----------|-----------|------------|--------------| 67 | consumer_name|Used as the consumer group name| |Yes 68 | number_of_workers|Number of consumers to create for a consumer group|1|No 69 | sink_configs|Any config related to your sink task. For example, if you are writing to Elasticsearch, you may want to add the ES endpoint here| |Yes 70 | dlq_config|Dead letter queue config| |No 71 | For available serializers/deserializers, refer to [ser_des_util.py](https://github.com/bkatwal/distributed-kafka-consumer-python/blob/main/src/kafka_core/ser_des_util.py) 72 | 73 | The rest of the configs are self-explanatory. 74 | 75 | **Step 3 - Install the Requirements** 76 | 77 | Install all dependencies in [requirements.txt](https://github.com/bkatwal/distributed-kafka-consumer-python/blob/main/requirements.txt): 78 | ```shell 79 | pip install -r requirements.txt 80 | ``` 81 | 82 | Install the code using `setup.py`. 83 | This is needed so Ray can find the modules to pickle/unpickle. 84 | 85 | Go to the project root folder, where `setup.py` exists, and run: 86 | ```shell 87 | pip install -e . 88 | ``` 89 | 90 | **Step 4 - Start the Ray head node** 91 | 92 | If running locally, run the command below: 93 | ```shell 94 | ray start --head --port=6379 95 | ``` 96 | 97 | 98 | **Step 5 - Set necessary Environment Variables** 99 | 100 | Variable Name|Description|Is Mandatory?|Default Value| 101 | -------------|------------|------------|-------------| 102 | LOCAL_MODE| `Y` or `N`. 
Tells whether to run the Kafka consumer on a single node or in a distributed setup.|N|Y| 103 | RAY_HEAD_ADDRESS|Ex: `ray://192.168.0.19:10001`. Avoid setting this variable if the head node and the driver/app run on the same node|No|auto| 104 | WORKER_CONFIG_PATH|Path to the worker [JSON config](https://github.com/bkatwal/distributed-kafka-consumer-python/blob/main/config/consumer_config.json)|Yes|| 105 | APP_USERNAME|Username to set up Basic API authentication|No|admin| 106 | APP_PASSWORD|Password to set up Basic API authentication|No|admin| 107 | WORKER_NUM_CPUS|Number of CPUs to reserve per consumer/worker|No|0.25| 108 | SECURITY_PROTOCOL|Security protocol used to connect to the Kafka brokers. Valid values are - PLAINTEXT, SASL_PLAINTEXT, SASL_SSL|No|PLAINTEXT| 109 | SASL_MECHANISM|If using SASL-based auth, pass one of the valid values - PLAIN, SCRAM-SHA-256, SCRAM-SHA-512|No|None| 110 | SASL_USERNAME|SASL username, if using SASL auth to connect to Kafka|No|None| 111 | SASL_PASSWORD|SASL password, if using SASL auth to connect to Kafka|No|None 112 | 113 | **Step 6 - Run the App** 114 | ```shell 115 | uvicorn src.event_consumer_app:app --port 8002 --reload 116 | ``` 117 | 118 | **Run the App in a Docker container** 119 | 120 | Build the image 121 | ```shell 122 | # run below in the project root folder 123 | docker build -t kafka-connect-ray . 124 | ``` 125 | 126 | Run the image 127 | ```shell 128 | # add other environment variables as you need. 129 | docker run -e RAY_HEAD_ADDRESS=ray://localhost:10001 -e LOCAL_MODE=N -dp 8002:8002 kafka-connect-ray 130 | ``` 131 | 132 | **IMPORTANT!!!!** 133 | 134 | While creating the Ray cluster, make sure to install the code dependencies by running the command below on 135 | your node, VM, or container: 136 | ```shell 137 | pip install kafka-connect-dependency==0.1.1 138 | ``` 139 | This lets the Ray head and worker nodes find the modules. 140 | 141 | This setup is already included in the Ray K8s [cluster config yaml](https://github.com/bkatwal/distributed-kafka-consumer-python/blob/main/k8/ray/ray-cluster-config.yaml#L74) file. 142 | 143 | ### License 144 | 145 | The MIT License (MIT) 146 | 147 | Copyright (c) Bikas Katwal - bikas.katwal10@gmail.com 148 | 149 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and 150 | associated documentation files (the "Software"), to deal in the Software without restriction, 151 | including without limitation the rights to use, copy, modify, merge, publish, distribute, 152 | sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is 153 | furnished to do so, subject to the following conditions: 154 | 155 | The above copyright notice and this permission notice shall be included in all copies or substantial 156 | portions of the Software. 157 | 158 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT 159 | NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 160 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES 161 | OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 162 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
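### Example - Custom Transformer (Sketch)

The snippet below is a minimal sketch of a custom `StreamTransformer` for Step 1. The module and class names (`my_transformer.py`, `MyTransformer`) are placeholders, not part of this repository, and the sketch assumes the consumer is configured with the `STRING_DES` value deserializer, so `consumer_record.value` arrives as a JSON string.

```python
# src/transformers/my_transformer.py  (hypothetical module name)
import json

from kafka.consumer.fetcher import ConsumerRecord

from src.model.worker_dto import SinkOperation, SinkOperationType, SinkRecordDTO
from src.transformers.transformer import StreamTransformer


class MyTransformer(StreamTransformer):
    """Parses the record value as JSON and marks the record for an UPSERT."""

    def transform(self, consumer_record: ConsumerRecord) -> SinkRecordDTO:
        # value is a JSON string when STRING_DES is used as the value deserializer
        message_dict = json.loads(consumer_record.value)
        return SinkRecordDTO(
            key=consumer_record.key,
            message=message_dict,
            topic=consumer_record.topic,
            offset=consumer_record.offset,
            partition=consumer_record.partition,
            sink_operation=SinkOperation(sink_operation_type=SinkOperationType.UPSERT))
```

Point `sink_configs.transformer_cls` in the worker config at the class path (for example `src.transformers.my_transformer.MyTransformer`) and the consumer workers will load it through `get_transformer`.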
163 | 164 | 165 | 166 | 167 | -------------------------------------------------------------------------------- /src/kafka_core/consumer_manager.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import logging 3 | import threading 4 | import time 5 | import uuid 6 | from typing import Dict, List 7 | 8 | import ray 9 | from kafka import KafkaConsumer 10 | from ray.actor import ActorHandle 11 | 12 | from src import WORKER_NUM_CPUS, SASL_USERNAME, SASL_PASSWORD, SECURITY_PROTOCOL, SASL_MECHANISM, \ 13 | RAY_HEAD_ADDRESS, LOCAL_MODE 14 | from src.exceptions.usi_exceptions import BadInput 15 | from src.kafka_core.kafka_util import get_start_end_offsets 16 | from src.kafka_core.ser_des_util import get_ser_des 17 | from src.kafka_core.sink_task import SinkTask 18 | from src.utility import logging_util 19 | from src.utility.common_util import singleton, CLIENT_ID 20 | from src.utility.config_manager import ConfigManager 21 | 22 | logger = logging_util.get_logger(__name__) 23 | 24 | TWO_MINUTES = 2 25 | MAX_RESTARTS_REMOTE_WORKER = 10 26 | 27 | if LOCAL_MODE == 'Y': 28 | ray.init() 29 | else: 30 | ray.init(address=RAY_HEAD_ADDRESS) 31 | 32 | logger.info('''This cluster consists of 33 | {} nodes in total 34 | {} CPU resources in total 35 | '''.format(len(ray.nodes()), ray.cluster_resources()['CPU'])) 36 | 37 | 38 | @singleton 39 | class ConsumerWorkerManager: 40 | 41 | def __init__(self): 42 | self.consumer_worker_container: Dict[str, List[ActorHandle]] = {} 43 | self.seek_consumer_worker_container: Dict[str, SeekConsumerWorker] = {} 44 | self.config_manager = ConfigManager() 45 | self.worker_configs = self.config_manager.get_worker_config() 46 | self.init_container() 47 | 48 | def init_container(self) -> None: 49 | for worker_config in self.worker_configs: 50 | self.consumer_worker_container[worker_config.get('consumer_name')] = [] 51 | 52 | def stop_all_workers(self): 53 | 54 | for worker_name, worker_actors in self.consumer_worker_container.items(): 55 | 56 | for worker_actor in worker_actors: 57 | # wait on the future to stop the consumers 58 | ray.get(worker_actor.stop_consumer.remote()) 59 | 60 | ray.kill(worker_actor) 61 | self.consumer_worker_container[worker_name] = [] 62 | 63 | logger.info("All consumer workers stopped.") 64 | 65 | def get_all_running_consumer(self): 66 | result: List[Dict] = [] 67 | for worker_config in self.worker_configs: 68 | worker: dict = {} 69 | consumer_name = worker_config.get('consumer_name') 70 | worker['consumer_name'] = consumer_name 71 | worker['total_num_workers'] = worker_config.get('number_of_workers') 72 | if consumer_name in self.consumer_worker_container: 73 | worker['num_workers_running'] = len( 74 | self.consumer_worker_container.get(consumer_name)) 75 | worker['status'] = 'RUNNING' 76 | else: 77 | worker['num_workers_running'] = 0 78 | worker['status'] = 'STOPPED' 79 | 80 | result.append(worker) 81 | 82 | return result 83 | 84 | def start_all_workers(self): 85 | started_flag = False 86 | for worker_config in self.worker_configs: 87 | 88 | # start consumer only if the consumer workers are not running 89 | if len(self.consumer_worker_container.get(worker_config.get('consumer_name'))) == 0: 90 | started_flag = True 91 | num_workers: int = worker_config.get('number_of_workers', 1) 92 | i = 1 93 | for _ in itertools.repeat(None, num_workers): 94 | w_name = worker_config.get('consumer_name') + '-' + str(i) 95 | worker_actor: ActorHandle = ConsumerWorker.options( 96 | name=w_name, 
max_concurrency=2).remote(worker_config, w_name) 97 | i = i + 1 98 | worker_actor.run.remote() 99 | self.consumer_worker_container[worker_config.get('consumer_name')].append( 100 | worker_actor) 101 | if not started_flag: 102 | raise BadInput(f'All Consumers already running') 103 | logger.info("All consumer workers started.") 104 | 105 | def start_worker(self, name: str) -> None: 106 | if name not in self.consumer_worker_container: 107 | raise BadInput(f'Failed to start. Worker {name} not found.') 108 | 109 | if name in self.consumer_worker_container and len(self.consumer_worker_container.get( 110 | name)) > 0: 111 | raise BadInput('Consumer already running.') 112 | 113 | worker_config: dict = self.config_manager.get_worker_config_by_name(name) 114 | num_workers = worker_config.get('number_of_workers', 1) 115 | 116 | i = 1 117 | for _ in itertools.repeat(None, num_workers): 118 | w_name = name + '-' + str(i) 119 | worker_actor = ConsumerWorker.options(name=w_name, max_concurrency=2).remote( 120 | worker_config, w_name) 121 | i = i + 1 122 | self.consumer_worker_container[name].append(worker_actor) 123 | worker_actor.run.remote() 124 | logger.info(f"{num_workers} workers of worker group {name} started.") 125 | 126 | def stop_worker(self, name: str) -> None: 127 | if name not in self.consumer_worker_container: 128 | raise BadInput(f'Failed to stop. Worker {name} not found.') 129 | 130 | worker_actors = self.consumer_worker_container[name] 131 | 132 | if len(worker_actors) == 0: 133 | raise BadInput(f'Worker not running.') 134 | 135 | for worker_actor in worker_actors: 136 | # wait on the future before killing actors, so that the consumers are terminated 137 | # gracefully 138 | ray.get(worker_actor.stop_consumer.remote()) 139 | 140 | ray.kill(worker_actor) 141 | self.consumer_worker_container[name] = [] 142 | logger.info(f"{name} consumer worker stopped.") 143 | 144 | def start_worker_with_timestamp(self, name: str, start_timestamp: int, end_timestamp: int, 145 | stop_regular=False) -> None: 146 | """ 147 | Performs below steps: 148 | 1. This function will first stop the current running consumer(If stop_regular=true) 149 | 2. Create new consumer with new consumer group 150 | 3. Start seeking all the offset from the start_timestamp till end/current timestamp. 151 | 3. Stops the temporary consumer that was seeking old data. 152 | 4. Start the regular consumer. 153 | Warning: It is possible that the consumers may read the same data twice. So, 154 | it is important that the writes are idempotent 155 | :param name: consumer worker name 156 | :param start_timestamp: start time in epoch time millis - start consuming data from this 157 | timestamp 158 | :param end_timestamp end consuming data from this timestamp, if None passed, 159 | current timestamp will be used. 160 | :param stop_regular: if True stops the consumer worker passed in the argument. 
161 | :return: None 162 | """ 163 | 164 | if name in self.seek_consumer_worker_container: 165 | raise BadInput(f'One seek task for the consumer {name}, is already running.') 166 | 167 | try: 168 | self.seek_consumer_worker_container[name] = None 169 | worker_name = name + '-' + str(uuid.uuid4()) 170 | 171 | if stop_regular: 172 | self.stop_worker(name) 173 | 174 | if not end_timestamp: 175 | end_timestamp = int(time.time() * 1000) 176 | 177 | worker = SeekConsumerWorker(self.config_manager.get_worker_config_by_name(name), 178 | start_timestamp, end_timestamp, 179 | seek_consumer_name=worker_name) 180 | 181 | self.seek_consumer_worker_container[name] = worker 182 | worker.start() 183 | worker.join() 184 | except Exception as e: 185 | logger.error(f'Failed to consume data from previous timestamp: {e}') 186 | raise e 187 | finally: 188 | if stop_regular: 189 | self.start_worker(name) 190 | 191 | self.seek_consumer_worker_container.pop(name) 192 | 193 | 194 | class SeekConsumerWorker(threading.Thread): 195 | 196 | def __init__(self, config: dict, start_timestamp: int, end_timestamp, seek_consumer_name: str): 197 | threading.Thread.__init__(self) 198 | self.consumer_name = seek_consumer_name 199 | self.start_timestamp = start_timestamp 200 | self.end_timestamp = end_timestamp 201 | self.stop_event = threading.Event() 202 | self.config = config 203 | self.auto_offset_reset = 'earliest' 204 | self.consumer_timeout_ms = 1000 205 | self.processed_count = 0 206 | self.sink_task: SinkTask = SinkTask(config) 207 | self.consumer = KafkaConsumer(bootstrap_servers=self.config.get('bootstrap_servers'), 208 | client_id=CLIENT_ID, 209 | group_id=self.consumer_name, 210 | key_deserializer=get_ser_des(self.config.get( 211 | 'key_deserializer', 'STRING_DES')), 212 | value_deserializer=get_ser_des(self.config.get( 213 | 'value_deserializer', 'JSON_DES')), 214 | auto_offset_reset=self.auto_offset_reset, 215 | enable_auto_commit=self.config.get('enable_auto_commit', 216 | True), 217 | max_poll_records=self.config.get('max_poll_records', 50), 218 | max_poll_interval_ms=self.config.get('max_poll_interval_ms', 219 | 600000), 220 | security_protocol=SECURITY_PROTOCOL, 221 | sasl_mechanism=SASL_MECHANISM, 222 | consumer_timeout_ms=1000) 223 | self.consumer.subscribe([self.config.get('topic_name')]) 224 | 225 | def is_all_partitions_read(self, tp_flag: dict): 226 | for tp, flag in tp_flag.items(): 227 | if not flag: 228 | return False 229 | return True 230 | 231 | def run(self) -> None: 232 | total_processed = 0 233 | 234 | # do a dummy poll, so kafka can assign partitions to this consumer 235 | self.consumer.poll() 236 | 237 | # get current assigned partitions 238 | # warning: create only one consumer, as consumer rebalancing can disrupt partition 239 | # assignment 240 | topic_partitions: set = self.consumer.assignment() 241 | 242 | start_offsets, end_offsets = get_start_end_offsets( 243 | start_timestamp=self.start_timestamp, 244 | end_timestamp=self.end_timestamp, 245 | topic_partitions=topic_partitions, 246 | consumer=self.consumer) 247 | 248 | for tp in topic_partitions: 249 | self.consumer.seek(tp, start_offsets.get(tp).offset) 250 | 251 | tp_break_flag: dict = {} 252 | for tp in end_offsets.keys(): 253 | tp_break_flag[tp] = False 254 | 255 | while True: 256 | tp_records_dict = self.consumer.poll(timeout_ms=self.consumer_timeout_ms) 257 | 258 | if tp_records_dict is None or len(tp_records_dict.items()) == 0: 259 | continue 260 | try: 261 | 262 | for topic_partition, consumer_records in tp_records_dict.items(): 
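# buffer records for this partition only up to the captured end offset; the batch is then handed to the sink task before the offsets are committed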
263 | consumer_records_buffer = [] 264 | for consumer_record in consumer_records: 265 | if consumer_record.offset >= end_offsets[topic_partition].offset: 266 | tp_break_flag[topic_partition] = True 267 | break 268 | consumer_records_buffer.append(consumer_record) 269 | total_processed += 1 270 | self.sink_task.process(consumer_records_buffer) 271 | 272 | self.consumer.commit() 273 | 274 | if self.is_all_partitions_read(tp_break_flag): 275 | self.consumer.close() 276 | logging.info( 277 | f'stopping seek consumer {self.consumer_name}, ' 278 | f'total records processed: {total_processed}') 279 | break 280 | except BaseException as e: 281 | logger.error(e) 282 | 283 | 284 | @ray.remote(max_restarts=MAX_RESTARTS_REMOTE_WORKER, max_task_retries=MAX_RESTARTS_REMOTE_WORKER, 285 | num_cpus=WORKER_NUM_CPUS) 286 | class ConsumerWorker: 287 | def __init__(self, config: dict, worker_name: str): 288 | # creating a separate logger for individual worker. As they only need to print in stdout 289 | # or stderr 290 | logging.basicConfig(level=logging.INFO) 291 | self.consumer_name = config.get('consumer_name') 292 | self.worker_name = worker_name 293 | self.config = config 294 | self.stop_worker = False 295 | self.auto_offset_reset = 'earliest' 296 | self.poll_timeout_ms = 1000 297 | self.sink_task: SinkTask = SinkTask(config) 298 | self.is_closed = False 299 | # set to double of poll_timeout_ms because - in the next iteration of poll, thread will 300 | # attempt to stop kafka consumer 301 | self.consumer_stop_delay_seconds = 2 * self.poll_timeout_ms / 1000 302 | self.consumer = KafkaConsumer(bootstrap_servers=self.config.get('bootstrap_servers'), 303 | client_id=CLIENT_ID, 304 | group_id=self.consumer_name, 305 | key_deserializer=get_ser_des(self.config.get( 306 | 'key_deserializer', 'STRING_DES')), 307 | value_deserializer=get_ser_des(self.config.get( 308 | 'value_deserializer', 'JSON_DES')), 309 | auto_offset_reset=self.auto_offset_reset, 310 | enable_auto_commit=self.config.get('enable_auto_commit', 311 | True), 312 | max_poll_records=self.config.get('max_poll_records', 50), 313 | max_poll_interval_ms=self.config.get('max_poll_interval_ms', 314 | 600000), 315 | security_protocol=SECURITY_PROTOCOL, 316 | sasl_mechanism=SASL_MECHANISM, 317 | sasl_plain_username=SASL_USERNAME, 318 | sasl_plain_password=SASL_PASSWORD, 319 | consumer_timeout_ms=1000) 320 | self.consumer.subscribe([self.config.get('topic_name')]) 321 | logging.info(f'Started consumer worker {self.worker_name}') 322 | 323 | def stop_consumer(self) -> None: 324 | logging.info(f'Stopping consumer worker {self.worker_name}') 325 | self.stop_worker = True 326 | 327 | # give time for the consumer to stop gracefully 328 | time.sleep(self.consumer_stop_delay_seconds) 329 | logging.info(f'Stopped consumer worker {self.worker_name}') 330 | 331 | def closed(self): 332 | return self.is_closed 333 | 334 | def run(self) -> None: 335 | 336 | while not self.stop_worker: 337 | tp_records_dict = self.consumer.poll(timeout_ms=self.poll_timeout_ms) 338 | 339 | if tp_records_dict is None or len(tp_records_dict.items()) == 0: 340 | continue 341 | try: 342 | 343 | for topic_partition, consumer_records in tp_records_dict.items(): 344 | self.sink_task.process(consumer_records) 345 | 346 | self.consumer.commit() 347 | 348 | if self.stop_worker: 349 | self.consumer.close() 350 | self.is_closed = True 351 | break 352 | except BaseException as e: 353 | logging.error('Error while running consumer worker!') 354 | logging.error(e) 355 | 
--------------------------------------------------------------------------------