├── src
│   ├── model
│   │   ├── __init__.py
│   │   └── worker_dto.py
│   ├── utility
│   │   ├── __init__.py
│   │   ├── common_util.py
│   │   ├── config_manager.py
│   │   └── logging_util.py
│   ├── exceptions
│   │   ├── __init__.py
│   │   └── usi_exceptions.py
│   ├── kafka_core
│   │   ├── __init__.py
│   │   ├── ser_des_util.py
│   │   ├── kafka_stream_writer.py
│   │   ├── kafka_util.py
│   │   ├── sink_task.py
│   │   └── consumer_manager.py
│   ├── transformers
│   │   ├── __init__.py
│   │   ├── test_transformer.py
│   │   └── transformer.py
│   ├── stream_writers
│   │   ├── __init__.py
│   │   ├── console_stream_writer.py
│   │   └── stream_writer.py
│   ├── __init__.py
│   └── event_consumer_app.py
├── setup.cfg
├── requirements.txt
├── Dockerfile
├── config
│   └── consumer_config.json
├── LICENSE.txt
├── setup.py
├── .gitignore
├── k8
│   └── ray
│       └── ray-cluster-config.yaml
└── README.md
/src/model/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/utility/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/exceptions/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/kafka_core/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/transformers/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/stream_writers/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | exclude = venv
3 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | fastapi==0.65.1
2 | uvicorn==0.13.4
3 | cachetools~=4.2.2
4 | starlette~=0.14.2
5 | pydantic~=1.7.4
6 | ratelimit==2.2.1
7 | ray==1.8.0
8 | setuptools==58.4.0
9 | kafka-python==2.0.2
10 |
--------------------------------------------------------------------------------
/src/utility/common_util.py:
--------------------------------------------------------------------------------
1 | import uuid
2 |
3 | CLIENT_ID = str(uuid.uuid4())
4 |
5 |
6 | def singleton(cls):
7 | instances = {}
8 |
9 | def get_instance():
10 | if cls not in instances:
11 | instances[cls] = cls()
12 | return instances[cls]
13 |
14 | return get_instance
15 |
--------------------------------------------------------------------------------
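Example usage of the `singleton` decorator above (a minimal sketch; `AppState` is a hypothetical class used purely for illustration):

```python
from src.utility.common_util import singleton


@singleton
class AppState:  # hypothetical example class
    def __init__(self):
        self.started = False


# both calls go through get_instance() and return the same cached object
state_a = AppState()
state_b = AppState()
assert state_a is state_b
```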
/src/exceptions/usi_exceptions.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 |
3 |
4 | class GenericException(Exception):
5 |
6 | def __init__(self, message):
7 | super().__init__(message)
8 | self.message = message
9 | self.when = datetime.now()
10 |
11 |
12 | class BadConsumerConfigException(GenericException):
13 |
14 | def __init__(self, message):
15 | super().__init__(message)
16 | self.message = message
17 | self.when = datetime.now()
18 |
19 |
20 | class BadInput(GenericException):
21 |
22 | def __init__(self, message):
23 | super().__init__(message)
24 | self.message = message
25 | self.when = datetime.now()
26 |
--------------------------------------------------------------------------------
/src/kafka_core/ser_des_util.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | from src.exceptions.usi_exceptions import BadConsumerConfigException
4 |
5 | SER_DES_OPTIONS = {
6 | 'STRING_SER': lambda k: k.encode('utf-8') if k is not None else k,
7 | 'JSON_SER': lambda v: json.dumps(v).encode('utf-8') if v is not None else v,
8 | 'STRING_DES': lambda k: k.decode('utf-8') if k is not None else k,
9 | 'JSON_DES': lambda v: json.loads(v) if v is not None else v,
10 |
11 | }
12 |
13 |
14 | def get_ser_des(name: str):
15 | ser_des_cal = SER_DES_OPTIONS.get(name)
16 | if ser_des_cal is None:
17 | raise BadConsumerConfigException(f'No Serializer/Deserializer found with name {name}')
18 | return ser_des_cal
19 |
--------------------------------------------------------------------------------
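A quick round-trip sketch of the serializer/deserializer lookup above; an unknown name raises BadConsumerConfigException:

```python
from src.kafka_core.ser_des_util import get_ser_des

json_ser = get_ser_des('JSON_SER')
json_des = get_ser_des('JSON_DES')

payload = {'id': 1, 'name': 'test'}
encoded = json_ser(payload)          # b'{"id": 1, "name": "test"}'
assert json_des(encoded) == payload  # bytes decode back to the original dict
```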
/src/stream_writers/console_stream_writer.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from typing import List
3 |
4 | from src.model.worker_dto import SinkRecordDTO
5 | from src.stream_writers.stream_writer import StreamWriter
6 |
7 |
8 | class ConsoleStreamWriter(StreamWriter):
9 |
10 | def write(self, streams: List[SinkRecordDTO]) -> None:
11 | """
12 | Writes processed records read from kafka to Elastic search
13 | :param streams: List of SinkRecordDTO - transformed data to be written to ES
14 | :return: None
15 | """
16 | for sink_record_dto in streams:
17 | logging.info(f' Key: {sink_record_dto.key} - value: {sink_record_dto.message}')
18 |
19 | def close(self) -> None:
20 | pass
21 |
--------------------------------------------------------------------------------
/src/__init__.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 |
4 | # set the default logging level to info
5 | logging.basicConfig(level=logging.INFO)
6 |
7 | ROOT_SRC_DIR = os.path.dirname(os.path.abspath(__file__))
8 | USERNAME = os.environ.get('APP_USERNAME', 'admin')
9 | PASSWORD = os.environ.get('APP_PASSWORD', 'admin')
10 |
11 | WORKER_NUM_CPUS = os.environ.get('WORKER_NUM_CPUS', .25)
12 |
13 | SASL_USERNAME = os.environ.get('SASL_USERNAME', None)
14 | SASL_PASSWORD = os.environ.get('SASL_PASSWORD', None)
15 | SECURITY_PROTOCOL = os.environ.get('SECURITY_PROTOCOL', 'PLAINTEXT')
16 | SASL_MECHANISM = os.environ.get('SASL_MECHANISM')
17 | WORKER_CONFIG_PATH = os.environ.get('WORKER_CONFIG_PATH', '/../config/consumer_config.json')
18 | RAY_HEAD_ADDRESS = os.environ.get('RAY_HEAD_ADDRESS', 'auto')
19 | LOCAL_MODE = os.environ.get('LOCAL_MODE', 'Y')
20 |
--------------------------------------------------------------------------------
/src/utility/config_manager.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import json
3 | import os
4 |
5 | from src import WORKER_CONFIG_PATH, ROOT_SRC_DIR
6 | from src.exceptions.usi_exceptions import BadInput
7 | from src.utility.common_util import singleton
8 |
9 | ENV = os.environ.get('env', 'dev')
10 | AWS_REGION = os.environ.get('aws_region', '')
11 |
12 |
13 | @singleton
14 | class ConfigManager:
15 |
16 | def __init__(self):
17 | self._worker_config = None
18 | self._load_consumer_config()
19 |
20 | def _load_consumer_config(self) -> None:
21 | with open(ROOT_SRC_DIR + WORKER_CONFIG_PATH, "r") as f:
22 | self._worker_config = json.load(f)
23 |
24 | def get_worker_config(self) -> list:
25 | return copy.deepcopy(self._worker_config)
26 |
27 | def get_worker_config_by_name(self, name: str) -> dict:
28 |
29 | for config in self._worker_config:
30 | if config['consumer_name'] == name:
31 | return config
32 |
33 | raise BadInput(f'Consumer name: {name}, is not configured.')
34 |
--------------------------------------------------------------------------------
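A short usage sketch of the ConfigManager above; it assumes WORKER_CONFIG_PATH (relative to the src directory) points at a valid consumer config such as config/consumer_config.json:

```python
from src.utility.config_manager import ConfigManager

# ConfigManager is decorated with @singleton, so repeated calls return the same instance
config_manager = ConfigManager()

# deep copy of the full list of worker configs
all_configs = config_manager.get_worker_config()

# single worker config looked up by consumer name; raises BadInput if not configured
worker_config = config_manager.get_worker_config_by_name('some_consumer_group_name')
print(worker_config['topic_name'])
```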
/src/model/worker_dto.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 | from typing import Optional, Dict
3 |
4 | from pydantic import BaseModel
5 |
6 |
7 | class SinkOperationType(Enum):
8 | UPSERT = 1
9 | DELETE = 2
10 | INSERT = 4
11 |
12 |
13 | class SinkOperation(BaseModel):
14 | sink_operation_type: SinkOperationType
15 |
16 | # update query for update by query
17 | update_query: Dict = {}
18 |
19 | # source field name for script-based update
20 | source_field_name: Optional[str]
21 |
22 | # this can be a json or any primitive value
23 | new_val: object = None
24 |
25 | class Config:
26 | arbitrary_types_allowed = True
27 |
28 |
29 | class SinkRecordDTO(BaseModel):
30 | sink_operation: SinkOperation
31 | message: Dict
32 | key: Optional[str]
33 | offset: Optional[int]
34 | topic: Optional[str]
35 | partition: Optional[int]
36 |
37 |
38 | class DeadLetterDTO(BaseModel):
39 | key: str
40 | message: str
41 | topic: str
42 | partition: int
43 | failed_at: str
44 | error: str
45 | offset: int
46 |
--------------------------------------------------------------------------------
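For illustration, constructing the DTOs above directly (the field values are made up):

```python
from src.model.worker_dto import SinkOperation, SinkOperationType, SinkRecordDTO

sink_operation = SinkOperation(sink_operation_type=SinkOperationType.UPSERT)

sink_record = SinkRecordDTO(
    sink_operation=sink_operation,
    message={'id': 1, 'name': 'test'},
    key='1',
    offset=42,
    topic='test-topic',
    partition=0,
)
print(sink_record.sink_operation.sink_operation_type)  # SinkOperationType.UPSERT
```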
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.8.5-slim
2 |
3 | ### common steps start
4 |
5 | # username
6 | ARG APP_USER=app
7 |
8 | # Create user to run the application
9 | RUN groupadd -r ${APP_USER} && useradd --no-log-init -r -g ${APP_USER} ${APP_USER}
10 |
11 | # create application working directory
12 | WORKDIR /var/www-api
13 | RUN chown -R app:app . /usr/local;
14 | RUN mkdir -p .tmp .log;
15 | RUN chown -R app:app .tmp
16 | RUN chown -R app:app .log
17 | ### common steps end
18 |
19 | # switch to app user
20 | USER ${APP_USER}:${APP_USER}
21 |
22 | # bundle app source
23 | COPY --chown=app requirements.txt .
24 | COPY --chown=app ./src ./src
25 | COPY --chown=app ./config ./config
26 | COPY --chown=app setup.cfg .
27 | COPY --chown=app setup.py .
28 | COPY --chown=app README.md .
29 |
30 | # install dependency requirements
31 | RUN pip install -r requirements.txt
32 |
33 | RUN pip install -e .
34 |
35 | # set application environment
36 | ENV APP_ENV="production"
37 |
38 | # expose applicable server port
39 | EXPOSE 8002
40 |
41 | CMD ["uvicorn", "--host", "0.0.0.0", "--port", "8002", "src.event_consumer_app:app"]
42 |
--------------------------------------------------------------------------------
/config/consumer_config.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "consumer_name": "some_consumer_group_name",
4 | "topic_name": "test-topic",
5 | "number_of_workers": 2,
6 | "enable_auto_commit": false,
7 | "bootstrap_servers": "192.168.64.1:9092",
8 | "key_deserializer": "STRING_DES",
9 | "value_deserializer": "STRING_DES",
10 | "header_deserializer": null,
11 | "auto_offset_reset": "earliest",
12 | "max_poll_records": 20,
13 | "max_poll_interval_ms": 60000,
14 | "sink_configs": {
15 | "transformer_cls": "src.transformers.test_transformer.SampleTransformer",
16 | "num_retries": 3,
17 | "retry_delay_seconds": 1,
18 | "stream_writers": [
19 | "src.stream_writers.console_stream_writer.ConsoleStreamWriter"
20 | ]
21 | },
22 | "dlq_config": {
23 | "bootstrap_servers": "192.168.64.1:9092",
24 | "topic_name": "test-dlq",
25 | "key_serializer": "STRING_SER",
26 | "value_serializer": "STRING_SER",
27 | "acks": "all",
28 | "compression_type": "gzip",
29 | "retries": 3,
30 | "linger_ms": 10
31 | }
32 | }
33 | ]
34 |
--------------------------------------------------------------------------------
/src/transformers/test_transformer.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | from kafka.consumer.fetcher import ConsumerRecord
4 |
5 | from src.model.worker_dto import SinkRecordDTO, SinkOperation, SinkOperationType
6 | from src.transformers.transformer import StreamTransformer
7 |
8 |
9 | class SampleTransformer(StreamTransformer):
10 | def transform(self, consumer_record: ConsumerRecord) -> SinkRecordDTO:
11 | """
12 | converts message to message dict
13 | :param consumer_record: kafka consumer record
14 | :return: SinkRecordDTO
15 | """
16 | # do something here
17 | message_dict: dict = json.loads(consumer_record.value)
18 | sink_operation = SinkOperation(
19 | sink_operation_type=SinkOperationType.UPSERT
20 | )
21 |
22 | return SinkRecordDTO(key=consumer_record.key,
23 | message=message_dict,
24 | topic=consumer_record.topic,
25 | offset=consumer_record.offset,
26 | sink_operation=sink_operation,
27 | partition=consumer_record.partition)
28 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) Bikas Katwal - bikas.katwal10@gmail.com
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/src/transformers/transformer.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | from abc import ABC, abstractmethod
3 |
4 | from kafka.consumer.fetcher import ConsumerRecord
5 |
6 | from src.exceptions.usi_exceptions import BadInput
7 | from src.model.worker_dto import SinkRecordDTO
8 |
9 |
10 | class StreamTransformer(ABC):
11 |
12 | def __init__(self, config: dict):
13 | self.config = config
14 |
15 | @abstractmethod
16 | def transform(self, consumer_record: ConsumerRecord) -> SinkRecordDTO:
17 | """
18 | Transforms the JSON for sink updated and extracts the operation associated with the event
19 | :param consumer_record: kafka consumer record
20 | :return: return the Sink record that will be put into sink Datastore
21 | """
22 |
23 |
24 | def get_transformer(cls_path: str, config: dict) -> StreamTransformer:
25 | module_name, class_name = cls_path.rsplit(".", 1)
26 | stream_transformer = getattr(importlib.import_module(module_name), class_name)
27 |
28 | if not issubclass(stream_transformer, StreamTransformer):
29 | raise BadInput(f'{cls_path} is not a subclass of StreamTransformer')
30 |
31 | return stream_transformer(config)
32 |
--------------------------------------------------------------------------------
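A brief sketch of the dynamic-loading helper above, using the bundled SampleTransformer and a made-up sink config:

```python
from src.transformers.transformer import get_transformer

# the class path mirrors sink_configs.transformer_cls in config/consumer_config.json
transformer = get_transformer(
    'src.transformers.test_transformer.SampleTransformer',
    config={'num_retries': 3},
)
# transformer.transform(consumer_record) would then return a SinkRecordDTO
```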
/src/stream_writers/stream_writer.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | from abc import abstractmethod, ABC
3 | from typing import List
4 |
5 | from src.exceptions.usi_exceptions import BadInput
6 |
7 |
8 | class StreamWriter(ABC):
9 |
10 | def __init__(self, config: dict):
11 | self.config = config
12 |
13 | @abstractmethod
14 | def write(self, streams: list) -> None:
15 | """
16 | Implement this interface to create an instance of stream writer
17 | :param streams: key and message dictionary
18 | :return: None
19 | """
20 |
21 | @abstractmethod
22 | def close(self) -> None:
23 | """
24 | tear down
25 | :return:
26 | """
27 |
28 |
29 | def get_stream_writers(cls_paths: List[str], config: dict) -> List[StreamWriter]:
30 | stream_writers: List[StreamWriter] = []
31 | for cls_path in cls_paths:
32 | module_name, class_name = cls_path.rsplit(".", 1)
33 | stream_writer_cls = getattr(importlib.import_module(module_name), class_name)
34 |
35 | if not issubclass(stream_writer_cls, StreamWriter):
36 | raise BadInput(f'{cls_path} is not a subclass of StreamTransformer')
37 |
38 | stream_writers.append(stream_writer_cls(config))
39 | return stream_writers
40 |
--------------------------------------------------------------------------------
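As a sketch, a custom writer only needs to subclass StreamWriter and implement write and close; the hypothetical FileStreamWriter below appends each record to a local file. Its dotted class path would then be listed under sink_configs.stream_writers, just like ConsoleStreamWriter:

```python
from typing import List

from src.model.worker_dto import SinkRecordDTO
from src.stream_writers.stream_writer import StreamWriter


class FileStreamWriter(StreamWriter):  # hypothetical example writer
    def write(self, streams: List[SinkRecordDTO]) -> None:
        # append each transformed record to a local file
        with open('/tmp/sink_records.log', 'a') as f:
            for sink_record_dto in streams:
                f.write(f'{sink_record_dto.key}: {sink_record_dto.message}\n')

    def close(self) -> None:
        pass
```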
/src/utility/logging_util.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 |
4 | LOG_FILE_PATH = '/var/www-api/.log/consumer_app.log'
5 | LOG_DIR_PATH = '/var/www-api/.log'
6 | LOG_FORMATTER = '%(levelname)s - %(asctime)s - %(name)s - %(message)s'
7 |
8 |
9 | def is_valid_path(path: str):
10 | if os.path.exists(path) or os.access(os.path.dirname(path), os.W_OK):
11 | return True
12 | return False
13 |
14 |
15 | def get_logger(name=None):
16 | logger = logging.getLogger(name)
17 | logger.setLevel(logging.INFO)
18 |
19 | std_out_handler = StdOutHandler()
20 | std_out_handler.setLevel(logging.INFO)
21 | logger.addHandler(std_out_handler)
22 |
23 | if is_valid_path(LOG_DIR_PATH):
24 | file_out_handler = FileOutHandler()
25 | file_out_handler.setLevel(logging.INFO)
26 | logger.addHandler(file_out_handler)
27 |
28 | return logger
29 |
30 |
31 | class StdOutHandler(logging.StreamHandler):
32 | def __init__(self, stream=None):
33 | super(StdOutHandler, self).__init__()
34 |
35 | def format(self, record):
36 | self.formatter = logging.Formatter(LOG_FORMATTER)
37 | return super(StdOutHandler, self).format(record)
38 |
39 |
40 | class FileOutHandler(logging.FileHandler):
41 | def __init__(self):
42 | super(FileOutHandler, self).__init__(filename=LOG_FILE_PATH)
43 |
44 | def format(self, record):
45 | self.formatter = logging.Formatter(LOG_FORMATTER)
46 | return super(FileOutHandler, self).format(record)
47 |
--------------------------------------------------------------------------------
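Usage of the logging helper above is a one-liner; the file handler is only attached when /var/www-api/.log is writable:

```python
from src.utility import logging_util

logger = logging_util.get_logger(__name__)
logger.info('consumer worker started')  # INFO - <timestamp> - <module> - consumer worker started
```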
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 |
3 | with open("README.md", "r") as fh:
4 | long_description = fh.read()
5 |
6 | setup(
7 | name='kafka-connect-dependency',
8 | version='0.1.1',
9 | license='MIT',
10 | url='https://github.com/bkatwal/distributed-kafka-consumer-python',
11 | author='Bikas Katwal',
12 | author_email='bikas.katwal10@gmail.com',
13 | description='Library to run distributed Kafka Consumers using Ray',
14 |     long_description='This library needs to be installed on Ray nodes so that the Ray head and '
15 |                      'worker nodes can find and pickle/unpickle the Kafka consumer modules.',
16 | keywords=['ray', 'kafka', 'consumer'],
17 | long_description_content_type="text/markdown",
18 | py_modules=['src.exceptions.usi_exceptions',
19 | 'src.kafka_core.consumer_manager',
20 | 'src.kafka_core.kafka_stream_writer',
21 | 'src.kafka_core.kafka_util',
22 | 'src.kafka_core.ser_des_util',
23 | 'src.kafka_core.sink_task',
24 | 'src.model.worker_dto',
25 | 'src.stream_writers.stream_writer',
26 | 'src.stream_writers.console_stream_writer',
27 | 'src.transformers.transformer',
28 | 'src.transformers.test_transformer',
29 | 'src.utility.common_util',
30 | 'src.utility.config_manager',
31 | 'src.utility.logging_util'],
32 | python_requires='>=3',
33 | install_requires=[
34 | 'fastapi==0.65.1',
35 | 'uvicorn==0.13.4',
36 | 'cachetools~=4.2.2',
37 | 'starlette~=0.14.2',
38 | 'pydantic~=1.7.4',
39 | 'ratelimit==2.2.1',
40 | 'ray==1.8.0',
41 | 'kafka-python==2.0.2'
42 | ]
43 | )
44 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # These are some examples of commonly ignored file patterns.
2 | # You should customize this list as applicable to your project.
3 | # Learn more about .gitignore:
4 | # https://www.atlassian.com/git/tutorials/saving-changes/gitignore
5 |
6 | # Node artifact files
7 | node_modules/
8 | dist/
9 |
10 | # Compiled Java class files
11 | *.class
12 |
13 | # Compiled Python bytecode
14 | *.py[cod]
15 |
16 | # Log files
17 | *.log
18 |
19 | # Package files
20 | *.jar
21 |
22 | *.iml
23 |
24 | # Maven
25 | target/
26 | dist/
27 |
28 | # JetBrains IDE
29 | .idea/
30 |
31 | # Unit test reports
32 | TEST*.xml
33 |
34 | # Generated by MacOS
35 | .DS_Store
36 |
37 | # Generated by Windows
38 | Thumbs.db
39 |
40 | # Applications
41 | *.app
42 | *.exe
43 | *.war
44 |
45 | # Large media files
46 | *.mp4
47 | *.tiff
48 | *.avi
49 | *.flv
50 | *.mov
51 | *.wmv
52 |
53 |
54 | .idea/
55 |
56 | # dotenv
57 | event_consumer/.env
58 |
59 | # virtualenv
60 | .venv
61 | venv*/
62 | ENV/
63 | test-reports/
64 |
65 | # Installer logs
66 | pip-log.txt
67 | pip-delete-this-directory.txt
68 |
69 | # Unit test / coverage reports
70 | htmlcov/
71 | .tox/
72 | .coverage
73 | .coverage.*
74 | .cache
75 | nosetests.xml
76 | coverage.xml
77 | *.cover
78 | .hypothesis/
79 |
80 | # Byte-compiled / optimized / DLL files
81 | __pycache__/
82 | *.py[cod]
83 | *$py.class
84 |
85 | # Ops Log files
86 | trie_rejected_words.txt
87 |
88 | # trained models
89 | event_consumer/trained_models/*
90 | !event_consumer/trained_models/__init__.py
91 | !event_consumer/trained_models/generated_dictionaries/
92 |
93 | # C extensions
94 | *.so
95 |
96 | # Distribution / packaging
97 | .Python
98 | env/
99 | build/
100 | develop-eggs/
101 | dist/
102 | downloads/
103 | eggs/
104 | .eggs/
105 | lib/
106 | lib64/
107 | parts/
108 | sdist/
109 | var/
110 | wheels/
111 | *.egg-info/
112 | .installed.cfg
113 | *.egg
114 |
--------------------------------------------------------------------------------
/src/kafka_core/kafka_stream_writer.py:
--------------------------------------------------------------------------------
1 | import json
2 | from typing import TypeVar, Generic, List
3 |
4 | from kafka import KafkaProducer
5 |
6 | from src.exceptions.usi_exceptions import BadConsumerConfigException
7 | from src.kafka_core.ser_des_util import get_ser_des
8 | from src.stream_writers.stream_writer import StreamWriter
9 | from src.utility import logging_util
10 | from src.utility.common_util import CLIENT_ID
11 |
12 | logger = logging_util.get_logger(__name__)
13 |
14 | T = TypeVar('T')
15 |
16 |
17 | class KafkaStreamWriter(Generic[T], StreamWriter):
18 |
19 | def __init__(self, config: dict):
20 | self.config = config
21 | self.kafka_producer = self.__create_kafka_producer()
22 | if not config.get('topic_name'):
23 | raise BadConsumerConfigException('missing producer topic name.')
24 | self.topic = config.get('topic_name')
25 |
26 | def write(self, streams: List[T]) -> None:
27 | """
28 | writes message of type T to a kafka topic
29 | :param streams: list of messages
30 | :return: None
31 | """
32 | for event in streams:
33 | key = None
34 | if hasattr(event, 'key'):
35 | key = event.key
36 |
37 | self.kafka_producer.send(topic=self.topic, key=key, value=json.dumps(event.__dict__))
38 |
39 | def close(self) -> None:
40 | """
41 | closes the writer kafka producer object
42 | :return: None
43 | """
44 | if self.kafka_producer:
45 | self.kafka_producer.close()
46 |
47 | def __create_kafka_producer(self) -> KafkaProducer:
48 | bootstrap_servers = self.config.get('bootstrap_servers')
49 |
50 | return KafkaProducer(bootstrap_servers=bootstrap_servers,
51 | key_serializer=get_ser_des(self.config.get(
52 | 'key_serializer', 'STRING_SER')),
53 | value_serializer=get_ser_des(self.config.get(
54 | 'value_serializer', 'STRING_SER')),
55 | acks=self.config.get('acks', 'all'),
56 | compression_type=self.config.get('compression_type',
57 | 'gzip'),
58 | retries=self.config.get('retries', 1),
59 | linger_ms=self.config.get('linger_ms', 10),
60 | client_id=CLIENT_ID)
61 |
--------------------------------------------------------------------------------
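A small sketch of the producer above, wired with the dlq_config block from config/consumer_config.json; it assumes a broker is reachable at the configured bootstrap_servers:

```python
from src.kafka_core.kafka_stream_writer import KafkaStreamWriter
from src.model.worker_dto import DeadLetterDTO

dlq_config = {
    'bootstrap_servers': '192.168.64.1:9092',
    'topic_name': 'test-dlq',
    'key_serializer': 'STRING_SER',
    'value_serializer': 'STRING_SER',
}
dlq_writer: KafkaStreamWriter[DeadLetterDTO] = KafkaStreamWriter(dlq_config)

dead_letter = DeadLetterDTO(key='1', message='{"id": 1}', topic='test-topic',
                            partition=0, failed_at='TRANSFORM', error='boom', offset=42)
dlq_writer.write([dead_letter])
dlq_writer.close()
```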
/src/kafka_core/kafka_util.py:
--------------------------------------------------------------------------------
1 | from kafka import KafkaConsumer
2 |
3 | TWO_MINUTES = 2
4 |
5 |
6 | def is_end_offset_none(end_offsets: dict, start_offsets: dict) -> bool:
7 | """
8 |     Utility function to check if any partition that has a start offset is missing an end offset.
9 | :param end_offsets: topic partition and end offsets
10 | :param start_offsets:topic partition and start offsets
11 | :return: True/False
12 | """
13 | if len(end_offsets) == 0:
14 | return True
15 |
16 | for tp, offset in end_offsets.items():
17 | if offset is None and start_offsets[tp] is not None:
18 | return True
19 |
20 | return False
21 |
22 |
23 | def is_all_end_offset_found(end_offsets: dict, start_offsets: dict) -> bool:
24 | """
25 |     Utility function to check if every partition that has a start offset also has an end offset.
26 | :param end_offsets: topic partition and end offsets
27 | :param start_offsets:topic partition and start offsets
28 | :return: True/False
29 | """
30 | if len(end_offsets) == 0:
31 | return False
32 |
33 | for tp, offset in end_offsets.items():
34 | if offset is None and start_offsets[tp] is not None:
35 | return False
36 |
37 | return True
38 |
39 |
40 | def get_start_end_offsets(start_timestamp: int, end_timestamp: int,
41 | topic_partitions: set, consumer: KafkaConsumer):
42 | """
43 | Get start and end offset for all the partitions based on the given start and end timestamp
44 | :param start_timestamp: start timestamp in epoch time millis
45 | :param end_timestamp: end timestamp in epoch time millis
46 | :param topic_partitions: topic partition set
47 | :param consumer: kafka consumer
48 | :return: tuple of start offsets and end offsets for each partition
49 | """
50 | tp_start_timestamps: dict = {}
51 | for tp in topic_partitions:
52 | tp_start_timestamps[tp] = start_timestamp
53 |
54 | start_offsets = consumer.offsets_for_times(tp_start_timestamps)
55 | end_offsets = {}
56 |     # go back 2 minutes at a time and keep checking if there are end offsets in the partitions
57 | tp_end_timestamps: dict = {}
58 | while not is_all_end_offset_found(start_offsets=start_offsets, end_offsets=end_offsets):
59 | for tp in topic_partitions:
60 | # seek previous offset from a partition only if the offset is not found
61 | if len(end_offsets) == 0 or (end_offsets[tp] is None and start_offsets[tp] is not
62 | None):
63 | tp_end_timestamps[tp] = end_timestamp
64 |
65 | end_offsets = consumer.offsets_for_times(tp_end_timestamps)
66 | end_timestamp = end_timestamp - (TWO_MINUTES * 60 * 1000)
67 |
68 | return start_offsets, end_offsets
69 |
--------------------------------------------------------------------------------
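A sketch of the offset-window helper above; it mirrors how SeekConsumerWorker uses it and assumes reachable brokers plus a dummy poll so partitions get assigned:

```python
import time

from kafka import KafkaConsumer

from src.kafka_core.kafka_util import get_start_end_offsets

consumer = KafkaConsumer('test-topic', bootstrap_servers='192.168.64.1:9092',
                         group_id='offset-window-demo')
consumer.poll()  # dummy poll so Kafka assigns partitions to this consumer

end_ts = int(time.time() * 1000)
start_ts = end_ts - 60 * 60 * 1000  # one hour back, for illustration
start_offsets, end_offsets = get_start_end_offsets(
    start_timestamp=start_ts, end_timestamp=end_ts,
    topic_partitions=consumer.assignment(), consumer=consumer)
```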
/src/event_consumer_app.py:
--------------------------------------------------------------------------------
1 | import secrets
2 | import time
3 |
4 | from fastapi import FastAPI, Depends, HTTPException, status
5 | from fastapi.security import HTTPBasic, HTTPBasicCredentials
6 | from starlette.requests import Request
7 | from starlette.responses import JSONResponse
8 |
9 | from src import USERNAME, PASSWORD
10 | from src.exceptions.usi_exceptions import BadInput, GenericException
11 | from src.kafka_core.consumer_manager import ConsumerWorkerManager
12 | from src.utility import logging_util
13 |
14 | logger = logging_util.get_logger(__name__)
15 |
16 | app = FastAPI(title="Distributed Kafka Consumer Using Ray - Manager")
17 | security = HTTPBasic()
18 | cwm = ConsumerWorkerManager()
19 |
20 |
21 | def authorize(credentials: HTTPBasicCredentials = Depends(security)):
22 | correct_username = secrets.compare_digest(credentials.username, USERNAME)
23 | correct_password = secrets.compare_digest(credentials.password, PASSWORD)
24 | if not (correct_username and correct_password):
25 | raise HTTPException(
26 | status_code=status.HTTP_401_UNAUTHORIZED,
27 | detail="Incorrect username or password",
28 | headers={"WWW-Authenticate": "Basic"},
29 | )
30 | return credentials.username
31 |
32 |
33 | @app.on_event("startup")
34 | def on_startup():
35 | cwm.start_all_workers()
36 |
37 |
38 | @app.on_event("shutdown")
39 | def on_shutdown():
40 |     cwm.stop_all_workers()
41 |
42 |
43 | @app.post('/manager/health', include_in_schema=False)
44 | def health():
45 | return {'message': 'App is up!'}
46 |
47 |
48 | @app.post('/manager/start-consumers', dependencies=[Depends(authorize)])
49 | def start_consumers():
50 | cwm.start_all_workers()
51 | return "Successfully started all workers!"
52 |
53 |
54 | @app.get('/manager/fetch-consumers', dependencies=[Depends(authorize)])
55 | def get_consumers():
56 | return cwm.get_all_running_consumer()
57 |
58 |
59 | @app.post('/manager/read-from-timestamp', dependencies=[Depends(authorize)])
60 | def read_from_timestamp(consumer_name: str, start_timestamp: int,
61 | end_timestamp: int = int(time.time() * 1000),
62 | stop_running_consumer: bool = True):
63 | cwm.start_worker_with_timestamp(start_timestamp=start_timestamp, end_timestamp=end_timestamp,
64 | stop_regular=stop_running_consumer, name=consumer_name)
65 | return "Successfully started!"
66 |
67 |
68 | @app.post('/manager/start-consumer/{consumer_name}', dependencies=[Depends(authorize)])
69 | def start_consumer(consumer_name):
70 | cwm.start_worker(consumer_name)
71 | return "Successfully started worker!"
72 |
73 |
74 | @app.post('/manager/stop-consumers', dependencies=[Depends(authorize)])
75 | def stop_consumers():
76 | cwm.stop_all_workers()
77 | return "Successfully Stopped all workers!"
78 |
79 |
80 | @app.post('/manager/stop-consumer/{consumer_name}', dependencies=[Depends(authorize)])
81 | def stop_consumer(consumer_name):
82 | cwm.stop_worker(consumer_name)
83 | return "Successfully Stopped!"
84 |
85 |
86 | @app.exception_handler(Exception)
87 | def generic_exception_handler(request: Request, exc: Exception):
88 | logger.error(exc)
89 | return JSONResponse(
90 | status_code=500,
91 | content={"message": "Oops! I messed up!"},
92 | )
93 |
94 |
95 | @app.exception_handler(GenericException)
96 | def request_validation_exception_handler(request: Request, exc: GenericException):
97 | logger.error(exc)
98 | return JSONResponse(
99 | status_code=500,
100 | content={"message": exc.message},
101 | )
102 |
103 |
104 | @app.exception_handler(BadInput)
105 | def bad_input_exception_handler(request: Request, exc: BadInput):
106 | logger.error(exc)
107 | return JSONResponse(
108 | status_code=422,
109 | content={"message": exc.message},
110 | )
111 |
--------------------------------------------------------------------------------
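For reference, a minimal client sketch against the manager API above, using the default admin/admin Basic Auth credentials; the requests package is assumed to be installed separately (it is not in requirements.txt):

```python
import requests

BASE_URL = 'http://localhost:8002'  # assumed host/port; the Dockerfile exposes 8002
AUTH = ('admin', 'admin')           # APP_USERNAME / APP_PASSWORD defaults

# list configured consumers and their status
print(requests.get(f'{BASE_URL}/manager/fetch-consumers', auth=AUTH).json())

# start and stop all configured consumer workers
requests.post(f'{BASE_URL}/manager/start-consumers', auth=AUTH)
requests.post(f'{BASE_URL}/manager/stop-consumers', auth=AUTH)
```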
/src/kafka_core/sink_task.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import time
3 | from abc import ABC
4 | from typing import List
5 |
6 | from kafka.consumer.fetcher import ConsumerRecord
7 | from ratelimit import limits, sleep_and_retry
8 |
9 | from src.exceptions.usi_exceptions import BadConsumerConfigException
10 | from src.kafka_core.kafka_stream_writer import KafkaStreamWriter
11 | from src.model.worker_dto import DeadLetterDTO, SinkRecordDTO
12 | from src.stream_writers.stream_writer import StreamWriter, get_stream_writers
13 | from src.transformers.transformer import get_transformer
14 |
15 | ONE_SECOND = 1
16 | CALLS = 20
17 |
18 |
19 | class SinkTask(ABC):
20 |
21 | def __init__(self, config: dict):
22 | self.sink_configs = config.get('sink_configs')
23 | if self.sink_configs is None:
24 | raise BadConsumerConfigException('Missing Sink Config.')
25 | self.config = config
26 | processor_cls_path = self.sink_configs.get('transformer_cls')
27 | if not processor_cls_path:
28 | raise BadConsumerConfigException('sink_configs.transformer_cls is a mandatory config')
29 | self.stream_transformer = get_transformer(processor_cls_path, self.sink_configs)
30 | self.operation_extractor = None
31 | stream_writer_cls_paths: List[str] = self.sink_configs.get('stream_writers')
32 | if not stream_writer_cls_paths or len(stream_writer_cls_paths) == 0:
33 | raise BadConsumerConfigException('sink_configs.stream_writers is a mandatory config')
34 | self.sink_stream_writers: List[StreamWriter] = get_stream_writers(
35 | stream_writer_cls_paths, self.sink_configs)
36 |         self.dlq_stream_writer = None  # stays None when no dlq_config is provided
37 |         if config.get('dlq_config') is not None:
38 |             self.dlq_stream_writer = KafkaStreamWriter(config.get('dlq_config'))
39 | self.retries = self.sink_configs.get('num_retries', 3)
40 | self.retry_delay_seconds = self.sink_configs.get('retry_delay_seconds', 1)
41 |
42 | def write_to_sink(self, sink_record_dto_list: List[SinkRecordDTO]):
43 | for stream_writer in self.sink_stream_writers:
44 | retries = 0
45 | while retries <= self.retries:
46 | try:
47 | stream_writer.write(sink_record_dto_list)
48 | break
49 | except Exception as e:
50 | if retries == self.retries:
51 | raise e
52 | retries = retries + 1
53 | logging.error(f'{type(stream_writer)} - Failed with exception: {e}, retrying '
54 | f'attempt'
55 | f' {retries}')
56 | time.sleep(self.retry_delay_seconds)
57 |
58 | @sleep_and_retry
59 |     @limits(calls=CALLS, period=ONE_SECOND)
60 | def process(self, consumer_records: List[ConsumerRecord]):
61 |
62 | for consumer_record in consumer_records:
63 | try:
64 | sink_record_dto: SinkRecordDTO = self.stream_transformer.transform(consumer_record)
65 | sink_record_dto_list: List[SinkRecordDTO] = [sink_record_dto]
66 | except Exception as e:
67 | self.handle_dlq_push(consumer_record.key, consumer_record.value,
68 | consumer_record.topic, consumer_record.partition,
69 | 'TRANSFORM', e, consumer_record.offset)
70 | continue
71 |
72 | try:
73 | self.write_to_sink(sink_record_dto_list)
74 | except Exception as e:
75 | self.handle_dlq_push(consumer_record.key, consumer_record.value,
76 | consumer_record.topic, consumer_record.partition,
77 | 'SINK_UPDATE', e, consumer_record.offset)
78 |
79 | def handle_dlq_push(self, key: str, message: str, topic: str, partition: int,
80 | failed_at: str, error: Exception, offset: int):
81 | logging.warning(
82 | f'failed to {failed_at} key: {key} and message: {message}, in topic {topic} '
83 | f'having offset {offset}, with error: {error}')
84 | try:
85 | if self.dlq_stream_writer is not None:
86 | dead_letter = DeadLetterDTO(key=key, message=message, topic=topic,
87 | partition=partition, failed_at=failed_at,
88 | error=str(error) if error is not None else "",
89 | offset=offset)
90 | self.dlq_stream_writer.write([dead_letter])
91 | except Exception as e:
92 | logging.error(f'Failed to write to DLQ: {e}')
93 |
--------------------------------------------------------------------------------
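A sketch of driving SinkTask directly with the sample worker config; in the real flow, ConsumerWorker in consumer_manager.py does this per poll batch, and the dlq_config producer connects at construction time, so reachable Kafka brokers are assumed:

```python
from src.kafka_core.sink_task import SinkTask
from src.utility.config_manager import ConfigManager

worker_config = ConfigManager().get_worker_config_by_name('some_consumer_group_name')
sink_task = SinkTask(worker_config)

# consumer_records would normally be the ConsumerRecord batch returned by KafkaConsumer.poll()
# sink_task.process(consumer_records)
```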
/k8/ray/ray-cluster-config.yaml:
--------------------------------------------------------------------------------
1 | ####
2 | # We are creating one head pod and 3 worker replica pods.
3 | # The head node requests 0.5 CPU and 512 MB of memory; change this as you need.
4 | # Each worker node requests 0.25 CPU and 300 MB of memory; change this as you need.
5 | # Expose dashboard, ray head node and Redis server for public access using K8 external service
6 | # using NodePort.
7 | # Install all code dependencies as a package in both head and worker nodes. This way Ray head
8 | # nodes and worker nodes can find these modules.
9 | ####
10 | ---
11 | # create namespace, where all ray components will be deployed
12 | apiVersion: v1
13 | kind: Namespace
14 | metadata:
15 | name: ray
16 | ---
17 | # create service to expose ray head, redis and ray dashboard.
18 | apiVersion: v1
19 | kind: Service
20 | metadata:
21 | namespace: ray
22 | name: ray-head-service
23 | spec:
24 | type: NodePort
25 | ports:
26 | - name: client
27 | protocol: TCP
28 | port: 10001
29 | targetPort: 10001
30 | nodePort: 30001
31 | - name: dashboard
32 | protocol: TCP
33 | port: 8265
34 | targetPort: 8265
35 | nodePort: 30002
36 | - name: redis
37 | protocol: TCP
38 | port: 6379
39 | targetPort: 6379
40 | nodePort: 30003
41 | selector:
42 | component: ray-head
43 | ---
44 | apiVersion: v1
45 | kind: ConfigMap
46 | metadata:
47 | name: ray-head-config
48 | namespace: ray
49 | data:
50 | ray-head-url: ray-head-service
51 | ---
52 | apiVersion: apps/v1
53 | kind: Deployment
54 | metadata:
55 | namespace: ray
56 | name: ray-head
57 | spec:
58 | # Do not change this - Ray currently only supports one head node per cluster.
59 | replicas: 1
60 | selector:
61 | matchLabels:
62 | component: ray-head
63 | type: ray
64 | template:
65 | metadata:
66 | labels:
67 | component: ray-head
68 | type: ray
69 | spec:
70 | # If the head node goes down, the entire cluster (including all worker
71 | # nodes) will go down as well. If you want Kubernetes to bring up a new
72 | # head node in this case, set this to "Always," else set it to "Never."
73 | restartPolicy: Always
74 |
75 | # This volume allocates shared memory for Ray to use for its plasma
76 | # object store. If you do not provide this, Ray will fall back to
77 |       # /tmp, which causes slowdowns if it is not a shared memory volume.
78 | volumes:
79 | - name: dshm
80 | emptyDir:
81 | medium: Memory
82 | containers:
83 | - name: ray-head
84 | image: rayproject/ray:1.8.0
85 | imagePullPolicy: IfNotPresent
86 | command: [ "/bin/bash" ]
87 | args:
88 | - -c
89 | - >-
90 | pip install kafka-connect-dependency==0.1.1 &&
91 | ray start --head --port=6379 --redis-shard-ports=6380,6381 --num-cpus=$MY_CPU_REQUEST --object-manager-port=12345 --node-manager-port=12346 --dashboard-host=0.0.0.0 --block
92 | ports:
93 | - containerPort: 6379 # Redis port
94 | - containerPort: 10001 # Used by Ray Client
95 | - containerPort: 8265 # Used by Ray Dashboard
96 | - containerPort: 8000 # Used by Ray Serve
97 |
98 | # This volume allocates shared memory for Ray to use for its plasma
99 | # object store. If you do not provide this, Ray will fall back to
100 |       # /tmp, which causes slowdowns if it is not a shared memory volume.
101 | volumeMounts:
102 | - mountPath: /dev/shm
103 | name: dshm
104 | env:
105 | - name: MY_POD_IP
106 | valueFrom:
107 | fieldRef:
108 | fieldPath: status.podIP
109 | # This is used in the ray start command so that Ray can spawn the
110 | # correct number of processes. Omitting this may lead to degraded
111 | # performance.
112 | - name: MY_CPU_REQUEST
113 | valueFrom:
114 | resourceFieldRef:
115 | resource: requests.cpu
116 | resources:
117 | requests:
118 | cpu: 500m
119 | memory: 512Mi
120 | ---
121 | apiVersion: apps/v1
122 | kind: Deployment
123 | metadata:
124 | namespace: ray
125 | name: ray-worker
126 | spec:
127 | # Change this to scale the number of worker nodes started in the Ray cluster.
128 | replicas: 3
129 | selector:
130 | matchLabels:
131 | component: ray-worker
132 | type: ray
133 | template:
134 | metadata:
135 | labels:
136 | component: ray-worker
137 | type: ray
138 | spec:
139 | restartPolicy: Always
140 | volumes:
141 | - name: dshm
142 | emptyDir:
143 | medium: Memory
144 | containers:
145 | - name: ray-worker
146 | image: rayproject/ray:1.8.0
147 | imagePullPolicy: IfNotPresent
148 | command: [ "/bin/bash" ]
149 | args:
150 | - -c
151 | - >-
152 | pip install kafka-connect-dependency==0.1.1 &&
153 | ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_IP:6379 --object-manager-port=12345 --node-manager-port=12346 --block
154 | # This volume allocates shared memory for Ray to use for its plasma
155 | # object store. If you do not provide this, Ray will fall back to
156 |             # /tmp, which causes slowdowns if it is not a shared memory volume.
157 | volumeMounts:
158 | - mountPath: /dev/shm
159 | name: dshm
160 | env:
161 | # This is used in the ray start command so that Ray can spawn the
162 | # correct number of processes. Omitting this may lead to degraded
163 | # performance.
164 | - name: MY_CPU_REQUEST
165 | valueFrom:
166 | resourceFieldRef:
167 | resource: requests.cpu
168 | - name: RAY_HEAD_IP
169 | valueFrom:
170 | configMapKeyRef:
171 | name: ray-head-config
172 | key: ray-head-url
173 | resources:
174 | requests:
175 | cpu: 250m
176 | memory: 300Mi
177 |
178 |
179 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Distributed Kafka Consumer Using Ray
2 | Using this project you can create distributed Kafka consumers, with a specified number of
3 | consumers running across multiple nodes, and manage them through an API that supports
4 | operations such as starting and stopping
5 | consumers.
6 |
7 | This project uses [Ray](https://docs.ray.io/) to create distributed Kafka consumers.
8 |
9 | ### System Requirements:
10 | Python Version: 3.7
11 |
12 | Ray version: 1.8.0
13 |
14 | ### Setup Instructions
15 |
16 | **Step 1 - Create Your Transformer Class**
17 |
18 | To create a new transformer, implement the abstract class [StreamTransformer](https://github.com/bkatwal/distributed-kafka-consumer-python/blob/main/src/transformers/transformer.py) and
19 | reference this new transformer in the [worker config](https://github.com/bkatwal/distributed-kafka-consumer-python/blob/main/config/consumer_config.json).
20 |
21 | One example transformer is defined [here](https://github.com/bkatwal/distributed-kafka-consumer-python/blob/main/src/transformers/test_transformer.py)
22 |
23 | **Step 2 - Create your worker config**
24 |
25 | One Example config is defined [here](https://github.com/bkatwal/distributed-kafka-consumer-python/blob/main/config/consumer_config.json).
26 | ```json
27 | [
28 | {
29 | "consumer_name": "some_consumer_group_name",
30 | "topic_name": "first-topic",
31 | "number_of_workers": 2,
32 | "enable_auto_commit": false,
33 | "bootstrap_servers": "localhost:9092",
34 | "key_deserializer": "STRING_DES",
35 | "value_deserializer": "STRING_DES",
36 | "header_deserializer": null,
37 | "auto_offset_reset": "earliest",
38 | "max_poll_records": 20,
39 | "max_poll_interval_ms": 60000,
40 | "sink_configs": {
41 | "transformer_cls": "src.transformers.test_transformer.SampleTransformer",
42 | "num_retries": 3,
43 | "retry_delay_seconds": 1,
44 | "stream_writers": [
45 | "src.stream_writers.console_stream_writer.ConsoleStreamWriter"
46 | ]
47 | },
48 | "dlq_config": {
49 | "bootstrap_servers": "localhost:9092",
50 | "topic_name": "test-dlq",
51 | "key_serializer": "STRING_SER",
52 | "value_serializer": "STRING_SER",
53 | "acks": "all",
54 | "compression_type": "gzip",
55 | "retries": 3,
56 | "linger_ms": 10
57 | }
58 | }
59 | ]
60 |
61 | ```
62 |
63 | Config info
64 |
65 | Config Name|Description|Default value|Is mandatory?|
66 | -----------|-----------|-------------|-------------|
67 | consumer_name|This will be used as the consumer group name| |Yes
68 | number_of_workers|Number of consumers to create for a consumer group|1|No
69 | sink_configs|Any config related to your sink task. For example, if you are writing to Elasticsearch, you may want to add the ES endpoint here| |Yes
70 | dlq_config|Dead letter queue config| |No
71 | 
72 | For available serializers/deserializers, refer to [ser_des_util.py](https://github.com/bkatwal/distributed-kafka-consumer-python/blob/main/src/kafka_core/ser_des_util.py). The rest of the configs are self-explanatory.
73 | 
74 |
75 | **Step 3 - Install the Requirements**
76 |
77 | Install all dependencies listed in [requirements.txt](https://github.com/bkatwal/distributed-kafka-consumer-python/blob/main/requirements.txt):
78 | ```shell
79 | pip install -r requirements.txt
80 | ```
81 |
82 | Install the code as a package using `setup.py`.
83 | This is needed so Ray can find the modules to pickle/unpickle.
84 | 
85 | Go to the project root folder, where `setup.py` lives, and run:
86 | ```shell
87 | pip install -e .
88 | ```
89 |
90 | **Step 4 - Start ray head node**
91 |
92 | If running locally, run the command below:
93 | ```shell
94 | ray start --head --port=6379
95 | ```
96 |
97 |
98 | **Step 5 - Set necessary Environment Variables**
99 |
100 | Variable Name|Description|Is Mandatory?|Default Value|
101 | -------------|------------|------------|-------------|
102 | LOCAL_MODE|`Y` or `N`. Tells whether to run the Kafka consumer on a single node or in a distributed setup.|No|Y|
103 | RAY_HEAD_ADDRESS|Ex: `ray://192.168.0.19:10001`. Leave this unset if the head and the driver/app run on the same node|No|auto|
104 | WORKER_CONFIG_PATH|Path to the worker [JSON config](https://github.com/bkatwal/distributed-kafka-consumer-python/blob/main/config/consumer_config.json)|Yes||
105 | APP_USERNAME|Username for Basic API authentication|No|admin|
106 | APP_PASSWORD|Password for Basic API authentication|No|admin|
107 | WORKER_NUM_CPUS|Number of CPUs to reserve per consumer/worker|No|0.25|
108 | SECURITY_PROTOCOL|Security protocol used to connect to the Kafka brokers. Valid values are PLAINTEXT, SASL_PLAINTEXT, SASL_SSL|No|PLAINTEXT|
109 | SASL_MECHANISM|SASL mechanism when using SASL-based auth. Valid values are PLAIN, SCRAM-SHA-256, SCRAM-SHA-512|No|None|
110 | SASL_USERNAME|SASL username if using SASL auth to connect to Kafka|No|None|
111 | SASL_PASSWORD|SASL password if using SASL auth to connect to Kafka|No|None|
112 |
113 | **Step 6 - Run the APP**
114 | ```shell
115 | uvicorn src.event_consumer_app:app --port 8002 --reload
116 | ```
117 |
118 | **Run App in docker container**
119 |
120 | Build Image
121 | ```shell
122 | # run below in the project root folder
123 | docker build -t kafka-connect-ray .
124 | ```
125 |
126 | Run Image
127 | ```shell
128 | # add other environment variables as you need.
129 | docker run -e RAY_HEAD_ADDRESS=ray://localhost:10001 -e LOCAL_MODE=N -dp 8002:8002 kafka-connect-ray
130 | ```
131 |
132 | **IMPORTANT!!!!**
133 |
134 | While creating the Ray cluster, make sure to install the code dependencies by running the
135 | command below on each node, VM, or container:
136 | ```shell
137 | pip install kafka-connect-dependency==0.1.1
138 | ```
139 | This lets the Ray head and worker nodes find the modules.
140 |
141 | This setup is included in the Ray K8 [cluster config YAML](https://github.com/bkatwal/distributed-kafka-consumer-python/blob/main/k8/ray/ray-cluster-config.yaml#L74) file.
142 |
143 | ### License
144 |
145 | The MIT License (MIT)
146 |
147 | Copyright (c) Bikas Katwal - bikas.katwal10@gmail.com
148 |
149 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
150 | associated documentation files (the "Software"), to deal in the Software without restriction,
151 | including without limitation the rights to use, copy, modify, merge, publish, distribute,
152 | sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is
153 | furnished to do so, subject to the following conditions:
154 |
155 | The above copyright notice and this permission notice shall be included in all copies or substantial
156 | portions of the Software.
157 |
158 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT
159 | NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
160 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES
161 | OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
162 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
163 |
164 |
165 |
166 |
167 |
--------------------------------------------------------------------------------
/src/kafka_core/consumer_manager.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | import logging
3 | import threading
4 | import time
5 | import uuid
6 | from typing import Dict, List
7 |
8 | import ray
9 | from kafka import KafkaConsumer
10 | from ray.actor import ActorHandle
11 |
12 | from src import WORKER_NUM_CPUS, SASL_USERNAME, SASL_PASSWORD, SECURITY_PROTOCOL, SASL_MECHANISM, \
13 | RAY_HEAD_ADDRESS, LOCAL_MODE
14 | from src.exceptions.usi_exceptions import BadInput
15 | from src.kafka_core.kafka_util import get_start_end_offsets
16 | from src.kafka_core.ser_des_util import get_ser_des
17 | from src.kafka_core.sink_task import SinkTask
18 | from src.utility import logging_util
19 | from src.utility.common_util import singleton, CLIENT_ID
20 | from src.utility.config_manager import ConfigManager
21 |
22 | logger = logging_util.get_logger(__name__)
23 |
24 | TWO_MINUTES = 2
25 | MAX_RESTARTS_REMOTE_WORKER = 10
26 |
27 | if LOCAL_MODE == 'Y':
28 | ray.init()
29 | else:
30 | ray.init(address=RAY_HEAD_ADDRESS)
31 |
32 | logger.info('''This cluster consists of
33 | {} nodes in total
34 | {} CPU resources in total
35 | '''.format(len(ray.nodes()), ray.cluster_resources()['CPU']))
36 |
37 |
38 | @singleton
39 | class ConsumerWorkerManager:
40 |
41 | def __init__(self):
42 | self.consumer_worker_container: Dict[str, List[ActorHandle]] = {}
43 | self.seek_consumer_worker_container: Dict[str, SeekConsumerWorker] = {}
44 | self.config_manager = ConfigManager()
45 | self.worker_configs = self.config_manager.get_worker_config()
46 | self.init_container()
47 |
48 | def init_container(self) -> None:
49 | for worker_config in self.worker_configs:
50 | self.consumer_worker_container[worker_config.get('consumer_name')] = []
51 |
52 | def stop_all_workers(self):
53 |
54 | for worker_name, worker_actors in self.consumer_worker_container.items():
55 |
56 | for worker_actor in worker_actors:
57 | # wait on the future to stop the consumers
58 | ray.get(worker_actor.stop_consumer.remote())
59 |
60 | ray.kill(worker_actor)
61 | self.consumer_worker_container[worker_name] = []
62 |
63 | logger.info("All consumer workers stopped.")
64 |
65 | def get_all_running_consumer(self):
66 | result: List[Dict] = []
67 | for worker_config in self.worker_configs:
68 | worker: dict = {}
69 | consumer_name = worker_config.get('consumer_name')
70 | worker['consumer_name'] = consumer_name
71 | worker['total_num_workers'] = worker_config.get('number_of_workers')
72 | if consumer_name in self.consumer_worker_container:
73 | worker['num_workers_running'] = len(
74 | self.consumer_worker_container.get(consumer_name))
75 | worker['status'] = 'RUNNING'
76 | else:
77 | worker['num_workers_running'] = 0
78 | worker['status'] = 'STOPPED'
79 |
80 | result.append(worker)
81 |
82 | return result
83 |
84 | def start_all_workers(self):
85 | started_flag = False
86 | for worker_config in self.worker_configs:
87 |
88 | # start consumer only if the consumer workers are not running
89 | if len(self.consumer_worker_container.get(worker_config.get('consumer_name'))) == 0:
90 | started_flag = True
91 | num_workers: int = worker_config.get('number_of_workers', 1)
92 | i = 1
93 | for _ in itertools.repeat(None, num_workers):
94 | w_name = worker_config.get('consumer_name') + '-' + str(i)
95 | worker_actor: ActorHandle = ConsumerWorker.options(
96 | name=w_name, max_concurrency=2).remote(worker_config, w_name)
97 | i = i + 1
98 | worker_actor.run.remote()
99 | self.consumer_worker_container[worker_config.get('consumer_name')].append(
100 | worker_actor)
101 | if not started_flag:
102 |             raise BadInput('All consumers are already running.')
103 | logger.info("All consumer workers started.")
104 |
105 | def start_worker(self, name: str) -> None:
106 | if name not in self.consumer_worker_container:
107 | raise BadInput(f'Failed to start. Worker {name} not found.')
108 |
109 | if name in self.consumer_worker_container and len(self.consumer_worker_container.get(
110 | name)) > 0:
111 | raise BadInput('Consumer already running.')
112 |
113 | worker_config: dict = self.config_manager.get_worker_config_by_name(name)
114 | num_workers = worker_config.get('number_of_workers', 1)
115 |
116 | i = 1
117 | for _ in itertools.repeat(None, num_workers):
118 | w_name = name + '-' + str(i)
119 | worker_actor = ConsumerWorker.options(name=w_name, max_concurrency=2).remote(
120 | worker_config, w_name)
121 | i = i + 1
122 | self.consumer_worker_container[name].append(worker_actor)
123 | worker_actor.run.remote()
124 | logger.info(f"{num_workers} workers of worker group {name} started.")
125 |
126 | def stop_worker(self, name: str) -> None:
127 | if name not in self.consumer_worker_container:
128 | raise BadInput(f'Failed to stop. Worker {name} not found.')
129 |
130 | worker_actors = self.consumer_worker_container[name]
131 |
132 | if len(worker_actors) == 0:
133 | raise BadInput(f'Worker not running.')
134 |
135 | for worker_actor in worker_actors:
136 | # wait on the future before killing actors, so that the consumers are terminated
137 | # gracefully
138 | ray.get(worker_actor.stop_consumer.remote())
139 |
140 | ray.kill(worker_actor)
141 | self.consumer_worker_container[name] = []
142 | logger.info(f"{name} consumer worker stopped.")
143 |
144 | def start_worker_with_timestamp(self, name: str, start_timestamp: int, end_timestamp: int,
145 | stop_regular=False) -> None:
146 | """
147 |         Performs the below steps:
148 |         1. Stops the currently running consumer (if stop_regular=True).
149 |         2. Creates a new consumer with a new consumer group.
150 |         3. Seeks all offsets from start_timestamp till the end/current timestamp.
151 |         4. Stops the temporary consumer that was seeking old data.
152 |         5. Restarts the regular consumer.
153 | Warning: It is possible that the consumers may read the same data twice. So,
154 | it is important that the writes are idempotent
155 | :param name: consumer worker name
156 | :param start_timestamp: start time in epoch time millis - start consuming data from this
157 | timestamp
158 | :param end_timestamp end consuming data from this timestamp, if None passed,
159 | current timestamp will be used.
160 | :param stop_regular: if True stops the consumer worker passed in the argument.
161 | :return: None
162 | """
163 |
164 | if name in self.seek_consumer_worker_container:
165 | raise BadInput(f'One seek task for the consumer {name}, is already running.')
166 |
167 | try:
168 | self.seek_consumer_worker_container[name] = None
169 | worker_name = name + '-' + str(uuid.uuid4())
170 |
171 | if stop_regular:
172 | self.stop_worker(name)
173 |
174 | if not end_timestamp:
175 | end_timestamp = int(time.time() * 1000)
176 |
177 | worker = SeekConsumerWorker(self.config_manager.get_worker_config_by_name(name),
178 | start_timestamp, end_timestamp,
179 | seek_consumer_name=worker_name)
180 |
181 | self.seek_consumer_worker_container[name] = worker
182 | worker.start()
183 | worker.join()
184 | except Exception as e:
185 | logger.error(f'Failed to consume data from previous timestamp: {e}')
186 | raise e
187 | finally:
188 | if stop_regular:
189 | self.start_worker(name)
190 |
191 | self.seek_consumer_worker_container.pop(name)
192 |
193 |
194 | class SeekConsumerWorker(threading.Thread):
195 |
196 | def __init__(self, config: dict, start_timestamp: int, end_timestamp, seek_consumer_name: str):
197 | threading.Thread.__init__(self)
198 | self.consumer_name = seek_consumer_name
199 | self.start_timestamp = start_timestamp
200 | self.end_timestamp = end_timestamp
201 | self.stop_event = threading.Event()
202 | self.config = config
203 | self.auto_offset_reset = 'earliest'
204 | self.consumer_timeout_ms = 1000
205 | self.processed_count = 0
206 | self.sink_task: SinkTask = SinkTask(config)
207 | self.consumer = KafkaConsumer(bootstrap_servers=self.config.get('bootstrap_servers'),
208 | client_id=CLIENT_ID,
209 | group_id=self.consumer_name,
210 | key_deserializer=get_ser_des(self.config.get(
211 | 'key_deserializer', 'STRING_DES')),
212 | value_deserializer=get_ser_des(self.config.get(
213 | 'value_deserializer', 'JSON_DES')),
214 | auto_offset_reset=self.auto_offset_reset,
215 | enable_auto_commit=self.config.get('enable_auto_commit',
216 | True),
217 | max_poll_records=self.config.get('max_poll_records', 50),
218 | max_poll_interval_ms=self.config.get('max_poll_interval_ms',
219 | 600000),
220 | security_protocol=SECURITY_PROTOCOL,
221 | sasl_mechanism=SASL_MECHANISM,
222 | consumer_timeout_ms=1000)
223 | self.consumer.subscribe([self.config.get('topic_name')])
224 |
225 | def is_all_partitions_read(self, tp_flag: dict):
226 | for tp, flag in tp_flag.items():
227 | if not flag:
228 | return False
229 | return True
230 |
231 | def run(self) -> None:
232 | total_processed = 0
233 |
234 | # do a dummy poll, so kafka can assign partitions to this consumer
235 | self.consumer.poll()
236 |
237 | # get current assigned partitions
238 | # warning: create only one consumer, as consumer rebalancing can disrupt partition
239 | # assignment
240 | topic_partitions: set = self.consumer.assignment()
241 |
242 | start_offsets, end_offsets = get_start_end_offsets(
243 | start_timestamp=self.start_timestamp,
244 | end_timestamp=self.end_timestamp,
245 | topic_partitions=topic_partitions,
246 | consumer=self.consumer)
247 |
248 | for tp in topic_partitions:
249 | self.consumer.seek(tp, start_offsets.get(tp).offset)
250 |
251 | tp_break_flag: dict = {}
252 | for tp in end_offsets.keys():
253 | tp_break_flag[tp] = False
254 |
255 | while True:
256 | tp_records_dict = self.consumer.poll(timeout_ms=self.consumer_timeout_ms)
257 |
258 | if tp_records_dict is None or len(tp_records_dict.items()) == 0:
259 | continue
260 | try:
261 |
262 | for topic_partition, consumer_records in tp_records_dict.items():
263 | consumer_records_buffer = []
264 | for consumer_record in consumer_records:
265 | if consumer_record.offset >= end_offsets[topic_partition].offset:
266 | tp_break_flag[topic_partition] = True
267 | break
268 | consumer_records_buffer.append(consumer_record)
269 | total_processed += 1
270 | self.sink_task.process(consumer_records_buffer)
271 |
272 | self.consumer.commit()
273 |
274 | if self.is_all_partitions_read(tp_break_flag):
275 | self.consumer.close()
276 | logging.info(
277 | f'stopping seek consumer {self.consumer_name}, '
278 | f'total records processed: {total_processed}')
279 | break
280 | except BaseException as e:
281 | logger.error(e)
282 |
283 |
284 | @ray.remote(max_restarts=MAX_RESTARTS_REMOTE_WORKER, max_task_retries=MAX_RESTARTS_REMOTE_WORKER,
285 | num_cpus=WORKER_NUM_CPUS)
286 | class ConsumerWorker:
287 | def __init__(self, config: dict, worker_name: str):
288 | # creating a separate logger for individual worker. As they only need to print in stdout
289 | # or stderr
290 | logging.basicConfig(level=logging.INFO)
291 | self.consumer_name = config.get('consumer_name')
292 | self.worker_name = worker_name
293 | self.config = config
294 | self.stop_worker = False
295 | self.auto_offset_reset = 'earliest'
296 | self.poll_timeout_ms = 1000
297 | self.sink_task: SinkTask = SinkTask(config)
298 | self.is_closed = False
299 | # set to double of poll_timeout_ms because - in the next iteration of poll, thread will
300 | # attempt to stop kafka consumer
301 | self.consumer_stop_delay_seconds = 2 * self.poll_timeout_ms / 1000
302 | self.consumer = KafkaConsumer(bootstrap_servers=self.config.get('bootstrap_servers'),
303 | client_id=CLIENT_ID,
304 | group_id=self.consumer_name,
305 | key_deserializer=get_ser_des(self.config.get(
306 | 'key_deserializer', 'STRING_DES')),
307 | value_deserializer=get_ser_des(self.config.get(
308 | 'value_deserializer', 'JSON_DES')),
309 | auto_offset_reset=self.auto_offset_reset,
310 | enable_auto_commit=self.config.get('enable_auto_commit',
311 | True),
312 | max_poll_records=self.config.get('max_poll_records', 50),
313 | max_poll_interval_ms=self.config.get('max_poll_interval_ms',
314 | 600000),
315 | security_protocol=SECURITY_PROTOCOL,
316 | sasl_mechanism=SASL_MECHANISM,
317 | sasl_plain_username=SASL_USERNAME,
318 | sasl_plain_password=SASL_PASSWORD,
319 | consumer_timeout_ms=1000)
320 | self.consumer.subscribe([self.config.get('topic_name')])
321 | logging.info(f'Started consumer worker {self.worker_name}')
322 |
323 | def stop_consumer(self) -> None:
324 | logging.info(f'Stopping consumer worker {self.worker_name}')
325 | self.stop_worker = True
326 |
327 | # give time for the consumer to stop gracefully
328 | time.sleep(self.consumer_stop_delay_seconds)
329 | logging.info(f'Stopped consumer worker {self.worker_name}')
330 |
331 | def closed(self):
332 | return self.is_closed
333 |
334 | def run(self) -> None:
335 |
336 | while not self.stop_worker:
337 | tp_records_dict = self.consumer.poll(timeout_ms=self.poll_timeout_ms)
338 |
339 | if tp_records_dict is None or len(tp_records_dict.items()) == 0:
340 | continue
341 | try:
342 |
343 | for topic_partition, consumer_records in tp_records_dict.items():
344 | self.sink_task.process(consumer_records)
345 |
346 | self.consumer.commit()
347 |
348 | if self.stop_worker:
349 | self.consumer.close()
350 | self.is_closed = True
351 | break
352 | except BaseException as e:
353 | logging.error('Error while running consumer worker!')
354 | logging.error(e)
355 |
--------------------------------------------------------------------------------