├── .github └── PULL_REQUEST_TEMPLATE.md ├── .gitignore ├── .zappr.yaml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE.txt ├── MAINTAINERS ├── MANIFEST.in ├── README.md ├── SECURITY.md ├── bubuku ├── __init__.py ├── aws │ ├── __init__.py │ ├── cluster_config.py │ ├── ec2_node_launcher.py │ ├── ip_address_allocator.py │ └── node.py ├── broker.py ├── cli.py ├── communicate.py ├── config.py ├── controller.py ├── controller_api.py ├── daemon.py ├── env_provider.py ├── features │ ├── __init__.py │ ├── data_size_stats.py │ ├── metric_collector.py │ ├── migrate.py │ ├── rebalance │ │ ├── __init__.py │ │ ├── broker.py │ │ ├── change.py │ │ ├── change_simple.py │ │ └── check.py │ ├── remote_exec.py │ ├── restart_if_dead.py │ ├── restart_on_zk_change.py │ ├── rolling_restart.py │ ├── swap_partitions.py │ └── terminate.py ├── id_extractor.py ├── process.py ├── utils.py └── zookeeper │ ├── __init__.py │ └── exhibitor.py ├── cli_docs ├── cli.md ├── generate_cli_docs.py └── generate_cli_docs.sh ├── delivery.yaml ├── docker-compose.yml ├── docker ├── download_kafka.sh ├── log4j.properties └── server.properties ├── requirements.txt ├── setup.cfg ├── setup.py └── tests ├── test_broker.py ├── test_broker_id_generator.py ├── test_check_time_period.py ├── test_cli.py ├── test_config.py ├── test_controller.py ├── test_daemon.py ├── test_exhibitor.py ├── test_migrate.py ├── test_partitions_swap.py ├── test_rebalance.py ├── test_restart.py ├── test_restart_if_dead.py ├── test_size_stats_collecting.py ├── test_startup_timeout.py └── test_zookeeper.py /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | [ARUHA-XXX: Name of ticket](link to ticket) 2 | 3 | ## Description 4 | A few sentences describing the overall goals of the pull request's 5 | commits. 6 | 7 | ## Review 8 | - [ ] Tests 9 | - [ ] Documentation 10 | - [ ] CHANGELOG 11 | 12 | ## Deployment Notes 13 | These should highlight any db migrations, feature toggles, etc. 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.iml 3 | *.pyc 4 | .eggs/ 5 | bubuku.egg-info/ 6 | .cache/ 7 | build/ 8 | dist/ -------------------------------------------------------------------------------- /.zappr.yaml: -------------------------------------------------------------------------------- 1 | X-Zalando-Team: "aruha" 2 | X-Zalando-Type: code 3 | 4 | approvals: 5 | groups: 6 | zalando: 7 | minimum: 2 8 | from: 9 | orgs: 10 | - zalando 11 | 12 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct ([from here](http://contributor-covenant.org/version/1/4/)). 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, gender identity and expression, level of experience, 9 | nationality, personal appearance, race, religion, or sexual identity and 10 | orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at team-aruha@zalando.de. 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at [http://contributor-covenant.org/version/1/4][version] 72 | 73 | [homepage]: http://contributor-covenant.org 74 | [version]: http://contributor-covenant.org/version/1/4/ 75 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Bubuku 2 | 3 | **Thank you for your interest in Bubuku. 
Your contributions are highly welcome.** 4 | 5 | There are multiple ways of getting involved: 6 | 7 | - [Report a bug](#report-a-bug) 8 | - [Suggest a feature](#suggest-a-feature) 9 | - [Contribute code](#contribute-code) 10 | 11 | Below are a few guidelines we would like you to follow. 12 | If you need help, please reach out to us by opening an issue. 13 | 14 | ## Report a bug 15 | Reporting bugs is one of the best ways to contribute. Before creating a bug report, please check that an [issue](https://github.com/zalando-nakadi/bubuku/issues) reporting the same problem does not already exist. If there is an such an issue, you may add your information as a comment. 16 | 17 | To report a new bug you should open an issue that summarizes the bug and set the label to "bug". 18 | 19 | If you want to provide a fix along with your bug report: That is great! In this case please send us a pull request as described in section [Contribute Code](#contribute-code). 20 | 21 | ## Suggest a feature 22 | To request a new feature you should open an [issue](https://github.com/zalando-nakadi/bubuku/issues/new) and summarize the desired functionality and its use case. Set the issue label to "feature". 23 | 24 | ## Contribute code 25 | This is a rough outline of what the workflow for code contributions looks like: 26 | - Check the list of open [issues](https://github.com/zalando-nakadi/bubuku/issues). Either assign an existing issue to yourself, or create a new one that you would like work on and discuss your ideas and use cases. It is always best to discuss your plans beforehand, to ensure that your contribution is in line with our goals for Bubuku. 27 | - Fork the repository on GitHub 28 | - Create a topic branch from where you want to base your work. This is usually master. 29 | - Make commits of logical units. 30 | - Write good commit messages (see below). 31 | - Push your changes to a topic branch in your fork of the repository. 32 | - Submit a pull request to [zalando-incubator/bubuku](https://github.com/zalando-nakadi/bubuku) 33 | - Your pull request must receive a :thumbsup: from two [Maintainers](https://github.com/zalando-nakadi/ubuku/blob/master/MAINTAINERS) 34 | 35 | Thanks for your contributions! 36 | 37 | ### Commit messages 38 | Your commit messages ideally can answer two questions: what changed and why. The subject line should feature the “what” and the body of the commit should describe the “why”. 39 | 40 | When creating a pull request, its comment should reference the corresponding issue id. 
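For illustration, a commit message following this guideline could look like the example below (the change described is made up):

```
Increase startup timeout after every failed broker start

A fixed timeout was not always long enough for the broker to register in
zookeeper, so kafka was killed while it was still starting up. Grow the
timeout after each failed attempt instead.
```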
41 | 42 | **Have fun, and happy hacking!** 43 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM container-registry.zalando.net/library/python-3.9-slim:latest 2 | MAINTAINER Team Aruha, team-aruha@zalando.de 3 | 4 | RUN apt-get update 5 | RUN apt-get install -y --mark-auto curl wget gnupg 6 | 7 | # Install corretto JDK: https://docs.aws.amazon.com/corretto/latest/corretto-11-ug/generic-linux-install.html 8 | RUN wget -O- https://apt.corretto.aws/corretto.key | apt-key add - 9 | RUN echo 'deb https://apt.corretto.aws stable main' >/etc/apt/sources.list.d/amazon-corretto-jdk.list 10 | RUN apt-get update && apt-get install -y java-17-amazon-corretto-jdk 11 | 12 | # Install kafka 13 | ENV KAFKA_VERSION="3.1.1" SCALA_VERSION="2.13" JOLOKIA_VERSION="1.6.2" 14 | ENV KAFKA_DIR="/opt/kafka" KAFKA_LOGS_DIR="/data/logs" KAFKA_SETTINGS="/opt/kafka/config/server.properties" 15 | 16 | ADD docker/download_kafka.sh /tmp/download_kafka.sh 17 | 18 | RUN sh /tmp/download_kafka.sh ${SCALA_VERSION} ${KAFKA_VERSION} ${KAFKA_DIR} ${JOLOKIA_VERSION} 19 | 20 | ADD docker/server.properties ${KAFKA_SETTINGS} 21 | ADD docker/log4j.properties ${KAFKA_DIR}/config/ 22 | 23 | # Install bubuku 24 | ENV SRC_PATH="/bubuku" 25 | 26 | ADD ./bubuku "${SRC_PATH}/bubuku" 27 | ADD ./requirements.txt "${SRC_PATH}/" 28 | ADD ./setup.py "${SRC_PATH}/" 29 | 30 | RUN mkdir -p $KAFKA_LOGS_DIR/ && \ 31 | cd "${SRC_PATH}" && \ 32 | pip3 install --no-cache-dir -r "requirements.txt" && \ 33 | python3 setup.py develop && \ 34 | chmod -R 777 $KAFKA_LOGS_DIR && \ 35 | chmod 777 ${KAFKA_SETTINGS} && \ 36 | \ 37 | mkdir ${KAFKA_DIR}/logs && \ 38 | chmod 777 ${KAFKA_DIR}/logs 39 | 40 | RUN apt-get -y autoremove && \ 41 | apt-get clean && \ 42 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* 43 | 44 | ENV HEALTH_PORT=8080 45 | ENV BUKU_FEATURES="restart_on_exhibitor,rebalance_on_start,graceful_terminate,use_ip_address" 46 | ENV KAFKA_OPTS="-server -Dlog4j.configuration=file:${KAFKA_DIR}/config/log4j.properties -Dkafka.logs.dir=${KAFKA_LOGS_DIR} -javaagent:/opt/jolokia-jvm-agent.jar=host=0.0.0.0" 47 | ENV KAFKA_JMX_OPTS="-Dcom.sun.management.jmxremote=true -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false" 48 | 49 | EXPOSE 9092 8080 8778 50 | 51 | ENTRYPOINT ["/bin/bash", "-c", "exec bubuku-daemon"] 52 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016 Zalando SE 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 4 | documentation files (the "Software"), to deal in the Software without restriction, including without limitation the 5 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit 6 | persons to whom the Software is furnished to do so, subject to the following conditions: 7 | 8 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the 9 | Software. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 12 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR 13 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 14 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /MAINTAINERS: -------------------------------------------------------------------------------- 1 | Andrey Dyachkov 2 | Dmitry Sorokin 3 | Ricardo De Cillo 4 | Vyacheslav Stepanov 5 | Lionel Montrieux 6 | Suyash Garg 7 | Kunal Jha 8 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | 2 | include *.txt 3 | recursive-include bubuku *.py -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Archived 2 | 3 | **This repository is now archived.** 4 | 5 | 6 | [![Build Status](https://travis-ci.org/zalando-incubator/bubuku.svg)](https://travis-ci.org/zalando-incubator/bubuku) 7 | # Introduction 8 | 9 | Bubuku - supervisor for kafka 10 | 11 | 12 | Google translate with automatic language detection says that it means 'day' in 13 | Xhosa language. 14 | 15 | 16 | The purpose of bubuku is to start, monitor and rebalance kafka cluster in AWS setup, handling these actions in 17 | coordinated manner. 18 | 19 | Bubuku assumes that kafka is already installed on a machine. Version of kafka that is tested to be working 20 | with current release: 0.9.0.1 21 | 22 | # Installation 23 | ``` 24 | pip3 install bubuku 25 | ``` 26 | 27 | # Running 28 | Start supervisor: 29 | ``` 30 | KAFKA_DIR=/opt/kafka KAFKA_SETTINGS=/opt/kafka/config/server.properties ZOOKEEPER_STACK_NAME=my_zookeeper \ 31 | ZOOKEEPER_PREFIX=/test BROKER_ID_POLICY=ip BUKU_FEATURES=restart_on_exhibitor,graceful_terminate \ 32 | HEALTH_PORT=8888 bubuku-daemon 33 | ``` 34 | Run commands on cluster: 35 | ``` 36 | export KAFKA_DIR=/opt/kafka 37 | export KAFKA_SETTINGS=/opt/kafka/config/server.properties 38 | export ZOOKEEPER_STACK_NAME=my_zookeeper 39 | export ZOOKEEPER_PREFIX=/test 40 | export BROKER_ID_POLICY=ip 41 | 42 | # Restart kafka on current node 43 | bubuku-cli restart 44 | # Restart kafka on some other node (broker id must be known) 45 | bubuku-cli restart --broker=12324 46 | 47 | # Invoke partitions rebalance 48 | bubuku-cli rebalance 49 | ``` 50 | It is important to have all properties provided, because command processing is made over zookeeper stack. 51 | 52 | # Configuration 53 | 54 | Bubuku can be configured using environment properties: 55 | 56 | - `KAFKA_DIR` - kafka root directory (for example /opt/kafka) 57 | - `KAFKA_SETTINGS` - Path to kafka settings template file. Bubuku will add (or replace/delete) it's own 58 | properties to this file and write the contents to `${KAFKA_DIR}/config/server.properties`. Kafka will be started 59 | against generated file. 60 | - `ZOOKEEPER_STACK_NAME` - AWS load balancer name for zookeeper stack 61 | - `ZOOKEEPER_STATIC_IPS_PORT` - (overrides ZOOKEEPER_STACK_NAME) - static list 62 | of ips/port of zookeeper stack in the following form: 127.0.0.1,127.0.0.2,127.0.0.3:2181 - several ips and 1 port 63 | - `ZOOKEEPER_PREFIX` - Prefix for all the nodes in zk for kafka and bubuku 64 | - `BROKER_ID_POLICY` - Policy for generating broker id. 
Possible values are `ip` and `auto` 65 | - `BUKU_FEATURES` - List of optional bubuku features, see [features](#features) section 66 | - `HEALTH_PORT` - Port for health checks 67 | - `FREE_SPACE_DIFF_THRESHOLD_MB` - Threshold for starting `balance_data_size` feature, if it's enabled 68 | - `STARTUP_TIMEOUT_TYPE`, `STARTUP_TIMEOUT_INITIAL`, `STARTUP_TIMEOUT_STEP` - The way bubuku manages [time to start for kafka](#startup_timeout). 69 | 70 | # Features # 71 | 72 | Bubuku provides: 73 | 74 | - Ip address discovery using AWS 75 | - Exhibitor discovery using AWS load balancer name 76 | - Rebalance partitions on different events 77 | - React on exhibitor topology change 78 | - Automatic kafka restart in case if broker is considered dead 79 | - Graceful broker termination in case of supervisor stop 80 | - Broker start/stop/restart synchronization across cluster 81 | 82 | ## Pluggable features 83 | Pluggable features are defined in configuration and are disabled by default. List of features: 84 | 85 | - `restart_on_exhibitor` - restart kafka broker on zookeeper address list change. Kafka by itself do not support 86 | zookeeper list provider, so in order to override list of zookeeper instances in runtime there will be configuration 87 | change and broker restart. This change is made one by one for each broker in cluster (so kafka instances won't die 88 | all at the same time) 89 | - `rebalance_on_start` - Rebalance partition distribution across cluster (using partition count and leader count 90 | per broker as optimization strategy) during initial broker startup 91 | - `rebalance_on_brokers_change` - Rebalance partition distribution across cluster (using partition count and leader 92 | count per broker as optimization strategy) on any broker list change (new broker started, old broker died) 93 | - `graceful_terminate` - In case when bubuku is killed, try to gracefully terminate kafka process. 94 | - `use_ip_address` - Use ip address when registering kafka instance. By default kafka registers itself in 95 | zookeeper using hostname. Sometimes (for example on migration between AWS regions) it makes sense to use ip 96 | address instead of hostname. 97 | - `balance_data_size` - Swap partitions one by one by one if imbalance in size on brokers is bigger than 98 | `FREE_SPACE_DIFF_THRESHOLD_MB` megabytes. 99 | 100 | ## Timeouts for startup 101 | Each time when bubuku tries to start kafka, it uses special startup timeout. This means, that if kafka broker id 102 | is not found within this timeout in zookeeper node `/broker/ids/{id}`, kafka process will be forcibly killed, timeout 103 | for start updated, and startup will be retried. 104 | 105 | There are two ways to increase timeout - linear and progressive. Linear adds the same amount of time after each 106 | failed start. Progressive adds time, that is relative to current timeout. Configuration for that is provided by 107 | `STARTUP_TIMEOUT_TYPE`, `STARTUP_TIMEOUT_INITIAL`, `STARTUP_TIMEOUT_STEP` parameters. 
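As an illustration (not part of bubuku itself), the sketch below shows how the timeout value evolves after each failed start; it mirrors the `LinearTimeout` and `ProgressiveTimeout` classes in `bubuku/broker.py`. The environment configuration for both modes follows after it.
```
# Sketch only: how the startup timeout grows after each failed kafka start.
# Linear adds STARTUP_TIMEOUT_STEP, progressive multiplies by (1 + STARTUP_TIMEOUT_STEP).
def next_timeout(timeout_type: str, current: float, step: float) -> float:
    if timeout_type == 'linear':
        return current + step          # 300 -> 360 -> 420 -> ...
    if timeout_type == 'progressive':
        return current * (1 + step)    # 300 -> 450 -> 675 -> ...
    return current                     # 'none': timeout is never increased

timeout = 300.0
for _ in range(3):
    print(timeout)                     # progressive with step 0.5: 300.0, 450.0, 675.0
    timeout = next_timeout('progressive', timeout, 0.5)
```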
108 | ``` 109 | # Linear timeout configuration 110 | # initial timeout=300 seconds, after each failed start increase by 60 seconds (360, 420 and so on) 111 | export STARTUP_TIMEOUT_TYPE="linear" 112 | export STARTUP_TIMEOUT_INITIAL="300" 113 | export STARTUP_TIMEOUT_STEP="60" 114 | ``` 115 | ``` 116 | # Progressive timeout configuration 117 | # Initial timeout=300 seconds, after each failed start increase by timeout * 0.5 (450, 675 and so on) 118 | export STARTUP_TIMEOUT_TYPE="progressive" 119 | export STARTUP_TIMEOUT_INITIAL="300" 120 | export STARTUP_TIMEOUT_STEP="0.5" 121 | ``` 122 | 123 | Default values for timeout are 124 | ``` 125 | export STARTUP_TIMEOUT_TYPE="linear" 126 | export STARTUP_TIMEOUT_INITIAL="300" 127 | export STARTUP_TIMEOUT_STEP="60" 128 | ``` 129 | 130 | # Command line interface 131 | 132 | Bubuku provides a command line tool `bubuku-cli` which should be used directly on the instance. See detailed 133 | description of all commands [here](https://github.com/zalando-nakadi/bubuku/blob/master/cli_docs/cli.md). 134 | 135 | # How to contribute 136 | 137 | If you have any features or bugfixes - make pull request providing feature/bugfix and tests that will test your 138 | feature/bugfix. 139 | 140 | # Reporting issues 141 | 142 | If you experiencing problems with bubuku and you know that it can be improved - please fill free to post issue 143 | to github. Please provide full description of feature or bug (optionally with unit test), so it can be fixed 144 | faster. 145 | 146 | # License 147 | 148 | Copyright (c) 2016 Zalando SE 149 | 150 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 151 | documentation files (the "Software"), to deal in the Software without restriction, including without limitation the 152 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit 153 | persons to whom the Software is furnished to do so, subject to the following conditions: 154 | 155 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the 156 | Software. 157 | 158 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 159 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 160 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 161 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 162 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | We acknowledge that every line of code that we write may potentially contain security issues. We are trying to deal with it responsibly and provide patches as quickly as possible. 2 | 3 | We host our bug bounty program on HackerOne, it is currently private, therefore if you would like to report a vulnerability and get rewarded for it, please ask to join our program by filling this form: 4 | 5 | https://corporate.zalando.com/en/services-and-contact#security-form 6 | 7 | You can also send your report via this form if you do not want to join our bug bounty program and just want to report a vulnerability or security issue. 
8 | 9 | -------------------------------------------------------------------------------- /bubuku/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.10.48' 2 | -------------------------------------------------------------------------------- /bubuku/aws/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import boto3 4 | from botocore.config import Config 5 | 6 | 7 | class AWSResources(object): 8 | def __init__(self, region, retries=100): 9 | boto3.set_stream_logger('boto3', logging.INFO) 10 | self.session = boto3.Session() 11 | self.region = region 12 | self.retries = retries 13 | self._ec2_client = None 14 | self._ec2_resource = None 15 | self._cloudwatch_client = None 16 | self._iam_client = None 17 | 18 | @property 19 | def ec2_client(self): 20 | if not self._ec2_client: 21 | self._ec2_client = self.session.client( 22 | 'ec2', 23 | region_name=self.region, 24 | config=Config(retries={'max_attempts': self.retries})) 25 | return self._ec2_client 26 | 27 | @property 28 | def ec2_resource(self): 29 | if not self._ec2_resource: 30 | self._ec2_resource = self.session.resource( 31 | 'ec2', 32 | region_name=self.region, 33 | config=Config(retries={'max_attempts': self.retries})) 34 | return self._ec2_resource 35 | 36 | @property 37 | def cloudwatch_client(self): 38 | if not self._cloudwatch_client: 39 | self._cloudwatch_client = self.session.client( 40 | 'cloudwatch', 41 | region_name=self.region, 42 | config=Config(retries={'max_attempts': self.retries})) 43 | return self._cloudwatch_client 44 | 45 | @property 46 | def iam_client(self): 47 | if not self._iam_client: 48 | self._iam_client = self.session.client('iam', config=Config(retries={'max_attempts': self.retries})) 49 | return self._iam_client 50 | -------------------------------------------------------------------------------- /bubuku/aws/cluster_config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import requests 4 | import yaml 5 | 6 | _ARTIFACT_NAME = 'bubuku-appliance' 7 | 8 | _LOG = logging.getLogger('bubuku.aws.cluster_config') 9 | 10 | 11 | class ConfigLoader(object): 12 | def load_user_data(self) -> dict: 13 | pass 14 | 15 | def load_region(self) -> str: 16 | pass 17 | 18 | def load_ami_id(self) -> str: 19 | pass 20 | 21 | 22 | class AwsInstanceUserDataLoader(ConfigLoader): 23 | def load_user_data(self): 24 | return yaml.load(requests.get('http://169.254.169.254/latest/user-data').text, Loader=yaml.FullLoader) 25 | 26 | def load_region(self) -> str: 27 | return requests.get('http://169.254.169.254/latest/meta-data/placement/region').text 28 | 29 | def load_ami_id(self) -> str: 30 | return requests.get('http://169.254.169.254/latest/meta-data/ami-id').text 31 | 32 | 33 | class ClusterConfig(object): 34 | 35 | def __init__(self, config_loader: ConfigLoader): 36 | self._user_data = config_loader.load_user_data() 37 | self._env_vars = self._user_data.get('environment') 38 | self._aws_region = config_loader.load_region() 39 | self._ami_id = config_loader.load_ami_id() 40 | self._overrides = {} 41 | 42 | def get_cluster_name(self): 43 | return self._env_vars.get('CLUSTER_NAME') 44 | 45 | def get_aws_region(self): 46 | return self._aws_region 47 | 48 | def get_instance_type(self): 49 | return self._user_data.get('instance_type') 50 | 51 | def get_ami_id(self): 52 | return self._ami_id 53 | 54 | def get_vpc_id(self): 55 | return 
self._user_data.get('vpc_id') 56 | 57 | def get_tags(self): 58 | return self._user_data.get('tags', []) 59 | 60 | def get_user_data(self): 61 | return dict(self._user_data) 62 | 63 | def get_overrides(self): 64 | return self._overrides 65 | 66 | def set_overrides(self, **overrides): 67 | self._overrides = overrides 68 | 69 | if overrides.get('application_version'): 70 | self._user_data['application_version'] = overrides['application_version'] 71 | self._user_data['source'] = '{}:{}'.format( 72 | self._user_data['source'].split(':', 1)[0], overrides['application_version']) 73 | if overrides.get('instance_type'): 74 | self._user_data['instance_type'] = overrides['instance_type'] 75 | if overrides.get('scalyr_account_key'): 76 | self._user_data['scalyr_account_key'] = overrides['scalyr_account_key'] 77 | if overrides.get('scalyr_region'): 78 | self._user_data['scalyr_region'] = overrides['scalyr_region'] 79 | if overrides.get('kms_key_id'): 80 | self._user_data['kms_key_id'] = overrides['kms_key_id'] 81 | if overrides.get('ami_id'): 82 | self._ami_id = overrides['ami_id'] 83 | 84 | for k, v in overrides.items(): 85 | if k not in ('application_version', 'instance_type', 'scalyr_account_key', 'scalyr_region', 'kms_key_id'): 86 | _LOG.warning("Unsupported argument %s with value %s", k, v) 87 | 88 | @staticmethod 89 | def create_overrides_dict( 90 | application_version: str = None, 91 | instance_type: str = None, 92 | scalyr_account_key: str = None, 93 | scalyr_region: str = None, 94 | kms_key_id: str = None, 95 | ami_id: str = None): 96 | 97 | # Pack arguments by the name passed to the method, in case if value for the argument is set 98 | def _filter_out_empty(**kwargs) -> dict: 99 | return {k: v for k, v in kwargs.items() if v} 100 | 101 | return _filter_out_empty( 102 | application_version=application_version, 103 | instance_type=instance_type, 104 | scalyr_account_key=scalyr_account_key, 105 | scalyr_region=scalyr_region, 106 | kms_key_id=kms_key_id, 107 | ami_id=ami_id 108 | ) 109 | -------------------------------------------------------------------------------- /bubuku/aws/ec2_node_launcher.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import logging 3 | import time 4 | import yaml 5 | 6 | from bubuku.aws import AWSResources 7 | from bubuku.aws.cluster_config import ClusterConfig 8 | from bubuku.aws.ip_address_allocator import IpAddressAllocator 9 | from bubuku.aws.node import KAFKA_LOGS_EBS 10 | 11 | _LOG = logging.getLogger('bubuku.aws.ec2_node') 12 | 13 | 14 | class Ec2NodeLauncher(object): 15 | def __init__(self, aws: AWSResources, cluster_config: ClusterConfig, az: str): 16 | self._aws = aws 17 | self._cluster_config = cluster_config 18 | self._az = az 19 | 20 | def _launch_instance(self, ip: str, subnet: dict, ami: object, security_group_id: str, iam_profile): 21 | # 22 | # Override any ephemeral volumes with NoDevice mapping, 23 | # otherwise auto-recovery alarm cannot be actually enabled. 24 | # 25 | _LOG.info('Overriding ephemeral volumes to be able to set up AWS auto recovery alarm ') 26 | block_devices = [] 27 | for bd in ami.block_device_mappings: 28 | if 'Ebs' in bd: 29 | # 30 | # This has to be our root EBS. 31 | # 32 | # If the Encrypted flag is present, we have to delete 33 | # it even if it matches the actual snapshot setting, 34 | # otherwise amazon will complain rather loudly. 
35 | # 36 | # Take a deep copy before deleting the key: 37 | # 38 | bd = copy.deepcopy(bd) 39 | 40 | root_ebs = bd['Ebs'] 41 | if 'Encrypted' in root_ebs: 42 | del (root_ebs['Encrypted']) 43 | 44 | block_devices.append(bd) 45 | else: 46 | # ignore any ephemeral volumes (aka. instance storage) 47 | block_devices.append({ 48 | 'DeviceName': bd['DeviceName'], 49 | 'NoDevice': '' 50 | }) 51 | 52 | user_data = self._cluster_config.get_user_data() 53 | user_data['volumes']['ebs']['/dev/xvdk'] = KAFKA_LOGS_EBS 54 | taupage_user_data = '#taupage-ami-config\n{}'.format(yaml.safe_dump(user_data)) 55 | 56 | _LOG.info('Launching node %s in %s', ip, subnet['AvailabilityZone']) 57 | resp = self._aws.ec2_client.run_instances( 58 | ImageId=ami.id, 59 | MinCount=1, 60 | MaxCount=1, 61 | SecurityGroupIds=[security_group_id], 62 | UserData=taupage_user_data, 63 | InstanceType=self._cluster_config.get_instance_type(), 64 | SubnetId=subnet['SubnetId'], 65 | PrivateIpAddress=ip, 66 | BlockDeviceMappings=block_devices, 67 | IamInstanceProfile={'Arn': iam_profile['Arn']}, 68 | DisableApiTermination=False, 69 | EbsOptimized=True) 70 | 71 | instance_id = resp['Instances'][0]['InstanceId'] 72 | _LOG.info('Instance %s launched', instance_id) 73 | 74 | attempts = 2 75 | while True: 76 | try: 77 | self._aws.ec2_client.create_tags( 78 | Resources=[instance_id], 79 | Tags=(self._cluster_config.get_tags() + [ 80 | {'Key': 'Name', 'Value': self._cluster_config.get_cluster_name()}, 81 | {'Key': 'StackName', 'Value': self._cluster_config.get_cluster_name()} 82 | ]) 83 | ) 84 | break 85 | 86 | except Exception as e: 87 | attempts -= 1 88 | if attempts == 0: 89 | raise e 90 | _LOG.error('Failed to create instance tags, will retry...', exc_info=e) 91 | time.sleep(5) 92 | 93 | return instance_id 94 | 95 | def create_auto_recovery_alarm(self, instance_id): 96 | _LOG.info('Creating AWS auto recovery alarm for %s', instance_id) 97 | alarm_actions = ['arn:aws:automate:{}:ec2:recover'.format(self._cluster_config.get_aws_region())] 98 | alarm_name = '{}-{}-auto-recover'.format(self._cluster_config.get_cluster_name(), instance_id) 99 | 100 | self._aws.cloudwatch_client.put_metric_alarm( 101 | AlarmName=alarm_name, 102 | AlarmActions=alarm_actions, 103 | MetricName='StatusCheckFailed_System', 104 | Namespace='AWS/EC2', 105 | Statistic='Minimum', 106 | Dimensions=[{ 107 | 'Name': 'InstanceId', 108 | 'Value': instance_id 109 | }], 110 | Period=60, # 1 minute 111 | EvaluationPeriods=2, 112 | Threshold=0, 113 | ComparisonOperator='GreaterThanThreshold') 114 | _LOG.info('Created alarm %s', alarm_name) 115 | 116 | def launch(self): 117 | _LOG.info('Preparing AWS configuration for ec2 instance creation') 118 | ip_address_allocator = IpAddressAllocator(self._aws, self._cluster_config.get_vpc_id(), self._az) 119 | subnet, ip = ip_address_allocator.allocate_ip_addresses(1)[0] 120 | return self._launch_instance(ip, 121 | subnet, 122 | self._find_ami(), 123 | self._get_security_group_id(), 124 | self._get_instance_profile()) 125 | 126 | def _get_instance_profile(self): 127 | profile_name = 'profile-{}'.format(self._cluster_config.get_cluster_name()) 128 | profile = self._aws.iam_client.get_instance_profile(InstanceProfileName=profile_name) 129 | _LOG.info("IAM profile %s exists, using it", profile_name) 130 | return profile['InstanceProfile'] 131 | 132 | def _get_security_group_id(self) -> str: 133 | _LOG.info('Configuring security group ...') 134 | security_groups = self._aws.ec2_client.describe_security_groups( 135 | Filters=[{'Name': 
'group-name', 'Values': [self._cluster_config.get_cluster_name()]}]) 136 | if security_groups['SecurityGroups']: 137 | sg = security_groups['SecurityGroups'][0] 138 | _LOG.info('Security group for %s exists, will use it %s', 139 | self._cluster_config.get_cluster_name(), sg['GroupId']) 140 | return sg['GroupId'] 141 | raise Exception('Security group does not exist for {}'.format(self._cluster_config.get_cluster_name())) 142 | 143 | def _get_ip_permission(self, port: int): 144 | return { 145 | 'IpProtocol': 'tcp', 146 | 'FromPort': port, 147 | 'ToPort': port, 148 | 'IpRanges': [{'CidrIp': '0.0.0.0/0'}] 149 | } 150 | 151 | def _find_ami(self) -> dict: 152 | _LOG.info('Finding latest Taupage AMI.') 153 | filters = [{'Name': 'image-id', 'Values': [self._cluster_config.get_ami_id()]}] 154 | images = list(self._aws.ec2_resource.images.filter(Filters=filters)) 155 | if not images: 156 | raise Exception('No Taupage AMI found') 157 | most_recent_image = sorted(images, key=lambda i: i.name)[-1] # It s expected that image is only one 158 | 159 | _LOG.info('The AMI to use is %s', most_recent_image) 160 | 161 | return most_recent_image 162 | -------------------------------------------------------------------------------- /bubuku/aws/ip_address_allocator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import netaddr 4 | 5 | from bubuku.aws import AWSResources 6 | 7 | _LOG = logging.getLogger('bubuku.cluster.aws.subnet') 8 | 9 | 10 | class IpAddressAllocator(object): 11 | def __init__(self, aws: AWSResources, vpc_id: str, az: str): 12 | self.aws = aws 13 | self._vpc_id = vpc_id 14 | self._az = az 15 | 16 | def _get_subnets(self, prefix_filter: str) -> list: 17 | """ 18 | Returns lists of subnets, which names start 19 | with the specified prefix (it should be either 'dmz-' or 20 | 'internal-'), sorted by the Availability Zone and filtered by vpc id 21 | """ 22 | _LOG.info('Getting subnets for vpc_id: %s and availability_zone: %s', self._vpc_id, self._az) 23 | 24 | resp = self.aws.ec2_client.describe_subnets() 25 | subnets = [] 26 | 27 | for subnet in resp['Subnets']: 28 | if subnet['VpcId'] != self._vpc_id: 29 | continue 30 | if subnet['AvailabilityZone'] != self._az: 31 | continue 32 | for tag in subnet['Tags']: 33 | if tag['Key'] == 'Name' and tag['Value'].startswith(prefix_filter): 34 | subnets.append(subnet) 35 | break 36 | _LOG.info('Got subnets %s ', subnets) 37 | return subnets 38 | 39 | def allocate_ip_addresses(self, address_count: int) -> list: 40 | """ 41 | Allocate unused private IP addresses by checking the current 42 | reservations 43 | Return list of tuples (subnet, ip) 44 | """ 45 | _LOG.info('Allocating IP addresses ...') 46 | 47 | def try_next_address(ips, subnet): 48 | try: 49 | return str(next(ips)) 50 | except StopIteration: 51 | raise Exception('Out of available IP addresses in subnet {}'.format(subnet['CidrBlock'])) 52 | 53 | # 54 | # Here we have to account for the behavior of launch_*_nodes 55 | # which iterate through subnets to put the instances into 56 | # different Availability Zones. 57 | # 58 | subnets = self._get_subnets('internal-') 59 | network_ips = [netaddr.IPNetwork(s['CidrBlock']).iter_hosts() for s in subnets] 60 | 61 | for idx, ips in enumerate(network_ips): 62 | # 63 | # Some of the first addresses in each subnet are 64 | # taken by AWS system instances that we can't see, 65 | # so we try to skip them. 
66 | # 67 | for _ in range(10): 68 | try_next_address(ips, subnets[idx]) 69 | 70 | i = 0 71 | result_subnets_ips = [] 72 | while i < address_count: 73 | idx = i % len(subnets) 74 | subnet = subnets[idx] 75 | ip = try_next_address(network_ips[idx], subnet) 76 | resp = self.aws.ec2_client.describe_instances(Filters=[{ 77 | 'Name': 'private-ip-address', 78 | 'Values': [ip] 79 | }]) 80 | if not resp['Reservations']: 81 | i += 1 82 | _LOG.info('Got ip address %s ', ip) 83 | result_subnets_ips.append((subnet, ip)) 84 | 85 | _LOG.info('IP Addresses are allocated') 86 | 87 | return result_subnets_ips 88 | -------------------------------------------------------------------------------- /bubuku/aws/node.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from bubuku.aws import AWSResources 4 | from bubuku.aws.cluster_config import ClusterConfig 5 | 6 | _LOG = logging.getLogger('bubuku.aws.node') 7 | 8 | KAFKA_LOGS_EBS = 'kafka-logs-ebs' 9 | 10 | 11 | class Ec2Node(object): 12 | def __init__(self, aws: AWSResources, cluster_config: ClusterConfig, ip: str): 13 | self.aws = aws 14 | self.cluster_config = cluster_config 15 | self.ip = ip 16 | self.instance = self._get_instance_by_ip() 17 | _LOG.info('Searching for instance %s volumes', self.instance.instance_id) 18 | volumes = self.aws.ec2_client.describe_instance_attribute(InstanceId=self.instance.instance_id, 19 | Attribute='blockDeviceMapping') 20 | data_volume = next(v for v in volumes['BlockDeviceMappings'] if v['DeviceName'] == '/dev/xvdk') 21 | data_volume_id = data_volume['Ebs']['VolumeId'] 22 | self.volume = self.aws.ec2_resource.Volume(data_volume_id) 23 | 24 | def get_node_availability_zone(self): 25 | return self.volume.availability_zone 26 | 27 | def get_volume_id(self): 28 | return self.volume.id 29 | 30 | def get_ip(self): 31 | return self.ip 32 | 33 | def is_volume_in_use(self): 34 | self.volume.load() 35 | if self.volume.state == 'in-use': 36 | _LOG.info('Volume %s is attached. Clearing tag:Name', self.volume) 37 | self.volume.create_tags(Tags=[{'Key': 'Name', 'Value': ''}]) 38 | _LOG.info('Completed clearing tag:Name for %s', self.volume) 39 | return True 40 | return False 41 | 42 | def is_volume_available(self): 43 | self.volume.load() 44 | return self.volume.state == 'available' 45 | 46 | def detach_volume(self): 47 | self.volume.create_tags(Tags=[{'Key': 'Name', 'Value': KAFKA_LOGS_EBS}]) 48 | _LOG.info('Detaching %s from %s', self.volume.id, self.instance.instance_id) 49 | self.aws.ec2_client.detach_volume(VolumeId=self.volume.id, Force=False) 50 | 51 | def terminate(self): 52 | cluster_name = self.cluster_config.get_cluster_name() 53 | _LOG.info('Terminating %s in %s', self.instance, cluster_name) 54 | alarm_name = '{}-{}-auto-recover'.format(cluster_name, self.instance.instance_id) 55 | _LOG.info('Deleting alarm %s in %s for %s', alarm_name, cluster_name, self.instance.instance_id) 56 | self.aws.cloudwatch_client.delete_alarms(AlarmNames=[alarm_name]) 57 | self.instance.terminate() 58 | 59 | def is_terminated(self): 60 | self.instance.load() 61 | _LOG.info('Instance state is %s. 
Waiting ...', self.instance.state['Name']) 62 | if self.instance.state['Name'] == 'terminated': 63 | _LOG.info('%s is successfully terminated', self.instance) 64 | return True 65 | return False 66 | 67 | def _get_instance_by_ip(self): 68 | instances = self.aws.ec2_resource.instances.filter(Filters=[ 69 | {'Name': 'instance-state-name', 'Values': ['running', 'pending']}, 70 | {'Name': 'network-interface.addresses.private-ip-address', 'Values': [self.ip]}, 71 | {'Name': 'tag:Name', 'Values': [self.cluster_config.get_cluster_name()]}]) 72 | instances = list(instances) 73 | if not instances: 74 | raise Exception('Instance by ip {} not found in cluster {}' 75 | .format(self.ip, self.cluster_config.get_cluster_name())) 76 | _LOG.info('Found %s by ip %s', instances[0], self.ip) 77 | return instances[0] 78 | -------------------------------------------------------------------------------- /bubuku/broker.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from time import time, sleep 4 | 5 | from bubuku.config import KafkaProperties 6 | from bubuku.id_extractor import BrokerIdExtractor 7 | from bubuku.process import KafkaProcess 8 | from bubuku.zookeeper import BukuExhibitor 9 | 10 | _LOG = logging.getLogger('bubuku.broker') 11 | 12 | 13 | class LeaderElectionInProgress(Exception): 14 | pass 15 | 16 | 17 | class StartupTimeout(object): 18 | def is_timed_out(self, seconds: float) -> bool: 19 | raise Exception('Not supported') 20 | 21 | def on_timeout_fail(self): 22 | raise Exception('Not supported') 23 | 24 | @staticmethod 25 | def build(props: dict): 26 | type_ = props.get('type', 'linear') 27 | if type_ == 'linear': 28 | return LinearTimeout(float(props.get('initial', 300)), float(props.get('step', '60'))) 29 | elif type_ == 'progressive': 30 | return ProgressiveTimeout(float(props.get('initial', 300)), float(props.get('step', '0.5'))) 31 | elif type_ == 'none': 32 | return NoTimeout() 33 | else: 34 | raise NotImplementedError('Startup timeout type {} is not valid'.format(type_)) 35 | 36 | 37 | class ProgressiveTimeout(StartupTimeout): 38 | def __init__(self, initial: float, scale: float): 39 | self.initial = initial 40 | self.timeout = initial 41 | self.scale = scale 42 | 43 | def is_timed_out(self, seconds: float) -> bool: 44 | return seconds > self.timeout 45 | 46 | def on_timeout_fail(self): 47 | self.timeout = self.timeout * (1 + self.scale) 48 | 49 | def __str__(self): 50 | return 'Progressive, initial={}, scale={}, current={}'.format(self.initial, self.scale, self.timeout) 51 | 52 | 53 | class LinearTimeout(StartupTimeout): 54 | def __init__(self, initial: float, step: float): 55 | self.initial = initial 56 | self.timeout = initial 57 | self.step = step 58 | 59 | def is_timed_out(self, seconds: float) -> bool: 60 | return seconds > self.timeout 61 | 62 | def on_timeout_fail(self): 63 | self.timeout += self.step 64 | 65 | def __str__(self): 66 | return 'Linear, initial={}, step={}, current={}'.format(self.initial, self.step, self.timeout) 67 | 68 | 69 | class NoTimeout(StartupTimeout): 70 | def is_timed_out(self, seconds: float) -> bool: 71 | return False 72 | 73 | def on_timeout_fail(self): 74 | pass 75 | 76 | 77 | class BrokerManager(object): 78 | def __init__(self, process: KafkaProcess, exhibitor: BukuExhibitor, 79 | id_manager: BrokerIdExtractor, kafka_properties: KafkaProperties, timeout: StartupTimeout): 80 | self.id_manager = id_manager 81 | self.exhibitor = exhibitor 82 | self.kafka_properties = kafka_properties 
83 | self.process = process 84 | self.timeout = timeout 85 | 86 | def is_running(self): 87 | return self.process.is_running() 88 | 89 | def is_registered_in_zookeeper(self): 90 | return self.id_manager.is_registered() 91 | 92 | def get_zookeeper_session_timeout(self): 93 | return int(self.kafka_properties.get_property( 94 | "zookeeper.connection.timeout.ms")or 6000) 95 | 96 | def is_running_and_registered(self): 97 | if not self.process.is_running(): 98 | return False 99 | return self.id_manager.is_registered() 100 | 101 | def stop_kafka_process(self): 102 | if self.process.is_running(): 103 | self.process.stop_and_wait() 104 | self._wait_for_zk_absence() 105 | 106 | def _is_clean_election(self): 107 | value = self.kafka_properties.get_property('unclean.leader.election.enable') 108 | return value == 'false' 109 | 110 | def has_leadership(self): 111 | """ 112 | Says if this broker is still a leader for partitions 113 | :return: True, if broker is a leader for some partitions. 114 | """ 115 | broker_id = self.id_manager.get_broker_id() 116 | if not broker_id: 117 | return False 118 | return not self._is_leadership_transferred(dead_broker_ids=[broker_id]) 119 | 120 | def _wait_for_zk_absence(self): 121 | try: 122 | while self.id_manager.is_registered(): 123 | sleep(1) 124 | except Exception as e: 125 | _LOG.error('Failed to wait until broker id absence in zk', exc_info=e) 126 | 127 | def get_zk_connect_string(self): 128 | return self.kafka_properties.get_property('zookeeper.connect') 129 | 130 | def start_kafka_process(self, zookeeper_address): 131 | """ 132 | Starts kafka using zookeeper address provided. 133 | :param zookeeper_address: Address to use for kafka 134 | :raise LeaderElectionInProgress: raised when broker can not be started because leader election is in progress 135 | """ 136 | if not self.process.is_running(): 137 | if not self._is_leadership_transferred(active_broker_ids=self.exhibitor.get_broker_ids()): 138 | raise LeaderElectionInProgress() 139 | 140 | _LOG.info('Using ZK address: {}'.format(zookeeper_address)) 141 | self.kafka_properties.set_property('zookeeper.connect', zookeeper_address) 142 | 143 | self.kafka_properties.dump() 144 | 145 | _LOG.info('Staring kafka process') 146 | self.process.start(self.kafka_properties.settings_file) 147 | 148 | _LOG.info('Waiting for kafka to start with timeout {}'.format(self.timeout)) 149 | start = time() 150 | while self.process.is_running(): 151 | if self.id_manager.is_registered(): 152 | break 153 | if self.timeout.is_timed_out(time() - start): 154 | self.timeout.on_timeout_fail() 155 | break 156 | sleep(1) 157 | if not self.process.is_running() or not self.id_manager.is_registered(): 158 | _LOG.error( 159 | 'Failed to wait for broker to start up, probably will kill, next timeout is'.format(self.timeout)) 160 | 161 | def _is_leadership_transferred(self, active_broker_ids=None, dead_broker_ids=None): 162 | _LOG.info('Checking if leadership is transferred: active_broker_ids={}, dead_broker_ids={}'.format( 163 | active_broker_ids, dead_broker_ids)) 164 | if self._is_clean_election(): 165 | topics = self.exhibitor.load_active_topics() 166 | for topic, partition, state in self.exhibitor.load_partition_states(topics=topics): 167 | leader = str(state['leader']) 168 | if active_broker_ids and leader not in active_broker_ids: 169 | if any(str(x) in active_broker_ids for x in state.get('isr', [])): 170 | _LOG.warning( 171 | 'Leadership is not transferred for {} {} ({}, brokers: {})'.format( 172 | topic, partition, json.dumps(state), 
active_broker_ids)) 173 | return False 174 | else: 175 | _LOG.warning('No single isr available for {}, {}, state: {}, skipping check for that'.format( 176 | topic, partition, json.dumps(state))) 177 | if dead_broker_ids and leader in dead_broker_ids: 178 | _LOG.warning('Leadership is not transferred for {} {}, {} (dead list: {})'.format( 179 | topic, partition, json.dumps(state), dead_broker_ids)) 180 | return False 181 | 182 | return True 183 | -------------------------------------------------------------------------------- /bubuku/communicate.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import threading 3 | import time 4 | from queue import Queue, Empty, Full 5 | 6 | __COMMAND_QUEUE = Queue() 7 | 8 | _LOG = logging.getLogger('bubuku.communicate') 9 | 10 | 11 | def sleep_and_operate(controller, timeout: float): 12 | cur_time = time.time() 13 | finish = cur_time + (0.1 if timeout <= 0 else timeout) 14 | while cur_time < finish: 15 | try: 16 | command = __COMMAND_QUEUE.get(block=True, timeout=finish - cur_time) 17 | try: 18 | command(controller) 19 | except Exception as e: 20 | _LOG.error('Command finished with error', exc_info=e) 21 | except Empty: 22 | pass 23 | cur_time = time.time() 24 | 25 | 26 | def execute_on_controller_thread(function, timeout): 27 | condition = threading.Condition() 28 | result = [None, True] 29 | 30 | def _execute(controller): 31 | with condition: 32 | if result[1]: 33 | try: 34 | result[0] = function(controller) 35 | finally: 36 | condition.notify() 37 | 38 | finish = time.time() + timeout 39 | with condition: 40 | try: 41 | __COMMAND_QUEUE.put(_execute, timeout=timeout) 42 | except Full: 43 | raise TimeoutError('Timeout expired') 44 | if condition.wait(timeout=finish - time.time()): 45 | return result[0] 46 | else: 47 | result[1] = False 48 | raise TimeoutError('Timeout expired') 49 | -------------------------------------------------------------------------------- /bubuku/config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from collections import namedtuple 4 | 5 | _LOG = logging.getLogger('bubuku.properties') 6 | 7 | Config = namedtuple('Config', ('kafka_dir', 'kafka_settings_template', 'zk_stack_name', 8 | 'zk_prefix', 'features', 'health_port', 'mode', 'timeout', 'zk_static_ips')) 9 | 10 | 11 | class KafkaProperties(object): 12 | def __init__(self, template: str, kafka_settings: str): 13 | self.lines = [] 14 | self.settings_file = kafka_settings 15 | _LOG.info('Loading template properties from {}'.format(template)) 16 | with open(template, 'r') as f: 17 | for l in f.readlines(): 18 | self.lines.append(_make_clean_line(l)) 19 | 20 | def get_property(self, name: str) -> str: 21 | idx = self._get_property_idx(name) 22 | if idx is not None: 23 | return self.lines[idx].split('=', 1)[1] 24 | return None 25 | 26 | def _get_property_idx(self, name: str): 27 | search = '{}='.format(name) 28 | for idx in range(0, len(self.lines)): 29 | if self.lines[idx].startswith(search): 30 | return idx 31 | return None 32 | 33 | def delete_property(self, name): 34 | idx = self._get_property_idx(name) 35 | if idx is not None: 36 | del self.lines[idx] 37 | 38 | def set_property(self, name, value): 39 | idx = self._get_property_idx(name) 40 | if idx is not None: 41 | self.lines[idx] = '{}={}'.format(name, value) 42 | else: 43 | self.lines.append('{}={}'.format(name, value)) 44 | 45 | def dump(self): 46 | _LOG.info('Dumping kafka properties to 
{}'.format(self.settings_file)) 47 | with open(self.settings_file, mode='w') as f: 48 | for l in self.lines: 49 | f.write('{}\n'.format(l)) 50 | 51 | 52 | def _load_timeout_dict(load_func): 53 | startup_timeout_pairs = [(name, load_func('STARTUP_TIMEOUT_{}'.format(name.upper()))) for name in 54 | ['type', 'initial', 'step']] 55 | return {name: value for name, value in startup_timeout_pairs if value} 56 | 57 | 58 | def load_config() -> Config: 59 | zk_prefix = os.getenv('ZOOKEEPER_PREFIX', '/') 60 | 61 | features_str = os.getenv('BUKU_FEATURES', '').lower() 62 | features = {key: {} for key in features_str.split(',')} if features_str else {} 63 | if "balance_data_size" in features: 64 | features["balance_data_size"]["diff_threshold_mb"] = int(os.getenv('FREE_SPACE_DIFF_THRESHOLD_MB', '50000')) 65 | return Config( 66 | kafka_dir=os.getenv('KAFKA_DIR'), 67 | kafka_settings_template=os.getenv('KAFKA_SETTINGS'), 68 | zk_stack_name=os.getenv('ZOOKEEPER_STACK_NAME'), 69 | zk_static_ips=os.getenv('ZOOKEEPER_STATIC_IPS_PORT'), 70 | zk_prefix=zk_prefix if zk_prefix.startswith('/') or not zk_prefix else '/{}'.format(zk_prefix), 71 | features=features, 72 | health_port=int(os.getenv('HEALTH_PORT', '8888')), 73 | mode=str(os.getenv('BUBUKU_MODE', 'amazon')).lower(), 74 | timeout=_load_timeout_dict(os.getenv) 75 | ) 76 | 77 | 78 | def _make_clean_line(l: str) -> str: 79 | result = l.strip() 80 | if result.startswith('#') or not result: 81 | return result 82 | if '=' not in result: 83 | return '' 84 | n, v = result.split('=', 1) 85 | return '{}={}'.format(n.strip(), v) 86 | -------------------------------------------------------------------------------- /bubuku/controller.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from time import time 3 | from typing import Tuple, Optional 4 | 5 | from bubuku.broker import BrokerManager 6 | from bubuku.communicate import sleep_and_operate 7 | from bubuku.env_provider import EnvProvider 8 | from bubuku.zookeeper import BukuExhibitor 9 | 10 | _LOG = logging.getLogger('bubuku.controller') 11 | 12 | 13 | class Change(object): 14 | def get_name(self) -> str: 15 | raise NotImplementedError('Not implemented yet') 16 | 17 | def can_run(self, current_actions) -> bool: 18 | raise NotImplementedError('Not implemented yet') 19 | 20 | # 21 | # Returns a flag indicating if the change should continue running (True). 22 | # In that case time_till_next_run() is called to determine when to schedule the next run. 
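    # Returning False means the change has completed: the controller then removes it from the
    # queue and calls on_remove() on it (see Controller._run_changes and _release_changes_lock below).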
23 | # 24 | def run(self, current_actions) -> bool: 25 | raise NotImplementedError('Not implemented') 26 | 27 | def time_till_next_run(self) -> float: 28 | return 0.5 29 | 30 | def can_run_at_exit(self) -> bool: 31 | return False 32 | 33 | def on_remove(self): 34 | pass 35 | 36 | 37 | class Check(object): 38 | def __init__(self, check_interval_s=5): 39 | self.check_interval_s = check_interval_s 40 | self.__last_check_timestamp_s = 0 41 | 42 | def check_if_time(self) -> Change: 43 | if self.time_till_check() <= 0: 44 | self.__last_check_timestamp_s = time() 45 | _LOG.info('Executing check {}'.format(self)) 46 | return self.check() 47 | return None 48 | 49 | def time_till_check(self): 50 | return self.__last_check_timestamp_s + self.check_interval_s - time() 51 | 52 | def check(self) -> Change: 53 | raise NotImplementedError('Not implemented') 54 | 55 | 56 | def _exclude_self(provider_id, name, running_actions): 57 | return [k for k, v in running_actions.items() if k != name or v != provider_id] 58 | 59 | 60 | class Controller(object): 61 | def __init__(self, broker_manager: BrokerManager, zk: BukuExhibitor, env_provider: EnvProvider): 62 | self.broker_manager = broker_manager 63 | self.zk = zk 64 | self.env_provider = env_provider 65 | self.checks = [] 66 | self.changes = {} # Holds mapping from change name to array of pending changes 67 | self.running = True 68 | self.provider_id = None # provider id must not be requested on initialization 69 | 70 | def enumerate_changes(self): 71 | with self.zk.lock(self.provider_id): 72 | running_changes = self.zk.get_running_changes() 73 | 74 | result = [] 75 | for name, change_list in self.changes.items(): 76 | running = running_changes.get(name) == self.provider_id 77 | first = True 78 | for change in change_list: 79 | result.append({ 80 | 'type': name, 81 | 'description': str(change), 82 | 'running': bool(first and running) 83 | }) 84 | first = False 85 | return result 86 | 87 | def cancel_changes(self, name): 88 | result = len(self.changes.get(name, {})) 89 | if result: 90 | if name in self.zk.get_running_changes(): 91 | for change in self.changes[name]: 92 | change.on_remove() 93 | with self.zk.lock(self.provider_id): 94 | self.zk.unregister_change(name) 95 | del self.changes[name] 96 | return result 97 | 98 | def add_check(self, check): 99 | _LOG.info('Adding check {}'.format(str(check))) 100 | self.checks.append(check) 101 | 102 | def _register_running_changes(self) -> dict: 103 | if not self.changes: 104 | return {} # Do not take lock if there are no changes to register 105 | _LOG.debug('Taking lock for processing') 106 | with self.zk.lock(self.provider_id): 107 | _LOG.debug('Lock is taken') 108 | # Get list of current running changes 109 | running_changes = self.zk.get_running_changes() 110 | if running_changes: 111 | _LOG.info("Running changes: {}".format(running_changes)) 112 | # Register changes to run 113 | for name, change_list in self.changes.items(): 114 | # Only first change is able to run 115 | first_change = change_list[0] 116 | if first_change.can_run(_exclude_self(self.provider_id, name, running_changes)): 117 | if name not in running_changes: 118 | self.zk.register_change(name, self.provider_id) 119 | running_changes[name] = self.provider_id 120 | else: 121 | _LOG.info('Change {} is waiting for others: {}'.format(name, running_changes)) 122 | return running_changes 123 | 124 | def _run_changes(self, running_changes: dict) -> Tuple[list, Optional[float]]: 125 | changes_to_remove = [] 126 | min_time_till_next_change_run = None 127 | 
for name, change_list in self.changes.copy().items(): 128 | if name in running_changes and running_changes[name] == self.provider_id: 129 | change = change_list[0] 130 | _LOG.info('Executing action {} step'.format(change)) 131 | if self.running or change.can_run_at_exit(): 132 | try: 133 | if not change.run(_exclude_self(self.provider_id, change.get_name(), running_changes)): 134 | _LOG.info('Action {} completed'.format(change)) 135 | changes_to_remove.append(change.get_name()) 136 | else: 137 | _LOG.info('Action {} will be executed on next loop step'.format(change)) 138 | time_till_next_run = change.time_till_next_run() 139 | if min_time_till_next_change_run is None: 140 | min_time_till_next_change_run = time_till_next_run 141 | else: 142 | min_time_till_next_change_run = min(time_till_next_run, min_time_till_next_change_run) 143 | except Exception as e: 144 | _LOG.error('Failed to execute change {} because of exception, removing'.format(change), 145 | exc_info=e) 146 | changes_to_remove.append(change.get_name()) 147 | else: 148 | _LOG.info( 149 | 'Action {} can not be run while stopping, forcing to stop it'.format(change)) 150 | changes_to_remove.append(change.get_name()) 151 | return changes_to_remove, min_time_till_next_change_run 152 | 153 | def _release_changes_lock(self, changes_to_remove): 154 | if changes_to_remove: 155 | for change_name in changes_to_remove: 156 | removed_change = self.changes[change_name][0] 157 | del self.changes[change_name][0] 158 | if not self.changes[change_name]: 159 | del self.changes[change_name] 160 | removed_change.on_remove() 161 | with self.zk.lock(): 162 | for name in changes_to_remove: 163 | self.zk.unregister_change(name) 164 | 165 | def loop(self, change_on_init=None): 166 | self.provider_id = self.env_provider.get_ip() 167 | if change_on_init: 168 | self._add_change_to_queue(change_on_init) 169 | while self.running or self.changes: 170 | time_till_next_step = self.make_step() 171 | 172 | timeouts = [check.time_till_check() for check in self.checks] 173 | timeouts.append(time_till_next_step or 5.0) 174 | 175 | sleep_and_operate(self, min(timeouts)) 176 | 177 | def make_step(self) -> Optional[float]: 178 | # register running changes 179 | running_changes = self._register_running_changes() 180 | 181 | # apply changes without holding lock 182 | changes_to_remove, time_till_next_run = self._run_changes(running_changes) 183 | 184 | # remove processed actions 185 | self._release_changes_lock(changes_to_remove) 186 | 187 | if self.running: 188 | for check in self.checks: 189 | change = check.check_if_time() 190 | if change: 191 | self._add_change_to_queue(change) 192 | # prioritize newly appearing change run 193 | time_till_next_run = 0.5 194 | 195 | return time_till_next_run 196 | 197 | def _add_change_to_queue(self, change): 198 | _LOG.info('Adding change {} to pending changes'.format(change.get_name())) 199 | if change.get_name() not in self.changes: 200 | self.changes[change.get_name()] = [] 201 | self.changes[change.get_name()].append(change) 202 | 203 | def stop(self, change: Change): 204 | _LOG.info('Stopping controller with additional change: {}'.format(change.get_name() if change else None)) 205 | # clear all pending changes 206 | if change: 207 | self._add_change_to_queue(change) 208 | self.running = False 209 | -------------------------------------------------------------------------------- /bubuku/controller_api.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import 
threading 4 | from functools import partial 5 | from http.server import BaseHTTPRequestHandler, HTTPServer 6 | 7 | from bubuku.communicate import execute_on_controller_thread 8 | from bubuku.controller import Controller 9 | from bubuku.env_provider import EnvProvider 10 | from bubuku.utils import CmdHelper 11 | from bubuku.features.metric_collector import MetricCollector 12 | from bubuku.config import load_config 13 | from bubuku.zookeeper import load_exhibitor_proxy 14 | 15 | _CONTROLLER_TIMEOUT = 5 16 | 17 | _API_CONTROLLER = '/api/controller/' 18 | 19 | _LOG = logging.getLogger('bubuku.health') 20 | 21 | 22 | def load_controller_queue(controller: Controller): 23 | return controller.enumerate_changes() 24 | 25 | 26 | def delete_from_controller_queue(name: str, controller: Controller): 27 | return { 28 | 'count': controller.cancel_changes(name) 29 | } 30 | 31 | 32 | class _Handler(BaseHTTPRequestHandler): 33 | cmd_helper = None 34 | 35 | def do_GET(self): 36 | if self.path in ('/api/disk_stats', '/api/disk_stats/'): 37 | used_kb, free_kb = self.cmd_helper.get_disk_stats() 38 | self._send_response({'free_kb': free_kb, 'used_kb': used_kb}) 39 | elif self.path.startswith(_API_CONTROLLER): 40 | self.wrap_controller_execution(lambda: self._run_controller_action(self.path[len(_API_CONTROLLER):])) 41 | elif self.path in ('/api/metrics', '/api/metrics/'): 42 | config = load_config() 43 | env_provider = EnvProvider.create_env_provider(config) 44 | with load_exhibitor_proxy(env_provider.get_address_provider(), config.zk_prefix) as zookeeper: 45 | metrics = MetricCollector(zookeeper).get_metrics_from_brokers() 46 | self._send_response({'metrics': metrics}) 47 | else: 48 | self._send_response({'status': 'OK'}) 49 | 50 | def wrap_controller_execution(self, call): 51 | try: 52 | call() 53 | except TimeoutError as e: 54 | _LOG.error('Failed to run action because of timeouts', exc_info=e) 55 | self._send_response({'code': 'timeout', 'message': 'Timeout occurred'}, 500) 56 | 57 | def do_DELETE(self): 58 | if not self.path.startswith(_API_CONTROLLER): 59 | return self._send_response({'message': 'Path {} is not supported'.format(self.path)}, 404) 60 | action = self.path[len(_API_CONTROLLER):].split('/') 61 | if action[0] == 'queue': 62 | if len(action) < 2: 63 | return self._send_response({'message': 'No second argument provided!'}, 400) 64 | self.wrap_controller_execution( 65 | lambda: self._send_response(execute_on_controller_thread( 66 | partial(delete_from_controller_queue, action[1]), _CONTROLLER_TIMEOUT), 200)) 67 | else: 68 | return self._send_response({'message': 'Action {} is not supported'.format(action[0])}, 404) 69 | 70 | def _run_controller_action(self, action): 71 | if action.split('/')[0] == 'queue': 72 | return self._send_response(execute_on_controller_thread(load_controller_queue, _CONTROLLER_TIMEOUT), 200) 73 | else: 74 | return self._send_response({'message': 'Action {} is not supported'.format(action)}, 404) 75 | 76 | def _send_response(self, json_, status_code=200): 77 | self.send_response(status_code) 78 | self.send_header('Content-type', 'application/json') 79 | self.end_headers() 80 | self.wfile.write(json.dumps(json_).encode('utf-8')) 81 | 82 | 83 | def start_server(port, cmd_helper: CmdHelper) -> threading.Thread: 84 | def _thread_func(): 85 | _Handler.cmd_helper = cmd_helper 86 | server = HTTPServer(('', port), _Handler) 87 | server.serve_forever() 88 | server.socket.close() 89 | 90 | t = threading.Thread(target=_thread_func, daemon=True) 91 | _LOG.info('Starting health server on 
port {}'.format(port)) 92 | t.start() 93 | return t 94 | -------------------------------------------------------------------------------- /bubuku/daemon.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """General Kafka Start Script.""" 3 | 4 | import logging 5 | 6 | from bubuku import controller_api 7 | from bubuku.broker import BrokerManager, StartupTimeout 8 | from bubuku.config import load_config, KafkaProperties, Config 9 | from bubuku.controller import Controller 10 | from bubuku.env_provider import EnvProvider 11 | from bubuku.features.data_size_stats import GenerateDataSizeStatistics 12 | from bubuku.features.rebalance.check import RebalanceOnStartCheck, RebalanceOnBrokerListCheck 13 | from bubuku.features.remote_exec import RemoteCommandExecutorCheck 14 | from bubuku.features.restart_if_dead import CheckBrokerStopped 15 | from bubuku.features.restart_on_zk_change import CheckExhibitorAddressChanged, RestartBrokerChange 16 | from bubuku.features.swap_partitions import CheckBrokersDiskImbalance 17 | from bubuku.features.terminate import register_terminate_on_interrupt 18 | from bubuku.process import KafkaProcess 19 | from bubuku.utils import CmdHelper 20 | from bubuku.zookeeper import BukuExhibitor, load_exhibitor_proxy 21 | 22 | _LOG = logging.getLogger('bubuku.main') 23 | 24 | 25 | def apply_features(api_port, features: dict, controller: Controller, buku_proxy: BukuExhibitor, broker: BrokerManager, 26 | kafka_properties: KafkaProperties, env_provider: EnvProvider) -> list: 27 | for feature, config in features.items(): 28 | if feature == 'restart_on_exhibitor': 29 | controller.add_check(CheckExhibitorAddressChanged(buku_proxy, broker)) 30 | elif feature == 'rebalance_on_start': 31 | controller.add_check(RebalanceOnStartCheck(buku_proxy, broker)) 32 | elif feature == 'rebalance_on_brokers_change': 33 | controller.add_check(RebalanceOnBrokerListCheck(buku_proxy, broker)) 34 | elif feature == 'balance_data_size': 35 | controller.add_check( 36 | CheckBrokersDiskImbalance(buku_proxy, broker, config["diff_threshold_mb"] * 1024, api_port)) 37 | elif feature == 'graceful_terminate': 38 | register_terminate_on_interrupt(controller, broker) 39 | elif feature == 'use_ip_address': 40 | unique_adv_listeners = __get_transformed_listeners(kafka_properties, env_provider.get_ip(), "advertised.listeners") 41 | kafka_properties.set_property('advertised.listeners', ",".join(unique_adv_listeners)) 42 | 43 | unique_listeners = __get_transformed_listeners(kafka_properties, "0.0.0.0", "listeners") 44 | kafka_properties.set_property('listeners', ",".join(unique_listeners)) 45 | 46 | else: 47 | _LOG.error('Using of unsupported feature "{}", skipping it'.format(feature)) 48 | 49 | def __get_transformed_listeners(kafka_properties: KafkaProperties, ip_addr, listener_property): 50 | old_listeners = kafka_properties.get_property(listener_property) 51 | if not old_listeners: 52 | old_listeners = 'PLAINTEXT://:9092' 53 | new_listeners = [] 54 | for adv_listener in old_listeners.split(","): 55 | protocol, _ignore, port = adv_listener.split(":") 56 | new_listeners.append("{protocol}://{host}:{port}".format( 57 | protocol=protocol, 58 | host=ip_addr, 59 | port=port 60 | )) 61 | unique_listeners = sorted(set(new_listeners)) 62 | return unique_listeners 63 | 64 | def run_daemon_loop(config: Config, process_holder: KafkaProcess, cmd_helper: CmdHelper, restart_on_init: bool): 65 | _LOG.info("Using configuration: {}".format(config)) 66 | kafka_props = 
KafkaProperties(config.kafka_settings_template, 67 | '{}/config/server.properties'.format(config.kafka_dir)) 68 | 69 | env_provider = EnvProvider.create_env_provider(config) 70 | address_provider = env_provider.get_address_provider() 71 | rack = env_provider.get_rack() 72 | if rack: 73 | kafka_props.set_property('broker.rack', rack) 74 | startup_timeout = StartupTimeout.build(config.timeout) 75 | 76 | _LOG.info("Loading exhibitor configuration") 77 | with load_exhibitor_proxy(address_provider, config.zk_prefix) as zookeeper: 78 | _LOG.info("Loading broker_id policy") 79 | broker_id_manager = env_provider.create_broker_id_manager(zookeeper, kafka_props) 80 | 81 | _LOG.info("Building broker manager") 82 | broker = BrokerManager(process_holder, zookeeper, broker_id_manager, kafka_props, 83 | startup_timeout) 84 | 85 | _LOG.info("Creating controller") 86 | controller = Controller(broker, zookeeper, env_provider) 87 | 88 | controller.add_check(CheckBrokerStopped(broker, zookeeper)) 89 | controller.add_check(RemoteCommandExecutorCheck(zookeeper, broker, controller, config.health_port)) 90 | controller.add_check(GenerateDataSizeStatistics(zookeeper, broker, cmd_helper, 91 | kafka_props.get_property("log.dirs").split(","))) 92 | apply_features(config.health_port, config.features, controller, zookeeper, broker, kafka_props, env_provider) 93 | 94 | _LOG.info('Starting main controller loop') 95 | controller.loop(RestartBrokerChange(zookeeper, broker, lambda: False) if restart_on_init else None) 96 | 97 | 98 | def main(): 99 | logging.basicConfig(level=getattr(logging, 'INFO', None), 100 | format='%(asctime)s.%(msecs)03d %(levelname)s:%(name)s:%(message)s', 101 | datefmt='%Y-%m-%d %H:%M:%S') 102 | 103 | config = load_config() 104 | _LOG.info("Using configuration: {}".format(config)) 105 | process = KafkaProcess(config.kafka_dir) 106 | _LOG.info('Starting health server') 107 | cmd_helper = CmdHelper() 108 | controller_api.start_server(config.health_port, cmd_helper) 109 | restart_on_init = False 110 | while True: 111 | try: 112 | run_daemon_loop(config, process, cmd_helper, restart_on_init) 113 | break 114 | except Exception as ex: 115 | _LOG.error("WOW! Almost died! Will try to restart from the begin. 
" 116 | "After initialization will be complete, will try to restart", exc_info=ex) 117 | if process.is_running(): 118 | restart_on_init = False 119 | 120 | 121 | if __name__ == '__main__': 122 | main() 123 | -------------------------------------------------------------------------------- /bubuku/env_provider.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import uuid 4 | from functools import partial 5 | from typing import List 6 | 7 | import boto3 8 | import requests 9 | 10 | from bubuku.config import Config, KafkaProperties 11 | from bubuku.id_extractor import BrokerIdExtractor 12 | from bubuku.zookeeper import BukuExhibitor, AddressListProvider 13 | from bubuku.zookeeper.exhibitor import ExhibitorAddressProvider 14 | 15 | _LOG = logging.getLogger('bubuku.amazon') 16 | 17 | 18 | class EnvProvider(object): 19 | def get_ip(self) -> str: 20 | raise NotImplementedError('Not implemented') 21 | 22 | def get_address_provider(self): 23 | raise NotImplementedError('Not implemented') 24 | 25 | def create_broker_id_manager(self, zk: BukuExhibitor, kafka_props: KafkaProperties): 26 | raise NotImplementedError('Not implemented') 27 | 28 | def get_rack(self): 29 | raise NotImplementedError('Not implemented') 30 | 31 | @staticmethod 32 | def create_env_provider(config: Config): 33 | if config.mode == 'amazon': 34 | return AmazonEnvProvider(config) 35 | elif config.mode == 'local': 36 | return LocalEnvProvider() 37 | else: 38 | raise NotImplementedError('Configuration mode "{}" is not supported'.format(config.mode)) 39 | 40 | 41 | class AmazonEnvProvider(EnvProvider): 42 | def __init__(self, config: Config): 43 | self.aws_addr = '169.254.169.254' 44 | self.config = config 45 | self.ip_address = None 46 | self._document = None 47 | 48 | def _get_document(self) -> dict: 49 | if not self._document: 50 | self._document = requests.get( 51 | 'http://{}/latest/dynamic/instance-identity/document'.format(self.aws_addr), 52 | timeout=5).json() 53 | _LOG.info("Amazon specific information loaded from AWS: {}".format(json.dumps(self._document, indent=2))) 54 | return self._document 55 | 56 | def get_ip(self) -> str: 57 | if not self.ip_address: 58 | self.ip_address = self._get_document()['privateIp'] 59 | return self.ip_address 60 | 61 | def get_rack(self): 62 | return self._get_document()['availabilityZone'] 63 | 64 | def _load_instance_ips(self, lb_name: str): 65 | region = self._get_document()['region'] 66 | 67 | private_ips = [] 68 | 69 | elb = boto3.client('elb', region_name=region) 70 | ec2 = boto3.client('ec2', region_name=region) 71 | 72 | response = elb.describe_instance_health(LoadBalancerName=lb_name) 73 | 74 | for instance in response['InstanceStates']: 75 | if instance['State'] == 'InService': 76 | private_ips.append(ec2.describe_instances( 77 | InstanceIds=[instance['InstanceId']])['Reservations'][0]['Instances'][0]['PrivateIpAddress']) 78 | 79 | _LOG.info("Ip addresses for {} are: {}".format(lb_name, private_ips)) 80 | return private_ips 81 | 82 | def get_address_provider(self): 83 | if self.config.zk_static_ips: 84 | return StaticAddressesProvider(self.config.zk_static_ips) 85 | else: 86 | return ExhibitorAddressProvider(partial(self._load_instance_ips, self.config.zk_stack_name)) 87 | 88 | def create_broker_id_manager(self, zk: BukuExhibitor, kafka_props: KafkaProperties): 89 | return BrokerIdExtractor(zk, kafka_props) 90 | 91 | 92 | class _LocalAddressProvider(AddressListProvider): 93 | def get_latest_address(self) -> 
(List[str], int): 94 | return ('zookeeper',), 2181 95 | 96 | 97 | class StaticAddressesProvider(AddressListProvider): 98 | def __init__(self, addr: str): 99 | ips, port = addr.split(':') 100 | self.ips = ips.split(',') 101 | self.port = int(port) 102 | 103 | def get_latest_address(self) -> (List[str], int): 104 | return self.ips, self.port 105 | 106 | 107 | class LocalEnvProvider(EnvProvider): 108 | 109 | def get_ip(self) -> str: 110 | return '127.0.0.1' 111 | 112 | def get_address_provider(self): 113 | return _LocalAddressProvider() 114 | 115 | def get_rack(self): 116 | return None 117 | 118 | def create_broker_id_manager(self, zk: BukuExhibitor, kafka_props: KafkaProperties): 119 | return BrokerIdExtractor(zk, kafka_props) 120 | -------------------------------------------------------------------------------- /bubuku/features/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zalando-nakadi/bubuku/7be53c4a8edbf6a248d70eb6ce0c38022f4391be/bubuku/features/__init__.py -------------------------------------------------------------------------------- /bubuku/features/data_size_stats.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from bubuku.broker import BrokerManager 4 | from bubuku.controller import Check 5 | from bubuku.utils import CmdHelper 6 | from bubuku.zookeeper import BukuExhibitor 7 | 8 | _LOG = logging.getLogger('bubuku.features.data_size_stats') 9 | 10 | 11 | class GenerateDataSizeStatistics(Check): 12 | def __init__(self, zk: BukuExhibitor, broker: BrokerManager, cmd_helper: CmdHelper, kafka_log_dirs: list): 13 | super().__init__(check_interval_s=600) 14 | self.zk = zk 15 | self.broker = broker 16 | self.cmd_helper = cmd_helper 17 | self.kafka_log_dirs = kafka_log_dirs 18 | 19 | def check(self): 20 | if self.broker.is_running_and_registered(): 21 | _LOG.info("Generating data size statistics") 22 | try: 23 | self.__generate_stats() 24 | _LOG.info("Data size statistics successfully written to zk") 25 | except Exception: 26 | _LOG.warn("Error occurred when collecting size statistics", exc_info=True) 27 | return None 28 | 29 | def __str__(self): 30 | return 'GenerateDataSizeStatistics' 31 | 32 | def __generate_stats(self): 33 | topics_stats = self.__get_topics_stats() 34 | used_kb, free_kb = self.cmd_helper.get_disk_stats() 35 | stats = {"disk": {'used_kb': used_kb, 'free_kb': free_kb}, "topics": topics_stats} 36 | self.zk.update_disk_stats(self.broker.id_manager.get_broker_id(), stats) 37 | 38 | def __get_topics_stats(self): 39 | topics_stats = {} 40 | for log_dir in self.kafka_log_dirs: 41 | _LOG.info("Processing log dir: {}".format(log_dir)) 42 | topic_dirs = self.cmd_helper.cmd_run("du -k -d 1 {}".format(log_dir)).split("\n") 43 | for topic_dir in topic_dirs: 44 | dir_stats = self.__parse_dir_stats(topic_dir, log_dir) 45 | if dir_stats: 46 | topic, partition, size_kb = dir_stats 47 | if topic not in topics_stats: 48 | topics_stats[topic] = {} 49 | topics_stats[topic][partition] = int(size_kb) 50 | return topics_stats 51 | 52 | @staticmethod 53 | def __parse_dir_stats(topic_dir, log_dir): 54 | """ 55 | Parses topic-partition size stats from "du" tool single line output 56 | :param topic_dir: the string to be parsed; example: "45983\t/tmp/kafka-logs/my-kafka-topic-0" 57 | :param log_dir: the kafka log directory name itself 58 | :return: tuple (topic, partition, size) or None if the topic_dir has incorrect format 59 | """ 60 | dir_data = 
topic_dir.split("\t") 61 | if len(dir_data) == 2 and dir_data[1] != log_dir: 62 | size_kb, dir_name = tuple(dir_data) 63 | tp_name = dir_name.split("/")[-1] 64 | tp_parts = tp_name.rsplit("-", 1) 65 | if len(tp_parts) == 2: 66 | topic, partition = tuple(tp_parts) 67 | return topic, partition, size_kb 68 | return None 69 | -------------------------------------------------------------------------------- /bubuku/features/metric_collector.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import asyncio 3 | import logging 4 | from bubuku.zookeeper import BukuExhibitor 5 | 6 | _LOG = logging.getLogger('MetricCollector') 7 | 8 | 9 | class MetricCollector: 10 | _OFFLINE_PARTITIONS_MBEAN = { 11 | 'name': 'OfflinePartitions', 12 | 'mbean': 'kafka.controller:type=KafkaController,name=OfflinePartitionsCount', 13 | 'field': 'Value'} 14 | _UNDER_REPLICATED_PARTITIONS_MBEAN = { 15 | 'name': 'UnderReplicatedPartitions', 16 | 'mbean': 'kafka.server:type=ReplicaManager,name=UnderReplicatedPartitions', 17 | 'field': 'Value'} 18 | _PREFERRED_REPLICA_IMBALANCE_MBEAN = { 19 | 'name': 'PreferredReplicaImbalance', 20 | 'mbean': 'kafka.controller:name=PreferredReplicaImbalanceCount,type=KafkaController', 21 | 'field': 'Value'} 22 | _BYTES_IN_MBEAN = { 23 | 'name': 'BytesIn', 24 | 'mbean': 'kafka.server:name=BytesInPerSec,type=BrokerTopicMetrics', 25 | 'field': 'OneMinuteRate' 26 | } 27 | _JOLOKIA_PORT = 8778 28 | 29 | def __init__(self, zk: BukuExhibitor): 30 | self.zk = zk 31 | 32 | async def _get_metrics_from_broker(self, broker_id: int): 33 | broker_address = self.zk.get_broker_address(broker_id) 34 | data = {'broker_address': broker_address, 'broker_id': broker_id, 'metrics': {}} 35 | for metric in self.get_metric_mbeans(): 36 | metric_fetched = False 37 | try: 38 | response = requests.get("http://{}:{}/jolokia/read/{}".format( 39 | broker_address, self._JOLOKIA_PORT, metric['mbean'])) 40 | if response.status_code == 200: 41 | response_body = response.json() 42 | if response_body.get('status') == 200: 43 | if response_body.get('value', {}).get(metric['field']) is not None: 44 | data['metrics'][metric['name']] = response_body['value'][metric['field']] 45 | metric_fetched = True 46 | if not metric_fetched: 47 | _LOG.error("Fetching metric {} for broker: {} failed. 
Response from broker: {}:{}".format( 48 | metric['name'], broker_id, response.status_code, response.text)) 49 | except Exception as e: 50 | _LOG.error("Fetching metric {} for broker {} failed".format(metric['name'], broker_id), exc_info=e) 51 | return data 52 | 53 | async def _get_metrics_from_brokers(self, broker_ids): 54 | metrics = [] 55 | for broker_id in broker_ids: 56 | metrics.append(asyncio.ensure_future(self._get_metrics_from_broker(broker_id))) 57 | metrics = await asyncio.gather(*metrics) 58 | return metrics 59 | 60 | def get_metrics_from_brokers(self, broker_ids=None): 61 | """ 62 | Get metrics for brokers in the cluster 63 | :param broker_ids: List of broker_ids to fetch metrics for 64 | :return: List of dictionaries containing metrics for each broker 65 | { 66 | "metrics": {...}, 67 | "broker_id": int, 68 | "broker_address": str 69 | } 70 | """ 71 | broker_ids = self.zk.get_broker_ids() if not broker_ids else broker_ids 72 | loop = asyncio.new_event_loop() 73 | asyncio.set_event_loop(loop) 74 | try: 75 | return loop.run_until_complete(self._get_metrics_from_brokers(broker_ids)) 76 | except Exception as e: 77 | _LOG.error('Could not fetch metrics from brokers', exc_info=e) 78 | finally: 79 | loop.close() 80 | 81 | @classmethod 82 | def get_metric_mbeans(cls): 83 | return [ 84 | cls._OFFLINE_PARTITIONS_MBEAN, 85 | cls._UNDER_REPLICATED_PARTITIONS_MBEAN, 86 | cls._PREFERRED_REPLICA_IMBALANCE_MBEAN, 87 | cls._BYTES_IN_MBEAN, 88 | ] 89 | -------------------------------------------------------------------------------- /bubuku/features/migrate.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from bubuku.features.rebalance import BaseRebalanceChange 4 | from bubuku.zookeeper import BukuExhibitor, RebalanceThrottleManager 5 | 6 | _LOG = logging.getLogger('bubuku.features.migrate') 7 | 8 | 9 | class MigrationChange(BaseRebalanceChange): 10 | def __init__(self, zk: BukuExhibitor, from_: list, to: list, shrink: bool, parallelism: int = 1, 11 | throttle: int = 100000000): 12 | self.zk = zk 13 | self.migration = {int(from_[i]): int(to[i]) for i in range(0, len(from_))} 14 | self.shrink = shrink 15 | self.data_to_migrate = None 16 | self.parallelism = parallelism 17 | self.throttle_manager = RebalanceThrottleManager(self.zk, throttle) 18 | 19 | def run(self, current_actions) -> bool: 20 | if self.should_be_paused(current_actions): 21 | return True 22 | if self.zk.is_rebalancing(): 23 | return True 24 | active_ids = [int(k) for k in self.zk.get_broker_ids()] 25 | if any(b not in active_ids for b in self.migration.keys()): 26 | _LOG.error('Source brokers {} are not in active list {}. Stopping.'.format( 27 | self.migration.keys(), active_ids)) 28 | return False 29 | if any(b not in active_ids for b in self.migration.values()): 30 | _LOG.error('Target brokers {} are not in active list {}. 
Stopping.'.format( 31 | self.migration.values(), active_ids)) 32 | return False 33 | if self.data_to_migrate is None: 34 | _LOG.info('Loading partition assignment') 35 | self.data_to_migrate = [data for data in self.zk.load_partition_assignment()] 36 | _LOG.info('Load {} partitions'.format(len(self.data_to_migrate))) 37 | return True 38 | 39 | items_to_migrate = [] 40 | self.throttle_manager.remove_old_throttle_configurations() 41 | while self.data_to_migrate and len(items_to_migrate) < self.parallelism: 42 | topic, partition, replicas = self.data_to_migrate.pop() 43 | replaced_replicas = self._replace_replicas(replicas) 44 | if replaced_replicas == replicas: 45 | continue 46 | items_to_migrate.append((topic, partition, replicas, replaced_replicas)) 47 | if not items_to_migrate: 48 | return False 49 | self.throttle_manager.apply_throttle([(t, p, rr) for t, p, _, rr in items_to_migrate]) 50 | if not self.zk.reallocate_partitions([(t, p, rr) for t, p, _, rr in items_to_migrate]): 51 | for topic, partition, replicas, _ in items_to_migrate: 52 | self.data_to_migrate.append((topic, partition, replicas)) 53 | return True 54 | 55 | def __str__(self): 56 | return 'Migration links {}, shrink: {}, data_to_move: {}, parallelism: {}'.format( 57 | self.migration, 58 | self.shrink, 59 | len(self.data_to_migrate) if self.data_to_migrate is not None else 'Unknown', 60 | self.parallelism, 61 | ) 62 | 63 | def _replace_replicas(self, replicas): 64 | replacement = [self.migration[k] for k in replicas if k in self.migration] 65 | if self.shrink: 66 | result = [] 67 | for v in replicas: 68 | to_use = self.migration.get(v, v) 69 | if to_use not in result: 70 | result.append(to_use) 71 | return result 72 | else: 73 | return replicas + [k for k in replacement if k not in replicas] 74 | 75 | def on_remove(self): 76 | RebalanceThrottleManager.remove_all_throttle_configurations(self.zk) 77 | -------------------------------------------------------------------------------- /bubuku/features/rebalance/__init__.py: -------------------------------------------------------------------------------- 1 | from bubuku.controller import Change 2 | 3 | 4 | class BaseRebalanceChange(Change): 5 | def get_name(self) -> str: 6 | return 'rebalance' 7 | 8 | def can_run(self, current_actions): 9 | return all([a not in current_actions for a in ['start', 'restart', 'rebalance', 'stop', 'complete_stop', 'rolling_restart']]) 10 | 11 | @staticmethod 12 | def should_be_paused(current_actions): 13 | return any([a in current_actions for a in ['restart', 'start', 'stop', 'complete_stop', 'rolling_restart']]) 14 | -------------------------------------------------------------------------------- /bubuku/features/rebalance/broker.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, List, Iterator, Dict, Optional 2 | 3 | 4 | class _TopicPartitions(object): 5 | __slots__ = ( 6 | '_items', 7 | '_expectation', 8 | '_has_free_slots', 9 | '_topic_partitions' 10 | ) 11 | 12 | def __init__(self): 13 | self._items = set() 14 | self._expectation = 0 15 | self._has_free_slots = None 16 | self._topic_partitions = {} 17 | 18 | def __str__(self): 19 | return 'TP, expectation: {}, items: {}'.format(self._expectation, self._items) 20 | 21 | def get_item_count(self) -> int: 22 | return len(self._items) 23 | 24 | def contains(self, item) -> bool: 25 | return item in self._items 26 | 27 | def add(self, item: Tuple[str, int]): 28 | self._items.add(item) 29 | topic, partition = item 30 | if topic not in 
self._topic_partitions: 31 | self._topic_partitions[topic] = [] 32 | self._topic_partitions[topic].append(partition) 33 | self._has_free_slots = None 34 | 35 | def remove(self, item: Tuple[str, int]): 36 | self._items.remove(item) 37 | topic, partition = item 38 | self._topic_partitions[topic].remove(partition) 39 | self._has_free_slots = None 40 | 41 | def iterate_items(self) -> Iterator[Tuple[str, int]]: 42 | return self._items.__iter__() 43 | 44 | def get_topic_partitions(self, topic: str) -> List[int]: 45 | return self._topic_partitions.get(topic, []) 46 | 47 | def get_expectation(self) -> int: 48 | return self._expectation 49 | 50 | def set_expectation(self, expectation: int): 51 | self._has_free_slots = None 52 | self._expectation = expectation 53 | 54 | def calculate_cardinality(self) -> Dict[str, int]: 55 | return {k: len(v) for k, v in self._topic_partitions.items()} 56 | 57 | def has_free_slots(self) -> bool: 58 | if self._has_free_slots is None: 59 | self._has_free_slots = self.get_item_count() < self.get_expectation() 60 | return self._has_free_slots 61 | 62 | 63 | class BrokerDescription(object): 64 | __slots__ = ( 65 | '_broker_id', 66 | '_rack_id', 67 | '_leaders', 68 | '_replicas', 69 | ) 70 | 71 | def __init__(self, broker_id: int, rack_id: str = None): 72 | self._broker_id = broker_id 73 | self._rack_id = rack_id 74 | self._leaders = _TopicPartitions() 75 | self._replicas = _TopicPartitions() 76 | 77 | @property 78 | def broker_id(self) -> int: 79 | return self._broker_id 80 | 81 | @property 82 | def rack_id(self) -> str: 83 | return self._rack_id 84 | 85 | def __str__(self) -> str: 86 | return 'BrokerDescription(id={}, rack={}, leaders={}, replicas={})'.format( 87 | self._broker_id, self._rack_id, self._leaders, self._replicas) 88 | 89 | def set_leader_expectation(self, leader_count: int): 90 | self._leaders.set_expectation(leader_count) 91 | 92 | def set_replica_expectation(self, replica_count: int): 93 | self._replicas.set_expectation(replica_count) 94 | 95 | def add_leader(self, topic_partition: Tuple[str, int]): 96 | self._leaders.add(topic_partition) 97 | 98 | def add_replica(self, topic_partition: Tuple[str, int]): 99 | self._replicas.add(topic_partition) 100 | 101 | def get_leader_count(self) -> int: 102 | return self._leaders.get_item_count() 103 | 104 | def get_replica_count(self) -> int: 105 | return self._replicas.get_item_count() 106 | 107 | def get_replica_overload(self) -> int: 108 | return self._replicas.get_item_count() - self._replicas.get_expectation() 109 | 110 | def has_free_replica_slots(self) -> int: 111 | return self._replicas.has_free_slots() 112 | 113 | def have_extra_leaders(self) -> bool: 114 | return self._leaders.get_expectation() < self._leaders.get_item_count() 115 | 116 | def have_less_leaders(self) -> bool: 117 | return self._leaders.get_expectation() > self._leaders.get_item_count() 118 | 119 | def get_expected_leaders(self) -> int: 120 | return self._leaders.get_expectation() 121 | 122 | def accept_leader(self, source_broker: 'BrokerDescription', topic_partition: Tuple[str, int]): 123 | """ 124 | Moves topic_partition from source_broker to self broker. 125 | :param source_broker: Broker to take topic_partition from. 126 | :param topic_partition: topic and partition tuple to take. 
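        A minimal illustrative sketch (added for clarity, not part of the original code; the broker ids and
        topic name are hypothetical) of how a rebalance step would use this method:

            donor = BrokerDescription(broker_id=1)
            target = BrokerDescription(broker_id=2)
            donor.add_leader(('topic-a', 0))
            target.accept_leader(donor, ('topic-a', 0))
            # ('topic-a', 0) is now counted among target's leaders and no longer among donor's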
127 | """ 128 | self._leaders.add(topic_partition) 129 | source_broker._leaders.remove(topic_partition) 130 | 131 | def _accept_replica(self, source_broker: 'BrokerDescription', topic_partition: Tuple[str, int]) -> bool: 132 | # Already a leader for this partition 133 | if self._leaders.contains(topic_partition): 134 | return False 135 | # Already a replica for this partition 136 | if self._replicas.contains(topic_partition): 137 | return False 138 | if self._rack_id != source_broker._rack_id: 139 | return False 140 | self._replicas.add(topic_partition) 141 | source_broker._replicas.remove(topic_partition) 142 | return True 143 | 144 | def move_replica(self, topic_partition: Tuple[str, int], broker_list: List['BrokerDescription']) \ 145 | -> Optional['BrokerDescription']: 146 | """ 147 | Moves replica topic_partition to some broker from broker_list. 148 | :param topic_partition: Topic and partition to move 149 | :param broker_list: List of brokers (BrokerDescription) to move to 150 | :return: Broker, to which partition was moved 151 | """ 152 | for target in broker_list: 153 | if target._accept_replica(self, topic_partition): 154 | return target 155 | return None 156 | 157 | def list_replica_copies(self) -> List[Tuple[str, int]]: 158 | return list([tp for tp in self._replicas.iterate_items() if self._leaders.contains(tp)]) 159 | 160 | def list_partitions(self, topic: str, replica: bool) -> List[int]: 161 | return (self._replicas if replica else self._leaders).get_topic_partitions(topic) 162 | 163 | def list_replicas(self) -> Iterator[Tuple[str, int]]: 164 | return self._replicas.iterate_items() 165 | 166 | def calculate_topic_cardinality(self) -> Dict[str, int]: 167 | """ 168 | Calculates 'topic to leader count' dictionary on this broker. 169 | For example, topic t0 has partitions 0, 1, 2, 3. If leaders for partitions 0, 3 are located on this broker 170 | then the return value will contain the mapping t0->2 (there are 2 leaders for topic t0 on this broker) 171 | :return: Dictionary with leaders count per topic for this broker. 
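        Hypothetical illustration (values invented for clarity): if this broker leads partitions 0 and 3 of
        topic 't0' and partition 1 of topic 't1', then broker.calculate_topic_cardinality() would return
        {'t0': 2, 't1': 1}.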
172 | """ 173 | return self._leaders.calculate_cardinality() 174 | -------------------------------------------------------------------------------- /bubuku/features/rebalance/check.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from bubuku.broker import BrokerManager 3 | from bubuku.controller import Check 4 | from bubuku.features.rebalance.change import OptimizedRebalanceChange 5 | from bubuku.zookeeper import BukuExhibitor 6 | 7 | _LOG = logging.getLogger('bubuku.features.rebalance') 8 | 9 | 10 | class RebalanceOnStartCheck(Check): 11 | def __init__(self, zk: BukuExhibitor, broker: BrokerManager): 12 | super().__init__() 13 | self.zk = zk 14 | self.broker = broker 15 | self.executed = False 16 | 17 | def check(self): 18 | if self.executed: 19 | return None 20 | if not self.broker.is_running_and_registered(): 21 | return None 22 | _LOG.info("Rebalance on start, triggering rebalance") 23 | self.executed = True 24 | return OptimizedRebalanceChange(self.zk, self.zk.get_broker_ids(), [], []) 25 | 26 | def __str__(self): 27 | return 'RebalanceOnStartCheck (executed={})'.format(self.executed) 28 | 29 | 30 | class RebalanceOnBrokerListCheck(Check): 31 | def __init__(self, zk: BukuExhibitor, broker: BrokerManager): 32 | super().__init__() 33 | self.zk = zk 34 | self.broker = broker 35 | self.old_broker_list = [] 36 | 37 | def check(self): 38 | if not self.broker.is_running_and_registered(): 39 | return None 40 | new_list = self.zk.get_broker_ids() 41 | if not new_list == self.old_broker_list: 42 | _LOG.info('Broker list changed from {} to {}, triggering rebalance'.format(self.old_broker_list, new_list)) 43 | self.old_broker_list = new_list 44 | return OptimizedRebalanceChange(self.zk, new_list, [], []) 45 | return None 46 | 47 | def __str__(self): 48 | return 'RebalanceOnBrokerListChange, cached list: {}'.format(self.old_broker_list) 49 | -------------------------------------------------------------------------------- /bubuku/features/remote_exec.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from bubuku.aws.cluster_config import ClusterConfig, AwsInstanceUserDataLoader 4 | from bubuku.broker import BrokerManager 5 | from bubuku.controller import Check, Change, Controller 6 | from bubuku.features.migrate import MigrationChange 7 | from bubuku.features.rebalance.change import OptimizedRebalanceChange 8 | from bubuku.features.rebalance.change_simple import SimpleRebalanceChange 9 | from bubuku.features.restart_on_zk_change import RestartBrokerChange 10 | from bubuku.features.rolling_restart import RollingRestartChange 11 | from bubuku.features.swap_partitions import SwapPartitionsChange, load_swap_data 12 | from bubuku.features.terminate import CompleteStopChange 13 | from bubuku.zookeeper import BukuExhibitor 14 | 15 | _LOG = logging.getLogger('bubuku.features.remote_exec') 16 | 17 | 18 | class RemoteCommandExecutorCheck(Check): 19 | def __init__(self, zk: BukuExhibitor, broker_manager: BrokerManager, controller: Controller, api_port: int): 20 | super().__init__(check_interval_s=30) 21 | self.zk = zk 22 | self.broker_manager = broker_manager 23 | self.controller = controller 24 | self.api_port = api_port 25 | 26 | def check(self) -> Change: 27 | with self.zk.lock(): 28 | data = self.zk.take_action(self.broker_manager.id_manager.get_broker_id()) 29 | if not data: 30 | return None 31 | if 'name' not in data: 32 | _LOG.error('Action name can not be restored from {}, 
skipping'.format(data)) 33 | return None 34 | try: 35 | if data['name'] == 'restart': 36 | return RestartBrokerChange(self.zk, self.broker_manager, lambda: False) 37 | elif data['name'] == 'rebalance': 38 | if data.get('bin_packing', False): 39 | return OptimizedRebalanceChange(self.zk, 40 | self.zk.get_broker_ids(), 41 | data['empty_brokers'], 42 | data['exclude_topics'], 43 | data['throttle'], 44 | int(data.get('parallelism', 1))) 45 | else: 46 | return SimpleRebalanceChange(self.zk, 47 | self.zk.get_broker_ids(), 48 | data['empty_brokers'], 49 | data['exclude_topics'], 50 | int(data.get('parallelism', 1)), 51 | data['throttle']) 52 | 53 | elif data['name'] == 'migrate': 54 | return MigrationChange(self.zk, data['from'], data['to'], data['shrink'], 55 | int(data.get('parallelism', '1')), data['throttle']) 56 | elif data['name'] == 'fatboyslim': 57 | return SwapPartitionsChange(self.zk, 58 | lambda x: load_swap_data(x, self.api_port, int(data['threshold_kb']))) 59 | elif data['name'] == 'rolling_restart': 60 | cluster_config = ClusterConfig(AwsInstanceUserDataLoader()) 61 | cluster_config.set_overrides(**data['overrides']) 62 | return RollingRestartChange(self.zk, 63 | cluster_config, 64 | data['restart_assignment'], 65 | self.broker_manager.id_manager.broker_id, 66 | data['cool_down']) 67 | elif data['name'] == 'stop': 68 | return CompleteStopChange(self.broker_manager, self.controller) 69 | else: 70 | _LOG.error('Action {} not supported'.format(data)) 71 | except Exception as e: 72 | _LOG.error('Failed to create action from {}'.format(data), exc_info=e) 73 | return None 74 | 75 | def __str__(self): 76 | return 'RemoteCommandExecutorCheck' 77 | 78 | @staticmethod 79 | def register_restart(zk: BukuExhibitor, broker_id: str): 80 | with zk.lock(): 81 | zk.register_action({'name': 'restart'}, broker_id=broker_id) 82 | 83 | @staticmethod 84 | def register_rebalance(zk: BukuExhibitor, broker_id: str, empty_brokers: list, exclude_topics: list, 85 | parallelism: int, bin_packing: bool, throttle: int): 86 | if parallelism <= 0: 87 | raise Exception('Parallelism for rebalance should be greater than 0') 88 | action = { 89 | 'name': 'rebalance', 90 | 'empty_brokers': empty_brokers, 91 | 'exclude_topics': exclude_topics, 92 | 'parallelism': int(parallelism), 93 | 'bin_packing': bool(bin_packing), 94 | 'throttle': int(throttle) 95 | } 96 | with zk.lock(): 97 | if broker_id: 98 | zk.register_action(action, broker_id=broker_id) 99 | else: 100 | zk.register_action(action) 101 | 102 | @staticmethod 103 | def register_migration(zk: BukuExhibitor, brokers_from: list, brokers_to: list, shrink: bool, broker_id: str, 104 | throttle: int, parallelism: int): 105 | if len(brokers_from) != len(brokers_to): 106 | raise Exception('Brokers list {} and {} must have the same size'.format(brokers_from, brokers_to)) 107 | if any(b in brokers_from for b in brokers_to) or any(b in brokers_to for b in brokers_from): 108 | raise Exception('Broker lists can not hold same broker ids') 109 | 110 | if len(set(brokers_from)) != len(brokers_from): 111 | raise Exception('Can not use same broker ids for source_list {}'.format(brokers_from)) 112 | if len(set(brokers_to)) != len(brokers_to): 113 | raise Exception('Can not use same broker ids for target_list {}'.format(brokers_to)) 114 | 115 | active_ids = zk.get_broker_ids() 116 | if any(b not in active_ids for b in brokers_from) or any(b not in active_ids for b in brokers_to): 117 | raise Exception('Brokers dead from: {} to: {} alive:{}'.format(brokers_from, brokers_to, 
active_ids)) 118 | 119 | if broker_id and str(broker_id) not in active_ids: 120 | raise Exception('Broker id to run change on ({}) is not in active list {}'.format( 121 | broker_id, active_ids)) 122 | if parallelism <= 0: 123 | raise Exception('Parallelism for migration should be greater than 0') 124 | 125 | with zk.lock(): 126 | action = {'name': 'migrate', 'from': brokers_from, 'to': brokers_to, 'shrink': bool(shrink), 127 | 'parallelism': int(parallelism), 'throttle': int(throttle)} 128 | if broker_id: 129 | zk.register_action(action, str(broker_id)) 130 | else: 131 | zk.register_action(action) 132 | 133 | @staticmethod 134 | def register_fatboy_slim(zk: BukuExhibitor, threshold_kb: int): 135 | if zk.is_rebalancing(): 136 | _LOG.warning('Rebalance is already in progress, may be it will take time for this command to start ' 137 | 'processing') 138 | with zk.lock(): 139 | zk.register_action({'name': 'fatboyslim', 'threshold_kb': threshold_kb}) 140 | 141 | @staticmethod 142 | def register_rolling_restart(zk: BukuExhibitor, broker_id: str, image: str, instance_type: str, scalyr_key: str, 143 | scalyr_region: str, kms_key_id: str, ami_id: str, cool_down: int): 144 | if zk.is_rolling_restart_in_progress(): 145 | _LOG.warning('Rolling restart in progress, skipping') 146 | return 147 | 148 | restart_assignment = {} 149 | brokers = zk.get_broker_ids() 150 | for idx in range(len(brokers)): 151 | broker_to_make_restart = brokers[idx] 152 | if idx == len(brokers) - 1: 153 | broker_to_restart = brokers[0] 154 | else: 155 | broker_to_restart = brokers[idx + 1] 156 | restart_assignment[broker_to_make_restart] = broker_to_restart 157 | 158 | _LOG.info('Rolling restart assignment:\n {}'.format(restart_assignment)) 159 | action = { 160 | 'name': 'rolling_restart', 161 | 'restart_assignment': restart_assignment, 162 | 'overrides': ClusterConfig.create_overrides_dict( 163 | application_version=image, 164 | scalyr_account_key=scalyr_key, 165 | scalyr_region=scalyr_region, 166 | instance_type=instance_type, 167 | kms_key_id=kms_key_id, 168 | ami_id=ami_id, 169 | ), 170 | 'cool_down': cool_down 171 | } 172 | zk.register_action(action, broker_id=broker_id) 173 | 174 | @staticmethod 175 | def register_stop(zk: BukuExhibitor, broker_id: str): 176 | zk.register_action({'name': 'stop'}, broker_id=broker_id) 177 | -------------------------------------------------------------------------------- /bubuku/features/restart_if_dead.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from datetime import datetime, timedelta 4 | from bubuku.broker import BrokerManager 5 | from bubuku.controller import Change, Check 6 | from bubuku.features.restart_on_zk_change import RestartBrokerChange 7 | from bubuku.zookeeper import BukuExhibitor 8 | 9 | _LOG = logging.getLogger('bubuku.features.restart_if_dead') 10 | 11 | class CheckBrokerStopped(Check): 12 | def __init__(self, broker: BrokerManager, zk: BukuExhibitor): 13 | super().__init__() 14 | self.broker = broker 15 | self.zk = zk 16 | self.need_check = True 17 | self.last_zk_session_failed_check = None 18 | 19 | def check(self) -> Change: 20 | if not self.need_check: 21 | return None 22 | if not self.should_restart(): 23 | return None 24 | 25 | _LOG.warning('Oops! 
Broker is dead, triggering restart') 26 | self.need_check = False 27 | 28 | # Do not start if broker is running and registered 29 | def _cancel_if(): 30 | return self.broker.is_running() and self.broker.is_registered_in_zookeeper() 31 | 32 | return RestartBrokerChange(self.zk, self.broker, _cancel_if, self.on_change_executed) 33 | 34 | # Attempt to verify that broker is not registered in zookeeper for twice as long as the zookeeper session timeout. 35 | # Allow the zookeeper client to try to restore the session instead of killing the kafka process as soon as the zookeeper session is dead. 36 | def should_restart(self): 37 | current_time = datetime.now() 38 | if not self.broker.is_running(): 39 | return True 40 | if not self.broker.is_registered_in_zookeeper(): 41 | _LOG.warning('Broker is not registered in Zookeeper') 42 | if not self.last_zk_session_failed_check: 43 | self.last_zk_session_failed_check = current_time 44 | time_to_restart_at = self.last_zk_session_failed_check + timedelta(milliseconds=self.broker.get_zookeeper_session_timeout() * 2) 45 | if current_time > time_to_restart_at: 46 | return True 47 | else: 48 | self.last_zk_session_failed_check = None 49 | return False 50 | 51 | def on_change_executed(self): 52 | self.need_check = True 53 | self.last_zk_session_failed_check = None 54 | 55 | def __str__(self): 56 | return 'CheckBrokerStopped' 57 | -------------------------------------------------------------------------------- /bubuku/features/restart_on_zk_change.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from bubuku.broker import BrokerManager 4 | from bubuku.controller import Change, Check 5 | from bubuku.zookeeper import BukuExhibitor 6 | 7 | _LOG = logging.getLogger('bubuku.features.restart_on_zk') 8 | 9 | _STAGE_STOP = 'stop' 10 | _STAGE_START = 'start' 11 | 12 | 13 | class RestartBrokerChange(Change): 14 | def __init__(self, zk: BukuExhibitor, broker: BrokerManager, break_condition, processed_callback=None): 15 | self.zk = zk 16 | self.broker = broker 17 | self.break_condition = break_condition 18 | self.stage = _STAGE_STOP 19 | self.processed_callback = processed_callback 20 | 21 | def get_name(self): 22 | return 'restart' 23 | 24 | def can_run(self, current_actions): 25 | return all([a not in current_actions for a in ['start', 'restart', 'stop', 'complete_stop']]) 26 | 27 | def run(self, current_actions): 28 | if self.stage == _STAGE_STOP: 29 | if self.break_condition and self.break_condition(): 30 | return False 31 | self.broker.stop_kafka_process() 32 | self.stage = _STAGE_START 33 | return True 34 | elif self.stage == _STAGE_START: 35 | # Yep, use latest data 36 | zk_conn_str = self.zk.get_conn_str() 37 | try: 38 | self.broker.start_kafka_process(zk_conn_str) 39 | except Exception as e: 40 | _LOG.error('Failed to start kafka process against {}'.format(zk_conn_str), exc_info=e) 41 | return True 42 | return False 43 | else: 44 | _LOG.error('Stage {} is not supported'.format(self.stage)) 45 | return False 46 | 47 | def on_remove(self): 48 | if self.processed_callback: 49 | self.processed_callback() 50 | 51 | def __str__(self): 52 | return 'RestartBrokerChange ({}), stage={}'.format(self.get_name(), self.stage) 53 | 54 | 55 | class CheckExhibitorAddressChanged(Check): 56 | def __init__(self, zk: BukuExhibitor, broker: BrokerManager): 57 | super().__init__() 58 | self.zk = zk 59 | self.broker = broker 60 | self.conn_str = None 61 | 62 | def check(self) -> Change: 63 | new_conn_str = self.zk.get_conn_str() 64 | if 
new_conn_str != self.conn_str: 65 | def _cancel_if(): 66 | current_conn_str = self.zk.get_conn_str() 67 | if current_conn_str != new_conn_str: 68 | _LOG.warning('ZK address changed again, from {} to {}'.format(new_conn_str, current_conn_str)) 69 | return True 70 | if current_conn_str == self.broker.get_zk_connect_string(): 71 | _LOG.warning('Broker already have latest version of zk address: {}'.format(current_conn_str)) 72 | return True 73 | return False 74 | 75 | _LOG.info('ZK addresses changed from {} to {}, triggering restart'.format(self.conn_str, new_conn_str)) 76 | self.conn_str = new_conn_str 77 | return RestartBrokerChange(self.zk, self.broker, _cancel_if) 78 | 79 | def __str__(self): 80 | return 'CheckExhibitorAddressChanged, current={}'.format(self.conn_str) 81 | -------------------------------------------------------------------------------- /bubuku/features/rolling_restart.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from time import time 3 | 4 | from bubuku import utils 5 | from bubuku.aws import AWSResources 6 | from bubuku.aws.cluster_config import ClusterConfig 7 | from bubuku.aws.ec2_node_launcher import Ec2NodeLauncher 8 | from bubuku.aws.node import Ec2Node 9 | from bubuku.controller import Change 10 | from bubuku.zookeeper import BukuExhibitor 11 | 12 | _LOG = logging.getLogger('bubuku.features.rolling_restart') 13 | 14 | 15 | class RollingRestartChange(Change): 16 | def __init__(self, 17 | zk: BukuExhibitor, 18 | cluster_config: ClusterConfig, 19 | restart_assignment, 20 | broker_id: str, 21 | cool_down: int): 22 | self.zk = zk 23 | self.restart_assignment = restart_assignment 24 | self.broker_id = broker_id 25 | self.broker_id_to_restart = self.restart_assignment.pop(broker_id) 26 | self.broker_ip_to_restart = self.zk.get_broker_address(self.broker_id_to_restart) 27 | 28 | self.cluster_config = cluster_config 29 | 30 | self.aws = AWSResources(region=self.cluster_config.get_aws_region()) 31 | self.ec_node = Ec2Node(self.aws, self.cluster_config, self.broker_ip_to_restart) 32 | self.ec2_node_launcher = Ec2NodeLauncher( 33 | self.aws, self.cluster_config, self.ec_node.get_node_availability_zone()) 34 | 35 | self.state_context = StateContext(self.zk, self.aws, self.ec_node, self.ec2_node_launcher, 36 | self.broker_id_to_restart, self.restart_assignment, 37 | self.cluster_config, cool_down) 38 | 39 | def get_name(self) -> str: 40 | return 'rolling_restart' 41 | 42 | def can_run(self, current_actions): 43 | return all([a not in current_actions for a in ['stop', 'complete_stop', 'restart', 'rebalance']]) 44 | 45 | def run(self, current_actions) -> bool: 46 | return self.state_context.run() 47 | 48 | def time_till_next_run(self): 49 | return 10 50 | 51 | 52 | class StateContext: 53 | def __init__(self, zk: BukuExhibitor, aws: AWSResources, ec_node: Ec2Node, ec2_node_launcher: Ec2NodeLauncher, 54 | broker_id_to_restart, restart_assignment, cluster_config: ClusterConfig, cool_down: int): 55 | self.zk = zk 56 | self.restart_assignment = restart_assignment 57 | self.cluster_config = cluster_config 58 | self.aws = aws 59 | self.ec_node = ec_node 60 | self.ec2_node_launcher = ec2_node_launcher 61 | self.broker_id_to_restart = broker_id_to_restart 62 | self.current_state = StopKafka(self) 63 | self.new_instance_id = None 64 | self.cool_down = cool_down 65 | 66 | def run(self): 67 | """ 68 | Runs states one after another. If state is finished, it takes the next one. 
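        For orientation (sequence reconstructed from the next() methods of the states defined below, not an
        authoritative contract): a single broker restart walks through roughly
        StopKafka -> WaitBrokerStopped -> DetachVolume -> TerminateInstance -> WaitInstanceTerminated ->
        WaitVolumeAvailable -> LaunchInstance -> WaitVolumeAttached -> WaitKafkaRunning -> RegisterRollingRestart,
        and run() reports completion once the final state returns no successor.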
69 | """ 70 | try: 71 | _LOG.info('Running state {}'.format(self.current_state)) 72 | if self.current_state.run(): 73 | next_state = self.current_state.next() 74 | _LOG.info('Next state {}'.format(next_state)) 75 | if next_state is None: 76 | return False 77 | self.current_state = next_state 78 | return True 79 | except Exception as e: 80 | _LOG.error('Failed to run state', exc_info=e) 81 | return True 82 | 83 | 84 | class State: 85 | """ 86 | State which can be run as many times as required before it finishes it work. The progress of the state has to be 87 | recoverable 88 | """ 89 | 90 | def __init__(self, state_context): 91 | self.state_context = state_context 92 | self.time_to_check_s = time() 93 | 94 | def run(self) -> bool: 95 | """ 96 | Runs the state, and if state finishes successfully it returns True, otherwise it returns False, which means 97 | that state has to be executed again 98 | """ 99 | pass 100 | 101 | def next(self): 102 | """ 103 | Return the next state, which has to be executed after the current state 104 | """ 105 | pass 106 | 107 | def run_with_timeout(self, func): 108 | """ 109 | Runs func() with timeout 110 | :param func function to execute 111 | """ 112 | if time() >= self.time_to_check_s: 113 | self.time_to_check_s = time() + 10 114 | return func() 115 | return False 116 | 117 | 118 | class StopKafka(State): 119 | def run(self): 120 | if utils.is_cluster_healthy(): 121 | from bubuku.features.remote_exec import RemoteCommandExecutorCheck 122 | RemoteCommandExecutorCheck.register_stop(self.state_context.zk, self.state_context.broker_id_to_restart) 123 | return True 124 | _LOG.warning('Cluster is not healthy, waiting for it to recover') 125 | return False 126 | 127 | def next(self): 128 | return WaitBrokerStopped(self.state_context) 129 | 130 | def __str__(self): 131 | return 'StopKafka: stopping broker {}'.format(self.state_context.broker_id_to_restart) 132 | 133 | 134 | class WaitBrokerStopped(State): 135 | def run(self): 136 | def func(): 137 | return not self.state_context.zk.is_broker_registered(self.state_context.broker_id_to_restart) 138 | 139 | return self.run_with_timeout(func) 140 | 141 | def next(self): 142 | return DetachVolume(self.state_context) 143 | 144 | def __str__(self): 145 | return 'WaitBrokerStopped: waiting for broker {} to stop'.format(self.state_context.broker_id_to_restart) 146 | 147 | 148 | class DetachVolume(State): 149 | def run(self): 150 | self.state_context.ec_node.detach_volume() 151 | return True 152 | 153 | def next(self): 154 | return TerminateInstance(self.state_context) 155 | 156 | def __str__(self): 157 | return 'DetachVolume: detaching volume {} from broker {}'.format(self.state_context.ec_node.get_volume_id(), 158 | self.state_context.broker_id_to_restart) 159 | 160 | 161 | class TerminateInstance(State): 162 | def run(self): 163 | self.state_context.ec_node.terminate() 164 | return True 165 | 166 | def next(self): 167 | return WaitInstanceTerminated(self.state_context) 168 | 169 | def __str__(self): 170 | return 'TerminateInstance: terminating instance {}'.format(self.state_context.ec_node.get_ip()) 171 | 172 | 173 | class WaitInstanceTerminated(State): 174 | def run(self): 175 | def func(): 176 | return self.state_context.ec_node.is_terminated() 177 | 178 | return self.run_with_timeout(func) 179 | 180 | def next(self): 181 | return WaitVolumeAvailable(self.state_context) 182 | 183 | def __str__(self): 184 | return 'WaitInstanceTerminated: waiting for instance {} to be terminated'.format( 185 | 
self.state_context.ec_node.get_ip()) 186 | 187 | 188 | class WaitVolumeAvailable(State): 189 | def run(self): 190 | def func(): 191 | return self.state_context.ec_node.is_volume_available() 192 | 193 | return self.run_with_timeout(func) 194 | 195 | def next(self): 196 | return LaunchInstance(self.state_context) 197 | 198 | def __str__(self): 199 | return 'WaitVolumeAvailable: waiting for volume {} to be available'.format( 200 | self.state_context.ec_node.get_volume_id()) 201 | 202 | 203 | class LaunchInstance(State): 204 | def run(self): 205 | self.state_context.new_instance_id = self.state_context.ec2_node_launcher.launch() 206 | return True 207 | 208 | def next(self): 209 | return WaitVolumeAttached(self.state_context) 210 | 211 | 212 | class WaitVolumeAttached(State): 213 | def run(self): 214 | def func(): 215 | if self.state_context.ec_node.is_volume_in_use(): 216 | self.state_context.ec2_node_launcher.create_auto_recovery_alarm(self.state_context.new_instance_id) 217 | return True 218 | return False 219 | 220 | return self.run_with_timeout(func) 221 | 222 | def next(self): 223 | return WaitKafkaRunning(self.state_context) 224 | 225 | def __str__(self): 226 | return 'WaitVolumeAttached: waiting for volume {} to be attached'.format( 227 | self.state_context.ec_node.get_volume_id()) 228 | 229 | 230 | class WaitKafkaRunning(State): 231 | def run(self): 232 | def func(): 233 | return self.state_context.zk.is_broker_registered(self.state_context.broker_id_to_restart) 234 | 235 | return self.run_with_timeout(func) 236 | 237 | def next(self): 238 | return RegisterRollingRestart(self.state_context) 239 | 240 | def __str__(self): 241 | return 'WaitKafkaRunning: waiting broker {} is running'.format(self.state_context.broker_id_to_restart) 242 | 243 | 244 | class RegisterRollingRestart(State): 245 | def __init__(self, state_context): 246 | super(RegisterRollingRestart, self).__init__(state_context) 247 | self.cluster_is_healthy_from = 0 248 | 249 | def run(self): 250 | if len(self.state_context.restart_assignment) == 0: 251 | _LOG.info('Rolling restart is successfully finished') 252 | return True 253 | else: 254 | if utils.is_cluster_healthy(): 255 | if self.cluster_is_healthy_from == 0: 256 | self.cluster_is_healthy_from = time() 257 | else: 258 | _LOG.warning('Cluster is not healthy, waiting for it to recover') 259 | self.cluster_is_healthy_from = 0 260 | return False 261 | 262 | if time() - self.cluster_is_healthy_from >= self.state_context.cool_down: 263 | action = {'name': 'rolling_restart', 264 | 'restart_assignment': self.state_context.restart_assignment, 265 | 'overrides': self.state_context.cluster_config.get_overrides(), 266 | 'cool_down': self.state_context.cool_down} 267 | next_broker_id = self.state_context.broker_id_to_restart 268 | self.state_context.zk.register_action(action, broker_id=next_broker_id) 269 | return True 270 | return False 271 | 272 | def next(self): 273 | return None 274 | -------------------------------------------------------------------------------- /bubuku/features/swap_partitions.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from collections import namedtuple 3 | from operator import attrgetter 4 | from typing import List 5 | 6 | import requests 7 | 8 | from bubuku.broker import BrokerManager 9 | from bubuku.controller import Check 10 | from bubuku.features.rebalance import BaseRebalanceChange 11 | from bubuku.zookeeper import BukuExhibitor 12 | 13 | _LOG = 
logging.getLogger('bubuku.features.swap_partitions') 14 | 15 | TpData = namedtuple('_TpData', ('topic', 'partition', 'size', 'replicas')) 16 | 17 | 18 | class SwapPartitionsChange(BaseRebalanceChange): 19 | def __init__(self, zk: BukuExhibitor, swap_data_provider): 20 | self.zk = zk 21 | self.to_move = None 22 | self.swap_data_provider = swap_data_provider 23 | 24 | def run(self, current_actions): 25 | if self.should_be_paused(current_actions): 26 | _LOG.info("Pausing swap partitions change as there are conflicting actions: {}".format(current_actions)) 27 | return True 28 | # if there's a rebalance currently running - postpone current change 29 | if self.zk.is_rebalancing(): 30 | return True 31 | 32 | if self.to_move is None: 33 | slim_broker_id, fat_broker_id, gap, size_stats = self.swap_data_provider(self.zk) 34 | if slim_broker_id is None: 35 | _LOG.info('Can not find slim broker and fat broker during reassignment. Probably gap changed') 36 | return False 37 | # merge topics size stats to a single dict 38 | topics_stats = {} 39 | for broker_stats in size_stats.values(): 40 | for topic in broker_stats["topics"].keys(): 41 | if topic not in topics_stats: 42 | topics_stats[topic] = {} 43 | topics_stats[topic].update(broker_stats["topics"][topic]) 44 | 45 | # find partitions that are candidates to be swapped between "fat" and "slim" brokers 46 | swap_partition_candidates = self.__find_all_swap_candidates(fat_broker_id, slim_broker_id, topics_stats) 47 | 48 | # smallest partition from slim broker is the one we move to fat broker 49 | slim_broker_smallest_partition = min(swap_partition_candidates[slim_broker_id], key=attrgetter("size")) 50 | if not slim_broker_smallest_partition: 51 | _LOG.info("No partitions on slim broker(id: {}) found to swap".format(slim_broker_id)) 52 | return False 53 | _LOG.info("Slim broker(id: {}) partition to swap: {}".format( 54 | slim_broker_id, slim_broker_smallest_partition)) 55 | 56 | # find the best fitting fat broker partition to move to slim broker 57 | # (should be as much as possible closing the gap between brokers) 58 | fat_broker_swap_candidates = swap_partition_candidates[fat_broker_id] 59 | matching_swap_partition = self.__find_best_swap_candidate(fat_broker_swap_candidates, gap, 60 | slim_broker_smallest_partition.size) 61 | 62 | # if there is no possible swap that will decrease the gap - just do nothing 63 | if not matching_swap_partition: 64 | _LOG.info("No candidate from fat broker(id:{}) found to swap".format(fat_broker_id)) 65 | return False 66 | _LOG.info("Fat broker(id: {}) partition to swap: {}".format(fat_broker_id, matching_swap_partition)) 67 | # write rebalance-json to ZK; Kafka will read it and perform the partitions swap 68 | self.to_move = self.__create_rebalance_list(slim_broker_smallest_partition, slim_broker_id, 69 | matching_swap_partition, fat_broker_id) 70 | 71 | # if there is already a swap which was postponed - just execute it 72 | return not self.__perform_swap(self.to_move) 73 | 74 | def __perform_swap(self, rebalance_list): 75 | _LOG.info("Writing rebalance-json to ZK for partitions swap: {}".format(rebalance_list)) 76 | return self.zk.reallocate_partitions(rebalance_list) 77 | 78 | def __find_all_swap_candidates(self, fat_broker_id: int, slim_broker_id: int, topics_stats: dict) -> dict: 79 | swap_partition_candidates = {fat_broker_id: [], slim_broker_id: []} 80 | for topic, partition, replicas in self.zk.load_partition_assignment(): 81 | if topic not in topics_stats or str(partition) not in topics_stats[topic]: 82 | continue 
# we skip this partition as there is not data size stats for it 83 | 84 | if replicas[0] in (fat_broker_id, slim_broker_id): 85 | continue # Skip leadership transfer 86 | 87 | if fat_broker_id in replicas and slim_broker_id in replicas: 88 | continue # we skip this partition as it exists on both involved brokers 89 | 90 | for broker_id in [slim_broker_id, fat_broker_id]: 91 | if broker_id in replicas: 92 | swap_partition_candidates[broker_id].append( 93 | TpData(topic, partition, topics_stats[topic][str(partition)], replicas)) 94 | return swap_partition_candidates 95 | 96 | @staticmethod 97 | def __find_best_swap_candidate(candidates: list, brokers_gap: int, partition_to_swap_size: int) -> TpData: 98 | candidates.sort(key=attrgetter("size"), reverse=True) 99 | matching_swap_partition = None 100 | smallest_new_gap = brokers_gap 101 | for tp in candidates: 102 | new_gap = abs(brokers_gap - 2 * abs(tp.size - partition_to_swap_size)) 103 | if new_gap < smallest_new_gap: 104 | smallest_new_gap = new_gap 105 | matching_swap_partition = tp 106 | return matching_swap_partition 107 | 108 | def __create_rebalance_list(self, tp1: TpData, br1: int, tp2: TpData, br2: int) -> list: 109 | return [ 110 | (tp1.topic, tp1.partition, self.__replace_broker(tp1.replicas, br1, br2, tp2.replicas[0] == br2)), 111 | (tp2.topic, tp2.partition, self.__replace_broker(tp2.replicas, br2, br1, tp1.replicas[0] == br1)) 112 | ] 113 | 114 | def __replace_broker(self, replicas: list, broker_to_replace: int, replacement: int, was_leader: bool) -> list: 115 | rps = [x for x in replicas if x != broker_to_replace] 116 | if was_leader: 117 | return [replacement] + rps 118 | else: 119 | return rps + [replacement] 120 | 121 | def __str__(self): 122 | return 'SwapPartitions' 123 | 124 | 125 | def _load_disk_stats(zk: BukuExhibitor, api_port: int): 126 | size_stats = zk.get_disk_stats() 127 | if len(size_stats) < 2: 128 | _LOG.info("No size stats available, imbalance check cancelled") 129 | return None 130 | result = {} 131 | for broker_id, value in size_stats.items(): 132 | try: 133 | if api_port != -1: # For unit tests only 134 | host = zk.get_broker_address(broker_id) 135 | tmp = requests.get( 136 | 'http://{}:{}/api/disk_stats'.format(host, api_port), 137 | timeout=5).json() 138 | if any(a not in tmp for a in ['free_kb', 'used_kb']): 139 | continue 140 | value['disk'] = tmp 141 | value['host'] = host 142 | result[broker_id] = value 143 | except Exception as e: 144 | _LOG.error('Failed to load disk stats for broker {}. Skipping it'.format(broker_id), exc_info=e) 145 | 146 | return result 147 | 148 | 149 | def load_swap_data(zk: BukuExhibitor, api_port: int, gap: int) -> (str, str, int, dict): 150 | """ 151 | Finds brokers that could be used for gap of size gap. 
If rack awareness is enabled, the swap will be between two 152 | brokers in the same rack 153 | :param zk: Bubuku exhibitor 154 | :param api_port: bubuku api port 155 | :param gap: gap in kb to get information for 156 | :return: (slim_broker_id, fat_broker_id, calculated_gap, size_stats) or (None, None, calculated_gap, size_stats) 157 | """ 158 | size_stats = _load_disk_stats(zk, api_port) 159 | if not size_stats or len(size_stats) < 2: 160 | return None, None, None, size_stats 161 | sorted_stats = sorted(size_stats.items(), key=lambda tup: tup[1]["disk"]["free_kb"]) 162 | fat_broker, slim_broker = select_fat_slim_brokers(zk, sorted_stats) 163 | if fat_broker is None: 164 | return None, None, None, size_stats 165 | 166 | calculated_gap = slim_broker[1]['disk']['free_kb'] - fat_broker[1]['disk']['free_kb'] 167 | _LOG.info('Gap between {} and {} is {}, need to fix: {}'.format( 168 | fat_broker[0], slim_broker[0], calculated_gap, calculated_gap > gap)) 169 | if calculated_gap >= gap: 170 | return int(slim_broker[0]), int(fat_broker[0]), calculated_gap, size_stats 171 | return None, None, calculated_gap, size_stats 172 | 173 | 174 | def select_fat_slim_brokers(zk: BukuExhibitor, sorted_stats: list): 175 | racks = zk.get_broker_racks() 176 | if any([rack is None for rack in racks.values()]): 177 | return sorted_stats[0], sorted_stats[-1] 178 | for i in range(len(sorted_stats) - 1): 179 | fat_broker = sorted_stats[i] 180 | fat_rack = racks[int(fat_broker[0])] 181 | for j in range(len(sorted_stats) -1, i, -1): 182 | slim_broker = sorted_stats[j] 183 | slim_rack = racks[int(slim_broker[0])] 184 | if slim_rack == fat_rack: 185 | return fat_broker, slim_broker 186 | 187 | return None, None 188 | 189 | 190 | class CheckBrokersDiskImbalance(Check): 191 | def __init__(self, zk: BukuExhibitor, broker: BrokerManager, diff_threshold_kb: int, api_port: int): 192 | super().__init__(check_interval_s=900) 193 | self.zk = zk 194 | self.api_port = api_port 195 | self.broker = broker 196 | self.diff_threshold_kb = diff_threshold_kb 197 | 198 | def check(self): 199 | if self.broker.is_running_and_registered(): 200 | _LOG.info("Starting broker disk imbalance check") 201 | try: 202 | slim_broker_id, fat_broker_id, gap, size_stats = load_swap_data( 203 | self.zk, self.api_port, self.diff_threshold_kb) 204 | if slim_broker_id is not None: # All or nothing 205 | return SwapPartitionsChange( 206 | self.zk, 207 | lambda x: load_swap_data(x, self.api_port, self.diff_threshold_kb)) 208 | except Exception as e: 209 | _LOG.warn("Error occurred when performing disk imbalance check", exc_info=e) 210 | return None 211 | 212 | def __str__(self): 213 | return 'CheckBrokersDiskImbalance' 214 | -------------------------------------------------------------------------------- /bubuku/features/terminate.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import signal 3 | 4 | from bubuku.broker import BrokerManager 5 | from bubuku.controller import Controller, Change 6 | 7 | _LOG = logging.getLogger('bubuku.features.terminate') 8 | 9 | 10 | class StopBrokerChange(Change): 11 | def __init__(self, broker: BrokerManager): 12 | self.broker = broker 13 | 14 | def get_name(self): 15 | return 'stop' 16 | 17 | def __str__(self): 18 | return 'StopBrokerChange ({})'.format(self.get_name()) 19 | 20 | def can_run(self, current_actions): 21 | return all([action not in current_actions for action in ['start', 'restart', 'stop', 'complete_stop']]) 22 | 23 | def run(self, current_actions): 24 | 
_LOG.info('Stopping kafka process') 25 | self.broker.stop_kafka_process() 26 | return self.broker.has_leadership() 27 | 28 | def can_run_at_exit(self): 29 | return True 30 | 31 | 32 | class CompleteStopChange(Change): 33 | def __init__(self, broker: BrokerManager, controller: Controller): 34 | self.broker = broker 35 | self.controller = controller 36 | 37 | def get_name(self): 38 | return 'complete_stop' 39 | 40 | def __str__(self): 41 | return 'CompleteStopChange ({})'.format(self.get_name()) 42 | 43 | def can_run(self, current_actions): 44 | return all([action not in current_actions for action in ['start', 'restart', 'stop', 'complete_stop']]) 45 | 46 | def run(self, current_actions): 47 | _LOG.info('Stopping kafka process and the controller') 48 | self.controller.stop(StopBrokerChange(self.broker)) 49 | return False 50 | 51 | def can_run_at_exit(self): 52 | return False 53 | 54 | 55 | __REGISTERED = None 56 | 57 | 58 | def get_registration(): 59 | if not __REGISTERED: 60 | return None, None 61 | return __REGISTERED 62 | 63 | 64 | def register_terminate_on_interrupt(controller: Controller, broker: BrokerManager): 65 | global __REGISTERED 66 | 67 | def _sig_handler(*args, **kwargs): 68 | _LOG.info('Signal was caught, stopping controller gracefully') 69 | controller.stop(StopBrokerChange(broker)) 70 | 71 | _LOG.info('Registering signal handler') 72 | old_handler = signal.signal(signal.SIGTERM, _sig_handler) 73 | if old_handler: 74 | _LOG.warn('Old handler is removed: {}'.format(old_handler)) 75 | __REGISTERED = (controller, broker) 76 | -------------------------------------------------------------------------------- /bubuku/id_extractor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import logging 3 | import os 4 | import re 5 | from typing import Optional, List 6 | 7 | from bubuku.config import KafkaProperties 8 | from bubuku.zookeeper import BukuExhibitor 9 | 10 | _LOG = logging.getLogger('bubuku.id_generator') 11 | 12 | 13 | def _search_broker_id(lines: List[str]) -> Optional[str]: 14 | for line in lines: 15 | match = re.search('^broker\\.id=(\\d+)$', line.strip()) 16 | if match: 17 | return match.group(1) 18 | 19 | 20 | class BrokerIdExtractor(object): 21 | def __init__(self, zk: BukuExhibitor, kafka_properties: KafkaProperties): 22 | super().__init__() 23 | self.zk = zk 24 | self.kafka_properties = kafka_properties 25 | self.broker_id = None 26 | 27 | def get_broker_id(self): 28 | if self.broker_id: 29 | return self.broker_id 30 | 31 | meta_path = '{}/meta.properties'.format(self.kafka_properties.get_property('log.dirs')) 32 | while not os.path.isfile(meta_path): 33 | return None 34 | with open(meta_path) as f: 35 | self.broker_id = _search_broker_id(f.readlines()) 36 | return self.broker_id 37 | 38 | def is_registered(self): 39 | broker_id = self.get_broker_id() 40 | if broker_id: 41 | return self.zk.is_broker_registered(broker_id) 42 | return False 43 | -------------------------------------------------------------------------------- /bubuku/process.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | 4 | class KafkaProcess(object): 5 | def __init__(self, kafka_dir: str): 6 | self.process = None 7 | self.kafka_dir = kafka_dir 8 | 9 | def start(self, settings_file): 10 | if self.is_running(): 11 | raise Exception('Kafka process already started') 12 | self.process = subprocess.Popen([self.kafka_dir + "/bin/kafka-server-start.sh", settings_file]) 13 | 14 | 
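    # Polls the child process so its return code is refreshed, then reports whether the
    # started Kafka process is still alive; returns False if it was never started or has already exited.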
def is_running(self) -> bool: 15 | if self.process: 16 | self.process.poll() 17 | return self.process.returncode is None 18 | return False 19 | 20 | def stop_and_wait(self): 21 | if self.process is None: 22 | raise Exception('Process was not started') 23 | self.process.terminate() 24 | self.process.wait() 25 | self.process = None 26 | -------------------------------------------------------------------------------- /bubuku/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import subprocess 3 | 4 | import requests 5 | 6 | from bubuku.config import Config, KafkaProperties, load_config 7 | from bubuku.env_provider import EnvProvider 8 | from bubuku.zookeeper import BukuExhibitor 9 | 10 | _LOG = logging.getLogger('bubuku.utils') 11 | 12 | 13 | class CmdHelper(object): 14 | def get_disk_stats(self) -> (int, int): 15 | """ 16 | Returns total disk stats, 17 | :return: used_kb, free_kb 18 | """ 19 | disks = self.cmd_run("df -k | tail -n +2 | awk '{ print $3, $4 }'").split("\n") 20 | total_used = total_free = 0 21 | for disk in disks: 22 | parts = disk.split(" ") 23 | if len(parts) == 2: 24 | used, free = tuple(parts) 25 | total_used += int(used) 26 | total_free += int(free) 27 | return total_used, total_free 28 | 29 | def cmd_run(self, cmd): 30 | output = subprocess.check_output(cmd, shell=True) 31 | return output.decode("utf-8") 32 | 33 | 34 | def get_opt_broker_id(broker_id: str, config: Config, zk: BukuExhibitor, env_provider: EnvProvider, throw_on_missing=True) -> str: 35 | if not broker_id: 36 | kafka_properties = KafkaProperties(config.kafka_settings_template, '/tmp/tmp.props'.format(config.kafka_dir)) 37 | broker_id_manager = env_provider.create_broker_id_manager(zk, kafka_properties) 38 | broker_id = broker_id_manager.get_broker_id() 39 | _LOG.info('Will use broker_id {}'.format(broker_id)) 40 | running_brokers = zk.get_broker_ids() 41 | if broker_id not in running_brokers and throw_on_missing: 42 | raise Exception('Broker id {} is not registered ({})'.format(broker_id, running_brokers)) 43 | return broker_id 44 | 45 | 46 | def prepare_configs(): 47 | config = load_config() 48 | _LOG.info('Using config: {}'.format(config)) 49 | env_provider = EnvProvider.create_env_provider(config) 50 | return config, env_provider 51 | 52 | 53 | def is_cluster_healthy(): 54 | config = load_config() 55 | try: 56 | response = requests.get('http://{}:{}/api/metrics'.format('localhost', '8080')) 57 | resp_json = response.json() 58 | if not resp_json['metrics']: 59 | return False 60 | for metrics in resp_json['metrics']: 61 | metric = metrics['metrics'] 62 | if metric: 63 | if metric['PreferredReplicaImbalance'] > 0: 64 | return False 65 | if metric['OfflinePartitions'] > 0: 66 | return False 67 | if metric['UnderReplicatedPartitions'] > 0: 68 | return False 69 | else: 70 | return False 71 | return True 72 | except Exception as e: 73 | _LOG.error('Failed to get cluster state', exc_info=e) 74 | return False 75 | 76 | 77 | def get_max_bytes_in(): 78 | response = requests.get('http://{}:{}/api/metrics'.format('localhost', '8080')) 79 | resp_json = response.json() 80 | if not resp_json['metrics']: 81 | raise Exception("Can't fetch metrics to note current cluster state. 
Please try again") 82 | return max([int(metric['metrics']['BytesIn']) for metric in resp_json["metrics"]]) 83 | -------------------------------------------------------------------------------- /bubuku/zookeeper/exhibitor.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import random 3 | 4 | import requests 5 | from requests import RequestException 6 | 7 | from bubuku.zookeeper import AddressListProvider 8 | 9 | _LOG = logging.getLogger('bubuku.zookeeper.exhibitor') 10 | 11 | 12 | class ExhibitorAddressProvider(AddressListProvider): 13 | def __init__(self, initial_list_provider): 14 | self.initial_list_provider = initial_list_provider 15 | self.exhibitors = [] 16 | 17 | def get_latest_address(self) -> (list, int): 18 | json_ = self._query_exhibitors(self.exhibitors) 19 | if not json_: 20 | self.exhibitors = self.initial_list_provider() 21 | json_ = self._query_exhibitors(self.exhibitors) 22 | if isinstance(json_, dict) and 'servers' in json_ and 'port' in json_: 23 | self.exhibitors = json_['servers'] 24 | return sorted(json_['servers']), int(json_['port']) 25 | return None 26 | 27 | def _query_exhibitors(self, exhibitors): 28 | if not exhibitors: 29 | return None 30 | random.shuffle(exhibitors) 31 | for host in exhibitors: 32 | url = 'http://{}:{}{}'.format(host, 8181, '/exhibitor/v1/cluster/list') 33 | try: 34 | response = requests.get(url, timeout=3.1, headers={'Accept': 'application/json'}) 35 | return response.json() 36 | except RequestException as e: 37 | _LOG.warning('Failed to query zookeeper list information from {}'.format(url), exc_info=e) 38 | except ConnectionError as e: 39 | _LOG.warning('Failed to connect to zookeeper instance {}'.format(url), exc_info=e) 40 | except Exception as e: 41 | _LOG.warning('Unknown error connecting to zookeeper instance {}'.format(url), exc_info=e) 42 | return None 43 | -------------------------------------------------------------------------------- /cli_docs/cli.md: -------------------------------------------------------------------------------- 1 | # Bubuku command line interface 2 | 3 | Bubuku provides a command line tool `bubuku-cli` which should be used directly on the instance. Available commands: 4 | 5 | #### preferred-replica-election 6 | ``` 7 | Usage: bubuku-cli preferred-replica-election [OPTIONS] 8 | 9 | Do preferred replica election, as the command line tool from Kafka has a 10 | number of limitations. Only partitions that are improperly allocated will 11 | be affected. If the size of the resulting json is too big, it will be 12 | split into several parts, which will be executed one after another. 13 | 14 | Options: 15 | --dry-run Do not apply the changes. Instead just prepare json 16 | file(s) 17 | --max-json-size INTEGER Maximum size of json data in bytes to write to zk 18 | [default: 512000] 19 | --help Show this message and exit. 20 | ``` 21 | 22 | #### restart 23 | ``` 24 | Usage: bubuku-cli restart [OPTIONS] 25 | 26 | Restart kafka instance 27 | 28 | Options: 29 | --broker TEXT Broker id to restart. By default current broker id is 30 | restarted 31 | --help Show this message and exit.
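# Hypothetical invocation (the broker id is illustrative, not taken from a real cluster):
#   bubuku-cli restart --broker 123456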
32 | ``` 33 | 34 | #### rolling-restart 35 | ``` 36 | Usage: bubuku-cli rolling-restart [OPTIONS] 37 | 38 | Rolling restart of Kafka cluster 39 | 40 | Options: 41 | --image-tag TEXT Docker image to run Kafka broker 42 | --instance-type TEXT AWS instance type to run Kafka broker on 43 | --scalyr-key TEXT Scalyr account key 44 | --scalyr-region TEXT Scalyr region to use 45 | --kms-key-id TEXT Kms key id to decrypt data with 46 | --cool-down INTEGER Number of seconds to wait before passing the restart 47 | task to another broker, after cluster is stable 48 | [default: 20] 49 | --help Show this message and exit. 50 | ``` 51 | 52 | #### rebalance 53 | ``` 54 | Usage: bubuku-cli rebalance [OPTIONS] 55 | 56 | Run rebalance process on one of the brokers. If rack-awareness is enabled, 57 | replicas will only be moved to other brokers in the same rack 58 | 59 | Options: 60 | --broker TEXT Broker instance on which to perform rebalance. By 61 | default, any free broker will start it 62 | --empty_brokers TEXT Comma-separated list of brokers to empty. All 63 | partitions will be moved to other brokers 64 | --exclude_topics TEXT Comma-separated list of topics to exclude from 65 | rebalance 66 | --bin-packing Use bin packing approach instead of one-way 67 | processing 68 | --parallelism INTEGER Number of partitions to move in a single rebalance 69 | step [default: 1] 70 | --throttle INTEGER Upper bound on bandwidth (in bytes/sec) used for 71 | rebalance 72 | --remove-throttle Don't trigger rebalance but remove throttling 73 | configuration from all the brokers and topics 74 | --help Show this message and exit. 75 | ``` 76 | 77 | #### migrate 78 | ``` 79 | Usage: bubuku-cli migrate [OPTIONS] 80 | 81 | Replace one broker with another for all partitions 82 | 83 | Options: 84 | --from TEXT List of brokers to migrate from (separated with ",") 85 | --to TEXT List of brokers to migrate to (separated with ",") 86 | --shrink Whether or not to shrink replaced broker ids from 87 | partition assignment [default: False] 88 | --broker TEXT Optional broker id to execute check on 89 | --throttle INTEGER Upper bound on bandwidth (in bytes/sec) used for 90 | reassigning partitions 91 | --parallelism INTEGER Number of partitions to move in a single migration 92 | step [default: 1] 93 | --remove-throttle Don't trigger rebalance but remove throttling 94 | configuration from all the brokers and topics 95 | --help Show this message and exit. 96 | ``` 97 | 98 | #### swap_fat_slim 99 | ``` 100 | Usage: bubuku-cli swap_fat_slim [OPTIONS] 101 | 102 | Move one partition from fat broker to slim one 103 | 104 | Options: 105 | --threshold INTEGER Threshold in kb to run swap [default: 100000] 106 | --help Show this message and exit. 107 | ``` 108 | 109 | #### actions list 110 | ``` 111 | Usage: bubuku-cli actions list [OPTIONS] 112 | 113 | List all the actions on broker(s) 114 | 115 | Options: 116 | --broker TEXT Broker id to list actions on. By default all brokers are 117 | enumerated 118 | --help Show this message and exit. 119 | ``` 120 | 121 | #### actions delete 122 | ``` 123 | Usage: bubuku-cli actions delete [OPTIONS] 124 | 125 | Remove all actions of specified type on broker(s) 126 | 127 | Options: 128 | --action TEXT Action to delete 129 | --broker TEXT Broker id to delete actions on. By default actions are 130 | deleted on all brokers 131 | --help Show this message and exit.
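# Hypothetical invocations (the action name and broker id are illustrative, not taken from a real cluster):
#   bubuku-cli actions delete --action restart --broker 123456
#   bubuku-cli actions delete --action rebalance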
132 | ``` 133 | 134 | #### stats 135 | ``` 136 | Usage: bubuku-cli stats [OPTIONS] 137 | 138 | Display statistics about brokers 139 | 140 | Options: 141 | --help Show this message and exit. 142 | ``` 143 | 144 | #### validate replication 145 | ``` 146 | Usage: bubuku-cli validate replication [OPTIONS] 147 | 148 | Returns all partitions whose ISR size differs from the replication factor 149 | or have not registered broker ids 150 | 151 | Options: 152 | --factor INTEGER Replication factor [default: 3] 153 | --help Show this message and exit. 154 | ``` 155 | 156 | -------------------------------------------------------------------------------- /cli_docs/generate_cli_docs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import click 4 | 5 | from bubuku import cli 6 | 7 | _HEADER = """# Bubuku command line interface 8 | 9 | Bubuku provides a command line tool `bubuku-cli` which should be used directly on the instance. Available commands: 10 | 11 | """ 12 | 13 | 14 | def generate_command_docs(name, command, md_file, parent_ctx=None): 15 | ctx = click.Context(command, info_name=name, parent=parent_ctx) 16 | sub_commands = getattr(command, "commands", {}) 17 | 18 | # generate docs only for actual commands (not command groups) 19 | if len(sub_commands) == 0: 20 | cmd_path = ctx.command_path.split() 21 | cmd_path.pop(0) 22 | md_file.write("#### {}\n".format(" ".join(cmd_path))) 23 | md_file.write("```\n{}\n```\n\n".format(ctx.get_help())) 24 | else: 25 | # if command has sub-commands - recursively generate docs for all sub-commands 26 | for sub_cmd_name, sub_command in sub_commands.items(): 27 | generate_command_docs(sub_cmd_name, sub_command, md_file, ctx) 28 | 29 | 30 | if __name__ == '__main__': 31 | print("Generating 'cli.md'...") 32 | 33 | with open("cli.md", "w") as md_file: 34 | md_file.write(_HEADER) 35 | generate_command_docs("bubuku-cli", cli.cli, md_file) 36 | 37 | print("Done") 38 | -------------------------------------------------------------------------------- /cli_docs/generate_cli_docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd `dirname ${0}` 4 | PYTHONPATH=.. python3 generate_cli_docs.py 5 | cd - > /dev/null 6 | -------------------------------------------------------------------------------- /delivery.yaml: -------------------------------------------------------------------------------- 1 | version: "2017-09-20" 2 | pipeline: 3 | - id: build 4 | type: script 5 | vm_config: 6 | type: linux 7 | size: large 8 | image: cdp-runtime/python-3.9 9 | commands: 10 | - desc: Run tests 11 | cmd: | 12 | pip3 install -r requirements.txt 13 | python3 setup.py test 14 | 15 | - desc: Build docker images for different architectures 16 | cmd: | 17 | IMAGE="container-registry-test.zalando.net/aruha/bubuku-appliance:oss-${CDP_BUILD_VERSION}" 18 | 19 | # create a Buildkit builder with CDP specific configuration 20 | docker buildx create \ 21 | --config /etc/cdp-buildkitd.toml \ 22 | --driver-opt network=host \ 23 | --name cdpbuildx \ 24 | --bootstrap \ 25 | --use 26 | 27 | # (1) build image for AMD64 and ARM64 and push it to the Zalando Container Registry 28 | docker buildx build \ 29 | --platform linux/amd64,linux/arm64 \ 30 | -t ${IMAGE} \ 31 | --push \ 32 | . 
33 | 34 | # (2) promote it from `container-registry-test` to `container-registry`, marking it production-ready 35 | cdp-promote-image ${IMAGE} 36 | 37 | - id: push-pierone-arm64 38 | type: script 39 | when: 40 | event: push 41 | vm_config: 42 | type: linux 43 | image: cdp-runtime/base 44 | commands: 45 | - desc: Push ARM64 image to PierOne 46 | cmd: | 47 | IMAGE="container-registry.zalando.net/aruha/bubuku-appliance:oss-${CDP_BUILD_VERSION}" 48 | PIERONE_IMAGE="registry-write.opensource.zalan.do/aruha/bubuku-appliance:oss-${CDP_BUILD_VERSION}-arm64" 49 | 50 | docker pull --platform linux/arm64 $IMAGE 51 | docker tag $IMAGE $PIERONE_IMAGE 52 | docker push $PIERONE_IMAGE 53 | 54 | - id: push-pierone-amd64 55 | type: script 56 | when: 57 | event: push 58 | vm_config: 59 | type: linux 60 | image: cdp-runtime/base 61 | commands: 62 | - desc: Push AMD64 image to PierOne 63 | cmd: | 64 | IMAGE="container-registry.zalando.net/aruha/bubuku-appliance:oss-${CDP_BUILD_VERSION}" 65 | PIERONE_IMAGE="registry-write.opensource.zalan.do/aruha/bubuku-appliance:oss-${CDP_BUILD_VERSION}-amd64" 66 | 67 | docker pull --platform linux/amd64 $IMAGE 68 | docker tag $IMAGE $PIERONE_IMAGE 69 | docker push $PIERONE_IMAGE 70 | 71 | notifications: 72 | - channel: google_chat 73 | rooms: 74 | - AAAAmX_hkRQ 75 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | services: 3 | 4 | bubuku: 5 | build: . 6 | depends_on: 7 | - zookeeper 8 | environment: 9 | BUBUKU_MODE: "local" 10 | HEALTH_PORT: "8080" 11 | BUKU_FEATURES: "restart_on_exhibitor,rebalance_on_brokers_change,graceful_terminate" 12 | 13 | zookeeper: 14 | image: wurstmeister/zookeeper:3.4.6 15 | -------------------------------------------------------------------------------- /docker/download_kafka.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | SCALA_VERSION=${1} 4 | KAFKA_VERSION=${2} 5 | KAFKA_DIR=${3} 6 | JOLOKIA_VERSION=${4} 7 | 8 | set -xe 9 | 10 | curl -f "https://archive.apache.org/dist/kafka/${KAFKA_VERSION}/kafka_${SCALA_VERSION}-${KAFKA_VERSION}.tgz" > "/tmp/kafka_release.tgz" 11 | tar xf /tmp/kafka_release.tgz -C /opt 12 | rm -f /tmp/kafka_release.tgz 13 | mv /opt/kafka_${SCALA_VERSION}-${KAFKA_VERSION} $KAFKA_DIR 14 | 15 | curl -fL "http://search.maven.org/remotecontent?filepath=org/jolokia/jolokia-jvm/${JOLOKIA_VERSION}/jolokia-jvm-${JOLOKIA_VERSION}-agent.jar" > "/opt/jolokia-jvm-agent.jar" 16 | 17 | -------------------------------------------------------------------------------- /docker/log4j.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | log4j.rootLogger=WARN, stdout 17 | 18 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 19 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 20 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss.SSS} %p %m (%c)%n 21 | 22 | log4j.logger.kafka=WARN 23 | log4j.logger.kafka.server.FetchManager=ERROR 24 | -------------------------------------------------------------------------------- /docker/server.properties: -------------------------------------------------------------------------------- 1 | log.dirs=/data/kafka-logs 2 | listeners=PLAINTEXT://:9092 3 | advertised.listeners=PLAINTEXT://:9092 4 | auto.create.topics.enable=false 5 | delete.topic.enable=true 6 | auto.leader.rebalance.enable=true 7 | leader.imbalance.check.interval.seconds=100 8 | unclean.leader.election.enable=false 9 | min.insync.replicas=2 10 | reserved.broker.max.id=67108864 11 | broker.id.generation.enable=true 12 | ### from http://kafka.apache.org/documentation.html#prodconfig 13 | 14 | # Replication configurations 15 | num.replica.fetchers=8 16 | replica.fetch.max.bytes=2097152 17 | replica.fetch.wait.max.ms=500 18 | replica.high.watermark.checkpoint.interval.ms=5000 19 | replica.socket.timeout.ms=30000 20 | replica.socket.receive.buffer.bytes=65536 21 | replica.lag.time.max.ms=10000 22 | replica.lag.max.messages=4000 23 | replica.selector.class=org.apache.kafka.common.replica.RackAwareReplicaSelector 24 | 25 | controller.socket.timeout.ms=30000 26 | controller.message.queue.size=10 27 | 28 | # Log configuration 29 | #num.partitions=8 30 | message.max.bytes=2098152 31 | #auto.create.topics.enable=true 32 | log.index.interval.bytes=4096 33 | log.index.size.max.bytes=10485760 34 | log.retention.hours=168 35 | log.flush.interval.ms=10000 36 | log.flush.interval.messages=20000 37 | log.flush.scheduler.interval.ms=2000 38 | log.roll.hours=168 39 | log.retention.check.interval.ms=300000 40 | log.segment.bytes=1073741824 41 | log.cleaner.max.compaction.lag.ms=1209600000 42 | 43 | # ZK configuration 44 | zookeeper.connection.timeout.ms=6000 45 | zookeeper.sync.time.ms=2000 46 | 47 | # Socket server configuration 48 | num.io.threads=16 49 | num.network.threads=16 50 | socket.request.max.bytes=104857600 51 | socket.receive.buffer.bytes=1048576 52 | socket.send.buffer.bytes=1048576 53 | queued.max.requests=32 54 | fetch.purgatory.purge.interval.requests=100 55 | producer.purgatory.purge.interval.requests=100 56 | 57 | #migration 58 | inter.broker.protocol.version=3.1 59 | log.message.format.version=3.1 60 | 61 | # never expire consumer offsets 62 | offsets.retention.minutes=52560000 63 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | botocore>=1.17.30 2 | kazoo>=2.8.0 3 | boto3>=1.14.30 4 | requests>=2.24.0 5 | click>=7.1.2 6 | pyyaml>=5.3.1 7 | netaddr>=0.8.0 8 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import inspect 5 
| import os 6 | import sys 7 | 8 | import setuptools 9 | from setuptools import setup 10 | from setuptools.command.test import test 11 | from distutils.core import Command 12 | 13 | if sys.version_info < (3, 5, 0): 14 | sys.stderr.write('FATAL: Bubuku needs to be run with Python 3.5+\n') 15 | sys.exit(1) 16 | 17 | __location__ = os.path.join(os.getcwd(), os.path.dirname(inspect.getfile(inspect.currentframe()))) 18 | 19 | 20 | def read_version(package): 21 | with open(os.path.join(package, '__init__.py'), 'r') as fd: 22 | for line in fd: 23 | if line.startswith('__version__ = '): 24 | return line.split()[-1].strip().strip("'") 25 | 26 | 27 | NAME = 'bubuku' 28 | MAIN_PACKAGE = 'bubuku' 29 | VERSION = read_version(MAIN_PACKAGE) 30 | DESCRIPTION = 'AWS support for kafka broker' 31 | LICENSE = 'Apache License 2.0' 32 | URL = 'https://github.com/zalando-incubator/bubuku' 33 | AUTHOR = 'Dmitry Sorokin' 34 | EMAIL = 'dmitriy.sorokin@zalando.de' 35 | KEYWORDS = 'aws kafka supervisor' 36 | 37 | # Add here all kinds of additional classifiers as defined under 38 | # https://pypi.python.org/pypi?%3Aaction=list_classifiers 39 | CLASSIFIERS = [ 40 | 'Development Status :: 3 - Alpha', 41 | 'Environment :: Console', 42 | 'Intended Audience :: Developers', 43 | 'Intended Audience :: System Administrators', 44 | 'License :: OSI Approved :: Apache Software License', 45 | 'Operating System :: POSIX :: Linux', 46 | 'Programming Language :: Python', 47 | 'Programming Language :: Python :: 3.4', 48 | 'Programming Language :: Python :: Implementation :: CPython', 49 | ] 50 | 51 | CONSOLE_SCRIPTS = [ 52 | 'bubuku-daemon = bubuku.daemon:main', 53 | 'bubuku-cli = bubuku.cli:cli' 54 | ] 55 | 56 | 57 | class DockerUpCommand(Command): 58 | description = "Start up docker compose with 3 bubuku and 1 zookeeper instances" 59 | user_options = [ 60 | ('bubuku-scale=', None, 'Specify number of bubuku instances') 61 | ] 62 | 63 | def initialize_options(self): 64 | self.bubuku_scale = 3 65 | 66 | def finalize_options(self): 67 | pass 68 | 69 | def run(self): 70 | os.system('docker-compose up -d --build && docker-compose scale bubuku=' + str(self.bubuku_scale)) 71 | 72 | 73 | class DockerDownCommand(Command): 74 | description = "Stop docker compose" 75 | user_options = [] 76 | 77 | def initialize_options(self): 78 | pass 79 | 80 | def finalize_options(self): 81 | pass 82 | 83 | def run(self): 84 | os.system('docker-compose down') 85 | 86 | 87 | class PyTest(test): 88 | def run_tests(self): 89 | try: 90 | import pytest 91 | except: 92 | raise RuntimeError('py.test is not installed, run: pip install pytest') 93 | params = {'args': self.test_args} 94 | errno = pytest.main(**params) 95 | sys.exit(errno) 96 | 97 | 98 | def read(fname): 99 | with open(os.path.join(__location__, fname)) as f: 100 | return f.read() 101 | 102 | 103 | def setup_package(): 104 | command_options = {'test': {'test_suite': ('setup.py', 'tests')}} 105 | 106 | setup( 107 | name=NAME, 108 | version=VERSION, 109 | url=URL, 110 | description=DESCRIPTION, 111 | author=AUTHOR, 112 | author_email=EMAIL, 113 | license=LICENSE, 114 | keywords=KEYWORDS, 115 | classifiers=CLASSIFIERS, 116 | test_suite='tests', 117 | packages=setuptools.find_packages(exclude=['tests', 'tests.*']), 118 | install_requires=[req for req in read('requirements.txt').split('\\n') if req != ''], 119 | cmdclass={'test': PyTest, 'docker_up': DockerUpCommand, 'docker_down': DockerDownCommand}, 120 | tests_require=['pytest-cov', 'pytest'], 121 | command_options=command_options, 122 | entry_points={ 
123 | 'console_scripts': CONSOLE_SCRIPTS, 124 | }, 125 | ) 126 | 127 | if __name__ == '__main__': 128 | setup_package() 129 | -------------------------------------------------------------------------------- /tests/test_broker.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import MagicMock 3 | 4 | from bubuku.broker import BrokerManager, LeaderElectionInProgress, StartupTimeout 5 | from bubuku.process import KafkaProcess 6 | from test_config import build_test_properties 7 | 8 | zk_fake_host = 'zk_host:8181/path' 9 | 10 | 11 | class FakeProcessManager(KafkaProcess): 12 | def __init__(self): 13 | super().__init__('') 14 | self.running = False 15 | 16 | def start(self, settings_file): 17 | if self.running: 18 | raise Exception('Start second time') 19 | self.running = True 20 | 21 | def stop_and_wait(self): 22 | self.running = False 23 | 24 | def is_running(self) -> bool: 25 | return self.running 26 | 27 | 28 | def _prepare_for_start_fail(broker_ids, leader, isr): 29 | exhibitor = MagicMock() 30 | exhibitor.get_broker_ids.return_value = broker_ids 31 | exhibitor.load_partition_states.return_value = [ 32 | ('t0', 0, {'leader': int(leader), 'isr': [int(i) for i in isr]})] 33 | 34 | id_manager = MagicMock() 35 | id_manager.get_broker_id = lambda: '1' 36 | kafka_props = build_test_properties() 37 | 38 | broker = BrokerManager(FakeProcessManager(), exhibitor, id_manager, kafka_props, 39 | StartupTimeout.build({'type': 'linear'})) 40 | 41 | kafka_props.set_property('unclean.leader.election.enable', 'false') 42 | return kafka_props, broker 43 | 44 | 45 | class TestBroker(unittest.TestCase): 46 | def test_broker_checks_death(self): 47 | exhibitor = MagicMock() 48 | states = [2, 2] 49 | 50 | def _load_states(topics=None): 51 | for idx in range(0, len(states)): 52 | states[idx] -= 1 53 | return [ 54 | ('t1', 0, {'leader': states[0], 'isr': [1, 3] if states[0] >= 1 else [3]}), 55 | ('t2', 0, {'leader': states[1], 'isr': [1, 3] if states[1] >= 1 else [3]}) 56 | ] 57 | 58 | exhibitor.load_partition_states = _load_states 59 | 60 | id_manager = MagicMock() 61 | id_manager.get_broker_id = lambda: '1' 62 | kafka_props = build_test_properties() 63 | kafka_props.set_property('unclean.leader.election.enable', 'true') 64 | 65 | manager = BrokerManager(FakeProcessManager(), exhibitor, id_manager, kafka_props, 66 | StartupTimeout.build({'type': 'linear'})) 67 | 68 | assert not manager.has_leadership() 69 | 70 | kafka_props.set_property('unclean.leader.election.enable', 'false') 71 | assert manager.has_leadership() 72 | assert not manager.has_leadership() 73 | 74 | def test_broker_start_success_isr(self): 75 | kafka_props, broker = _prepare_for_start_fail(['1', '2'], 1, [3, 4]) 76 | # suppose that leader exists, but isr - not 77 | broker.start_kafka_process(zk_fake_host) 78 | 79 | def test_broker_start_fail_isr(self): 80 | kafka_props, broker = _prepare_for_start_fail(['1', '2'], 3, [4, 2]) 81 | # suppose that leader is not present 82 | try: 83 | broker.start_kafka_process(zk_fake_host) 84 | assert False, 'broker 2 must be in leaders, it must be impossible to start 1' 85 | except LeaderElectionInProgress: 86 | pass 87 | 88 | def test_broker_start_fail_leader(self): 89 | kafka_props, broker = _prepare_for_start_fail(['1', '2'], 3, [1, 5]) 90 | # suppose that broker is free to start 91 | try: 92 | broker.start_kafka_process(zk_fake_host) 93 | assert False, 'Broker must not start in case where it''s possible to change leader' 94 | except 
LeaderElectionInProgress: 95 | pass 96 | 97 | def test_broker_start_success_no_leader_candidate(self): 98 | kafka_props, broker = _prepare_for_start_fail(['1', '2'], 3, [4, 5]) 99 | # suppose that broker is free to start 100 | broker.start_kafka_process(zk_fake_host) 101 | 102 | def test_broker_start_success_unclean_1(self): 103 | kafka_props, broker = _prepare_for_start_fail(['1', '2'], 1, [1, 2]) 104 | kafka_props.delete_property('unclean.leader.election.enable') 105 | # suppose that broker is free to start 106 | broker.start_kafka_process(zk_fake_host) 107 | 108 | def test_broker_start_success_unclean_2(self): 109 | kafka_props, broker = _prepare_for_start_fail(['1', '2'], 1, [1, 2]) 110 | kafka_props.set_property('unclean.leader.election.enable', 'true') 111 | # suppose that broker is free to start 112 | broker.start_kafka_process(zk_fake_host) 113 | 114 | def test_broker_start_fail_no_zk_conn(self): 115 | kafka_props, broker = _prepare_for_start_fail(['1', '2'], 3, [1, 5]) 116 | try: 117 | broker.start_kafka_process(zk_fake_host) 118 | assert False, 'Broker must not start in case there is no connection to zk' 119 | except Exception as e: 120 | error_msg = str(e) 121 | assert error_msg != 'No connection to zookeeper' 122 | -------------------------------------------------------------------------------- /tests/test_broker_id_generator.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from bubuku.id_extractor import _search_broker_id 4 | 5 | 6 | class TestBrokerIdExtractor(unittest.TestCase): 7 | def test_match_valid(self): 8 | assert '123534' == _search_broker_id(['broker.id=123534']) 9 | assert '123534' == _search_broker_id(['\tbroker.id=123534']) 10 | assert '123534' == _search_broker_id(['\tbroker.id=123534\n']) 11 | assert '123534' == _search_broker_id(['broker.id=123534 \n\r']) 12 | assert '123534' == _search_broker_id(['\tbroker.id=123534 \r']) 13 | assert '123534' == _search_broker_id(['xbroker.id=1', 'broker.id=123534']) 14 | assert '123534' == _search_broker_id(['broker.id=123534', 'boker.id=123534']) 15 | 16 | def test_match_invalid(self): 17 | assert _search_broker_id([]) is None 18 | assert _search_broker_id(['broker_id=123534']) is None 19 | assert _search_broker_id(['xbroker.id=1', 'broker.id=12f3534']) is None 20 | assert _search_broker_id(['bruker.id=123534', 'boker.id=123534']) is None 21 | -------------------------------------------------------------------------------- /tests/test_check_time_period.py: -------------------------------------------------------------------------------- 1 | from bubuku.controller import Check, Change 2 | from time import sleep 3 | 4 | 5 | def test_check_time_period(): 6 | test_check = _TestCheck() 7 | 8 | assert test_check.check_if_time() is not None # first time it should always run 9 | assert test_check.check_if_time() is None # time has not come yet 10 | 11 | sleep(1) 12 | assert test_check.time_till_check() < 0 # time to run the check 13 | assert test_check.check_if_time() is not None # should run the check 14 | assert 0.0 < test_check.time_till_check() < 1 # there's still some time before the check can be run again 15 | 16 | 17 | class _TestCheck(Check): 18 | def __init__(self): 19 | super().__init__(check_interval_s=0.5) 20 | 21 | def check(self) -> Change: 22 | return Change() 23 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | from 
bubuku.cli import _print_table, _dump_replica_assignment_as_json 2 | 3 | 4 | def test_print_table(): 5 | lines = [] 6 | _print_table([{'Test': 1, 'Test2': '123456789'}, {'Test2': 'Test1', 'Test3': None}], lambda x: lines.append(x)) 7 | assert len(lines) == 3 8 | assert lines[0] == 'Test Test2 Test3' 9 | assert lines[1] == '1 123456789 ' 10 | assert lines[2] == ' Test1 None ' 11 | 12 | 13 | def test_dump_replica_assignment(): 14 | assert _dump_replica_assignment_as_json([('topic-a', "1")]) \ 15 | == '''{"version":1,"partitions":[{"topic":"topic-a","partition":1}]}''' 16 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tempfile import mkstemp 3 | 4 | from bubuku.config import KafkaProperties, load_config, _load_timeout_dict 5 | 6 | __PROPS = """ 7 | log.dirs=/data/kafka-logs 8 | auto.create.topics.enable=false 9 | delete.topic.enable=true 10 | auto.leader.rebalance.enable=true 11 | leader.imbalance.check.interval.seconds=100 12 | 13 | ### from http://kafka.apache.org/documentation.html#prodconfig 14 | 15 | # Replication configurations 16 | num.replica.fetchers=4 17 | replica.fetch.max.bytes=1048576 18 | replica.fetch.wait.max.ms=500 19 | replica.high.watermark.checkpoint.interval.ms=5000 20 | replica.socket.timeout.ms=30000 21 | replica.socket.receive.buffer.bytes=65536 22 | replica.lag.time.max.ms=10000 23 | replica.lag.max.messages=4000 24 | 25 | controller.socket.timeout.ms=30000 26 | controller.message.queue.size=10 27 | 28 | # Log configuration 29 | #num.partitions=8 30 | #message.max.bytes=1000000 31 | #auto.create.topics.enable=true 32 | log.index.interval.bytes=4096 33 | log.index.size.max.bytes=10485760 34 | log.retention.hours=168 35 | log.flush.interval.ms=10000 36 | log.flush.interval.messages=20000 37 | log.flush.scheduler.interval.ms=2000 38 | log.roll.hours=168 39 | log.retention.check.interval.ms=300000 40 | log.segment.bytes=1073741824 41 | 42 | # ZK configuration 43 | zookeeper.connection.timeout.ms=6000 44 | zookeeper.sync.time.ms=2000 45 | 46 | # Socket server configuration 47 | num.io.threads=8 48 | num.network.threads=8 49 | socket.request.max.bytes=104857600 50 | socket.receive.buffer.bytes=1048576 51 | socket.send.buffer.bytes=1048576 52 | queued.max.requests=16 53 | fetch.purgatory.purge.interval.requests=100 54 | producer.purgatory.purge.interval.requests=100 55 | """ 56 | 57 | __FNAME = '' 58 | 59 | 60 | def build_test_properties(): 61 | __create_kafak_props_file() 62 | return KafkaProperties(__FNAME, __FNAME) 63 | 64 | 65 | def __create_kafak_props_file(): 66 | global __FNAME 67 | if not __FNAME: 68 | _, __FNAME = mkstemp(text=True) 69 | with open(__FNAME, 'w') as fd: 70 | fd.write(__PROPS) 71 | 72 | 73 | __create_kafak_props_file() 74 | 75 | 76 | def test_parse_kafka_properties(): 77 | props = build_test_properties() 78 | 79 | assert props.get_property('log.retention.hours') == '168' 80 | 81 | 82 | def test_update_kafka_properties(): 83 | props = build_test_properties() 84 | 85 | assert '100' == props.get_property('producer.purgatory.purge.interval.requests') 86 | 87 | props.set_property('producer.purgatory.purge.interval.requests', '180') 88 | 89 | assert '180' == props.get_property('producer.purgatory.purge.interval.requests') 90 | 91 | props.dump() 92 | 93 | props2 = build_test_properties() 94 | 95 | assert '180' == props2.get_property('producer.purgatory.purge.interval.requests') 96 | 97 | 98 | def 
test_zk_prefix_replacement(): 99 | if os.getenv('ZOOKEEPER_PREFIX', None): 100 | os.unsetenv('ZOOKEEPER_PREFIX') 101 | assert load_config().zk_prefix == '/' 102 | 103 | os.environ['ZOOKEEPER_PREFIX'] = '/' 104 | assert load_config().zk_prefix == '/' 105 | 106 | os.environ['ZOOKEEPER_PREFIX'] = 'test' 107 | assert load_config().zk_prefix == '/test' 108 | 109 | os.environ['ZOOKEEPER_PREFIX'] = '/test' 110 | assert load_config().zk_prefix == '/test' 111 | 112 | 113 | def test_parse_timeout(): 114 | assert {'type': 'linear', 'initial': '300', 'step': '60'} == _load_timeout_dict( 115 | {'STARTUP_TIMEOUT_TYPE': 'linear', 'STARTUP_TIMEOUT_INITIAL': '300', 'STARTUP_TIMEOUT_STEP': '60'}.get) 116 | assert {'type': 'linear', 'step': '60'} == _load_timeout_dict( 117 | {'STARTUP_TIMEOUT_TYPE': 'linear', 'STARTUP_TIMEOUT_STEP': '60'}.get) 118 | assert {'initial': '300', 'step': '60'} == _load_timeout_dict( 119 | {'STARTUP_TIMEOUT_INITIAL': '300', 'STARTUP_TIMEOUT_STEP': '60'}.get) 120 | -------------------------------------------------------------------------------- /tests/test_controller.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock 2 | 3 | from bubuku.controller import Controller, Check, Change, _exclude_self 4 | 5 | 6 | def test_exculde_self(): 7 | 8 | assert sorted(['test1', 'test2']) == sorted(_exclude_self('127.0.0.1', 'xxx', { 9 | 'test1': '127.0.0.1', 10 | 'test2': '127.0.0.2', 11 | 'xxx': '127.0.0.1', 12 | })) 13 | 14 | 15 | def test_multiple_changes_are_executed_one_by_one(): 16 | running_count = [3, 3, 3] 17 | 18 | class FakeChange(Change): 19 | def __init__(self, index): 20 | self.index = index 21 | 22 | def get_name(self): 23 | return 'fake' 24 | 25 | def can_run(self, current_actions): 26 | return True 27 | 28 | def run(self, current_actions): 29 | running_count[self.index] -= 1 30 | return running_count[self.index] > 0 31 | 32 | class FakeCheck(Check): 33 | def __init__(self): 34 | super().__init__(0) 35 | self.changes_limit = 3 36 | self.changes_issued = 0 37 | 38 | def check(self): 39 | if self.changes_issued < self.changes_limit: 40 | self.changes_issued += 1 41 | return FakeChange(self.changes_issued - 1) 42 | 43 | current_changes = {} 44 | zk = MagicMock() 45 | zk.get_running_changes.return_value = current_changes 46 | zk.register_change = lambda x, y: current_changes.update({x: y}) 47 | zk.unregister_change = lambda x: current_changes.pop(x) 48 | 49 | controller = Controller(MagicMock(), zk, MagicMock()) 50 | controller.provider_id = 'fake' 51 | controller.add_check(FakeCheck()) 52 | 53 | assert [3, 3, 3] == running_count 54 | controller.make_step() 55 | assert not current_changes 56 | assert [3, 3, 3] == running_count 57 | controller.make_step() 58 | assert current_changes 59 | assert [2, 3, 3] == running_count 60 | controller.make_step() 61 | assert [1, 3, 3] == running_count 62 | controller.make_step() 63 | assert [0, 3, 3] == running_count 64 | controller.make_step() 65 | assert [0, 2, 3] == running_count 66 | controller.make_step() 67 | assert [0, 1, 3] == running_count 68 | controller.make_step() 69 | assert [0, 0, 3] == running_count 70 | controller.make_step() 71 | assert [0, 0, 2] == running_count 72 | controller.make_step() 73 | assert [0, 0, 1] == running_count 74 | assert current_changes 75 | controller.make_step() 76 | assert [0, 0, 0] == running_count 77 | assert not current_changes 78 | controller.make_step() 79 | assert [0, 0, 0] == running_count 80 | assert not current_changes 81 | 
-------------------------------------------------------------------------------- /tests/test_daemon.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock 2 | 3 | from bubuku.daemon import apply_features 4 | from bubuku.features.rebalance.check import RebalanceOnStartCheck, RebalanceOnBrokerListCheck 5 | from bubuku.features.restart_on_zk_change import CheckExhibitorAddressChanged 6 | from bubuku.features.terminate import get_registration 7 | from test_config import build_test_properties 8 | 9 | 10 | class _TestController(object): 11 | def __init__(self): 12 | self.checks = [] 13 | 14 | def add_check(self, check): 15 | self.checks.append(check) 16 | 17 | def test_load_restart_on_exhibitor(): 18 | exhibitor = object() 19 | broker = object() 20 | 21 | controller = _TestController() 22 | 23 | apply_features(-1, {'restart_on_exhibitor': {}}, controller, exhibitor, broker, None, None) 24 | 25 | assert len(controller.checks) == 1 26 | check = controller.checks[0] 27 | assert type(check) == CheckExhibitorAddressChanged 28 | assert check.zk == exhibitor 29 | assert check.broker == broker 30 | 31 | 32 | def test_rebalance_on_start(): 33 | exhibitor = object() 34 | broker = object() 35 | 36 | controller = _TestController() 37 | 38 | apply_features(-1, {'rebalance_on_start': {}}, controller, exhibitor, broker, None, None) 39 | 40 | assert len(controller.checks) == 1 41 | check = controller.checks[0] 42 | assert type(check) == RebalanceOnStartCheck 43 | assert check.zk == exhibitor 44 | assert check.broker == broker 45 | assert not check.executed 46 | 47 | 48 | def test_rebalance_on_broker_list_change(): 49 | exhibitor = object() 50 | broker = object() 51 | 52 | controller = _TestController() 53 | 54 | apply_features(-1, {'rebalance_on_brokers_change': {}}, controller, exhibitor, broker, None, None) 55 | 56 | assert len(controller.checks) == 1 57 | check = controller.checks[0] 58 | assert type(check) == RebalanceOnBrokerListCheck 59 | assert check.zk == exhibitor 60 | assert check.broker == broker 61 | 62 | 63 | def test_graceful_terminate(): 64 | c, b = get_registration() 65 | assert c is None 66 | assert b is None 67 | 68 | broker = object() 69 | 70 | controller = _TestController() 71 | 72 | apply_features(-1, {'graceful_terminate': {}}, controller, None, broker, None, None) 73 | 74 | assert len(controller.checks) == 0 75 | 76 | c, b = get_registration() 77 | assert c == controller 78 | assert b == broker 79 | 80 | 81 | def test_use_ip_address_default(): 82 | props = build_test_properties() 83 | 84 | amazon = MagicMock() 85 | amazon.get_ip = MagicMock(return_value='172.31.146.57') 86 | 87 | apply_features(-1, {'use_ip_address': {}}, None, None, None, props, amazon) 88 | 89 | assert props.get_property('advertised.listeners') == 'PLAINTEXT://172.31.146.57:9092' 90 | assert props.get_property('listeners') == 'PLAINTEXT://0.0.0.0:9092' 91 | 92 | 93 | def test_use_ip_address_custom(): 94 | props = build_test_properties() 95 | props.set_property("listeners", "CUSTOM://:9094,CUSTOM2://:9095,CUSTOM2://:9095") 96 | props.set_property("advertised.listeners", "CUSTOM://:9094,CUSTOM2://:9095,CUSTOM2://:9095") 97 | 98 | amazon = MagicMock() 99 | amazon.get_ip = MagicMock(return_value='172.31.146.57') 100 | 101 | apply_features(-1, {'use_ip_address': {}}, None, None, None, props, amazon) 102 | 103 | print(props.get_property('advertised.listeners')) 104 | assert props.get_property('advertised.listeners') == 
'CUSTOM2://172.31.146.57:9095,CUSTOM://172.31.146.57:9094' 105 | assert props.get_property('listeners') == 'CUSTOM2://0.0.0.0:9095,CUSTOM://0.0.0.0:9094' 106 | -------------------------------------------------------------------------------- /tests/test_exhibitor.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import MagicMock 3 | 4 | from bubuku.zookeeper.exhibitor import ExhibitorAddressProvider 5 | 6 | 7 | class ExhibitorAddressProviderTest(unittest.TestCase): 8 | def test_get_latest_address(self): 9 | address_provider = ExhibitorAddressProvider(lambda: ['aws-lb-1', 'aws-lb-2']) 10 | address_provider._query_exhibitors = lambda _: {'servers': ['aws-lb-1-new'], 'port': 99} 11 | 12 | actual_result = address_provider.get_latest_address() 13 | 14 | assert actual_result == (['aws-lb-1-new'], 99) 15 | 16 | def test_get_latest_address_no_exhibitors(self): 17 | address_provider = ExhibitorAddressProvider(lambda: ['aws-lb-1', 'aws-lb-2']) 18 | address_provider._query_exhibitors = lambda _: None 19 | 20 | actual_result = address_provider.get_latest_address() 21 | assert actual_result is None 22 | 23 | def test_get_latest_address_2(self): 24 | address_provider = ExhibitorAddressProvider(lambda: ['aws-lb-1', 'aws-lb-2']) 25 | address_provider._query_exhibitors = MagicMock() 26 | address_provider._query_exhibitors.side_effect = [None, {'servers': ['aws-lb-1-new'], 'port': 99}] 27 | 28 | actual_result = address_provider.get_latest_address() 29 | 30 | assert address_provider._query_exhibitors.call_count == 2 31 | assert actual_result == (['aws-lb-1-new'], 99) 32 | 33 | def test_addresses_are_sorted(self): 34 | address_provider = ExhibitorAddressProvider(lambda: ['aws-lb-1', 'aws-lb-2']) 35 | address_provider._query_exhibitors = lambda _: {'servers': ['1', '2', '3'], 'port': '1234'} 36 | tmp_result = address_provider.get_latest_address() 37 | 38 | # Check that two calls in sequence will return the same value 39 | assert tmp_result == address_provider.get_latest_address() 40 | 41 | # Check sort 1 42 | address_provider._query_exhibitors = lambda _: {'servers': ['2', '1', '3'], 'port': '1234'} 43 | assert tmp_result == address_provider.get_latest_address() 44 | 45 | # Check sort again (just to be sure) 46 | address_provider._query_exhibitors = lambda _: {'servers': ['3', '2', '1'], 'port': '1234'} 47 | assert tmp_result == address_provider.get_latest_address() 48 | 49 | -------------------------------------------------------------------------------- /tests/test_migrate.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import MagicMock 3 | 4 | from bubuku.features.migrate import MigrationChange 5 | 6 | 7 | class TestMigrate(unittest.TestCase): 8 | def test_migration_all_steps(self): 9 | partitions = { 10 | ('test', 0): [1, 2, 3], 11 | ('test', 1): [2, 3, 1], 12 | ('test1', 0): [3, 2, 1], 13 | } 14 | zk = MagicMock() 15 | zk.is_rebalancing = lambda: False 16 | zk.load_partition_assignment = lambda: [(k[0], k[1], v) for k, v in partitions.items()] 17 | result = {} 18 | 19 | def _reallocate_partition(t, p, r): 20 | result.update({(t, p): r}) 21 | return True 22 | 23 | def _reallocate_partitions(items): 24 | for item in items: 25 | _reallocate_partition(*item) 26 | return True 27 | 28 | zk.reallocate_partition = _reallocate_partition 29 | zk.reallocate_partitions = _reallocate_partitions 30 | zk.get_broker_ids = lambda: [1, 2, 3, 4, 5, 6] 31 | 32 | change = 
MigrationChange(zk, [1, 2, 3], [4, 5, 6], False) 33 | while change.run([]): 34 | pass 35 | expected = { 36 | ('test', 0): [1, 2, 3, 4, 5, 6], 37 | ('test', 1): [2, 3, 1, 5, 6, 4], 38 | ('test1', 0): [3, 2, 1, 6, 5, 4], 39 | } 40 | assert expected == result 41 | 42 | zk.load_partition_assignment = lambda: [(k[0], k[1], v) for k, v in expected.items()] 43 | result.clear() 44 | 45 | change = MigrationChange(zk, [1, 2, 3], [4, 5, 6], True) 46 | while change.run([]): 47 | pass 48 | 49 | expected = { 50 | ('test', 0): [4, 5, 6], 51 | ('test', 1): [5, 6, 4], 52 | ('test1', 0): [6, 5, 4], 53 | } 54 | 55 | assert expected == result 56 | 57 | def test_replica_generation_no_shrink(self): 58 | change = MigrationChange(MagicMock(), [1, 2, 3], [4, 5, 6], False) 59 | 60 | assert [4, 5, 6] == change._replace_replicas([4, 5, 6]) 61 | assert [1, 2, 3, 4, 5, 6] == change._replace_replicas([1, 2, 3]) 62 | assert [1, 2, 6, 4, 5] == change._replace_replicas([1, 2, 6]) 63 | assert [1, 6, 2, 4, 5] == change._replace_replicas([1, 6, 2]) 64 | assert [1, 6, 3, 4] == change._replace_replicas([1, 6, 3]) 65 | 66 | def test_replica_generation_shrink(self): 67 | change = MigrationChange(MagicMock(), [1, 2, 3], [4, 5, 6], True) 68 | 69 | assert [4, 5, 6] == change._replace_replicas([1, 2, 3]) 70 | assert [4, 5, 6] == change._replace_replicas([4, 2, 6]) 71 | assert [8, 5, 10] == change._replace_replicas([8, 2, 10]) 72 | assert [4, 8, 5] == change._replace_replicas([1, 8, 2]) 73 | assert [4, 5, 6] == change._replace_replicas([1, 2, 3, 4, 5, 6]) 74 | -------------------------------------------------------------------------------- /tests/test_partitions_swap.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import MagicMock 3 | 4 | from bubuku.features.swap_partitions import CheckBrokersDiskImbalance, SwapPartitionsChange, load_swap_data 5 | 6 | 7 | class TestPartitionsSwap(unittest.TestCase): 8 | test_size_stats = { 9 | "111": {"disk": {"free_kb": 20000, "used_kb": 20000}, "topics": { 10 | "t1": {"1": 3434, "2": 200}, 11 | "t2": {"1": 1000}, 12 | "t3": {"1": 300} 13 | }}, 14 | "222": {"disk": {"free_kb": 25000, "used_kb": 15000}, "topics": { 15 | "t1": {"2": 200}, 16 | "t2": {"1": 1000, "2": 100}, 17 | "t3": {"2": 2000} 18 | }}, 19 | "333": {"disk": {"free_kb": 30000, "used_kb": 10000}, "topics": { 20 | "t1": {"1": 3434}, 21 | "t2": {"2": 100}, 22 | "t3": {"1": 300, "2": 2000} 23 | }} 24 | } 25 | 26 | test_assignment = [ 27 | ("t1", 1, [111, 333]), 28 | ("t1", 2, [111, 222]), 29 | ("t2", 1, [222, 111]), 30 | ("t2", 2, [222, 333]), 31 | ("t3", 1, [333, 111]), 32 | ("t3", 2, [333, 222]), 33 | ] 34 | 35 | test_broker_racks_unaware = { 36 | 111: None, 37 | 222: None, 38 | 333: None 39 | } 40 | 41 | test_size_stats_nine = { 42 | "111": {"disk": {"free_kb": 20000, "used_kb": 20000}, "topics": { 43 | "t1": {"1": 3434, "2": 200}, 44 | "t2": {"1": 1000}, 45 | "t3": {"1": 300} 46 | }}, 47 | "222": {"disk": {"free_kb": 25000, "used_kb": 15000}, "topics": { 48 | "t1": {"2": 200}, 49 | "t2": {"1": 1000, "2": 100}, 50 | "t3": {"2": 2000} 51 | }}, 52 | "333": {"disk": {"free_kb": 30000, "used_kb": 10000}, "topics": { 53 | "t1": {"1": 3434}, 54 | "t2": {"2": 100}, 55 | "t3": {"1": 300, "2": 2000} 56 | }}, 57 | "444": {"disk": {"free_kb": 21000, "used_kb": 19000}, "topics": { 58 | "t4": {"1": 3434, "2": 200}, 59 | "t5": {"1": 1000}, 60 | "t6": {"1": 300} 61 | }}, 62 | "555": {"disk": {"free_kb": 10000, "used_kb": 30000}, "topics": { 63 | "t4": {"2": 200}, 64 | 
"t5": {"1": 1000, "2": 100}, 65 | "t6": {"2": 2000} 66 | }}, 67 | "666": {"disk": {"free_kb": 22000, "used_kb": 18000}, "topics": { 68 | "t4": {"1": 3434}, 69 | "t5": {"2": 100}, 70 | "t6": {"1": 300, "2": 2000} 71 | }}, 72 | "777": {"disk": {"free_kb": 23000, "used_kb": 17000}, "topics": { 73 | "t7": {"1": 3434, "2": 200}, 74 | "t8": {"1": 1000}, 75 | "t9": {"1": 300} 76 | }}, 77 | "888": {"disk": {"free_kb": 24000, "used_kb": 16000}, "topics": { 78 | "t7": {"2": 200}, 79 | "t8": {"1": 1000, "2": 100}, 80 | "t9": {"2": 2000} 81 | }}, 82 | "999": {"disk": {"free_kb": 26000, "used_kb": 14000}, "topics": { 83 | "t7": {"1": 3434}, 84 | "t8": {"2": 100}, 85 | "t9": {"1": 300, "2": 2000} 86 | }} 87 | } 88 | 89 | test_assignment_nine = [ 90 | ("t1", 1, [111, 333]), 91 | ("t1", 2, [111, 222]), 92 | ("t2", 1, [222, 111]), 93 | ("t2", 2, [222, 333]), 94 | ("t3", 1, [333, 111]), 95 | ("t3", 2, [333, 222]), 96 | ("t4", 1, [444, 666]), 97 | ("t4", 2, [444, 555]), 98 | ("t5", 1, [555, 444]), 99 | ("t5", 2, [555, 666]), 100 | ("t6", 1, [666, 444]), 101 | ("t6", 2, [666, 555]), 102 | ("t7", 1, [777, 999]), 103 | ("t7", 2, [777, 888]), 104 | ("t8", 1, [888, 777]), 105 | ("t8", 2, [888, 999]), 106 | ("t9", 1, [999, 777]), 107 | ("t9", 2, [999, 888]), 108 | ] 109 | 110 | test_broker_racks_aware = { 111 | 111: "eu-central-1a", 112 | 222: "eu-central-1b", 113 | 333: "eu-central-1c", 114 | 444: "eu-central-1a", 115 | 555: "eu-central-1b", 116 | 666: "eu-central-1c", 117 | 777: "eu-central-1a", 118 | 888: "eu-central-1b", 119 | 999: "eu-central-1c", 120 | } 121 | 122 | def setUp(self): 123 | self.zk = self.__mock_zk() 124 | self.broker = self.__mock_broker() 125 | 126 | def test_check_requires_swap_partitions_change(self): 127 | check_imbalance = CheckBrokersDiskImbalance(self.zk, self.broker, 3000, -1) 128 | change = check_imbalance.check() 129 | 130 | assert change 131 | 132 | def test_self_fat_slim_brokers_rack_aware(self): 133 | zk = self.__mock_zk_rack() 134 | 135 | slim, fat, gap, stats = load_swap_data(zk, -1, 100) 136 | assert fat == 555 137 | assert slim == 222 138 | 139 | def test_check_requires_swap_partitions_change_rack_aware(self): 140 | self.zk = self.__mock_zk_rack() 141 | check_imbalance = CheckBrokersDiskImbalance(self.zk, self.broker, 3000, -1) 142 | change = check_imbalance.check() 143 | 144 | assert change 145 | 146 | def test_check_requires_not_swap_partitions_change(self): 147 | check_imbalance = CheckBrokersDiskImbalance(self.zk, self.broker, 15000, -1) 148 | change = check_imbalance.check() 149 | 150 | # change should not be created as the gap between brokers is less than threshold 151 | assert not change 152 | 153 | def test_swap_partitions_change_performed(self): 154 | def _swap_data_provider(zk): 155 | return load_swap_data(zk, -1, 10000) 156 | 157 | swap_change = SwapPartitionsChange(self.zk, _swap_data_provider) 158 | result = swap_change.run([]) 159 | 160 | assert not result 161 | self.zk.reallocate_partitions.assert_called_with([('t2', 2, [222, 111]), ('t2', 1, [222, 333])]) 162 | 163 | def test_swap_partitions_change_not_performed(self): 164 | swap_change = SwapPartitionsChange(self.zk, lambda x: load_swap_data(x, -1, 10001)) 165 | result = swap_change.run([]) 166 | 167 | # change should not trigger partitions swap as there is no possible 168 | # partitions swap that will decrease the gap between brokers 169 | assert not result 170 | self.zk.reallocate_partitions.assert_not_called() 171 | 172 | def test_swap_partitions_change_postponed(self): 173 | 
self.zk.reallocate_partitions.return_value = False 174 | 175 | swap_change = SwapPartitionsChange(self.zk, lambda x: load_swap_data(x, -1, 10000)) 176 | result = swap_change.run([]) 177 | 178 | # if the write to ZK wasn't possible for some reason, the change should 179 | # return True and repeat write to ZK during next trigger by controller 180 | assert result 181 | assert swap_change.to_move == [('t2', 2, [222, 111]), ('t2', 1, [222, 333])] 182 | 183 | def test_swap_partitions_change_postponed_when_rebalancing(self): 184 | self.zk.is_rebalancing.return_value = True 185 | 186 | swap_change = SwapPartitionsChange(self.zk, None) 187 | result = swap_change.run([]) 188 | 189 | # if there was a rebalance node in ZK - the change should be postponed 190 | assert result 191 | assert not swap_change.to_move 192 | 193 | def test_swap_partitions_change_performed_existing(self): 194 | swap_change = SwapPartitionsChange(self.zk, None) 195 | dummy_move_list = ["dummy"] 196 | swap_change.to_move = ["dummy"] 197 | result = swap_change.run([]) 198 | 199 | # if there already was a pair of partitions to swap in to_move 200 | # property - SwapPartitionsChange should just execute this swap 201 | assert not result 202 | self.zk.reallocate_partitions.assert_called_with(dummy_move_list) 203 | self.zk.load_partition_assignment.assert_not_called() 204 | 205 | def __mock_broker(self) -> MagicMock: 206 | broker = MagicMock() 207 | broker.is_running_and_registered.return_value = True 208 | return broker 209 | 210 | def __mock_zk(self) -> MagicMock: 211 | zk = MagicMock() 212 | zk.is_rebalancing.return_value = False 213 | zk.load_partition_assignment.return_value = self.test_assignment 214 | zk.get_disk_stats.return_value = self.test_size_stats 215 | zk.get_broker_racks.return_value = self.test_broker_racks_unaware 216 | return zk 217 | 218 | def __mock_zk_rack(self) -> MagicMock: 219 | zk = MagicMock() 220 | zk.is_rebalancing.return_value = False 221 | zk.load_partition_assignment.return_value = self.test_assignment_nine 222 | zk.get_disk_stats.return_value = self.test_size_stats_nine 223 | zk.get_broker_racks.return_value = self.test_broker_racks_aware 224 | return zk 225 | -------------------------------------------------------------------------------- /tests/test_rebalance.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import unittest 3 | from typing import Dict 4 | from unittest.mock import MagicMock 5 | 6 | from kazoo.exceptions import NoNodeError 7 | 8 | from bubuku.features.rebalance import BaseRebalanceChange 9 | from bubuku.features.rebalance.change import OptimizedRebalanceChange 10 | from bubuku.features.rebalance.change_simple import SimpleRebalanceChange 11 | from bubuku.features.rebalance.check import RebalanceOnBrokerListCheck 12 | from bubuku.zookeeper import BukuExhibitor 13 | 14 | 15 | def _verify_balanced(broker_ids, distribution, delta=1): 16 | per_broker_data = {k: {'leaders': 0, 'total': 0} for k in broker_ids} 17 | for broker_ids in distribution.values(): 18 | per_broker_data[broker_ids[0]]['leaders'] += 1 19 | for b in broker_ids: 20 | per_broker_data[b]['total'] += 1 21 | assert len([bb for bb in broker_ids if bb == b]) == 1 22 | min_leaders = min(k['leaders'] for k in per_broker_data.values()) 23 | max_leaders = max(k['leaders'] for k in per_broker_data.values()) 24 | 25 | assert (max_leaders - min_leaders) <= delta 26 | 27 | min_total = min(k['total'] for k in per_broker_data.values()) 28 | max_total = max(k['total'] for k in 
per_broker_data.values()) 29 | 30 | assert (max_total - min_total) <= delta 31 | 32 | 33 | def _verify_rack_aware(initial_distribution, final_distribution, racks): 34 | for (topic, partition) in initial_distribution.keys(): 35 | final_assignment = final_distribution[(topic, partition)] 36 | final_racks = _brokers_to_racks(final_assignment, racks) 37 | if len(racks) > len(final_assignment): 38 | assert (len(final_assignment) == len(set(final_racks))) 39 | else: 40 | assert (len(set(final_assignment)) == len(racks)) 41 | 42 | 43 | def _brokers_to_racks(brokers: list, racks: Dict[int, str]): 44 | return [racks[int(broker)] for broker in brokers] 45 | 46 | 47 | def _verify_empty_brokers(broker_ids, distribution): 48 | for brokers in distribution.values(): 49 | for broker in brokers: 50 | if broker in broker_ids: 51 | assert False 52 | assert True 53 | 54 | 55 | class TestRebalanceCheck(unittest.TestCase): 56 | 57 | def test_rebalance_invoked_on_broker_list_change(self): 58 | zk = MagicMock() 59 | 60 | zk.get = MagicMock(side_effect=NoNodeError) 61 | 62 | check = RebalanceOnBrokerListCheck(zk, MagicMock()) 63 | zk.get_broker_ids.return_value = ['1', '2', '3'] 64 | 65 | assert check.check() is not None 66 | assert check.check() is None 67 | zk.get_broker_ids.return_value = ['1', '2', '3'] 68 | assert check.check() is None 69 | zk.get_broker_ids.return_value = ['1', '2', '4'] 70 | assert check.check() is not None 71 | assert check.check() is None 72 | 73 | 74 | class TestBaseRebalance(unittest.TestCase): 75 | __test__ = False 76 | 77 | _correct_rack_assignment = True 78 | 79 | def createChange(self, zk, broker_ids, empty_brokers, exclude_topics, parallelism=1) -> BaseRebalanceChange: 80 | pass 81 | 82 | def _create_zk_for_topics(self, topic_data, broker_ids=None, racks=None) -> (list, BukuExhibitor): 83 | buku_proxy = MagicMock() 84 | actual_broker_ids = broker_ids if broker_ids else sorted(list( 85 | set(functools.reduce(lambda x, y: x + y, topic_data.values(), [])))) 86 | buku_proxy.get_broker_ids.return_value = actual_broker_ids 87 | brokers = broker_ids if broker_ids else sorted(list( 88 | set(functools.reduce(lambda x, y: x + y, topic_data.values(), [])))) 89 | if self._correct_rack_assignment: 90 | buku_proxy.get_broker_racks.return_value = {k: v for k, v in racks.items() if str(k) in actual_broker_ids} \ 91 | if racks else {int(broker_id): None for broker_id in brokers} 92 | else: 93 | buku_proxy.get_broker_racks.return_value = racks if racks else {int(broker_id): None for broker_id in 94 | brokers} 95 | 96 | def _load_assignment(): 97 | return [(k[0], int(k[1]), [int(p) for p in v]) for k, v in topic_data.items()] 98 | 99 | def _load_states(topics=None): 100 | return [(k[0], int(k[1]), {'isr': [int(p) for p in v]}) for k, v in topic_data.items()] 101 | 102 | buku_proxy.load_partition_assignment = _load_assignment 103 | buku_proxy.load_partition_states = _load_states 104 | buku_proxy.is_rebalancing.return_value = False 105 | 106 | def _reassign(topic, partition, replicas): 107 | topic_data[(topic, str(partition))] = [str(x) for x in replicas] 108 | return True 109 | 110 | def _reassign_many(items): 111 | for item in items: 112 | _reassign(*item) 113 | return True 114 | 115 | buku_proxy.reallocate_partition = _reassign 116 | buku_proxy.reallocate_partitions = _reassign_many 117 | return sorted(list(set(functools.reduce(lambda x, y: x + y, topic_data.values(), [])))), buku_proxy 118 | 119 | def test_rebalance_can_run(self): 120 | brokers, zk = self._create_zk_for_topics({}) 121 | o = 
self.createChange(zk, [], [], []) 122 | 123 | blocked_actions = ['restart', 'start', 'stop', 'rebalance'] 124 | 125 | # Check that can run in exact cases 126 | for a in blocked_actions: 127 | assert not o.can_run([a]) 128 | 129 | assert o.can_run(['xxx']) 130 | assert o.can_run([]) 131 | 132 | def test_rebalance_get_name(self): 133 | brokers, zk = self._create_zk_for_topics({}) 134 | o = self.createChange(zk, [], [], []) 135 | assert o.get_name() == 'rebalance' 136 | 137 | def test_rebalance_on_empty1(self): 138 | brokers, zk = self._create_zk_for_topics({}) 139 | o = self.createChange(zk, brokers, [], []) 140 | while o.run([]): 141 | pass 142 | 143 | def test_rebalance_on_filled1(self): 144 | distribution = { 145 | ('t0', '0'): ['2'], 146 | ('t0', '1'): ['1'], 147 | ('t0', '2'): ['1'], 148 | ('t0', '3'): ['1'], 149 | } 150 | brokers, zk = self._create_zk_for_topics(distribution) 151 | o = self.createChange(zk, brokers, [], []) 152 | # broker to partitions 153 | while o.run([]): 154 | pass 155 | 156 | _verify_balanced(('1', '2'), distribution) 157 | 158 | def test_rebalance_with_racks(self): 159 | distribution = { 160 | ('t0', '0'): ['3', '1'], 161 | ('t0', '1'): ['1', '5'], 162 | ('t0', '2'): ['5', '3'], 163 | ('t1', '0'): ['5', '3'], 164 | ('t1', '1'): ['3', '1'], 165 | ('t2', '0'): ['1', '5'], 166 | ('t2', '1'): ['2', '4'], 167 | ('t2', '2'): ['2', '6'], 168 | ('t2', '3'): ['4', '2'], 169 | ('t2', '4'): ['6', '2'], 170 | ('t2', '5'): ['3', '6'], 171 | ('t2', '6'): ['6', '3'], 172 | } 173 | 174 | initial_distribution = dict(distribution) 175 | 176 | racks = { 177 | 1: 'r1', 178 | 2: 'r1', 179 | 3: 'r2', 180 | 4: 'r2', 181 | 5: 'r3', 182 | 6: 'r3' 183 | } 184 | 185 | brokers, zk = self._create_zk_for_topics(distribution, ['1', '2', '3', '4', '5', '6'], racks) 186 | o = self.createChange(zk, brokers, [], []) 187 | while o.run([]): 188 | pass 189 | 190 | _verify_balanced(('1', '2', '3', '4', '5', '6'), distribution, 2) 191 | _verify_rack_aware(initial_distribution, distribution, racks) 192 | 193 | def test_rebalance_empty_one_broker(self): 194 | distribution = { 195 | ('t0', '0'): ['1', '2'], 196 | ('t0', '1'): ['2', '3'], 197 | ('t1', '0'): ['2', '3'], 198 | ('t1', '1'): ['3', '4'], 199 | } 200 | brokers, zk = self._create_zk_for_topics(distribution) 201 | o = self.createChange(zk, brokers, ['2'], []) 202 | while o.run([]): 203 | pass 204 | 205 | _verify_empty_brokers(('2'), distribution) 206 | 207 | def test_rebalance_empty_multiple_brokers(self): 208 | distribution = { 209 | ('t0', '0'): ['1', '2'], 210 | ('t0', '1'): ['2', '3'], 211 | ('t1', '0'): ['2', '3'], 212 | ('t1', '1'): ['3', '4'], 213 | ('t1', '2'): ['4', '5'], 214 | ('t2', '0'): ['3', '4'], 215 | ('t2', '1'): ['4', '5'], 216 | ('t2', '2'): ['5', '6'], 217 | } 218 | brokers, zk = self._create_zk_for_topics(distribution) 219 | o = self.createChange(zk, brokers, ['2', '3'], []) 220 | while o.run([]): 221 | pass 222 | 223 | _verify_empty_brokers(('2', '3'), distribution) 224 | 225 | def test_rebalance_empty_brokers_and_exclude_topics(self): 226 | distribution = { 227 | ('t0', '0'): ['1', '2'], 228 | ('t0', '1'): ['2', '3'], 229 | ('t1', '0'): ['2', '3'], 230 | ('t1', '1'): ['3', '4'], 231 | ('t1', '2'): ['4', '5'], 232 | ('t2', '0'): ['3', '4'], 233 | ('t2', '1'): ['4', '5'], 234 | ('t2', '2'): ['5', '6'], 235 | } 236 | brokers, zk = self._create_zk_for_topics(distribution) 237 | o = OptimizedRebalanceChange(zk, brokers, ['2', '3'], ['t1']) 238 | while o.run([]): 239 | pass 240 | 241 | assert distribution[('t1', '0')] == ['2', 
'3'] 242 | assert distribution[('t1', '1')] == ['3', '4'] 243 | assert distribution[('t1', '2')] == ['4', '5'] 244 | 245 | distribution.pop(('t1', '0')) 246 | distribution.pop(('t1', '1')) 247 | distribution.pop(('t1', '2')) 248 | 249 | brokers = [item for sublist in distribution.values() for item in sublist] 250 | assert '2' not in brokers 251 | assert '3' not in brokers 252 | 253 | def test_rebalance_on_filled2(self): 254 | distribution = { 255 | ('t0', '0'): ['2', '1'], 256 | ('t0', '1'): ['1', '2'], 257 | ('t0', '2'): ['1', '2'], 258 | ('t0', '3'): ['1', '2'], 259 | ('t0', '4'): ['1', '2'], 260 | ('t0', '5'): ['1', '2'], 261 | ('t0', '6'): ['1', '2'], 262 | } 263 | brokers, zk = self._create_zk_for_topics(distribution) 264 | o = self.createChange(zk, brokers, [], []) 265 | # broker to partitions 266 | while o.run([]): 267 | pass 268 | 269 | _verify_balanced(('1', '2'), distribution) 270 | 271 | def test_rebalance_with_dead_brokers(self): 272 | distribution = { 273 | ('t0', '0'): ['2', '1'], 274 | ('t0', '1'): ['1', '2'], 275 | ('t0', '2'): ['1', '2'], 276 | ('t0', '3'): ['1', '2'], 277 | ('t0', '4'): ['1', '2'], 278 | ('t0', '5'): ['1', '2'], 279 | ('t0', '6'): ['1', '2'], 280 | } 281 | _, zk = self._create_zk_for_topics(distribution, broker_ids=['1', '3'], racks={1: None, 2: None, 3: None}) 282 | o = self.createChange(zk, ['1', '3'], [], []) 283 | while o.run([]): 284 | pass 285 | _verify_balanced(['1', '3'], distribution) 286 | 287 | def test_rebalance_fail_with_not_enough_replicas(self): 288 | distribution = { 289 | ('t0', '0'): ['2', '1', '3'], 290 | ('t0', '1'): ['1', '2'], 291 | } 292 | 293 | _, zk = self._create_zk_for_topics(distribution, broker_ids=['1', '3']) 294 | o = self.createChange(zk, ['1', '3'], [], []) 295 | try: 296 | while o.run([]): 297 | pass 298 | assert False, "Balancing can not work with low replication factor" 299 | except Exception: 300 | pass 301 | 302 | def test_rebalance_recovered_with_additional_copy2(self): 303 | distribution = { 304 | ('t0', '0'): ['2', '1'], 305 | ('t0', '1'): ['1', '2'], 306 | ('t0', '2'): ['3', '4'] 307 | } 308 | _, zk = self._create_zk_for_topics(distribution, ['1', '2', '4'], racks={1: None, 2: None, 3: None, 4: None}) 309 | o = self.createChange(zk, ['1', '2', '4'], [], []) 310 | while o.run([]): 311 | pass 312 | _verify_balanced(['1', '2', '4'], distribution) 313 | 314 | def test_rebalance_with_many_topics(self): 315 | distribution = {} 316 | topic_count = 1000 317 | partition_count = 21 318 | broker_ids = [str(i) for i in range(1, 22)] 319 | for i in range(0, topic_count): 320 | topic = 't{}'.format(i) 321 | distribution.update({(topic, str(partition)): ['1', '2', '3'] for partition in range(0, partition_count)}) 322 | _, zk = self._create_zk_for_topics(distribution, broker_ids=broker_ids) 323 | 324 | o = self.createChange(zk, broker_ids, [], [], parallelism=1000) 325 | steps = 0 326 | while o.run([]): 327 | steps += 1 328 | _verify_balanced(broker_ids, distribution, 1) 329 | 330 | def test_leader_partition_limit(self): 331 | distribution = { 332 | ('t0', '0'): ['1', '2'], 333 | ('t0', '1'): ['1', '2'], 334 | ('t0', '2'): ['1', '2'], 335 | ('t1', '2'): ['1', '2'], 336 | } 337 | _, zk = self._create_zk_for_topics(distribution, ['2', '3'], racks={1: None, 2: None, 3: None}) 338 | o = self.createChange(zk, ['2', '3'], [], []) 339 | while o.run([]): 340 | pass 341 | _verify_balanced(['2', '3'], distribution) 342 | 343 | 344 | class OptimizedRebalanceTest(TestBaseRebalance): 345 | __test__ = True 346 | _correct_rack_assignment = 
False 347 | 348 | def createChange(self, zk, broker_ids, empty_brokers, exclude_topics, parallelism=1): 349 | return OptimizedRebalanceChange(zk, broker_ids, empty_brokers, exclude_topics, parallelism) 350 | 351 | def test_rebalance_recovered_with_additional_copy1(self): 352 | distribution = { 353 | ('t0', '0'): ['2', '1'], 354 | ('t0', '1'): ['1', '2'], 355 | ('t0', '2'): ['3', '4'] 356 | } 357 | _, zk = self._create_zk_for_topics(distribution, ['1', '2', '3'], racks={1: None, 2: None, 3: None, 4: None}) 358 | o = self.createChange(zk, ['1', '2', '3'], [], []) 359 | while o.run([]): 360 | pass 361 | _verify_balanced(['1', '2', '3'], distribution) 362 | 363 | 364 | class SimpleRebalanceTest(TestBaseRebalance): 365 | __test__ = True 366 | 367 | def createChange(self, zk, broker_ids, empty_brokers, exclude_topics, parallelism=1): 368 | return SimpleRebalanceChange(zk, 369 | broker_ids=broker_ids, 370 | empty_brokers=empty_brokers, 371 | exclude_topics=exclude_topics, 372 | parallelism=parallelism) 373 | 374 | def test_rebalance_with_racks_different_nr_partitions_per_rack(self): 375 | distribution = { 376 | ('t0', '0'): ['3', '1'], 377 | ('t0', '1'): ['6', '5'], 378 | ('t0', '2'): ['5', '3'], 379 | ('t1', '0'): ['5', '6'], 380 | ('t1', '1'): ['6', '5'], 381 | ('t2', '0'): ['6', '4'], 382 | ('t2', '1'): ['2', '6'], 383 | ('t2', '2'): ['2', '6'], 384 | ('t2', '3'): ['4', '2'], 385 | ('t2', '4'): ['6', '5'], 386 | ('t2', '5'): ['4', '6'], 387 | ('t2', '6'): ['6', '4'], 388 | } 389 | 390 | initial_distribution = dict(distribution) 391 | 392 | racks = { 393 | 1: 'r1', 394 | 2: 'r1', 395 | 3: 'r2', 396 | 4: 'r2', 397 | 5: 'r3', 398 | 6: 'r3' 399 | } 400 | 401 | brokers, zk = self._create_zk_for_topics(distribution, ['1', '2', '3', '4', '5', '6'], racks) 402 | o = self.createChange(zk, brokers, [], []) 403 | while o.run([]): 404 | pass 405 | 406 | _verify_rack_aware(initial_distribution, distribution, racks) 407 | -------------------------------------------------------------------------------- /tests/test_restart.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import MagicMock, Mock 3 | 4 | from bubuku.broker import LeaderElectionInProgress 5 | from bubuku.features.restart_on_zk_change import RestartBrokerChange 6 | 7 | 8 | class RestartTest(unittest.TestCase): 9 | def test_restart_atomicity(self): 10 | """ 11 | Because of the action lock structure, instances need to be restarted atomically: during a restart, all 12 | action parts (stop, wait for leader election, start) must run under the same lock. This guarantees that cluster 13 | instances won't be destroyed at the same time during a ZK update.
14 | """ 15 | broker = MagicMock() 16 | zk = MagicMock() 17 | change = RestartBrokerChange(zk, broker, None) 18 | 19 | zk.get_conn_str = lambda: 'xxx' 20 | 21 | broker.is_running_and_registered = lambda: True 22 | stopped = [] 23 | broker.stop_kafka_process = lambda: stopped.append(True) 24 | assert change.run([]) 25 | assert stopped and stopped[0] 26 | 27 | broker.start_kafka_process = Mock(side_effect=LeaderElectionInProgress()) 28 | for i in range(1, 50): 29 | assert change.run([]) 30 | 31 | started = [] 32 | broker.start_kafka_process = lambda x: started.append(x) 33 | assert not change.run([]) 34 | assert started and 'xxx' == started[0] 35 | -------------------------------------------------------------------------------- /tests/test_restart_if_dead.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import MagicMock 3 | from bubuku.features.restart_if_dead import CheckBrokerStopped 4 | 5 | class TestRestartIfDeadCheck(unittest.TestCase): 6 | 7 | def test_broker_retries_before_it_restarts(self): 8 | brokerManager = MagicMock() 9 | isRegistered = MagicMock(return_value=False) 10 | attrs = {'is_running.return_value': True, 11 | 'is_registered_in_zookeeper': isRegistered, 12 | 'get_zookeeper_session_timeout.return_value': 1} 13 | brokerManager.configure_mock(**attrs) 14 | 15 | exhibitor = MagicMock() 16 | checkBrokerStopped = CheckBrokerStopped(brokerManager, exhibitor) 17 | checkReturnedSomething = None 18 | while not checkReturnedSomething: 19 | checkReturnedSomething = checkBrokerStopped.check() 20 | assert isRegistered.call_count > 0 21 | -------------------------------------------------------------------------------- /tests/test_size_stats_collecting.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import MagicMock 3 | 4 | from bubuku.features.data_size_stats import GenerateDataSizeStatistics 5 | from bubuku.utils import CmdHelper 6 | 7 | 8 | class TestDataSizeStats(unittest.TestCase): 9 | 10 | def test_size_stats_collecting(self): 11 | zk = MagicMock() 12 | 13 | stat_check = GenerateDataSizeStatistics(zk, self.__mock_broker(), self.__mock_cmd_helper(), ["/kafka-logs"]) 14 | stat_check.check() 15 | 16 | expected_json = { 17 | "disk": {"free_kb": 606, "used_kb": 404}, 18 | "topics": { 19 | "another_topic": {"0": 3}, 20 | "my-topic": {"0": 10, "2": 200} 21 | } 22 | } 23 | zk.update_disk_stats.assert_called_with('dummy_id', expected_json) 24 | 25 | def __mock_cmd_helper(self) -> CmdHelper: 26 | class CmdHelperMock(CmdHelper): 27 | def cmd_run(self, cmd: str): 28 | if cmd.startswith("du"): 29 | return "10\t/kafka-logs/my-topic-0\n" \ 30 | "200\t/kafka-logs/my-topic-2\n" \ 31 | "3\t/kafka-logs/another_topic-0\n" \ 32 | "55\t/kafka-logs\n" \ 33 | "77\t/kafka-logs/wrong_topic\n" \ 34 | "blah" 35 | elif cmd.startswith("df"): 36 | return "101 202\n" \ 37 | "303 404\n" \ 38 | "500" 39 | else: 40 | raise ValueError("Call not expected") 41 | 42 | return CmdHelperMock() 43 | 44 | def __mock_broker(self) -> MagicMock: 45 | broker = MagicMock() 46 | broker.is_running_and_registered.return_value = True 47 | broker.id_manager.get_broker_id.return_value = "dummy_id" 48 | return broker 49 | -------------------------------------------------------------------------------- /tests/test_startup_timeout.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from bubuku.broker import StartupTimeout 4 
| 5 | 6 | class TestStartupTimeout(unittest.TestCase): 7 | @staticmethod 8 | def _verify(o: StartupTimeout, border_value: float, border_value_after_fail: float): 9 | print(o) 10 | assert not o.is_timed_out(border_value) 11 | assert o.is_timed_out(border_value + 1) 12 | o.on_timeout_fail() 13 | assert not o.is_timed_out(border_value_after_fail) 14 | assert o.is_timed_out(border_value_after_fail + 1) 15 | 16 | def test_linear_defaults(self): 17 | o = StartupTimeout.build({'type': 'linear'}) 18 | TestStartupTimeout._verify(o, 300., 360.) 19 | 20 | def test_linear(self): 21 | o = StartupTimeout.build({'type': 'linear', 'initial': '10', 'step': 2}) 22 | TestStartupTimeout._verify(o, 10., 12.) 23 | 24 | def test_progressive_defaults(self): 25 | o = StartupTimeout.build({'type': 'progressive'}) 26 | TestStartupTimeout._verify(o, 300., 450.) 27 | 28 | def test_progressive(self): 29 | o = StartupTimeout.build({'type': 'progressive', 'initial': '16', 'step': '0.25'}) 30 | 31 | TestStartupTimeout._verify(o, 16., 20.) 32 | -------------------------------------------------------------------------------- /tests/test_zookeeper.py: -------------------------------------------------------------------------------- 1 | import json 2 | import math 3 | import re 4 | import time 5 | import unittest 6 | from unittest.mock import MagicMock 7 | 8 | from kazoo.exceptions import NoNodeError, NodeExistsError 9 | 10 | from bubuku.zookeeper import BukuExhibitor, SlowlyUpdatedCache 11 | 12 | 13 | def test_get_broker_ids(): 14 | exhibitor_mock = MagicMock() 15 | 16 | def _get_children(path): 17 | if path == '/brokers/ids': 18 | return ['3', '1', '2'] 19 | else: 20 | raise NotImplementedError() 21 | 22 | exhibitor_mock.get_children = _get_children 23 | 24 | buku = BukuExhibitor(exhibitor_mock) 25 | 26 | assert ['1', '2', '3'] == buku.get_broker_ids() # ensure that return list is sorted 27 | 28 | 29 | def test_load_active_topics(): 30 | exhibitor_mock = MagicMock() 31 | 32 | def _get_children(path): 33 | if path == '/brokers/topics': 34 | return ['3', '1', '2'] 35 | elif path == '/admin/delete_topics': 36 | return ['3', '1'] 37 | else: 38 | raise NotImplementedError() 39 | 40 | exhibitor_mock.get_children = _get_children 41 | buku = BukuExhibitor(exhibitor_mock) 42 | 43 | assert ['2'] == buku.load_active_topics() 44 | 45 | 46 | def test_is_broker_registered(): 47 | def _get(path): 48 | if path == '/brokers/ids/123': 49 | return '123', object() 50 | elif path == '/brokers/ids/321': 51 | return None, None 52 | else: 53 | raise NoNodeError() 54 | 55 | exhibitor_mock = MagicMock() 56 | exhibitor_mock.get = _get 57 | buku = BukuExhibitor(exhibitor_mock) 58 | 59 | assert buku.is_broker_registered('123') 60 | assert buku.is_broker_registered(123) 61 | assert not buku.is_broker_registered('321') 62 | assert not buku.is_broker_registered(321) 63 | assert not buku.is_broker_registered(333) 64 | assert not buku.is_broker_registered('333') 65 | 66 | 67 | def _test_load_partition_assignment(async_: bool): 68 | exhibitor_mock = MagicMock() 69 | 70 | def _get_children(path): 71 | if path == '/brokers/topics': 72 | return ['t01', 't02'] 73 | else: 74 | raise NotImplementedError() 75 | 76 | def _get(path): 77 | if path == '/brokers/topics/t01': 78 | return json.dumps({'partitions': {0: [1, 2, 3], 1: [3, 2, 1]}}).encode('utf-8'), object() 79 | elif path == '/brokers/topics/t02': 80 | return json.dumps({'partitions': {0: [4, 5, 6], 1: [5, 1, 2]}}).encode('utf-8'), object() 81 | else: 82 | raise NotImplementedError() 83 | 84 | def 
_get_async(path): 85 | def _get_iresult(block): 86 | assert block 87 | return _get(path) 88 | 89 | mock = MagicMock() 90 | mock.get = _get_iresult 91 | return mock 92 | 93 | exhibitor_mock.get = _get 94 | exhibitor_mock.get_async = _get_async 95 | exhibitor_mock.get_children = _get_children 96 | 97 | buku_ex = BukuExhibitor(exhibitor_mock, async_) 98 | 99 | expected_result = [ 100 | ('t01', 0, [1, 2, 3]), 101 | ('t01', 1, [3, 2, 1]), 102 | ('t02', 0, [4, 5, 6]), 103 | ('t02', 1, [5, 1, 2]), 104 | ] 105 | result = [r for r in buku_ex.load_partition_assignment()] 106 | assert len(expected_result) == len(result) 107 | for e in expected_result: 108 | assert e in result 109 | 110 | 111 | def test_load_partition_assignment_sync(): 112 | _test_load_partition_assignment(False) 113 | 114 | 115 | def test_load_partition_assignment_async(): 116 | _test_load_partition_assignment(True) 117 | 118 | 119 | def _test_load_partition_states(async_: bool): 120 | exhibitor_mock = MagicMock() 121 | 122 | def _get_children(path): 123 | if path == '/brokers/topics': 124 | return ['t01', 't02'] 125 | elif path == '/brokers/topics/t01/partitions': 126 | return ['0', '1'] 127 | elif path == '/brokers/topics/t02/partitions': 128 | return ['0', '1', '2'] 129 | else: 130 | raise NotImplementedError() 131 | 132 | def _get(path: str): 133 | matched = re.match('/brokers/topics/(.*)/partitions/(.*)/state', path) 134 | if not matched: 135 | topic = path[len('/brokers/topics/'):] 136 | if topic not in ['t01', 't02']: 137 | raise NotImplementedError('Not implemented for path {}'.format(path)) 138 | cnt = 2 if topic == 't01' else 3 139 | return json.dumps({'partitions': {x: None for x in range(0, cnt)}}).encode('utf-8'), object() 140 | topic = matched.group(1) 141 | partition = matched.group(2) 142 | if topic == 't01' and partition not in ('0', '1'): 143 | raise NotImplementedError() 144 | elif topic == 't02' and partition not in ('0', '1', '2'): 145 | raise NotImplementedError() 146 | elif topic not in ('t01', 't02'): 147 | raise NotImplementedError() 148 | idx = (100 if topic == 't01' else 200) + int(partition) 149 | return json.dumps({'fake_data': idx}).encode('utf-8'), object() 150 | 151 | def _get_async(path): 152 | def _get_iasync(block): 153 | assert block 154 | return _get(path) 155 | 156 | mock = MagicMock() 157 | mock.get = _get_iasync 158 | return mock 159 | 160 | exhibitor_mock.get = _get 161 | exhibitor_mock.get_async = _get_async 162 | exhibitor_mock.get_children = _get_children 163 | 164 | buku_ex = BukuExhibitor(exhibitor_mock, async_=async_) 165 | 166 | expected_result = [ 167 | ('t01', 0, {'fake_data': 100}), 168 | ('t01', 1, {'fake_data': 101}), 169 | ('t02', 0, {'fake_data': 200}), 170 | ('t02', 1, {'fake_data': 201}), 171 | ('t02', 2, {'fake_data': 202}), 172 | ] 173 | 174 | result = [r for r in buku_ex.load_partition_states()] 175 | assert len(expected_result) == len(result) 176 | for e in expected_result: 177 | assert e in result 178 | 179 | 180 | def test_load_partition_states_sync(): 181 | _test_load_partition_states(False) 182 | 183 | 184 | def test_load_partition_states_async(): 185 | _test_load_partition_states(True) 186 | 187 | 188 | def test_reallocate_partition(): 189 | call_idx = [0] 190 | 191 | def _create(path, value=None, **kwargs): 192 | if path in ('/bubuku/changes', '/bubuku/actions/global'): 193 | pass 194 | elif path == '/admin/reassign_partitions': 195 | if call_idx[0] >= 5: 196 | raise NodeExistsError() 197 | call_idx[0] += 1 198 | j = json.loads(value.decode('utf-8')) 199 | assert 
j['version'] == '1' 200 | assert len(j['partitions']) == 1 201 | p = j['partitions'][0] 202 | assert p['topic'] == 't01' 203 | assert p['partition'] == 0 204 | assert p['replicas'] == [1, 2, 3] 205 | else: 206 | raise NotImplementedError('Not implemented for path {}'.format(path)) 207 | 208 | exhibitor_mock = MagicMock() 209 | exhibitor_mock.create = _create 210 | 211 | buku = BukuExhibitor(exhibitor_mock) 212 | 213 | assert buku.reallocate_partition('t01', 0, ['1', '2', '3']) 214 | assert buku.reallocate_partition('t01', 0, ['1', '2', 3]) 215 | assert buku.reallocate_partition('t01', 0, [1, 2, 3]) 216 | assert buku.reallocate_partition('t01', 0, [1, 2, 3]) 217 | assert buku.reallocate_partition('t01', 0, [1, 2, 3]) 218 | # Node exists 219 | assert not buku.reallocate_partition('t01', 0, [1, 2, 3]) 220 | 221 | 222 | class SlowlyUpdatedCacheTest(unittest.TestCase): 223 | def test_initial_update_fast(self): 224 | result = [None] 225 | 226 | def _update(value_): 227 | result[0] = value_ 228 | 229 | cache = SlowlyUpdatedCache(lambda: (['test'], 1), _update, 0, 0) 230 | 231 | cache.touch() 232 | assert result[0] == (['test'], 1) 233 | 234 | def test_exception_eating(self): 235 | result = [10, None] 236 | 237 | def _update(value_): 238 | result[1] = value_ 239 | 240 | def _load(): 241 | if result[0] > 0: 242 | result[0] -= 1 243 | raise Exception() 244 | return ['test'], 1 245 | 246 | cache = SlowlyUpdatedCache(_load, _update, 0, 0) 247 | cache.force = False  # Small hack to avoid initial refresh cycle 248 | for i in range(0, 10): 249 | cache.touch() 250 | assert result[1] is None 251 | assert result[0] == 9 - i 252 | cache.touch() 253 | assert result[1] == (['test'], 1) 254 | 255 | def test_initial_update_slow(self): 256 | result = [None] 257 | call_count = [0] 258 | 259 | def _load(): 260 | call_count[0] += 1 261 | if call_count[0] == 100: 262 | return ['test'], 1 263 | return None 264 | 265 | def _update(value_): 266 | result[0] = value_ 267 | 268 | cache = SlowlyUpdatedCache(_load, _update, 0, 0) 269 | 270 | cache.touch() 271 | assert call_count[0] == 100 272 | assert result[0] == (['test'], 1) 273 | 274 | def test_delays_illegal(self): 275 | result = [None] 276 | load_calls = [] 277 | update_calls = [] 278 | 279 | def _load(): 280 | load_calls.append(time.time()) 281 | return ['test'], 0 if len(load_calls) > 1 else 1 282 | 283 | def _update(value_): 284 | update_calls.append(time.time()) 285 | result[0] = value_ 286 | 287 | # refresh every 0.5 seconds, delay 0.25 seconds 288 | cache = SlowlyUpdatedCache(_load, _update, 0.5, 0.25) 289 | 290 | while len(update_calls) != 2: 291 | time.sleep(0.1) 292 | cache.touch() 293 | 294 | assert math.fabs(update_calls[0] - load_calls[0]) <= 0.15  # 0.1 + 0.1/2 295 | # Verify that load calls were made one after another 296 | assert math.fabs(load_calls[1] - load_calls[0] - .5) <= 0.15 297 | # Verify that the update call was made in the correct interval 298 | 299 | assert load_calls[1] + 0.25 <= update_calls[1] <= load_calls[1] + 0.25 + 0.15 300 | 301 | def test_delays_legal(self): 302 | result = [None] 303 | main_call = [] 304 | load_calls = [] 305 | update_calls = [] 306 | 307 | def _load(): 308 | load_calls.append(time.time()) 309 | if len(load_calls) == 5: 310 | main_call.append(time.time()) 311 | return ['test'], 0 if len(load_calls) >= 5 else len(load_calls) 312 | 313 | def _update(value_): 314 | update_calls.append(time.time()) 315 | result[0] = value_ 316 | 317 | # refresh every 0.5 seconds, delay 3 seconds - if the situation is constantly changing, wait for the 318
| # last stable update 319 | cache = SlowlyUpdatedCache(_load, _update, 0.5, 3) 320 | 321 | while len(update_calls) != 2: 322 | time.sleep(0.1) 323 | cache.touch() 324 | print(cache) 325 | 326 | assert len(main_call) == 1 327 | assert main_call[0] + 3 - .15 < update_calls[1] < main_call[0] + 3 + .15 328 | --------------------------------------------------------------------------------
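
A minimal usage sketch of SlowlyUpdatedCache, inferred only from the call pattern and timing assertions in tests/test_zookeeper.py above. The reading of the two numeric constructor arguments as a refresh interval and an apply delay (in seconds), as well as the load/update helpers, host names, and port below, are illustrative assumptions rather than documented API.

import time

from bubuku.zookeeper import SlowlyUpdatedCache

current = [None]


def load():
    # Hypothetical loader: the tests feed the cache with (value, number) tuples;
    # here it simply returns a static (hosts, port) pair.
    return ['zk-host-1', 'zk-host-2'], 2181


def update(value):
    # Called by the cache once a loaded value is applied (immediately for the first
    # value; after the configured delay once a changed value has stayed stable).
    current[0] = value


# Assumed meaning of the arguments, based on test_delays_illegal/test_delays_legal:
# re-run load() at most every 0.5 seconds, and apply a changed value only after it
# has remained unchanged for 3 seconds.
cache = SlowlyUpdatedCache(load, update, 0.5, 3)

while current[0] is None:
    cache.touch()  # touch() drives all loading and updating, as in the tests above
    time.sleep(0.1)

print(current[0])  # -> (['zk-host-1', 'zk-host-2'], 2181)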