├── .github └── PULL_REQUEST_TEMPLATE.md ├── .gitignore ├── .zappr.yaml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE.txt ├── MAINTAINERS ├── MANIFEST.in ├── README.md ├── SECURITY.md ├── bubuku ├── __init__.py ├── aws │ ├── __init__.py │ ├── cluster_config.py │ ├── ec2_node_launcher.py │ ├── ip_address_allocator.py │ └── node.py ├── broker.py ├── cli.py ├── communicate.py ├── config.py ├── controller.py ├── controller_api.py ├── daemon.py ├── env_provider.py ├── features │ ├── __init__.py │ ├── data_size_stats.py │ ├── metric_collector.py │ ├── migrate.py │ ├── rebalance │ │ ├── __init__.py │ │ ├── broker.py │ │ ├── change.py │ │ ├── change_simple.py │ │ └── check.py │ ├── remote_exec.py │ ├── restart_if_dead.py │ ├── restart_on_zk_change.py │ ├── rolling_restart.py │ ├── swap_partitions.py │ └── terminate.py ├── id_extractor.py ├── process.py ├── utils.py └── zookeeper │ ├── __init__.py │ └── exhibitor.py ├── cli_docs ├── cli.md ├── generate_cli_docs.py └── generate_cli_docs.sh ├── delivery.yaml ├── docker-compose.yml ├── docker ├── download_kafka.sh ├── log4j.properties └── server.properties ├── requirements.txt ├── setup.cfg ├── setup.py └── tests ├── test_broker.py ├── test_broker_id_generator.py ├── test_check_time_period.py ├── test_cli.py ├── test_config.py ├── test_controller.py ├── test_daemon.py ├── test_exhibitor.py ├── test_migrate.py ├── test_partitions_swap.py ├── test_rebalance.py ├── test_restart.py ├── test_restart_if_dead.py ├── test_size_stats_collecting.py ├── test_startup_timeout.py └── test_zookeeper.py /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | [ARUHA-XXX: Name of ticket](link to ticket) 2 | 3 | ## Description 4 | A few sentences describing the overall goals of the pull request's 5 | commits. 6 | 7 | ## Review 8 | - [ ] Tests 9 | - [ ] Documentation 10 | - [ ] CHANGELOG 11 | 12 | ## Deployment Notes 13 | These should highlight any db migrations, feature toggles, etc. 14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.iml 3 | *.pyc 4 | .eggs/ 5 | bubuku.egg-info/ 6 | .cache/ 7 | build/ 8 | dist/ -------------------------------------------------------------------------------- /.zappr.yaml: -------------------------------------------------------------------------------- 1 | X-Zalando-Team: "aruha" 2 | X-Zalando-Type: code 3 | 4 | approvals: 5 | groups: 6 | zalando: 7 | minimum: 2 8 | from: 9 | orgs: 10 | - zalando 11 | 12 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct ([from here](http://contributor-covenant.org/version/1/4/)). 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, gender identity and expression, level of experience, 9 | nationality, personal appearance, race, religion, or sexual identity and 10 | orientation. 
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at team-aruha@zalando.de. 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at [http://contributor-covenant.org/version/1/4][version] 72 | 73 | [homepage]: http://contributor-covenant.org 74 | [version]: http://contributor-covenant.org/version/1/4/ 75 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Bubuku 2 | 3 | **Thank you for your interest in Bubuku. 
Your contributions are highly welcome.** 4 | 5 | There are multiple ways of getting involved: 6 | 7 | - [Report a bug](#report-a-bug) 8 | - [Suggest a feature](#suggest-a-feature) 9 | - [Contribute code](#contribute-code) 10 | 11 | Below are a few guidelines we would like you to follow. 12 | If you need help, please reach out to us by opening an issue. 13 | 14 | ## Report a bug 15 | Reporting bugs is one of the best ways to contribute. Before creating a bug report, please check that an [issue](https://github.com/zalando-nakadi/bubuku/issues) reporting the same problem does not already exist. If there is an such an issue, you may add your information as a comment. 16 | 17 | To report a new bug you should open an issue that summarizes the bug and set the label to "bug". 18 | 19 | If you want to provide a fix along with your bug report: That is great! In this case please send us a pull request as described in section [Contribute Code](#contribute-code). 20 | 21 | ## Suggest a feature 22 | To request a new feature you should open an [issue](https://github.com/zalando-nakadi/bubuku/issues/new) and summarize the desired functionality and its use case. Set the issue label to "feature". 23 | 24 | ## Contribute code 25 | This is a rough outline of what the workflow for code contributions looks like: 26 | - Check the list of open [issues](https://github.com/zalando-nakadi/bubuku/issues). Either assign an existing issue to yourself, or create a new one that you would like work on and discuss your ideas and use cases. It is always best to discuss your plans beforehand, to ensure that your contribution is in line with our goals for Bubuku. 27 | - Fork the repository on GitHub 28 | - Create a topic branch from where you want to base your work. This is usually master. 29 | - Make commits of logical units. 30 | - Write good commit messages (see below). 31 | - Push your changes to a topic branch in your fork of the repository. 32 | - Submit a pull request to [zalando-incubator/bubuku](https://github.com/zalando-nakadi/bubuku) 33 | - Your pull request must receive a :thumbsup: from two [Maintainers](https://github.com/zalando-nakadi/ubuku/blob/master/MAINTAINERS) 34 | 35 | Thanks for your contributions! 36 | 37 | ### Commit messages 38 | Your commit messages ideally can answer two questions: what changed and why. The subject line should feature the “what” and the body of the commit should describe the “why”. 39 | 40 | When creating a pull request, its comment should reference the corresponding issue id. 
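For illustration, a commit message following this guideline could look like the example below (the change described is made up):

```
Increase startup timeout after every failed broker start

A fixed timeout was not always long enough for the broker to register in
zookeeper, so kafka was killed while it was still starting up. Grow the
timeout after each failed attempt instead.
```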
41 | 42 | **Have fun, and happy hacking!** 43 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM container-registry.zalando.net/library/python-3.9-slim:latest 2 | MAINTAINER Team Aruha, team-aruha@zalando.de 3 | 4 | RUN apt-get update 5 | RUN apt-get install -y --mark-auto curl wget gnupg 6 | 7 | # Install corretto JDK: https://docs.aws.amazon.com/corretto/latest/corretto-11-ug/generic-linux-install.html 8 | RUN wget -O- https://apt.corretto.aws/corretto.key | apt-key add - 9 | RUN echo 'deb https://apt.corretto.aws stable main' >/etc/apt/sources.list.d/amazon-corretto-jdk.list 10 | RUN apt-get update && apt-get install -y java-17-amazon-corretto-jdk 11 | 12 | # Install kafka 13 | ENV KAFKA_VERSION="3.1.1" SCALA_VERSION="2.13" JOLOKIA_VERSION="1.6.2" 14 | ENV KAFKA_DIR="/opt/kafka" KAFKA_LOGS_DIR="/data/logs" KAFKA_SETTINGS="/opt/kafka/config/server.properties" 15 | 16 | ADD docker/download_kafka.sh /tmp/download_kafka.sh 17 | 18 | RUN sh /tmp/download_kafka.sh ${SCALA_VERSION} ${KAFKA_VERSION} ${KAFKA_DIR} ${JOLOKIA_VERSION} 19 | 20 | ADD docker/server.properties ${KAFKA_SETTINGS} 21 | ADD docker/log4j.properties ${KAFKA_DIR}/config/ 22 | 23 | # Install bubuku 24 | ENV SRC_PATH="/bubuku" 25 | 26 | ADD ./bubuku "${SRC_PATH}/bubuku" 27 | ADD ./requirements.txt "${SRC_PATH}/" 28 | ADD ./setup.py "${SRC_PATH}/" 29 | 30 | RUN mkdir -p $KAFKA_LOGS_DIR/ && \ 31 | cd "${SRC_PATH}" && \ 32 | pip3 install --no-cache-dir -r "requirements.txt" && \ 33 | python3 setup.py develop && \ 34 | chmod -R 777 $KAFKA_LOGS_DIR && \ 35 | chmod 777 ${KAFKA_SETTINGS} && \ 36 | \ 37 | mkdir ${KAFKA_DIR}/logs && \ 38 | chmod 777 ${KAFKA_DIR}/logs 39 | 40 | RUN apt-get -y autoremove && \ 41 | apt-get clean && \ 42 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* 43 | 44 | ENV HEALTH_PORT=8080 45 | ENV BUKU_FEATURES="restart_on_exhibitor,rebalance_on_start,graceful_terminate,use_ip_address" 46 | ENV KAFKA_OPTS="-server -Dlog4j.configuration=file:${KAFKA_DIR}/config/log4j.properties -Dkafka.logs.dir=${KAFKA_LOGS_DIR} -javaagent:/opt/jolokia-jvm-agent.jar=host=0.0.0.0" 47 | ENV KAFKA_JMX_OPTS="-Dcom.sun.management.jmxremote=true -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false" 48 | 49 | EXPOSE 9092 8080 8778 50 | 51 | ENTRYPOINT ["/bin/bash", "-c", "exec bubuku-daemon"] 52 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016 Zalando SE 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 4 | documentation files (the "Software"), to deal in the Software without restriction, including without limitation the 5 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit 6 | persons to whom the Software is furnished to do so, subject to the following conditions: 7 | 8 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the 9 | Software. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 12 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR 13 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 14 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /MAINTAINERS: -------------------------------------------------------------------------------- 1 | Andrey Dyachkov 2 | Dmitry Sorokin 3 | Ricardo De Cillo 4 | Vyacheslav Stepanov 5 | Lionel Montrieux 6 | Suyash Garg 7 | Kunal Jha 8 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | 2 | include *.txt 3 | recursive-include bubuku *.py -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Archived 2 | 3 | **This repository is now archived.** 4 | 5 | 6 | [![Build Status](https://travis-ci.org/zalando-incubator/bubuku.svg)](https://travis-ci.org/zalando-incubator/bubuku) 7 | # Introduction 8 | 9 | Bubuku - supervisor for kafka 10 | 11 | 12 | Google translate with automatic language detection says that it means 'day' in 13 | Xhosa language. 14 | 15 | 16 | The purpose of bubuku is to start, monitor and rebalance kafka cluster in AWS setup, handling these actions in 17 | coordinated manner. 18 | 19 | Bubuku assumes that kafka is already installed on a machine. Version of kafka that is tested to be working 20 | with current release: 0.9.0.1 21 | 22 | # Installation 23 | ``` 24 | pip3 install bubuku 25 | ``` 26 | 27 | # Running 28 | Start supervisor: 29 | ``` 30 | KAFKA_DIR=/opt/kafka KAFKA_SETTINGS=/opt/kafka/config/server.properties ZOOKEEPER_STACK_NAME=my_zookeeper \ 31 | ZOOKEEPER_PREFIX=/test BROKER_ID_POLICY=ip BUKU_FEATURES=restart_on_exhibitor,graceful_terminate \ 32 | HEALTH_PORT=8888 bubuku-daemon 33 | ``` 34 | Run commands on cluster: 35 | ``` 36 | export KAFKA_DIR=/opt/kafka 37 | export KAFKA_SETTINGS=/opt/kafka/config/server.properties 38 | export ZOOKEEPER_STACK_NAME=my_zookeeper 39 | export ZOOKEEPER_PREFIX=/test 40 | export BROKER_ID_POLICY=ip 41 | 42 | # Restart kafka on current node 43 | bubuku-cli restart 44 | # Restart kafka on some other node (broker id must be known) 45 | bubuku-cli restart --broker=12324 46 | 47 | # Invoke partitions rebalance 48 | bubuku-cli rebalance 49 | ``` 50 | It is important to have all properties provided, because command processing is made over zookeeper stack. 51 | 52 | # Configuration 53 | 54 | Bubuku can be configured using environment properties: 55 | 56 | - `KAFKA_DIR` - kafka root directory (for example /opt/kafka) 57 | - `KAFKA_SETTINGS` - Path to kafka settings template file. Bubuku will add (or replace/delete) it's own 58 | properties to this file and write the contents to `${KAFKA_DIR}/config/server.properties`. Kafka will be started 59 | against generated file. 60 | - `ZOOKEEPER_STACK_NAME` - AWS load balancer name for zookeeper stack 61 | - `ZOOKEEPER_STATIC_IPS_PORT` - (overrides ZOOKEEPER_STACK_NAME) - static list 62 | of ips/port of zookeeper stack in the following form: 127.0.0.1,127.0.0.2,127.0.0.3:2181 - several ips and 1 port 63 | - `ZOOKEEPER_PREFIX` - Prefix for all the nodes in zk for kafka and bubuku 64 | - `BROKER_ID_POLICY` - Policy for generating broker id. 
Possible values are `ip` and `auto` 65 | - `BUKU_FEATURES` - List of optional bubuku features, see [features](#features) section 66 | - `HEALTH_PORT` - Port for health checks 67 | - `FREE_SPACE_DIFF_THRESHOLD_MB` - Threshold for starting `balance_data_size` feature, if it's enabled 68 | - `STARTUP_TIMEOUT_TYPE`, `STARTUP_TIMEOUT_INITIAL`, `STARTUP_TIMEOUT_STEP` - The way bubuku manages [time to start for kafka](#startup_timeout). 69 | 70 | # Features # 71 | 72 | Bubuku provides: 73 | 74 | - Ip address discovery using AWS 75 | - Exhibitor discovery using AWS load balancer name 76 | - Rebalance partitions on different events 77 | - React on exhibitor topology change 78 | - Automatic kafka restart in case if broker is considered dead 79 | - Graceful broker termination in case of supervisor stop 80 | - Broker start/stop/restart synchronization across cluster 81 | 82 | ## Pluggable features 83 | Pluggable features are defined in configuration and are disabled by default. List of features: 84 | 85 | - `restart_on_exhibitor` - restart kafka broker on zookeeper address list change. Kafka by itself do not support 86 | zookeeper list provider, so in order to override list of zookeeper instances in runtime there will be configuration 87 | change and broker restart. This change is made one by one for each broker in cluster (so kafka instances won't die 88 | all at the same time) 89 | - `rebalance_on_start` - Rebalance partition distribution across cluster (using partition count and leader count 90 | per broker as optimization strategy) during initial broker startup 91 | - `rebalance_on_brokers_change` - Rebalance partition distribution across cluster (using partition count and leader 92 | count per broker as optimization strategy) on any broker list change (new broker started, old broker died) 93 | - `graceful_terminate` - In case when bubuku is killed, try to gracefully terminate kafka process. 94 | - `use_ip_address` - Use ip address when registering kafka instance. By default kafka registers itself in 95 | zookeeper using hostname. Sometimes (for example on migration between AWS regions) it makes sense to use ip 96 | address instead of hostname. 97 | - `balance_data_size` - Swap partitions one by one by one if imbalance in size on brokers is bigger than 98 | `FREE_SPACE_DIFF_THRESHOLD_MB` megabytes. 99 | 100 | ## Timeouts for startup 101 | Each time when bubuku tries to start kafka, it uses special startup timeout. This means, that if kafka broker id 102 | is not found within this timeout in zookeeper node `/broker/ids/{id}`, kafka process will be forcibly killed, timeout 103 | for start updated, and startup will be retried. 104 | 105 | There are two ways to increase timeout - linear and progressive. Linear adds the same amount of time after each 106 | failed start. Progressive adds time, that is relative to current timeout. Configuration for that is provided by 107 | `STARTUP_TIMEOUT_TYPE`, `STARTUP_TIMEOUT_INITIAL`, `STARTUP_TIMEOUT_STEP` parameters. 
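As an illustration (not part of bubuku itself), the sketch below shows how the timeout value evolves after each failed start; it mirrors the `LinearTimeout` and `ProgressiveTimeout` classes in `bubuku/broker.py`. The environment configuration for both modes follows after it.
```
# Sketch only: how the startup timeout grows after each failed kafka start.
# Linear adds STARTUP_TIMEOUT_STEP, progressive multiplies by (1 + STARTUP_TIMEOUT_STEP).
def next_timeout(timeout_type: str, current: float, step: float) -> float:
    if timeout_type == 'linear':
        return current + step          # 300 -> 360 -> 420 -> ...
    if timeout_type == 'progressive':
        return current * (1 + step)    # 300 -> 450 -> 675 -> ...
    return current                     # 'none': timeout is never increased

timeout = 300.0
for _ in range(3):
    print(timeout)                     # progressive with step 0.5: 300.0, 450.0, 675.0
    timeout = next_timeout('progressive', timeout, 0.5)
```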
108 | ``` 109 | # Linear timeout configuration 110 | # initial timeout=300 seconds, after each failed start increase by 60 seconds (360, 420 and so on) 111 | export STARTUP_TIMEOUT_TYPE="linear" 112 | export STARTUP_TIMEOUT_INITIAL="300" 113 | export STARTUP_TIMEOUT_STEP="60" 114 | ``` 115 | ``` 116 | # Progressive timeout configuration 117 | # Initial timeout=300 seconds, after each failed start increase by timeout * 0.5 (450, 675 and so on) 118 | export STARTUP_TIMEOUT_TYPE="progressive" 119 | export STARTUP_TIMEOUT_INITIAL="300" 120 | export STARTUP_TIMEOUT_STEP="0.5" 121 | ``` 122 | 123 | Default values for timeout are 124 | ``` 125 | export STARTUP_TIMEOUT_TYPE="linear" 126 | export STARTUP_TIMEOUT_INITIAL="300" 127 | export STARTUP_TIMEOUT_STEP="60" 128 | ``` 129 | 130 | # Command line interface 131 | 132 | Bubuku provides a command line tool `bubuku-cli` which should be used directly on the instance. See detailed 133 | description of all commands [here](https://github.com/zalando-nakadi/bubuku/blob/master/cli_docs/cli.md). 134 | 135 | # How to contribute 136 | 137 | If you have any features or bugfixes - make pull request providing feature/bugfix and tests that will test your 138 | feature/bugfix. 139 | 140 | # Reporting issues 141 | 142 | If you experiencing problems with bubuku and you know that it can be improved - please fill free to post issue 143 | to github. Please provide full description of feature or bug (optionally with unit test), so it can be fixed 144 | faster. 145 | 146 | # License 147 | 148 | Copyright (c) 2016 Zalando SE 149 | 150 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated 151 | documentation files (the "Software"), to deal in the Software without restriction, including without limitation the 152 | rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit 153 | persons to whom the Software is furnished to do so, subject to the following conditions: 154 | 155 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the 156 | Software. 157 | 158 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 159 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 160 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 161 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 162 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | We acknowledge that every line of code that we write may potentially contain security issues. We are trying to deal with it responsibly and provide patches as quickly as possible. 2 | 3 | We host our bug bounty program on HackerOne, it is currently private, therefore if you would like to report a vulnerability and get rewarded for it, please ask to join our program by filling this form: 4 | 5 | https://corporate.zalando.com/en/services-and-contact#security-form 6 | 7 | You can also send your report via this form if you do not want to join our bug bounty program and just want to report a vulnerability or security issue. 
8 | 9 | -------------------------------------------------------------------------------- /bubuku/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.10.48' 2 | -------------------------------------------------------------------------------- /bubuku/aws/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import boto3 4 | from botocore.config import Config 5 | 6 | 7 | class AWSResources(object): 8 | def __init__(self, region, retries=100): 9 | boto3.set_stream_logger('boto3', logging.INFO) 10 | self.session = boto3.Session() 11 | self.region = region 12 | self.retries = retries 13 | self._ec2_client = None 14 | self._ec2_resource = None 15 | self._cloudwatch_client = None 16 | self._iam_client = None 17 | 18 | @property 19 | def ec2_client(self): 20 | if not self._ec2_client: 21 | self._ec2_client = self.session.client( 22 | 'ec2', 23 | region_name=self.region, 24 | config=Config(retries={'max_attempts': self.retries})) 25 | return self._ec2_client 26 | 27 | @property 28 | def ec2_resource(self): 29 | if not self._ec2_resource: 30 | self._ec2_resource = self.session.resource( 31 | 'ec2', 32 | region_name=self.region, 33 | config=Config(retries={'max_attempts': self.retries})) 34 | return self._ec2_resource 35 | 36 | @property 37 | def cloudwatch_client(self): 38 | if not self._cloudwatch_client: 39 | self._cloudwatch_client = self.session.client( 40 | 'cloudwatch', 41 | region_name=self.region, 42 | config=Config(retries={'max_attempts': self.retries})) 43 | return self._cloudwatch_client 44 | 45 | @property 46 | def iam_client(self): 47 | if not self._iam_client: 48 | self._iam_client = self.session.client('iam', config=Config(retries={'max_attempts': self.retries})) 49 | return self._iam_client 50 | -------------------------------------------------------------------------------- /bubuku/aws/cluster_config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import requests 4 | import yaml 5 | 6 | _ARTIFACT_NAME = 'bubuku-appliance' 7 | 8 | _LOG = logging.getLogger('bubuku.aws.cluster_config') 9 | 10 | 11 | class ConfigLoader(object): 12 | def load_user_data(self) -> dict: 13 | pass 14 | 15 | def load_region(self) -> str: 16 | pass 17 | 18 | def load_ami_id(self) -> str: 19 | pass 20 | 21 | 22 | class AwsInstanceUserDataLoader(ConfigLoader): 23 | def load_user_data(self): 24 | return yaml.load(requests.get('http://169.254.169.254/latest/user-data').text, Loader=yaml.FullLoader) 25 | 26 | def load_region(self) -> str: 27 | return requests.get('http://169.254.169.254/latest/meta-data/placement/region').text 28 | 29 | def load_ami_id(self) -> str: 30 | return requests.get('http://169.254.169.254/latest/meta-data/ami-id').text 31 | 32 | 33 | class ClusterConfig(object): 34 | 35 | def __init__(self, config_loader: ConfigLoader): 36 | self._user_data = config_loader.load_user_data() 37 | self._env_vars = self._user_data.get('environment') 38 | self._aws_region = config_loader.load_region() 39 | self._ami_id = config_loader.load_ami_id() 40 | self._overrides = {} 41 | 42 | def get_cluster_name(self): 43 | return self._env_vars.get('CLUSTER_NAME') 44 | 45 | def get_aws_region(self): 46 | return self._aws_region 47 | 48 | def get_instance_type(self): 49 | return self._user_data.get('instance_type') 50 | 51 | def get_ami_id(self): 52 | return self._ami_id 53 | 54 | def get_vpc_id(self): 55 | return 
self._user_data.get('vpc_id') 56 | 57 | def get_tags(self): 58 | return self._user_data.get('tags', []) 59 | 60 | def get_user_data(self): 61 | return dict(self._user_data) 62 | 63 | def get_overrides(self): 64 | return self._overrides 65 | 66 | def set_overrides(self, **overrides): 67 | self._overrides = overrides 68 | 69 | if overrides.get('application_version'): 70 | self._user_data['application_version'] = overrides['application_version'] 71 | self._user_data['source'] = '{}:{}'.format( 72 | self._user_data['source'].split(':', 1)[0], overrides['application_version']) 73 | if overrides.get('instance_type'): 74 | self._user_data['instance_type'] = overrides['instance_type'] 75 | if overrides.get('scalyr_account_key'): 76 | self._user_data['scalyr_account_key'] = overrides['scalyr_account_key'] 77 | if overrides.get('scalyr_region'): 78 | self._user_data['scalyr_region'] = overrides['scalyr_region'] 79 | if overrides.get('kms_key_id'): 80 | self._user_data['kms_key_id'] = overrides['kms_key_id'] 81 | if overrides.get('ami_id'): 82 | self._ami_id = overrides['ami_id'] 83 | 84 | for k, v in overrides.items(): 85 | if k not in ('application_version', 'instance_type', 'scalyr_account_key', 'scalyr_region', 'kms_key_id'): 86 | _LOG.warning("Unsupported argument %s with value %s", k, v) 87 | 88 | @staticmethod 89 | def create_overrides_dict( 90 | application_version: str = None, 91 | instance_type: str = None, 92 | scalyr_account_key: str = None, 93 | scalyr_region: str = None, 94 | kms_key_id: str = None, 95 | ami_id: str = None): 96 | 97 | # Pack arguments by the name passed to the method, in case if value for the argument is set 98 | def _filter_out_empty(**kwargs) -> dict: 99 | return {k: v for k, v in kwargs.items() if v} 100 | 101 | return _filter_out_empty( 102 | application_version=application_version, 103 | instance_type=instance_type, 104 | scalyr_account_key=scalyr_account_key, 105 | scalyr_region=scalyr_region, 106 | kms_key_id=kms_key_id, 107 | ami_id=ami_id 108 | ) 109 | -------------------------------------------------------------------------------- /bubuku/aws/ec2_node_launcher.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import logging 3 | import time 4 | import yaml 5 | 6 | from bubuku.aws import AWSResources 7 | from bubuku.aws.cluster_config import ClusterConfig 8 | from bubuku.aws.ip_address_allocator import IpAddressAllocator 9 | from bubuku.aws.node import KAFKA_LOGS_EBS 10 | 11 | _LOG = logging.getLogger('bubuku.aws.ec2_node') 12 | 13 | 14 | class Ec2NodeLauncher(object): 15 | def __init__(self, aws: AWSResources, cluster_config: ClusterConfig, az: str): 16 | self._aws = aws 17 | self._cluster_config = cluster_config 18 | self._az = az 19 | 20 | def _launch_instance(self, ip: str, subnet: dict, ami: object, security_group_id: str, iam_profile): 21 | # 22 | # Override any ephemeral volumes with NoDevice mapping, 23 | # otherwise auto-recovery alarm cannot be actually enabled. 24 | # 25 | _LOG.info('Overriding ephemeral volumes to be able to set up AWS auto recovery alarm ') 26 | block_devices = [] 27 | for bd in ami.block_device_mappings: 28 | if 'Ebs' in bd: 29 | # 30 | # This has to be our root EBS. 31 | # 32 | # If the Encrypted flag is present, we have to delete 33 | # it even if it matches the actual snapshot setting, 34 | # otherwise amazon will complain rather loudly. 
35 | # 36 | # Take a deep copy before deleting the key: 37 | # 38 | bd = copy.deepcopy(bd) 39 | 40 | root_ebs = bd['Ebs'] 41 | if 'Encrypted' in root_ebs: 42 | del (root_ebs['Encrypted']) 43 | 44 | block_devices.append(bd) 45 | else: 46 | # ignore any ephemeral volumes (aka. instance storage) 47 | block_devices.append({ 48 | 'DeviceName': bd['DeviceName'], 49 | 'NoDevice': '' 50 | }) 51 | 52 | user_data = self._cluster_config.get_user_data() 53 | user_data['volumes']['ebs']['/dev/xvdk'] = KAFKA_LOGS_EBS 54 | taupage_user_data = '#taupage-ami-config\n{}'.format(yaml.safe_dump(user_data)) 55 | 56 | _LOG.info('Launching node %s in %s', ip, subnet['AvailabilityZone']) 57 | resp = self._aws.ec2_client.run_instances( 58 | ImageId=ami.id, 59 | MinCount=1, 60 | MaxCount=1, 61 | SecurityGroupIds=[security_group_id], 62 | UserData=taupage_user_data, 63 | InstanceType=self._cluster_config.get_instance_type(), 64 | SubnetId=subnet['SubnetId'], 65 | PrivateIpAddress=ip, 66 | BlockDeviceMappings=block_devices, 67 | IamInstanceProfile={'Arn': iam_profile['Arn']}, 68 | DisableApiTermination=False, 69 | EbsOptimized=True) 70 | 71 | instance_id = resp['Instances'][0]['InstanceId'] 72 | _LOG.info('Instance %s launched', instance_id) 73 | 74 | attempts = 2 75 | while True: 76 | try: 77 | self._aws.ec2_client.create_tags( 78 | Resources=[instance_id], 79 | Tags=(self._cluster_config.get_tags() + [ 80 | {'Key': 'Name', 'Value': self._cluster_config.get_cluster_name()}, 81 | {'Key': 'StackName', 'Value': self._cluster_config.get_cluster_name()} 82 | ]) 83 | ) 84 | break 85 | 86 | except Exception as e: 87 | attempts -= 1 88 | if attempts == 0: 89 | raise e 90 | _LOG.error('Failed to create instance tags, will retry...', exc_info=e) 91 | time.sleep(5) 92 | 93 | return instance_id 94 | 95 | def create_auto_recovery_alarm(self, instance_id): 96 | _LOG.info('Creating AWS auto recovery alarm for %s', instance_id) 97 | alarm_actions = ['arn:aws:automate:{}:ec2:recover'.format(self._cluster_config.get_aws_region())] 98 | alarm_name = '{}-{}-auto-recover'.format(self._cluster_config.get_cluster_name(), instance_id) 99 | 100 | self._aws.cloudwatch_client.put_metric_alarm( 101 | AlarmName=alarm_name, 102 | AlarmActions=alarm_actions, 103 | MetricName='StatusCheckFailed_System', 104 | Namespace='AWS/EC2', 105 | Statistic='Minimum', 106 | Dimensions=[{ 107 | 'Name': 'InstanceId', 108 | 'Value': instance_id 109 | }], 110 | Period=60, # 1 minute 111 | EvaluationPeriods=2, 112 | Threshold=0, 113 | ComparisonOperator='GreaterThanThreshold') 114 | _LOG.info('Created alarm %s', alarm_name) 115 | 116 | def launch(self): 117 | _LOG.info('Preparing AWS configuration for ec2 instance creation') 118 | ip_address_allocator = IpAddressAllocator(self._aws, self._cluster_config.get_vpc_id(), self._az) 119 | subnet, ip = ip_address_allocator.allocate_ip_addresses(1)[0] 120 | return self._launch_instance(ip, 121 | subnet, 122 | self._find_ami(), 123 | self._get_security_group_id(), 124 | self._get_instance_profile()) 125 | 126 | def _get_instance_profile(self): 127 | profile_name = 'profile-{}'.format(self._cluster_config.get_cluster_name()) 128 | profile = self._aws.iam_client.get_instance_profile(InstanceProfileName=profile_name) 129 | _LOG.info("IAM profile %s exists, using it", profile_name) 130 | return profile['InstanceProfile'] 131 | 132 | def _get_security_group_id(self) -> str: 133 | _LOG.info('Configuring security group ...') 134 | security_groups = self._aws.ec2_client.describe_security_groups( 135 | Filters=[{'Name': 
'group-name', 'Values': [self._cluster_config.get_cluster_name()]}]) 136 | if security_groups['SecurityGroups']: 137 | sg = security_groups['SecurityGroups'][0] 138 | _LOG.info('Security group for %s exists, will use it %s', 139 | self._cluster_config.get_cluster_name(), sg['GroupId']) 140 | return sg['GroupId'] 141 | raise Exception('Security group does not exist for {}'.format(self._cluster_config.get_cluster_name())) 142 | 143 | def _get_ip_permission(self, port: int): 144 | return { 145 | 'IpProtocol': 'tcp', 146 | 'FromPort': port, 147 | 'ToPort': port, 148 | 'IpRanges': [{'CidrIp': '0.0.0.0/0'}] 149 | } 150 | 151 | def _find_ami(self) -> dict: 152 | _LOG.info('Finding latest Taupage AMI.') 153 | filters = [{'Name': 'image-id', 'Values': [self._cluster_config.get_ami_id()]}] 154 | images = list(self._aws.ec2_resource.images.filter(Filters=filters)) 155 | if not images: 156 | raise Exception('No Taupage AMI found') 157 | most_recent_image = sorted(images, key=lambda i: i.name)[-1] # It s expected that image is only one 158 | 159 | _LOG.info('The AMI to use is %s', most_recent_image) 160 | 161 | return most_recent_image 162 | -------------------------------------------------------------------------------- /bubuku/aws/ip_address_allocator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import netaddr 4 | 5 | from bubuku.aws import AWSResources 6 | 7 | _LOG = logging.getLogger('bubuku.cluster.aws.subnet') 8 | 9 | 10 | class IpAddressAllocator(object): 11 | def __init__(self, aws: AWSResources, vpc_id: str, az: str): 12 | self.aws = aws 13 | self._vpc_id = vpc_id 14 | self._az = az 15 | 16 | def _get_subnets(self, prefix_filter: str) -> list: 17 | """ 18 | Returns lists of subnets, which names start 19 | with the specified prefix (it should be either 'dmz-' or 20 | 'internal-'), sorted by the Availability Zone and filtered by vpc id 21 | """ 22 | _LOG.info('Getting subnets for vpc_id: %s and availability_zone: %s', self._vpc_id, self._az) 23 | 24 | resp = self.aws.ec2_client.describe_subnets() 25 | subnets = [] 26 | 27 | for subnet in resp['Subnets']: 28 | if subnet['VpcId'] != self._vpc_id: 29 | continue 30 | if subnet['AvailabilityZone'] != self._az: 31 | continue 32 | for tag in subnet['Tags']: 33 | if tag['Key'] == 'Name' and tag['Value'].startswith(prefix_filter): 34 | subnets.append(subnet) 35 | break 36 | _LOG.info('Got subnets %s ', subnets) 37 | return subnets 38 | 39 | def allocate_ip_addresses(self, address_count: int) -> list: 40 | """ 41 | Allocate unused private IP addresses by checking the current 42 | reservations 43 | Return list of tuples (subnet, ip) 44 | """ 45 | _LOG.info('Allocating IP addresses ...') 46 | 47 | def try_next_address(ips, subnet): 48 | try: 49 | return str(next(ips)) 50 | except StopIteration: 51 | raise Exception('Out of available IP addresses in subnet {}'.format(subnet['CidrBlock'])) 52 | 53 | # 54 | # Here we have to account for the behavior of launch_*_nodes 55 | # which iterate through subnets to put the instances into 56 | # different Availability Zones. 57 | # 58 | subnets = self._get_subnets('internal-') 59 | network_ips = [netaddr.IPNetwork(s['CidrBlock']).iter_hosts() for s in subnets] 60 | 61 | for idx, ips in enumerate(network_ips): 62 | # 63 | # Some of the first addresses in each subnet are 64 | # taken by AWS system instances that we can't see, 65 | # so we try to skip them. 
66 | # 67 | for _ in range(10): 68 | try_next_address(ips, subnets[idx]) 69 | 70 | i = 0 71 | result_subnets_ips = [] 72 | while i < address_count: 73 | idx = i % len(subnets) 74 | subnet = subnets[idx] 75 | ip = try_next_address(network_ips[idx], subnet) 76 | resp = self.aws.ec2_client.describe_instances(Filters=[{ 77 | 'Name': 'private-ip-address', 78 | 'Values': [ip] 79 | }]) 80 | if not resp['Reservations']: 81 | i += 1 82 | _LOG.info('Got ip address %s ', ip) 83 | result_subnets_ips.append((subnet, ip)) 84 | 85 | _LOG.info('IP Addresses are allocated') 86 | 87 | return result_subnets_ips 88 | -------------------------------------------------------------------------------- /bubuku/aws/node.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from bubuku.aws import AWSResources 4 | from bubuku.aws.cluster_config import ClusterConfig 5 | 6 | _LOG = logging.getLogger('bubuku.aws.node') 7 | 8 | KAFKA_LOGS_EBS = 'kafka-logs-ebs' 9 | 10 | 11 | class Ec2Node(object): 12 | def __init__(self, aws: AWSResources, cluster_config: ClusterConfig, ip: str): 13 | self.aws = aws 14 | self.cluster_config = cluster_config 15 | self.ip = ip 16 | self.instance = self._get_instance_by_ip() 17 | _LOG.info('Searching for instance %s volumes', self.instance.instance_id) 18 | volumes = self.aws.ec2_client.describe_instance_attribute(InstanceId=self.instance.instance_id, 19 | Attribute='blockDeviceMapping') 20 | data_volume = next(v for v in volumes['BlockDeviceMappings'] if v['DeviceName'] == '/dev/xvdk') 21 | data_volume_id = data_volume['Ebs']['VolumeId'] 22 | self.volume = self.aws.ec2_resource.Volume(data_volume_id) 23 | 24 | def get_node_availability_zone(self): 25 | return self.volume.availability_zone 26 | 27 | def get_volume_id(self): 28 | return self.volume.id 29 | 30 | def get_ip(self): 31 | return self.ip 32 | 33 | def is_volume_in_use(self): 34 | self.volume.load() 35 | if self.volume.state == 'in-use': 36 | _LOG.info('Volume %s is attached. Clearing tag:Name', self.volume) 37 | self.volume.create_tags(Tags=[{'Key': 'Name', 'Value': ''}]) 38 | _LOG.info('Completed clearing tag:Name for %s', self.volume) 39 | return True 40 | return False 41 | 42 | def is_volume_available(self): 43 | self.volume.load() 44 | return self.volume.state == 'available' 45 | 46 | def detach_volume(self): 47 | self.volume.create_tags(Tags=[{'Key': 'Name', 'Value': KAFKA_LOGS_EBS}]) 48 | _LOG.info('Detaching %s from %s', self.volume.id, self.instance.instance_id) 49 | self.aws.ec2_client.detach_volume(VolumeId=self.volume.id, Force=False) 50 | 51 | def terminate(self): 52 | cluster_name = self.cluster_config.get_cluster_name() 53 | _LOG.info('Terminating %s in %s', self.instance, cluster_name) 54 | alarm_name = '{}-{}-auto-recover'.format(cluster_name, self.instance.instance_id) 55 | _LOG.info('Deleting alarm %s in %s for %s', alarm_name, cluster_name, self.instance.instance_id) 56 | self.aws.cloudwatch_client.delete_alarms(AlarmNames=[alarm_name]) 57 | self.instance.terminate() 58 | 59 | def is_terminated(self): 60 | self.instance.load() 61 | _LOG.info('Instance state is %s. 
Waiting ...', self.instance.state['Name']) 62 | if self.instance.state['Name'] == 'terminated': 63 | _LOG.info('%s is successfully terminated', self.instance) 64 | return True 65 | return False 66 | 67 | def _get_instance_by_ip(self): 68 | instances = self.aws.ec2_resource.instances.filter(Filters=[ 69 | {'Name': 'instance-state-name', 'Values': ['running', 'pending']}, 70 | {'Name': 'network-interface.addresses.private-ip-address', 'Values': [self.ip]}, 71 | {'Name': 'tag:Name', 'Values': [self.cluster_config.get_cluster_name()]}]) 72 | instances = list(instances) 73 | if not instances: 74 | raise Exception('Instance by ip {} not found in cluster {}' 75 | .format(self.ip, self.cluster_config.get_cluster_name())) 76 | _LOG.info('Found %s by ip %s', instances[0], self.ip) 77 | return instances[0] 78 | -------------------------------------------------------------------------------- /bubuku/broker.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from time import time, sleep 4 | 5 | from bubuku.config import KafkaProperties 6 | from bubuku.id_extractor import BrokerIdExtractor 7 | from bubuku.process import KafkaProcess 8 | from bubuku.zookeeper import BukuExhibitor 9 | 10 | _LOG = logging.getLogger('bubuku.broker') 11 | 12 | 13 | class LeaderElectionInProgress(Exception): 14 | pass 15 | 16 | 17 | class StartupTimeout(object): 18 | def is_timed_out(self, seconds: float) -> bool: 19 | raise Exception('Not supported') 20 | 21 | def on_timeout_fail(self): 22 | raise Exception('Not supported') 23 | 24 | @staticmethod 25 | def build(props: dict): 26 | type_ = props.get('type', 'linear') 27 | if type_ == 'linear': 28 | return LinearTimeout(float(props.get('initial', 300)), float(props.get('step', '60'))) 29 | elif type_ == 'progressive': 30 | return ProgressiveTimeout(float(props.get('initial', 300)), float(props.get('step', '0.5'))) 31 | elif type_ == 'none': 32 | return NoTimeout() 33 | else: 34 | raise NotImplementedError('Startup timeout type {} is not valid'.format(type_)) 35 | 36 | 37 | class ProgressiveTimeout(StartupTimeout): 38 | def __init__(self, initial: float, scale: float): 39 | self.initial = initial 40 | self.timeout = initial 41 | self.scale = scale 42 | 43 | def is_timed_out(self, seconds: float) -> bool: 44 | return seconds > self.timeout 45 | 46 | def on_timeout_fail(self): 47 | self.timeout = self.timeout * (1 + self.scale) 48 | 49 | def __str__(self): 50 | return 'Progressive, initial={}, scale={}, current={}'.format(self.initial, self.scale, self.timeout) 51 | 52 | 53 | class LinearTimeout(StartupTimeout): 54 | def __init__(self, initial: float, step: float): 55 | self.initial = initial 56 | self.timeout = initial 57 | self.step = step 58 | 59 | def is_timed_out(self, seconds: float) -> bool: 60 | return seconds > self.timeout 61 | 62 | def on_timeout_fail(self): 63 | self.timeout += self.step 64 | 65 | def __str__(self): 66 | return 'Linear, initial={}, step={}, current={}'.format(self.initial, self.step, self.timeout) 67 | 68 | 69 | class NoTimeout(StartupTimeout): 70 | def is_timed_out(self, seconds: float) -> bool: 71 | return False 72 | 73 | def on_timeout_fail(self): 74 | pass 75 | 76 | 77 | class BrokerManager(object): 78 | def __init__(self, process: KafkaProcess, exhibitor: BukuExhibitor, 79 | id_manager: BrokerIdExtractor, kafka_properties: KafkaProperties, timeout: StartupTimeout): 80 | self.id_manager = id_manager 81 | self.exhibitor = exhibitor 82 | self.kafka_properties = kafka_properties 
83 | self.process = process 84 | self.timeout = timeout 85 | 86 | def is_running(self): 87 | return self.process.is_running() 88 | 89 | def is_registered_in_zookeeper(self): 90 | return self.id_manager.is_registered() 91 | 92 | def get_zookeeper_session_timeout(self): 93 | return int(self.kafka_properties.get_property( 94 | "zookeeper.connection.timeout.ms")or 6000) 95 | 96 | def is_running_and_registered(self): 97 | if not self.process.is_running(): 98 | return False 99 | return self.id_manager.is_registered() 100 | 101 | def stop_kafka_process(self): 102 | if self.process.is_running(): 103 | self.process.stop_and_wait() 104 | self._wait_for_zk_absence() 105 | 106 | def _is_clean_election(self): 107 | value = self.kafka_properties.get_property('unclean.leader.election.enable') 108 | return value == 'false' 109 | 110 | def has_leadership(self): 111 | """ 112 | Says if this broker is still a leader for partitions 113 | :return: True, if broker is a leader for some partitions. 114 | """ 115 | broker_id = self.id_manager.get_broker_id() 116 | if not broker_id: 117 | return False 118 | return not self._is_leadership_transferred(dead_broker_ids=[broker_id]) 119 | 120 | def _wait_for_zk_absence(self): 121 | try: 122 | while self.id_manager.is_registered(): 123 | sleep(1) 124 | except Exception as e: 125 | _LOG.error('Failed to wait until broker id absence in zk', exc_info=e) 126 | 127 | def get_zk_connect_string(self): 128 | return self.kafka_properties.get_property('zookeeper.connect') 129 | 130 | def start_kafka_process(self, zookeeper_address): 131 | """ 132 | Starts kafka using zookeeper address provided. 133 | :param zookeeper_address: Address to use for kafka 134 | :raise LeaderElectionInProgress: raised when broker can not be started because leader election is in progress 135 | """ 136 | if not self.process.is_running(): 137 | if not self._is_leadership_transferred(active_broker_ids=self.exhibitor.get_broker_ids()): 138 | raise LeaderElectionInProgress() 139 | 140 | _LOG.info('Using ZK address: {}'.format(zookeeper_address)) 141 | self.kafka_properties.set_property('zookeeper.connect', zookeeper_address) 142 | 143 | self.kafka_properties.dump() 144 | 145 | _LOG.info('Staring kafka process') 146 | self.process.start(self.kafka_properties.settings_file) 147 | 148 | _LOG.info('Waiting for kafka to start with timeout {}'.format(self.timeout)) 149 | start = time() 150 | while self.process.is_running(): 151 | if self.id_manager.is_registered(): 152 | break 153 | if self.timeout.is_timed_out(time() - start): 154 | self.timeout.on_timeout_fail() 155 | break 156 | sleep(1) 157 | if not self.process.is_running() or not self.id_manager.is_registered(): 158 | _LOG.error( 159 | 'Failed to wait for broker to start up, probably will kill, next timeout is'.format(self.timeout)) 160 | 161 | def _is_leadership_transferred(self, active_broker_ids=None, dead_broker_ids=None): 162 | _LOG.info('Checking if leadership is transferred: active_broker_ids={}, dead_broker_ids={}'.format( 163 | active_broker_ids, dead_broker_ids)) 164 | if self._is_clean_election(): 165 | topics = self.exhibitor.load_active_topics() 166 | for topic, partition, state in self.exhibitor.load_partition_states(topics=topics): 167 | leader = str(state['leader']) 168 | if active_broker_ids and leader not in active_broker_ids: 169 | if any(str(x) in active_broker_ids for x in state.get('isr', [])): 170 | _LOG.warning( 171 | 'Leadership is not transferred for {} {} ({}, brokers: {})'.format( 172 | topic, partition, json.dumps(state), 
active_broker_ids)) 173 | return False 174 | else: 175 | _LOG.warning('No single isr available for {}, {}, state: {}, skipping check for that'.format( 176 | topic, partition, json.dumps(state))) 177 | if dead_broker_ids and leader in dead_broker_ids: 178 | _LOG.warning('Leadership is not transferred for {} {}, {} (dead list: {})'.format( 179 | topic, partition, json.dumps(state), dead_broker_ids)) 180 | return False 181 | 182 | return True 183 | -------------------------------------------------------------------------------- /bubuku/communicate.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import threading 3 | import time 4 | from queue import Queue, Empty, Full 5 | 6 | __COMMAND_QUEUE = Queue() 7 | 8 | _LOG = logging.getLogger('bubuku.communicate') 9 | 10 | 11 | def sleep_and_operate(controller, timeout: float): 12 | cur_time = time.time() 13 | finish = cur_time + (0.1 if timeout <= 0 else timeout) 14 | while cur_time < finish: 15 | try: 16 | command = __COMMAND_QUEUE.get(block=True, timeout=finish - cur_time) 17 | try: 18 | command(controller) 19 | except Exception as e: 20 | _LOG.error('Command finished with error', exc_info=e) 21 | except Empty: 22 | pass 23 | cur_time = time.time() 24 | 25 | 26 | def execute_on_controller_thread(function, timeout): 27 | condition = threading.Condition() 28 | result = [None, True] 29 | 30 | def _execute(controller): 31 | with condition: 32 | if result[1]: 33 | try: 34 | result[0] = function(controller) 35 | finally: 36 | condition.notify() 37 | 38 | finish = time.time() + timeout 39 | with condition: 40 | try: 41 | __COMMAND_QUEUE.put(_execute, timeout=timeout) 42 | except Full: 43 | raise TimeoutError('Timeout expired') 44 | if condition.wait(timeout=finish - time.time()): 45 | return result[0] 46 | else: 47 | result[1] = False 48 | raise TimeoutError('Timeout expired') 49 | -------------------------------------------------------------------------------- /bubuku/config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from collections import namedtuple 4 | 5 | _LOG = logging.getLogger('bubuku.properties') 6 | 7 | Config = namedtuple('Config', ('kafka_dir', 'kafka_settings_template', 'zk_stack_name', 8 | 'zk_prefix', 'features', 'health_port', 'mode', 'timeout', 'zk_static_ips')) 9 | 10 | 11 | class KafkaProperties(object): 12 | def __init__(self, template: str, kafka_settings: str): 13 | self.lines = [] 14 | self.settings_file = kafka_settings 15 | _LOG.info('Loading template properties from {}'.format(template)) 16 | with open(template, 'r') as f: 17 | for l in f.readlines(): 18 | self.lines.append(_make_clean_line(l)) 19 | 20 | def get_property(self, name: str) -> str: 21 | idx = self._get_property_idx(name) 22 | if idx is not None: 23 | return self.lines[idx].split('=', 1)[1] 24 | return None 25 | 26 | def _get_property_idx(self, name: str): 27 | search = '{}='.format(name) 28 | for idx in range(0, len(self.lines)): 29 | if self.lines[idx].startswith(search): 30 | return idx 31 | return None 32 | 33 | def delete_property(self, name): 34 | idx = self._get_property_idx(name) 35 | if idx is not None: 36 | del self.lines[idx] 37 | 38 | def set_property(self, name, value): 39 | idx = self._get_property_idx(name) 40 | if idx is not None: 41 | self.lines[idx] = '{}={}'.format(name, value) 42 | else: 43 | self.lines.append('{}={}'.format(name, value)) 44 | 45 | def dump(self): 46 | _LOG.info('Dumping kafka properties to 
{}'.format(self.settings_file)) 47 | with open(self.settings_file, mode='w') as f: 48 | for l in self.lines: 49 | f.write('{}\n'.format(l)) 50 | 51 | 52 | def _load_timeout_dict(load_func): 53 | startup_timeout_pairs = [(name, load_func('STARTUP_TIMEOUT_{}'.format(name.upper()))) for name in 54 | ['type', 'initial', 'step']] 55 | return {name: value for name, value in startup_timeout_pairs if value} 56 | 57 | 58 | def load_config() -> Config: 59 | zk_prefix = os.getenv('ZOOKEEPER_PREFIX', '/') 60 | 61 | features_str = os.getenv('BUKU_FEATURES', '').lower() 62 | features = {key: {} for key in features_str.split(',')} if features_str else {} 63 | if "balance_data_size" in features: 64 | features["balance_data_size"]["diff_threshold_mb"] = int(os.getenv('FREE_SPACE_DIFF_THRESHOLD_MB', '50000')) 65 | return Config( 66 | kafka_dir=os.getenv('KAFKA_DIR'), 67 | kafka_settings_template=os.getenv('KAFKA_SETTINGS'), 68 | zk_stack_name=os.getenv('ZOOKEEPER_STACK_NAME'), 69 | zk_static_ips=os.getenv('ZOOKEEPER_STATIC_IPS_PORT'), 70 | zk_prefix=zk_prefix if zk_prefix.startswith('/') or not zk_prefix else '/{}'.format(zk_prefix), 71 | features=features, 72 | health_port=int(os.getenv('HEALTH_PORT', '8888')), 73 | mode=str(os.getenv('BUBUKU_MODE', 'amazon')).lower(), 74 | timeout=_load_timeout_dict(os.getenv) 75 | ) 76 | 77 | 78 | def _make_clean_line(l: str) -> str: 79 | result = l.strip() 80 | if result.startswith('#') or not result: 81 | return result 82 | if '=' not in result: 83 | return '' 84 | n, v = result.split('=', 1) 85 | return '{}={}'.format(n.strip(), v) 86 | -------------------------------------------------------------------------------- /bubuku/controller.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from time import time 3 | from typing import Tuple, Optional 4 | 5 | from bubuku.broker import BrokerManager 6 | from bubuku.communicate import sleep_and_operate 7 | from bubuku.env_provider import EnvProvider 8 | from bubuku.zookeeper import BukuExhibitor 9 | 10 | _LOG = logging.getLogger('bubuku.controller') 11 | 12 | 13 | class Change(object): 14 | def get_name(self) -> str: 15 | raise NotImplementedError('Not implemented yet') 16 | 17 | def can_run(self, current_actions) -> bool: 18 | raise NotImplementedError('Not implemented yet') 19 | 20 | # 21 | # Returns a flag indicating if the change should continue running (True). 22 | # In that case time_till_next_run() is called to determine when to schedule the next run. 
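    # Returning False means the change has completed: the controller then removes it from the
    # queue and calls on_remove() on it (see Controller._run_changes and _release_changes_lock below).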
23 | # 24 | def run(self, current_actions) -> bool: 25 | raise NotImplementedError('Not implemented') 26 | 27 | def time_till_next_run(self) -> float: 28 | return 0.5 29 | 30 | def can_run_at_exit(self) -> bool: 31 | return False 32 | 33 | def on_remove(self): 34 | pass 35 | 36 | 37 | class Check(object): 38 | def __init__(self, check_interval_s=5): 39 | self.check_interval_s = check_interval_s 40 | self.__last_check_timestamp_s = 0 41 | 42 | def check_if_time(self) -> Change: 43 | if self.time_till_check() <= 0: 44 | self.__last_check_timestamp_s = time() 45 | _LOG.info('Executing check {}'.format(self)) 46 | return self.check() 47 | return None 48 | 49 | def time_till_check(self): 50 | return self.__last_check_timestamp_s + self.check_interval_s - time() 51 | 52 | def check(self) -> Change: 53 | raise NotImplementedError('Not implemented') 54 | 55 | 56 | def _exclude_self(provider_id, name, running_actions): 57 | return [k for k, v in running_actions.items() if k != name or v != provider_id] 58 | 59 | 60 | class Controller(object): 61 | def __init__(self, broker_manager: BrokerManager, zk: BukuExhibitor, env_provider: EnvProvider): 62 | self.broker_manager = broker_manager 63 | self.zk = zk 64 | self.env_provider = env_provider 65 | self.checks = [] 66 | self.changes = {} # Holds mapping from change name to array of pending changes 67 | self.running = True 68 | self.provider_id = None # provider id must not be requested on initialization 69 | 70 | def enumerate_changes(self): 71 | with self.zk.lock(self.provider_id): 72 | running_changes = self.zk.get_running_changes() 73 | 74 | result = [] 75 | for name, change_list in self.changes.items(): 76 | running = running_changes.get(name) == self.provider_id 77 | first = True 78 | for change in change_list: 79 | result.append({ 80 | 'type': name, 81 | 'description': str(change), 82 | 'running': bool(first and running) 83 | }) 84 | first = False 85 | return result 86 | 87 | def cancel_changes(self, name): 88 | result = len(self.changes.get(name, {})) 89 | if result: 90 | if name in self.zk.get_running_changes(): 91 | for change in self.changes[name]: 92 | change.on_remove() 93 | with self.zk.lock(self.provider_id): 94 | self.zk.unregister_change(name) 95 | del self.changes[name] 96 | return result 97 | 98 | def add_check(self, check): 99 | _LOG.info('Adding check {}'.format(str(check))) 100 | self.checks.append(check) 101 | 102 | def _register_running_changes(self) -> dict: 103 | if not self.changes: 104 | return {} # Do not take lock if there are no changes to register 105 | _LOG.debug('Taking lock for processing') 106 | with self.zk.lock(self.provider_id): 107 | _LOG.debug('Lock is taken') 108 | # Get list of current running changes 109 | running_changes = self.zk.get_running_changes() 110 | if running_changes: 111 | _LOG.info("Running changes: {}".format(running_changes)) 112 | # Register changes to run 113 | for name, change_list in self.changes.items(): 114 | # Only first change is able to run 115 | first_change = change_list[0] 116 | if first_change.can_run(_exclude_self(self.provider_id, name, running_changes)): 117 | if name not in running_changes: 118 | self.zk.register_change(name, self.provider_id) 119 | running_changes[name] = self.provider_id 120 | else: 121 | _LOG.info('Change {} is waiting for others: {}'.format(name, running_changes)) 122 | return running_changes 123 | 124 | def _run_changes(self, running_changes: dict) -> Tuple[list, Optional[float]]: 125 | changes_to_remove = [] 126 | min_time_till_next_change_run = None 127 | 
for name, change_list in self.changes.copy().items(): 128 | if name in running_changes and running_changes[name] == self.provider_id: 129 | change = change_list[0] 130 | _LOG.info('Executing action {} step'.format(change)) 131 | if self.running or change.can_run_at_exit(): 132 | try: 133 | if not change.run(_exclude_self(self.provider_id, change.get_name(), running_changes)): 134 | _LOG.info('Action {} completed'.format(change)) 135 | changes_to_remove.append(change.get_name()) 136 | else: 137 | _LOG.info('Action {} will be executed on next loop step'.format(change)) 138 | time_till_next_run = change.time_till_next_run() 139 | if min_time_till_next_change_run is None: 140 | min_time_till_next_change_run = time_till_next_run 141 | else: 142 | min_time_till_next_change_run = min(time_till_next_run, min_time_till_next_change_run) 143 | except Exception as e: 144 | _LOG.error('Failed to execute change {} because of exception, removing'.format(change), 145 | exc_info=e) 146 | changes_to_remove.append(change.get_name()) 147 | else: 148 | _LOG.info( 149 | 'Action {} can not be run while stopping, forcing to stop it'.format(change)) 150 | changes_to_remove.append(change.get_name()) 151 | return changes_to_remove, min_time_till_next_change_run 152 | 153 | def _release_changes_lock(self, changes_to_remove): 154 | if changes_to_remove: 155 | for change_name in changes_to_remove: 156 | removed_change = self.changes[change_name][0] 157 | del self.changes[change_name][0] 158 | if not self.changes[change_name]: 159 | del self.changes[change_name] 160 | removed_change.on_remove() 161 | with self.zk.lock(): 162 | for name in changes_to_remove: 163 | self.zk.unregister_change(name) 164 | 165 | def loop(self, change_on_init=None): 166 | self.provider_id = self.env_provider.get_ip() 167 | if change_on_init: 168 | self._add_change_to_queue(change_on_init) 169 | while self.running or self.changes: 170 | time_till_next_step = self.make_step() 171 | 172 | timeouts = [check.time_till_check() for check in self.checks] 173 | timeouts.append(time_till_next_step or 5.0) 174 | 175 | sleep_and_operate(self, min(timeouts)) 176 | 177 | def make_step(self) -> Optional[float]: 178 | # register running changes 179 | running_changes = self._register_running_changes() 180 | 181 | # apply changes without holding lock 182 | changes_to_remove, time_till_next_run = self._run_changes(running_changes) 183 | 184 | # remove processed actions 185 | self._release_changes_lock(changes_to_remove) 186 | 187 | if self.running: 188 | for check in self.checks: 189 | change = check.check_if_time() 190 | if change: 191 | self._add_change_to_queue(change) 192 | # prioritize newly appearing change run 193 | time_till_next_run = 0.5 194 | 195 | return time_till_next_run 196 | 197 | def _add_change_to_queue(self, change): 198 | _LOG.info('Adding change {} to pending changes'.format(change.get_name())) 199 | if change.get_name() not in self.changes: 200 | self.changes[change.get_name()] = [] 201 | self.changes[change.get_name()].append(change) 202 | 203 | def stop(self, change: Change): 204 | _LOG.info('Stopping controller with additional change: {}'.format(change.get_name() if change else None)) 205 | # clear all pending changes 206 | if change: 207 | self._add_change_to_queue(change) 208 | self.running = False 209 | -------------------------------------------------------------------------------- /bubuku/controller_api.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import 
threading 4 | from functools import partial 5 | from http.server import BaseHTTPRequestHandler, HTTPServer 6 | 7 | from bubuku.communicate import execute_on_controller_thread 8 | from bubuku.controller import Controller 9 | from bubuku.env_provider import EnvProvider 10 | from bubuku.utils import CmdHelper 11 | from bubuku.features.metric_collector import MetricCollector 12 | from bubuku.config import load_config 13 | from bubuku.zookeeper import load_exhibitor_proxy 14 | 15 | _CONTROLLER_TIMEOUT = 5 16 | 17 | _API_CONTROLLER = '/api/controller/' 18 | 19 | _LOG = logging.getLogger('bubuku.health') 20 | 21 | 22 | def load_controller_queue(controller: Controller): 23 | return controller.enumerate_changes() 24 | 25 | 26 | def delete_from_controller_queue(name: str, controller: Controller): 27 | return { 28 | 'count': controller.cancel_changes(name) 29 | } 30 | 31 | 32 | class _Handler(BaseHTTPRequestHandler): 33 | cmd_helper = None 34 | 35 | def do_GET(self): 36 | if self.path in ('/api/disk_stats', '/api/disk_stats/'): 37 | used_kb, free_kb = self.cmd_helper.get_disk_stats() 38 | self._send_response({'free_kb': free_kb, 'used_kb': used_kb}) 39 | elif self.path.startswith(_API_CONTROLLER): 40 | self.wrap_controller_execution(lambda: self._run_controller_action(self.path[len(_API_CONTROLLER):])) 41 | elif self.path in ('/api/metrics', '/api/metrics/'): 42 | config = load_config() 43 | env_provider = EnvProvider.create_env_provider(config) 44 | with load_exhibitor_proxy(env_provider.get_address_provider(), config.zk_prefix) as zookeeper: 45 | metrics = MetricCollector(zookeeper).get_metrics_from_brokers() 46 | self._send_response({'metrics': metrics}) 47 | else: 48 | self._send_response({'status': 'OK'}) 49 | 50 | def wrap_controller_execution(self, call): 51 | try: 52 | call() 53 | except TimeoutError as e: 54 | _LOG.error('Failed to run action because of timeouts', exc_info=e) 55 | self._send_response({'code': 'timeout', 'message': 'Timeout occurred'}, 500) 56 | 57 | def do_DELETE(self): 58 | if not self.path.startswith(_API_CONTROLLER): 59 | return self._send_response({'message': 'Path {} is not supported'.format(self.path)}, 404) 60 | action = self.path[len(_API_CONTROLLER):].split('/') 61 | if action[0] == 'queue': 62 | if len(action) < 2: 63 | return self._send_response({'message': 'No second argument provided!'}, 400) 64 | self.wrap_controller_execution( 65 | lambda: self._send_response(execute_on_controller_thread( 66 | partial(delete_from_controller_queue, action[1]), _CONTROLLER_TIMEOUT), 200)) 67 | else: 68 | return self._send_response({'message': 'Action {} is not supported'.format(action[0])}, 404) 69 | 70 | def _run_controller_action(self, action): 71 | if action.split('/')[0] == 'queue': 72 | return self._send_response(execute_on_controller_thread(load_controller_queue, _CONTROLLER_TIMEOUT), 200) 73 | else: 74 | return self._send_response({'message': 'Action {} is not supported'.format(action)}, 404) 75 | 76 | def _send_response(self, json_, status_code=200): 77 | self.send_response(status_code) 78 | self.send_header('Content-type', 'application/json') 79 | self.end_headers() 80 | self.wfile.write(json.dumps(json_).encode('utf-8')) 81 | 82 | 83 | def start_server(port, cmd_helper: CmdHelper) -> threading.Thread: 84 | def _thread_func(): 85 | _Handler.cmd_helper = cmd_helper 86 | server = HTTPServer(('', port), _Handler) 87 | server.serve_forever() 88 | server.socket.close() 89 | 90 | t = threading.Thread(target=_thread_func, daemon=True) 91 | _LOG.info('Starting health server on 
port {}'.format(port)) 92 | t.start() 93 | return t 94 | -------------------------------------------------------------------------------- /bubuku/daemon.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """General Kafka Start Script.""" 3 | 4 | import logging 5 | 6 | from bubuku import controller_api 7 | from bubuku.broker import BrokerManager, StartupTimeout 8 | from bubuku.config import load_config, KafkaProperties, Config 9 | from bubuku.controller import Controller 10 | from bubuku.env_provider import EnvProvider 11 | from bubuku.features.data_size_stats import GenerateDataSizeStatistics 12 | from bubuku.features.rebalance.check import RebalanceOnStartCheck, RebalanceOnBrokerListCheck 13 | from bubuku.features.remote_exec import RemoteCommandExecutorCheck 14 | from bubuku.features.restart_if_dead import CheckBrokerStopped 15 | from bubuku.features.restart_on_zk_change import CheckExhibitorAddressChanged, RestartBrokerChange 16 | from bubuku.features.swap_partitions import CheckBrokersDiskImbalance 17 | from bubuku.features.terminate import register_terminate_on_interrupt 18 | from bubuku.process import KafkaProcess 19 | from bubuku.utils import CmdHelper 20 | from bubuku.zookeeper import BukuExhibitor, load_exhibitor_proxy 21 | 22 | _LOG = logging.getLogger('bubuku.main') 23 | 24 | 25 | def apply_features(api_port, features: dict, controller: Controller, buku_proxy: BukuExhibitor, broker: BrokerManager, 26 | kafka_properties: KafkaProperties, env_provider: EnvProvider) -> list: 27 | for feature, config in features.items(): 28 | if feature == 'restart_on_exhibitor': 29 | controller.add_check(CheckExhibitorAddressChanged(buku_proxy, broker)) 30 | elif feature == 'rebalance_on_start': 31 | controller.add_check(RebalanceOnStartCheck(buku_proxy, broker)) 32 | elif feature == 'rebalance_on_brokers_change': 33 | controller.add_check(RebalanceOnBrokerListCheck(buku_proxy, broker)) 34 | elif feature == 'balance_data_size': 35 | controller.add_check( 36 | CheckBrokersDiskImbalance(buku_proxy, broker, config["diff_threshold_mb"] * 1024, api_port)) 37 | elif feature == 'graceful_terminate': 38 | register_terminate_on_interrupt(controller, broker) 39 | elif feature == 'use_ip_address': 40 | unique_adv_listeners = __get_transformed_listeners(kafka_properties, env_provider.get_ip(), "advertised.listeners") 41 | kafka_properties.set_property('advertised.listeners', ",".join(unique_adv_listeners)) 42 | 43 | unique_listeners = __get_transformed_listeners(kafka_properties, "0.0.0.0", "listeners") 44 | kafka_properties.set_property('listeners', ",".join(unique_listeners)) 45 | 46 | else: 47 | _LOG.error('Using of unsupported feature "{}", skipping it'.format(feature)) 48 | 49 | def __get_transformed_listeners(kafka_properties: KafkaProperties, ip_addr, listener_property): 50 | old_listeners = kafka_properties.get_property(listener_property) 51 | if not old_listeners: 52 | old_listeners = 'PLAINTEXT://:9092' 53 | new_listeners = [] 54 | for adv_listener in old_listeners.split(","): 55 | protocol, _ignore, port = adv_listener.split(":") 56 | new_listeners.append("{protocol}://{host}:{port}".format( 57 | protocol=protocol, 58 | host=ip_addr, 59 | port=port 60 | )) 61 | unique_listeners = sorted(set(new_listeners)) 62 | return unique_listeners 63 | 64 | def run_daemon_loop(config: Config, process_holder: KafkaProcess, cmd_helper: CmdHelper, restart_on_init: bool): 65 | _LOG.info("Using configuration: {}".format(config)) 66 | kafka_props = 
KafkaProperties(config.kafka_settings_template, 67 | '{}/config/server.properties'.format(config.kafka_dir)) 68 | 69 | env_provider = EnvProvider.create_env_provider(config) 70 | address_provider = env_provider.get_address_provider() 71 | rack = env_provider.get_rack() 72 | if rack: 73 | kafka_props.set_property('broker.rack', rack) 74 | startup_timeout = StartupTimeout.build(config.timeout) 75 | 76 | _LOG.info("Loading exhibitor configuration") 77 | with load_exhibitor_proxy(address_provider, config.zk_prefix) as zookeeper: 78 | _LOG.info("Loading broker_id policy") 79 | broker_id_manager = env_provider.create_broker_id_manager(zookeeper, kafka_props) 80 | 81 | _LOG.info("Building broker manager") 82 | broker = BrokerManager(process_holder, zookeeper, broker_id_manager, kafka_props, 83 | startup_timeout) 84 | 85 | _LOG.info("Creating controller") 86 | controller = Controller(broker, zookeeper, env_provider) 87 | 88 | controller.add_check(CheckBrokerStopped(broker, zookeeper)) 89 | controller.add_check(RemoteCommandExecutorCheck(zookeeper, broker, controller, config.health_port)) 90 | controller.add_check(GenerateDataSizeStatistics(zookeeper, broker, cmd_helper, 91 | kafka_props.get_property("log.dirs").split(","))) 92 | apply_features(config.health_port, config.features, controller, zookeeper, broker, kafka_props, env_provider) 93 | 94 | _LOG.info('Starting main controller loop') 95 | controller.loop(RestartBrokerChange(zookeeper, broker, lambda: False) if restart_on_init else None) 96 | 97 | 98 | def main(): 99 | logging.basicConfig(level=getattr(logging, 'INFO', None), 100 | format='%(asctime)s.%(msecs)03d %(levelname)s:%(name)s:%(message)s', 101 | datefmt='%Y-%m-%d %H:%M:%S') 102 | 103 | config = load_config() 104 | _LOG.info("Using configuration: {}".format(config)) 105 | process = KafkaProcess(config.kafka_dir) 106 | _LOG.info('Starting health server') 107 | cmd_helper = CmdHelper() 108 | controller_api.start_server(config.health_port, cmd_helper) 109 | restart_on_init = False 110 | while True: 111 | try: 112 | run_daemon_loop(config, process, cmd_helper, restart_on_init) 113 | break 114 | except Exception as ex: 115 | _LOG.error("WOW! Almost died! Will try to restart from the begin. 
" 116 | "After initialization will be complete, will try to restart", exc_info=ex) 117 | if process.is_running(): 118 | restart_on_init = False 119 | 120 | 121 | if __name__ == '__main__': 122 | main() 123 | -------------------------------------------------------------------------------- /bubuku/env_provider.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import uuid 4 | from functools import partial 5 | from typing import List 6 | 7 | import boto3 8 | import requests 9 | 10 | from bubuku.config import Config, KafkaProperties 11 | from bubuku.id_extractor import BrokerIdExtractor 12 | from bubuku.zookeeper import BukuExhibitor, AddressListProvider 13 | from bubuku.zookeeper.exhibitor import ExhibitorAddressProvider 14 | 15 | _LOG = logging.getLogger('bubuku.amazon') 16 | 17 | 18 | class EnvProvider(object): 19 | def get_ip(self) -> str: 20 | raise NotImplementedError('Not implemented') 21 | 22 | def get_address_provider(self): 23 | raise NotImplementedError('Not implemented') 24 | 25 | def create_broker_id_manager(self, zk: BukuExhibitor, kafka_props: KafkaProperties): 26 | raise NotImplementedError('Not implemented') 27 | 28 | def get_rack(self): 29 | raise NotImplementedError('Not implemented') 30 | 31 | @staticmethod 32 | def create_env_provider(config: Config): 33 | if config.mode == 'amazon': 34 | return AmazonEnvProvider(config) 35 | elif config.mode == 'local': 36 | return LocalEnvProvider() 37 | else: 38 | raise NotImplementedError('Configuration mode "{}" is not supported'.format(config.mode)) 39 | 40 | 41 | class AmazonEnvProvider(EnvProvider): 42 | def __init__(self, config: Config): 43 | self.aws_addr = '169.254.169.254' 44 | self.config = config 45 | self.ip_address = None 46 | self._document = None 47 | 48 | def _get_document(self) -> dict: 49 | if not self._document: 50 | self._document = requests.get( 51 | 'http://{}/latest/dynamic/instance-identity/document'.format(self.aws_addr), 52 | timeout=5).json() 53 | _LOG.info("Amazon specific information loaded from AWS: {}".format(json.dumps(self._document, indent=2))) 54 | return self._document 55 | 56 | def get_ip(self) -> str: 57 | if not self.ip_address: 58 | self.ip_address = self._get_document()['privateIp'] 59 | return self.ip_address 60 | 61 | def get_rack(self): 62 | return self._get_document()['availabilityZone'] 63 | 64 | def _load_instance_ips(self, lb_name: str): 65 | region = self._get_document()['region'] 66 | 67 | private_ips = [] 68 | 69 | elb = boto3.client('elb', region_name=region) 70 | ec2 = boto3.client('ec2', region_name=region) 71 | 72 | response = elb.describe_instance_health(LoadBalancerName=lb_name) 73 | 74 | for instance in response['InstanceStates']: 75 | if instance['State'] == 'InService': 76 | private_ips.append(ec2.describe_instances( 77 | InstanceIds=[instance['InstanceId']])['Reservations'][0]['Instances'][0]['PrivateIpAddress']) 78 | 79 | _LOG.info("Ip addresses for {} are: {}".format(lb_name, private_ips)) 80 | return private_ips 81 | 82 | def get_address_provider(self): 83 | if self.config.zk_static_ips: 84 | return StaticAddressesProvider(self.config.zk_static_ips) 85 | else: 86 | return ExhibitorAddressProvider(partial(self._load_instance_ips, self.config.zk_stack_name)) 87 | 88 | def create_broker_id_manager(self, zk: BukuExhibitor, kafka_props: KafkaProperties): 89 | return BrokerIdExtractor(zk, kafka_props) 90 | 91 | 92 | class _LocalAddressProvider(AddressListProvider): 93 | def get_latest_address(self) -> 
(List[str], int): 94 | return ('zookeeper',), 2181 95 | 96 | 97 | class StaticAddressesProvider(AddressListProvider): 98 | def __init__(self, addr: str): 99 | ips, port = addr.split(':') 100 | self.ips = ips.split(',') 101 | self.port = int(port) 102 | 103 | def get_latest_address(self) -> (List[str], int): 104 | return self.ips, self.port 105 | 106 | 107 | class LocalEnvProvider(EnvProvider): 108 | 109 | def get_ip(self) -> str: 110 | return '127.0.0.1' 111 | 112 | def get_address_provider(self): 113 | return _LocalAddressProvider() 114 | 115 | def get_rack(self): 116 | return None 117 | 118 | def create_broker_id_manager(self, zk: BukuExhibitor, kafka_props: KafkaProperties): 119 | return BrokerIdExtractor(zk, kafka_props) 120 | -------------------------------------------------------------------------------- /bubuku/features/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zalando-nakadi/bubuku/7be53c4a8edbf6a248d70eb6ce0c38022f4391be/bubuku/features/__init__.py -------------------------------------------------------------------------------- /bubuku/features/data_size_stats.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from bubuku.broker import BrokerManager 4 | from bubuku.controller import Check 5 | from bubuku.utils import CmdHelper 6 | from bubuku.zookeeper import BukuExhibitor 7 | 8 | _LOG = logging.getLogger('bubuku.features.data_size_stats') 9 | 10 | 11 | class GenerateDataSizeStatistics(Check): 12 | def __init__(self, zk: BukuExhibitor, broker: BrokerManager, cmd_helper: CmdHelper, kafka_log_dirs: list): 13 | super().__init__(check_interval_s=600) 14 | self.zk = zk 15 | self.broker = broker 16 | self.cmd_helper = cmd_helper 17 | self.kafka_log_dirs = kafka_log_dirs 18 | 19 | def check(self): 20 | if self.broker.is_running_and_registered(): 21 | _LOG.info("Generating data size statistics") 22 | try: 23 | self.__generate_stats() 24 | _LOG.info("Data size statistics successfully written to zk") 25 | except Exception: 26 | _LOG.warn("Error occurred when collecting size statistics", exc_info=True) 27 | return None 28 | 29 | def __str__(self): 30 | return 'GenerateDataSizeStatistics' 31 | 32 | def __generate_stats(self): 33 | topics_stats = self.__get_topics_stats() 34 | used_kb, free_kb = self.cmd_helper.get_disk_stats() 35 | stats = {"disk": {'used_kb': used_kb, 'free_kb': free_kb}, "topics": topics_stats} 36 | self.zk.update_disk_stats(self.broker.id_manager.get_broker_id(), stats) 37 | 38 | def __get_topics_stats(self): 39 | topics_stats = {} 40 | for log_dir in self.kafka_log_dirs: 41 | _LOG.info("Processing log dir: {}".format(log_dir)) 42 | topic_dirs = self.cmd_helper.cmd_run("du -k -d 1 {}".format(log_dir)).split("\n") 43 | for topic_dir in topic_dirs: 44 | dir_stats = self.__parse_dir_stats(topic_dir, log_dir) 45 | if dir_stats: 46 | topic, partition, size_kb = dir_stats 47 | if topic not in topics_stats: 48 | topics_stats[topic] = {} 49 | topics_stats[topic][partition] = int(size_kb) 50 | return topics_stats 51 | 52 | @staticmethod 53 | def __parse_dir_stats(topic_dir, log_dir): 54 | """ 55 | Parses topic-partition size stats from "du" tool single line output 56 | :param topic_dir: the string to be parsed; example: "45983\t/tmp/kafka-logs/my-kafka-topic-0" 57 | :param log_dir: the kafka log directory name itself 58 | :return: tuple (topic, partition, size) or None if the topic_dir has incorrect format 59 | """ 60 | dir_data = 
topic_dir.split("\t") 61 | if len(dir_data) == 2 and dir_data[1] != log_dir: 62 | size_kb, dir_name = tuple(dir_data) 63 | tp_name = dir_name.split("/")[-1] 64 | tp_parts = tp_name.rsplit("-", 1) 65 | if len(tp_parts) == 2: 66 | topic, partition = tuple(tp_parts) 67 | return topic, partition, size_kb 68 | return None 69 | -------------------------------------------------------------------------------- /bubuku/features/metric_collector.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import asyncio 3 | import logging 4 | from bubuku.zookeeper import BukuExhibitor 5 | 6 | _LOG = logging.getLogger('MetricCollector') 7 | 8 | 9 | class MetricCollector: 10 | _OFFLINE_PARTITIONS_MBEAN = { 11 | 'name': 'OfflinePartitions', 12 | 'mbean': 'kafka.controller:type=KafkaController,name=OfflinePartitionsCount', 13 | 'field': 'Value'} 14 | _UNDER_REPLICATED_PARTITIONS_MBEAN = { 15 | 'name': 'UnderReplicatedPartitions', 16 | 'mbean': 'kafka.server:type=ReplicaManager,name=UnderReplicatedPartitions', 17 | 'field': 'Value'} 18 | _PREFERRED_REPLICA_IMBALANCE_MBEAN = { 19 | 'name': 'PreferredReplicaImbalance', 20 | 'mbean': 'kafka.controller:name=PreferredReplicaImbalanceCount,type=KafkaController', 21 | 'field': 'Value'} 22 | _BYTES_IN_MBEAN = { 23 | 'name': 'BytesIn', 24 | 'mbean': 'kafka.server:name=BytesInPerSec,type=BrokerTopicMetrics', 25 | 'field': 'OneMinuteRate' 26 | } 27 | _JOLOKIA_PORT = 8778 28 | 29 | def __init__(self, zk: BukuExhibitor): 30 | self.zk = zk 31 | 32 | async def _get_metrics_from_broker(self, broker_id: int): 33 | broker_address = self.zk.get_broker_address(broker_id) 34 | data = {'broker_address': broker_address, 'broker_id': broker_id, 'metrics': {}} 35 | for metric in self.get_metric_mbeans(): 36 | metric_fetched = False 37 | try: 38 | response = requests.get("http://{}:{}/jolokia/read/{}".format( 39 | broker_address, self._JOLOKIA_PORT, metric['mbean'])) 40 | if response.status_code == 200: 41 | response_body = response.json() 42 | if response_body.get('status') == 200: 43 | if response_body.get('value', {}).get(metric['field']) is not None: 44 | data['metrics'][metric['name']] = response_body['value'][metric['field']] 45 | metric_fetched = True 46 | if not metric_fetched: 47 | _LOG.error("Fetching metric {} for broker: {} failed. 
Response from broker: {}:{}".format( 48 | metric['name'], broker_id, response.status_code, response.text)) 49 | except Exception as e: 50 | _LOG.error("Fetching metric {} for broker {} failed".format(metric['name'], broker_id), exc_info=e) 51 | return data 52 | 53 | async def _get_metrics_from_brokers(self, broker_ids): 54 | metrics = [] 55 | for broker_id in broker_ids: 56 | metrics.append(asyncio.ensure_future(self._get_metrics_from_broker(broker_id))) 57 | metrics = await asyncio.gather(*metrics) 58 | return metrics 59 | 60 | def get_metrics_from_brokers(self, broker_ids=None): 61 | """ 62 | Get metrics for brokers in the cluster 63 | :param broker_ids: List of broker_ids to fetch metrics for 64 | :return: List of dictionaries containing metrics for each broker 65 | { 66 | "metrics": {...}, 67 | "broker_id": int, 68 | "broker_address": str 69 | } 70 | """ 71 | broker_ids = self.zk.get_broker_ids() if not broker_ids else broker_ids 72 | loop = asyncio.new_event_loop() 73 | asyncio.set_event_loop(loop) 74 | try: 75 | return loop.run_until_complete(self._get_metrics_from_brokers(broker_ids)) 76 | except Exception as e: 77 | _LOG.error('Could not fetch metrics from brokers', exc_info=e) 78 | finally: 79 | loop.close() 80 | 81 | @classmethod 82 | def get_metric_mbeans(cls): 83 | return [ 84 | cls._OFFLINE_PARTITIONS_MBEAN, 85 | cls._UNDER_REPLICATED_PARTITIONS_MBEAN, 86 | cls._PREFERRED_REPLICA_IMBALANCE_MBEAN, 87 | cls._BYTES_IN_MBEAN, 88 | ] 89 | -------------------------------------------------------------------------------- /bubuku/features/migrate.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from bubuku.features.rebalance import BaseRebalanceChange 4 | from bubuku.zookeeper import BukuExhibitor, RebalanceThrottleManager 5 | 6 | _LOG = logging.getLogger('bubuku.features.migrate') 7 | 8 | 9 | class MigrationChange(BaseRebalanceChange): 10 | def __init__(self, zk: BukuExhibitor, from_: list, to: list, shrink: bool, parallelism: int = 1, 11 | throttle: int = 100000000): 12 | self.zk = zk 13 | self.migration = {int(from_[i]): int(to[i]) for i in range(0, len(from_))} 14 | self.shrink = shrink 15 | self.data_to_migrate = None 16 | self.parallelism = parallelism 17 | self.throttle_manager = RebalanceThrottleManager(self.zk, throttle) 18 | 19 | def run(self, current_actions) -> bool: 20 | if self.should_be_paused(current_actions): 21 | return True 22 | if self.zk.is_rebalancing(): 23 | return True 24 | active_ids = [int(k) for k in self.zk.get_broker_ids()] 25 | if any(b not in active_ids for b in self.migration.keys()): 26 | _LOG.error('Source brokers {} are not in active list {}. Stopping.'.format( 27 | self.migration.keys(), active_ids)) 28 | return False 29 | if any(b not in active_ids for b in self.migration.values()): 30 | _LOG.error('Target brokers {} are not in active list {}. 
Stopping.'.format( 31 | self.migration.values(), active_ids)) 32 | return False 33 | if self.data_to_migrate is None: 34 | _LOG.info('Loading partition assignment') 35 | self.data_to_migrate = [data for data in self.zk.load_partition_assignment()] 36 | _LOG.info('Load {} partitions'.format(len(self.data_to_migrate))) 37 | return True 38 | 39 | items_to_migrate = [] 40 | self.throttle_manager.remove_old_throttle_configurations() 41 | while self.data_to_migrate and len(items_to_migrate) < self.parallelism: 42 | topic, partition, replicas = self.data_to_migrate.pop() 43 | replaced_replicas = self._replace_replicas(replicas) 44 | if replaced_replicas == replicas: 45 | continue 46 | items_to_migrate.append((topic, partition, replicas, replaced_replicas)) 47 | if not items_to_migrate: 48 | return False 49 | self.throttle_manager.apply_throttle([(t, p, rr) for t, p, _, rr in items_to_migrate]) 50 | if not self.zk.reallocate_partitions([(t, p, rr) for t, p, _, rr in items_to_migrate]): 51 | for topic, partition, replicas, _ in items_to_migrate: 52 | self.data_to_migrate.append((topic, partition, replicas)) 53 | return True 54 | 55 | def __str__(self): 56 | return 'Migration links {}, shrink: {}, data_to_move: {}, parallelism: {}'.format( 57 | self.migration, 58 | self.shrink, 59 | len(self.data_to_migrate) if self.data_to_migrate is not None else 'Unknown', 60 | self.parallelism, 61 | ) 62 | 63 | def _replace_replicas(self, replicas): 64 | replacement = [self.migration[k] for k in replicas if k in self.migration] 65 | if self.shrink: 66 | result = [] 67 | for v in replicas: 68 | to_use = self.migration.get(v, v) 69 | if to_use not in result: 70 | result.append(to_use) 71 | return result 72 | else: 73 | return replicas + [k for k in replacement if k not in replicas] 74 | 75 | def on_remove(self): 76 | RebalanceThrottleManager.remove_all_throttle_configurations(self.zk) 77 | -------------------------------------------------------------------------------- /bubuku/features/rebalance/__init__.py: -------------------------------------------------------------------------------- 1 | from bubuku.controller import Change 2 | 3 | 4 | class BaseRebalanceChange(Change): 5 | def get_name(self) -> str: 6 | return 'rebalance' 7 | 8 | def can_run(self, current_actions): 9 | return all([a not in current_actions for a in ['start', 'restart', 'rebalance', 'stop', 'complete_stop', 'rolling_restart']]) 10 | 11 | @staticmethod 12 | def should_be_paused(current_actions): 13 | return any([a in current_actions for a in ['restart', 'start', 'stop', 'complete_stop', 'rolling_restart']]) 14 | -------------------------------------------------------------------------------- /bubuku/features/rebalance/broker.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple, List, Iterator, Dict, Optional 2 | 3 | 4 | class _TopicPartitions(object): 5 | __slots__ = ( 6 | '_items', 7 | '_expectation', 8 | '_has_free_slots', 9 | '_topic_partitions' 10 | ) 11 | 12 | def __init__(self): 13 | self._items = set() 14 | self._expectation = 0 15 | self._has_free_slots = None 16 | self._topic_partitions = {} 17 | 18 | def __str__(self): 19 | return 'TP, expectation: {}, items: {}'.format(self._expectation, self._items) 20 | 21 | def get_item_count(self) -> int: 22 | return len(self._items) 23 | 24 | def contains(self, item) -> bool: 25 | return item in self._items 26 | 27 | def add(self, item: Tuple[str, int]): 28 | self._items.add(item) 29 | topic, partition = item 30 | if topic not in 
self._topic_partitions: 31 | self._topic_partitions[topic] = [] 32 | self._topic_partitions[topic].append(partition) 33 | self._has_free_slots = None 34 | 35 | def remove(self, item: Tuple[str, int]): 36 | self._items.remove(item) 37 | topic, partition = item 38 | self._topic_partitions[topic].remove(partition) 39 | self._has_free_slots = None 40 | 41 | def iterate_items(self) -> Iterator[Tuple[str, int]]: 42 | return self._items.__iter__() 43 | 44 | def get_topic_partitions(self, topic: str) -> List[int]: 45 | return self._topic_partitions.get(topic, []) 46 | 47 | def get_expectation(self) -> int: 48 | return self._expectation 49 | 50 | def set_expectation(self, expectation: int): 51 | self._has_free_slots = None 52 | self._expectation = expectation 53 | 54 | def calculate_cardinality(self) -> Dict[str, int]: 55 | return {k: len(v) for k, v in self._topic_partitions.items()} 56 | 57 | def has_free_slots(self) -> bool: 58 | if self._has_free_slots is None: 59 | self._has_free_slots = self.get_item_count() < self.get_expectation() 60 | return self._has_free_slots 61 | 62 | 63 | class BrokerDescription(object): 64 | __slots__ = ( 65 | '_broker_id', 66 | '_rack_id', 67 | '_leaders', 68 | '_replicas', 69 | ) 70 | 71 | def __init__(self, broker_id: int, rack_id: str = None): 72 | self._broker_id = broker_id 73 | self._rack_id = rack_id 74 | self._leaders = _TopicPartitions() 75 | self._replicas = _TopicPartitions() 76 | 77 | @property 78 | def broker_id(self) -> int: 79 | return self._broker_id 80 | 81 | @property 82 | def rack_id(self) -> str: 83 | return self._rack_id 84 | 85 | def __str__(self) -> str: 86 | return 'BrokerDescription(id={}, rack={}, leaders={}, replicas={})'.format( 87 | self._broker_id, self._rack_id, self._leaders, self._replicas) 88 | 89 | def set_leader_expectation(self, leader_count: int): 90 | self._leaders.set_expectation(leader_count) 91 | 92 | def set_replica_expectation(self, replica_count: int): 93 | self._replicas.set_expectation(replica_count) 94 | 95 | def add_leader(self, topic_partition: Tuple[str, int]): 96 | self._leaders.add(topic_partition) 97 | 98 | def add_replica(self, topic_partition: Tuple[str, int]): 99 | self._replicas.add(topic_partition) 100 | 101 | def get_leader_count(self) -> int: 102 | return self._leaders.get_item_count() 103 | 104 | def get_replica_count(self) -> int: 105 | return self._replicas.get_item_count() 106 | 107 | def get_replica_overload(self) -> int: 108 | return self._replicas.get_item_count() - self._replicas.get_expectation() 109 | 110 | def has_free_replica_slots(self) -> int: 111 | return self._replicas.has_free_slots() 112 | 113 | def have_extra_leaders(self) -> bool: 114 | return self._leaders.get_expectation() < self._leaders.get_item_count() 115 | 116 | def have_less_leaders(self) -> bool: 117 | return self._leaders.get_expectation() > self._leaders.get_item_count() 118 | 119 | def get_expected_leaders(self) -> int: 120 | return self._leaders.get_expectation() 121 | 122 | def accept_leader(self, source_broker: 'BrokerDescription', topic_partition: Tuple[str, int]): 123 | """ 124 | Moves topic_partition from source_broker to self broker. 125 | :param source_broker: Broker to take topic_partition from. 126 | :param topic_partition: topic and partition tuple to take. 
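        A minimal illustrative sketch (added for clarity, not part of the original code; the broker ids and
        topic name are hypothetical) of how a rebalance step would use this method:

            donor = BrokerDescription(broker_id=1)
            target = BrokerDescription(broker_id=2)
            donor.add_leader(('topic-a', 0))
            target.accept_leader(donor, ('topic-a', 0))
            # ('topic-a', 0) is now counted among target's leaders and no longer among donor's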
127 | """ 128 | self._leaders.add(topic_partition) 129 | source_broker._leaders.remove(topic_partition) 130 | 131 | def _accept_replica(self, source_broker: 'BrokerDescription', topic_partition: Tuple[str, int]) -> bool: 132 | # Already a leader for this partition 133 | if self._leaders.contains(topic_partition): 134 | return False 135 | # Already a replica for this partition 136 | if self._replicas.contains(topic_partition): 137 | return False 138 | if self._rack_id != source_broker._rack_id: 139 | return False 140 | self._replicas.add(topic_partition) 141 | source_broker._replicas.remove(topic_partition) 142 | return True 143 | 144 | def move_replica(self, topic_partition: Tuple[str, int], broker_list: List['BrokerDescription']) \ 145 | -> Optional['BrokerDescription']: 146 | """ 147 | Moves replica topic_partition to some broker from broker_list. 148 | :param topic_partition: Topic and partition to move 149 | :param broker_list: List of brokers (BrokerDescription) to move to 150 | :return: Broker, to which partition was moved 151 | """ 152 | for target in broker_list: 153 | if target._accept_replica(self, topic_partition): 154 | return target 155 | return None 156 | 157 | def list_replica_copies(self) -> List[Tuple[str, int]]: 158 | return list([tp for tp in self._replicas.iterate_items() if self._leaders.contains(tp)]) 159 | 160 | def list_partitions(self, topic: str, replica: bool) -> List[int]: 161 | return (self._replicas if replica else self._leaders).get_topic_partitions(topic) 162 | 163 | def list_replicas(self) -> Iterator[Tuple[str, int]]: 164 | return self._replicas.iterate_items() 165 | 166 | def calculate_topic_cardinality(self) -> Dict[str, int]: 167 | """ 168 | Calculates 'topic to leader count' dictionary on this broker. 169 | For example, topic t0 has partitions 0, 1, 2, 3. If leaders for partitions 0, 3 are located on this broker 170 | then the return value will contain the mapping t0->2 (there are 2 leaders for topic t0 on this broker) 171 | :return: Dictionary with leaders count per topic for this broker. 
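        Hypothetical illustration (values invented for clarity): if this broker leads partitions 0 and 3 of
        topic 't0' and partition 1 of topic 't1', then broker.calculate_topic_cardinality() would return
        {'t0': 2, 't1': 1}.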
172 | """ 173 | return self._leaders.calculate_cardinality() 174 | -------------------------------------------------------------------------------- /bubuku/features/rebalance/check.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from bubuku.broker import BrokerManager 3 | from bubuku.controller import Check 4 | from bubuku.features.rebalance.change import OptimizedRebalanceChange 5 | from bubuku.zookeeper import BukuExhibitor 6 | 7 | _LOG = logging.getLogger('bubuku.features.rebalance') 8 | 9 | 10 | class RebalanceOnStartCheck(Check): 11 | def __init__(self, zk: BukuExhibitor, broker: BrokerManager): 12 | super().__init__() 13 | self.zk = zk 14 | self.broker = broker 15 | self.executed = False 16 | 17 | def check(self): 18 | if self.executed: 19 | return None 20 | if not self.broker.is_running_and_registered(): 21 | return None 22 | _LOG.info("Rebalance on start, triggering rebalance") 23 | self.executed = True 24 | return OptimizedRebalanceChange(self.zk, self.zk.get_broker_ids(), [], []) 25 | 26 | def __str__(self): 27 | return 'RebalanceOnStartCheck (executed={})'.format(self.executed) 28 | 29 | 30 | class RebalanceOnBrokerListCheck(Check): 31 | def __init__(self, zk: BukuExhibitor, broker: BrokerManager): 32 | super().__init__() 33 | self.zk = zk 34 | self.broker = broker 35 | self.old_broker_list = [] 36 | 37 | def check(self): 38 | if not self.broker.is_running_and_registered(): 39 | return None 40 | new_list = self.zk.get_broker_ids() 41 | if not new_list == self.old_broker_list: 42 | _LOG.info('Broker list changed from {} to {}, triggering rebalance'.format(self.old_broker_list, new_list)) 43 | self.old_broker_list = new_list 44 | return OptimizedRebalanceChange(self.zk, new_list, [], []) 45 | return None 46 | 47 | def __str__(self): 48 | return 'RebalanceOnBrokerListChange, cached list: {}'.format(self.old_broker_list) 49 | -------------------------------------------------------------------------------- /bubuku/features/remote_exec.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from bubuku.aws.cluster_config import ClusterConfig, AwsInstanceUserDataLoader 4 | from bubuku.broker import BrokerManager 5 | from bubuku.controller import Check, Change, Controller 6 | from bubuku.features.migrate import MigrationChange 7 | from bubuku.features.rebalance.change import OptimizedRebalanceChange 8 | from bubuku.features.rebalance.change_simple import SimpleRebalanceChange 9 | from bubuku.features.restart_on_zk_change import RestartBrokerChange 10 | from bubuku.features.rolling_restart import RollingRestartChange 11 | from bubuku.features.swap_partitions import SwapPartitionsChange, load_swap_data 12 | from bubuku.features.terminate import CompleteStopChange 13 | from bubuku.zookeeper import BukuExhibitor 14 | 15 | _LOG = logging.getLogger('bubuku.features.remote_exec') 16 | 17 | 18 | class RemoteCommandExecutorCheck(Check): 19 | def __init__(self, zk: BukuExhibitor, broker_manager: BrokerManager, controller: Controller, api_port: int): 20 | super().__init__(check_interval_s=30) 21 | self.zk = zk 22 | self.broker_manager = broker_manager 23 | self.controller = controller 24 | self.api_port = api_port 25 | 26 | def check(self) -> Change: 27 | with self.zk.lock(): 28 | data = self.zk.take_action(self.broker_manager.id_manager.get_broker_id()) 29 | if not data: 30 | return None 31 | if 'name' not in data: 32 | _LOG.error('Action name can not be restored from {}, 
skipping'.format(data)) 33 | return None 34 | try: 35 | if data['name'] == 'restart': 36 | return RestartBrokerChange(self.zk, self.broker_manager, lambda: False) 37 | elif data['name'] == 'rebalance': 38 | if data.get('bin_packing', False): 39 | return OptimizedRebalanceChange(self.zk, 40 | self.zk.get_broker_ids(), 41 | data['empty_brokers'], 42 | data['exclude_topics'], 43 | data['throttle'], 44 | int(data.get('parallelism', 1))) 45 | else: 46 | return SimpleRebalanceChange(self.zk, 47 | self.zk.get_broker_ids(), 48 | data['empty_brokers'], 49 | data['exclude_topics'], 50 | int(data.get('parallelism', 1)), 51 | data['throttle']) 52 | 53 | elif data['name'] == 'migrate': 54 | return MigrationChange(self.zk, data['from'], data['to'], data['shrink'], 55 | int(data.get('parallelism', '1')), data['throttle']) 56 | elif data['name'] == 'fatboyslim': 57 | return SwapPartitionsChange(self.zk, 58 | lambda x: load_swap_data(x, self.api_port, int(data['threshold_kb']))) 59 | elif data['name'] == 'rolling_restart': 60 | cluster_config = ClusterConfig(AwsInstanceUserDataLoader()) 61 | cluster_config.set_overrides(**data['overrides']) 62 | return RollingRestartChange(self.zk, 63 | cluster_config, 64 | data['restart_assignment'], 65 | self.broker_manager.id_manager.broker_id, 66 | data['cool_down']) 67 | elif data['name'] == 'stop': 68 | return CompleteStopChange(self.broker_manager, self.controller) 69 | else: 70 | _LOG.error('Action {} not supported'.format(data)) 71 | except Exception as e: 72 | _LOG.error('Failed to create action from {}'.format(data), exc_info=e) 73 | return None 74 | 75 | def __str__(self): 76 | return 'RemoteCommandExecutorCheck' 77 | 78 | @staticmethod 79 | def register_restart(zk: BukuExhibitor, broker_id: str): 80 | with zk.lock(): 81 | zk.register_action({'name': 'restart'}, broker_id=broker_id) 82 | 83 | @staticmethod 84 | def register_rebalance(zk: BukuExhibitor, broker_id: str, empty_brokers: list, exclude_topics: list, 85 | parallelism: int, bin_packing: bool, throttle: int): 86 | if parallelism <= 0: 87 | raise Exception('Parallelism for rebalance should be greater than 0') 88 | action = { 89 | 'name': 'rebalance', 90 | 'empty_brokers': empty_brokers, 91 | 'exclude_topics': exclude_topics, 92 | 'parallelism': int(parallelism), 93 | 'bin_packing': bool(bin_packing), 94 | 'throttle': int(throttle) 95 | } 96 | with zk.lock(): 97 | if broker_id: 98 | zk.register_action(action, broker_id=broker_id) 99 | else: 100 | zk.register_action(action) 101 | 102 | @staticmethod 103 | def register_migration(zk: BukuExhibitor, brokers_from: list, brokers_to: list, shrink: bool, broker_id: str, 104 | throttle: int, parallelism: int): 105 | if len(brokers_from) != len(brokers_to): 106 | raise Exception('Brokers list {} and {} must have the same size'.format(brokers_from, brokers_to)) 107 | if any(b in brokers_from for b in brokers_to) or any(b in brokers_to for b in brokers_from): 108 | raise Exception('Broker lists can not hold same broker ids') 109 | 110 | if len(set(brokers_from)) != len(brokers_from): 111 | raise Exception('Can not use same broker ids for source_list {}'.format(brokers_from)) 112 | if len(set(brokers_to)) != len(brokers_to): 113 | raise Exception('Can not use same broker ids for target_list {}'.format(brokers_to)) 114 | 115 | active_ids = zk.get_broker_ids() 116 | if any(b not in active_ids for b in brokers_from) or any(b not in active_ids for b in brokers_to): 117 | raise Exception('Brokers dead from: {} to: {} alive:{}'.format(brokers_from, brokers_to, 
active_ids)) 118 | 119 | if broker_id and str(broker_id) not in active_ids: 120 | raise Exception('Broker id to run change on ({}) is not in active list {}'.format( 121 | broker_id, active_ids)) 122 | if parallelism <= 0: 123 | raise Exception('Parallelism for migration should be greater than 0') 124 | 125 | with zk.lock(): 126 | action = {'name': 'migrate', 'from': brokers_from, 'to': brokers_to, 'shrink': bool(shrink), 127 | 'parallelism': int(parallelism), 'throttle': int(throttle)} 128 | if broker_id: 129 | zk.register_action(action, str(broker_id)) 130 | else: 131 | zk.register_action(action) 132 | 133 | @staticmethod 134 | def register_fatboy_slim(zk: BukuExhibitor, threshold_kb: int): 135 | if zk.is_rebalancing(): 136 | _LOG.warning('Rebalance is already in progress, may be it will take time for this command to start ' 137 | 'processing') 138 | with zk.lock(): 139 | zk.register_action({'name': 'fatboyslim', 'threshold_kb': threshold_kb}) 140 | 141 | @staticmethod 142 | def register_rolling_restart(zk: BukuExhibitor, broker_id: str, image: str, instance_type: str, scalyr_key: str, 143 | scalyr_region: str, kms_key_id: str, ami_id: str, cool_down: int): 144 | if zk.is_rolling_restart_in_progress(): 145 | _LOG.warning('Rolling restart in progress, skipping') 146 | return 147 | 148 | restart_assignment = {} 149 | brokers = zk.get_broker_ids() 150 | for idx in range(len(brokers)): 151 | broker_to_make_restart = brokers[idx] 152 | if idx == len(brokers) - 1: 153 | broker_to_restart = brokers[0] 154 | else: 155 | broker_to_restart = brokers[idx + 1] 156 | restart_assignment[broker_to_make_restart] = broker_to_restart 157 | 158 | _LOG.info('Rolling restart assignment:\n {}'.format(restart_assignment)) 159 | action = { 160 | 'name': 'rolling_restart', 161 | 'restart_assignment': restart_assignment, 162 | 'overrides': ClusterConfig.create_overrides_dict( 163 | application_version=image, 164 | scalyr_account_key=scalyr_key, 165 | scalyr_region=scalyr_region, 166 | instance_type=instance_type, 167 | kms_key_id=kms_key_id, 168 | ami_id=ami_id, 169 | ), 170 | 'cool_down': cool_down 171 | } 172 | zk.register_action(action, broker_id=broker_id) 173 | 174 | @staticmethod 175 | def register_stop(zk: BukuExhibitor, broker_id: str): 176 | zk.register_action({'name': 'stop'}, broker_id=broker_id) 177 | -------------------------------------------------------------------------------- /bubuku/features/restart_if_dead.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from datetime import datetime, timedelta 4 | from bubuku.broker import BrokerManager 5 | from bubuku.controller import Change, Check 6 | from bubuku.features.restart_on_zk_change import RestartBrokerChange 7 | from bubuku.zookeeper import BukuExhibitor 8 | 9 | _LOG = logging.getLogger('bubuku.features.restart_if_dead') 10 | 11 | class CheckBrokerStopped(Check): 12 | def __init__(self, broker: BrokerManager, zk: BukuExhibitor): 13 | super().__init__() 14 | self.broker = broker 15 | self.zk = zk 16 | self.need_check = True 17 | self.last_zk_session_failed_check = None 18 | 19 | def check(self) -> Change: 20 | if not self.need_check: 21 | return None 22 | if not self.should_restart(): 23 | return None 24 | 25 | _LOG.warning('Oops! 
Broker is dead, triggering restart') 26 | self.need_check = False 27 | 28 | # Do not start if broker is running and registered 29 | def _cancel_if(): 30 | return self.broker.is_running() and self.broker.is_registered_in_zookeeper() 31 | 32 | return RestartBrokerChange(self.zk, self.broker, _cancel_if, self.on_change_executed) 33 | 34 | # Attempt to verify that broker is not registered in zookeeper for twice as long as the zookeeper session timeout. 35 | # Allow the zookeeper client to try to restore the session instead of killing the kafka process as soon as the zookeeper session is dead. 36 | def should_restart(self): 37 | current_time = datetime.now() 38 | if not self.broker.is_running(): 39 | return True 40 | if not self.broker.is_registered_in_zookeeper(): 41 | _LOG.warning('Broker is not registered in Zookeeper') 42 | if not self.last_zk_session_failed_check: 43 | self.last_zk_session_failed_check = current_time 44 | time_to_restart_at = self.last_zk_session_failed_check + timedelta(milliseconds=self.broker.get_zookeeper_session_timeout() * 2) 45 | if current_time > time_to_restart_at: 46 | return True 47 | else: 48 | self.last_zk_session_failed_check = None 49 | return False 50 | 51 | def on_change_executed(self): 52 | self.need_check = True 53 | self.last_zk_session_failed_check = None 54 | 55 | def __str__(self): 56 | return 'CheckBrokerStopped' 57 | -------------------------------------------------------------------------------- /bubuku/features/restart_on_zk_change.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from bubuku.broker import BrokerManager 4 | from bubuku.controller import Change, Check 5 | from bubuku.zookeeper import BukuExhibitor 6 | 7 | _LOG = logging.getLogger('bubuku.features.restart_on_zk') 8 | 9 | _STAGE_STOP = 'stop' 10 | _STAGE_START = 'start' 11 | 12 | 13 | class RestartBrokerChange(Change): 14 | def __init__(self, zk: BukuExhibitor, broker: BrokerManager, break_condition, processed_callback=None): 15 | self.zk = zk 16 | self.broker = broker 17 | self.break_condition = break_condition 18 | self.stage = _STAGE_STOP 19 | self.processed_callback = processed_callback 20 | 21 | def get_name(self): 22 | return 'restart' 23 | 24 | def can_run(self, current_actions): 25 | return all([a not in current_actions for a in ['start', 'restart', 'stop', 'complete_stop']]) 26 | 27 | def run(self, current_actions): 28 | if self.stage == _STAGE_STOP: 29 | if self.break_condition and self.break_condition(): 30 | return False 31 | self.broker.stop_kafka_process() 32 | self.stage = _STAGE_START 33 | return True 34 | elif self.stage == _STAGE_START: 35 | # Yep, use latest data 36 | zk_conn_str = self.zk.get_conn_str() 37 | try: 38 | self.broker.start_kafka_process(zk_conn_str) 39 | except Exception as e: 40 | _LOG.error('Failed to start kafka process against {}'.format(zk_conn_str), exc_info=e) 41 | return True 42 | return False 43 | else: 44 | _LOG.error('Stage {} is not supported'.format(self.stage)) 45 | return False 46 | 47 | def on_remove(self): 48 | if self.processed_callback: 49 | self.processed_callback() 50 | 51 | def __str__(self): 52 | return 'RestartBrokerChange ({}), stage={}'.format(self.get_name(), self.stage) 53 | 54 | 55 | class CheckExhibitorAddressChanged(Check): 56 | def __init__(self, zk: BukuExhibitor, broker: BrokerManager): 57 | super().__init__() 58 | self.zk = zk 59 | self.broker = broker 60 | self.conn_str = None 61 | 62 | def check(self) -> Change: 63 | new_conn_str = self.zk.get_conn_str() 64 | if 
new_conn_str != self.conn_str: 65 | def _cancel_if(): 66 | current_conn_str = self.zk.get_conn_str() 67 | if current_conn_str != new_conn_str: 68 | _LOG.warning('ZK address changed again, from {} to {}'.format(new_conn_str, current_conn_str)) 69 | return True 70 | if current_conn_str == self.broker.get_zk_connect_string(): 71 | _LOG.warning('Broker already have latest version of zk address: {}'.format(current_conn_str)) 72 | return True 73 | return False 74 | 75 | _LOG.info('ZK addresses changed from {} to {}, triggering restart'.format(self.conn_str, new_conn_str)) 76 | self.conn_str = new_conn_str 77 | return RestartBrokerChange(self.zk, self.broker, _cancel_if) 78 | 79 | def __str__(self): 80 | return 'CheckExhibitorAddressChanged, current={}'.format(self.conn_str) 81 | -------------------------------------------------------------------------------- /bubuku/features/rolling_restart.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from time import time 3 | 4 | from bubuku import utils 5 | from bubuku.aws import AWSResources 6 | from bubuku.aws.cluster_config import ClusterConfig 7 | from bubuku.aws.ec2_node_launcher import Ec2NodeLauncher 8 | from bubuku.aws.node import Ec2Node 9 | from bubuku.controller import Change 10 | from bubuku.zookeeper import BukuExhibitor 11 | 12 | _LOG = logging.getLogger('bubuku.features.rolling_restart') 13 | 14 | 15 | class RollingRestartChange(Change): 16 | def __init__(self, 17 | zk: BukuExhibitor, 18 | cluster_config: ClusterConfig, 19 | restart_assignment, 20 | broker_id: str, 21 | cool_down: int): 22 | self.zk = zk 23 | self.restart_assignment = restart_assignment 24 | self.broker_id = broker_id 25 | self.broker_id_to_restart = self.restart_assignment.pop(broker_id) 26 | self.broker_ip_to_restart = self.zk.get_broker_address(self.broker_id_to_restart) 27 | 28 | self.cluster_config = cluster_config 29 | 30 | self.aws = AWSResources(region=self.cluster_config.get_aws_region()) 31 | self.ec_node = Ec2Node(self.aws, self.cluster_config, self.broker_ip_to_restart) 32 | self.ec2_node_launcher = Ec2NodeLauncher( 33 | self.aws, self.cluster_config, self.ec_node.get_node_availability_zone()) 34 | 35 | self.state_context = StateContext(self.zk, self.aws, self.ec_node, self.ec2_node_launcher, 36 | self.broker_id_to_restart, self.restart_assignment, 37 | self.cluster_config, cool_down) 38 | 39 | def get_name(self) -> str: 40 | return 'rolling_restart' 41 | 42 | def can_run(self, current_actions): 43 | return all([a not in current_actions for a in ['stop', 'complete_stop', 'restart', 'rebalance']]) 44 | 45 | def run(self, current_actions) -> bool: 46 | return self.state_context.run() 47 | 48 | def time_till_next_run(self): 49 | return 10 50 | 51 | 52 | class StateContext: 53 | def __init__(self, zk: BukuExhibitor, aws: AWSResources, ec_node: Ec2Node, ec2_node_launcher: Ec2NodeLauncher, 54 | broker_id_to_restart, restart_assignment, cluster_config: ClusterConfig, cool_down: int): 55 | self.zk = zk 56 | self.restart_assignment = restart_assignment 57 | self.cluster_config = cluster_config 58 | self.aws = aws 59 | self.ec_node = ec_node 60 | self.ec2_node_launcher = ec2_node_launcher 61 | self.broker_id_to_restart = broker_id_to_restart 62 | self.current_state = StopKafka(self) 63 | self.new_instance_id = None 64 | self.cool_down = cool_down 65 | 66 | def run(self): 67 | """ 68 | Runs states one after another. If state is finished, it takes the next one. 
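        For orientation (sequence reconstructed from the next() methods of the states defined below, not an
        authoritative contract): a single broker restart walks through roughly
        StopKafka -> WaitBrokerStopped -> DetachVolume -> TerminateInstance -> WaitInstanceTerminated ->
        WaitVolumeAvailable -> LaunchInstance -> WaitVolumeAttached -> WaitKafkaRunning -> RegisterRollingRestart,
        and run() reports completion once the final state returns no successor.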
69 | """ 70 | try: 71 | _LOG.info('Running state {}'.format(self.current_state)) 72 | if self.current_state.run(): 73 | next_state = self.current_state.next() 74 | _LOG.info('Next state {}'.format(next_state)) 75 | if next_state is None: 76 | return False 77 | self.current_state = next_state 78 | return True 79 | except Exception as e: 80 | _LOG.error('Failed to run state', exc_info=e) 81 | return True 82 | 83 | 84 | class State: 85 | """ 86 | State which can be run as many times as required before it finishes it work. The progress of the state has to be 87 | recoverable 88 | """ 89 | 90 | def __init__(self, state_context): 91 | self.state_context = state_context 92 | self.time_to_check_s = time() 93 | 94 | def run(self) -> bool: 95 | """ 96 | Runs the state, and if state finishes successfully it returns True, otherwise it returns False, which means 97 | that state has to be executed again 98 | """ 99 | pass 100 | 101 | def next(self): 102 | """ 103 | Return the next state, which has to be executed after the current state 104 | """ 105 | pass 106 | 107 | def run_with_timeout(self, func): 108 | """ 109 | Runs func() with timeout 110 | :param func function to execute 111 | """ 112 | if time() >= self.time_to_check_s: 113 | self.time_to_check_s = time() + 10 114 | return func() 115 | return False 116 | 117 | 118 | class StopKafka(State): 119 | def run(self): 120 | if utils.is_cluster_healthy(): 121 | from bubuku.features.remote_exec import RemoteCommandExecutorCheck 122 | RemoteCommandExecutorCheck.register_stop(self.state_context.zk, self.state_context.broker_id_to_restart) 123 | return True 124 | _LOG.warning('Cluster is not healthy, waiting for it to recover') 125 | return False 126 | 127 | def next(self): 128 | return WaitBrokerStopped(self.state_context) 129 | 130 | def __str__(self): 131 | return 'StopKafka: stopping broker {}'.format(self.state_context.broker_id_to_restart) 132 | 133 | 134 | class WaitBrokerStopped(State): 135 | def run(self): 136 | def func(): 137 | return not self.state_context.zk.is_broker_registered(self.state_context.broker_id_to_restart) 138 | 139 | return self.run_with_timeout(func) 140 | 141 | def next(self): 142 | return DetachVolume(self.state_context) 143 | 144 | def __str__(self): 145 | return 'WaitBrokerStopped: waiting for broker {} to stop'.format(self.state_context.broker_id_to_restart) 146 | 147 | 148 | class DetachVolume(State): 149 | def run(self): 150 | self.state_context.ec_node.detach_volume() 151 | return True 152 | 153 | def next(self): 154 | return TerminateInstance(self.state_context) 155 | 156 | def __str__(self): 157 | return 'DetachVolume: detaching volume {} from broker {}'.format(self.state_context.ec_node.get_volume_id(), 158 | self.state_context.broker_id_to_restart) 159 | 160 | 161 | class TerminateInstance(State): 162 | def run(self): 163 | self.state_context.ec_node.terminate() 164 | return True 165 | 166 | def next(self): 167 | return WaitInstanceTerminated(self.state_context) 168 | 169 | def __str__(self): 170 | return 'TerminateInstance: terminating instance {}'.format(self.state_context.ec_node.get_ip()) 171 | 172 | 173 | class WaitInstanceTerminated(State): 174 | def run(self): 175 | def func(): 176 | return self.state_context.ec_node.is_terminated() 177 | 178 | return self.run_with_timeout(func) 179 | 180 | def next(self): 181 | return WaitVolumeAvailable(self.state_context) 182 | 183 | def __str__(self): 184 | return 'WaitInstanceTerminated: waiting for instance {} to be terminated'.format( 185 | 
self.state_context.ec_node.get_ip()) 186 | 187 | 188 | class WaitVolumeAvailable(State): 189 | def run(self): 190 | def func(): 191 | return self.state_context.ec_node.is_volume_available() 192 | 193 | return self.run_with_timeout(func) 194 | 195 | def next(self): 196 | return LaunchInstance(self.state_context) 197 | 198 | def __str__(self): 199 | return 'WaitVolumeAvailable: waiting for volume {} to be available'.format( 200 | self.state_context.ec_node.get_volume_id()) 201 | 202 | 203 | class LaunchInstance(State): 204 | def run(self): 205 | self.state_context.new_instance_id = self.state_context.ec2_node_launcher.launch() 206 | return True 207 | 208 | def next(self): 209 | return WaitVolumeAttached(self.state_context) 210 | 211 | 212 | class WaitVolumeAttached(State): 213 | def run(self): 214 | def func(): 215 | if self.state_context.ec_node.is_volume_in_use(): 216 | self.state_context.ec2_node_launcher.create_auto_recovery_alarm(self.state_context.new_instance_id) 217 | return True 218 | return False 219 | 220 | return self.run_with_timeout(func) 221 | 222 | def next(self): 223 | return WaitKafkaRunning(self.state_context) 224 | 225 | def __str__(self): 226 | return 'WaitVolumeAttached: waiting for volume {} to be attached'.format( 227 | self.state_context.ec_node.get_volume_id()) 228 | 229 | 230 | class WaitKafkaRunning(State): 231 | def run(self): 232 | def func(): 233 | return self.state_context.zk.is_broker_registered(self.state_context.broker_id_to_restart) 234 | 235 | return self.run_with_timeout(func) 236 | 237 | def next(self): 238 | return RegisterRollingRestart(self.state_context) 239 | 240 | def __str__(self): 241 | return 'WaitKafkaRunning: waiting broker {} is running'.format(self.state_context.broker_id_to_restart) 242 | 243 | 244 | class RegisterRollingRestart(State): 245 | def __init__(self, state_context): 246 | super(RegisterRollingRestart, self).__init__(state_context) 247 | self.cluster_is_healthy_from = 0 248 | 249 | def run(self): 250 | if len(self.state_context.restart_assignment) == 0: 251 | _LOG.info('Rolling restart is successfully finished') 252 | return True 253 | else: 254 | if utils.is_cluster_healthy(): 255 | if self.cluster_is_healthy_from == 0: 256 | self.cluster_is_healthy_from = time() 257 | else: 258 | _LOG.warning('Cluster is not healthy, waiting for it to recover') 259 | self.cluster_is_healthy_from = 0 260 | return False 261 | 262 | if time() - self.cluster_is_healthy_from >= self.state_context.cool_down: 263 | action = {'name': 'rolling_restart', 264 | 'restart_assignment': self.state_context.restart_assignment, 265 | 'overrides': self.state_context.cluster_config.get_overrides(), 266 | 'cool_down': self.state_context.cool_down} 267 | next_broker_id = self.state_context.broker_id_to_restart 268 | self.state_context.zk.register_action(action, broker_id=next_broker_id) 269 | return True 270 | return False 271 | 272 | def next(self): 273 | return None 274 | -------------------------------------------------------------------------------- /bubuku/features/swap_partitions.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from collections import namedtuple 3 | from operator import attrgetter 4 | from typing import List 5 | 6 | import requests 7 | 8 | from bubuku.broker import BrokerManager 9 | from bubuku.controller import Check 10 | from bubuku.features.rebalance import BaseRebalanceChange 11 | from bubuku.zookeeper import BukuExhibitor 12 | 13 | _LOG = 
logging.getLogger('bubuku.features.swap_partitions') 14 | 15 | TpData = namedtuple('_TpData', ('topic', 'partition', 'size', 'replicas')) 16 | 17 | 18 | class SwapPartitionsChange(BaseRebalanceChange): 19 | def __init__(self, zk: BukuExhibitor, swap_data_provider): 20 | self.zk = zk 21 | self.to_move = None 22 | self.swap_data_provider = swap_data_provider 23 | 24 | def run(self, current_actions): 25 | if self.should_be_paused(current_actions): 26 | _LOG.info("Pausing swap partitions change as there are conflicting actions: {}".format(current_actions)) 27 | return True 28 | # if there's a rebalance currently running - postpone current change 29 | if self.zk.is_rebalancing(): 30 | return True 31 | 32 | if self.to_move is None: 33 | slim_broker_id, fat_broker_id, gap, size_stats = self.swap_data_provider(self.zk) 34 | if slim_broker_id is None: 35 | _LOG.info('Can not find slim broker and fat broker during reassignment. Probably gap changed') 36 | return False 37 | # merge topics size stats to a single dict 38 | topics_stats = {} 39 | for broker_stats in size_stats.values(): 40 | for topic in broker_stats["topics"].keys(): 41 | if topic not in topics_stats: 42 | topics_stats[topic] = {} 43 | topics_stats[topic].update(broker_stats["topics"][topic]) 44 | 45 | # find partitions that are candidates to be swapped between "fat" and "slim" brokers 46 | swap_partition_candidates = self.__find_all_swap_candidates(fat_broker_id, slim_broker_id, topics_stats) 47 | 48 | # smallest partition from slim broker is the one we move to fat broker 49 | slim_broker_smallest_partition = min(swap_partition_candidates[slim_broker_id], key=attrgetter("size")) 50 | if not slim_broker_smallest_partition: 51 | _LOG.info("No partitions on slim broker(id: {}) found to swap".format(slim_broker_id)) 52 | return False 53 | _LOG.info("Slim broker(id: {}) partition to swap: {}".format( 54 | slim_broker_id, slim_broker_smallest_partition)) 55 | 56 | # find the best fitting fat broker partition to move to slim broker 57 | # (should be as much as possible closing the gap between brokers) 58 | fat_broker_swap_candidates = swap_partition_candidates[fat_broker_id] 59 | matching_swap_partition = self.__find_best_swap_candidate(fat_broker_swap_candidates, gap, 60 | slim_broker_smallest_partition.size) 61 | 62 | # if there is no possible swap that will decrease the gap - just do nothing 63 | if not matching_swap_partition: 64 | _LOG.info("No candidate from fat broker(id:{}) found to swap".format(fat_broker_id)) 65 | return False 66 | _LOG.info("Fat broker(id: {}) partition to swap: {}".format(fat_broker_id, matching_swap_partition)) 67 | # write rebalance-json to ZK; Kafka will read it and perform the partitions swap 68 | self.to_move = self.__create_rebalance_list(slim_broker_smallest_partition, slim_broker_id, 69 | matching_swap_partition, fat_broker_id) 70 | 71 | # if there is already a swap which was postponed - just execute it 72 | return not self.__perform_swap(self.to_move) 73 | 74 | def __perform_swap(self, rebalance_list): 75 | _LOG.info("Writing rebalance-json to ZK for partitions swap: {}".format(rebalance_list)) 76 | return self.zk.reallocate_partitions(rebalance_list) 77 | 78 | def __find_all_swap_candidates(self, fat_broker_id: int, slim_broker_id: int, topics_stats: dict) -> dict: 79 | swap_partition_candidates = {fat_broker_id: [], slim_broker_id: []} 80 | for topic, partition, replicas in self.zk.load_partition_assignment(): 81 | if topic not in topics_stats or str(partition) not in topics_stats[topic]: 82 | continue 
# we skip this partition as there is not data size stats for it 83 | 84 | if replicas[0] in (fat_broker_id, slim_broker_id): 85 | continue # Skip leadership transfer 86 | 87 | if fat_broker_id in replicas and slim_broker_id in replicas: 88 | continue # we skip this partition as it exists on both involved brokers 89 | 90 | for broker_id in [slim_broker_id, fat_broker_id]: 91 | if broker_id in replicas: 92 | swap_partition_candidates[broker_id].append( 93 | TpData(topic, partition, topics_stats[topic][str(partition)], replicas)) 94 | return swap_partition_candidates 95 | 96 | @staticmethod 97 | def __find_best_swap_candidate(candidates: list, brokers_gap: int, partition_to_swap_size: int) -> TpData: 98 | candidates.sort(key=attrgetter("size"), reverse=True) 99 | matching_swap_partition = None 100 | smallest_new_gap = brokers_gap 101 | for tp in candidates: 102 | new_gap = abs(brokers_gap - 2 * abs(tp.size - partition_to_swap_size)) 103 | if new_gap < smallest_new_gap: 104 | smallest_new_gap = new_gap 105 | matching_swap_partition = tp 106 | return matching_swap_partition 107 | 108 | def __create_rebalance_list(self, tp1: TpData, br1: int, tp2: TpData, br2: int) -> list: 109 | return [ 110 | (tp1.topic, tp1.partition, self.__replace_broker(tp1.replicas, br1, br2, tp2.replicas[0] == br2)), 111 | (tp2.topic, tp2.partition, self.__replace_broker(tp2.replicas, br2, br1, tp1.replicas[0] == br1)) 112 | ] 113 | 114 | def __replace_broker(self, replicas: list, broker_to_replace: int, replacement: int, was_leader: bool) -> list: 115 | rps = [x for x in replicas if x != broker_to_replace] 116 | if was_leader: 117 | return [replacement] + rps 118 | else: 119 | return rps + [replacement] 120 | 121 | def __str__(self): 122 | return 'SwapPartitions' 123 | 124 | 125 | def _load_disk_stats(zk: BukuExhibitor, api_port: int): 126 | size_stats = zk.get_disk_stats() 127 | if len(size_stats) < 2: 128 | _LOG.info("No size stats available, imbalance check cancelled") 129 | return None 130 | result = {} 131 | for broker_id, value in size_stats.items(): 132 | try: 133 | if api_port != -1: # For unit tests only 134 | host = zk.get_broker_address(broker_id) 135 | tmp = requests.get( 136 | 'http://{}:{}/api/disk_stats'.format(host, api_port), 137 | timeout=5).json() 138 | if any(a not in tmp for a in ['free_kb', 'used_kb']): 139 | continue 140 | value['disk'] = tmp 141 | value['host'] = host 142 | result[broker_id] = value 143 | except Exception as e: 144 | _LOG.error('Failed to load disk stats for broker {}. Skipping it'.format(broker_id), exc_info=e) 145 | 146 | return result 147 | 148 | 149 | def load_swap_data(zk: BukuExhibitor, api_port: int, gap: int) -> (str, str, int, dict): 150 | """ 151 | Finds brokers that could be used for gap of size gap. 
If rack awareness is enabled, the swap will be between two 152 | brokers in the same rack 153 | :param zk: Bubuku exhibitor 154 | :param api_port: bubuku api port 155 | :param gap: gap in kb to get information for 156 | :return: (slim_broker_id, fat_broker_id, calculated_gap, size_stats) or (None, None, calculated_gap, size_stats) 157 | """ 158 | size_stats = _load_disk_stats(zk, api_port) 159 | if not size_stats or len(size_stats) < 2: 160 | return None, None, None, size_stats 161 | sorted_stats = sorted(size_stats.items(), key=lambda tup: tup[1]["disk"]["free_kb"]) 162 | fat_broker, slim_broker = select_fat_slim_brokers(zk, sorted_stats) 163 | if fat_broker is None: 164 | return None, None, None, size_stats 165 | 166 | calculated_gap = slim_broker[1]['disk']['free_kb'] - fat_broker[1]['disk']['free_kb'] 167 | _LOG.info('Gap between {} and {} is {}, need to fix: {}'.format( 168 | fat_broker[0], slim_broker[0], calculated_gap, calculated_gap > gap)) 169 | if calculated_gap >= gap: 170 | return int(slim_broker[0]), int(fat_broker[0]), calculated_gap, size_stats 171 | return None, None, calculated_gap, size_stats 172 | 173 | 174 | def select_fat_slim_brokers(zk: BukuExhibitor, sorted_stats: list): 175 | racks = zk.get_broker_racks() 176 | if any([rack is None for rack in racks.values()]): 177 | return sorted_stats[0], sorted_stats[-1] 178 | for i in range(len(sorted_stats) - 1): 179 | fat_broker = sorted_stats[i] 180 | fat_rack = racks[int(fat_broker[0])] 181 | for j in range(len(sorted_stats) -1, i, -1): 182 | slim_broker = sorted_stats[j] 183 | slim_rack = racks[int(slim_broker[0])] 184 | if slim_rack == fat_rack: 185 | return fat_broker, slim_broker 186 | 187 | return None, None 188 | 189 | 190 | class CheckBrokersDiskImbalance(Check): 191 | def __init__(self, zk: BukuExhibitor, broker: BrokerManager, diff_threshold_kb: int, api_port: int): 192 | super().__init__(check_interval_s=900) 193 | self.zk = zk 194 | self.api_port = api_port 195 | self.broker = broker 196 | self.diff_threshold_kb = diff_threshold_kb 197 | 198 | def check(self): 199 | if self.broker.is_running_and_registered(): 200 | _LOG.info("Starting broker disk imbalance check") 201 | try: 202 | slim_broker_id, fat_broker_id, gap, size_stats = load_swap_data( 203 | self.zk, self.api_port, self.diff_threshold_kb) 204 | if slim_broker_id is not None: # All or nothing 205 | return SwapPartitionsChange( 206 | self.zk, 207 | lambda x: load_swap_data(x, self.api_port, self.diff_threshold_kb)) 208 | except Exception as e: 209 | _LOG.warn("Error occurred when performing disk imbalance check", exc_info=e) 210 | return None 211 | 212 | def __str__(self): 213 | return 'CheckBrokersDiskImbalance' 214 | -------------------------------------------------------------------------------- /bubuku/features/terminate.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import signal 3 | 4 | from bubuku.broker import BrokerManager 5 | from bubuku.controller import Controller, Change 6 | 7 | _LOG = logging.getLogger('bubuku.features.terminate') 8 | 9 | 10 | class StopBrokerChange(Change): 11 | def __init__(self, broker: BrokerManager): 12 | self.broker = broker 13 | 14 | def get_name(self): 15 | return 'stop' 16 | 17 | def __str__(self): 18 | return 'StopBrokerChange ({})'.format(self.get_name()) 19 | 20 | def can_run(self, current_actions): 21 | return all([action not in current_actions for action in ['start', 'restart', 'stop', 'complete_stop']]) 22 | 23 | def run(self, current_actions): 24 | 
_LOG.info('Stopping kafka process') 25 | self.broker.stop_kafka_process() 26 | return self.broker.has_leadership() 27 | 28 | def can_run_at_exit(self): 29 | return True 30 | 31 | 32 | class CompleteStopChange(Change): 33 | def __init__(self, broker: BrokerManager, controller: Controller): 34 | self.broker = broker 35 | self.controller = controller 36 | 37 | def get_name(self): 38 | return 'complete_stop' 39 | 40 | def __str__(self): 41 | return 'CompleteStopChange ({})'.format(self.get_name()) 42 | 43 | def can_run(self, current_actions): 44 | return all([action not in current_actions for action in ['start', 'restart', 'stop', 'complete_stop']]) 45 | 46 | def run(self, current_actions): 47 | _LOG.info('Stopping kafka process and the controller') 48 | self.controller.stop(StopBrokerChange(self.broker)) 49 | return False 50 | 51 | def can_run_at_exit(self): 52 | return False 53 | 54 | 55 | __REGISTERED = None 56 | 57 | 58 | def get_registration(): 59 | if not __REGISTERED: 60 | return None, None 61 | return __REGISTERED 62 | 63 | 64 | def register_terminate_on_interrupt(controller: Controller, broker: BrokerManager): 65 | global __REGISTERED 66 | 67 | def _sig_handler(*args, **kwargs): 68 | _LOG.info('Signal was caught, stopping controller gracefully') 69 | controller.stop(StopBrokerChange(broker)) 70 | 71 | _LOG.info('Registering signal handler') 72 | old_handler = signal.signal(signal.SIGTERM, _sig_handler) 73 | if old_handler: 74 | _LOG.warn('Old handler is removed: {}'.format(old_handler)) 75 | __REGISTERED = (controller, broker) 76 | -------------------------------------------------------------------------------- /bubuku/id_extractor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import logging 3 | import os 4 | import re 5 | from typing import Optional, List 6 | 7 | from bubuku.config import KafkaProperties 8 | from bubuku.zookeeper import BukuExhibitor 9 | 10 | _LOG = logging.getLogger('bubuku.id_generator') 11 | 12 | 13 | def _search_broker_id(lines: List[str]) -> Optional[str]: 14 | for line in lines: 15 | match = re.search('^broker\\.id=(\\d+)$', line.strip()) 16 | if match: 17 | return match.group(1) 18 | 19 | 20 | class BrokerIdExtractor(object): 21 | def __init__(self, zk: BukuExhibitor, kafka_properties: KafkaProperties): 22 | super().__init__() 23 | self.zk = zk 24 | self.kafka_properties = kafka_properties 25 | self.broker_id = None 26 | 27 | def get_broker_id(self): 28 | if self.broker_id: 29 | return self.broker_id 30 | 31 | meta_path = '{}/meta.properties'.format(self.kafka_properties.get_property('log.dirs')) 32 | while not os.path.isfile(meta_path): 33 | return None 34 | with open(meta_path) as f: 35 | self.broker_id = _search_broker_id(f.readlines()) 36 | return self.broker_id 37 | 38 | def is_registered(self): 39 | broker_id = self.get_broker_id() 40 | if broker_id: 41 | return self.zk.is_broker_registered(broker_id) 42 | return False 43 | -------------------------------------------------------------------------------- /bubuku/process.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | 3 | 4 | class KafkaProcess(object): 5 | def __init__(self, kafka_dir: str): 6 | self.process = None 7 | self.kafka_dir = kafka_dir 8 | 9 | def start(self, settings_file): 10 | if self.is_running(): 11 | raise Exception('Kafka process already started') 12 | self.process = subprocess.Popen([self.kafka_dir + "/bin/kafka-server-start.sh", settings_file]) 13 | 14 | 
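    # Polls the child process so its return code is refreshed, then reports whether the
    # started Kafka process is still alive; returns False if it was never started or has already exited.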
def is_running(self) -> bool: 15 | if self.process: 16 | self.process.poll() 17 | return self.process.returncode is None 18 | return False 19 | 20 | def stop_and_wait(self): 21 | if self.process is None: 22 | raise Exception('Process was not started') 23 | self.process.terminate() 24 | self.process.wait() 25 | self.process = None 26 | -------------------------------------------------------------------------------- /bubuku/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import subprocess 3 | 4 | import requests 5 | 6 | from bubuku.config import Config, KafkaProperties, load_config 7 | from bubuku.env_provider import EnvProvider 8 | from bubuku.zookeeper import BukuExhibitor 9 | 10 | _LOG = logging.getLogger('bubuku.utils') 11 | 12 | 13 | class CmdHelper(object): 14 | def get_disk_stats(self) -> (int, int): 15 | """ 16 | Returns total disk stats, 17 | :return: used_kb, free_kb 18 | """ 19 | disks = self.cmd_run("df -k | tail -n +2 | awk '{ print $3, $4 }'").split("\n") 20 | total_used = total_free = 0 21 | for disk in disks: 22 | parts = disk.split(" ") 23 | if len(parts) == 2: 24 | used, free = tuple(parts) 25 | total_used += int(used) 26 | total_free += int(free) 27 | return total_used, total_free 28 | 29 | def cmd_run(self, cmd): 30 | output = subprocess.check_output(cmd, shell=True) 31 | return output.decode("utf-8") 32 | 33 | 34 | def get_opt_broker_id(broker_id: str, config: Config, zk: BukuExhibitor, env_provider: EnvProvider, throw_on_missing=True) -> str: 35 | if not broker_id: 36 | kafka_properties = KafkaProperties(config.kafka_settings_template, '/tmp/tmp.props'.format(config.kafka_dir)) 37 | broker_id_manager = env_provider.create_broker_id_manager(zk, kafka_properties) 38 | broker_id = broker_id_manager.get_broker_id() 39 | _LOG.info('Will use broker_id {}'.format(broker_id)) 40 | running_brokers = zk.get_broker_ids() 41 | if broker_id not in running_brokers and throw_on_missing: 42 | raise Exception('Broker id {} is not registered ({})'.format(broker_id, running_brokers)) 43 | return broker_id 44 | 45 | 46 | def prepare_configs(): 47 | config = load_config() 48 | _LOG.info('Using config: {}'.format(config)) 49 | env_provider = EnvProvider.create_env_provider(config) 50 | return config, env_provider 51 | 52 | 53 | def is_cluster_healthy(): 54 | config = load_config() 55 | try: 56 | response = requests.get('http://{}:{}/api/metrics'.format('localhost', '8080')) 57 | resp_json = response.json() 58 | if not resp_json['metrics']: 59 | return False 60 | for metrics in resp_json['metrics']: 61 | metric = metrics['metrics'] 62 | if metric: 63 | if metric['PreferredReplicaImbalance'] > 0: 64 | return False 65 | if metric['OfflinePartitions'] > 0: 66 | return False 67 | if metric['UnderReplicatedPartitions'] > 0: 68 | return False 69 | else: 70 | return False 71 | return True 72 | except Exception as e: 73 | _LOG.error('Failed to get cluster state', exc_info=e) 74 | return False 75 | 76 | 77 | def get_max_bytes_in(): 78 | response = requests.get('http://{}:{}/api/metrics'.format('localhost', '8080')) 79 | resp_json = response.json() 80 | if not resp_json['metrics']: 81 | raise Exception("Can't fetch metrics to note current cluster state. 
Please try again") 82 | return max([int(metric['metrics']['BytesIn']) for metric in resp_json["metrics"]]) 83 | -------------------------------------------------------------------------------- /bubuku/zookeeper/exhibitor.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import random 3 | 4 | import requests 5 | from requests import RequestException 6 | 7 | from bubuku.zookeeper import AddressListProvider 8 | 9 | _LOG = logging.getLogger('bubuku.zookeeper.exhibitor') 10 | 11 | 12 | class ExhibitorAddressProvider(AddressListProvider): 13 | def __init__(self, initial_list_provider): 14 | self.initial_list_provider = initial_list_provider 15 | self.exhibitors = [] 16 | 17 | def get_latest_address(self) -> (list, int): 18 | json_ = self._query_exhibitors(self.exhibitors) 19 | if not json_: 20 | self.exhibitors = self.initial_list_provider() 21 | json_ = self._query_exhibitors(self.exhibitors) 22 | if isinstance(json_, dict) and 'servers' in json_ and 'port' in json_: 23 | self.exhibitors = json_['servers'] 24 | return sorted(json_['servers']), int(json_['port']) 25 | return None 26 | 27 | def _query_exhibitors(self, exhibitors): 28 | if not exhibitors: 29 | return None 30 | random.shuffle(exhibitors) 31 | for host in exhibitors: 32 | url = 'http://{}:{}{}'.format(host, 8181, '/exhibitor/v1/cluster/list') 33 | try: 34 | response = requests.get(url, timeout=3.1, headers={'Accept': 'application/json'}) 35 | return response.json() 36 | except RequestException as e: 37 | _LOG.warning('Failed to query zookeeper list information from {}'.format(url), exc_info=e) 38 | except ConnectionError as e: 39 | _LOG.warning('Failed to connect to zookeeper instance {}'.format(url), exc_info=e) 40 | except Exception as e: 41 | _LOG.warning('Unknown error connecting to zookeeper instance {}'.format(url), exc_info=e) 42 | return None 43 | -------------------------------------------------------------------------------- /cli_docs/cli.md: -------------------------------------------------------------------------------- 1 | # Bubuku command line interface 2 | 3 | Bubuku provides a command line tool `bubuku-cli` which should be used directly on the instance. Available commands: 4 | 5 | #### preferred-replica-election 6 | ``` 7 | Usage: bubuku-cli preferred-replica-election [OPTIONS] 8 | 9 | Do preferred replica election, as the command line tool from Kafka has a 10 | number of limitations. Only partitions that are improperly allocated will 11 | be affected. If the size of the resulting json is too big, it will be 12 | split into several parts, which will be executed one after another. 13 | 14 | Options: 15 | --dry-run Do not apply the changes. Instead just prepare json 16 | file(s) 17 | --max-json-size INTEGER Maximum size of json data in bytes to write to zk 18 | [default: 512000] 19 | --help Show this message and exit. 20 | ``` 21 | 22 | #### restart 23 | ``` 24 | Usage: bubuku-cli restart [OPTIONS] 25 | 26 | Restart kafka instance 27 | 28 | Options: 29 | --broker TEXT Broker id to restart. By default current broker id is 30 | restarted 31 | --help Show this message and exit.
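# Hypothetical invocation (the broker id is illustrative, not taken from a real cluster):
#   bubuku-cli restart --broker 123456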
32 | ``` 33 | 34 | #### rolling-restart 35 | ``` 36 | Usage: bubuku-cli rolling-restart [OPTIONS] 37 | 38 | Rolling restart of Kafka cluster 39 | 40 | Options: 41 | --image-tag TEXT Docker image to run Kafka broker 42 | --instance-type TEXT AWS instance type to run Kafka broker on 43 | --scalyr-key TEXT Scalyr account key 44 | --scalyr-region TEXT Scalyr region to use 45 | --kms-key-id TEXT Kms key id to decrypt data with 46 | --cool-down INTEGER Number of seconds to wait before passing the restart 47 | task to another broker, after cluster is stable 48 | [default: 20] 49 | --help Show this message and exit. 50 | ``` 51 | 52 | #### rebalance 53 | ``` 54 | Usage: bubuku-cli rebalance [OPTIONS] 55 | 56 | Run rebalance process on one of the brokers. If rack-awareness is enabled, 57 | replicas will only be moved to other brokers in the same rack 58 | 59 | Options: 60 | --broker TEXT Broker instance on which to perform rebalance. By 61 | default, any free broker will start it 62 | --empty_brokers TEXT Comma-separated list of brokers to empty. All 63 | partitions will be moved to other brokers 64 | --exclude_topics TEXT Comma-separated list of topics to exclude from 65 | rebalance 66 | --bin-packing Use bin packing approach instead of one-way 67 | processing 68 | --parallelism INTEGER Number of partitions to move in a single rebalance 69 | step [default: 1] 70 | --throttle INTEGER Upper bound on bandwidth (in bytes/sec) used for 71 | rebalance 72 | --remove-throttle Don't trigger rebalance but remove throttling 73 | configuration from all the brokers and topics 74 | --help Show this message and exit. 75 | ``` 76 | 77 | #### migrate 78 | ``` 79 | Usage: bubuku-cli migrate [OPTIONS] 80 | 81 | Replace one broker with another for all partitions 82 | 83 | Options: 84 | --from TEXT List of brokers to migrate from (separated with ",") 85 | --to TEXT List of brokers to migrate to (separated with ",") 86 | --shrink Whether or not to shrink replaced broker ids from 87 | partition assignment [default: False] 88 | --broker TEXT Optional broker id to execute check on 89 | --throttle INTEGER Upper bound on bandwidth (in bytes/sec) used for 90 | reassigning partitions 91 | --parallelism INTEGER Number of partitions to move in a single migration 92 | step [default: 1] 93 | --remove-throttle Don't trigger rebalance but remove throttling 94 | configuration from all the brokers and topics 95 | --help Show this message and exit. 96 | ``` 97 | 98 | #### swap_fat_slim 99 | ``` 100 | Usage: bubuku-cli swap_fat_slim [OPTIONS] 101 | 102 | Move one partition from fat broker to slim one 103 | 104 | Options: 105 | --threshold INTEGER Threshold in kb to run swap [default: 100000] 106 | --help Show this message and exit. 107 | ``` 108 | 109 | #### actions list 110 | ``` 111 | Usage: bubuku-cli actions list [OPTIONS] 112 | 113 | List all the actions on broker(s) 114 | 115 | Options: 116 | --broker TEXT Broker id to list actions on. By default all brokers are 117 | enumerated 118 | --help Show this message and exit. 119 | ``` 120 | 121 | #### actions delete 122 | ``` 123 | Usage: bubuku-cli actions delete [OPTIONS] 124 | 125 | Remove all actions of specified type on broker(s) 126 | 127 | Options: 128 | --action TEXT Action to delete 129 | --broker TEXT Broker id to delete actions on. By default actions are 130 | deleted on all brokers 131 | --help Show this message and exit.
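# Hypothetical invocations (the action name and broker id are illustrative, not taken from a real cluster):
#   bubuku-cli actions delete --action restart --broker 123456
#   bubuku-cli actions delete --action rebalance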
132 | ``` 133 | 134 | #### stats 135 | ``` 136 | Usage: bubuku-cli stats [OPTIONS] 137 | 138 | Display statistics about brokers 139 | 140 | Options: 141 | --help Show this message and exit. 142 | ``` 143 | 144 | #### validate replication 145 | ``` 146 | Usage: bubuku-cli validate replication [OPTIONS] 147 | 148 | Returns all partitions whose ISR size differs from the replication factor 149 | or have not registered broker ids 150 | 151 | Options: 152 | --factor INTEGER Replication factor [default: 3] 153 | --help Show this message and exit. 154 | ``` 155 | 156 | -------------------------------------------------------------------------------- /cli_docs/generate_cli_docs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import click 4 | 5 | from bubuku import cli 6 | 7 | _HEADER = """# Bubuku command line interface 8 | 9 | Bubuku provides a command line tool `bubuku-cli` which should be used directly on the instance. Available commands: 10 | 11 | """ 12 | 13 | 14 | def generate_command_docs(name, command, md_file, parent_ctx=None): 15 | ctx = click.Context(command, info_name=name, parent=parent_ctx) 16 | sub_commands = getattr(command, "commands", {}) 17 | 18 | # generate docs only for actual commands (not command groups) 19 | if len(sub_commands) == 0: 20 | cmd_path = ctx.command_path.split() 21 | cmd_path.pop(0) 22 | md_file.write("#### {}\n".format(" ".join(cmd_path))) 23 | md_file.write("```\n{}\n```\n\n".format(ctx.get_help())) 24 | else: 25 | # if command has sub-commands - recursively generate docs for all sub-commands 26 | for sub_cmd_name, sub_command in sub_commands.items(): 27 | generate_command_docs(sub_cmd_name, sub_command, md_file, ctx) 28 | 29 | 30 | if __name__ == '__main__': 31 | print("Generating 'cli.md'...") 32 | 33 | with open("cli.md", "w") as md_file: 34 | md_file.write(_HEADER) 35 | generate_command_docs("bubuku-cli", cli.cli, md_file) 36 | 37 | print("Done") 38 | -------------------------------------------------------------------------------- /cli_docs/generate_cli_docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd `dirname ${0}` 4 | PYTHONPATH=.. python3 generate_cli_docs.py 5 | cd - > /dev/null 6 | -------------------------------------------------------------------------------- /delivery.yaml: -------------------------------------------------------------------------------- 1 | version: "2017-09-20" 2 | pipeline: 3 | - id: build 4 | type: script 5 | vm_config: 6 | type: linux 7 | size: large 8 | image: cdp-runtime/python-3.9 9 | commands: 10 | - desc: Run tests 11 | cmd: | 12 | pip3 install -r requirements.txt 13 | python3 setup.py test 14 | 15 | - desc: Build docker images for different architectures 16 | cmd: | 17 | IMAGE="container-registry-test.zalando.net/aruha/bubuku-appliance:oss-${CDP_BUILD_VERSION}" 18 | 19 | # create a Buildkit builder with CDP specific configuration 20 | docker buildx create \ 21 | --config /etc/cdp-buildkitd.toml \ 22 | --driver-opt network=host \ 23 | --name cdpbuildx \ 24 | --bootstrap \ 25 | --use 26 | 27 | # (1) build image for AMD64 and ARM64 and push it to the Zalando Container Registry 28 | docker buildx build \ 29 | --platform linux/amd64,linux/arm64 \ 30 | -t ${IMAGE} \ 31 | --push \ 32 | . 
33 | 34 | # (2) promote it from `container-registry-test` to `container-registry`, marking it production-ready 35 | cdp-promote-image ${IMAGE} 36 | 37 | - id: push-pierone-arm64 38 | type: script 39 | when: 40 | event: push 41 | vm_config: 42 | type: linux 43 | image: cdp-runtime/base 44 | commands: 45 | - desc: Push ARM64 image to PierOne 46 | cmd: | 47 | IMAGE="container-registry.zalando.net/aruha/bubuku-appliance:oss-${CDP_BUILD_VERSION}" 48 | PIERONE_IMAGE="registry-write.opensource.zalan.do/aruha/bubuku-appliance:oss-${CDP_BUILD_VERSION}-arm64" 49 | 50 | docker pull --platform linux/arm64 $IMAGE 51 | docker tag $IMAGE $PIERONE_IMAGE 52 | docker push $PIERONE_IMAGE 53 | 54 | - id: push-pierone-amd64 55 | type: script 56 | when: 57 | event: push 58 | vm_config: 59 | type: linux 60 | image: cdp-runtime/base 61 | commands: 62 | - desc: Push AMD64 image to PierOne 63 | cmd: | 64 | IMAGE="container-registry.zalando.net/aruha/bubuku-appliance:oss-${CDP_BUILD_VERSION}" 65 | PIERONE_IMAGE="registry-write.opensource.zalan.do/aruha/bubuku-appliance:oss-${CDP_BUILD_VERSION}-amd64" 66 | 67 | docker pull --platform linux/amd64 $IMAGE 68 | docker tag $IMAGE $PIERONE_IMAGE 69 | docker push $PIERONE_IMAGE 70 | 71 | notifications: 72 | - channel: google_chat 73 | rooms: 74 | - AAAAmX_hkRQ 75 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | services: 3 | 4 | bubuku: 5 | build: . 6 | depends_on: 7 | - zookeeper 8 | environment: 9 | BUBUKU_MODE: "local" 10 | HEALTH_PORT: "8080" 11 | BUKU_FEATURES: "restart_on_exhibitor,rebalance_on_brokers_change,graceful_terminate" 12 | 13 | zookeeper: 14 | image: wurstmeister/zookeeper:3.4.6 15 | -------------------------------------------------------------------------------- /docker/download_kafka.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | SCALA_VERSION=${1} 4 | KAFKA_VERSION=${2} 5 | KAFKA_DIR=${3} 6 | JOLOKIA_VERSION=${4} 7 | 8 | set -xe 9 | 10 | curl -f "https://archive.apache.org/dist/kafka/${KAFKA_VERSION}/kafka_${SCALA_VERSION}-${KAFKA_VERSION}.tgz" > "/tmp/kafka_release.tgz" 11 | tar xf /tmp/kafka_release.tgz -C /opt 12 | rm -f /tmp/kafka_release.tgz 13 | mv /opt/kafka_${SCALA_VERSION}-${KAFKA_VERSION} $KAFKA_DIR 14 | 15 | curl -fL "http://search.maven.org/remotecontent?filepath=org/jolokia/jolokia-jvm/${JOLOKIA_VERSION}/jolokia-jvm-${JOLOKIA_VERSION}-agent.jar" > "/opt/jolokia-jvm-agent.jar" 16 | 17 | -------------------------------------------------------------------------------- /docker/log4j.properties: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one or more 2 | # contributor license agreements. See the NOTICE file distributed with 3 | # this work for additional information regarding copyright ownership. 4 | # The ASF licenses this file to You under the Apache License, Version 2.0 5 | # (the "License"); you may not use this file except in compliance with 6 | # the License. You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | log4j.rootLogger=WARN, stdout 17 | 18 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 19 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 20 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss.SSS} %p %m (%c)%n 21 | 22 | log4j.logger.kafka=WARN 23 | log4j.logger.kafka.server.FetchManager=ERROR 24 | -------------------------------------------------------------------------------- /docker/server.properties: -------------------------------------------------------------------------------- 1 | log.dirs=/data/kafka-logs 2 | listeners=PLAINTEXT://:9092 3 | advertised.listeners=PLAINTEXT://:9092 4 | auto.create.topics.enable=false 5 | delete.topic.enable=true 6 | auto.leader.rebalance.enable=true 7 | leader.imbalance.check.interval.seconds=100 8 | unclean.leader.election.enable=false 9 | min.insync.replicas=2 10 | reserved.broker.max.id=67108864 11 | broker.id.generation.enable=true 12 | ### from http://kafka.apache.org/documentation.html#prodconfig 13 | 14 | # Replication configurations 15 | num.replica.fetchers=8 16 | replica.fetch.max.bytes=2097152 17 | replica.fetch.wait.max.ms=500 18 | replica.high.watermark.checkpoint.interval.ms=5000 19 | replica.socket.timeout.ms=30000 20 | replica.socket.receive.buffer.bytes=65536 21 | replica.lag.time.max.ms=10000 22 | replica.lag.max.messages=4000 23 | replica.selector.class=org.apache.kafka.common.replica.RackAwareReplicaSelector 24 | 25 | controller.socket.timeout.ms=30000 26 | controller.message.queue.size=10 27 | 28 | # Log configuration 29 | #num.partitions=8 30 | message.max.bytes=2098152 31 | #auto.create.topics.enable=true 32 | log.index.interval.bytes=4096 33 | log.index.size.max.bytes=10485760 34 | log.retention.hours=168 35 | log.flush.interval.ms=10000 36 | log.flush.interval.messages=20000 37 | log.flush.scheduler.interval.ms=2000 38 | log.roll.hours=168 39 | log.retention.check.interval.ms=300000 40 | log.segment.bytes=1073741824 41 | log.cleaner.max.compaction.lag.ms=1209600000 42 | 43 | # ZK configuration 44 | zookeeper.connection.timeout.ms=6000 45 | zookeeper.sync.time.ms=2000 46 | 47 | # Socket server configuration 48 | num.io.threads=16 49 | num.network.threads=16 50 | socket.request.max.bytes=104857600 51 | socket.receive.buffer.bytes=1048576 52 | socket.send.buffer.bytes=1048576 53 | queued.max.requests=32 54 | fetch.purgatory.purge.interval.requests=100 55 | producer.purgatory.purge.interval.requests=100 56 | 57 | #migration 58 | inter.broker.protocol.version=3.1 59 | log.message.format.version=3.1 60 | 61 | # never expire consumer offsets 62 | offsets.retention.minutes=52560000 63 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | botocore>=1.17.30 2 | kazoo>=2.8.0 3 | boto3>=1.14.30 4 | requests>=2.24.0 5 | click>=7.1.2 6 | pyyaml>=5.3.1 7 | netaddr>=0.8.0 8 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md 3 | 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | import inspect 5 
| import os 6 | import sys 7 | 8 | import setuptools 9 | from setuptools import setup 10 | from setuptools.command.test import test 11 | from distutils.core import Command 12 | 13 | if sys.version_info < (3, 5, 0): 14 | sys.stderr.write('FATAL: Bubuku needs to be run with Python 3.5+\n') 15 | sys.exit(1) 16 | 17 | __location__ = os.path.join(os.getcwd(), os.path.dirname(inspect.getfile(inspect.currentframe()))) 18 | 19 | 20 | def read_version(package): 21 | with open(os.path.join(package, '__init__.py'), 'r') as fd: 22 | for line in fd: 23 | if line.startswith('__version__ = '): 24 | return line.split()[-1].strip().strip("'") 25 | 26 | 27 | NAME = 'bubuku' 28 | MAIN_PACKAGE = 'bubuku' 29 | VERSION = read_version(MAIN_PACKAGE) 30 | DESCRIPTION = 'AWS support for kafka broker' 31 | LICENSE = 'Apache License 2.0' 32 | URL = 'https://github.com/zalando-incubator/bubuku' 33 | AUTHOR = 'Dmitry Sorokin' 34 | EMAIL = 'dmitriy.sorokin@zalando.de' 35 | KEYWORDS = 'aws kafka supervisor' 36 | 37 | # Add here all kinds of additional classifiers as defined under 38 | # https://pypi.python.org/pypi?%3Aaction=list_classifiers 39 | CLASSIFIERS = [ 40 | 'Development Status :: 3 - Alpha', 41 | 'Environment :: Console', 42 | 'Intended Audience :: Developers', 43 | 'Intended Audience :: System Administrators', 44 | 'License :: OSI Approved :: Apache Software License', 45 | 'Operating System :: POSIX :: Linux', 46 | 'Programming Language :: Python', 47 | 'Programming Language :: Python :: 3.4', 48 | 'Programming Language :: Python :: Implementation :: CPython', 49 | ] 50 | 51 | CONSOLE_SCRIPTS = [ 52 | 'bubuku-daemon = bubuku.daemon:main', 53 | 'bubuku-cli = bubuku.cli:cli' 54 | ] 55 | 56 | 57 | class DockerUpCommand(Command): 58 | description = "Start up docker compose with 3 bubuku and 1 zookeeper instances" 59 | user_options = [ 60 | ('bubuku-scale=', None, 'Specify number of bubuku instances') 61 | ] 62 | 63 | def initialize_options(self): 64 | self.bubuku_scale = 3 65 | 66 | def finalize_options(self): 67 | pass 68 | 69 | def run(self): 70 | os.system('docker-compose up -d --build && docker-compose scale bubuku=' + str(self.bubuku_scale)) 71 | 72 | 73 | class DockerDownCommand(Command): 74 | description = "Stop docker compose" 75 | user_options = [] 76 | 77 | def initialize_options(self): 78 | pass 79 | 80 | def finalize_options(self): 81 | pass 82 | 83 | def run(self): 84 | os.system('docker-compose down') 85 | 86 | 87 | class PyTest(test): 88 | def run_tests(self): 89 | try: 90 | import pytest 91 | except: 92 | raise RuntimeError('py.test is not installed, run: pip install pytest') 93 | params = {'args': self.test_args} 94 | errno = pytest.main(**params) 95 | sys.exit(errno) 96 | 97 | 98 | def read(fname): 99 | with open(os.path.join(__location__, fname)) as f: 100 | return f.read() 101 | 102 | 103 | def setup_package(): 104 | command_options = {'test': {'test_suite': ('setup.py', 'tests')}} 105 | 106 | setup( 107 | name=NAME, 108 | version=VERSION, 109 | url=URL, 110 | description=DESCRIPTION, 111 | author=AUTHOR, 112 | author_email=EMAIL, 113 | license=LICENSE, 114 | keywords=KEYWORDS, 115 | classifiers=CLASSIFIERS, 116 | test_suite='tests', 117 | packages=setuptools.find_packages(exclude=['tests', 'tests.*']), 118 | install_requires=[req for req in read('requirements.txt').split('\\n') if req != ''], 119 | cmdclass={'test': PyTest, 'docker_up': DockerUpCommand, 'docker_down': DockerDownCommand}, 120 | tests_require=['pytest-cov', 'pytest'], 121 | command_options=command_options, 122 | entry_points={ 
123 | 'console_scripts': CONSOLE_SCRIPTS, 124 | }, 125 | ) 126 | 127 | if __name__ == '__main__': 128 | setup_package() 129 | -------------------------------------------------------------------------------- /tests/test_broker.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import MagicMock 3 | 4 | from bubuku.broker import BrokerManager, LeaderElectionInProgress, StartupTimeout 5 | from bubuku.process import KafkaProcess 6 | from test_config import build_test_properties 7 | 8 | zk_fake_host = 'zk_host:8181/path' 9 | 10 | 11 | class FakeProcessManager(KafkaProcess): 12 | def __init__(self): 13 | super().__init__('') 14 | self.running = False 15 | 16 | def start(self, settings_file): 17 | if self.running: 18 | raise Exception('Start second time') 19 | self.running = True 20 | 21 | def stop_and_wait(self): 22 | self.running = False 23 | 24 | def is_running(self) -> bool: 25 | return self.running 26 | 27 | 28 | def _prepare_for_start_fail(broker_ids, leader, isr): 29 | exhibitor = MagicMock() 30 | exhibitor.get_broker_ids.return_value = broker_ids 31 | exhibitor.load_partition_states.return_value = [ 32 | ('t0', 0, {'leader': int(leader), 'isr': [int(i) for i in isr]})] 33 | 34 | id_manager = MagicMock() 35 | id_manager.get_broker_id = lambda: '1' 36 | kafka_props = build_test_properties() 37 | 38 | broker = BrokerManager(FakeProcessManager(), exhibitor, id_manager, kafka_props, 39 | StartupTimeout.build({'type': 'linear'})) 40 | 41 | kafka_props.set_property('unclean.leader.election.enable', 'false') 42 | return kafka_props, broker 43 | 44 | 45 | class TestBroker(unittest.TestCase): 46 | def test_broker_checks_death(self): 47 | exhibitor = MagicMock() 48 | states = [2, 2] 49 | 50 | def _load_states(topics=None): 51 | for idx in range(0, len(states)): 52 | states[idx] -= 1 53 | return [ 54 | ('t1', 0, {'leader': states[0], 'isr': [1, 3] if states[0] >= 1 else [3]}), 55 | ('t2', 0, {'leader': states[1], 'isr': [1, 3] if states[1] >= 1 else [3]}) 56 | ] 57 | 58 | exhibitor.load_partition_states = _load_states 59 | 60 | id_manager = MagicMock() 61 | id_manager.get_broker_id = lambda: '1' 62 | kafka_props = build_test_properties() 63 | kafka_props.set_property('unclean.leader.election.enable', 'true') 64 | 65 | manager = BrokerManager(FakeProcessManager(), exhibitor, id_manager, kafka_props, 66 | StartupTimeout.build({'type': 'linear'})) 67 | 68 | assert not manager.has_leadership() 69 | 70 | kafka_props.set_property('unclean.leader.election.enable', 'false') 71 | assert manager.has_leadership() 72 | assert not manager.has_leadership() 73 | 74 | def test_broker_start_success_isr(self): 75 | kafka_props, broker = _prepare_for_start_fail(['1', '2'], 1, [3, 4]) 76 | # suppose that leader exists, but isr - not 77 | broker.start_kafka_process(zk_fake_host) 78 | 79 | def test_broker_start_fail_isr(self): 80 | kafka_props, broker = _prepare_for_start_fail(['1', '2'], 3, [4, 2]) 81 | # suppose that leader is not present 82 | try: 83 | broker.start_kafka_process(zk_fake_host) 84 | assert False, 'broker 2 must be in leaders, it must be impossible to start 1' 85 | except LeaderElectionInProgress: 86 | pass 87 | 88 | def test_broker_start_fail_leader(self): 89 | kafka_props, broker = _prepare_for_start_fail(['1', '2'], 3, [1, 5]) 90 | # suppose that broker is free to start 91 | try: 92 | broker.start_kafka_process(zk_fake_host) 93 | assert False, 'Broker must not start in case where it''s possible to change leader' 94 | except 
LeaderElectionInProgress: 95 | pass 96 | 97 | def test_broker_start_success_no_leader_candidate(self): 98 | kafka_props, broker = _prepare_for_start_fail(['1', '2'], 3, [4, 5]) 99 | # suppose that broker is free to start 100 | broker.start_kafka_process(zk_fake_host) 101 | 102 | def test_broker_start_success_unclean_1(self): 103 | kafka_props, broker = _prepare_for_start_fail(['1', '2'], 1, [1, 2]) 104 | kafka_props.delete_property('unclean.leader.election.enable') 105 | # suppose that broker is free to start 106 | broker.start_kafka_process(zk_fake_host) 107 | 108 | def test_broker_start_success_unclean_2(self): 109 | kafka_props, broker = _prepare_for_start_fail(['1', '2'], 1, [1, 2]) 110 | kafka_props.set_property('unclean.leader.election.enable', 'true') 111 | # suppose that broker is free to start 112 | broker.start_kafka_process(zk_fake_host) 113 | 114 | def test_broker_start_fail_no_zk_conn(self): 115 | kafka_props, broker = _prepare_for_start_fail(['1', '2'], 3, [1, 5]) 116 | try: 117 | broker.start_kafka_process(zk_fake_host) 118 | assert False, 'Broker must not start in case there is no connection to zk' 119 | except Exception as e: 120 | error_msg = str(e) 121 | assert error_msg != 'No connection to zookeeper' 122 | -------------------------------------------------------------------------------- /tests/test_broker_id_generator.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from bubuku.id_extractor import _search_broker_id 4 | 5 | 6 | class TestBrokerIdExtractor(unittest.TestCase): 7 | def test_match_valid(self): 8 | assert '123534' == _search_broker_id(['broker.id=123534']) 9 | assert '123534' == _search_broker_id(['\tbroker.id=123534']) 10 | assert '123534' == _search_broker_id(['\tbroker.id=123534\n']) 11 | assert '123534' == _search_broker_id(['broker.id=123534 \n\r']) 12 | assert '123534' == _search_broker_id(['\tbroker.id=123534 \r']) 13 | assert '123534' == _search_broker_id(['xbroker.id=1', 'broker.id=123534']) 14 | assert '123534' == _search_broker_id(['broker.id=123534', 'boker.id=123534']) 15 | 16 | def test_match_invalid(self): 17 | assert _search_broker_id([]) is None 18 | assert _search_broker_id(['broker_id=123534']) is None 19 | assert _search_broker_id(['xbroker.id=1', 'broker.id=12f3534']) is None 20 | assert _search_broker_id(['bruker.id=123534', 'boker.id=123534']) is None 21 | -------------------------------------------------------------------------------- /tests/test_check_time_period.py: -------------------------------------------------------------------------------- 1 | from bubuku.controller import Check, Change 2 | from time import sleep 3 | 4 | 5 | def test_check_time_period(): 6 | test_check = _TestCheck() 7 | 8 | assert test_check.check_if_time() is not None # first time it should always run 9 | assert test_check.check_if_time() is None # time has not come yet 10 | 11 | sleep(1) 12 | assert test_check.time_till_check() < 0 # time to run the check 13 | assert test_check.check_if_time() is not None # should run the check 14 | assert 0.0 < test_check.time_till_check() < 1 # there's still some time before the check can be run again 15 | 16 | 17 | class _TestCheck(Check): 18 | def __init__(self): 19 | super().__init__(check_interval_s=0.5) 20 | 21 | def check(self) -> Change: 22 | return Change() 23 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | from 
bubuku.cli import _print_table, _dump_replica_assignment_as_json 2 | 3 | 4 | def test_print_table(): 5 | lines = [] 6 | _print_table([{'Test': 1, 'Test2': '123456789'}, {'Test2': 'Test1', 'Test3': None}], lambda x: lines.append(x)) 7 | assert len(lines) == 3 8 | assert lines[0] == 'Test Test2 Test3' 9 | assert lines[1] == '1 123456789 ' 10 | assert lines[2] == ' Test1 None ' 11 | 12 | 13 | def test_dump_replica_assignment(): 14 | assert _dump_replica_assignment_as_json([('topic-a', "1")]) \ 15 | == '''{"version":1,"partitions":[{"topic":"topic-a","partition":1}]}''' 16 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tempfile import mkstemp 3 | 4 | from bubuku.config import KafkaProperties, load_config, _load_timeout_dict 5 | 6 | __PROPS = """ 7 | log.dirs=/data/kafka-logs 8 | auto.create.topics.enable=false 9 | delete.topic.enable=true 10 | auto.leader.rebalance.enable=true 11 | leader.imbalance.check.interval.seconds=100 12 | 13 | ### from http://kafka.apache.org/documentation.html#prodconfig 14 | 15 | # Replication configurations 16 | num.replica.fetchers=4 17 | replica.fetch.max.bytes=1048576 18 | replica.fetch.wait.max.ms=500 19 | replica.high.watermark.checkpoint.interval.ms=5000 20 | replica.socket.timeout.ms=30000 21 | replica.socket.receive.buffer.bytes=65536 22 | replica.lag.time.max.ms=10000 23 | replica.lag.max.messages=4000 24 | 25 | controller.socket.timeout.ms=30000 26 | controller.message.queue.size=10 27 | 28 | # Log configuration 29 | #num.partitions=8 30 | #message.max.bytes=1000000 31 | #auto.create.topics.enable=true 32 | log.index.interval.bytes=4096 33 | log.index.size.max.bytes=10485760 34 | log.retention.hours=168 35 | log.flush.interval.ms=10000 36 | log.flush.interval.messages=20000 37 | log.flush.scheduler.interval.ms=2000 38 | log.roll.hours=168 39 | log.retention.check.interval.ms=300000 40 | log.segment.bytes=1073741824 41 | 42 | # ZK configuration 43 | zookeeper.connection.timeout.ms=6000 44 | zookeeper.sync.time.ms=2000 45 | 46 | # Socket server configuration 47 | num.io.threads=8 48 | num.network.threads=8 49 | socket.request.max.bytes=104857600 50 | socket.receive.buffer.bytes=1048576 51 | socket.send.buffer.bytes=1048576 52 | queued.max.requests=16 53 | fetch.purgatory.purge.interval.requests=100 54 | producer.purgatory.purge.interval.requests=100 55 | """ 56 | 57 | __FNAME = '' 58 | 59 | 60 | def build_test_properties(): 61 | __create_kafak_props_file() 62 | return KafkaProperties(__FNAME, __FNAME) 63 | 64 | 65 | def __create_kafak_props_file(): 66 | global __FNAME 67 | if not __FNAME: 68 | _, __FNAME = mkstemp(text=True) 69 | with open(__FNAME, 'w') as fd: 70 | fd.write(__PROPS) 71 | 72 | 73 | __create_kafak_props_file() 74 | 75 | 76 | def test_parse_kafka_properties(): 77 | props = build_test_properties() 78 | 79 | assert props.get_property('log.retention.hours') == '168' 80 | 81 | 82 | def test_update_kafka_properties(): 83 | props = build_test_properties() 84 | 85 | assert '100' == props.get_property('producer.purgatory.purge.interval.requests') 86 | 87 | props.set_property('producer.purgatory.purge.interval.requests', '180') 88 | 89 | assert '180' == props.get_property('producer.purgatory.purge.interval.requests') 90 | 91 | props.dump() 92 | 93 | props2 = build_test_properties() 94 | 95 | assert '180' == props2.get_property('producer.purgatory.purge.interval.requests') 96 | 97 | 98 | def 
test_zk_prefix_replacement(): 99 | if os.getenv('ZOOKEEPER_PREFIX', None): 100 | os.unsetenv('ZOOKEEPER_PREFIX') 101 | assert load_config().zk_prefix == '/' 102 | 103 | os.environ['ZOOKEEPER_PREFIX'] = '/' 104 | assert load_config().zk_prefix == '/' 105 | 106 | os.environ['ZOOKEEPER_PREFIX'] = 'test' 107 | assert load_config().zk_prefix == '/test' 108 | 109 | os.environ['ZOOKEEPER_PREFIX'] = '/test' 110 | assert load_config().zk_prefix == '/test' 111 | 112 | 113 | def test_parse_timeout(): 114 | assert {'type': 'linear', 'initial': '300', 'step': '60'} == _load_timeout_dict( 115 | {'STARTUP_TIMEOUT_TYPE': 'linear', 'STARTUP_TIMEOUT_INITIAL': '300', 'STARTUP_TIMEOUT_STEP': '60'}.get) 116 | assert {'type': 'linear', 'step': '60'} == _load_timeout_dict( 117 | {'STARTUP_TIMEOUT_TYPE': 'linear', 'STARTUP_TIMEOUT_STEP': '60'}.get) 118 | assert {'initial': '300', 'step': '60'} == _load_timeout_dict( 119 | {'STARTUP_TIMEOUT_INITIAL': '300', 'STARTUP_TIMEOUT_STEP': '60'}.get) 120 | -------------------------------------------------------------------------------- /tests/test_controller.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock 2 | 3 | from bubuku.controller import Controller, Check, Change, _exclude_self 4 | 5 | 6 | def test_exculde_self(): 7 | 8 | assert sorted(['test1', 'test2']) == sorted(_exclude_self('127.0.0.1', 'xxx', { 9 | 'test1': '127.0.0.1', 10 | 'test2': '127.0.0.2', 11 | 'xxx': '127.0.0.1', 12 | })) 13 | 14 | 15 | def test_multiple_changes_are_executed_one_by_one(): 16 | running_count = [3, 3, 3] 17 | 18 | class FakeChange(Change): 19 | def __init__(self, index): 20 | self.index = index 21 | 22 | def get_name(self): 23 | return 'fake' 24 | 25 | def can_run(self, current_actions): 26 | return True 27 | 28 | def run(self, current_actions): 29 | running_count[self.index] -= 1 30 | return running_count[self.index] > 0 31 | 32 | class FakeCheck(Check): 33 | def __init__(self): 34 | super().__init__(0) 35 | self.changes_limit = 3 36 | self.changes_issued = 0 37 | 38 | def check(self): 39 | if self.changes_issued < self.changes_limit: 40 | self.changes_issued += 1 41 | return FakeChange(self.changes_issued - 1) 42 | 43 | current_changes = {} 44 | zk = MagicMock() 45 | zk.get_running_changes.return_value = current_changes 46 | zk.register_change = lambda x, y: current_changes.update({x: y}) 47 | zk.unregister_change = lambda x: current_changes.pop(x) 48 | 49 | controller = Controller(MagicMock(), zk, MagicMock()) 50 | controller.provider_id = 'fake' 51 | controller.add_check(FakeCheck()) 52 | 53 | assert [3, 3, 3] == running_count 54 | controller.make_step() 55 | assert not current_changes 56 | assert [3, 3, 3] == running_count 57 | controller.make_step() 58 | assert current_changes 59 | assert [2, 3, 3] == running_count 60 | controller.make_step() 61 | assert [1, 3, 3] == running_count 62 | controller.make_step() 63 | assert [0, 3, 3] == running_count 64 | controller.make_step() 65 | assert [0, 2, 3] == running_count 66 | controller.make_step() 67 | assert [0, 1, 3] == running_count 68 | controller.make_step() 69 | assert [0, 0, 3] == running_count 70 | controller.make_step() 71 | assert [0, 0, 2] == running_count 72 | controller.make_step() 73 | assert [0, 0, 1] == running_count 74 | assert current_changes 75 | controller.make_step() 76 | assert [0, 0, 0] == running_count 77 | assert not current_changes 78 | controller.make_step() 79 | assert [0, 0, 0] == running_count 80 | assert not current_changes 81 | 
-------------------------------------------------------------------------------- /tests/test_daemon.py: -------------------------------------------------------------------------------- 1 | from unittest.mock import MagicMock 2 | 3 | from bubuku.daemon import apply_features 4 | from bubuku.features.rebalance.check import RebalanceOnStartCheck, RebalanceOnBrokerListCheck 5 | from bubuku.features.restart_on_zk_change import CheckExhibitorAddressChanged 6 | from bubuku.features.terminate import get_registration 7 | from test_config import build_test_properties 8 | 9 | 10 | class _TestController(object): 11 | def __init__(self): 12 | self.checks = [] 13 | 14 | def add_check(self, check): 15 | self.checks.append(check) 16 | 17 | def test_load_restart_on_exhibitor(): 18 | exhibitor = object() 19 | broker = object() 20 | 21 | controller = _TestController() 22 | 23 | apply_features(-1, {'restart_on_exhibitor': {}}, controller, exhibitor, broker, None, None) 24 | 25 | assert len(controller.checks) == 1 26 | check = controller.checks[0] 27 | assert type(check) == CheckExhibitorAddressChanged 28 | assert check.zk == exhibitor 29 | assert check.broker == broker 30 | 31 | 32 | def test_rebalance_on_start(): 33 | exhibitor = object() 34 | broker = object() 35 | 36 | controller = _TestController() 37 | 38 | apply_features(-1, {'rebalance_on_start': {}}, controller, exhibitor, broker, None, None) 39 | 40 | assert len(controller.checks) == 1 41 | check = controller.checks[0] 42 | assert type(check) == RebalanceOnStartCheck 43 | assert check.zk == exhibitor 44 | assert check.broker == broker 45 | assert not check.executed 46 | 47 | 48 | def test_rebalance_on_broker_list_change(): 49 | exhibitor = object() 50 | broker = object() 51 | 52 | controller = _TestController() 53 | 54 | apply_features(-1, {'rebalance_on_brokers_change': {}}, controller, exhibitor, broker, None, None) 55 | 56 | assert len(controller.checks) == 1 57 | check = controller.checks[0] 58 | assert type(check) == RebalanceOnBrokerListCheck 59 | assert check.zk == exhibitor 60 | assert check.broker == broker 61 | 62 | 63 | def test_graceful_terminate(): 64 | c, b = get_registration() 65 | assert c is None 66 | assert b is None 67 | 68 | broker = object() 69 | 70 | controller = _TestController() 71 | 72 | apply_features(-1, {'graceful_terminate': {}}, controller, None, broker, None, None) 73 | 74 | assert len(controller.checks) == 0 75 | 76 | c, b = get_registration() 77 | assert c == controller 78 | assert b == broker 79 | 80 | 81 | def test_use_ip_address_default(): 82 | props = build_test_properties() 83 | 84 | amazon = MagicMock() 85 | amazon.get_ip = MagicMock(return_value='172.31.146.57') 86 | 87 | apply_features(-1, {'use_ip_address': {}}, None, None, None, props, amazon) 88 | 89 | assert props.get_property('advertised.listeners') == 'PLAINTEXT://172.31.146.57:9092' 90 | assert props.get_property('listeners') == 'PLAINTEXT://0.0.0.0:9092' 91 | 92 | 93 | def test_use_ip_address_custom(): 94 | props = build_test_properties() 95 | props.set_property("listeners", "CUSTOM://:9094,CUSTOM2://:9095,CUSTOM2://:9095") 96 | props.set_property("advertised.listeners", "CUSTOM://:9094,CUSTOM2://:9095,CUSTOM2://:9095") 97 | 98 | amazon = MagicMock() 99 | amazon.get_ip = MagicMock(return_value='172.31.146.57') 100 | 101 | apply_features(-1, {'use_ip_address': {}}, None, None, None, props, amazon) 102 | 103 | print(props.get_property('advertised.listeners')) 104 | assert props.get_property('advertised.listeners') == 
'CUSTOM2://172.31.146.57:9095,CUSTOM://172.31.146.57:9094' 105 | assert props.get_property('listeners') == 'CUSTOM2://0.0.0.0:9095,CUSTOM://0.0.0.0:9094' 106 | -------------------------------------------------------------------------------- /tests/test_exhibitor.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import MagicMock 3 | 4 | from bubuku.zookeeper.exhibitor import ExhibitorAddressProvider 5 | 6 | 7 | class ExhibitorAddressProviderTest(unittest.TestCase): 8 | def test_get_latest_address(self): 9 | address_provider = ExhibitorAddressProvider(lambda: ['aws-lb-1', 'aws-lb-2']) 10 | address_provider._query_exhibitors = lambda _: {'servers': ['aws-lb-1-new'], 'port': 99} 11 | 12 | actual_result = address_provider.get_latest_address() 13 | 14 | assert actual_result == (['aws-lb-1-new'], 99) 15 | 16 | def test_get_latest_address_no_exhibitors(self): 17 | address_provider = ExhibitorAddressProvider(lambda: ['aws-lb-1', 'aws-lb-2']) 18 | address_provider._query_exhibitors = lambda _: None 19 | 20 | actual_result = address_provider.get_latest_address() 21 | assert actual_result is None 22 | 23 | def test_get_latest_address_2(self): 24 | address_provider = ExhibitorAddressProvider(lambda: ['aws-lb-1', 'aws-lb-2']) 25 | address_provider._query_exhibitors = MagicMock() 26 | address_provider._query_exhibitors.side_effect = [None, {'servers': ['aws-lb-1-new'], 'port': 99}] 27 | 28 | actual_result = address_provider.get_latest_address() 29 | 30 | assert address_provider._query_exhibitors.call_count == 2 31 | assert actual_result == (['aws-lb-1-new'], 99) 32 | 33 | def test_addresses_are_sorted(self): 34 | address_provider = ExhibitorAddressProvider(lambda: ['aws-lb-1', 'aws-lb-2']) 35 | address_provider._query_exhibitors = lambda _: {'servers': ['1', '2', '3'], 'port': '1234'} 36 | tmp_result = address_provider.get_latest_address() 37 | 38 | # Check that two calls in sequence will return the same value 39 | assert tmp_result == address_provider.get_latest_address() 40 | 41 | # Check sort 1 42 | address_provider._query_exhibitors = lambda _: {'servers': ['2', '1', '3'], 'port': '1234'} 43 | assert tmp_result == address_provider.get_latest_address() 44 | 45 | # Check sort again (just to be sure) 46 | address_provider._query_exhibitors = lambda _: {'servers': ['3', '2', '1'], 'port': '1234'} 47 | assert tmp_result == address_provider.get_latest_address() 48 | 49 | -------------------------------------------------------------------------------- /tests/test_migrate.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import MagicMock 3 | 4 | from bubuku.features.migrate import MigrationChange 5 | 6 | 7 | class TestMigrate(unittest.TestCase): 8 | def test_migration_all_steps(self): 9 | partitions = { 10 | ('test', 0): [1, 2, 3], 11 | ('test', 1): [2, 3, 1], 12 | ('test1', 0): [3, 2, 1], 13 | } 14 | zk = MagicMock() 15 | zk.is_rebalancing = lambda: False 16 | zk.load_partition_assignment = lambda: [(k[0], k[1], v) for k, v in partitions.items()] 17 | result = {} 18 | 19 | def _reallocate_partition(t, p, r): 20 | result.update({(t, p): r}) 21 | return True 22 | 23 | def _reallocate_partitions(items): 24 | for item in items: 25 | _reallocate_partition(*item) 26 | return True 27 | 28 | zk.reallocate_partition = _reallocate_partition 29 | zk.reallocate_partitions = _reallocate_partitions 30 | zk.get_broker_ids = lambda: [1, 2, 3, 4, 5, 6] 31 | 32 | change = 
MigrationChange(zk, [1, 2, 3], [4, 5, 6], False) 33 | while change.run([]): 34 | pass 35 | expected = { 36 | ('test', 0): [1, 2, 3, 4, 5, 6], 37 | ('test', 1): [2, 3, 1, 5, 6, 4], 38 | ('test1', 0): [3, 2, 1, 6, 5, 4], 39 | } 40 | assert expected == result 41 | 42 | zk.load_partition_assignment = lambda: [(k[0], k[1], v) for k, v in expected.items()] 43 | result.clear() 44 | 45 | change = MigrationChange(zk, [1, 2, 3], [4, 5, 6], True) 46 | while change.run([]): 47 | pass 48 | 49 | expected = { 50 | ('test', 0): [4, 5, 6], 51 | ('test', 1): [5, 6, 4], 52 | ('test1', 0): [6, 5, 4], 53 | } 54 | 55 | assert expected == result 56 | 57 | def test_replica_generation_no_shrink(self): 58 | change = MigrationChange(MagicMock(), [1, 2, 3], [4, 5, 6], False) 59 | 60 | assert [4, 5, 6] == change._replace_replicas([4, 5, 6]) 61 | assert [1, 2, 3, 4, 5, 6] == change._replace_replicas([1, 2, 3]) 62 | assert [1, 2, 6, 4, 5] == change._replace_replicas([1, 2, 6]) 63 | assert [1, 6, 2, 4, 5] == change._replace_replicas([1, 6, 2]) 64 | assert [1, 6, 3, 4] == change._replace_replicas([1, 6, 3]) 65 | 66 | def test_replica_generation_shrink(self): 67 | change = MigrationChange(MagicMock(), [1, 2, 3], [4, 5, 6], True) 68 | 69 | assert [4, 5, 6] == change._replace_replicas([1, 2, 3]) 70 | assert [4, 5, 6] == change._replace_replicas([4, 2, 6]) 71 | assert [8, 5, 10] == change._replace_replicas([8, 2, 10]) 72 | assert [4, 8, 5] == change._replace_replicas([1, 8, 2]) 73 | assert [4, 5, 6] == change._replace_replicas([1, 2, 3, 4, 5, 6]) 74 | -------------------------------------------------------------------------------- /tests/test_partitions_swap.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import MagicMock 3 | 4 | from bubuku.features.swap_partitions import CheckBrokersDiskImbalance, SwapPartitionsChange, load_swap_data 5 | 6 | 7 | class TestPartitionsSwap(unittest.TestCase): 8 | test_size_stats = { 9 | "111": {"disk": {"free_kb": 20000, "used_kb": 20000}, "topics": { 10 | "t1": {"1": 3434, "2": 200}, 11 | "t2": {"1": 1000}, 12 | "t3": {"1": 300} 13 | }}, 14 | "222": {"disk": {"free_kb": 25000, "used_kb": 15000}, "topics": { 15 | "t1": {"2": 200}, 16 | "t2": {"1": 1000, "2": 100}, 17 | "t3": {"2": 2000} 18 | }}, 19 | "333": {"disk": {"free_kb": 30000, "used_kb": 10000}, "topics": { 20 | "t1": {"1": 3434}, 21 | "t2": {"2": 100}, 22 | "t3": {"1": 300, "2": 2000} 23 | }} 24 | } 25 | 26 | test_assignment = [ 27 | ("t1", 1, [111, 333]), 28 | ("t1", 2, [111, 222]), 29 | ("t2", 1, [222, 111]), 30 | ("t2", 2, [222, 333]), 31 | ("t3", 1, [333, 111]), 32 | ("t3", 2, [333, 222]), 33 | ] 34 | 35 | test_broker_racks_unaware = { 36 | 111: None, 37 | 222: None, 38 | 333: None 39 | } 40 | 41 | test_size_stats_nine = { 42 | "111": {"disk": {"free_kb": 20000, "used_kb": 20000}, "topics": { 43 | "t1": {"1": 3434, "2": 200}, 44 | "t2": {"1": 1000}, 45 | "t3": {"1": 300} 46 | }}, 47 | "222": {"disk": {"free_kb": 25000, "used_kb": 15000}, "topics": { 48 | "t1": {"2": 200}, 49 | "t2": {"1": 1000, "2": 100}, 50 | "t3": {"2": 2000} 51 | }}, 52 | "333": {"disk": {"free_kb": 30000, "used_kb": 10000}, "topics": { 53 | "t1": {"1": 3434}, 54 | "t2": {"2": 100}, 55 | "t3": {"1": 300, "2": 2000} 56 | }}, 57 | "444": {"disk": {"free_kb": 21000, "used_kb": 19000}, "topics": { 58 | "t4": {"1": 3434, "2": 200}, 59 | "t5": {"1": 1000}, 60 | "t6": {"1": 300} 61 | }}, 62 | "555": {"disk": {"free_kb": 10000, "used_kb": 30000}, "topics": { 63 | "t4": {"2": 200}, 64 | 
"t5": {"1": 1000, "2": 100}, 65 | "t6": {"2": 2000} 66 | }}, 67 | "666": {"disk": {"free_kb": 22000, "used_kb": 18000}, "topics": { 68 | "t4": {"1": 3434}, 69 | "t5": {"2": 100}, 70 | "t6": {"1": 300, "2": 2000} 71 | }}, 72 | "777": {"disk": {"free_kb": 23000, "used_kb": 17000}, "topics": { 73 | "t7": {"1": 3434, "2": 200}, 74 | "t8": {"1": 1000}, 75 | "t9": {"1": 300} 76 | }}, 77 | "888": {"disk": {"free_kb": 24000, "used_kb": 16000}, "topics": { 78 | "t7": {"2": 200}, 79 | "t8": {"1": 1000, "2": 100}, 80 | "t9": {"2": 2000} 81 | }}, 82 | "999": {"disk": {"free_kb": 26000, "used_kb": 14000}, "topics": { 83 | "t7": {"1": 3434}, 84 | "t8": {"2": 100}, 85 | "t9": {"1": 300, "2": 2000} 86 | }} 87 | } 88 | 89 | test_assignment_nine = [ 90 | ("t1", 1, [111, 333]), 91 | ("t1", 2, [111, 222]), 92 | ("t2", 1, [222, 111]), 93 | ("t2", 2, [222, 333]), 94 | ("t3", 1, [333, 111]), 95 | ("t3", 2, [333, 222]), 96 | ("t4", 1, [444, 666]), 97 | ("t4", 2, [444, 555]), 98 | ("t5", 1, [555, 444]), 99 | ("t5", 2, [555, 666]), 100 | ("t6", 1, [666, 444]), 101 | ("t6", 2, [666, 555]), 102 | ("t7", 1, [777, 999]), 103 | ("t7", 2, [777, 888]), 104 | ("t8", 1, [888, 777]), 105 | ("t8", 2, [888, 999]), 106 | ("t9", 1, [999, 777]), 107 | ("t9", 2, [999, 888]), 108 | ] 109 | 110 | test_broker_racks_aware = { 111 | 111: "eu-central-1a", 112 | 222: "eu-central-1b", 113 | 333: "eu-central-1c", 114 | 444: "eu-central-1a", 115 | 555: "eu-central-1b", 116 | 666: "eu-central-1c", 117 | 777: "eu-central-1a", 118 | 888: "eu-central-1b", 119 | 999: "eu-central-1c", 120 | } 121 | 122 | def setUp(self): 123 | self.zk = self.__mock_zk() 124 | self.broker = self.__mock_broker() 125 | 126 | def test_check_requires_swap_partitions_change(self): 127 | check_imbalance = CheckBrokersDiskImbalance(self.zk, self.broker, 3000, -1) 128 | change = check_imbalance.check() 129 | 130 | assert change 131 | 132 | def test_self_fat_slim_brokers_rack_aware(self): 133 | zk = self.__mock_zk_rack() 134 | 135 | slim, fat, gap, stats = load_swap_data(zk, -1, 100) 136 | assert fat == 555 137 | assert slim == 222 138 | 139 | def test_check_requires_swap_partitions_change_rack_aware(self): 140 | self.zk = self.__mock_zk_rack() 141 | check_imbalance = CheckBrokersDiskImbalance(self.zk, self.broker, 3000, -1) 142 | change = check_imbalance.check() 143 | 144 | assert change 145 | 146 | def test_check_requires_not_swap_partitions_change(self): 147 | check_imbalance = CheckBrokersDiskImbalance(self.zk, self.broker, 15000, -1) 148 | change = check_imbalance.check() 149 | 150 | # change should not be created as the gap between brokers is less than threshold 151 | assert not change 152 | 153 | def test_swap_partitions_change_performed(self): 154 | def _swap_data_provider(zk): 155 | return load_swap_data(zk, -1, 10000) 156 | 157 | swap_change = SwapPartitionsChange(self.zk, _swap_data_provider) 158 | result = swap_change.run([]) 159 | 160 | assert not result 161 | self.zk.reallocate_partitions.assert_called_with([('t2', 2, [222, 111]), ('t2', 1, [222, 333])]) 162 | 163 | def test_swap_partitions_change_not_performed(self): 164 | swap_change = SwapPartitionsChange(self.zk, lambda x: load_swap_data(x, -1, 10001)) 165 | result = swap_change.run([]) 166 | 167 | # change should not trigger partitions swap as there is no possible 168 | # partitions swap that will decrease the gap between brokers 169 | assert not result 170 | self.zk.reallocate_partitions.assert_not_called() 171 | 172 | def test_swap_partitions_change_postponed(self): 173 | 
self.zk.reallocate_partitions.return_value = False 174 | 175 | swap_change = SwapPartitionsChange(self.zk, lambda x: load_swap_data(x, -1, 10000)) 176 | result = swap_change.run([]) 177 | 178 | # if the write to ZK wasn't possible for some reason, the change should 179 | # return True and repeat write to ZK during next trigger by controller 180 | assert result 181 | assert swap_change.to_move == [('t2', 2, [222, 111]), ('t2', 1, [222, 333])] 182 | 183 | def test_swap_partitions_change_postponed_when_rebalancing(self): 184 | self.zk.is_rebalancing.return_value = True 185 | 186 | swap_change = SwapPartitionsChange(self.zk, None) 187 | result = swap_change.run([]) 188 | 189 | # if there was a rebalance node in ZK - the change should be postponed 190 | assert result 191 | assert not swap_change.to_move 192 | 193 | def test_swap_partitions_change_performed_existing(self): 194 | swap_change = SwapPartitionsChange(self.zk, None) 195 | dummy_move_list = ["dummy"] 196 | swap_change.to_move = ["dummy"] 197 | result = swap_change.run([]) 198 | 199 | # if there already was a pair of partitions to swap in to_move 200 | # property - SwapPartitionsChange should just execute this swap 201 | assert not result 202 | self.zk.reallocate_partitions.assert_called_with(dummy_move_list) 203 | self.zk.load_partition_assignment.assert_not_called() 204 | 205 | def __mock_broker(self) -> MagicMock: 206 | broker = MagicMock() 207 | broker.is_running_and_registered.return_value = True 208 | return broker 209 | 210 | def __mock_zk(self) -> MagicMock: 211 | zk = MagicMock() 212 | zk.is_rebalancing.return_value = False 213 | zk.load_partition_assignment.return_value = self.test_assignment 214 | zk.get_disk_stats.return_value = self.test_size_stats 215 | zk.get_broker_racks.return_value = self.test_broker_racks_unaware 216 | return zk 217 | 218 | def __mock_zk_rack(self) -> MagicMock: 219 | zk = MagicMock() 220 | zk.is_rebalancing.return_value = False 221 | zk.load_partition_assignment.return_value = self.test_assignment_nine 222 | zk.get_disk_stats.return_value = self.test_size_stats_nine 223 | zk.get_broker_racks.return_value = self.test_broker_racks_aware 224 | return zk 225 | -------------------------------------------------------------------------------- /tests/test_rebalance.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import unittest 3 | from typing import Dict 4 | from unittest.mock import MagicMock 5 | 6 | from kazoo.exceptions import NoNodeError 7 | 8 | from bubuku.features.rebalance import BaseRebalanceChange 9 | from bubuku.features.rebalance.change import OptimizedRebalanceChange 10 | from bubuku.features.rebalance.change_simple import SimpleRebalanceChange 11 | from bubuku.features.rebalance.check import RebalanceOnBrokerListCheck 12 | from bubuku.zookeeper import BukuExhibitor 13 | 14 | 15 | def _verify_balanced(broker_ids, distribution, delta=1): 16 | per_broker_data = {k: {'leaders': 0, 'total': 0} for k in broker_ids} 17 | for broker_ids in distribution.values(): 18 | per_broker_data[broker_ids[0]]['leaders'] += 1 19 | for b in broker_ids: 20 | per_broker_data[b]['total'] += 1 21 | assert len([bb for bb in broker_ids if bb == b]) == 1 22 | min_leaders = min(k['leaders'] for k in per_broker_data.values()) 23 | max_leaders = max(k['leaders'] for k in per_broker_data.values()) 24 | 25 | assert (max_leaders - min_leaders) <= delta 26 | 27 | min_total = min(k['total'] for k in per_broker_data.values()) 28 | max_total = max(k['total'] for k in 
per_broker_data.values()) 29 | 30 | assert (max_total - min_total) <= delta 31 | 32 | 33 | def _verify_rack_aware(initial_distribution, final_distribution, racks): 34 | for (topic, partition) in initial_distribution.keys(): 35 | final_assignment = final_distribution[(topic, partition)] 36 | final_racks = _brokers_to_racks(final_assignment, racks) 37 | if len(racks) > len(final_assignment): 38 | assert (len(final_assignment) == len(set(final_racks))) 39 | else: 40 | assert (len(set(final_assignment)) == len(racks)) 41 | 42 | 43 | def _brokers_to_racks(brokers: list, racks: Dict[int, str]): 44 | return [racks[int(broker)] for broker in brokers] 45 | 46 | 47 | def _verify_empty_brokers(broker_ids, distribution): 48 | for brokers in distribution.values(): 49 | for broker in brokers: 50 | if broker in broker_ids: 51 | assert False 52 | assert True 53 | 54 | 55 | class TestRebalanceCheck(unittest.TestCase): 56 | 57 | def test_rebalance_invoked_on_broker_list_change(self): 58 | zk = MagicMock() 59 | 60 | zk.get = MagicMock(side_effect=NoNodeError) 61 | 62 | check = RebalanceOnBrokerListCheck(zk, MagicMock()) 63 | zk.get_broker_ids.return_value = ['1', '2', '3'] 64 | 65 | assert check.check() is not None 66 | assert check.check() is None 67 | zk.get_broker_ids.return_value = ['1', '2', '3'] 68 | assert check.check() is None 69 | zk.get_broker_ids.return_value = ['1', '2', '4'] 70 | assert check.check() is not None 71 | assert check.check() is None 72 | 73 | 74 | class TestBaseRebalance(unittest.TestCase): 75 | __test__ = False 76 | 77 | _correct_rack_assignment = True 78 | 79 | def createChange(self, zk, broker_ids, empty_brokers, exclude_topics, parallelism=1) -> BaseRebalanceChange: 80 | pass 81 | 82 | def _create_zk_for_topics(self, topic_data, broker_ids=None, racks=None) -> (list, BukuExhibitor): 83 | buku_proxy = MagicMock() 84 | actual_broker_ids = broker_ids if broker_ids else sorted(list( 85 | set(functools.reduce(lambda x, y: x + y, topic_data.values(), [])))) 86 | buku_proxy.get_broker_ids.return_value = actual_broker_ids 87 | brokers = broker_ids if broker_ids else sorted(list( 88 | set(functools.reduce(lambda x, y: x + y, topic_data.values(), [])))) 89 | if self._correct_rack_assignment: 90 | buku_proxy.get_broker_racks.return_value = {k: v for k, v in racks.items() if str(k) in actual_broker_ids} \ 91 | if racks else {int(broker_id): None for broker_id in brokers} 92 | else: 93 | buku_proxy.get_broker_racks.return_value = racks if racks else {int(broker_id): None for broker_id in 94 | brokers} 95 | 96 | def _load_assignment(): 97 | return [(k[0], int(k[1]), [int(p) for p in v]) for k, v in topic_data.items()] 98 | 99 | def _load_states(topics=None): 100 | return [(k[0], int(k[1]), {'isr': [int(p) for p in v]}) for k, v in topic_data.items()] 101 | 102 | buku_proxy.load_partition_assignment = _load_assignment 103 | buku_proxy.load_partition_states = _load_states 104 | buku_proxy.is_rebalancing.return_value = False 105 | 106 | def _reassign(topic, partition, replicas): 107 | topic_data[(topic, str(partition))] = [str(x) for x in replicas] 108 | return True 109 | 110 | def _reassign_many(items): 111 | for item in items: 112 | _reassign(*item) 113 | return True 114 | 115 | buku_proxy.reallocate_partition = _reassign 116 | buku_proxy.reallocate_partitions = _reassign_many 117 | return sorted(list(set(functools.reduce(lambda x, y: x + y, topic_data.values(), [])))), buku_proxy 118 | 119 | def test_rebalance_can_run(self): 120 | brokers, zk = self._create_zk_for_topics({}) 121 | o = 
self.createChange(zk, [], [], []) 122 | 123 | blocked_actions = ['restart', 'start', 'stop', 'rebalance'] 124 | 125 | # Check that can run in exact cases 126 | for a in blocked_actions: 127 | assert not o.can_run([a]) 128 | 129 | assert o.can_run(['xxx']) 130 | assert o.can_run([]) 131 | 132 | def test_rebalance_get_name(self): 133 | brokers, zk = self._create_zk_for_topics({}) 134 | o = self.createChange(zk, [], [], []) 135 | assert o.get_name() == 'rebalance' 136 | 137 | def test_rebalance_on_empty1(self): 138 | brokers, zk = self._create_zk_for_topics({}) 139 | o = self.createChange(zk, brokers, [], []) 140 | while o.run([]): 141 | pass 142 | 143 | def test_rebalance_on_filled1(self): 144 | distribution = { 145 | ('t0', '0'): ['2'], 146 | ('t0', '1'): ['1'], 147 | ('t0', '2'): ['1'], 148 | ('t0', '3'): ['1'], 149 | } 150 | brokers, zk = self._create_zk_for_topics(distribution) 151 | o = self.createChange(zk, brokers, [], []) 152 | # broker to partitions 153 | while o.run([]): 154 | pass 155 | 156 | _verify_balanced(('1', '2'), distribution) 157 | 158 | def test_rebalance_with_racks(self): 159 | distribution = { 160 | ('t0', '0'): ['3', '1'], 161 | ('t0', '1'): ['1', '5'], 162 | ('t0', '2'): ['5', '3'], 163 | ('t1', '0'): ['5', '3'], 164 | ('t1', '1'): ['3', '1'], 165 | ('t2', '0'): ['1', '5'], 166 | ('t2', '1'): ['2', '4'], 167 | ('t2', '2'): ['2', '6'], 168 | ('t2', '3'): ['4', '2'], 169 | ('t2', '4'): ['6', '2'], 170 | ('t2', '5'): ['3', '6'], 171 | ('t2', '6'): ['6', '3'], 172 | } 173 | 174 | initial_distribution = dict(distribution) 175 | 176 | racks = { 177 | 1: 'r1', 178 | 2: 'r1', 179 | 3: 'r2', 180 | 4: 'r2', 181 | 5: 'r3', 182 | 6: 'r3' 183 | } 184 | 185 | brokers, zk = self._create_zk_for_topics(distribution, ['1', '2', '3', '4', '5', '6'], racks) 186 | o = self.createChange(zk, brokers, [], []) 187 | while o.run([]): 188 | pass 189 | 190 | _verify_balanced(('1', '2', '3', '4', '5', '6'), distribution, 2) 191 | _verify_rack_aware(initial_distribution, distribution, racks) 192 | 193 | def test_rebalance_empty_one_broker(self): 194 | distribution = { 195 | ('t0', '0'): ['1', '2'], 196 | ('t0', '1'): ['2', '3'], 197 | ('t1', '0'): ['2', '3'], 198 | ('t1', '1'): ['3', '4'], 199 | } 200 | brokers, zk = self._create_zk_for_topics(distribution) 201 | o = self.createChange(zk, brokers, ['2'], []) 202 | while o.run([]): 203 | pass 204 | 205 | _verify_empty_brokers(('2'), distribution) 206 | 207 | def test_rebalance_empty_multiple_brokers(self): 208 | distribution = { 209 | ('t0', '0'): ['1', '2'], 210 | ('t0', '1'): ['2', '3'], 211 | ('t1', '0'): ['2', '3'], 212 | ('t1', '1'): ['3', '4'], 213 | ('t1', '2'): ['4', '5'], 214 | ('t2', '0'): ['3', '4'], 215 | ('t2', '1'): ['4', '5'], 216 | ('t2', '2'): ['5', '6'], 217 | } 218 | brokers, zk = self._create_zk_for_topics(distribution) 219 | o = self.createChange(zk, brokers, ['2', '3'], []) 220 | while o.run([]): 221 | pass 222 | 223 | _verify_empty_brokers(('2', '3'), distribution) 224 | 225 | def test_rebalance_empty_brokers_and_exclude_topics(self): 226 | distribution = { 227 | ('t0', '0'): ['1', '2'], 228 | ('t0', '1'): ['2', '3'], 229 | ('t1', '0'): ['2', '3'], 230 | ('t1', '1'): ['3', '4'], 231 | ('t1', '2'): ['4', '5'], 232 | ('t2', '0'): ['3', '4'], 233 | ('t2', '1'): ['4', '5'], 234 | ('t2', '2'): ['5', '6'], 235 | } 236 | brokers, zk = self._create_zk_for_topics(distribution) 237 | o = OptimizedRebalanceChange(zk, brokers, ['2', '3'], ['t1']) 238 | while o.run([]): 239 | pass 240 | 241 | assert distribution[('t1', '0')] == ['2', 
'3'] 242 | assert distribution[('t1', '1')] == ['3', '4'] 243 | assert distribution[('t1', '2')] == ['4', '5'] 244 | 245 | distribution.pop(('t1', '0')) 246 | distribution.pop(('t1', '1')) 247 | distribution.pop(('t1', '2')) 248 | 249 | brokers = [item for sublist in distribution.values() for item in sublist] 250 | assert '2' not in brokers 251 | assert '3' not in brokers 252 | 253 | def test_rebalance_on_filled2(self): 254 | distribution = { 255 | ('t0', '0'): ['2', '1'], 256 | ('t0', '1'): ['1', '2'], 257 | ('t0', '2'): ['1', '2'], 258 | ('t0', '3'): ['1', '2'], 259 | ('t0', '4'): ['1', '2'], 260 | ('t0', '5'): ['1', '2'], 261 | ('t0', '6'): ['1', '2'], 262 | } 263 | brokers, zk = self._create_zk_for_topics(distribution) 264 | o = self.createChange(zk, brokers, [], []) 265 | # broker to partitions 266 | while o.run([]): 267 | pass 268 | 269 | _verify_balanced(('1', '2'), distribution) 270 | 271 | def test_rebalance_with_dead_brokers(self): 272 | distribution = { 273 | ('t0', '0'): ['2', '1'], 274 | ('t0', '1'): ['1', '2'], 275 | ('t0', '2'): ['1', '2'], 276 | ('t0', '3'): ['1', '2'], 277 | ('t0', '4'): ['1', '2'], 278 | ('t0', '5'): ['1', '2'], 279 | ('t0', '6'): ['1', '2'], 280 | } 281 | _, zk = self._create_zk_for_topics(distribution, broker_ids=['1', '3'], racks={1: None, 2: None, 3: None}) 282 | o = self.createChange(zk, ['1', '3'], [], []) 283 | while o.run([]): 284 | pass 285 | _verify_balanced(['1', '3'], distribution) 286 | 287 | def test_rebalance_fail_with_not_enough_replicas(self): 288 | distribution = { 289 | ('t0', '0'): ['2', '1', '3'], 290 | ('t0', '1'): ['1', '2'], 291 | } 292 | 293 | _, zk = self._create_zk_for_topics(distribution, broker_ids=['1', '3']) 294 | o = self.createChange(zk, ['1', '3'], [], []) 295 | try: 296 | while o.run([]): 297 | pass 298 | assert False, "Balancing can not work with low replication factor" 299 | except Exception: 300 | pass 301 | 302 | def test_rebalance_recovered_with_additional_copy2(self): 303 | distribution = { 304 | ('t0', '0'): ['2', '1'], 305 | ('t0', '1'): ['1', '2'], 306 | ('t0', '2'): ['3', '4'] 307 | } 308 | _, zk = self._create_zk_for_topics(distribution, ['1', '2', '4'], racks={1: None, 2: None, 3: None, 4: None}) 309 | o = self.createChange(zk, ['1', '2', '4'], [], []) 310 | while o.run([]): 311 | pass 312 | _verify_balanced(['1', '2', '4'], distribution) 313 | 314 | def test_rebalance_with_many_topics(self): 315 | distribution = {} 316 | topic_count = 1000 317 | partition_count = 21 318 | broker_ids = [str(i) for i in range(1, 22)] 319 | for i in range(0, topic_count): 320 | topic = 't{}'.format(i) 321 | distribution.update({(topic, str(partition)): ['1', '2', '3'] for partition in range(0, partition_count)}) 322 | _, zk = self._create_zk_for_topics(distribution, broker_ids=broker_ids) 323 | 324 | o = self.createChange(zk, broker_ids, [], [], parallelism=1000) 325 | steps = 0 326 | while o.run([]): 327 | steps += 1 328 | _verify_balanced(broker_ids, distribution, 1) 329 | 330 | def test_leader_partition_limit(self): 331 | distribution = { 332 | ('t0', '0'): ['1', '2'], 333 | ('t0', '1'): ['1', '2'], 334 | ('t0', '2'): ['1', '2'], 335 | ('t1', '2'): ['1', '2'], 336 | } 337 | _, zk = self._create_zk_for_topics(distribution, ['2', '3'], racks={1: None, 2: None, 3: None}) 338 | o = self.createChange(zk, ['2', '3'], [], []) 339 | while o.run([]): 340 | pass 341 | _verify_balanced(['2', '3'], distribution) 342 | 343 | 344 | class OptimizedRebalanceTest(TestBaseRebalance): 345 | __test__ = True 346 | _correct_rack_assignment = 
False 347 | 348 | def createChange(self, zk, broker_ids, empty_brokers, exclude_topics, parallelism=1): 349 | return OptimizedRebalanceChange(zk, broker_ids, empty_brokers, exclude_topics, parallelism) 350 | 351 | def test_rebalance_recovered_with_additional_copy1(self): 352 | distribution = { 353 | ('t0', '0'): ['2', '1'], 354 | ('t0', '1'): ['1', '2'], 355 | ('t0', '2'): ['3', '4'] 356 | } 357 | _, zk = self._create_zk_for_topics(distribution, ['1', '2', '3'], racks={1: None, 2: None, 3: None, 4: None}) 358 | o = self.createChange(zk, ['1', '2', '3'], [], []) 359 | while o.run([]): 360 | pass 361 | _verify_balanced(['1', '2', '3'], distribution) 362 | 363 | 364 | class SimpleRebalanceTest(TestBaseRebalance): 365 | __test__ = True 366 | 367 | def createChange(self, zk, broker_ids, empty_brokers, exclude_topics, parallelism=1): 368 | return SimpleRebalanceChange(zk, 369 | broker_ids=broker_ids, 370 | empty_brokers=empty_brokers, 371 | exclude_topics=exclude_topics, 372 | parallelism=parallelism) 373 | 374 | def test_rebalance_with_racks_different_nr_partitions_per_rack(self): 375 | distribution = { 376 | ('t0', '0'): ['3', '1'], 377 | ('t0', '1'): ['6', '5'], 378 | ('t0', '2'): ['5', '3'], 379 | ('t1', '0'): ['5', '6'], 380 | ('t1', '1'): ['6', '5'], 381 | ('t2', '0'): ['6', '4'], 382 | ('t2', '1'): ['2', '6'], 383 | ('t2', '2'): ['2', '6'], 384 | ('t2', '3'): ['4', '2'], 385 | ('t2', '4'): ['6', '5'], 386 | ('t2', '5'): ['4', '6'], 387 | ('t2', '6'): ['6', '4'], 388 | } 389 | 390 | initial_distribution = dict(distribution) 391 | 392 | racks = { 393 | 1: 'r1', 394 | 2: 'r1', 395 | 3: 'r2', 396 | 4: 'r2', 397 | 5: 'r3', 398 | 6: 'r3' 399 | } 400 | 401 | brokers, zk = self._create_zk_for_topics(distribution, ['1', '2', '3', '4', '5', '6'], racks) 402 | o = self.createChange(zk, brokers, [], []) 403 | while o.run([]): 404 | pass 405 | 406 | _verify_rack_aware(initial_distribution, distribution, racks) 407 | -------------------------------------------------------------------------------- /tests/test_restart.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import MagicMock, Mock 3 | 4 | from bubuku.broker import LeaderElectionInProgress 5 | from bubuku.features.restart_on_zk_change import RestartBrokerChange 6 | 7 | 8 | class RestartTest(unittest.TestCase): 9 | def test_restart_atomicity(self): 10 | """ 11 | Because of the action lock structure, instances need to be restarted atomically: during a restart, all 12 | action parts (stop, wait for leader election, start) must run under the same lock. This guarantees that cluster 13 | instances won't be destroyed at the same time during a ZK update.
14 | """ 15 | broker = MagicMock() 16 | zk = MagicMock() 17 | change = RestartBrokerChange(zk, broker, None) 18 | 19 | zk.get_conn_str = lambda: 'xxx' 20 | 21 | broker.is_running_and_registered = lambda: True 22 | stopped = [] 23 | broker.stop_kafka_process = lambda: stopped.append(True) 24 | assert change.run([]) 25 | assert stopped and stopped[0] 26 | 27 | broker.start_kafka_process = Mock(side_effect=LeaderElectionInProgress()) 28 | for i in range(1, 50): 29 | assert change.run([]) 30 | 31 | started = [] 32 | broker.start_kafka_process = lambda x: started.append(x) 33 | assert not change.run([]) 34 | assert started and 'xxx' == started[0] 35 | -------------------------------------------------------------------------------- /tests/test_restart_if_dead.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import MagicMock 3 | from bubuku.features.restart_if_dead import CheckBrokerStopped 4 | 5 | class TestRestartIfDeadCheck(unittest.TestCase): 6 | 7 | def test_broker_retries_before_it_restarts(self): 8 | brokerManager = MagicMock() 9 | isRegistered = MagicMock(return_value=False) 10 | attrs = {'is_running.return_value': True, 11 | 'is_registered_in_zookeeper': isRegistered, 12 | 'get_zookeeper_session_timeout.return_value': 1} 13 | brokerManager.configure_mock(**attrs) 14 | 15 | exhibitor = MagicMock() 16 | checkBrokerStopped = CheckBrokerStopped(brokerManager, exhibitor) 17 | checkReturnedSomething = None 18 | while not checkReturnedSomething: 19 | checkReturnedSomething = checkBrokerStopped.check() 20 | assert isRegistered.call_count > 0 21 | -------------------------------------------------------------------------------- /tests/test_size_stats_collecting.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from unittest.mock import MagicMock 3 | 4 | from bubuku.features.data_size_stats import GenerateDataSizeStatistics 5 | from bubuku.utils import CmdHelper 6 | 7 | 8 | class TestDataSizeStats(unittest.TestCase): 9 | 10 | def test_size_stats_collecting(self): 11 | zk = MagicMock() 12 | 13 | stat_check = GenerateDataSizeStatistics(zk, self.__mock_broker(), self.__mock_cmd_helper(), ["/kafka-logs"]) 14 | stat_check.check() 15 | 16 | expected_json = { 17 | "disk": {"free_kb": 606, "used_kb": 404}, 18 | "topics": { 19 | "another_topic": {"0": 3}, 20 | "my-topic": {"0": 10, "2": 200} 21 | } 22 | } 23 | zk.update_disk_stats.assert_called_with('dummy_id', expected_json) 24 | 25 | def __mock_cmd_helper(self) -> CmdHelper: 26 | class CmdHelperMock(CmdHelper): 27 | def cmd_run(self, cmd: str): 28 | if cmd.startswith("du"): 29 | return "10\t/kafka-logs/my-topic-0\n" \ 30 | "200\t/kafka-logs/my-topic-2\n" \ 31 | "3\t/kafka-logs/another_topic-0\n" \ 32 | "55\t/kafka-logs\n" \ 33 | "77\t/kafka-logs/wrong_topic\n" \ 34 | "blah" 35 | elif cmd.startswith("df"): 36 | return "101 202\n" \ 37 | "303 404\n" \ 38 | "500" 39 | else: 40 | raise ValueError("Call not expected") 41 | 42 | return CmdHelperMock() 43 | 44 | def __mock_broker(self) -> MagicMock: 45 | broker = MagicMock() 46 | broker.is_running_and_registered.return_value = True 47 | broker.id_manager.get_broker_id.return_value = "dummy_id" 48 | return broker 49 | -------------------------------------------------------------------------------- /tests/test_startup_timeout.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from bubuku.broker import StartupTimeout 4 
| 5 | 6 | class TestStartupTimeout(unittest.TestCase): 7 | @staticmethod 8 | def _verify(o: StartupTimeout, border_value: float, border_value_after_fail: float): 9 | print(o) 10 | assert not o.is_timed_out(border_value) 11 | assert o.is_timed_out(border_value + 1) 12 | o.on_timeout_fail() 13 | assert not o.is_timed_out(border_value_after_fail) 14 | assert o.is_timed_out(border_value_after_fail + 1) 15 | 16 | def test_linear_defaults(self): 17 | o = StartupTimeout.build({'type': 'linear'}) 18 | TestStartupTimeout._verify(o, 300., 360.) 19 | 20 | def test_linear(self): 21 | o = StartupTimeout.build({'type': 'linear', 'initial': '10', 'step': 2}) 22 | TestStartupTimeout._verify(o, 10., 12.) 23 | 24 | def test_progressive_defaults(self): 25 | o = StartupTimeout.build({'type': 'progressive'}) 26 | TestStartupTimeout._verify(o, 300., 450.) 27 | 28 | def test_progressive(self): 29 | o = StartupTimeout.build({'type': 'progressive', 'initial': '16', 'step': '0.25'}) 30 | 31 | TestStartupTimeout._verify(o, 16., 20.) 32 | -------------------------------------------------------------------------------- /tests/test_zookeeper.py: -------------------------------------------------------------------------------- 1 | import json 2 | import math 3 | import re 4 | import time 5 | import unittest 6 | from unittest.mock import MagicMock 7 | 8 | from kazoo.exceptions import NoNodeError, NodeExistsError 9 | 10 | from bubuku.zookeeper import BukuExhibitor, SlowlyUpdatedCache 11 | 12 | 13 | def test_get_broker_ids(): 14 | exhibitor_mock = MagicMock() 15 | 16 | def _get_children(path): 17 | if path == '/brokers/ids': 18 | return ['3', '1', '2'] 19 | else: 20 | raise NotImplementedError() 21 | 22 | exhibitor_mock.get_children = _get_children 23 | 24 | buku = BukuExhibitor(exhibitor_mock) 25 | 26 | assert ['1', '2', '3'] == buku.get_broker_ids() # ensure that return list is sorted 27 | 28 | 29 | def test_load_active_topics(): 30 | exhibitor_mock = MagicMock() 31 | 32 | def _get_children(path): 33 | if path == '/brokers/topics': 34 | return ['3', '1', '2'] 35 | elif path == '/admin/delete_topics': 36 | return ['3', '1'] 37 | else: 38 | raise NotImplementedError() 39 | 40 | exhibitor_mock.get_children = _get_children 41 | buku = BukuExhibitor(exhibitor_mock) 42 | 43 | assert ['2'] == buku.load_active_topics() 44 | 45 | 46 | def test_is_broker_registered(): 47 | def _get(path): 48 | if path == '/brokers/ids/123': 49 | return '123', object() 50 | elif path == '/brokers/ids/321': 51 | return None, None 52 | else: 53 | raise NoNodeError() 54 | 55 | exhibitor_mock = MagicMock() 56 | exhibitor_mock.get = _get 57 | buku = BukuExhibitor(exhibitor_mock) 58 | 59 | assert buku.is_broker_registered('123') 60 | assert buku.is_broker_registered(123) 61 | assert not buku.is_broker_registered('321') 62 | assert not buku.is_broker_registered(321) 63 | assert not buku.is_broker_registered(333) 64 | assert not buku.is_broker_registered('333') 65 | 66 | 67 | def _test_load_partition_assignment(async_: bool): 68 | exhibitor_mock = MagicMock() 69 | 70 | def _get_children(path): 71 | if path == '/brokers/topics': 72 | return ['t01', 't02'] 73 | else: 74 | raise NotImplementedError() 75 | 76 | def _get(path): 77 | if path == '/brokers/topics/t01': 78 | return json.dumps({'partitions': {0: [1, 2, 3], 1: [3, 2, 1]}}).encode('utf-8'), object() 79 | elif path == '/brokers/topics/t02': 80 | return json.dumps({'partitions': {0: [4, 5, 6], 1: [5, 1, 2]}}).encode('utf-8'), object() 81 | else: 82 | raise NotImplementedError() 83 | 84 | def 
_get_async(path): 85 | def _get_iresult(block): 86 | assert block 87 | return _get(path) 88 | 89 | mock = MagicMock() 90 | mock.get = _get_iresult 91 | return mock 92 | 93 | exhibitor_mock.get = _get 94 | exhibitor_mock.get_async = _get_async 95 | exhibitor_mock.get_children = _get_children 96 | 97 | buku_ex = BukuExhibitor(exhibitor_mock, async_) 98 | 99 | expected_result = [ 100 | ('t01', 0, [1, 2, 3]), 101 | ('t01', 1, [3, 2, 1]), 102 | ('t02', 0, [4, 5, 6]), 103 | ('t02', 1, [5, 1, 2]), 104 | ] 105 | result = [r for r in buku_ex.load_partition_assignment()] 106 | assert len(expected_result) == len(result) 107 | for e in expected_result: 108 | assert e in result 109 | 110 | 111 | def test_load_partition_assignment_sync(): 112 | _test_load_partition_assignment(False) 113 | 114 | 115 | def test_load_partition_assignment_async(): 116 | _test_load_partition_assignment(True) 117 | 118 | 119 | def _test_load_partition_states(async_: bool): 120 | exhibitor_mock = MagicMock() 121 | 122 | def _get_children(path): 123 | if path == '/brokers/topics': 124 | return ['t01', 't02'] 125 | elif path == '/brokers/topics/t01/partitions': 126 | return ['0', '1'] 127 | elif path == '/brokers/topics/t02/partitions': 128 | return ['0', '1', '2'] 129 | else: 130 | raise NotImplementedError() 131 | 132 | def _get(path: str): 133 | matched = re.match('/brokers/topics/(.*)/partitions/(.*)/state', path) 134 | if not matched: 135 | topic = path[len('/brokers/topics/'):] 136 | if topic not in ['t01', 't02']: 137 | raise NotImplementedError('Not implemented for path {}'.format(path)) 138 | cnt = 2 if topic == 't01' else 3 139 | return json.dumps({'partitions': {x: None for x in range(0, cnt)}}).encode('utf-8'), object() 140 | topic = matched.group(1) 141 | partition = matched.group(2) 142 | if topic == 't01' and partition not in ('0', '1'): 143 | raise NotImplementedError() 144 | elif topic == 't02' and partition not in ('0', '1', '2'): 145 | raise NotImplementedError() 146 | elif topic not in ('t01', 't02'): 147 | raise NotImplementedError() 148 | idx = (100 if topic == 't01' else 200) + int(partition) 149 | return json.dumps({'fake_data': idx}).encode('utf-8'), object() 150 | 151 | def _get_async(path): 152 | def _get_iasync(block): 153 | assert block 154 | return _get(path) 155 | 156 | mock = MagicMock() 157 | mock.get = _get_iasync 158 | return mock 159 | 160 | exhibitor_mock.get = _get 161 | exhibitor_mock.get_async = _get_async 162 | exhibitor_mock.get_children = _get_children 163 | 164 | buku_ex = BukuExhibitor(exhibitor_mock, async_=async_) 165 | 166 | expected_result = [ 167 | ('t01', 0, {'fake_data': 100}), 168 | ('t01', 1, {'fake_data': 101}), 169 | ('t02', 0, {'fake_data': 200}), 170 | ('t02', 1, {'fake_data': 201}), 171 | ('t02', 2, {'fake_data': 202}), 172 | ] 173 | 174 | result = [r for r in buku_ex.load_partition_states()] 175 | assert len(expected_result) == len(result) 176 | for e in expected_result: 177 | assert e in result 178 | 179 | 180 | def test_load_partition_states_sync(): 181 | _test_load_partition_states(False) 182 | 183 | 184 | def test_load_partition_states_async(): 185 | _test_load_partition_states(True) 186 | 187 | 188 | def test_reallocate_partition(): 189 | call_idx = [0] 190 | 191 | def _create(path, value=None, **kwargs): 192 | if path in ('/bubuku/changes', '/bubuku/actions/global'): 193 | pass 194 | elif path == '/admin/reassign_partitions': 195 | if call_idx[0] >= 5: 196 | raise NodeExistsError() 197 | call_idx[0] += 1 198 | j = json.loads(value.decode('utf-8')) 199 | assert 
j['version'] == '1' 200 | assert len(j['partitions']) == 1 201 | p = j['partitions'][0] 202 | assert p['topic'] == 't01' 203 | assert p['partition'] == 0 204 | assert p['replicas'] == [1, 2, 3] 205 | else: 206 | raise NotImplementedError('Not implemented for path {}'.format(path)) 207 | 208 | exhibitor_mock = MagicMock() 209 | exhibitor_mock.create = _create 210 | 211 | buku = BukuExhibitor(exhibitor_mock) 212 | 213 | assert buku.reallocate_partition('t01', 0, ['1', '2', '3']) 214 | assert buku.reallocate_partition('t01', 0, ['1', '2', 3]) 215 | assert buku.reallocate_partition('t01', 0, [1, 2, 3]) 216 | assert buku.reallocate_partition('t01', 0, [1, 2, 3]) 217 | assert buku.reallocate_partition('t01', 0, [1, 2, 3]) 218 | # Node exists 219 | assert not buku.reallocate_partition('t01', 0, [1, 2, 3]) 220 | 221 | 222 | class SlowlyUpdatedCacheTest(unittest.TestCase): 223 | def test_initial_update_fast(self): 224 | result = [None] 225 | 226 | def _update(value_): 227 | result[0] = value_ 228 | 229 | cache = SlowlyUpdatedCache(lambda: (['test'], 1), _update, 0, 0) 230 | 231 | cache.touch() 232 | assert result[0] == (['test'], 1) 233 | 234 | def test_exception_eating(self): 235 | result = [10, None] 236 | 237 | def _update(value_): 238 | result[1] = value_ 239 | 240 | def _load(): 241 | if result[0] > 0: 242 | result[0] -= 1 243 | raise Exception() 244 | return ['test'], 1 245 | 246 | cache = SlowlyUpdatedCache(_load, _update, 0, 0) 247 | cache.force = False  # Small hack to avoid initial refresh cycle 248 | for i in range(0, 10): 249 | cache.touch() 250 | assert result[1] is None 251 | assert result[0] == 9 - i 252 | cache.touch() 253 | assert result[1] == (['test'], 1) 254 | 255 | def test_initial_update_slow(self): 256 | result = [None] 257 | call_count = [0] 258 | 259 | def _load(): 260 | call_count[0] += 1 261 | if call_count[0] == 100: 262 | return ['test'], 1 263 | return None 264 | 265 | def _update(value_): 266 | result[0] = value_ 267 | 268 | cache = SlowlyUpdatedCache(_load, _update, 0, 0) 269 | 270 | cache.touch() 271 | assert call_count[0] == 100 272 | assert result[0] == (['test'], 1) 273 | 274 | def test_delays_illegal(self): 275 | result = [None] 276 | load_calls = [] 277 | update_calls = [] 278 | 279 | def _load(): 280 | load_calls.append(time.time()) 281 | return ['test'], 0 if len(load_calls) > 1 else 1 282 | 283 | def _update(value_): 284 | update_calls.append(time.time()) 285 | result[0] = value_ 286 | 287 | # refresh every 0.5 seconds, delay 0.25 seconds 288 | cache = SlowlyUpdatedCache(_load, _update, 0.5, 0.25) 289 | 290 | while len(update_calls) != 2: 291 | time.sleep(0.1) 292 | cache.touch() 293 | 294 | assert math.fabs(update_calls[0] - load_calls[0]) <= 0.15  # 0.1 + 0.1/2 295 | # Verify that load calls were made one after another 296 | assert math.fabs(load_calls[1] - load_calls[0] - .5) <= 0.15 297 | # Verify that the update call was made in the correct interval 298 | 299 | assert load_calls[1] + 0.25 <= update_calls[1] <= load_calls[1] + 0.25 + 0.15 300 | 301 | def test_delays_legal(self): 302 | result = [None] 303 | main_call = [] 304 | load_calls = [] 305 | update_calls = [] 306 | 307 | def _load(): 308 | load_calls.append(time.time()) 309 | if len(load_calls) == 5: 310 | main_call.append(time.time()) 311 | return ['test'], 0 if len(load_calls) >= 5 else len(load_calls) 312 | 313 | def _update(value_): 314 | update_calls.append(time.time()) 315 | result[0] = value_ 316 | 317 | # refresh every 0.5 seconds, delay 3 seconds - if the situation is constantly changing, wait for the 318
| # last stable update 319 | cache = SlowlyUpdatedCache(_load, _update, 0.5, 3) 320 | 321 | while len(update_calls) != 2: 322 | time.sleep(0.1) 323 | cache.touch() 324 | print(cache) 325 | 326 | assert len(main_call) == 1 327 | assert main_call[0] + 3 - .15 < update_calls[1] < main_call[0] + 3 + .15 328 | --------------------------------------------------------------------------------
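
A minimal usage sketch of SlowlyUpdatedCache, inferred only from the call pattern and timing assertions in tests/test_zookeeper.py above. The reading of the two numeric constructor arguments as a refresh interval and an apply delay (in seconds), as well as the load/update helpers, host names, and port below, are illustrative assumptions rather than documented API.

import time

from bubuku.zookeeper import SlowlyUpdatedCache

current = [None]


def load():
    # Hypothetical loader: the tests feed the cache with (value, number) tuples;
    # here it simply returns a static (hosts, port) pair.
    return ['zk-host-1', 'zk-host-2'], 2181


def update(value):
    # Called by the cache once a loaded value is applied (immediately for the first
    # value; after the configured delay once a changed value has stayed stable).
    current[0] = value


# Assumed meaning of the arguments, based on test_delays_illegal/test_delays_legal:
# re-run load() at most every 0.5 seconds, and apply a changed value only after it
# has remained unchanged for 3 seconds.
cache = SlowlyUpdatedCache(load, update, 0.5, 3)

while current[0] is None:
    cache.touch()  # touch() drives all loading and updating, as in the tests above
    time.sleep(0.1)

print(current[0])  # -> (['zk-host-1', 'zk-host-2'], 2181)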