├── .gitignore ├── .travis.yml ├── .zappr.yaml ├── Dockerfile ├── LICENSE ├── MAINTAINERS ├── README.md ├── delivery.yaml ├── etcd-cluster-multiregion.yaml ├── etcd-cluster.yaml ├── etcd.py ├── requirements.txt ├── setup.py ├── tests ├── test_etcd_cluster.py ├── test_etcd_housekeeper.py ├── test_etcd_manager.py ├── test_etcd_member.py └── test_etcd_multiregion_cluster.py └── tox.ini /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # vi(m) swap files: 4 | *.sw? 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Packages 10 | *.egg 11 | *.eggs 12 | *.egg-info 13 | dist 14 | build 15 | eggs 16 | parts 17 | bin 18 | var 19 | sdist 20 | develop-eggs 21 | .installed.cfg 22 | lib 23 | lib64 24 | 25 | # Installer logs 26 | pip-log.txt 27 | 28 | # Unit test / coverage reports 29 | .coverage 30 | .tox 31 | nosetests.xml 32 | coverage.xml 33 | htmlcov 34 | junit.xml 35 | 36 | # Translations 37 | *.mo 38 | 39 | # Mr Developer 40 | .mr.developer.cfg 41 | .project 42 | .pydevproject 43 | 44 | scm-source.json 45 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | dist: trusty 3 | language: python 4 | python: 5 | - "3.5" 6 | - "3.6" 7 | install: 8 | - pip install setuptools --upgrade 9 | - pip install -r requirements.txt 10 | - pip install coveralls flake8 11 | script: 12 | - python setup.py test 13 | - python setup.py flake8 14 | after_success: 15 | - coveralls 16 | -------------------------------------------------------------------------------- /.zappr.yaml: -------------------------------------------------------------------------------- 1 | # for github.com 2 | approvals: 3 | groups: 4 | zalando: 5 | minimum: 2 6 | from: 7 | orgs: 8 | - "zalando" 9 | # team should be valid team id in team service https://teams.auth.zalando.com/api/teams/:id 10 | X-Zalando-Team: "acid" 11 | # 
type should be one of [code, doc, config, tools, secrets] 12 | # code will be the default value, if X-Zalando-Type is not found in .zappr.yml 13 | X-Zalando-Type: code 14 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM registry.opensource.zalan.do/library/ubuntu-18.04 2 | MAINTAINER Alexander Kukushkin 3 | 4 | ENV USER etcd 5 | ENV HOME /home/${USER} 6 | 7 | # Create home directory for etcd 8 | RUN useradd -d ${HOME} -k /etc/skel -s /bin/bash -m ${USER} && chmod 777 ${HOME} 9 | 10 | RUN export DEBIAN_FRONTEND=noninteractive \ 11 | && apt-get update \ 12 | && echo 'APT::Install-Recommends "0";' > /etc/apt/apt.conf.d/01norecommend \ 13 | && echo 'APT::Install-Suggests "0";' >> /etc/apt/apt.conf.d/01norecommend \ 14 | 15 | && apt-get upgrade -y \ 16 | && apt-get install -y curl ca-certificates python3-boto3 \ 17 | 18 | # Clean up 19 | && apt-get clean \ 20 | && rm -rf /var/lib/apt/lists/* 21 | 22 | ## Install etcd 23 | 24 | ARG ETCDVERSION_PREV=3.3.25 25 | RUN curl -L https://github.com/etcd-io/etcd/releases/download/v${ETCDVERSION_PREV}/etcd-v${ETCDVERSION_PREV}-linux-amd64.tar.gz \ 26 | | tar xz -C /bin --xform='s/$/.old/x' --strip=1 --wildcards --no-anchored etcd \ 27 | && chown root:root /bin/etcd.old \ 28 | && chmod +x /bin/etcd.old 29 | 30 | ARG ETCDVERSION=3.4.14 31 | ENV ETCDVERSION=$ETCDVERSION 32 | RUN curl -L https://github.com/etcd-io/etcd/releases/download/v${ETCDVERSION}/etcd-v${ETCDVERSION}-linux-amd64.tar.gz \ 33 | | tar xz -C /bin --strip=1 --wildcards --no-anchored etcd etcdctl \ 34 | && chown root:root /bin/etcd /bin/etcdctl \ 35 | && chmod +x /bin/etcd /bin/etcdctl 36 | 37 | COPY etcd.py /bin/etcd.py 38 | 39 | WORKDIR $HOME 40 | USER ${USER} 41 | EXPOSE 2379 2380 2381 42 | CMD ["/usr/bin/python3", "/bin/etcd.py"] 43 | -------------------------------------------------------------------------------- /LICENSE: 
-------------------------------------------------------------------------------- 1 | Copyright 2015 Zalando SE 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | -------------------------------------------------------------------------------- /MAINTAINERS: -------------------------------------------------------------------------------- 1 | Alexander Kukushkin 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/zalando-incubator/stups-etcd-cluster.svg?branch=master)](https://travis-ci.org/zalando-incubator/stups-etcd-cluster) 2 | [![Coverage Status](https://coveralls.io/repos/zalando-incubator/stups-etcd-cluster/badge.svg?branch=master&service=github)](https://coveralls.io/github/zalando-incubator/stups-etcd-cluster?branch=master) 3 | 4 | Introduction 5 | ============ 6 | This etcd appliance is created for an AWS environment. It is available as an etcd cluster internally, for any application willing to use it. For discovery of the appliance we have a recently updated DNS SRV and A records in a Route53 zone. 7 | 8 | Design 9 | ====== 10 | The appliance is supposed to be run on EC2 instances, members of one autoscaling group. 11 | Usage of autoscaling group give us possibility to discover all cluster member via AWS api (python-boto). 
12 | Etcd process is executed by a python wrapper which is taking care of discovering all members of already existing cluster or the new cluster. 13 | 14 | Currently the following scenarios are supported: 15 | 16 | - Starting up the new cluster. etcd.py will figure out that this is the new cluster and run etcd daemon with necessary options. 17 | - If the new EC2 instance is spawned within existing autoscaling group etcd.py will take care of adding this instance into already existing cluster and apply needed options to etcd daemon. 18 | - If something happened with etcd (crached or exited), etcd.py will try to restart it. 19 | - Periodically leader performs cluster health check and remove cluster members which are not members of autoscaling group 20 | - Also it creates or updates SRV and A records in a given zone via AWS api. 21 | 22 | Usage 23 | ===== 24 | 25 | ## Step 1: Create an etcd cluster 26 | A cluster can be created by issuing such a command: 27 | 28 | senza create etcd-cluster.yaml STACK_VERSION HOSTED_ZONE DOCKER_IMAGE 29 | 30 | For example, if you are making an etcd cluster to be used by a service called `release`, you could issue the following: 31 | 32 | senza create https://raw.githubusercontent.com/zalando-stups/stups-etcd-cluster/master/etcd-cluster.yaml releaseetcd \ 33 | HostedZone=elephant.example.org \ 34 | DockerImage=registry.opensource.zalan.do/acid/etcd-cluster:3.0.17-p17 35 | 36 | ## Step 2: Confirm successful cluster creation 37 | 38 | Running this `senza create` command should have created: 39 | 40 | - the required number of EC2 instances 41 | - with stack name `etcd-cluster` 42 | - with instance name `etcd-cluster-releaseetcd` 43 | - a security group allowing etcd's ports 2379 and 2380 44 | - a role that allows List and Describe EC2 resources and create records in a Route53 45 | - DNS records 46 | - an A record of the form `releaseetcd.elephant.example.org.` 47 | - a SRV record of the form 
`_etcd-server._tcp.releaseetcd.elephant.example.org.` with port = 2380, i.e. peer port 48 | - a SRV record of the form `_etcd._tcp.releaseetcd.elephant.example.org.` with port = 2379, i.e. client port 49 | 50 | 51 | Multiregion cluster 52 | =================== 53 | It is possible to deploy etcd-cluster across multiple regions. To do that you have to deploy cloud formation stack into multiple regions with the same stack names. This enables discovery of instances from other regions and grants access to those instances via SecurityGroups. Deployment has to be done region by region, otherwise there is a chance of race condition during cluster bootstrap. 54 | 55 | senza --region eu-central-1 create etcd-cluster-multiregion.yaml multietcd 56 | HostedZone=elephant.example.org \ 57 | DockerImage=registry.opensource.zalan.do/acid/etcd-cluster:3.0.17-p17 \ 58 | ActiveRegions=eu-west-1,eu-central-1 \ 59 | InstanceCount=4 60 | 61 | senza --region eu-central-1 wait etcd-cluster multietcd 62 | 63 | senza --region eu-west-1 create etcd-cluster-multiregion.yaml multietcd 64 | HostedZone=elephant.example.org \ 65 | DockerImage=registry.opensource.zalan.do/acid/etcd-cluster:3.0.17-p17 \ 66 | ActiveRegions=eu-west-1,eu-central-1 \ 67 | InstanceCount=1 68 | 69 | 70 | Upgrade 71 | ======= 72 | 73 | In order to perform a minor or major upgrade without downtime you need to terminate all EC2 instances one-by-one. Between every termination you need to wait at least 5 minutes and monitor cluster-health, logs and DNS records. You should only terminate the next instance if the cluster is healthy again. 74 | 75 | To upgrade an existing etcd deployment to 3.0, you must be running 2.3. If you are running a version of etcd before 2.3, you must upgrade to 2.3 (preferably 2.3.7) before upgrading to 3.0. 76 | 77 | A major upgrade is possible one version at a time, i.e. it is possible to upgrade from 2.0 to 2.1 and from 2.1 to 2.2, but it is not possible to upgrade from 2.0 to 2.2. 
78 | 79 | Before 3.0 it was possible simply to "join" the new member with a higher major version with the empty data directory to the cluster and it was working fine. Somehow this approach has stopped working for 2.3 -> 3.0 upgrade. So now we are using another technique: if the cluster_version is still 2.3, we are "joining" etcd 2.3.7 member to the cluster, in order to download latest data. When the cluster becomes healthy again, we are taking an "upgrade_lock", stopping etcd 2.3.7 and starting up etcd 3.0. When the cluster is healthy again we are removing "upgrade_lock" in order for other members to upgrade. 80 | 81 | The upgrade lock is needed to: 82 | - Temporary switch off "house-keeping" job, which task is removing "unhealthy" members and updating DNS records. 83 | - Make sure that we are upgrading one cluster member at a time. 84 | 85 | Migration of an existing cluster to multiregion setup 86 | ===================================================== 87 | Currently there are only two AZ in eu-central-1 region, therefore if the one AZ will go down we have a 50% chance that our etcd will become read-only. To avoid that we want to run one additional instance in eu-west-1 region. 88 | 89 | Step 1: you have to migrate to the multiregion setup but with only 1 (ONE) active region eu-central-1. To do that you need to run: 90 | 91 | senza --region=eu-central-1 update etcd-cluster-multiregion.yaml existingcluster \ 92 | HostedZone=elephant.example.org \ 93 | DockerImage=registry.opensource.zalan.do/acid/etcd-cluster:3.0.17-p17 \ 94 | ActiveRegions=eu-central-1 \ 95 | InstanceCount=5 96 | 97 | And do instance rotation like during Upgrade procedure. 98 | 99 | Step 2: Enable the second region. 
100 | 101 | senza --region=eu-central-1 update etcd-cluster-multiregion.yaml existingcluster \ 102 | HostedZone=elephant.example.org \ 103 | DockerImage=registry.opensource.zalan.do/acid/etcd-cluster:3.0.17-p17 \ 104 | ActiveRegions=eu-central-1,eu-west-1 \ 105 | InstanceCount=5 106 | 107 | And rotate all instances once again. Although the second region is not there yet, cluster will think that it is working in multiregion mode. 108 | 109 | Step 3: Change instance count in eu-central-1 to 4: 110 | 111 | senza --region=eu-central-1 update etcd-cluster-multiregion.yaml existingcluster \ 112 | HostedZone=elephant.example.org \ 113 | DockerImage=registry.opensource.zalan.do/acid/etcd-cluster:3.0.17-p17 \ 114 | ActiveRegions=eu-central-1,eu-west-1 \ 115 | InstanceCount=4 116 | 117 | Autoscaling will kill one of the instances automatically. 118 | 119 | Step 4: Deploy cloudformation in another region: 120 | 121 | senza --region eu-west-1 create etcd-cluster-multiregion.yaml existingcluster 122 | HostedZone=elephant.example.org \ 123 | DockerImage=registry.opensource.zalan.do/acid/etcd-cluster:3.0.17-p17 \ 124 | ActiveRegions=eu-west-1,eu-central-1 \ 125 | InstanceCount=1 126 | 127 | Demo 128 | ==== 129 | [![Demo on asciicast](https://asciinema.org/a/32703.png)](https://asciinema.org/a/32703) 130 | -------------------------------------------------------------------------------- /delivery.yaml: -------------------------------------------------------------------------------- 1 | build_steps: 2 | - desc: Prepare Environment 3 | cmd: | 4 | apt-get update 5 | apt-get install -y jq 6 | 7 | - desc: Build and push docker images 8 | cmd: | 9 | # Please bump PATCH_VERSION if you change etcd.py or Dockerfile 10 | PATCH_VERSION=p24 11 | STOP_VERSION=2.3 12 | 13 | ETCD_VERSIONS=$(curl -sL "https://api.github.com/repos/etcd-io/etcd/releases?per_page=100" | jq -r .[].name | sed -n 's/^v\([^-]*\)$/\1/p' | sort -urV) 14 | ETCD_MAJOR_VERSIONS=$(sed 's/\.[0-9]*$//g' <<< "$ETCD_VERSIONS" | 
sort -urV) 15 | 16 | for major_version in $ETCD_MAJOR_VERSIONS; do 17 | version=$(egrep -m 1 "^${major_version/\./\\.}\." <<< "$ETCD_VERSIONS") 18 | 19 | if [[ ! -z $prev ]]; then 20 | IMAGE="registry-write.opensource.zalan.do/acid/etcd-cluster:$prev-$PATCH_VERSION" 21 | docker build --build-arg ETCDVERSION_PREV=$version --build-arg ETCDVERSION=$prev -t $IMAGE . 22 | 23 | # push docker images only for commits to the master branch 24 | if [[ "x${CDP_SOURCE_BRANCH}" == "x" && "x${CDP_TARGET_BRANCH}" == "xmaster" ]]; then 25 | docker push $IMAGE 26 | fi 27 | fi 28 | 29 | [[ $major_version == $STOP_VERSION ]] && break 30 | 31 | prev=$version 32 | done 33 | 34 | docker images 35 | -------------------------------------------------------------------------------- /etcd-cluster-multiregion.yaml: -------------------------------------------------------------------------------- 1 | SenzaComponents: 2 | - Configuration: 3 | Type: Senza::StupsAutoConfiguration 4 | PublicOnly: true 5 | - AppServer: 6 | AssociatePublicIpAddress: true 7 | IamRoles: 8 | - Ref: EtcdRole 9 | InstanceType: t2.small 10 | SecurityGroups: 11 | - Fn::GetAtt: 12 | - EtcdSecurityGroup 13 | - GroupId 14 | TaupageConfig: 15 | ports: 16 | 2379: 2379 17 | 2380: 2380 18 | runtime: Docker 19 | source: '{{Arguments.DockerImage}}' 20 | environment: 21 | HOSTED_ZONE: '{{Arguments.HostedZone}}' 22 | ACTIVE_REGIONS: '{{Arguments.ActiveRegions}}' 23 | mounts: 24 | /home/etcd: 25 | partition: none 26 | filesystem: tmpfs 27 | erase_on_boot: false 28 | options: size=1024m 29 | appdynamics_application: 'etcd-cluster-{{Arguments.version}}' 30 | Type: Senza::TaupageAutoScalingGroup 31 | AutoScaling: 32 | Minimum: "{{Arguments.InstanceCount}}" 33 | Maximum: "{{Arguments.InstanceCount}}" 34 | SenzaInfo: 35 | Parameters: 36 | - HostedZone: 37 | Description: AWS Hosted Zone to work with 38 | - DockerImage: 39 | Description: Docker image of etcd-cluster. 40 | - ActiveRegions: 41 | Description: Multi-Region-Cluster? 
Active/Deployed regions. 42 | Default: eu-west-1,eu-central-1 43 | - InstanceCount: 44 | Description: Instance number in ASG 45 | Default: 5 46 | StackName: etcd-cluster 47 | Resources: 48 | EtcdSecurityGroup: 49 | Type: AWS::EC2::SecurityGroup 50 | Properties: 51 | GroupDescription: Etcd Appliance Security Group 52 | SecurityGroupIngress: 53 | - IpProtocol: tcp 54 | FromPort: 22 55 | ToPort: 22 56 | CidrIp: 172.16.0.0/12 57 | - IpProtocol: tcp 58 | FromPort: 2379 59 | ToPort: 2380 60 | CidrIp: 172.16.0.0/12 61 | - IpProtocol: tcp 62 | FromPort: 9100 63 | ToPort: 9100 64 | CidrIp: 172.16.0.0/12 65 | EtcdIngressMembers: 66 | Type: "AWS::EC2::SecurityGroupIngress" 67 | Properties: 68 | GroupId: 69 | Fn::GetAtt: 70 | - EtcdSecurityGroup 71 | - GroupId 72 | IpProtocol: tcp 73 | FromPort: 0 74 | ToPort: 65535 75 | SourceSecurityGroupId: 76 | Fn::GetAtt: 77 | - EtcdSecurityGroup 78 | - GroupId 79 | EtcdRole: 80 | Type: AWS::IAM::Role 81 | Properties: 82 | AssumeRolePolicyDocument: 83 | Version: "2012-10-17" 84 | Statement: 85 | - Effect: Allow 86 | Principal: 87 | Service: ec2.amazonaws.com 88 | Action: sts:AssumeRole 89 | Path: / 90 | Policies: 91 | - PolicyName: AmazonEC2ReadOnlyAccess 92 | PolicyDocument: 93 | Version: "2012-10-17" 94 | Statement: 95 | - Effect: Allow 96 | Action: 97 | - ec2:Describe* 98 | - ec2:AuthorizeSecurityGroupIngress 99 | - ec2:RevokeSecurityGroupIngress 100 | Resource: "*" 101 | - Effect: Allow 102 | Action: autoscaling:Describe* 103 | Resource: "*" 104 | - PolicyName: AmazonRoute53Access 105 | PolicyDocument: 106 | Version: "2012-10-17" 107 | Statement: 108 | - Effect: Allow 109 | Action: 110 | - route53:ListHostedZonesByName 111 | - route53:ChangeResourceRecordSets 112 | - route53:GetHostedZone 113 | - route53:ListResourceRecordSets 114 | - route53:GetChange 115 | Resource: "*" 116 | -------------------------------------------------------------------------------- /etcd-cluster.yaml: 
-------------------------------------------------------------------------------- 1 | SenzaComponents: 2 | - Configuration: 3 | Type: Senza::StupsAutoConfiguration 4 | - AppServer: 5 | IamRoles: 6 | - Ref: EtcdRole 7 | InstanceType: t2.small 8 | SecurityGroups: 9 | - Fn::GetAtt: 10 | - EtcdSecurityGroup 11 | - GroupId 12 | TaupageConfig: 13 | ports: 14 | 2379: 2379 15 | 2380: 2380 16 | runtime: Docker 17 | source: '{{Arguments.DockerImage}}' 18 | environment: 19 | HOSTED_ZONE: '{{Arguments.HostedZone}}' 20 | mounts: 21 | /home/etcd: 22 | partition: none 23 | filesystem: tmpfs 24 | erase_on_boot: false 25 | options: size=1024m 26 | scalyr_region: '{{Arguments.ScalyrRegion}}' 27 | scalyr_account_key: '{{Arguments.ScalyrAccountKey}}' 28 | Type: Senza::TaupageAutoScalingGroup 29 | AutoScaling: 30 | Minimum: 5 31 | Maximum: 5 32 | MetricType: CPU 33 | SenzaInfo: 34 | Parameters: 35 | - HostedZone: 36 | Description: AWS Hosted Zone to work with 37 | - DockerImage: 38 | Description: Docker image of etcd-cluster. 
39 | - ScalyrAccountKey: 40 | Description: Key for writing logs to scalyr 41 | Default: '' 42 | - ScalyrRegion: 43 | Description: Scalyr region 44 | Default: 'eu' 45 | StackName: etcd-cluster 46 | Resources: 47 | EtcdSecurityGroup: 48 | Type: AWS::EC2::SecurityGroup 49 | Properties: 50 | GroupDescription: Etcd Appliance Security Group 51 | SecurityGroupIngress: 52 | - IpProtocol: tcp 53 | FromPort: 22 54 | ToPort: 22 55 | CidrIp: 0.0.0.0/0 56 | - IpProtocol: tcp 57 | FromPort: 2379 58 | ToPort: 2380 59 | CidrIp: 0.0.0.0/0 60 | EtcdIngressMembers: 61 | Type: "AWS::EC2::SecurityGroupIngress" 62 | Properties: 63 | GroupId: 64 | Fn::GetAtt: 65 | - EtcdSecurityGroup 66 | - GroupId 67 | IpProtocol: tcp 68 | FromPort: 0 69 | ToPort: 65535 70 | SourceSecurityGroupId: 71 | Fn::GetAtt: 72 | - EtcdSecurityGroup 73 | - GroupId 74 | EtcdRole: 75 | Type: AWS::IAM::Role 76 | Properties: 77 | AssumeRolePolicyDocument: 78 | Version: "2012-10-17" 79 | Statement: 80 | - Effect: Allow 81 | Principal: 82 | Service: ec2.amazonaws.com 83 | Action: sts:AssumeRole 84 | Path: / 85 | Policies: 86 | - PolicyName: AmazonEC2ReadOnlyAccess 87 | PolicyDocument: 88 | Version: "2012-10-17" 89 | Statement: 90 | - Effect: Allow 91 | Action: ec2:Describe* 92 | Resource: "*" 93 | - Effect: Allow 94 | Action: autoscaling:Describe* 95 | Resource: "*" 96 | - PolicyName: AmazonRoute53Access 97 | PolicyDocument: 98 | Version: "2012-10-17" 99 | Statement: 100 | - Effect: Allow 101 | Action: 102 | - route53:ListHostedZonesByName 103 | - route53:ChangeResourceRecordSets 104 | - route53:GetHostedZone 105 | - route53:ListResourceRecordSets 106 | - route53:GetChange 107 | Resource: [ "*" ] 108 | -------------------------------------------------------------------------------- /etcd.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from __future__ import print_function 5 | 6 | import boto3 7 | import json 8 | import 
logging 9 | import os 10 | import re 11 | import requests 12 | import shutil 13 | import signal 14 | import subprocess 15 | import sys 16 | import time 17 | 18 | from threading import Thread 19 | 20 | if sys.hexversion >= 0x03000000: 21 | from urllib.parse import urlparse 22 | else: 23 | from urlparse import urlparse 24 | 25 | 26 | class EtcdClusterException(Exception): 27 | pass 28 | 29 | 30 | def tags_to_dict(tags): 31 | return {t['Key']: t['Value'] for t in tags} 32 | 33 | 34 | class EtcdMember: 35 | 36 | API_TIMEOUT = 3.1 37 | API_VERSION = '/v2/' 38 | DEFAULT_CLIENT_PORT = 2379 39 | DEFAULT_PEER_PORT = 2380 40 | DEFAULT_METRICS_PORT = 2381 41 | AG_TAG = 'aws:autoscaling:groupName' 42 | CF_TAG = 'aws:cloudformation:stack-name' 43 | 44 | def __init__(self, arg, region=None): 45 | self.id = None # id of cluster member, could be obtained only from running cluster 46 | self.name = None # name of cluster member, always match with the AWS instance.id 47 | self.instance_id = None # AWS instance.id 48 | self.private_ip_address = None 49 | self.public_ip_address = None 50 | self.private_dns_name = None 51 | self.public_dns_name = None 52 | self._addr = None # ip addr (private or public) could be assigned only from etcd 53 | self._dns = None # hostname (private or public) could be assigned only from etcd 54 | self.autoscaling_group = None # Name of autoscaling group (aws:autoscaling:groupName) 55 | self.cloudformation_stack = None # Name of cloudformation stack (aws:cloudformation:stack-name) 56 | self.region = region 57 | 58 | self.client_port = self.DEFAULT_CLIENT_PORT 59 | self.peer_port = self.DEFAULT_PEER_PORT 60 | self.metrics_port = self.DEFAULT_METRICS_PORT 61 | 62 | self.client_urls = [] # these values could be assigned only from the running etcd 63 | self.peer_urls = [] # cluster by performing http://addr:client_port/v2/members api call 64 | 65 | if isinstance(arg, dict): 66 | self.set_info_from_etcd(arg) 67 | else: 68 | self.set_info_from_ec2_instance(arg) 69 
| 70 | def set_info_from_ec2_instance(self, instance): 71 | # by convention member.name == instance.id 72 | if self.name and self.name != instance.id: 73 | return 74 | 75 | if self._addr and self._addr not in (instance.private_ip_address, instance.public_ip_address) or \ 76 | self._dns and self._dns not in (instance.private_dns_name, instance.public_dns_name): 77 | return 78 | 79 | self.instance_id = instance.id 80 | self.private_ip_address = instance.private_ip_address 81 | self.public_ip_address = instance.public_ip_address 82 | self.private_dns_name = instance.private_dns_name 83 | self.public_dns_name = instance.public_dns_name 84 | 85 | tags = tags_to_dict(instance.tags) 86 | self.cloudformation_stack = tags[self.CF_TAG] 87 | self.autoscaling_group = tags[self.AG_TAG] 88 | 89 | @staticmethod 90 | def get_addr_from_urls(urls): 91 | for url in urls: 92 | url = urlparse(url) 93 | if url and url.netloc: 94 | return url.hostname 95 | return None 96 | 97 | def addr_matches(self, peer_urls): 98 | t = '{0}:' + str(self.peer_port) 99 | for url in peer_urls: 100 | url = urlparse(url) 101 | if url and url.netloc and url.netloc in (t.format(self.private_ip_address), 102 | t.format(self.public_ip_address), 103 | t.format(self.private_dns_name), 104 | t.format(self.public_dns_name)): 105 | return True 106 | return False 107 | 108 | def set_info_from_etcd(self, info): 109 | # by convention member.name == instance.id 110 | if self.instance_id and info['name'] and self.instance_id != info['name']: 111 | return 112 | 113 | addr = self.get_addr_from_urls(info['peerURLs']) 114 | # when you add new member it doesn't have name, but we can match it by peer_addr 115 | if not addr: 116 | return 117 | elif re.match(r'^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$', addr): 118 | if (self.private_ip_address or self.public_ip_address) and \ 119 | addr not in (self.private_ip_address, self.public_ip_address): 120 | return 121 | self._addr = addr 122 | else: 123 | if (self.private_dns_name or 
self.public_dns_name) and \ 124 | addr not in (self.private_dns_name, self.public_dns_name): 125 | return 126 | self._dns = addr 127 | 128 | self.id = info['id'] 129 | self.name = info['name'] 130 | self.client_urls = info['clientURLs'] 131 | self.peer_urls = info['peerURLs'] 132 | 133 | @staticmethod 134 | def generate_url(addr, port): 135 | return 'http://{}:{}'.format(addr, port) 136 | 137 | def get_client_url(self, endpoint=''): 138 | url = self.generate_url(self.advertise_addr, self.client_port) 139 | if endpoint: 140 | url += self.API_VERSION + endpoint 141 | return url 142 | 143 | @property 144 | def addr(self): 145 | return EtcdCluster.is_multiregion() and self.public_ip_address or self.private_ip_address 146 | 147 | @property 148 | def dns(self): 149 | return EtcdCluster.is_multiregion() and self.public_dns_name or self.private_dns_name 150 | 151 | @property 152 | def advertise_addr(self): 153 | return EtcdCluster.is_multiregion() and self.public_dns_name or self.private_ip_address 154 | 155 | @property 156 | def peer_addr(self): 157 | return '{}:{}'.format(self.dns or self._dns or self._addr, self.peer_port) 158 | 159 | @property 160 | def peer_url(self): 161 | return self.peer_urls and self.peer_urls[0] or self.generate_url(self.advertise_addr, self.peer_port) 162 | 163 | def api_get(self, endpoint): 164 | url = self.get_client_url(endpoint) 165 | response = requests.get(url, timeout=self.API_TIMEOUT) 166 | logging.debug('Got response from GET %s: code=%s content=%s', url, response.status_code, response.content) 167 | return (response.json() if response.status_code == 200 else None) 168 | 169 | def api_put(self, endpoint, data): 170 | url = self.get_client_url(endpoint) 171 | response = requests.put(url, data=data) 172 | logging.debug('Got response from PUT %s %s: code=%s content=%s', url, data, response.status_code, 173 | response.content) 174 | return (response.json() if response.status_code == 201 else None) 175 | 176 | def api_post(self, endpoint, 
data): 177 | url = self.get_client_url(endpoint) 178 | headers = {'Content-type': 'application/json'} 179 | data = json.dumps(data) 180 | response = requests.post(url, data=data, headers=headers) 181 | logging.debug('Got response from POST %s %s: code=%s content=%s', url, data, response.status_code, 182 | response.content) 183 | return (response.json() if response.status_code == 201 else None) 184 | 185 | def api_delete(self, endpoint, data=None): 186 | url = self.get_client_url(endpoint) 187 | response = requests.delete(url, data=data) 188 | logging.debug('Got response from DELETE %s: code=%s content=%s', url, response.status_code, response.content) 189 | return response.status_code == 204 190 | 191 | def get_cluster_version(self): 192 | response = requests.get(self.get_client_url() + '/version') 193 | return response.json()['etcdcluster'] if response.status_code == 200 else None 194 | 195 | def is_leader(self): 196 | return not self.api_get('stats/leader') is None 197 | 198 | def get_leader(self): 199 | json = self.api_get('stats/self') 200 | return (json['leaderInfo']['leader'] if json else None) 201 | 202 | def get_members(self): 203 | json = self.api_get('members') 204 | return (json['members'] if json else []) 205 | 206 | def adjust_security_groups(self, action, *members): 207 | if not EtcdCluster.is_multiregion(): 208 | return 209 | 210 | for region in EtcdCluster.REGIONS: 211 | ec2 = boto3.resource('ec2', region) 212 | # stack resource from cloudformation returns the GroupName instat of the GroupID... 213 | # cloudformation = boto3.resource('cloudformation', region) 214 | # stack_resource = cloudformation.StackResource(me.cloudformation_stack, 215 | # 'EtcdSecurityGroup') 216 | # security_group = ec2.SecurityGroup(stack_resource.physical_resource_id) 217 | # .filter(...) works only with default VPC! 
218 | for sg in ec2.security_groups.all(): 219 | if sg.tags and tags_to_dict(sg.tags).get(self.CF_TAG, '') == self.cloudformation_stack: 220 | for m in members: 221 | if not m.region or m.region != region: 222 | try: 223 | getattr(sg, action)( 224 | IpProtocol='tcp', 225 | FromPort=self.client_port, 226 | ToPort=self.peer_port, 227 | CidrIp='{}/32'.format(m.addr) 228 | ) 229 | except Exception: 230 | logging.exception('Exception on %s for for %s', action, m.addr) 231 | 232 | def add_member(self, member): 233 | logging.debug('Adding new member %s:%s to cluster', member.instance_id, member.peer_url) 234 | response = self.api_post('members', {'peerURLs': [member.peer_url]}) 235 | if response: 236 | member.set_info_from_etcd(response) 237 | return True 238 | return False 239 | 240 | def delete_member(self, member): 241 | logging.debug('Removing member %s from cluster', member.id) 242 | result = self.api_delete('members/' + member.id) 243 | self.adjust_security_groups('revoke_ingress', member) 244 | return result 245 | 246 | def etcd_arguments(self, data_dir, initial_cluster, cluster_state, run_old): 247 | # common flags that always have to be set 248 | arguments = [ 249 | '-name', 250 | self.instance_id, 251 | '--data-dir', 252 | data_dir, 253 | '-listen-peer-urls', 254 | 'http://0.0.0.0:{}'.format(self.peer_port), 255 | '-initial-advertise-peer-urls', 256 | self.peer_url, 257 | '-listen-client-urls', 258 | 'http://0.0.0.0:{}'.format(self.client_port), 259 | '-advertise-client-urls', 260 | self.get_client_url(), 261 | '-initial-cluster', 262 | initial_cluster, 263 | '-initial-cluster-token', 264 | self.cloudformation_stack, 265 | '-initial-cluster-state', 266 | cluster_state 267 | ] 268 | 269 | # this section handles etcd version specific flags 270 | etcdversion = os.environ.get('ETCDVERSION_PREV' if run_old else 'ETCDVERSION') 271 | if etcdversion: 272 | etcdversion = tuple(int(x) for x in etcdversion.split('.')) 273 | # etcd >= v3.3: serve metrics on an additonal 
port 274 | if etcdversion >= (3, 3): 275 | arguments += [ 276 | '-listen-metrics-urls', 277 | 'http://0.0.0.0:{}'.format(self.metrics_port), 278 | ] 279 | if etcdversion >= (3, 4): 280 | arguments += [ 281 | '--enable-v2', 282 | ] 283 | 284 | # return final list of arguments 285 | return arguments 286 | 287 | 288 | class EtcdCluster: 289 | REGIONS = [] # more then one (1) Region if this a Multi-Region-Cluster 290 | 291 | def __init__(self, manager): 292 | self.manager = manager 293 | self.accessible_member = None 294 | self.leader_id = None 295 | self.cluster_version = None 296 | self.members = [] 297 | 298 | @property 299 | def is_upgraded(self): 300 | etcdversion = os.environ.get('ETCDVERSION') 301 | if etcdversion: 302 | etcdversion = etcdversion[:etcdversion.rfind('.') + 1] 303 | 304 | return etcdversion and self.cluster_version is not None and self.cluster_version.startswith(etcdversion) 305 | 306 | @staticmethod 307 | def is_multiregion(): 308 | return len(EtcdCluster.REGIONS) > 1 309 | 310 | @staticmethod 311 | def merge_member_lists(ec2_members, etcd_members): 312 | # we can match EC2 instance with single etcd member by comparing 'addr:peer_port' 313 | peers = {m.peer_addr: m for m in ec2_members} 314 | 315 | # iterate through list of etcd members obtained from running etcd cluster 316 | for m in etcd_members: 317 | for peer in peers.values(): 318 | if peer.addr_matches(m['peerURLs']): 319 | peer.set_info_from_etcd(m) 320 | break 321 | else: # when etcd member hasn't been found just add it into list 322 | m = EtcdMember(m) 323 | peers[m.peer_addr] = m 324 | return sorted(peers.values(), key=lambda e: e.instance_id or e.name) 325 | 326 | def load_members(self): 327 | self.accessible_member = None 328 | self.leader_id = None 329 | ec2_members = self.manager.get_autoscaling_members() 330 | etcd_members = [] 331 | 332 | # Try to connect to members of autoscaling_group group and fetch information about etcd-cluster 333 | for member in ec2_members: 334 | if 
member.instance_id != self.manager.instance_id: # Skip myself 335 | try: 336 | etcd_members = member.get_members() 337 | if etcd_members: # We've found accessible etcd member 338 | self.accessible_member = member 339 | self.leader_id = member.get_leader() # Let's ask him about leader of etcd-cluster 340 | self.cluster_version = member.get_cluster_version() # and about cluster-wide etcd version 341 | break 342 | except Exception: 343 | logging.exception('Load members from etcd') 344 | 345 | # combine both lists together 346 | self.members = self.merge_member_lists(ec2_members, etcd_members) 347 | 348 | def is_healthy(self, me): 349 | """"Check that cluster does not contain members other then from our ASG 350 | or given EC2 instance is already part of cluster""" 351 | 352 | for m in self.members: 353 | if m.name == me.instance_id: 354 | return True 355 | if not m.instance_id: 356 | logging.warning('Member id=%s name=%s is not part of ASG', m.id, m.name) 357 | logging.warning('Will wait until it would be removed from cluster by HouseKeeper job running on leader') 358 | return False 359 | if m.id and not m.name and not m.client_urls: 360 | if me.addr_matches(m.peer_urls): 361 | return True 362 | logging.warning('Member (id=%s peerURLs=%s) is registered but not yet joined', m.id, m.peer_urls) 363 | return False 364 | return True 365 | 366 | 367 | class EtcdManager: 368 | 369 | ETCD_BINARY = '/bin/etcd' 370 | DATA_DIR = 'data' 371 | NAPTIME = 30 372 | 373 | def __init__(self): 374 | self.region = None 375 | self.instance_id = None 376 | self.me = None 377 | self.etcd_pid = 0 378 | self.run_old = False 379 | self._access_granted = False 380 | 381 | def load_my_identities(self): 382 | url = 'http://169.254.169.254/latest/dynamic/instance-identity/document' 383 | response = requests.get(url) 384 | if response.status_code != 200: 385 | raise EtcdClusterException('GET %s: code=%s content=%s', url, response.status_code, response.content) 386 | json = response.json() 387 | if 
not EtcdCluster.is_multiregion(): 388 | EtcdCluster.REGIONS = [json['region']] 389 | self.region = json['region'] 390 | self.instance_id = json['instanceId'] 391 | 392 | def find_my_instance(self): 393 | if not self.instance_id or not self.region: 394 | self.load_my_identities() 395 | 396 | conn = boto3.resource('ec2', region_name=self.region) 397 | for i in conn.instances.filter(Filters=[{'Name': 'instance-id', 'Values': [self.instance_id]}]): 398 | if i.id == self.instance_id and EtcdMember.CF_TAG in tags_to_dict(i.tags): 399 | return EtcdMember(i, self.region) 400 | 401 | def get_my_instance(self): 402 | if not self.me: 403 | self.me = self.find_my_instance() 404 | return self.me 405 | 406 | def get_autoscaling_members(self): 407 | me = self.get_my_instance() 408 | members = [] 409 | for region in EtcdCluster.REGIONS: 410 | conn = boto3.resource('ec2', region_name=region) 411 | for i in conn.instances.filter(Filters=[ 412 | {'Name': 'tag:{}'.format(EtcdMember.CF_TAG), 413 | 'Values': [me.cloudformation_stack]}]): 414 | if (i.state['Name'] == 'running' and 415 | tags_to_dict(i.tags).get(EtcdMember.CF_TAG, '') == me.cloudformation_stack): 416 | m = EtcdMember(i, region) 417 | if self.region == region or m.public_ip_address: 418 | members.append(m) 419 | 420 | if not self._access_granted: 421 | me.adjust_security_groups('authorize_ingress', *members) 422 | self._access_granted = True 423 | return members 424 | 425 | def clean_data_dir(self): 426 | path = self.DATA_DIR 427 | logging.info('Removing data directory: %s', path) 428 | try: 429 | if os.path.islink(path): 430 | os.unlink(path) 431 | elif not os.path.exists(path): 432 | return 433 | elif os.path.isfile(path): 434 | os.remove(path) 435 | elif os.path.isdir(path): 436 | shutil.rmtree(path) 437 | except Exception: 438 | logging.exception('Can not remove %s', path) 439 | 440 | def register_me(self, cluster): 441 | cluster_state = 'existing' 442 | include_ec2_instances = remove_member = add_member = False 443 | 
data_exists = os.path.exists(self.DATA_DIR) 444 | if cluster.accessible_member is None: 445 | include_ec2_instances = True 446 | cluster_state = 'existing' if data_exists else 'new' 447 | logging.info('Cluster does not have accessible member yet, cluster state=%s', cluster_state) 448 | elif len(self.me.client_urls) > 0: 449 | remove_member = add_member = not data_exists 450 | logging.info('My clientURLs list is not empty: %s', self.me.client_urls) 451 | logging.info('My data directory exists=%s', data_exists) 452 | else: 453 | if self.me.id: 454 | cluster_state = 'new' if self.me.name else 'existing' 455 | logging.info('Cluster state=%s because my(id=%s, name=%s)', cluster_state, self.me.id, self.me.name) 456 | else: 457 | add_member = True 458 | logging.info('add_member = True because I am not part of cluster yet') 459 | self.clean_data_dir() 460 | 461 | if add_member or remove_member: 462 | if not cluster.leader_id: 463 | raise EtcdClusterException('Etcd cluster does not have leader yet. 
Can not add myself') 464 | if remove_member: 465 | if not cluster.accessible_member.delete_member(self.me): 466 | raise EtcdClusterException('Can not remove my old instance from etcd cluster') 467 | time.sleep(self.NAPTIME) 468 | if add_member: 469 | if not cluster.accessible_member.add_member(self.me): 470 | raise EtcdClusterException('Can not register myself in etcd cluster') 471 | time.sleep(self.NAPTIME) 472 | 473 | self.run_old = add_member and cluster_state == 'existing' and not cluster.is_upgraded 474 | 475 | peers = ','.join(['{}={}'.format(m.instance_id or m.name, m.peer_url) for m in cluster.members 476 | if (include_ec2_instances and m.instance_id) or m.peer_urls]) 477 | 478 | return self.me.etcd_arguments(self.DATA_DIR, peers, cluster_state, self.run_old) 479 | 480 | def run(self): 481 | cluster = EtcdCluster(self) 482 | while True: 483 | try: 484 | cluster.load_members() 485 | 486 | self.me = ([m for m in cluster.members if m.instance_id == self.me.instance_id] or [self.me])[0] 487 | 488 | if cluster.is_healthy(self.me): 489 | args = self.register_me(cluster) 490 | binary = self.ETCD_BINARY + ('.old' if self.run_old else '') 491 | 492 | self.etcd_pid = os.fork() 493 | if self.etcd_pid == 0: 494 | os.execv(binary, [binary] + args) 495 | 496 | logging.info('Started new %s process with pid: %s and args: %s', binary, self.etcd_pid, args) 497 | pid, status = os.waitpid(self.etcd_pid, 0) 498 | logging.warning('Process %s finished with exit code %s', pid, status >> 8) 499 | self.etcd_pid = 0 500 | except SystemExit: 501 | break 502 | except Exception: 503 | logging.exception('Exception in main loop') 504 | logging.warning('Sleeping %s seconds before next try...', self.NAPTIME) 505 | time.sleep(self.NAPTIME) 506 | 507 | 508 | class HouseKeeper(Thread): 509 | 510 | NAPTIME = 30 511 | 512 | def __init__(self, manager, hosted_zone): 513 | super(HouseKeeper, self).__init__() 514 | self.daemon = True 515 | self.manager = manager 516 | self.hosted_zone = hosted_zone 
517 | if hosted_zone: 518 | self.hosted_zone = hosted_zone.rstrip('.') + '.' 519 | self.members = {} 520 | self.unhealthy_members = {} 521 | 522 | def is_leader(self): 523 | return self.manager.me.is_leader() 524 | 525 | def acquire_lock(self): 526 | data = {'value': self.manager.instance_id, 'ttl': self.NAPTIME, 'prevExist': False} 527 | return self.manager.me.api_put('keys/_self_maintenance_lock', data=data) is not None 528 | 529 | def take_upgrade_lock(self, ttl): 530 | data = {'value': self.manager.instance_id, 'ttl': ttl, 'prevExist': False} 531 | return self.manager.me.api_put('keys/_upgrade_lock', data=data) is not None 532 | 533 | def release_upgrade_lock(self): 534 | return self.manager.me.api_delete('keys/_upgrade_lock', data={'value': self.manager.instance_id}) 535 | 536 | def check_upgrade_lock(self): 537 | return self.manager.me.api_get('keys/_upgrade_lock') is not None 538 | 539 | def members_changed(self): 540 | old_members = self.members.copy() 541 | new_members = self.manager.me.get_members() 542 | if all(old_members.pop(m['id'], None) == m for m in new_members) and not old_members: 543 | return False 544 | self.members = {m['id']: m for m in new_members} 545 | return True 546 | 547 | def cluster_unhealthy(self): 548 | process = subprocess.Popen([self.manager.ETCD_BINARY + 'ctl', 'cluster-health'], 549 | stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env={'ETCDCTL_API': '2'}) 550 | ret = any('unhealthy' in line or 'unreachable' in line for line in map(str, process.stdout)) 551 | process.wait() 552 | return ret 553 | 554 | def remove_unhealthy_members(self, autoscaling_members): 555 | for etcd_member in self.members.values(): 556 | for ec2_member in autoscaling_members: 557 | if ec2_member.addr_matches(etcd_member['peerURLs']): 558 | break 559 | else: 560 | self.manager.me.delete_member(EtcdMember(etcd_member)) 561 | 562 | def update_record(self, conn, zone_id, rtype, rname, new_value): 563 | conn.change_resource_record_sets( 564 | 
HostedZoneId=zone_id, 565 | ChangeBatch={ 566 | 'Changes': [ 567 | { 568 | 'Action': 'UPSERT', 569 | 'ResourceRecordSet': { 570 | 'Name': rname, 571 | 'Type': rtype, 572 | 'TTL': 60, 573 | 'ResourceRecords': new_value, 574 | } 575 | } 576 | ] 577 | } 578 | ) 579 | 580 | def update_route53_records(self, autoscaling_members): 581 | conn = boto3.client('route53', region_name=self.manager.region) 582 | zones = conn.list_hosted_zones_by_name(DNSName=self.hosted_zone) 583 | zone = ([z for z in zones['HostedZones'] if z['Name'] == self.hosted_zone] or [None])[0] 584 | if not zone: 585 | raise Exception('Failed to find hosted_zone {}'.format(self.hosted_zone)) 586 | zone_id = zone['Id'] 587 | 588 | stack_version = self.manager.me.cloudformation_stack.split('-')[-1] 589 | 590 | members = [] 591 | for ec2_member in autoscaling_members: 592 | for etcd_member in self.members.values(): 593 | if ec2_member.addr_matches(etcd_member['peerURLs']): 594 | members.append(ec2_member) 595 | break 596 | 597 | record_name = '_etcd-server._tcp.{}.{}'.format(stack_version, self.hosted_zone) 598 | new_record = [{'Value': ' '.join(map(str, [1, 1, i.peer_port, i.dns]))} for i in members] 599 | self.update_record(conn, zone_id, 'SRV', record_name, new_record) 600 | 601 | record_name = '_etcd-client._tcp.{}.{}'.format(stack_version, self.hosted_zone) 602 | new_record = [{'Value': ' '.join(map(str, [1, 1, i.client_port, i.dns]))} for i in members] 603 | self.update_record(conn, zone_id, 'SRV', record_name, new_record) 604 | 605 | new_record = [{'Value': i.addr} for i in members] 606 | self.update_record(conn, zone_id, 'A', 'etcd-server.{}.{}'.format(stack_version, self.hosted_zone), new_record) 607 | 608 | def run(self): 609 | update_required = False 610 | while True: 611 | try: 612 | if self.manager.etcd_pid != 0 and self.is_leader(): 613 | if (update_required or self.members_changed() or self.cluster_unhealthy()) \ 614 | and not self.check_upgrade_lock() and self.acquire_lock(): 615 | 
update_required = True 616 | autoscaling_members = self.manager.get_autoscaling_members() 617 | if autoscaling_members: 618 | self.remove_unhealthy_members(autoscaling_members) 619 | self.update_route53_records(autoscaling_members) 620 | update_required = False 621 | else: 622 | self.members = {} 623 | update_required = False 624 | if self.manager.etcd_pid != 0 and self.manager.run_old \ 625 | and not self.cluster_unhealthy() and self.take_upgrade_lock(600): 626 | logging.info('Performing upgrade of member %s', self.manager.me.name) 627 | os.kill(self.manager.etcd_pid, signal.SIGTERM) 628 | for _ in range(0, 59): 629 | time.sleep(10) 630 | if self.cluster_unhealthy(): 631 | logging.info('upgrade: cluster is unhealthy...') 632 | else: 633 | logging.info('upgrade complete, removing upgrade lock') 634 | self.release_upgrade_lock() 635 | break 636 | else: 637 | logging.error('upgrade: giving up...') 638 | except Exception: 639 | logging.exception('Exception in HouseKeeper main loop') 640 | logging.debug('Sleeping %s seconds...', self.NAPTIME) 641 | time.sleep(self.NAPTIME) 642 | 643 | 644 | __ignore_sigterm = False 645 | 646 | 647 | def sigterm_handler(signo, stack_frame): 648 | global __ignore_sigterm 649 | if not __ignore_sigterm: 650 | __ignore_sigterm = True 651 | sys.exit() 652 | 653 | 654 | def main(): 655 | signal.signal(signal.SIGTERM, sigterm_handler) 656 | logging.basicConfig(format='%(levelname)-6s %(asctime)s - %(message)s', level=logging.INFO) 657 | hosted_zone = os.environ.get('HOSTED_ZONE', None) 658 | if os.environ.get('ACTIVE_REGIONS', '') != '': 659 | EtcdCluster.REGIONS = os.environ.get('ACTIVE_REGIONS').split(',') 660 | 661 | manager = EtcdManager() 662 | try: 663 | house_keeper = HouseKeeper(manager, hosted_zone) 664 | house_keeper.start() 665 | manager.run() 666 | finally: 667 | logging.info('Trying to remove myself from cluster...') 668 | try: 669 | cluster = EtcdCluster(manager) 670 | cluster.load_members() 671 | if cluster.accessible_member: 
672 | if [m for m in cluster.members if m.name == manager.me.instance_id]\ 673 | and not cluster.accessible_member.delete_member(manager.me): 674 | logging.error('Can not remove myself from cluster') 675 | else: 676 | logging.error('Cluster does not have accessible member') 677 | except Exception: 678 | logging.exception('Failed to remove myself from cluster') 679 | 680 | 681 | if __name__ == '__main__': 682 | main() 683 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | boto3 2 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import inspect 4 | import os 5 | import sys 6 | from setuptools.command.test import test as TestCommand 7 | from setuptools import setup 8 | 9 | __location__ = os.path.join(os.getcwd(), os.path.dirname(inspect.getfile(inspect.currentframe()))) 10 | 11 | MAIN_SCRIPT = 'etcd.py' 12 | NAME = 'stups-etcd-cluster' 13 | VERSION = '1.0' 14 | DESCRIPTION = 'Etcd cluster appliance for the STUPS (AWS) environment' 15 | LICENSE = 'Apache License Version 2.0' 16 | URL = 'https://github.com/zalando-stups/stups-etcd-cluster' 17 | AUTHOR = 'Alexander Kukushkin' 18 | AUTHOR_EMAIL = 'alexander.kukushkin@zalando.de' 19 | KEYWORDS = 'etcd cluster etcd-cluster stups aws' 20 | 21 | COVERAGE_XML = True 22 | COVERAGE_HTML = False 23 | JUNIT_XML = True 24 | 25 | # Add here all kinds of additional classifiers as defined under 26 | # https://pypi.python.org/pypi?%3Aaction=list_classifiers 27 | CLASSIFIERS = [ 28 | 'Development Status :: 4 - Beta', 29 | 'Environment :: Console', 30 | 'Intended Audience :: Developers', 31 | 'Intended Audience :: System Administrators', 32 | 'License :: OSI Approved :: Apache Software License', 33 | 'Operating System :: POSIX :: Linux', 34 | 
'Programming Language :: Python', 35 | 'Programming Language :: Python :: 2.7', 36 | 'Programming Language :: Python :: 3.4', 37 | 'Programming Language :: Python :: 3.5', 38 | 'Programming Language :: Python :: 3.6', 39 | 'Programming Language :: Python :: Implementation :: CPython', 40 | ] 41 | 42 | CONSOLE_SCRIPTS = ['etcd = etcd:main'] 43 | 44 | 45 | class PyTest(TestCommand): 46 | 47 | user_options = [('cov=', None, 'Run coverage'), ('cov-xml=', None, 'Generate junit xml report'), ('cov-html=', 48 | None, 'Generate junit html report'), ('junitxml=', None, 'Generate xml of test results')] 49 | 50 | def initialize_options(self): 51 | TestCommand.initialize_options(self) 52 | self.cov_xml = False 53 | self.cov_html = False 54 | self.junitxml = None 55 | 56 | def finalize_options(self): 57 | TestCommand.finalize_options(self) 58 | if self.cov_xml or self.cov_html: 59 | self.cov = ['--cov', MAIN_SCRIPT[:-3], '--cov-report', 'term-missing'] 60 | if self.cov_xml: 61 | self.cov.extend(['--cov-report', 'xml']) 62 | if self.cov_html: 63 | self.cov.extend(['--cov-report', 'html']) 64 | if self.junitxml is not None: 65 | self.junitxml = ['--junitxml', self.junitxml] 66 | 67 | def run_tests(self): 68 | try: 69 | import pytest 70 | except Exception: 71 | raise RuntimeError('py.test is not installed, run: pip install pytest') 72 | params = {'args': self.test_args} 73 | if self.cov: 74 | params['args'] += self.cov 75 | if self.junitxml: 76 | params['args'] += self.junitxml 77 | params['args'] += ['--doctest-modules', MAIN_SCRIPT, '-s', '-vv'] 78 | errno = pytest.main(**params) 79 | sys.exit(errno) 80 | 81 | 82 | def get_install_requirements(path): 83 | content = open(os.path.join(__location__, path)).read() 84 | return [req for req in content.split('\\n') if req != ''] 85 | 86 | 87 | def read(fname): 88 | return open(os.path.join(__location__, fname)).read() 89 | 90 | 91 | def setup_package(): 92 | # Assemble additional setup commands 93 | cmdclass = {} 94 | cmdclass['test'] 
= PyTest 95 | 96 | # Some helper variables 97 | version = os.getenv('GO_PIPELINE_LABEL', VERSION) 98 | 99 | command_options = {'test': {'test_suite': ('setup.py', 'tests')}} 100 | if JUNIT_XML: 101 | command_options['test']['junitxml'] = 'setup.py', 'junit.xml' 102 | if COVERAGE_XML: 103 | command_options['test']['cov_xml'] = 'setup.py', True 104 | if COVERAGE_HTML: 105 | command_options['test']['cov_html'] = 'setup.py', True 106 | 107 | setup( 108 | name=NAME, 109 | version=version, 110 | url=URL, 111 | author=AUTHOR, 112 | author_email=AUTHOR_EMAIL, 113 | description=DESCRIPTION, 114 | license=LICENSE, 115 | keywords=KEYWORDS, 116 | long_description=read('README.md'), 117 | classifiers=CLASSIFIERS, 118 | test_suite='tests', 119 | packages=[], 120 | install_requires=get_install_requirements('requirements.txt'), 121 | cmdclass=cmdclass, 122 | tests_require=['pytest-cov', 'pytest', 'mock', 'flake8'], 123 | command_options=command_options, 124 | entry_points={'console_scripts': CONSOLE_SCRIPTS}, 125 | ) 126 | 127 | 128 | if __name__ == '__main__': 129 | setup_package() 130 | -------------------------------------------------------------------------------- /tests/test_etcd_cluster.py: -------------------------------------------------------------------------------- 1 | import os 2 | import unittest 3 | 4 | from etcd import EtcdCluster, EtcdManager, EtcdMember 5 | from mock import Mock, patch 6 | from test_etcd_manager import requests_get, instances 7 | 8 | 9 | class TestEtcdCluster(unittest.TestCase): 10 | 11 | @patch('requests.get', requests_get) 12 | @patch('boto3.resource') 13 | def setUp(self, res): 14 | res.return_value.instances.filter.return_value = instances() 15 | self.manager = EtcdManager() 16 | self.manager.instance_id = 'i-deadbeef3' 17 | self.manager.region = 'eu-west-1' 18 | EtcdCluster.REGIONS = ['eu-west-1'] 19 | self.cluster = EtcdCluster(self.manager) 20 | self.cluster.load_members() 21 | self.assertFalse(EtcdCluster.is_multiregion()) 22 | 
os.environ['ETCDVERSION'] = '3.2.10' 23 | 24 | @patch('boto3.resource') 25 | def test_load_members(self, res): 26 | res.return_value.instances.filter.return_value = instances() 27 | self.assertEqual(len(self.cluster.members), 4) 28 | with patch('requests.get', Mock(side_effect=Exception)): 29 | self.cluster.load_members() 30 | 31 | def test_is_healthy(self): 32 | private_ip_address = '127.0.0.22' 33 | private_dns_name = 'ip-{}.eu-west-1.compute.internal'.format(private_ip_address.replace('.', '-')) 34 | url = 'http://' + private_ip_address 35 | peer_urls = ['{}:{}'.format(url, EtcdMember.DEFAULT_PEER_PORT)] 36 | me = EtcdMember({ 37 | 'id': 'ifoobari7', 38 | 'name': 'i-sadfjhg', 39 | 'clientURLs': ['{}:{}'.format(private_ip_address, EtcdMember.DEFAULT_CLIENT_PORT)], 40 | 'peerURLs': peer_urls 41 | }) 42 | me.private_ip_address = private_ip_address 43 | self.assertFalse(self.cluster.is_healthy(me)) 44 | self.cluster.members[-1].instance_id = 'foo' 45 | self.cluster.members[-1].name = '' 46 | self.assertFalse(self.cluster.is_healthy(me)) 47 | 48 | self.cluster.members[-1].peer_urls = peer_urls 49 | self.assertTrue(self.cluster.is_healthy(me)) 50 | self.cluster.members.pop() 51 | self.assertTrue(self.cluster.is_healthy(me)) 52 | -------------------------------------------------------------------------------- /tests/test_etcd_housekeeper.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from etcd import EtcdManager, HouseKeeper 4 | from mock import Mock, patch 5 | from test_etcd_manager import instances, requests_get, requests_delete, MockResponse 6 | 7 | 8 | def requests_put(url, **kwargs): 9 | response = MockResponse() 10 | response.status_code = 201 11 | return response 12 | 13 | 14 | class Popen: 15 | 16 | def __init__(self, args, **kwargs): 17 | if args[1] != 'cluster-health': 18 | raise Exception() 19 | self.stdout = ['cluster is healthy', 'member 15a694aa6a6003f4 is healthy', 20 | 'member 
effbc38ed2b11107 is unhealthy'] 21 | 22 | def wait(self): 23 | pass 24 | 25 | 26 | class TestHouseKeeper(unittest.TestCase): 27 | 28 | @patch('requests.get', requests_get) 29 | @patch('boto3.resource') 30 | def setUp(self, res): 31 | res.return_value.instances.filter.return_value = instances() 32 | self.manager = EtcdManager() 33 | self.manager.get_my_instance() 34 | self.manager.instance_id = 'i-deadbeef3' 35 | self.manager.region = 'eu-west-1' 36 | self.keeper = HouseKeeper(self.manager, 'test.') 37 | self.members_changed = self.keeper.members_changed() 38 | 39 | @patch('requests.get', requests_get) 40 | def test_members_changed(self): 41 | self.assertTrue(self.members_changed) 42 | self.keeper.members['blabla'] = True 43 | self.assertTrue(self.keeper.members_changed()) 44 | self.assertFalse(self.keeper.members_changed()) 45 | 46 | @patch('requests.get', requests_get) 47 | def test_is_leader(self): 48 | self.assertTrue(self.keeper.is_leader()) 49 | 50 | @patch('requests.put', requests_put) 51 | def test_acquire_lock(self): 52 | self.assertTrue(self.keeper.acquire_lock()) 53 | 54 | @patch('requests.delete', requests_delete) 55 | @patch('boto3.resource') 56 | def test_remove_unhealthy_members(self, res): 57 | res.return_value.instances.filter.return_value = instances() 58 | autoscaling_members = self.manager.get_autoscaling_members() 59 | self.assertIsNone(self.keeper.remove_unhealthy_members(autoscaling_members)) 60 | 61 | @patch('boto3.resource') 62 | @patch('boto3.client') 63 | def test_update_route53_records(self, cli, res): 64 | cli.return_value.list_hosted_zones_by_name.return_value = {'HostedZones': [{'Id': '', 'Name': 'test.'}]} 65 | res.return_value.instances.filter.return_value = instances() 66 | autoscaling_members = self.manager.get_autoscaling_members() 67 | self.assertIsNone(self.keeper.update_route53_records(autoscaling_members)) 68 | self.keeper.hosted_zone = 'bla' 69 | self.assertRaises(Exception, self.keeper.update_route53_records, 
autoscaling_members) 70 | 71 | @patch('subprocess.Popen', Popen) 72 | def test_cluster_unhealthy(self): 73 | self.assertTrue(self.keeper.cluster_unhealthy()) 74 | 75 | @patch('logging.exception', Mock(side_effect=Exception)) 76 | @patch('os.kill', Mock()) 77 | @patch('time.sleep', Mock(side_effect=Exception)) 78 | @patch('requests.get', requests_get) 79 | @patch('requests.put', requests_put) 80 | @patch('requests.delete', requests_delete) 81 | @patch('subprocess.Popen', Popen) 82 | @patch('boto3.resource') 83 | @patch('boto3.client') 84 | def test_run(self, cli, res): 85 | cli.return_value.list_hosted_zones_by_name.return_value = {'HostedZones': [{'Id': '', 'Name': 'test.'}]} 86 | res.return_value.instances.filter.return_value = instances() 87 | self.assertRaises(Exception, self.keeper.run) 88 | self.keeper.manager.etcd_pid = 1 89 | self.assertRaises(Exception, self.keeper.run) 90 | self.keeper.is_leader = Mock(side_effect=Exception) 91 | self.assertRaises(Exception, self.keeper.run) 92 | with patch('time.sleep', Mock()): 93 | self.keeper.is_leader = Mock(return_value=False) 94 | self.keeper.manager.run_old = True 95 | self.keeper.cluster_unhealthy = Mock(side_effect=[False, True, False]) 96 | self.assertRaises(Exception, self.keeper.run) 97 | self.keeper.cluster_unhealthy = Mock(side_effect=[False] + [True]*100) 98 | self.assertRaises(Exception, self.keeper.run) 99 | -------------------------------------------------------------------------------- /tests/test_etcd_manager.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import unittest 4 | 5 | from etcd import EtcdCluster, EtcdClusterException, EtcdManager, EtcdMember, HouseKeeper, main, sigterm_handler 6 | from mock import Mock, patch 7 | 8 | 9 | class MockResponse: 10 | 11 | def __init__(self): 12 | self.status_code = 200 13 | self.content = '{}' 14 | 15 | def json(self): 16 | return json.loads(self.content) 17 | 18 | 19 | def requests_get(url, 
**kwargs): 20 | response = MockResponse() 21 | if url == 'http://127.0.0.7:2379/v2/members': 22 | response.content = '{"members":[]}' 23 | elif url == 'http://127.0.0.1:2379/version': 24 | response.content = '{"etcdserver":"2.3.7","etcdcluster":"2.3.0"}' 25 | elif url == 'http://127.0.0.3:2379/v2/keys/_upgrade_lock': 26 | response.status_code = 404 27 | else: 28 | response.content = \ 29 | """{"region":"eu-west-1", "instanceId": "i-deadbeef3", "leaderInfo":{"leader":"ifoobari1"},"members":[ 30 | {"id":"ifoobari1","name":"i-deadbeef1","peerURLs":["http://ip-127-0-0-1.eu-west-1.compute.internal:2380"], 31 | "clientURLs":["http://127.0.0.1:2379"]}, 32 | {"id":"ifoobari2","name":"i-deadbeef2","peerURLs":["http://ip-127-0-0-2.eu-west-1.compute.internal:2380"], 33 | "clientURLs":["http://127.0.0.2:2379"]}, 34 | {"id":"ifoobari3","name":"i-deadbeef3","peerURLs":["http://ip-127-0-0-3.eu-west-1.compute.internal:2380"], 35 | "clientURLs":["http://127.0.0.3:2379"]}, 36 | {"id":"ifoobari4","name":"i-deadbeef4","peerURLs":["http://ip-127-0-0-4.eu-west-1.compute.internal:2380"], 37 | "clientURLs":[]}]}""" 38 | return response 39 | 40 | 41 | def requests_get_multiregion(url, **kwargs): 42 | response = MockResponse() 43 | if url == 'http://ec2-52-0-0-128.eu-west-1.compute.amazonaws.com:2379/v2/members': 44 | response.content = '{"members":[]}' 45 | elif url == 'http://ec2-52-0-0-41.eu-west-1.compute.amazonaws.com:2379/version': 46 | response.content = '{"etcdserver":"2.3.7","etcdcluster":"2.3.0"}' 47 | elif url == 'http://ec2-52-0-0-43.eu-west-1.compute.amazonaws.com:2379/v2/keys/_upgrade_lock': 48 | response.status_code = 404 49 | else: 50 | response.content = \ 51 | """{"region":"eu-west-1", "instanceId": "i-deadbeef3", "leaderInfo":{"leader":"ifoobari1"},"members":[ 52 | {"id":"ifoobari1","name":"i-deadbeef1","peerURLs":["http://ec2-52-0-0-41.eu-west-1.compute.amazonaws.com:2380"], 53 | "clientURLs":["http://ec2-52-0-0-41.eu-west-1.compute.amazonaws.com:2379"]}, 54 | 
{"id":"ifoobari2","name":"i-deadbeef2","peerURLs":["http://ec2-52-0-0-42.eu-west-1.compute.amazonaws.com:2380"], 55 | "clientURLs":["http://ec2-52-0-0-42.eu-west-1.compute.amazonaws.com:2379"]}, 56 | {"id":"ifoobari3","name":"i-deadbeef3","peerURLs":["http://ec2-52-0-0-43.eu-west-1.compute.amazonaws.com:2380"], 57 | "clientURLs":["http://ec2-52-0-0-43.eu-west-1.compute.amazonaws.com:2379"]}, 58 | {"id":"ifoobari4","name":"i-deadbeef4","peerURLs":["http://ec2-52-0-0-44.eu-west-1.compute.amazonaws.com:2380"], 59 | "clientURLs":[]}, 60 | {"id":"ifoobari5","name":"i-beefcent1","peerURLs":["http://ec2-54-200-0-41.eu-central-1.compute.amazonaws.com:2380"], 61 | "clientURLs":["http://ec2-54-200-0-41.eu-central-1.compute.amazonaws.com:2379"]}, 62 | {"id":"ifoobari6","name":"i-beefcent2","peerURLs":["http://ec2-54-200-0-42.eu-central-1.compute.amazonaws.com:2380"], 63 | "clientURLs":["http://ec2-54-200-0-42.eu-central-1.compute.amazonaws.com:2379"]}, 64 | {"id":"ifoobari7","name":"i-beefcent3","peerURLs":["http://ec2-54-200-0-43.eu-central-1.compute.amazonaws.com:2380"], 65 | "clientURLs":["http://ec2-54-200-0-43.eu-central-1.compute.amazonaws.com:2379"]}]}""" 66 | return response 67 | 68 | 69 | def requests_get_bad_status(url, **kwargs): 70 | response = requests_get(url, **kwargs) 71 | response.status_code = 404 72 | return response 73 | 74 | 75 | def requests_get_bad_etcd(url, **kwargs): 76 | response = requests_get(url, **kwargs) 77 | if '//169.254.169.254/latest/' not in url: 78 | response.status_code = 404 79 | return response 80 | 81 | 82 | def requests_delete(url, **kwargs): 83 | response = MockResponse() 84 | response.status_code = (500 if url.endswith('/v2/members/ifoobari7') else 204) 85 | return response 86 | 87 | 88 | class MockReservation: 89 | 90 | def __init__(self, instance): 91 | self.instances = [instance] 92 | 93 | 94 | class MockInstance: 95 | 96 | state = {'Code': 16, 'Name': 'running'} 97 | 98 | def __init__(self, id, ip, region='eu-west-1', 
                 public_ip=None):
        # NOTE(review): continuation of MockInstance.__init__ — the `def` line
        # precedes this chunk and is not visible here.
        self.id = id
        self.private_ip_address = ip
        # Synthesize the AWS-style private DNS name,
        # e.g. '127.0.0.1' -> 'ip-127-0-0-1.eu-west-1.compute.internal'.
        self.private_dns_name = 'ip-{}.{}.compute.internal'.format(ip.replace('.', '-'), region)
        self.public_ip_address = public_ip
        # Public DNS is only synthesized when a public IP was supplied;
        # otherwise `public_ip and ...` short-circuits to the falsy public_ip.
        self.public_dns_name = public_ip and \
            'ec2-{}.{}.compute.amazonaws.com'.format(public_ip.replace('.', '-'), region)
        # Tags mimic what CloudFormation/auto-scaling attach to real instances.
        self.tags = [
            {'Key': 'aws:cloudformation:stack-name', 'Value': 'etc-cluster'},
            {'Key': 'aws:autoscaling:groupName', 'Value': 'etc-cluster-postgres'}
        ]


def instances():
    """Return three single-region mock instances with private IPs only."""
    return [
        MockInstance('i-deadbeef1', '127.0.0.1'),
        MockInstance('i-deadbeef2', '127.0.0.2'),
        MockInstance('i-deadbeef3', '127.0.0.3')
    ]


def public_instances():
    """Return six mock instances across two regions, each with a public IP."""
    return [
        MockInstance('i-deadbeef1', '127.0.0.1', 'eu-west-1', '52.0.0.41'),
        MockInstance('i-deadbeef2', '127.0.0.2', 'eu-west-1', '52.0.0.42'),
        MockInstance('i-deadbeef3', '127.0.0.3', 'eu-west-1', '52.0.0.43'),
        MockInstance('i-beefcent1', '127.0.0.1', 'eu-central-1', '54.200.0.41'),
        MockInstance('i-beefcent2', '127.0.0.2', 'eu-central-1', '54.200.0.42'),
        MockInstance('i-beefcent3', '127.0.0.3', 'eu-central-1', '54.200.0.43')
    ]


class SleepException(Exception):
    """Raised from a mocked time.sleep() to break out of otherwise infinite loops."""
    pass


class TestEtcdManager(unittest.TestCase):
    """Tests for EtcdManager with boto3 and all HTTP access mocked out."""

    @patch('boto3.resource')
    @patch('requests.get', requests_get)
    def setUp(self, res):
        # Patches are active only for the duration of setUp itself; the innermost
        # decorator (requests.get) supplies a replacement so only `res` is injected.
        self.manager = EtcdManager()
        res.return_value.instances.filter.return_value = instances()
        self.manager.find_my_instance()

    @patch('boto3.resource')
    def test_get_autoscaling_members(self, res):
        """All three mock instances are reported; identity comes from setUp."""
        res.return_value.instances.filter.return_value = instances()
        self.assertEqual(len(self.manager.get_autoscaling_members()), 3)
        self.assertEqual(self.manager.instance_id, 'i-deadbeef3')
        self.assertEqual(self.manager.region, 'eu-west-1')

    def test_clean_data_dir(self):
        """clean_data_dir() must cope with every on-disk shape of DATA_DIR.

        Exercised in order: missing path, empty directory, regular file,
        dangling symlink with os.unlink failing (the error must not
        propagate), then the symlink again with unlink working.
        """
        self.manager.clean_data_dir()
        os.mkdir(self.manager.DATA_DIR)
        self.manager.clean_data_dir()
        open(self.manager.DATA_DIR, 'w').close()
        self.manager.clean_data_dir()
        os.symlink('foo', self.manager.DATA_DIR)
        with patch('os.unlink', Mock(side_effect=Exception)):
            self.manager.clean_data_dir()
        self.manager.clean_data_dir()

    @patch('requests.get', requests_get_bad_status)
    def test_load_my_identities(self):
        """A bad HTTP status from the metadata endpoint must raise."""
        self.assertRaises(EtcdClusterException, self.manager.load_my_identities)

    @patch('time.sleep', Mock())
    @patch('requests.get', requests_get)
    @patch('boto3.resource')
    def test_register_me(self, res):
        """Walk register_me() through its success and failure branches."""
        res.return_value.instances.filter.return_value = instances()
        cluster = EtcdCluster(self.manager)
        cluster.load_members()
        # Already a member (id set): registration is a no-op success.
        self.manager.me.id = '1'
        self.manager.register_me(cluster)

        # Not a member and add_member fails -> exception.
        self.manager.me.id = None
        cluster.accessible_member.add_member = Mock(return_value=False)
        self.assertRaises(EtcdClusterException, self.manager.register_me, cluster)

        # Stale member (client_urls set) whose removal fails -> exception.
        self.manager.me.client_urls = ['a']
        cluster.accessible_member.delete_member = Mock(return_value=False)
        self.assertRaises(EtcdClusterException, self.manager.register_me, cluster)

        # Both delete and add succeed -> registration succeeds.
        cluster.accessible_member.delete_member = cluster.accessible_member.add_member = Mock(return_value=True)
        self.manager.register_me(cluster)

        # A cluster without a leader cannot be joined.
        cluster.leader_id = None
        self.assertRaises(EtcdClusterException, self.manager.register_me, cluster)

        # No accessible member at all is tolerated.
        cluster.accessible_member = None
        self.manager.register_me(cluster)

    @patch('boto3.resource')
    @patch('os.path.exists', Mock(return_value=True))
    @patch('os.execv', Mock(side_effect=Exception))
    @patch('os.fork', Mock(return_value=0))
    @patch('time.sleep', Mock(side_effect=SleepException))
    @patch('requests.get', requests_get)
    def test_run(self, res):
        """run() loops forever; the mocked sleep aborts each scenario."""
        res.return_value.instances.filter.return_value = instances()
        # Child path: fork() returns 0 and execv fails, so the loop reaches
        # the mocked time.sleep, which raises SleepException.
        self.assertRaises(SleepException, self.manager.run)

        # Parent path: fork() returns a pid and waitpid reports the child.
        with patch('os.fork', Mock(return_value=1)):
            with patch('os.waitpid', Mock(return_value=(1, 0))):
                self.assertRaises(SleepException, self.manager.run)
            # SystemExit from load_members terminates run() cleanly.
            with patch.object(EtcdCluster, 'load_members', Mock(side_effect=SystemExit)):
                self.manager.run()


class TestMain(unittest.TestCase):
    """Tests for the module-level entry points of etcd.py."""

    def test_sigterm_handler(self):
        """SIGTERM handler must terminate the process via SystemExit."""
        self.assertRaises(SystemExit, sigterm_handler, None, None)

    @patch('requests.get', requests_get)
    @patch('requests.delete', requests_delete)
    @patch.object(HouseKeeper, 'start', Mock())
    @patch.object(EtcdMember, 'delete_member', Mock(return_value=False))
    @patch('os.fork', Mock(return_value=1))
    @patch('os.waitpid', Mock(return_value=(1, 0)))
    @patch('time.sleep', Mock(side_effect=SleepException))
    @patch('boto3.resource')
    def test_main(self, res):
        """main() must keep looping regardless of HTTP fixture behavior."""
        res.return_value.instances.filter.return_value = instances()
        # In every scenario the loop is only broken by the mocked sleep.
        self.assertRaises(SleepException, main)
        with patch('requests.get', requests_get_bad_status):
            self.assertRaises(SleepException, main)
        with patch('requests.get', requests_get_bad_etcd):
            self.assertRaises(SleepException, main)
--------------------------------------------------------------------------------
/tests/test_etcd_member.py:
--------------------------------------------------------------------------------
import json
import unittest

from etcd import EtcdMember
from mock import patch, Mock
from test_etcd_manager import requests_delete, requests_get, MockInstance, MockResponse


def requests_post(url, **kwargs):
    """Mock of requests.post for the etcd 'add member' API.

    Replies 201 with a member JSON document echoing the requested peer URL
    for the two whitelisted peer URLs, and 403 for any other URL.
    """
    response = MockResponse()
    data = json.loads(kwargs['data'])
    if data['peerURLs'][0] in ['http://ip-127-0-0-2.eu-west-1.compute.internal:2380',
                               'http://ip-127-0-0-3.eu-west-1.compute.internal:2380']:
        # 201 Created: echo back a member document containing the peer URL.
        response.status_code = 201
        response.content = '{"id":"ifoobar","name":"","peerURLs":["' + data['peerURLs'][0] + '"],"clientURLs":[""]}'
    else:
        # Any other peer URL is rejected by the mocked API.
        response.status_code = 403
    return response


class TestEtcdMember(unittest.TestCase):
    """Tests for EtcdMember built both from an EC2 instance and from etcd JSON."""

    def setUp(self):
        # One member constructed from a mock EC2 instance...
        self.ec2 = MockInstance('i-foobar', '127.0.0.1')
        self.ec2_member = EtcdMember(self.ec2)
        # ...and one constructed from an etcd members-API style dict.
        self.etcd = {
            'id': 'deadbeef',
            'name': 'i-foobar2',
            'clientURLs': [],
            'peerURLs': ['http://ip-127-0-0-2.eu-west-1.compute.internal:{}'.format(EtcdMember.DEFAULT_PEER_PORT)],
        }
        self.etcd_member = EtcdMember(self.etcd)

    def test_get_addr_from_urls(self):
        """Host extraction works with and without a port; malformed URL -> None."""
        self.assertEqual(self.ec2_member.get_addr_from_urls(['http://1.2:3']), '1.2')
        self.assertEqual(self.ec2_member.get_addr_from_urls(['http://1.2']), '1.2')
        # 'http//1.2' lacks the '://' separator and must yield None.
        self.assertIsNone(self.ec2_member.get_addr_from_urls(['http//1.2']))

    def test_set_info_from_ec2_instance(self):
        """Filling from EC2 both with and without a pre-existing name."""
        self.etcd_member.set_info_from_ec2_instance(self.ec2)
        self.etcd_member.name = ''
        self.etcd_member.set_info_from_ec2_instance(self.ec2)

    def test_set_info_from_etcd(self):
        """Filling from etcd JSON with matching, mismatching and empty peer URLs."""
        self.ec2_member.set_info_from_etcd(self.etcd)
        self.etcd['name'] = 'i-foobar'
        self.ec2_member.set_info_from_etcd(self.etcd)
        self.etcd['peerURLs'] = ['http://127.0.0.100:{}'.format(EtcdMember.DEFAULT_PEER_PORT)]
        self.ec2_member.set_info_from_etcd(self.etcd)
        self.etcd['peerURLs'] = ['http://127.0.0.1:{}'.format(EtcdMember.DEFAULT_PEER_PORT)]
        self.ec2_member.set_info_from_etcd(self.etcd)
        self.etcd['peerURLs'] = []
        self.ec2_member.set_info_from_etcd(self.etcd)

    @patch('requests.post', requests_post)
    def test_add_member(self):
        """add_member succeeds for a whitelisted peer URL and fails otherwise."""
        member = EtcdMember({
            'id': '',
            'name': '',
            'clientURLs': [],
            'peerURLs': ['http://ip-127-0-0-2.eu-west-1.compute.internal:{}'.format(EtcdMember.DEFAULT_PEER_PORT)],
        })
        self.assertTrue(self.ec2_member.add_member(member))
        # replace('2', '4') rewrites every '2' in the URL, so it no longer
        # matches the mock's whitelist and the POST returns 403.
        member.peer_urls[0] = member.peer_urls[0].replace('2', '4')
        self.assertFalse(self.ec2_member.add_member(member))

    @patch('requests.get', requests_get)
    def test_is_leader(self):
        """The requests_get fixture reports this member as the leader."""
        self.assertTrue(self.ec2_member.is_leader())

    @patch('boto3.resource')
    @patch('requests.delete', requests_delete)
    @patch('etcd.EtcdCluster.is_multiregion', Mock(return_value=True))
    def test_delete_member(self, res):
        """Multi-region delete: security-group revoke failure is tolerated,
        but the mocked DELETE reports failure for this member id."""
        sg = Mock()
        sg.tags = [
            {'Key': 'aws:cloudformation:stack-name', 'Value': 'etc-cluster'},
            {'Key': 'aws:autoscaling:groupName', 'Value': 'etc-cluster-postgres'}
        ]
        # revoke_ingress blowing up must not abort the deletion attempt.
        sg.revoke_ingress.side_effect = Exception
        res.return_value.security_groups.all.return_value = [sg]
        member = EtcdMember({
            'id': 'ifoobari7',
            'name': 'i-sadfjhg',
            'clientURLs': ['http://127.0.0.2:{}'.format(EtcdMember.DEFAULT_CLIENT_PORT)],
            'peerURLs': ['http://ip-127-0-0-2.eu-west-1.compute.internal:{}'.format(EtcdMember.DEFAULT_PEER_PORT)]
        })
        member.peer_urls[0] = member.peer_urls[0].replace('2', '1')
        # requests_delete (from test_etcd_manager, not shown here) is
        # presumably wired to fail for id 'ifoobari7' — hence False.
        self.assertFalse(self.ec2_member.delete_member(member))

    @patch('requests.get', requests_get)
    def test_get_leader(self):
        """Leader id comes from the mocked members/leader endpoint."""
        self.ec2_member.private_ip_address = '127.0.0.7'
        self.assertEqual(self.ec2_member.get_leader(), 'ifoobari1')

    @patch('requests.get', requests_get)
    def test_get_members(self):
        """An unknown address yields an empty member list from the fixture."""
        self.ec2_member.private_ip_address = '127.0.0.7'
        self.assertEqual(self.ec2_member.get_members(), [])
--------------------------------------------------------------------------------
/tests/test_etcd_multiregion_cluster.py:
--------------------------------------------------------------------------------
import unittest

from etcd import EtcdCluster, EtcdManager, EtcdMember
from mock import Mock, patch
from test_etcd_manager import requests_get_multiregion, public_instances

class TestEtcdMultiRegionCluster(unittest.TestCase):
    """Tests for EtcdCluster spanning two regions (eu-west-1 + eu-central-1)."""

    @patch('requests.get', requests_get_multiregion)
    @patch('boto3.resource')
    def setUp(self, res):
        res.return_value.instances.filter.return_value = public_instances()
        self.manager = EtcdManager()
        # Pin identity instead of discovering it, then enable two regions.
        self.manager.instance_id = 'i-deadbeef3'
        self.manager.region = 'eu-west-1'
        EtcdCluster.REGIONS = ['eu-west-1', 'eu-central-1']
        self.cluster = EtcdCluster(self.manager)
        self.cluster.load_members()
        self.assertTrue(EtcdCluster.is_multiregion())

    @patch('boto3.resource')
    def test_load_members(self, res):
        res.return_value.instances.filter.return_value = public_instances()
        # 7 members = union of autoscaling instances and the members reported
        # by the requests_get_multiregion fixture (defined in
        # test_etcd_manager, not visible here) — TODO confirm the breakdown.
        self.assertEqual(len(self.cluster.members), 7)
        # Errors while querying members must not make load_members raise.
        with patch('requests.get', Mock(side_effect=Exception)):
            self.cluster.load_members()

    def test_is_healthy(self):
        """Health check against a member identified by its public DNS name."""
        public_dns_name = 'ec2-52-0-0-128.eu-west-1.compute.amazonaws.com'
        url = 'http://' + public_dns_name
        peer_urls = ['{}:{}'.format(url, EtcdMember.DEFAULT_PEER_PORT)]
        me = EtcdMember({
            'id': 'ifoobari0815',
            'name': 'i-sadfjhg',
            'clientURLs': ['{}:{}'.format(url, EtcdMember.DEFAULT_CLIENT_PORT)],
            'peerURLs': peer_urls
        })
        me.public_dns_name = public_dns_name
        # Unknown member -> unhealthy.
        self.assertFalse(self.cluster.is_healthy(me))
        # A nameless member with foreign instance_id still blocks health...
        self.cluster.members[-1].instance_id = 'foo'
        self.cluster.members[-1].name = ''
        self.assertFalse(self.cluster.is_healthy(me))
        # ...until its peer URLs match ours.
        self.cluster.members[-1].peer_urls = peer_urls
        self.assertTrue(self.cluster.is_healthy(me))
        # Removing that member keeps the cluster healthy.
        self.cluster.members.pop()
        self.assertTrue(self.cluster.is_healthy(me))
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
[flake8]
max-line-length=120
--------------------------------------------------------------------------------