├── netmet ├── __init__.py ├── client │ ├── __init__.py │ ├── conf.py │ ├── main.py │ └── collector.py ├── server │ ├── __init__.py │ ├── utils │ │ ├── __init__.py │ │ └── eslock.py │ ├── deployer.py │ ├── mesher.py │ ├── main.py │ └── db.py ├── utils │ ├── __init__.py │ ├── asyncer.py │ ├── status.py │ ├── worker.py │ ├── secure.py │ ├── pusher.py │ └── ping.py ├── config.py ├── exceptions.py └── run.py ├── tests ├── __init__.py └── unit │ ├── __init__.py │ ├── utils │ ├── __init__.py │ ├── test_asyncer.py │ ├── test_worker.py │ ├── test_status.py │ ├── test_ping.py │ ├── test_pusher.py │ └── test_secure.py │ ├── client │ ├── __init__.py │ ├── test_conf.py │ └── test_collector.py │ ├── server │ ├── __init__.py │ ├── utils │ │ ├── __init__.py │ │ └── test_eslock.py │ ├── test_mesher.py │ └── test_db.py │ ├── test.py │ └── test_run.py ├── media ├── netmet-logo.png ├── netmet-deployment.png ├── netmet-architecture.png ├── netmet-deployment-logical.png └── netmet-deployment-update.png ├── ansible ├── run.yml ├── roles │ ├── common │ │ ├── templates │ │ │ └── docker.json.j2 │ │ └── tasks │ │ │ └── main.yml │ ├── grafana │ │ ├── templates │ │ │ └── grafana.conf.j2 │ │ └── tasks │ │ │ └── main.yml │ ├── elastic │ │ ├── templates │ │ │ ├── log4j2.properties │ │ │ ├── elasticsearch.yml │ │ │ └── jvm.options │ │ └── tasks │ │ │ └── main.yml │ ├── controller │ │ └── tasks │ │ │ └── main.yml │ ├── nginx │ │ ├── templates │ │ │ └── netmet.conf.j2 │ │ └── tasks │ │ │ └── main.yml │ ├── netmet │ │ └── tasks │ │ │ └── main.yml │ └── netmet_config │ │ ├── tasks │ │ └── main.yml │ │ └── templates │ │ └── config.json.j2 ├── ansible.cfg ├── terminate.yml ├── group_vars │ └── example.yml ├── inventory │ └── example.ini ├── check.yml └── deploy.yml ├── test-requirements.txt ├── requirements.txt ├── Dockerfile ├── upgrades ├── README.md ├── 0003_upgrade_config_v1_v2.py ├── 0002_rename_south_north_to_north_south_type.py └── 0001_upgrade_data_v1_v2.py ├── setup.py ├── tox.ini ├── 
.gitignore ├── docs ├── elastic.md └── security.md ├── README.md └── LICENSE /netmet/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /netmet/client/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /netmet/server/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /netmet/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/unit/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /netmet/server/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/unit/client/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/unit/server/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /tests/unit/server/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /media/netmet-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/godaddy/netmet/master/media/netmet-logo.png -------------------------------------------------------------------------------- /media/netmet-deployment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/godaddy/netmet/master/media/netmet-deployment.png -------------------------------------------------------------------------------- /media/netmet-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/godaddy/netmet/master/media/netmet-architecture.png -------------------------------------------------------------------------------- /ansible/run.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - include: check.yml 4 | 5 | - hosts: controller 6 | roles: 7 | - netmet_config -------------------------------------------------------------------------------- /media/netmet-deployment-logical.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/godaddy/netmet/master/media/netmet-deployment-logical.png -------------------------------------------------------------------------------- /media/netmet-deployment-update.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/godaddy/netmet/master/media/netmet-deployment-update.png -------------------------------------------------------------------------------- 
# Copyright 2017: GoDaddy Inc.

"""Tiny process-wide key/value store for runtime configuration."""

# Module-level storage shared by all callers within the process.
_DATA = {}

# Sentinel distinguishing "no default supplied" from "default=None".
_MISSING = object()


def set(key, value):
    """Store *value* under *key*, overwriting any previous value.

    NOTE: intentionally shadows the ``set`` builtin inside this module;
    callers use it as ``config.set(...)``.
    """
    _DATA[key] = value


def get(key, default=_MISSING):
    """Return the value stored under *key*.

    :param key: lookup key previously passed to :func:`set`.
    :param default: value returned when *key* is absent. When omitted,
        a missing key raises KeyError, preserving the original behavior.
    :raises KeyError: if *key* is missing and no *default* was given.
    """
    if default is _MISSING:
        return _DATA[key]
    return _DATA.get(key, default)
retry_files_enabled = False 5 | 6 | [ssh_connection] 7 | 8 | ssh_args = -C -o ControlMaster=auto -o ControlPersist=60s -o ForwardAgent=yes 9 | control_path = %(directory)s/%%h-%%p-%%r 10 | -------------------------------------------------------------------------------- /ansible/roles/grafana/templates/grafana.conf.j2: -------------------------------------------------------------------------------- 1 | upstream grafana { 2 | server localhost:3000 ; 3 | } 4 | 5 | server { 6 | listen 80; 7 | server_name localhost; 8 | 9 | location / { 10 | proxy_pass http://grafana; 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM alpine:latest 2 | MAINTAINER Sushant Bhadkamkar 3 | RUN apk add --update \ 4 | build-base \ 5 | linux-headers \ 6 | python \ 7 | python-dev \ 8 | py-pip 9 | 10 | COPY . /app 11 | WORKDIR /app 12 | 13 | RUN pip install -r requirements.txt \ 14 | && python setup.py install 15 | 16 | EXPOSE 5000 17 | -------------------------------------------------------------------------------- /ansible/roles/elastic/templates/log4j2.properties: -------------------------------------------------------------------------------- 1 | status = error 2 | 3 | appender.console.type = Console 4 | appender.console.name = console 5 | appender.console.layout.type = PatternLayout 6 | appender.console.layout.pattern = [%d{ISO8601}][%-5p][%-25c{1.}] %marker%m%n 7 | 8 | rootLogger.level = info 9 | rootLogger.appenderRef.console.ref = console -------------------------------------------------------------------------------- /ansible/roles/elastic/templates/elasticsearch.yml: -------------------------------------------------------------------------------- 1 | cluster.name: oss 2 | node.name: node-{{host}} 3 | node.attr.host: {{host}} 4 | discovery.zen.ping.unicast.hosts: [{% for host in elastic_hosts %} "{{hostvars[host].inventory_hostname}}", 
- Upgrade all clients
registry_pwd:
deploy-netmet-client-here ip="1.2.3.4" az="availability zone name" dc="data center name"
- name: Refresh Netmet server configuration
--- 2 | 3 | - name: Pull NGINX docker image 4 | docker_image: 5 | name: nginx 6 | tag: alpine 7 | become: true 8 | 9 | - name: Create nginx conf directory 10 | file: 11 | path: nginx_conf 12 | state: directory 13 | mode: 0755 14 | 15 | - name: Generate and copy nginx netmet.conf template 16 | vars: 17 | netmet_servers: "{{groups['netmet_servers']}}" 18 | netmet_port: "{{netmet_server_port}}" 19 | template: 20 | src: netmet.conf.j2 21 | dest: "{{ ansible_env.HOME }}/nginx_conf/netmet.conf" 22 | 23 | - name: Run Nginx container 24 | docker_container: 25 | name: nginx 26 | image: nginx:alpine 27 | network_mode: host 28 | recreate: yes 29 | restart_policy: unless-stopped 30 | volumes: 31 | - "{{ ansible_env.HOME }}/nginx_conf/:/etc/nginx/conf.d:rw" 32 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | minversion = 1.6 3 | skipsdist = True 4 | envlist = py27,pep8 5 | 6 | [testenv] 7 | setenv = VIRTUAL_ENV={envdir} 8 | LANG=en_US.UTF-8 9 | LANGUAGE=en_US:en 10 | LC_ALL=C 11 | PYTHONHASHSEED=0 12 | TOX_ENV_NAME={envname} 13 | whitelist_externals = find 14 | deps = -r{toxinidir}/requirements.txt 15 | -r{toxinidir}/test-requirements.txt 16 | install_command = pip install -U {opts} {packages} 17 | usedevelop = True 18 | commands = 19 | find . 
-type f -name "*.pyc" -delete 20 | py.test --junit-xml=test-results.xml --durations=10 "tests/unit" {posargs} 21 | distribute = false 22 | basepython = python2.7 23 | 24 | [testenv:pep8] 25 | commands = flake8 26 | distribute = false 27 | 28 | [testenv:cover] 29 | commands = py.test --cov=netmet tests/unit/ --cov-report=html 30 | 31 | [testenv:venv] 32 | commands = {posargs} 33 | 34 | [flake8] 35 | show-source = true 36 | ignore = H102 37 | exclude=.venv,.git,.tox,dist,doc,*lib/python*,*egg,tools,build,setup.py 38 | -------------------------------------------------------------------------------- /ansible/roles/netmet_config/templates/config.json.j2: -------------------------------------------------------------------------------- 1 | { 2 | "deployment": { 3 | "static": { 4 | "clients": [ 5 | {% for host in netmet_clients %}{ 6 | "host": "{{hostvars[host].inventory_hostname}}", 7 | "ip": "{{hostvars[host].ip}}", 8 | "port": {{netmet_port}}, 9 | "az": "{{hostvars[host].az}}", 10 | "dc": "{{hostvars[host].dc}}" 11 | }{% if not loop.last %},{%endif%} 12 | 13 | {% endfor %} 14 | 15 | ] 16 | } 17 | }, 18 | "external": [ 19 | {% for host in netmet_external %}{ 20 | "dest": "{{hostvars[host].dest}}", 21 | "protocol": "{{hostvars[host].protocol}}", 22 | "period": {{hostvars[host].period}}, 23 | "timeout": {{hostvars[host].timeout}} 24 | }{% if not loop.last %},{%endif%} 25 | 26 | {% endfor %} 27 | 28 | ], 29 | "mesher": { 30 | "full_mesh": {} 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /netmet/server/utils/eslock.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017: GoDaddy Inc. 
import logging

from netmet import exceptions
from netmet.server import db


LOG = logging.getLogger(__name__)


class Glock(object):
    """Global lock used as a context manager.

    Acquisition and release are delegated to ``db.get().lock_acquire`` /
    ``lock_release``; the lock expires after *ttl* seconds.
    """

    def __init__(self, name, ttl=10):
        """Create the lock handle (nothing is acquired yet).

        :param name: unique lock name.
        :param ttl: lock time-to-live in seconds.
        """
        self.name = name
        self.acquired = False
        # Bug fix: the ttl argument was previously ignored (hard-coded 10).
        self.ttl = ttl

    def __enter__(self):
        if self.acquired:
            raise exceptions.GlobalLockException("Lock already in use %s"
                                                 % self.name)
        if db.get().lock_acquire(self.name, self.ttl):
            self.acquired = True
        else:
            raise exceptions.GlobalLockException("Can't lock %s" % self.name)

    def __exit__(self, exception_type, exception_value, traceback):
        # Best-effort release: failure is logged, never raised, so any
        # in-flight exception propagates unchanged.
        # Consistency fix: use the module logger (LOG) instead of the
        # root logger via logging.warning.
        if not db.get().lock_release(self.name):
            LOG.warning("Can't release lock %(name)s."
                        % {"name": self.name})

        self.acquired = False
import six


class NetmetException(Exception):
    """Base class for all netmet exceptions.

    Subclasses override ``msg_fmt``; the final message is produced by
    %-interpolating the constructor keyword arguments into that format.
    """

    msg_fmt = "%(message)s"

    def __init__(self, message=None, **kwargs):
        # Kept for introspection by callers/handlers.
        self.kwargs = kwargs

        # Only inject "message" when the format actually references it, so
        # subclasses with fully custom formats don't need to accept it.
        if "%(message)s" in self.msg_fmt:
            kwargs.update({"message": message})

        super(NetmetException, self).__init__(self.msg_fmt % kwargs)

    def format_message(self):
        """Return the formatted message as text (py2/py3 compatible)."""
        return six.text_type(self)


class GlobalLockException(NetmetException):
    msg_fmt = "Global Lock Exception: %(message)s"


class DBNotInitialized(NetmetException):
    msg_fmt = "Try to use DB before it's initialized: %(message)s"


class DBRecordNotFound(NetmetException):
    # NOTE: expects a "record" kwarg rather than "message".
    msg_fmt = "Didn't find record in DB: %(record)s"


class DBConflict(NetmetException):
    msg_fmt = "DB Conflict: %(message)s"


class DBInitFailure(NetmetException):
    # NOTE: expects both "elastic" and "message" kwargs.
    msg_fmt = "Can't initialize DB %(elastic)s: %(message)s"
import time

from netmet.utils import asyncer
from tests.unit import test


class AsyncerTestCase(test.TestCase):
    """Unit tests for netmet.utils.asyncer."""

    def tearDown(self):
        # Ensure no background threads leak between tests.
        asyncer.die()
        super(AsyncerTestCase, self).tearDown()

    def test_asyncer_regular_call(self):
        # A decorated function still works as a plain synchronous call.

        @asyncer.asyncme
        def method(a, b=2):
            return a + b

        self.assertEqual(4, method(2))
        self.assertEqual(7, method(3, b=4))
        # Synchronous calls must not register any worker threads.
        self.assertEqual([], asyncer._THREADS)

    def test_asyncer_async_call(self):
        s = []

        @asyncer.asyncme
        def method(a):
            time.sleep(a)
            s.append(a)

        method.async_call(0.2)
        method.async_call(0.1)

        # Each async_call spawns its own daemon thread.
        self.assertEqual(2, len(asyncer._THREADS))
        asyncer.die()  # joins all threads and resets the registry
        self.assertEqual(0, len(asyncer._THREADS))
        # The shorter sleep finishes first, so the order is deterministic.
        self.assertEqual([0.1, 0.2], s)

    def test_die_empty(self):
        # die() on a clean state must be a harmless no-op.
        asyncer.die()
become: true 45 | 46 | - name: Add docker-py (required by ansible docker plugin) 47 | pip: 48 | name: docker-py 49 | version: 1.10.6 50 | become: true -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .tests_result/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | coverage.xml 45 | *,cover 46 | 47 | # Translations 48 | *.mo 49 | *.pot 50 | 51 | 52 | # Flask stuff: 53 | instance/ 54 | .webassets-cache 55 | 56 | # Sphinx documentation 57 | docs/_build/ 58 | 59 | # PyBuilder 60 | target/ 61 | 62 | # IPython Notebook 63 | .ipynb_checkpoints 64 | 65 | # pyenv 66 | .python-version 67 | 68 | 69 | # dotenv 70 | .env 71 | 72 | # virtualenv 73 | venv/ 74 | ENV/ 75 | 76 | # Spyder project settings 77 | .spyderproject 78 | 79 | # Rope project settings 80 | .ropeproject 81 | -------------------------------------------------------------------------------- /tests/unit/server/utils/test_eslock.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017: GoDaddy Inc. 
import elasticsearch
import mock

from netmet import exceptions
from netmet.server import db
from netmet.server.utils import eslock
from tests.unit import test


class EslockTestCase(test.TestCase):
    """Unit tests for the Elasticsearch-backed global lock (Glock)."""

    def test_init(self):
        g = eslock.Glock("some_name")
        self.assertEqual("some_name", g.name)
        self.assertEqual(10, g.ttl)  # default ttl
        self.assertFalse(g.acquired)

    @mock.patch("netmet.server.utils.eslock.db.get")
    def test_lock_acquired(self, mock_get):
        # Wire up a DB instance with a mocked elastic client so that
        # lock_acquire() succeeds.
        db_ = db.DB()
        db_.elastic = mock.MagicMock()
        db_.own_url = "upsis"
        mock_get.return_value = db_

        g = eslock.Glock("some_name")
        with g:
            self.assertTrue(g.acquired)
            # Re-entering an already-held lock must fail loudly.
            self.assertRaises(exceptions.GlobalLockException, g.__enter__)

        # Leaving the context releases the lock.
        self.assertFalse(g.acquired)

    @mock.patch("netmet.server.utils.eslock.db.get")
    def test_lock_failed(self, mock_get):
        db_ = db.DB()
        db_.own_url = "upsis"
        db_.elastic = mock.MagicMock()
        # Simulate elastic failing to create the lock index, which makes
        # lock_acquire() report failure.
        db_.elastic.indices.create.side_effect = (
            elasticsearch.exceptions.ElasticsearchException)
        mock_get.return_value = db_

        g = eslock.Glock("some_name")
        self.assertRaises(exceptions.GlobalLockException, g.__enter__)
* Rollover is triggered by either of two conditions: the index is older than one day, or the index has more than 10kk elements.
29 | 30 | ### Index Size 31 | 32 | * Count of docs: `(types_of_traffic * netmet_clients² / period) per second` 33 | * Max size of index: `10kk docs` 34 | * Size of doc: `500 bytes` 35 | 36 | ### Load calculation 37 | 38 | Input 39 | * 34 clients 40 | * 2 types of traffic (ICMP, HTTP) 41 | * period = 5 seconds 42 | * push_data_period = Every 10 seconds netmet client sends bulk of data to netmet server 43 | 44 | Count of documents / day 45 | * 2 * 34 * 34 * (60 / 5) * 60 * 24 = `~40kk documents per day` 46 | * It's about ~20 GB of data. 47 | 48 | Count of requests to netmet server 49 | * clients / push_data_period = 34 / 10 = `~3.5 / second` 50 | 51 | Count/Size of requests to elastic 52 | * clients / push_data_to_server_period = 34 / 10 = `~3.5 / second` 53 | * clients * types_of_traffic * push_data_period / period = 34 * 2 * 10 / 5 = `136 docs` in every bulk 54 | -------------------------------------------------------------------------------- /upgrades/0003_upgrade_config_v1_v2.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017: GoDaddy Inc. 
2 | 3 | import json 4 | import sys 5 | 6 | import elasticsearch 7 | import elasticsearch.helpers 8 | 9 | 10 | def upgrade(elastic, dry_run=False): 11 | elastic = elasticsearch.Elasticsearch(elastic) 12 | print(json.dumps(elastic.info(), indent=2)) 13 | 14 | if dry_run: 15 | print("Exit from dry mode") 16 | return 17 | 18 | body = [] 19 | for hit in elasticsearch.helpers.scan(elastic, 20 | index="netmet_catalog", 21 | doc_type="config"): 22 | 23 | config = json.loads(hit["_source"]["config"]) 24 | if "static" in config: 25 | print("Updating record %s" % hit["_id"]) 26 | new_config = json.dumps({ 27 | "deployment": config, 28 | "mesher": {"full_mesh": {}}, 29 | "external": [] 30 | }) 31 | 32 | body.append(json.dumps({"update": {"_id": hit["_id"]}})) 33 | body.append(json.dumps({"doc": {"config": new_config}})) 34 | 35 | if body: 36 | elastic.bulk(index="netmet_catalog", doc_type="config", 37 | body="\n".join(body)) 38 | print("Upgrade finished. %s records changed" % str(len(body) / 2)) 39 | else: 40 | print("Everything is up to date.") 41 | 42 | 43 | def main(): 44 | if (len(sys.argv) == 1 45 | or len(sys.argv) > 3 46 | or len(sys.argv) == 3 and sys.argv[2] != "--check"): 47 | print("Invalid input. Usage:") 48 | print("python 0003_upgrade_config_v1_v2.py [--check]") 49 | return 1 50 | else: 51 | upgrade(sys.argv[1], dry_run=len(sys.argv) == 3) 52 | 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /upgrades/0002_rename_south_north_to_north_south_type.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017: GoDaddy Inc. 
2 | 3 | import sys 4 | 5 | import elasticsearch 6 | import requests 7 | 8 | 9 | def upgrade(elastic_url, dry_run=False): 10 | elastic = elasticsearch.Elasticsearch(elastic_url) 11 | print(elastic.info()) 12 | 13 | if dry_run: 14 | print("Exit from dry mode") 15 | return 16 | 17 | mapping = { 18 | "dynamic": "strict", 19 | "properties": { 20 | "client_src.host": {"type": "keyword"}, 21 | "client_src.ip": {"type": "ip"}, 22 | "client_src.port": {"type": "integer"}, 23 | "client_src.hypervisor": {"type": "keyword"}, 24 | "client_src.az": {"type": "keyword"}, 25 | "client_src.dc": {"type": "keyword"}, 26 | "dest": {"type": "keyword"}, 27 | "protocol": {"type": "keyword"}, 28 | "timestamp": {"type": "date"}, 29 | "transmitted": {"type": "integer"}, 30 | "packet_size": {"type": "integer"}, 31 | "lost": {"type": "integer"}, 32 | "latency": {"type": "float"}, 33 | "ret_code": {"type": "integer"}, 34 | "events": {"type": "keyword"} 35 | } 36 | } 37 | 38 | requests.delete("%s/*/south-north" % elastic_url) 39 | elastic.indices.put_mapping( 40 | index="netmet_data_v2-*", doc_type="north-south", body=mapping) 41 | 42 | 43 | def main(): 44 | if (len(sys.argv) == 1 45 | or len(sys.argv) > 3 46 | or len(sys.argv) == 3 and sys.argv[2] != "--check"): 47 | print("Invalid input. Usage:") 48 | print("python 0002_rename_south_north_to_north_south.py " 49 | "[--check]") 50 | return 1 51 | else: 52 | upgrade(sys.argv[1], dry_run=len(sys.argv) == 3) 53 | 54 | 55 | if __name__ == "__main__": 56 | main() 57 | -------------------------------------------------------------------------------- /netmet/utils/status.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017: GoDaddy Inc. 
2 | 3 | import copy 4 | import datetime 5 | import threading 6 | 7 | import flask 8 | import monotonic 9 | from webob import dec 10 | 11 | 12 | class Stats(object): 13 | 14 | def __init__(self): 15 | self.started_at = datetime.datetime.now() 16 | self.stats = { 17 | "requests": { 18 | "total": 0, 19 | "total_duration": 0, 20 | "avg_duration": 0, 21 | "success": 0, 22 | "success_rate": 1, 23 | "per_code": {} 24 | } 25 | } 26 | self.lock = threading.Lock() 27 | 28 | def count_request(self, status_code, duration): 29 | with self.lock: 30 | s = self.stats["requests"] 31 | s["total"] += 1 32 | s["total_duration"] += duration 33 | if status_code < 500: 34 | s["success"] += 1 35 | s["success_rate"] = s["success"] / float(s["total"]) 36 | s["avg_duration"] = s["total_duration"] / float(s["total"]) 37 | s["per_code"].setdefault(status_code, 0) 38 | s["per_code"][status_code] += 1 39 | 40 | def status(self): 41 | return { 42 | "stats": copy.deepcopy(self.stats), 43 | "started_at": self.started_at.isoformat(), 44 | "runtime": (datetime.datetime.now() - self.started_at).seconds 45 | } 46 | 47 | 48 | class StatusMiddleware(object): 49 | 50 | def __init__(self, flask_app): 51 | self.app = flask_app.wsgi_app 52 | self.stats = Stats() 53 | 54 | @flask_app.route("/status", methods=["GET"]) 55 | def status(): 56 | return flask.jsonify(self.stats.status()), 200 57 | 58 | @dec.wsgify 59 | def __call__(self, request): 60 | started_at = monotonic.monotonic() 61 | response = request.get_response(self.app) 62 | self.stats.count_request(response.status_code, 63 | monotonic.monotonic() - started_at) 64 | return response 65 | -------------------------------------------------------------------------------- /netmet/utils/worker.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017: GoDaddy Inc. 
2 | 3 | import logging 4 | import threading 5 | 6 | import futurist 7 | 8 | 9 | LOG = logging.getLogger(__name__) 10 | 11 | 12 | class LonelyWorker(object): 13 | _self = None 14 | _lock = threading.Lock() 15 | _period = 60 16 | 17 | def __init__(self): 18 | """Do not call this method directly. Call create() instead.""" 19 | 20 | @classmethod 21 | def create(cls, callback_after_job=None): 22 | with cls._lock: 23 | if not cls._self: 24 | self = cls() 25 | cls._self = self 26 | self._worker = futurist.ThreadPoolExecutor() 27 | self._death = threading.Event() 28 | self._worker.submit(cls._self._periodic_worker) 29 | self._force_update = False 30 | self._callback_after_job = callback_after_job or (lambda: True) 31 | 32 | @classmethod 33 | def get(cls): 34 | return cls._self 35 | 36 | @classmethod 37 | def force_update(cls): 38 | if cls._self: 39 | cls._self._force_update = True 40 | 41 | @classmethod 42 | def destroy(cls): 43 | with cls._lock: 44 | if cls._self is not None: 45 | if not cls._self._death.is_set(): 46 | cls._self._death.set() 47 | cls._self._worker.shutdown() 48 | cls._self = None 49 | 50 | def _periodic_worker(self): 51 | while not self._death.is_set(): 52 | try: 53 | if self._job(): 54 | self._callback_after_job() 55 | 56 | t = 0 57 | while t < self._period: 58 | if self._force_update: 59 | self._force_update = False 60 | break 61 | else: 62 | wait = min(self._period / 10.0, 1.0) 63 | t += wait 64 | self._death.wait(wait) 65 | 66 | except Exception: 67 | LOG.exception("LonelyWorker fails to do peridoic duties %s" 68 | % self) 69 | -------------------------------------------------------------------------------- /tests/unit/utils/test_worker.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017: Godaddy Inc. 
2 | 3 | import time 4 | 5 | from netmet.utils import worker 6 | from tests.unit import test 7 | 8 | 9 | class LonelyWorkerTestCase(test.TestCase): 10 | 11 | def tearDown(self): 12 | super(LonelyWorkerTestCase, self).tearDown() 13 | worker.LonelyWorker.destroy() 14 | 15 | def test_get_not_initalized(self): 16 | self.assertIsNone(worker.LonelyWorker.get()) 17 | 18 | def test_create_and_get(self): 19 | worker.LonelyWorker.create() 20 | self.assertIsInstance(worker.LonelyWorker.get(), worker.LonelyWorker) 21 | 22 | def test_singletone(self): 23 | worker.LonelyWorker.create() 24 | first = worker.LonelyWorker.get() 25 | worker.LonelyWorker.create() 26 | second = worker.LonelyWorker.get() 27 | self.assertIs(first, second) 28 | 29 | def test_force_update(self): 30 | # check it doesn't fail if not inited 31 | worker.LonelyWorker.force_update() 32 | 33 | worker.LonelyWorker.create() 34 | worker.LonelyWorker.force_update() 35 | self.assertTrue(worker.LonelyWorker.get()._force_update) 36 | 37 | def test_destroy(self): 38 | worker.LonelyWorker.create() 39 | worker.LonelyWorker.destroy() 40 | self.assertIsNone(worker.LonelyWorker.get()) 41 | 42 | def test_periodic_worker(self): 43 | 44 | class LonelyWorkerInt(worker.LonelyWorker): 45 | _period = 0.1 46 | 47 | def _job(self): 48 | if not getattr(self, "counter", False): 49 | self.counter = 1 50 | else: 51 | self.counter += 1 52 | return True 53 | 54 | class AfterJob(object): 55 | 56 | def __init__(self): 57 | self.counter = 0 58 | 59 | def job(self): 60 | self.counter += 1 61 | 62 | try: 63 | after_job = AfterJob() 64 | LonelyWorkerInt.create(callback_after_job=after_job.job) 65 | time.sleep(0.01) 66 | self.assertEqual(1, LonelyWorkerInt.get().counter) 67 | self.assertEqual(1, after_job.counter) 68 | time.sleep(0.23) 69 | self.assertEqual(3, LonelyWorkerInt.get().counter) 70 | self.assertEqual(3, after_job.counter) 71 | finally: 72 | LonelyWorkerInt.destroy() 73 | 
-------------------------------------------------------------------------------- /ansible/roles/elastic/templates/jvm.options: -------------------------------------------------------------------------------- 1 | ## JVM configuration 2 | 3 | ################################################################ 4 | ## IMPORTANT: JVM heap size 5 | ################################################################ 6 | ## 7 | ## You should always set the min and max JVM heap 8 | ## size to the same value. For example, to set 9 | ## the heap to 4 GB, set: 10 | ## 11 | ## -Xms4g 12 | ## -Xmx4g 13 | ## 14 | ## See https://www.elastic.co/guide/en/elasticsearch/reference/current/heap-size.html 15 | ## for more information 16 | ## 17 | ################################################################ 18 | 19 | # Xms represents the initial size of total heap space 20 | # Xmx represents the maximum size of total heap space 21 | 22 | -Xms{{heap_size}} 23 | -Xmx{{heap_size}} 24 | 25 | ################################################################ 26 | ## Expert settings 27 | ################################################################ 28 | ## 29 | ## All settings below this section are considered 30 | ## expert settings. 
Don't tamper with them unless 31 | ## you understand what you are doing 32 | ## 33 | ################################################################ 34 | 35 | ## GC configuration 36 | -XX:+UseConcMarkSweepGC 37 | -XX:CMSInitiatingOccupancyFraction=75 38 | -XX:+UseCMSInitiatingOccupancyOnly 39 | 40 | ## optimizations 41 | 42 | # disable calls to System#gc 43 | -XX:+DisableExplicitGC 44 | 45 | # pre-touch memory pages used by the JVM during initialization 46 | -XX:+AlwaysPreTouch 47 | 48 | ## basic 49 | 50 | # force the server VM (remove on 32-bit client JVMs) 51 | -server 52 | 53 | # explicitly set the stack size (reduce to 320k on 32-bit client JVMs) 54 | -Xss1m 55 | 56 | # set to headless, just in case 57 | -Djava.awt.headless=true 58 | 59 | # ensure UTF-8 encoding by default (e.g. filenames) 60 | -Dfile.encoding=UTF-8 61 | 62 | # use our provided JNA always versus the system one 63 | -Djna.nosys=true 64 | 65 | # use old-style file permissions on JDK9 66 | -Djdk.io.permissionsUseCanonicalPath=true 67 | 68 | # flags to configure Netty 69 | -Dio.netty.noUnsafe=true 70 | -Dio.netty.noKeySetOptimization=true 71 | -Dio.netty.recycler.maxCapacityPerThread=0 72 | 73 | # log4j 2 74 | -Dlog4j.shutdownHookEnabled=false 75 | -Dlog4j2.disable.jmx=true 76 | -Dlog4j.skipJansi=true 77 | 78 | ## heap dumps 79 | 80 | # generate a heap dump when an allocation from the Java heap fails 81 | # heap dumps are created in the working directory of the JVM 82 | -XX:+HeapDumpOnOutOfMemoryError 83 | 84 | -------------------------------------------------------------------------------- /ansible/roles/elastic/tasks/main.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - name: Update pam limits 4 | become: yes 5 | pam_limits: 6 | domain: "{{ ansible_user }}" 7 | limit_type: "{{ item['limit_type'] }}" 8 | limit_item: "{{ item['limit_item'] }}" 9 | value: "{{ item['limit_value'] }}" 10 | with_items: 11 | - limit_type: soft 12 | limit_item: 
nofile 13 | limit_value: 65536 14 | - limit_type: hard 15 | limit_item: nofile 16 | limit_value: 65536 17 | - limit_type: soft 18 | limit_item: memlock 19 | limit_value: unlimited 20 | - limit_type: hard 21 | limit_item: memlock 22 | limit_value: unlimited 23 | 24 | - name: Set vm.max_map_count to 262144 25 | become: yes 26 | sysctl: 27 | name: vm.max_map_count 28 | value: 262144 29 | state: present 30 | reload: yes 31 | 32 | - name: Create elastic conf directory 33 | file: 34 | path: elastic_conf/scripts 35 | state: directory 36 | mode: 0755 37 | 38 | - name: Create elastic data directory 39 | file: 40 | path: elastic_data 41 | state: directory 42 | mode: 0755 43 | 44 | - name: Generate jvm.options 45 | vars: 46 | heap_size: 8g 47 | template: 48 | src: jvm.options 49 | dest: "{{ ansible_env.HOME }}/elastic_conf/jvm.options" 50 | register: jvm_conf 51 | 52 | - name: Generate log4j2.properties 53 | template: 54 | src: log4j2.properties 55 | dest: "{{ ansible_env.HOME }}/elastic_conf/log4j2.properties" 56 | register: log4j2 57 | 58 | - name: Generate elasticsearch.yml 59 | vars: 60 | host: "{{inventory_hostname}}" 61 | elastic_hosts: "{{groups['elastic'] + groups['elastic_deploy']}}" 62 | template: 63 | src: elasticsearch.yml 64 | dest: "{{ ansible_env.HOME }}/elastic_conf/elasticsearch.yml" 65 | register: elastic 66 | 67 | - name: Pull Elastic docker image 68 | become: yes 69 | docker_image: 70 | name: docker.elastic.co/elasticsearch/elasticsearch 71 | tag: 5.2.2 72 | 73 | - name: Run Elastic 74 | become: yes 75 | docker_container: 76 | name: elasticsearch 77 | image: docker.elastic.co/elasticsearch/elasticsearch:5.2.2 78 | network_mode: host 79 | state: started 80 | recreate: yes 81 | restart_policy: unless-stopped 82 | volumes: 83 | - "{{ ansible_env.HOME }}/elastic_conf:/usr/share/elasticsearch/config:rw" 84 | - "{{ ansible_env.HOME }}/elastic_data:/usr/share/elasticsearch/data:rw" 85 | 
-------------------------------------------------------------------------------- /ansible/deploy.yml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - hosts: controller 4 | roles: 5 | - common 6 | - controller 7 | 8 | - hosts: nginx 9 | roles: 10 | - common 11 | - nginx 12 | 13 | - hosts: elastic_deploy 14 | serial: 1 15 | roles: 16 | - common 17 | - elastic 18 | 19 | - hosts: grafana 20 | roles: 21 | - common 22 | - grafana 23 | 24 | - hosts: netmet 25 | roles: 26 | - common 27 | - netmet 28 | vars_prompt: 29 | - name: "registry_pwd" 30 | prompt: "Docker registry password" 31 | private: yes 32 | 33 | - hosts: netmet_servers 34 | serial: 1 35 | tasks: 36 | - name: Run Netmet Server Container 37 | become: yes 38 | docker_container: 39 | name: netmet_server 40 | image: "{{registry_url}}/{{netmet_img_name}}:{{netmet_img_tag}}" 41 | recreate: yes 42 | restart_policy: unless-stopped 43 | network_mode: host 44 | entrypoint: "python netmet/run.py" 45 | env: 46 | APP: server 47 | PORT: "{{netmet_server_port}}" 48 | NETMET_SERVER_URL: "http://{{groups['nginx'][0]}}" 49 | NETMET_OWN_URL: "http://{{inventory_hostname}}:{{netmet_server_port}}" 50 | ELASTIC: "{{ groups['elastic']|join(',') }}" 51 | NETMET_AUTH: "{{basic_auth}}" 52 | NETMET_HMACS: "{{server_hmacs}}" 53 | 54 | - name: Wait for netmet server to start 55 | uri: 56 | url: "http://{{inventory_hostname}}:{{netmet_server_port}}" 57 | register: result 58 | until: result.status == 200 59 | retries: 60 60 | delay: 0.25 61 | 62 | - hosts: netmet_clients 63 | serial: 1 64 | tasks: 65 | - name: Create netmet run conf directory 66 | file: 67 | path: netmet_client 68 | state: directory 69 | mode: 0755 70 | 71 | - name: Run Netmet client container 72 | become: yes 73 | docker_container: 74 | name: netmet_client 75 | image: "{{registry_url}}/{{netmet_img_name}}:{{netmet_img_tag}}" 76 | recreate: yes 77 | restart_policy: unless-stopped 78 | network_mode: host 79 | entrypoint: 
"python netmet/run.py" 80 | env: 81 | APP: client 82 | PORT: "{{netmet_client_port}}" 83 | NETMET_HMACS: "{{client_hmacs}}" 84 | volumes: 85 | - "{{ ansible_env.HOME }}/netmet_client:/var/run/netmet:rw" 86 | 87 | - name: Wait for Netmet client to start 88 | uri: 89 | url: "http://{{inventory_hostname}}:{{netmet_client_port}}" 90 | register: result 91 | until: result.status == 200 92 | retries: 60 93 | delay: 0.25 94 | -------------------------------------------------------------------------------- /netmet/server/deployer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017: GoDaddy Inc. 2 | 3 | import logging 4 | 5 | import futurist 6 | import requests 7 | 8 | from netmet import exceptions 9 | from netmet.server import db 10 | from netmet.server.utils import eslock 11 | from netmet.utils import worker 12 | 13 | 14 | LOG = logging.getLogger(__name__) 15 | 16 | 17 | class Deployer(worker.LonelyWorker): 18 | 19 | def __init__(self): 20 | """Do not use this method directly. Use create() instead.""" 21 | 22 | def _job(self): 23 | get_conf = db.get().server_config_get 24 | is_applied = lambda cfg: not cfg or (cfg and cfg["applied"]) 25 | 26 | no_changes_msg = "Deployer: no changes in config detected." 
27 | 28 | try: 29 | if is_applied(get_conf()): 30 | LOG.info(no_changes_msg) 31 | else: 32 | with eslock.Glock("update_config"): 33 | config = get_conf() # Refresh config after lock 34 | if not is_applied(config): 35 | LOG.info("Deployer detect new config: " 36 | "Updating deployment") 37 | clients = db.get().clients_get() 38 | 39 | # TODO(boris-42): Add support of multi drivers 40 | new_clients = StaticDeployer().redeploy( 41 | config["config"]["deployment"]["static"], clients) 42 | 43 | db.get().clients_set(new_clients) 44 | db.get().server_config_apply(config["id"]) 45 | return True 46 | else: 47 | LOG.info(no_changes_msg) 48 | 49 | except exceptions.GlobalLockException: 50 | pass # can't accuire lock, someone else is working on it 51 | 52 | except Exception: 53 | LOG.exception("Deployer update failed") 54 | 55 | def redeploy(self, config, clients): 56 | """Should update deployment based on change in config.""" 57 | raise NotImplemented() 58 | 59 | 60 | class StaticDeployer(Deployer): 61 | 62 | def redeploy(self, config, old_clients): 63 | new_clients = config["clients"] 64 | 65 | old_idx = {c["host"]: c for c in old_clients} 66 | new_idx = {c["host"]: c for c in new_clients} 67 | 68 | for c in new_clients: 69 | c["configured"] = False 70 | 71 | unregister = ["%s:%s/api/v1/unregister" % (h, old_idx[h]["port"]) 72 | for h in old_idx if h not in new_idx] 73 | with futurist.ThreadPoolExecutor(max_workers=10) as e: 74 | e.map(requests.post, unregister) 75 | 76 | return new_clients 77 | -------------------------------------------------------------------------------- /netmet/client/conf.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017: GoDaddy Inc. 
2 | 3 | import json 4 | import logging 5 | import os 6 | 7 | import requests 8 | 9 | from netmet.utils import asyncer 10 | from netmet.utils import secure 11 | 12 | 13 | LOG = logging.getLogger(__name__) 14 | 15 | _RUNTIME_CONF_DIR = "/var/run/netmet/" 16 | _RUNTIME_CONF_FILE = _RUNTIME_CONF_DIR + "restore_api_%s" 17 | _RESTORE_API = "%(server)s/api/v1/clients/%(host)s/%(port)s" 18 | 19 | 20 | @asyncer.asyncme 21 | def restore(hmacs, port): 22 | url = restore_url_get(port) 23 | if not url: 24 | return 25 | 26 | while not restore._die.is_set(): 27 | for hmac in hmacs: 28 | try: 29 | r = requests.post( 30 | url, headers=secure.gen_hmac_headers("", hmac)) 31 | 32 | if r.status_code == 403: 33 | continue 34 | if r.status_code == 404: 35 | restore_url_clear(port) 36 | if r.status_code in [200, 404]: 37 | return 38 | 39 | except requests.exceptions.RequestException as e: 40 | LOG.warning("Netmet Server API %s is not available %s" 41 | % (url, e)) 42 | except Exception: 43 | LOG.exception("Something went wrong during the attempt " 44 | "to call netmet server to referesh config.") 45 | return 46 | 47 | if url != restore_url_get(port): 48 | break 49 | 50 | restore._die.wait(1) 51 | 52 | 53 | def restore_url_get(port): 54 | try: 55 | path = _RUNTIME_CONF_FILE % port 56 | 57 | with open(path, "rw") as f: 58 | LOG.info("Loading restore conf url from previous run: %s" % path) 59 | return json.load(f).get("refresh_conf_url", None) 60 | except IOError: 61 | LOG.info("Didn't find previous config: %s" % path) 62 | 63 | except Exception: 64 | LOG.exception("Failed to load restore_conf_url from previous run") 65 | return None 66 | 67 | 68 | def restore_url_set(netmet_server, host, port): 69 | LOG.info("Setting new netmet restore_conf_url %s" 70 | % (_RUNTIME_CONF_FILE % port)) 71 | try: 72 | if not os.path.exists(_RUNTIME_CONF_DIR): 73 | LOG.info("Creating directory: %s" % _RUNTIME_CONF_DIR) 74 | os.makedirs(_RUNTIME_CONF_DIR) 75 | 76 | with open(_RUNTIME_CONF_FILE % port, "w+") as 
f: 77 | if netmet_server: 78 | data = {"server": netmet_server, "host": host, "port": port} 79 | json.dump({"refresh_conf_url": _RESTORE_API % data}, f) 80 | else: 81 | json.dump({"refresh_conf_url": None}, f) 82 | 83 | except Exception: 84 | LOG.exception("Failed to store runtime info refresh_conf_url") 85 | 86 | 87 | def restore_url_clear(port): 88 | try: 89 | os.remove(_RUNTIME_CONF_FILE % port) 90 | except OSError: 91 | pass 92 | -------------------------------------------------------------------------------- /docs/security.md: -------------------------------------------------------------------------------- 1 | Secure Netmet Server API 2 | ======================== 3 | 4 | Netmet Server supports basic auth for any potentially dangeours method: 5 | 6 | GET /api/v2/config 7 | POST /api/v2/config 8 | POST /api/v1/events/ 9 | DELETE /api/v1/events/ 10 | POST /api/v1/events//_stop 11 | 12 | 13 | To Enable Basic Auth 14 | -------------------- 15 | 16 | Set enviorment variable 17 | 18 | NETMET_AUTH=":,:" 19 | 20 | password should have at least 1 number and 1 uppercase 1 lowercase and 21 | more then 6 characters. 22 | 23 | 24 | This is temporary soultion that is going to be replaced by full RBAC. 25 | 26 | 27 | Secure Netmet Server <-> Netmet Client Traffic 28 | ============================================== 29 | 30 | Intro 31 | ----- 32 | 33 | Type of Inteactions: 34 | 1) Netmet Server sets Netmet Config via POST /api/v2/config 35 | 2) Netmet Server disables Netmet Client via POST /api/v1/unregister 36 | 3) Netmet Client restores it's config via POST /api/clients// 37 | 4) Netmet Client sends metrics to Netmet Server via POST /api/v1/metrics 38 | 39 | 40 | In interactions are not secured anybody can perform any of this operations. 41 | Which means that one can changed configuration of client, or send own metrics 42 | back and they are going to process. 43 | 44 | In trusted enviorment it may be OK. 
45 | However, Netmet provides simple way to secure this interaction and avoid 46 | potential risks. 47 | 48 | To do that Netmet uses HMAC mechanism. 49 | Data that is send during these requests is singed with HMAC key, and other 50 | side validates HMAC signature and drops requests with invalid signature. 51 | 52 | 53 | How To Run NetMet Without HMAC 54 | ------------------------------ 55 | 56 | Set Env variable: 57 | 58 | NETMET_HMAC_SKIP=True 59 | 60 | 61 | How To Enable HMAC 62 | ------------------ 63 | 64 | Provide to Netmet Server or Client: 65 | 66 | NETMET_HMACS="key1" 67 | 68 | You can provide multiple keys, e.g. "key1,key2". First key is going to be 69 | used to sign data, both for check signature. 70 | 71 | 72 | How To Add HMAC To Existing Netmet Deployment 73 | --------------------------------------------- 74 | 75 | 1) Re run netmet server specifing: 76 | 77 | NETMET_HMAC_SKIP=True 78 | NETMET_HMACS="your_key" 79 | 80 | 2) Re run all netmet clients with 81 | 82 | NETMET_HMACS="your_key" 83 | 84 | 3) Re run all netmet servers with 85 | 86 | NETMET_HMACS="your_key" 87 | 88 | 89 | How To Perform Rolling Upgrade of Netmet HMAC Keys 90 | -------------------------------------------------- 91 | 92 | 1) Re run NetMet Servers with 93 | 94 | NETMET_HMACS="," 95 | 96 | 97 | 2) Re run NetMet Clients with 98 | 99 | NETMET_HMACS="," 100 | 101 | 3) Re run Netmet Servers with 102 | 103 | NETMET_HMACS="" 104 | 105 | 4) Re run Netmet Clients with 106 | 107 | NETMET_HMACS="" 108 | 109 | -------------------------------------------------------------------------------- /tests/unit/test_run.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017: Godaddy Inc. 
2 | 3 | import os 4 | 5 | from gevent import wsgi 6 | import mock 7 | 8 | from netmet import run 9 | from tests.unit import test 10 | 11 | 12 | class RunTestCase(test.TestCase): 13 | 14 | @mock.patch.dict(os.environ, {}) 15 | def test_parse_auth_info_none(self): 16 | self.assertEqual({}, run._parse_auth_info()) 17 | 18 | @mock.patch.dict(os.environ, {"NETMET_AUTH": "wrong_format"}) 19 | def test_parse_auth_info_invalid(self): 20 | self.assertRaises(ValueError, run._parse_auth_info) 21 | 22 | @mock.patch.dict(os.environ, {"NETMET_AUTH": "not_a_valid:ttt"}) 23 | def test_parse_auth_info_week_password(self): 24 | self.assertRaises(ValueError, run._parse_auth_info) 25 | 26 | @mock.patch.dict(os.environ, {"NETMET_AUTH": "user3:ValidPass321"}) 27 | def test_parse_auth(self): 28 | self.assertEqual({"user3": "ValidPass321"}, run._parse_auth_info()) 29 | 30 | @mock.patch.dict(os.environ, {}) 31 | def test_load_no_app(self): 32 | self.assertRaises(ValueError, run.load) 33 | 34 | @mock.patch.dict(os.environ, {"APP": "not_a_valid"}) 35 | def test_load_wrong_app(self): 36 | self.assertRaises(ValueError, run.load) 37 | 38 | @mock.patch.dict(os.environ, {"APP": "server", "HOST": "", 39 | "NETMET_HMAC_SKIP": "True"}) 40 | @mock.patch("netmet.server.main.load") 41 | def test_load_server_app(self, mock_load): 42 | http_server = run.load() 43 | mock_load.assert_called_once_with() 44 | self.assertEqual(mock_load.call_count, 1) 45 | self.assertIsInstance(http_server, wsgi.WSGIServer) 46 | self.assertEqual("", http_server.server_host) 47 | self.assertEqual(5000, http_server.server_port) 48 | 49 | @mock.patch.dict(os.environ, {"APP": "client", "HOST": "", 50 | "NETMET_HMAC_SKIP": "True"}) 51 | @mock.patch("netmet.client.main.load") 52 | def test_load_client_app(self, mock_load): 53 | http_server = run.load() 54 | mock_load.assert_called_once_with() 55 | self.assertEqual(mock_load.call_count, 1) 56 | self.assertIsInstance(http_server, wsgi.WSGIServer) 57 | self.assertEqual("", 
http_server.server_host) 58 | self.assertEqual(5000, http_server.server_port) 59 | 60 | @mock.patch.dict(os.environ, { 61 | "APP": "client", "PORT": "80", "HOST": "1.2.3.4", 62 | "NETMET_HMAC_SKIP": "True"}) 63 | @mock.patch("netmet.client.main.load") 64 | def test_load_non_default_port_and_host(self, mock_load): 65 | http_server = run.load() 66 | self.assertEqual("1.2.3.4", http_server.server_host) 67 | self.assertEqual(80, http_server.server_port) 68 | 69 | @mock.patch.dict(os.environ, {"APP": "client", "NETMET_HMAC_SKIP": "True"}) 70 | @mock.patch("netmet.run.wsgi.WSGIServer.serve_forever") 71 | @mock.patch("netmet.client.main.load") 72 | def test_run(self, mock_load, mock_serve_forever): 73 | run.run() 74 | self.assertEqual(mock_serve_forever.call_count, 1) 75 | mock_serve_forever.assert_called_once_with() 76 | -------------------------------------------------------------------------------- /tests/unit/utils/test_status.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017: Godaddy Inc. 
2 | 3 | import datetime 4 | 5 | import mock 6 | 7 | from netmet.utils import status 8 | from tests.unit import test 9 | 10 | 11 | class StatusTestCase(test.TestCase): 12 | 13 | def test_count_request(self): 14 | self.stats = status.Stats() 15 | 16 | expected = { 17 | "total": 0, 18 | "total_duration": 0, 19 | "avg_duration": 0, 20 | "success": 0, 21 | "success_rate": 1, 22 | "per_code": {} 23 | } 24 | for k, v in expected.iteritems(): 25 | self.assertEqual(self.stats.stats["requests"][k], v) 26 | 27 | self.stats.count_request(400, 1.0) 28 | self.stats.count_request(400, 2.0) 29 | self.stats.count_request(500, 2.0) 30 | self.stats.count_request(200, 3.0) 31 | 32 | expected["total"] = 4 33 | expected["success"] = 3 34 | expected["success_rate"] = 3 / 4.0 35 | expected["total_duration"] = 8.0 36 | expected["avg_duration"] = 2.0 37 | expected["per_code"] = {400: 2, 500: 1, 200: 1} 38 | for k, v in expected.iteritems(): 39 | self.assertEqual(self.stats.stats["requests"][k], v) 40 | 41 | def test_status_response(self): 42 | stats = status.Stats() 43 | status_ = stats.status() 44 | 45 | self.assertEqual(status_["started_at"], stats.started_at.isoformat()) 46 | self.assertIsInstance(status_["runtime"], int) 47 | self.assertIsInstance(status_["stats"], dict) 48 | 49 | self.assertTrue(status_ is not stats.stats) 50 | self.assertEqual(status_["stats"], stats.stats) 51 | 52 | def test_status_respnose_runtime(self): 53 | started_at = datetime.datetime(2017, 4, 10, 14, 15, 43, 572065) 54 | running_1 = datetime.datetime(2017, 4, 10, 14, 20, 46, 572065) 55 | running_2 = datetime.datetime(2017, 4, 10, 14, 20, 47, 572065) 56 | 57 | with mock.patch("netmet.utils.status.datetime.datetime") as mock_date: 58 | mock_date.now.side_effect = [started_at, running_1, running_2] 59 | 60 | self.stats = status.Stats() 61 | self.assertEqual(self.stats.started_at, started_at) 62 | self.assertEqual(mock_date.now.call_count, 1) 63 | 64 | self.assertEqual(self.stats.status()["runtime"], 303) 65 | 
self.assertEqual(mock_date.now.call_count, 2) 66 | 67 | self.assertEqual(self.stats.status()["runtime"], 304) 68 | self.assertEqual(mock_date.now.call_count, 3) 69 | 70 | 71 | class TestStatusMiddleware(test.TestCase): 72 | 73 | def test_init(self): 74 | app = mock.Mock() 75 | app.wsgi_app = mock.Mock() 76 | app.route = mock.MagicMock() 77 | middleware = status.StatusMiddleware(app) 78 | 79 | self.assertEqual(middleware.app, app.wsgi_app) 80 | app.route.assert_called_once_with("/status", methods=["GET"]) 81 | app.route.return_value.assert_called() 82 | 83 | def test_call(self): 84 | app = mock.MagicMock() 85 | request = mock.MagicMock() 86 | 87 | s = status.StatusMiddleware(app) 88 | response = s(request) 89 | self.assertEqual(response, request.get_response.return_value) 90 | request.get_response.assert_called_once_with(app.wsgi_app) 91 | self.assertEqual(s.stats.stats["requests"]["total"], 1) 92 | -------------------------------------------------------------------------------- /netmet/utils/secure.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017: GoDaddy Inc. 
# Copyright 2017: GoDaddy Inc.

import datetime
import functools
import hashlib
import hmac

import flask

from netmet import config


def generate_digest(data, hmac_key):
    """Generate a hex digest of *data* using a double HMAC-SHA384 pass.

    The hex digest of the first pass is used as the key for the second
    pass.  Text (unicode) inputs are UTF-8 encoded first, so the function
    works on both Python 2 and Python 3 and keeps producing exactly the
    same digests for byte-string input as before.
    """
    if isinstance(hmac_key, type(u"")):
        hmac_key = hmac_key.encode("utf-8")
    if isinstance(data, type(u"")):
        data = data.encode("utf-8")
    h = hmac.new(hmac_key, data, digestmod=hashlib.sha384)
    h = hmac.new(h.hexdigest().encode("utf-8"), data,
                 digestmod=hashlib.sha384)
    return h.hexdigest()


def is_valid_digest(hexdigest, data, valid_hmacs):
    """Check whether hexdigest is valid for data and any of valid_hmacs.

    :param hexdigest: hex digest that should be checked
    :param data: original data that was signed
    :param valid_hmacs: list of valid hmac keys
    :returns: True when the digest matches at least one of the keys
    """
    for valid_hmac in valid_hmacs:
        if hmac.compare_digest(hexdigest, generate_digest(data, valid_hmac)):
            return True
    return False


def gen_hmac_headers(data, hmac=None):
    """Generates and returns valid headers for HMAC auth as dicts

    Generates timestamp, places it in X-AUTH-HMAC-TIMESTAMP, appends it
    to data, and puts the resulting digest into X-AUTH-HMAC-DIGEST.

    :param data: payload to sign
    :param hmac: explicit hmac key; falls back to the first configured
        "hmac_keys" entry.  NOTE: the parameter shadows the hmac module;
        the name is kept because it is part of the public signature.
    :returns: headers dict, or {} when no key is available
    """
    if not (hmac or config.get("hmac_keys")):
        return {}

    # NOTE: "%s" (seconds since epoch) is a platform-specific strftime
    # extension (glibc); it is not available e.g. on Windows.
    timestamp = datetime.datetime.now().strftime("%s")
    headers = {}
    headers["X-AUTH-HMAC-TIMESTAMP"] = timestamp
    headers["X-AUTH-HMAC-DIGEST"] = generate_digest(
        data + timestamp, hmac or config.get("hmac_keys")[0])
    return headers


def check_hmac_auth(f):
    """Flask decorator for checking hmac auth."""

    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        if not config.get("hmac_skip_check"):
            data = flask.request.get_data()
            # Keep the raw header value: the previous str() wrapping
            # turned a missing header (None) into the truthy string
            # "None", which defeated the missing-header check below.
            digest = flask.request.headers.get("X-AUTH-HMAC-DIGEST")
            timestamp = flask.request.headers.get("X-AUTH-HMAC-TIMESTAMP")

            if not timestamp or not digest:
                msg = ("Invalid or Missing headers "
                       "X-AUTH-HMAC-DIGEST or X-AUTH-HMAC-TIMESTAMP")
                return flask.jsonify({"error": msg}), 403

            # Reject requests signed more than 30 seconds ago to limit
            # replay attacks.
            now = datetime.datetime.now().strftime("%s")
            if int(now) - int(timestamp) > 30:
                return flask.jsonify({"error": "HMAC digest expired"}), 403

            if not is_valid_digest(digest, data + timestamp,
                                   config.get("hmac_keys")):
                return flask.jsonify({"error": "Wrong or missing digest"}), 403

        return f(*args, **kwargs)

    return wrapper


def check_basic_auth(f):
    """Basic authentication checker.

    When the "users" mapping is configured, requires a matching
    username/password via HTTP Basic auth; otherwise passes through.
    """

    @functools.wraps(f)
    def decorated(*args, **kwargs):
        users = config.get("users")
        if users:
            auth = flask.request.authorization
            if not (auth and auth.username in users
                    and users[auth.username] == auth.password):
                return flask.Response(
                    "Could not verify your access level for that URL.\n"
                    "You have to login with proper credentials", 401,
                    {"WWW-Authenticate": "Basic realm=\"Login Required\""})

        return f(*args, **kwargs)

    return decorated
# Copyright 2017: GoDaddy Inc.

import collections
import json
import logging
import threading

import futurist
import monotonic
import requests


LOG = logging.getLogger(__name__)


class Pusher(object):
    """Accumulates objects and periodically POSTs them as JSON batches.

    A background thread wakes up roughly every ``period`` seconds and
    flushes the queued objects to ``url`` in batches of at most
    ``max_count`` items.  The endpoint is expected to answer 201 on
    success.
    """

    def __init__(self, url, extra_headers=None, period=10, max_count=1000,
                 dealey_between_requests=0.2, timeout=2):
        # :param url: endpoint that accepts POSTed JSON lists
        # :param extra_headers: dict of headers, or a callable taking the
        #     serialized payload and returning a headers dict (e.g. HMAC)
        # :param period: seconds between automatic flushes
        # :param max_count: max number of objects sent per request
        # :param dealey_between_requests: pause between consecutive
        #     requests within one flush.  NOTE(review): "dealey" is a typo
        #     for "delay", kept because it is part of the public signature.
        # :param timeout: per-request timeout in seconds
        self.url = url
        self.extra_headers = extra_headers
        self.period = period
        self.dealey_between_requests = dealey_between_requests
        self.timeout = timeout
        self.max_count = max_count
        self.objects = collections.deque()
        self._worker = None
        self.session = requests.session()

    def _send(self):
        """Flush queued objects in batches until the queue is drained,
        death is signaled, or too many consecutive failures occur."""
        body = []
        fails_in_row = 0
        while not self._death.is_set():
            count = len(body)
            # Top the batch up to max_count items; body may still hold
            # items retained from a previous failed attempt.
            while self.objects and count < self.max_count:
                count += 1
                body.append(self.objects.popleft())

            error_status = None
            try:
                data = json.dumps(body)
                headers = {}
                if isinstance(self.extra_headers, dict):
                    headers = self.extra_headers
                if callable(self.extra_headers):
                    # Headers may depend on the payload (e.g. HMAC digest).
                    headers = self.extra_headers(data)

                r = self.session.post(
                    self.url, data=data, headers=headers, timeout=self.timeout)
                if r.status_code == 201:
                    # Batch accepted: start the next one from scratch.
                    body = []
                    fails_in_row = 0

                error_status = r.status_code if r.status_code != 201 else None
            except requests.exceptions.RequestException as e:
                error_status = str(e)
            finally:
                if error_status:
                    fails_in_row += 1
                    LOG.warning("Can't push data to %s (status %s)"
                                % (self.url, error_status))

                # Stop once everything was sent and no full batch is
                # pending in the queue.
                if not body and len(self.objects) < self.max_count:
                    break

                if fails_in_row > 2:
                    # Give up for now and return unsent items to the head
                    # of the queue (reversed to preserve original order).
                    self.objects.extendleft(body[::-1])
                    break

                self._death.wait(self.dealey_between_requests)

    def _send_periodically(self):
        # Background loop: flush roughly every `period` seconds until
        # stop() sets the death event.
        while not self._death.is_set():
            try:
                if monotonic.monotonic() - self._started_at > self.period:
                    self._send()
                    self._started_at = monotonic.monotonic()

                # Poll in small increments so stop() is noticed quickly.
                self._death.wait(self.period / 20.0)
            except Exception:
                # If execution fails we should reset our timer
                # to not flood netmet server
                self._started_at = monotonic.monotonic()
                LOG.exception("Pusher failed")

    def add(self, item):
        """Queue one JSON-serializable object for the next flush."""
        self.objects.append(item)

    def start(self):
        """Start the background flusher thread (idempotent)."""
        if not self._worker:
            self._started_at = monotonic.monotonic()
            self._worker = futurist.ThreadPoolExecutor()
            self._death = threading.Event()
            self._worker.submit(self._send_periodically)

    def stop(self):
        """Signal the flusher to exit and wait for the thread to finish.

        NOTE(review): _worker is not reset to None here, so a stopped
        Pusher cannot be start()ed again — confirm this is intended.
        """
        if self._worker:
            self._death.set()
            self._worker.shutdown()
2 | 3 | import logging 4 | import os 5 | import signal 6 | import sys 7 | 8 | from gevent import wsgi 9 | 10 | from netmet.client import main as client_main 11 | from netmet import config 12 | from netmet.server import main as server_main 13 | from netmet.utils import asyncer 14 | 15 | 16 | LOG = logging.getLogger(__name__) 17 | 18 | 19 | def _parse_auth_info(): 20 | auth = os.getenv("NETMET_AUTH", "") 21 | 22 | if not auth: 23 | return {} 24 | 25 | users = {} 26 | for pairs in auth.split(","): 27 | user_password = pairs.split(":") 28 | 29 | if len(user_password) != 2: 30 | raise ValueError("NETMET_AUTH has wrong format at '%s'" % pairs) 31 | 32 | if user_password[0] in users: 33 | raise ValueError("NETMET_AUTH has duplicated user: '%s'" 34 | % user_password[0]) 35 | 36 | password_strength_checks = { 37 | "Password should have at least 6 symbols": lambda x: len(x) < 6, 38 | "Use upper and lower case": lambda x: x.lower() == x, 39 | "Use at least one number": lambda x: all( 40 | ord(c) < 48 and ord(c) > 57 for c in x) 41 | } 42 | 43 | user, password = user_password 44 | 45 | for reason, check in password_strength_checks.iteritems(): 46 | if check(user_password[1]): 47 | raise ValueError("NETMET_AUTH has invalid password '%s': %s " 48 | % (user_password[1], reason)) 49 | 50 | users[user_password[0]] = user_password[1] 51 | 52 | return users 53 | 54 | 55 | def _parse_hmac(): 56 | skip_check = os.getenv("NETMET_HMAC_SKIP", False) 57 | hmacs = os.getenv("NETMET_HMACS", "").strip() 58 | 59 | if not hmacs and not skip_check: 60 | raise ValueError("Set NETMET_HMAC_SKIP=True or Set NETMET_HMACS") 61 | 62 | hmacs = hmacs and hmacs.split(",") or [] 63 | if not all(hmacs): 64 | raise ValueError("One of HMAC is empty in NETMET_HMACS env variable.") 65 | 66 | return hmacs, skip_check 67 | 68 | 69 | def load(): 70 | level = logging.DEBUG if os.getenv("DEBUG") else logging.INFO 71 | logging.basicConfig(level=level, 72 | format="%(asctime)s %(levelname)-8s %(message)s", 73 | 
stream=sys.stdout) 74 | 75 | if not os.getenv("APP") or os.getenv("APP") not in ["server", "client"]: 76 | raise ValueError("Set APP env variable to 'server' or 'client'") 77 | elif os.getenv("APP") == "server": 78 | mode = server_main 79 | else: 80 | mode = client_main 81 | 82 | port = int(os.getenv("PORT", 5000)) 83 | config.set("port", port) 84 | config.set("users", _parse_auth_info()) 85 | hmacs, check_hmac = _parse_hmac() 86 | config.set("hmac_keys", hmacs) 87 | config.set("hmac_skip_check", check_hmac) 88 | 89 | app = mode.load() 90 | http_server = wsgi.WSGIServer((os.getenv("HOST", ""), port), app) 91 | 92 | def die(*args, **kwargs): 93 | LOG.info("Stopping netmet %s" % os.getenv("APP")) 94 | if os.getenv("APP") == "server": 95 | LOG.info("Stopping HTTP server") 96 | http_server.stop() 97 | LOG.info("Joining internal threads") 98 | mode.die() 99 | asyncer.die() 100 | if os.getenv("APP") == "client": 101 | LOG.info("Stopping HTTP server") 102 | http_server.stop() 103 | LOG.info("Bye Bye!") 104 | 105 | signal.signal(signal.SIGTERM, die) 106 | signal.signal(signal.SIGINT, die) 107 | return http_server 108 | 109 | 110 | def run(): 111 | load().serve_forever() 112 | 113 | 114 | if __name__ == "__main__": 115 | run() 116 | -------------------------------------------------------------------------------- /tests/unit/utils/test_ping.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017: GoDaddy Inc. 
# Copyright 2017: GoDaddy Inc.

import socket
import struct

import mock

from netmet.utils import ping
from tests.unit import test


class PingTestCase(test.TestCase):
    """Unit tests for netmet.utils.ping.Ping (raw ICMP echo helper)."""

    @mock.patch("netmet.utils.ping.socket")
    def test_init_created_socket_success_ip(self, mock_socket):
        """An IP literal is used as-is; defaults are applied."""
        mock_socket.inet_pton.return_value = "1.1.1.1"
        p = ping.Ping("1.1.1.1")
        self.assertEqual(0, p.ret_code)
        self.assertEqual("1.1.1.1", p.dest)
        self.assertEqual(1, p.timeout)
        self.assertEqual(55, p.packet_size)
        self.assertEqual("1.1.1.1", p.dest_ip)
        self.assertEqual(mock_socket.socket.return_value, p.sock)

    @mock.patch("netmet.utils.ping.socket.inet_pton")
    @mock.patch("netmet.utils.ping.socket.gethostbyname")
    @mock.patch("netmet.utils.ping.socket.socket")
    def test_init_created_socket_success_host(
            self, mock_socket, mock_gethostbyname, mock_inet_pton):
        """A non-IP destination falls back to gethostbyname() resolution."""
        mock_inet_pton.side_effect = socket.error
        mock_gethostbyname.return_value = "2.2.2.2"
        p = ping.Ping("host", packet_size=100, timeout=5)
        self.assertEqual(0, p.ret_code)
        self.assertEqual("host", p.dest)
        self.assertEqual(5, p.timeout)
        self.assertEqual(100, p.packet_size)
        self.assertEqual("2.2.2.2", p.dest_ip)
        self.assertEqual(mock_socket.return_value, p.sock)

    @mock.patch("netmet.utils.ping.socket.inet_pton")
    @mock.patch("netmet.utils.ping.socket.gethostbyname")
    def test_init_created_socket_failed_not_found(
            self, mock_gethostbyname, mock_inet_pton):
        """Unresolvable host sets the error ret_code and leaves no socket."""
        mock_inet_pton.side_effect = socket.error
        mock_gethostbyname.side_effect = socket.gaierror
        p = ping.Ping("host")
        self.assertEqual(ping.EXIT_STATUS.ERROR_HOST_NOT_FOUND, p.ret_code)
        self.assertEqual(None, p.sock)

    def test_ping(self):
        # TODO(review): implement — ping() is currently untested.
        pass

    def test_ping_recreate_socket(self):
        # TODO(review): implement — socket re-creation path is untested.
        pass

    @mock.patch("netmet.utils.ping.socket")
    def test_create_packet(self, mock_socket):
        """Packet layout: 8-byte ICMP echo header + payload of packet_size."""
        p = ping.Ping("127.0.0.1")
        packet = p._create_packet(10)
        self.assertEqual(8 + 55, len(packet))
        type_, code, checksum, id_, seq = struct.unpack("bbHHh", packet[:8])
        self.assertEqual(8, type_)  # ICMP echo request
        self.assertEqual(0, code)
        self.assertEqual(10, id_)
        self.assertEqual(1, seq)

        p = ping.Ping("127.0.0.1", packet_size=100)
        packet = p._create_packet(20)
        self.assertEqual(8 + 100, len(packet))
        type_, code, checksum, id_, seq = struct.unpack("bbHHh", packet[:8])
        self.assertEqual(8, type_)
        self.assertEqual(0, code)
        self.assertEqual(20, id_)
        self.assertEqual(1, seq)

    @mock.patch("netmet.utils.ping.socket")
    @mock.patch("netmet.utils.ping.select")
    @mock.patch("netmet.utils.ping.monotonic.monotonic")
    def test_handle_response(self, mock_monotonic, mock_select, mock_socket):
        """A matching echo reply yields the round-trip time in ms."""
        p = ping.Ping("127.0.0.1")
        id_ = 10
        resp = "_" * 20  # NOTE(boris-42) We don't check header
        # NOTE(boris-42) Check checksum (fix me please)
        resp += struct.pack("bbHHh", 0, 0, 1, id_, 1) + "Q" * p.packet_size

        p.sock.recvfrom.return_value = (resp, "addr")
        # BUGFIX: the original line used "==" (a no-op comparison) instead
        # of "=", so the select() return value was never actually stubbed.
        mock_select.select.return_value = [[p.sock], [], []]
        mock_monotonic.side_effect = [0.1, 0.2, 0.25]
        self.assertEqual(150.0, p._response_handler(id_, 0.1))

    @mock.patch("netmet.utils.ping.socket")
    @mock.patch("netmet.utils.ping.select")
    @mock.patch("netmet.utils.ping.monotonic.monotonic")
    def test_handle_response_timeout(self, mock_monotonic, mock_select,
                                     mock_socket):
        """Waiting past the deadline returns None instead of a latency."""
        p = ping.Ping("127.0.0.1")
        mock_monotonic.side_effect = [2]
        self.assertEqual(None, p._response_handler(10, 0.1))
# Copyright 2017: GoDaddy Inc.

import time

import mock

from netmet.server import db
from netmet.server import mesher
from tests.unit import test


class MesherTestCase(test.TestCase):
    """Unit tests for the Mesher background job (netmet.server.mesher)."""

    # Client-record fields used to fabricate fake client entries below.
    keys = ["ip", "port", "host", "dc", "az"]

    def tearDown(self):
        # The Mesher is a process-wide singleton; destroy it so state does
        # not leak between tests.
        super(MesherTestCase, self).tearDown()
        mesher.Mesher.destroy()

    @mock.patch("netmet.server.mesher.Mesher._job")
    def test_create(self, mock_job):
        """create() spawns the background worker which runs _job()."""
        mesher.Mesher.create("netmet_server_url")
        # NOTE(review): relies on the worker firing within 0.1s — this is
        # timing-dependent and may be flaky on a loaded machine.
        time.sleep(0.1)
        mesher.Mesher.destroy()
        self.assertEqual(1, mock_job.call_count)
        mock_job.assert_called_once_with()

    @mock.patch("netmet.server.mesher.LOG.exception")
    @mock.patch("netmet.server.db.get")
    def test_job_failed(self, mock_db, mock_log):
        """A DB failure is logged, not raised, by _job()."""
        mock_db.return_value.server_config_get.side_effect = Exception
        mesher.Mesher()._job()
        self.assertEqual(1, mock_log.call_count)
        mock_log.assert_called_once_with(mesher.Mesher.update_failed_msg)

    @mock.patch("netmet.server.mesher.LOG.info")
    @mock.patch("netmet.server.db.DB.server_config_get")
    @mock.patch("netmet.server.db.get")
    def test_job_no_config(self, mock_db_get, mock_server_config_get,
                           mock_log_info):
        """No stored config means nothing to mesh: only a log message."""
        mock_server_config_get.return_value = None
        mock_db_get.return_value = db.DB()

        mesher.Mesher()._job()
        mock_log_info.assert_called_once_with(mesher.Mesher.no_changes_msg)
        self.assertEqual(1, mock_db_get.call_count)
        self.assertEqual(1, mock_log_info.call_count)

    @mock.patch("netmet.server.mesher.LOG.info")
    @mock.patch("netmet.server.db.DB.server_config_get")
    @mock.patch("netmet.server.db.get")
    def test_job_not_applied(self, mock_db_get, mock_server_config_get,
                             mock_log_info):
        """An un-applied config is skipped by the mesher."""
        mock_server_config_get.return_value = {"applied": False}
        mock_db_get.return_value = db.DB()

        mesher.Mesher()._job()
        mock_log_info.assert_called_once_with(mesher.Mesher.no_changes_msg)
        self.assertEqual(1, mock_db_get.call_count)
        self.assertEqual(1, mock_log_info.call_count)

    @mock.patch("netmet.server.mesher.LOG.info")
    @mock.patch("netmet.server.db.DB.server_config_get")
    @mock.patch("netmet.server.db.get")
    def test_job_applied_and_meshed(self, mock_db_get, mock_server_config_get,
                                    mock_log_info):
        """An already-meshed config requires no further work."""

        mock_server_config_get.return_value = {"applied": True, "meshed": True}
        mock_db_get.return_value = db.DB()

        mesher.Mesher()._job()
        mock_log_info.assert_called_once_with(mesher.Mesher.no_changes_msg)
        self.assertEqual(1, mock_db_get.call_count)
        self.assertEqual(1, mock_log_info.call_count)

    @mock.patch("netmet.server.mesher.requests")
    @mock.patch("netmet.server.mesher.LOG")
    @mock.patch("netmet.server.db.DB.clients_get")
    @mock.patch("netmet.server.db.DB.server_config_meshed")
    @mock.patch("netmet.server.db.DB.server_config_get")
    @mock.patch("netmet.server.db.get")
    def test_job_applied_not_meshed(
            self, mock_db_get, mock_server_config_get,
            mock_server_config_meshed, mock_clients_get, mock_log,
            mock_requests):
        """Applied but not meshed: the mesher generates new client configs."""
        mock_server_config_get.return_value = {
            "id": "10", "applied": True, "meshed": False}
        db_ = db.DB()
        db_.own_url = "some_stuff"
        db_.elastic = mock.MagicMock()
        mock_db_get.return_value = db_
        # Five fake clients with all mandatory fields populated.
        mock_clients_get.return_value = [
            {k: str(i) for k in self.keys} for i in xrange(5)
        ]

        mesh = mesher.Mesher()
        mesh.netmet_server_url = "some_url"
        mesh._job()
        mock_log.info.assert_called_once_with(mesher.Mesher.new_config_msg)
        self.assertEqual(1, mock_log.info.call_count)
# Copyright 2017: Godaddy Inc.

import json
import threading
import time

import futurist
import mock
import requests

from netmet.utils import pusher
from tests.unit import test


class PusherTestCase(test.TestCase):
    """Unit tests for netmet.utils.pusher.Pusher batching/retry logic."""

    def test_init(self):
        """Constructor stores the url and tuning parameters."""
        p = pusher.Pusher("some_url", period=20, max_count=40)
        self.assertEqual("some_url", p.url)
        self.assertEqual(20, p.period)
        self.assertEqual(40, p.max_count)

    def test_add(self):
        """add() appends items to the internal queue in order."""
        p = pusher.Pusher("")
        p.add(1)
        p.add(2)
        self.assertEqual(list(p.objects), [1, 2])

    def test_send_stops(self):
        """_send() returns immediately once the death event is set."""
        p = pusher.Pusher("")
        p._death = threading.Event()
        p._death.set()
        p._send()

    @mock.patch("netmet.utils.pusher.requests.session")
    def test_send(self, mock_session):
        """_send() batches by max_count and retries a failed batch.

        Responses: 201 (batch 1 ok), connection error, 504, 201 — so the
        second batch is re-sent with identical payload until accepted.
        """
        mock_session.return_value.post.side_effect = [
            mock.Mock(status_code=201),
            requests.exceptions.RequestException,
            mock.Mock(status_code=504),
            mock.Mock(status_code=201)
        ]

        p = pusher.Pusher("http://some_url", max_count=10)
        p._death = threading.Event()
        for i in xrange(22):
            p.add(i)

        p._send()
        calls = [
            mock.call("http://some_url",
                      data=json.dumps(range(0, 10)), headers={}, timeout=2),
            mock.call("http://some_url",
                      data=json.dumps(range(10, 20)), headers={}, timeout=2),
            mock.call("http://some_url",
                      data=json.dumps(range(10, 20)), headers={}, timeout=2),
            mock.call("http://some_url",
                      data=json.dumps(range(10, 20)), headers={}, timeout=2)
        ]
        mock_session.return_value.post.assert_has_calls(calls)
        self.assertEqual(4, mock_session.return_value.post.call_count)

    @mock.patch("netmet.utils.pusher.requests.session")
    def test_send_hmac(self, mock_session):
        """A callable extra_headers is invoked to produce request headers."""
        mock_session.return_value.post.return_value = (
            mock.Mock(status_code=201))

        p = pusher.Pusher("http://some_url", timeout=5,
                          extra_headers=lambda x: {"a": "a"},
                          max_count=10)
        p._death = threading.Event()
        for i in xrange(11):
            p.add(i)

        p._send()

        mock_session.return_value.post.assert_called_once_with(
            "http://some_url",
            data=json.dumps(range(0, 10)), headers={"a": "a"}, timeout=5)
        self.assertEqual(1, mock_session.return_value.post.call_count)

    def test_send_periodically_stops(self):
        """_send_periodically() exits once the death event is set."""
        p = pusher.Pusher("")
        p._death = threading.Event()
        p._death.set()
        p._send_periodically()

    @mock.patch("netmet.utils.pusher.Pusher._send")
    def test_send_periodically(self, mock_send):
        """The loop flushes roughly once per period until stopped.

        NOTE(review): expects exactly 5 flushes within 0.55s at a 0.1s
        period — timing-dependent and potentially flaky under load.
        """

        p = pusher.Pusher("", period=0.1)
        p._death = threading.Event()

        def stop():
            time.sleep(0.55)
            p._death.set()

        e = futurist.ThreadPoolExecutor()
        e.submit(stop)

        p._send_periodically()
        self.assertEqual(5, mock_send.call_count)
        e.shutdown()

    @mock.patch("netmet.utils.pusher.Pusher._send_periodically")
    def test_start_and_stop(self, mock_send_periodically):
        """start()/stop() are idempotent; double start reuses the worker."""
        p = pusher.Pusher("", period=0.1)
        p.start()
        started_at = p._started_at
        worker = p._worker
        p.start()  # test that start() can be called 2 times
        self.assertEqual(p._started_at, started_at)
        self.assertIs(p._worker, worker)
        time.sleep(0.1)
        self.assertEqual(1, mock_send_periodically.call_count)
        mock_send_periodically.assert_called_once_with()
        p.stop()
        p.stop()  # test that stop() can be called 2 times
This means a few things:
- A cloud provider cannot rely on customer-generated tickets for downtime measurements.
- A cloud provider needs to be proactive:
  - Get alerts within seconds after any downtime occurs.
  - Have all required data for debugging in place:
    - Which Data Centers (DC), Availability Zones (AZ) & Servers are affected
    - Is it an underlay or overlay network issue
    - When the event started and when it stopped


To verify uptime requirements we developed NetMet – a tool that constantly
measures connectivity between all servers, including those in different
availability zones or even different data centers.

## Contributing

Everybody is welcome to contribute to the project.
We use the standard GitHub process with Issues & PRs.

## Architecture

![Netmet Architecture](media/netmet-architecture.png)

The client-server architecture of NetMet was designed with a clear separation
of concerns in mind:

- Clients periodically perform connectivity checks between each other
- Clients periodically perform Internet connectivity checks
- Clients send data to the server
- The server performs aggregation of data and stores it in ElasticSearch
- The server exposes an API for retrieving aggregated data to facilitate visualization of it.
40 | 41 | ## Deployment 42 | 43 | Run all netmet clients and servers: 44 | 45 | - Netmet Server: run few instances of netmet servers under HAproxy or Nginx 46 | - Netmet Client: run 1 client per 1 server that should be monitored 47 | 48 | 49 | ### Physical placement 50 | 51 | To collect all metrics needed to monitor network of Data Centers use next schema: 52 | 53 | ![Netmet Deployment](media/netmet-deployment.png) 54 | 55 | - Run few instances of Netmet servers in different regions 56 | - Run 1 instance of Netmet client per 1 server 57 | - Elasticsearch cluster mode 58 | 59 | ### Logical placement 60 | 61 | To avoid Netmet downtime use next schema: 62 | 63 | ![Netmet Placement](media/netmet-deployment-logical.png) 64 | 65 | - Netmet servers should be run under Nginx/HAproxy/Lbaas (for now) 66 | - Netmet server may use multiple Elasticsearch addresses (no need in HA) 67 | 68 | ### Install & Run 69 | 70 | To install NetMet from source you should run next command 71 | 72 | pip install . # run it from root directory 73 | 74 | After that ``netmet`` command should become available 75 | 76 | To run Netmet Server 77 | 78 | APP=server NETMET_SERVER_URL="" NETMET_OWN_URL="" ELASTIC="" PORT=5005 netmet 79 | 80 | To run Netmet Client 81 | 82 | APP=client PORT=5005 netmet 83 | 84 | ### Configure & Upgrade 85 | 86 | Netmet is meant to be very easy to configuration. 
All configuration is done 87 | via Netmet server API method POST /api/v1/config which in future is going to 88 | update installation (remove/add clients), geneate new client configurations 89 | and update clients 90 | 91 | 92 | ![Updates & Upgrades](media/netmet-deployment-update.png) 93 | 94 | To configure the Netmet use POST /api/v2/config 95 | 96 | cat > config.json <<- EOM 97 | { 98 | "deployment": { 99 | "static": { 100 | "clients": [ 101 | { 102 | "az": "test-az", 103 | "dc": "test-dc", 104 | "host": "127.0.0.1", 105 | "ip": "127.0.0.1", 106 | "port": 5001 107 | } 108 | ] 109 | } 110 | }, 111 | "external": [ 112 | {"dest": "8.8.8.8", "period": 1, "protocol": "icmp", "timeout": 0.5} 113 | ], 114 | "mesher": { 115 | "full_mesh": {} 116 | } 117 | } 118 | EOM 119 | 120 | curl -H "Content-Type: application/json" -X POST -d '@config.json' ${NETMET_SERVER_URL}/api/v2/config 121 | 122 | ## Running Tests 123 | 124 | Running test is very easy. 125 | 126 | Install tox tool 127 | 128 | pip install tox 129 | 130 | Run tox 131 | 132 | tox # runs all tests 133 | tox -e pep8 # runs only pep8 code style checks 134 | tox -e py27 # runs unit tests using python 2.7 135 | -------------------------------------------------------------------------------- /tests/unit/utils/test_secure.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017: GoDaddy Inc. 
import mock

from netmet.utils import secure
from tests.unit import test


class GenTestCase(test.TestCase):
    """Unit tests for netmet.utils.secure HMAC/basic-auth helpers."""

    def test_generate_digest(self):
        # Digest must be a plain string so it can travel in an HTTP header.
        self.assertEqual(type(secure.generate_digest("a", "b")), str)

    def test_is_valid_digest(self):
        # A digest validates as long as the signing key is among candidates.
        digest = secure.generate_digest("abc", "1")
        self.assertTrue(secure.is_valid_digest(digest, "abc", ["2", "3", "1"]))
        self.assertFalse(secure.is_valid_digest(digest, "abc", ["2", "3"]))

    def test_gen_hmac_headers(self):
        # Explicit hmac key: both auth headers are produced and the digest
        # covers data + timestamp.
        h = secure.gen_hmac_headers("d", hmac="h")
        self.assertIn("X-AUTH-HMAC-TIMESTAMP", h)
        self.assertIn("X-AUTH-HMAC-DIGEST", h)

        self.assertTrue(secure.is_valid_digest(
            h["X-AUTH-HMAC-DIGEST"], "d" + h["X-AUTH-HMAC-TIMESTAMP"], ["h"]))

    @mock.patch("netmet.config.get")
    def test_gen_hmac_headers_none(self, mock_get):
        # No keys configured -> no auth headers are generated at all.
        mock_get.return_value = []

        self.assertEqual({}, secure.gen_hmac_headers("any_data"))
        mock_get.assert_called_once_with("hmac_keys")

    @mock.patch("netmet.config.get")
    def test_get_hmac_headers_env(self, mock_get):
        # Keys taken from config: the first configured key signs the digest.
        mock_get.return_value = ["a", "b"]

        h = secure.gen_hmac_headers("d")
        self.assertIn("X-AUTH-HMAC-TIMESTAMP", h)
        self.assertIn("X-AUTH-HMAC-DIGEST", h)
        self.assertTrue(secure.is_valid_digest(
            h["X-AUTH-HMAC-DIGEST"], "d" + h["X-AUTH-HMAC-TIMESTAMP"], ["a"]))

    @mock.patch("netmet.config.get")
    def test_check_hmac_auth_skip(self, mock_get):
        # hmac_skip_check=True: the decorator is a no-op pass-through.
        mock_get.return_value = True

        @secure.check_hmac_auth
        def f(a, b):
            return a - b

        self.assertEqual(1, f(3, 2))
        mock_get.assert_called_once_with("hmac_skip_check")

    @mock.patch("netmet.utils.secure.flask")
    @mock.patch("netmet.utils.secure.config.get")
    def test_check_hmac_auth_no_header(self, mock_conf_get, mock_flask):
        # Missing auth headers -> request rejected with HTTP 403.
        mock_conf_get.return_value = False
        mock_flask.request.headers = {}

        @secure.check_hmac_auth
        def f(a, b):
            return a - b

        self.assertEqual(403, f(3, 2)[1])

    @mock.patch("netmet.utils.secure.datetime")
    @mock.patch("netmet.utils.secure.flask")
    @mock.patch("netmet.utils.secure.config.get")
    def test_check_hmac_auth_time(self, mock_conf_get, mock_flask, mock_dt):
        # Timestamp too far from "now" (41 vs 10) -> rejected with 403,
        # which guards against replayed requests.
        mock_conf_get.return_value = False
        mock_dt.datetime.now.return_value.strftime.return_value = "41"
        mock_flask.request.headers = {
            "X-AUTH-HMAC-TIMESTAMP": "10",
            "X-AUTH-HMAC-DIGEST": "b"
        }

        @secure.check_hmac_auth
        def f(a, b):
            return a - b

        self.assertEqual(403, f(3, 2)[1])
        mock_dt.datetime.now.return_value.strftime.assert_called_once_with(
            "%s")

    @mock.patch("netmet.utils.secure.datetime")
    @mock.patch("netmet.utils.secure.flask")
    @mock.patch("netmet.utils.secure.config.get")
    def test_check_hmac_auth_invalid(self, mock_conf_get, mock_flask, mock_dt):
        # Fresh timestamp but a digest that matches no configured key -> 403.
        mock_dt.datetime.now.return_value.strftime.return_value = "22"
        cfg = {"hmac_skip_check": False, "hmac_keys": ["a", "b"]}
        mock_conf_get.side_effect = lambda x: cfg[x]

        mock_flask.request.get_data.return_value = "some_data"
        mock_flask.request.headers = {
            "X-AUTH-HMAC-TIMESTAMP": "1", "X-AUTH-HMAC-DIGEST": "wrong_digest"}

        @secure.check_hmac_auth
        def f(a, b):
            return a - b

        self.assertEqual(403, f(3, 2)[1])

    @mock.patch("netmet.utils.secure.flask")
    @mock.patch("netmet.utils.secure.config.get")
    def test_check_basic_auth_invlid(self, mock_conf_get, mock_flask):
        # Users configured but no Authorization header -> a challenge
        # Response object is returned instead of the wrapped function result.
        mock_conf_get.return_value = ["users"]
        mock_flask.request.authorization = None
        mock_flask.Response = mock.MagicMock()

        @secure.check_basic_auth
        def f():
            pass

        self.assertEqual(mock_flask.Response.return_value, f())

    @mock.patch("netmet.utils.secure.config.get")
    def test_check_basic_auth_no_users(self, mock_conf_get):
        # No users configured -> basic auth is disabled, call passes through.
        mock_conf_get.return_value = []

        @secure.check_basic_auth
        def f(a, b):
            return b - a

        self.assertEqual(-1, f(3, 2))
test_check_basic_auth_no_users(self, mock_conf_get): 119 | mock_conf_get.return_value = [] 120 | 121 | @secure.check_basic_auth 122 | def f(a, b): 123 | return b - a 124 | 125 | self.assertEqual(-1, f(3, 2)) 126 | -------------------------------------------------------------------------------- /upgrades/0001_upgrade_data_v1_v2.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017: GoDaddy Inc. 2 | 3 | from __future__ import print_function 4 | 5 | import json 6 | import sys 7 | import time 8 | 9 | import elasticsearch 10 | 11 | 12 | MAPPING = { 13 | "settings": { 14 | "index": { 15 | "number_of_shards": 10, 16 | "number_of_replicas": 1 17 | } 18 | }, 19 | "mappings": { 20 | "south-north": { 21 | "dynamic": "strict", 22 | "properties": { 23 | "client_src.host": {"type": "keyword"}, 24 | "client_src.ip": {"type": "ip"}, 25 | "client_src.port": {"type": "integer"}, 26 | "client_src.hypervisor": {"type": "keyword"}, 27 | "client_src.az": {"type": "keyword"}, 28 | "client_src.dc": {"type": "keyword"}, 29 | "dest": {"type": "keyword"}, 30 | "protocol": {"type": "keyword"}, 31 | "timestamp": {"type": "date"}, 32 | "transmitted": {"type": "integer"}, 33 | "packet_size": {"type": "integer"}, 34 | "lost": {"type": "integer"}, 35 | "latency": {"type": "float"}, 36 | "ret_code": {"type": "integer"}, 37 | "events": {"type": "keyword"} 38 | } 39 | }, 40 | "east-west": { 41 | "dynamic": "strict", 42 | "properties": { 43 | "protocol": {"type": "keyword"}, 44 | "client_src.host": {"type": "keyword"}, 45 | "client_src.ip": {"type": "ip"}, 46 | "client_src.port": {"type": "integer"}, 47 | "client_src.hypervisor": {"type": "keyword"}, 48 | "client_src.az": {"type": "keyword"}, 49 | "client_src.dc": {"type": "keyword"}, 50 | "client_dest.host": {"type": "keyword"}, 51 | "client_dest.ip": {"type": "ip"}, 52 | "client_dest.port": {"type": "integer"}, 53 | "client_dest.hypervisor": {"type": "keyword"}, 54 | "client_dest.az": {"type": 
"keyword"}, 55 | "client_dest.dc": {"type": "keyword"}, 56 | "timestamp": {"type": "date"}, 57 | "packet_size": {"type": "integer"}, 58 | "transmitted": {"type": "integer"}, 59 | "lost": {"type": "integer"}, 60 | "latency": {"type": "float"}, 61 | "ret_code": {"type": "integer"}, 62 | "events": {"type": "keyword"} 63 | } 64 | } 65 | } 66 | } 67 | 68 | 69 | def upgrade(elastic, dry_run=False): 70 | elastic = elasticsearch.Elasticsearch(elastic) 71 | print(elastic.info()) 72 | 73 | all_idxs = elastic.indices.get_mapping().keys() 74 | do_for_idx = [k for k in all_idxs if k.startswith("netmet_data-") 75 | if "netmet_data_v2-%s" % k.split("-", 1)[1] 76 | not in all_idxs] 77 | 78 | print("All indexes: %s" % all_idxs) 79 | print("Reindex required for: %s" % do_for_idx) 80 | 81 | if dry_run: 82 | print("Exit from dry mode") 83 | return 84 | 85 | for source_idx in do_for_idx: 86 | target_idx = "netmet_data_v2-%s" % source_idx.split("-", 1)[1] 87 | elastic.indices.create(index=target_idx, body=MAPPING) 88 | 89 | body = { 90 | "source": {"index": source_idx}, 91 | "dest": {"index": target_idx}, 92 | "script": { 93 | "inline": "ctx._source.events = []; ctx._source.remove('mac')" 94 | } 95 | } 96 | task_id = elastic.reindex(body=json.dumps(body), 97 | requests_per_second=5000, 98 | wait_for_completion=False)["task"] 99 | 100 | print("Reindexing task id %s for index: %s" % (task_id, source_idx)) 101 | 102 | while True: 103 | time.sleep(2) 104 | t = elastic.tasks.get(task_id=task_id) 105 | status = t["task"]["status"] 106 | done, total = status["created"], status["total"] 107 | 108 | print("Status: %s from %s (%s%%)" 109 | % (done, total, 100 * float(done) / total), 110 | end="\r") 111 | sys.stdout.flush() 112 | 113 | if t.get("completed", False): 114 | print() 115 | break 116 | 117 | print("Done") 118 | 119 | 120 | def main(): 121 | if (len(sys.argv) == 1 122 | or len(sys.argv) > 3 123 | or len(sys.argv) == 3 and sys.argv[2] != "--check"): 124 | print("Invalid input. 
class EXIT_STATUS(object):
    """Numeric result codes reported in the "ret_code" field of a ping."""
    SUCCESS = 0
    ERROR_HOST_NOT_FOUND = 1
    ERROR_TIMEOUT = 2
    ERROR_ROOT_REQUIRED = 3
    ERROR_CANT_OPEN_SOCKET = 4
    ERROR_SOCKET_ERROR = 5


class Ping(object):
    """Minimal raw-socket ICMP echo ("ping") client.

    Opening a raw ICMP socket requires root (or CAP_NET_RAW); failures are
    reported via ret_code rather than raised.  NOTE: the packet helpers use
    Python 2 str/bytes semantics (repo targets py27).
    """

    def __init__(self, dest, timeout=1, packet_size=55):
        """:param dest: hostname or dotted IPv4 address to ping
        :param timeout: seconds to wait for the echo reply
        :param packet_size: ICMP payload size in bytes
        """
        self.ret_code = 0
        self.sock = None
        self.dest = dest
        self.dest_ip = None
        self.timeout = timeout
        self.packet_size = packet_size
        self._create_socket()

    def _create_socket(self):
        """Resolve self.dest and open a raw ICMP socket.

        On failure sets self.ret_code instead of raising.
        """
        try:
            # Fast path: dest is already a dotted IPv4 address.
            socket.inet_pton(socket.AF_INET, self.dest)
            dest_ip = self.dest
        except socket.error:
            try:
                dest_ip = socket.gethostbyname(self.dest)
            except socket.gaierror:
                self.ret_code = EXIT_STATUS.ERROR_HOST_NOT_FOUND
                return
        self.dest_ip = dest_ip

        try:
            self.sock = socket.socket(socket.AF_INET, socket.SOCK_RAW,
                                      socket.getprotobyname("icmp"))
        except socket.error as e:
            # errno 1 (EPERM): raw sockets need root / CAP_NET_RAW.
            if e.errno == 1:
                self.ret_code = EXIT_STATUS.ERROR_ROOT_REQUIRED
            else:
                self.ret_code = EXIT_STATUS.ERROR_CANT_OPEN_SOCKET
            return

    def __del__(self):
        # Close the raw socket if it was ever opened.
        if getattr(self, "sock", False):
            self.sock.close()

    def ping(self):
        """Send one echo request and return a result dict.

        :returns: dict with rtt (ms or None), ret_code (EXIT_STATUS),
            packet_size, timeout, timestamp, dest and dest_ip fields.
        """
        result = {
            "rtt": None,
            "ret_code": None,
            "packet_size": self.packet_size,
            "timeout": self.timeout,
            "timestamp": datetime.datetime.now().isoformat(),
            # BUG FIX: "dest" used to be filled with the resolved IP while
            # "dest_ip" was always left None.  Report the requested
            # destination and the resolved address in their own fields.
            "dest": self.dest,
            "dest_ip": None
        }

        if not self.sock:
            # Socket creation failed earlier -- retry once per ping.
            self._create_socket()
        result["dest_ip"] = self.dest_ip

        if self.ret_code:
            result["ret_code"] = self.ret_code
            return result

        try:
            packet_id = random.randint(0, 65534)
            packet = self._create_packet(packet_id)
            while packet:
                started_at = monotonic.monotonic()
                sent = self.sock.sendto(packet, (self.dest_ip, 1))
                packet = packet[sent:]

            delay = self._response_handler(packet_id, started_at)
            if delay:
                result["ret_code"] = EXIT_STATUS.SUCCESS
                result["rtt"] = delay
            else:
                result["ret_code"] = EXIT_STATUS.ERROR_TIMEOUT
        except socket.error:
            result["ret_code"] = EXIT_STATUS.ERROR_SOCKET_ERROR

        return result

    def _checksum(self, src):
        """Return the 16-bit ones-complement internet checksum of src."""
        checksum = 0
        count_to = len(src) & -2
        count = 0
        # Sum 16-bit words (little-endian byte pairing).
        while count < count_to:
            this_val = ord(src[count + 1]) * 256 + ord(src[count])
            checksum += this_val
            checksum &= 0xffffffff
            count += 2
        # Fold in the trailing odd byte, if any.
        if count_to < len(src):
            checksum += ord(src[len(src) - 1])
            checksum &= 0xffffffff
        # Fold 32-bit sum into 16 bits and byte-swap for network order.
        checksum = (checksum >> 16) + (checksum & 0xffff)
        checksum += checksum >> 16
        answer = ~checksum
        answer &= 0xffff
        return answer >> 8 | (answer << 8 & 0xff00)

    def _create_packet(self, packet_id):
        """Creates a new echo request packet based on the given id."""
        # Builds Dummy Header
        # Header is type (8), code (8), checksum (16), id (16), sequence (16)
        header = struct.pack("bbHHh", 8, 0, 0, packet_id, 1)
        data = self.packet_size * "Q"

        # Builds Real Header (checksum computed over the dummy packet).
        # NOTE: header + data relies on py2 str==bytes semantics.
        header = struct.pack(
            "bbHHh", 8, 0, socket.htons(self._checksum(header + data)),
            packet_id, 1)
        return header + data

    def _response_handler(self, packet_id, sent_at):
        """Handles packet response, returns delay or None if timeout."""
        while monotonic.monotonic() < sent_at + self.timeout:
            ready = select.select([self.sock], [], [], self.timeout)
            received_at = monotonic.monotonic()
            if ready[0] == [] or received_at > sent_at + self.timeout:
                return None

            rec_packet, addr = self.sock.recvfrom(1024)
            received_at = monotonic.monotonic()
            # ICMP header lives after the 20-byte IPv4 header.
            icmp_header = rec_packet[20:28]
            type_, code, checksum, rec_id, sequence = struct.unpack(
                "bbHHh", icmp_header)

            # Only accept echo replies (type 0) that match our packet id;
            # anything else (e.g. our own request echoed back) is skipped.
            if type_ == 0 and rec_id == packet_id:
                return (received_at - sent_at) * 1000

        return None
    @mock.patch("netmet.client.conf.os.remove")
    @mock.patch("netmet.client.conf.requests.post")
    @mock.patch("netmet.client.conf.restore._die.wait")
    @mock.patch("netmet.client.conf.open", create=True)
    def test_restore_404_scenario(self, mock_open, mock_wait, mock_post,
                                  mock_remove):
        # Server answers 404 for the stored refresh URL: the stale runtime
        # config file must be deleted so restore is not retried forever.
        mock_open.side_effect = [mock.mock_open(
            read_data=json.dumps({"refresh_conf_url": "aa"})).return_value
        ]
        mock_post.side_effect = [mock.Mock(status_code=404)]
        conf.restore(["hmac"], 50)
        mock_remove.assert_called_once_with(conf._RUNTIME_CONF_FILE % 50)

    @mock.patch("netmet.client.conf.open", create=True)
    def test_restore_url_get(self, mock_open):
        # Returns the stored refresh_conf_url, or None when the runtime
        # config file has no such key.
        mock_open.side_effect = [
            mock.mock_open(
                read_data=json.dumps({"refresh_conf_url": "aa"})).return_value,
            mock.mock_open(read_data=json.dumps({})).return_value
        ]
        self.assertEqual("aa", conf.restore_url_get(50))
        self.assertIsNone(conf.restore_url_get(55))
        mock_open.assert_has_calls(
            [mock.call(conf._RUNTIME_CONF_FILE % 50, "rw"),
             mock.call(conf._RUNTIME_CONF_FILE % 55, "rw")])

    @mock.patch("netmet.client.conf.LOG.exception")
    @mock.patch("netmet.client.conf.open", create=True)
    def test_retore_url_get_no_file(self, mock_open, mock_log_exc):
        # Missing/unreadable runtime config file: logged, None returned.
        mock_open.side_effect = OSError
        self.assertIsNone(conf.restore_url_get(80))
        mock_open.assert_called_once_with(conf._RUNTIME_CONF_FILE % 80, "rw")
        self.assertEqual(1, mock_log_exc.call_count)

    @mock.patch("netmet.client.conf.json.dump")
    @mock.patch("netmet.client.conf.os.path.exists")
    @mock.patch("netmet.client.conf.open", create=True)
    def test_restore_url_set(self, mock_open, mock_exists, mock_json_dump):
        # Runtime conf dir already exists: the restore URL is written to the
        # per-port runtime config file.
        mock_exists.return_value = True
        mock_open.return_value = mock.MagicMock()
        conf.restore_url_set("a", "b", 80)

        mock_exists.assert_called_once_with(conf._RUNTIME_CONF_DIR)
        mock_open.assert_called_once_with(conf._RUNTIME_CONF_FILE % 80, "w+")
        url = conf._RESTORE_API % {"server": "a", "host": "b", "port": 80}
        mock_json_dump.assert_called_once_with(
            {"refresh_conf_url": url},
            mock_open.return_value.__enter__.return_value)

    @mock.patch("netmet.client.conf.json.dump")
    @mock.patch("netmet.client.conf.os.mkdir")
    @mock.patch("netmet.client.conf.os.path.exists")
    @mock.patch("netmet.client.conf.open", create=True)
    def test_restore_url_set_no_dir(self, mock_open, mock_exists, mock_mkdir,
                                    mock_json_dump):
        # Runtime conf dir missing: it is created first, then the URL
        # written as usual.
        mock_exists.return_value = False
        mock_open.return_value = mock.MagicMock()
        conf.restore_url_set("c", "d", 80)
        mock_open.assert_called_once_with(conf._RUNTIME_CONF_FILE % 80, "w+")
        url = conf._RESTORE_API % {"server": "c", "host": "d", "port": 80}

        mock_json_dump.assert_called_once_with(
            {"refresh_conf_url": url},
            mock_open.return_value.__enter__.return_value)

    @mock.patch("netmet.client.conf.LOG.exception")
    @mock.patch("netmet.client.conf.os.path.exists")
    def test_restore_url_set_unexpected_failure(self, mock_path_exists,
                                                mock_log_exc):
        # Any unexpected failure is swallowed and logged -- restore_url_set
        # must never propagate exceptions to its caller.
        mock_path_exists.side_effect = Exception
        conf.restore_url_set("any", "any", 80)
        self.assertEqual(1, mock_log_exc.call_count)

    @mock.patch("netmet.client.conf.os.remove")
    def test_restore_url_clear(self, mock_remove):
        # Clearing removes the per-port runtime config file.
        conf.restore_url_clear(90)
        mock_remove.assert_called_once_with(conf._RUNTIME_CONF_FILE % 90)
        self.assertEqual(1, mock_remove.call_count)

    @mock.patch("netmet.client.conf.os.remove")
    def test_restore_url_clear_no_file(self, mock_remove):
        # Clearing an already-absent file is a silent no-op.
        mock_remove.side_effect = OSError
        conf.restore_url_clear(500)
        mock_remove.assert_called_once_with(conf._RUNTIME_CONF_FILE % 500)
        self.assertEqual(1, mock_remove.call_count)
# TODO(boris-42): Move this to the Collector (unify with server).
_LOCK = threading.Lock()   # guards _COLLECTOR/_CONFIG swaps
_COLLECTOR = None          # active collector.Collector instance, if any
_CONFIG = None             # currently applied client config (dict)
_DEAD = False              # set by die(); further config updates are refused


def _destroy_collector():
    """Stop and drop the active collector (best effort).

    Uses a non-blocking acquire so a concurrent config update simply wins;
    in that case this call is a no-op.
    """
    global _LOCK, _COLLECTOR, _CONFIG

    locked = False
    try:
        locked = _LOCK.acquire(False)
        if locked:
            if _COLLECTOR:
                _COLLECTOR.stop()
            _COLLECTOR = None
            _CONFIG = None
    finally:
        if locked:
            _LOCK.release()


@APP.errorhandler(404)
def not_found(error):
    """404 Page in case of failures."""
    return flask.jsonify({"error": "Not Found"}), 404


@APP.errorhandler(500)
def internal_server_error(error):
    """500 Handle Internal Errors."""
    return flask.jsonify({"error": "Internal Server Error"}), 500


@APP.route("/api/v2/config", methods=['GET'])
@APP.route("/api/v1/config", methods=['GET'])
def get_config():
    """Returns netmet config."""
    global _CONFIG

    if _CONFIG:
        return flask.jsonify({"config": _CONFIG}), 200
    else:
        return flask.jsonify({"error": "Netmet is not configured"}), 404


@APP.route("/api/v2/config", methods=['POST'])
@secure.check_hmac_auth
def set_config_v2():
    """Validate a new client config and (re)start the collector with it."""
    global _LOCK, _COLLECTOR, _CONFIG

    if _DEAD:
        flask.abort(500)

    schema = {
        "type": "object",
        "definitions": {
            "client": {
                "type": "object",
                "properties": {
                    "host": {"type": "string"},
                    "ip": {"type": "string"},
                    "port": {"type": "integer"},
                    "hypervisor": {"type": "string"},
                    "az": {"type": "string"},
                    "dc": {"type": "string"}
                },
                "required": ["ip", "host", "az", "dc", "port"],
                # BUG FIX: was misspelled "additionProperties", an unknown
                # keyword that jsonschema silently ignored -- extra client
                # fields were accepted although the schema meant to forbid
                # them.
                "additionalProperties": False
            },
            "settings": {
                "type": "object",
                "properties": {
                    "packet_size": {"type": "number", "minimum": 1},
                    "period": {"type": "number", "minimum": 0.1},
                    "timeout": {"type": "number", "minimum": 0.01}
                },
                "required": ["period", "timeout"],
                # BUG FIX: same "additionProperties" misspelling as above.
                "additionalProperties": False
            }
        },
        "properties": {
            "netmet_server": {"type": "string"},
            "client_host": {"$ref": "#/definitions/client"},
            "settings": {"$ref": "#/definitions/settings"},
            "tasks": {
                "type": "array",
                "items": {
                    "oneOf": [
                        {
                            "type": "object",
                            "properties": {
                                "north-south": {
                                    "type": "object",
                                    "properties": {
                                        "dest": {"type": "string"},
                                        "protocol": {"enum": ["http", "icmp"]},
                                        "settings": {
                                            "$ref": "#/definitions/settings"
                                        }
                                    },
                                    "required": ["dest", "protocol"],
                                    "additionalProperties": False
                                }
                            },
                            "required": ["north-south"],
                            "additionalProperties": False
                        },
                        {
                            "type": "object",
                            "properties": {
                                "east-west": {
                                    "type": "object",
                                    "properties": {
                                        "dest": {
                                            "$ref": "#/definitions/client"
                                        },
                                        "protocol": {"enum": ["http", "icmp"]},
                                        "settings": {
                                            "$ref": "#/definitions/settings"
                                        }
                                    },
                                    "required": ["dest", "protocol"],
                                    "additionalProperties": False
                                }
                            },
                            "required": ["east-west"],
                            "additionalProperties": False
                        }
                    ]
                }
            }
        },
        "required": ["netmet_server", "client_host", "tasks", "settings"],
        "additionalProperties": False
    }

    try:
        data = flask.request.get_json(silent=False, force=True)
        jsonschema.validate(data, schema)
        # Fold the global settings into every task as per-task defaults.
        # NOTE: keys()[0]/iteritems() are py2-only idioms (repo targets py27).
        settings = data.pop("settings")
        settings.setdefault("packet_size", 55)
        for task in data["tasks"]:
            task[task.keys()[0]].setdefault("settings", {})
            for k, v in settings.iteritems():
                task[task.keys()[0]]["settings"].setdefault(k, v)

        LOG.info("Applying new config")
        LOG.info(json.dumps(data, indent=2))
    except (ValueError, jsonschema.exceptions.ValidationError) as e:
        return flask.jsonify({"error": "Bad request: %s" % e}), 400

    with _LOCK:
        # Replace the running collector atomically with regard to other
        # config updates.
        if _COLLECTOR:
            _COLLECTOR.stop()

        _CONFIG = data
        conf.restore_url_set(data["netmet_server"],
                             data["client_host"]["host"],
                             data["client_host"]["port"])
        _COLLECTOR = collector.Collector(**data)
        _COLLECTOR.start()

    return flask.jsonify({"message": "Succesfully update netmet config"}), 201


@APP.route("/api/v1/unregister", methods=['POST'])
@APP.route("/api/v2/unregister", methods=['POST'])
@secure.check_hmac_auth
def unregister():
    """Stops collector system."""
    conf.restore_url_clear(config.get("port"))
    _destroy_collector()
    return flask.jsonify({"message": "Netmet clinet is unregistered."}), 201


APP = routing.add_routing_map(APP, html_uri=None, json_uri="/")


def die():
    """Mark the app dead and tear down the collector (shutdown path)."""
    global _DEAD
    _DEAD = True
    _destroy_collector()


def load():
    """Entry point: schedule async config restore and return the flask app."""
    conf.restore.async_call(config.get("hmac_keys"), config.get("port"))
    return APP
class Collector(object):
    """Runs the configured probe tasks and reports the collected metrics.

    Spawns one scheduler thread that fans tasks out to per-period thread
    pools, and one processing thread that drains the result queue either to
    the netmet server (via a Pusher) or to stdout in standalone mode.
    """

    pinger_failed_msg = "Pinger failed to ping"

    def __init__(self, netmet_server, client_host, tasks):
        """:param netmet_server: base URL of the netmet server; falsy value
            switches the collector to standalone mode (results are printed)
        :param client_host: dict describing this client (host/ip/port/...)
        :param tasks: list of task dicts keyed "east-west"/"north-south"
        """
        self.client_host = client_host
        self.tasks = tasks
        self.pusher = None
        if netmet_server:
            netmet_server = netmet_server.rstrip("/")
            self.pusher = pusher.Pusher("%s/api/v1/metrics" % netmet_server,
                                        extra_headers=secure.gen_hmac_headers)

        self.lock = threading.Lock()
        self.queue = collections.deque()
        self.death = threading.Event()
        self.started = False
        self.main_thread = None
        self.processing_thread = None

    def gen_periodic_ping(self, task):
        """Return a closure that performs one ICMP probe for *task*."""
        ip = (task["north-south"]["dest"] if "north-south" in task else
              task["east-west"]["dest"]["ip"])
        # NOTE: task.keys()[0] is py2-only (repo targets py27).
        settings = task[task.keys()[0]]["settings"]
        pinger = ping.Ping(ip, timeout=settings["timeout"],
                           packet_size=settings["packet_size"])

        def ping_():
            try:
                result = pinger.ping()

                # Any non-zero ret_code counts as a lost packet.
                metric = {
                    "client_src": self.client_host,
                    "protocol": "icmp",
                    "timestamp": result["timestamp"],
                    "latency": result["rtt"],
                    "packet_size": result["packet_size"],
                    "lost": int(bool(result["ret_code"])),
                    "transmitted": int(not bool(result["ret_code"])),
                    "ret_code": result["ret_code"]
                }

                if "north-south" in task:
                    metric["dest"] = task["north-south"]["dest"]
                    self.queue.append({"north-south": metric})

                else:
                    metric["client_dest"] = task["east-west"]["dest"]
                    self.queue.append({"east-west": metric})

            except Exception:
                LOG.exception(self.pinger_failed_msg)

        return ping_

    def gen_periodic_http_ping(self, task):
        """Return a closure that performs one HTTP probe for *task*."""

        def http_ping():
            try:
                started_at = monotonic.monotonic()

                # Pessimistic defaults: overwritten only when the request
                # completes, so a timeout is recorded as a 504/lost probe.
                metric = {
                    "client_src": self.client_host,
                    "protocol": "http",
                    "timestamp": datetime.datetime.now().isoformat(),
                    "packet_size": 0,
                    "latency": 0,
                    "lost": 1,
                    "transmitted": 0,
                    "ret_code": 504
                }
                settings = task[task.keys()[0]]["settings"]

                if "east-west" in task:
                    dest = task["east-west"]["dest"]
                    metric["client_dest"] = dest
                    dest = "http://%s:%s" % (dest["host"], dest["port"])
                else:
                    dest = task["north-south"]["dest"]
                    metric["dest"] = dest

                r = requests.get(dest, timeout=settings["timeout"])
                metric.update({
                    "latency": (monotonic.monotonic() - started_at) * 1000,
                    "packet_size": len(r.content),
                    "lost": int(r.status_code != 200),
                    "transmitted": int(r.status_code == 200),
                    "ret_code": r.status_code
                })
            except requests.exceptions.ConnectionError:
                pass
            except Exception:
                LOG.exception("Collector failed to call another clinet API")
            finally:
                type_ = "east-west" if "east-west" in task else "north-south"
                self.queue.append({type_: metric})

        return http_ping

    def process_results(self):
        """Drain the result queue until stopped (and the queue is empty)."""
        while self.queue or not self.death.is_set():
            while self.queue:
                item = self.queue.popleft()
                if self.pusher:
                    self.pusher.add(item)  # push to netmet server data
                else:
                    print(item)  # netmet client standalone mode
            self.death.wait(0.1)

    def _job_per_period(self, callables, period):
        """Return a worker that runs *callables* evenly spread over *period*."""

        def helper():
            delay = period / float(len(callables))
            pool = futurist.ThreadPoolExecutor(
                max_workers=50,
                check_and_reject=futurist.rejection.reject_when_reached(50))

            with pool:
                while not self.death.is_set():
                    for item in callables:
                        # Retry submission until the pool has capacity.
                        while not self.death.is_set():
                            try:
                                pool.submit(item)
                                break
                            except futurist.RejectedSubmission:
                                LOG.warning(
                                    "Collector: Feed me! Mre threads!")
                                self.death.wait(delay)

                        self.death.wait(delay)

                    # up to 0.1 second delay between runs of tasks
                    self.death.wait(random.random() * min(delay, 1) / 10.0)

        return helper

    def _job(self):
        """Group tasks by period and launch one worker per period bucket."""
        generators = {
            "icmp": self.gen_periodic_ping,
            "http": self.gen_periodic_http_ping
        }

        period_tasks = {}
        for task in self.tasks:
            task_data = task.values()[0]
            period_ = task_data["settings"]["period"]
            protocol = task_data["protocol"]
            period_tasks.setdefault(period_, [])
            if protocol in generators:
                period_tasks[period_].append(generators[protocol](task))
            else:
                LOG.warning("Allowed protocols are: %s" % generators.keys())

        pool = futurist.ThreadPoolExecutor(max_workers=len(period_tasks))
        with pool:
            # Stagger worker start-up so buckets do not fire simultaneously.
            min_period = min(period_tasks)
            min_lag = float(min_period) / len(period_tasks[min_period])
            lag = min(min_lag / len(period_tasks), 1)

            LOG.info(period_tasks)
            for period, callables in period_tasks.iteritems():
                pool.submit(self._job_per_period(callables, period))
                self.death.wait(lag)

    def start(self):
        """Start scheduler and processing threads; True on first start."""
        with self.lock:
            if not self.started:
                self.started = True
                self.death = threading.Event()
            else:
                return

        if self.pusher:
            self.pusher.start()

        self.main_thread = threading.Thread(target=self._job)
        self.main_thread.daemon = True
        self.main_thread.start()

        self.processing_thread = threading.Thread(target=self.process_results)
        # BUG FIX: attribute was misspelled "deamon", so it created an
        # unrelated attribute and the thread stayed non-daemonic, which
        # could block interpreter shutdown.
        self.processing_thread.daemon = True
        self.processing_thread.start()
        return True

    def stop(self):
        """Signal threads to stop, wait for them, and stop the pusher."""
        with self.lock:
            if self.started and not self.death.is_set():
                self.death.set()
                self.main_thread.join()
                self.processing_thread.join()
                if self.pusher:
                    self.pusher.stop()
                self.started = False
self.death.is_set(): 207 | self.death.set() 208 | self.main_thread.join() 209 | self.processing_thread.join() 210 | if self.pusher: 211 | self.pusher.stop() 212 | self.started = False 213 | -------------------------------------------------------------------------------- /netmet/server/mesher.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017: GoDaddy Inc. 2 | 3 | import json 4 | import logging 5 | 6 | import requests 7 | 8 | from netmet import exceptions 9 | from netmet.server import db 10 | from netmet.server.utils import eslock 11 | from netmet.utils import secure 12 | from netmet.utils import worker 13 | 14 | LOG = logging.getLogger(__name__) 15 | 16 | 17 | class MeshPlugin(object): 18 | 19 | def mesh(self, config, clients, external): 20 | return [] 21 | 22 | 23 | class FullMesh(MeshPlugin): 24 | 25 | CONFIG_SCHEMA = { 26 | "type": "object", 27 | "properties": { 28 | "north-south": { 29 | "type": "object", 30 | "patternProperties": { 31 | "(http)|(icmp)": { 32 | "type": "object", 33 | "properties": { 34 | "period": {"type": "number"}, 35 | "timeout": {"type": "number"}, 36 | "packet_size": {"type": "number"} 37 | } 38 | } 39 | } 40 | }, 41 | }, 42 | "additionalProperties": False 43 | } 44 | 45 | def mesh(self, mesh_config, clients, external): 46 | 47 | for client in clients: 48 | tasks = [] 49 | 50 | for other_client in clients: 51 | if client == other_client: 52 | continue 53 | 54 | for protocol in ["http", "icmp"]: 55 | task = {"dest": other_client, "protocol": protocol} 56 | if mesh_config.get("north-south", {}).get(protocol): 57 | task["settings"] = mesh_config["north-south"][protocol] 58 | 59 | tasks.append({"east-west": task}) 60 | 61 | for ext in external: 62 | tasks.append({ 63 | "north-south": { 64 | "dest": ext["dest"], 65 | "protocol": ext["protocol"], 66 | "settings": { 67 | "period": ext["period"], 68 | "timeout": ext["timeout"] 69 | } 70 | } 71 | }) 72 | 73 | yield client, tasks 74 | 75 | 76 | 
class DistributedMesh(MeshPlugin):
    """Meshing strategy that spreads checks across hypervisors/DCs/AZs.

    Only the config schema is defined so far; mesh() is a stub.
    """

    CONFIG_SCHEMA = {
        "type": "object",
        "properties": {
            "distributed_mesh": {
                "type": "object",
                "properties": {
                    "north-south": {
                        "type": "object",
                        "properties": {
                            "spread": {
                                "enum": ["hypervisor", "dc", "az", "all"]
                            },
                            "repeat": {"type": "number", "minimum": 1},
                            "period": {"type": "number", "minimum": 1},
                            "timeout": {"type": "number", "minimum": 1}
                        }
                    },
                    "east-west": {
                        "type": "object",
                        "properties": {
                            "repeat_inside_az": {
                                "type": "number",
                                "minimum": 1
                            },
                            "repeat_between_az": {
                                "type": "number",
                                "minimum": 1
                            },
                        }
                    }
                }
            }
        }
    }

    def mesh(self, config, clients, external):
        """Not implemented yet: produces no tasks."""
        pass


class Mesher(worker.LonelyWorker):
    """Singleton worker that pushes generated task lists to all clients.

    Watches the server config in the DB and, when a new applied-but-not-
    meshed config appears, re-meshes every client under a global lock.
    """

    no_changes_msg = "Mesher: no changes in config detected."
    new_config_msg = "Mesher detect new config: Remeshing clients."
    update_failed_msg = "Mesher update failed."
    lock_name = "update_config"
    client_api = "http://%s:%s/api/v2/config"
    # TODO(boris-42): Make this plugable
    plugins = {
        "full_mesh": FullMesh()
    }

    def __init__(self):
        """Do not use this method directly. Use create() instead."""

    @classmethod
    def get_jsonschema(cls):
        """Return a schema accepting exactly one plugin's configuration."""
        return {
            "type": "object",
            "oneOf": [
                {"properties": {name: p.CONFIG_SCHEMA}}
                # items() works on both py2 and py3 (iteritems() is py2-only).
                for name, p in cls.plugins.items()
            ]
        }

    @classmethod
    def create(cls, netmet_server_url):
        """Create the singleton and remember the public server URL."""
        super(Mesher, cls).create()
        cls._self.netmet_server_url = netmet_server_url

    def _update_client(self, client, tasks):
        """POST the task list to one client's config endpoint.

        Returns a (success, http_code, message) triple.
        """
        msg = "Failed to update client config %s. "
        try:
            body = {
                "netmet_server": self.netmet_server_url,
                "client_host": client,
                "tasks": tasks,
                "settings": {
                    "timeout": 1,
                    "period": 5
                }
            }
            data = json.dumps(body)
            requests.post(self.client_api % (client["host"], client["port"]),
                          data=data, headers=secure.gen_hmac_headers(data))
            # Set client configured
        except Exception as e:
            if LOG.isEnabledFor(logging.DEBUG):
                LOG.exception(msg % client["host"])
            else:
                LOG.warning(msg % client["host"] + str(e))

            # Bug fix: previously returned (False, (500, msg)) -- a 2-tuple
            # whose shape differed from the success path, so callers that
            # index result[1]/result[2] (e.g. the client_refresh API handler)
            # crashed with IndexError on failure.
            return False, 500, msg % client["host"]

        return True, 200, "Client updated"

    def _mesh(self, config):
        """Run the configured mesh plugin over the current client catalog."""
        # The schema ("oneOf") guarantees exactly one plugin key.
        plugin_name = next(iter(config["mesher"]))
        mesh = self.plugins[plugin_name].mesh

        allowed = set(["ip", "port", "host", "hypervisor", "dc", "az"])
        clients = [{k: x[k] for k in allowed if k in x}
                   for x in db.get().clients_get()]

        # "external" is optional in the server config schema, so don't
        # KeyError when it was omitted.
        return mesh(config["mesher"][plugin_name], clients,
                    config.get("external", []))

    def refresh_client(self, host, port):
        """Re-push configuration to the single client host:port.

        Returns a (success, http_code, message) triple. Retries up to
        three times when the global lock is held by someone else.
        """
        attempts = 0

        while attempts < 3:
            try:
                with eslock.Glock("update_config"):
                    config = db.get().server_config_get()
                    if not (config["applied"] and config["meshed"]):
                        return False, 404, "Configuration not found"

                    for c in self._mesh(config["config"]):
                        if c[0]["host"] == host and c[0]["port"] == port:
                            return self._update_client(c[0], c[1])

                    return False, 404, "Client not found"

            except exceptions.GlobalLockException:
                attempts += 1
                self._death.wait(0.1)

        return False, 500, "Couldn't acquire lock"

    def _job(self):
        """Periodic task: re-mesh all clients when a new config appears."""
        get_conf = db.get().server_config_get

        def is_meshed(cfg):
            # "Nothing to do" when there is no config, it isn't applied
            # yet, or it has already been meshed.
            return (not cfg or (cfg and not cfg["applied"]) or
                    (cfg and cfg["meshed"]))

        try:
            if is_meshed(get_conf()):
                LOG.info(self.no_changes_msg)
            else:
                with eslock.Glock("update_config"):
                    # TODO(boris-42): Algorithm should be a bit smarter:
                    # even if it is meshed, try to update all not yet
                    # configured clients.
                    config = get_conf()
                    if not is_meshed(config):
                        LOG.info(self.new_config_msg)
                        for c in self._mesh(config["config"]):
                            # TODO(boris-42): Run this in parallel
                            self._update_client(c[0], c[1])
                        db.get().server_config_meshed(config["id"])
                    else:
                        LOG.info(self.no_changes_msg)

        except exceptions.GlobalLockException:
            pass  # can't acquire lock, someone else is working on it

        except Exception:
            LOG.exception(self.update_failed_msg)
import collections
import StringIO
import time

import mock

from netmet.client import collector
from netmet.utils import pusher
from tests.unit import test


class CollectorTestCase(test.TestCase):
    """Unit tests for netmet.client.collector.Collector."""

    def test_init_standalone(self):
        """Without a netmet server URL no Pusher is created."""
        host = mock.MagicMock()
        tasks = [mock.MagicMock()]

        c = collector.Collector(None, host, tasks)
        self.assertEqual(tasks, c.tasks)
        self.assertIsNone(c.pusher)

    def test_init_full(self):
        """With a netmet server URL results are pushed via a Pusher."""
        host = mock.MagicMock()
        tasks = [mock.MagicMock()]

        c = collector.Collector("http://netmet_url", host, tasks)
        self.assertEqual(tasks, c.tasks)
        self.assertIsInstance(c.pusher, pusher.Pusher)

    @mock.patch("netmet.client.collector.ping.Ping.ping")
    def test_gen_periodic_ping_east_west(self, mock_ping):
        """An east-west icmp task queues one result keyed "east-west"."""
        client_host = mock.MagicMock()
        task = {
            "east-west": {
                "dest": {
                    "ip": "1.1.1.1"
                },
                "settings": {
                    "timeout": 5,
                    "packet_size": 55
                }
            }
        }

        # Canned ping result; rtt becomes the reported latency.
        mock_ping.return_value = {
            "ret_code": 0,
            "rtt": 10,
            "timestamp": "ttt",
            "packet_size": 55
        }

        c = collector.Collector("some_url", client_host, [])
        # gen_periodic_ping returns a callable; invoke it once directly.
        c.gen_periodic_ping(task)()
        self.assertEqual(1, len(c.queue))
        expected = {
            "client_src": client_host,
            "client_dest": task["east-west"]["dest"],
            "protocol": "icmp",
            "timestamp": "ttt",
            "latency": 10,
            "packet_size": 55,
            "lost": 0,
            "transmitted": 1,
            "ret_code": 0
        }
        self.assertEqual(expected, c.queue.pop()["east-west"])

    @mock.patch("netmet.client.collector.ping.Ping.ping")
    def test_gen_periodic_ping_south_north(self, mock_ping):
        """A north-south icmp task queues one result keyed "north-south"."""
        client_host = mock.MagicMock()
        task = {
            "north-south": {
                "dest": "1.1.1.1",
                "settings": {
                    "timeout": 5,
                    "packet_size": 55
                }
            }
        }

        mock_ping.return_value = {
            "ret_code": 0,
            "rtt": 10,
            "timestamp": "ttt",
            "packet_size": 55
        }

        c = collector.Collector("some_url", client_host, [])
        c.gen_periodic_ping(task)()
        self.assertEqual(1, len(c.queue))
        expected = {
            "client_src": client_host,
            "dest": task["north-south"]["dest"],
            "protocol": "icmp",
            "timestamp": "ttt",
            "latency": 10,
            "packet_size": 55,
            "lost": 0,
            "transmitted": 1,
            "ret_code": 0
        }
        self.assertEqual(expected, c.queue.pop()["north-south"])

    @mock.patch("netmet.client.collector.LOG")
    @mock.patch("netmet.client.collector.ping.Ping.ping")
    def test_gen_periodic_ping_raises(self, mock_ping, mock_log):
        """A ping failure is logged once, not propagated."""
        c = collector.Collector("some_url", {}, [])
        mock_ping.side_effect = Exception
        ping_ = c.gen_periodic_ping({"east-west": {
            "dest": {"ip": "1.2.3.4"},
            "settings": {"packet_size": 55, "timeout": 1}
        }})
        ping_()

        mock_log.exception.assert_called_once_with(c.pinger_failed_msg)
        self.assertEqual(1, mock_log.exception.call_count)

    @mock.patch("netmet.client.collector.datetime")
    @mock.patch("netmet.client.collector.monotonic.monotonic")
    @mock.patch("netmet.client.collector.requests.get")
    def test_gen_periodic_http_ping_east_west(self, mock_get, mock_monotonic,
                                              mock_datetime):
        """East-west http ping: latency is (t1 - t0) in milliseconds."""
        client_host = mock.MagicMock()
        task = {
            "east-west": {
                "dest": {
                    "ip": "1.1.1.1",
                    "host": "1.2.3.4",
                    "port": 80
                },
                "settings": {
                    "timeout": 5,
                    "packet_size": 55
                }
            }
        }

        mock_datetime.datetime.now.return_value.isoformat.return_value = "aaa"

        c = collector.Collector("some_url", client_host, [task])
        # Two monotonic reads one second apart -> 1000 ms latency.
        mock_monotonic.side_effect = [1, 2]
        mock_get.return_value = mock.MagicMock(
            content="Q" * 10, status_code=200)
        c.gen_periodic_http_ping(task)()
        self.assertEqual(1, len(c.queue))

        expected = {
            "client_src": client_host,
            "client_dest": task["east-west"]["dest"],
            "protocol": "http",
            "timestamp": "aaa",
            "latency": 1000,
            "packet_size": 10,
            "lost": 0,
            "transmitted": 1,
            "ret_code": 200
        }

        self.assertEqual(expected, c.queue.pop()["east-west"])

    @mock.patch("netmet.client.collector.datetime")
    @mock.patch("netmet.client.collector.monotonic.monotonic")
    @mock.patch("netmet.client.collector.requests.get")
    def test_gen_periodic_http_ping_south_north(self, mock_get, mock_monotonic,
                                                mock_datetime):
        """North-south http ping targets a raw URL instead of a client."""
        client_host = mock.MagicMock()
        task = {
            "north-south": {
                "dest": "http://1.2.3.4",
                "settings": {
                    "timeout": 5,
                    "packet_size": 55
                }
            }
        }

        mock_datetime.datetime.now.return_value.isoformat.return_value = "aaa"

        c = collector.Collector("some_url", client_host, [task])
        mock_monotonic.side_effect = [1, 2]
        mock_get.return_value = mock.MagicMock(
            content="Q" * 10, status_code=200)
        c.gen_periodic_http_ping(task)()
        self.assertEqual(1, len(c.queue))

        expected = {
            "client_src": client_host,
            "dest": task["north-south"]["dest"],
            "protocol": "http",
            "timestamp": "aaa",
            "latency": 1000,
            "packet_size": 10,
            "lost": 0,
            "transmitted": 1,
            "ret_code": 200
        }

        self.assertEqual(expected, c.queue.pop()["north-south"])

    def test_gen_periodic_http_ping_requests_raises(self):
        # TODO: exercise the requests-level failure path.
        pass

    def test_gen_periodic_http_ping_raises(self):
        # TODO: exercise the generic failure path.
        pass

    @mock.patch("netmet.client.collector.pusher.Pusher.add")
    def test_process_results_with_pusher(self, mock_pusher_add):
        """With a pusher, every queued item is handed to Pusher.add."""
        c = collector.Collector("some_url", {}, [])
        # death is pre-set so process_results drains the queue and exits.
        c.death.set()
        c.queue = collections.deque(xrange(100))
        c.process_results()
        self.assertEqual(100, mock_pusher_add.call_count)

    @mock.patch("sys.stdout", new_callable=StringIO.StringIO)
    def test_process_results_without_pusher(self, mock_stdout):
        """Without a pusher, results are printed one per line to stdout."""
        c = collector.Collector(None, {}, [])
        c.death.set()
        c.queue = collections.deque(xrange(10))
        c.process_results()
        self.assertEqual("\n".join(str(i) for i in xrange(10)) + "\n",
                         mock_stdout.getvalue())

    @mock.patch("netmet.client.collector.Collector.gen_periodic_ping")
    @mock.patch("netmet.client.collector.Collector.gen_periodic_http_ping")
    def test_start_and_stop_no_pusher(self, mock_gen_ping, mock_gen_http_ping):
        """Smoke test: start/stop lifecycle without a pusher."""
        # str is a cheap no-arg callable standing in for a generated pinger.
        mock_gen_ping.return_value = str
        mock_gen_http_ping.return_value = str
        c = collector.Collector(None, {}, [1, 2, 3])
        c.start()
        time.sleep(0.05)
        c.stop()

    @mock.patch("netmet.client.collector.Collector.gen_periodic_ping")
    @mock.patch("netmet.client.collector.Collector.gen_periodic_http_ping")
    def test_start_and_stop_w_pusher(self, mock_gen_ping, mock_gen_http_ping):
        """Smoke test: start/stop lifecycle with a pusher attached."""
        mock_gen_ping.return_value = str
        mock_gen_http_ping.return_value = str
        c = collector.Collector("netmet_url", {}, [1, 2, 3])
        c.start()
        time.sleep(0.05)
        c.stop()
import functools
import json
import logging
import os

import elasticsearch
import flask
from flask_helpers import routing
import jsonschema

from netmet import exceptions
from netmet.server import db
from netmet.server import deployer
from netmet.server import mesher
from netmet.utils import secure
from netmet.utils import status


LOG = logging.getLogger(__name__)
app = flask.Flask(__name__, static_folder=None)
app.wsgi_app = status.StatusMiddleware(app)


@app.errorhandler(404)
def not_found(error):
    """404 handler: always answer JSON."""
    return flask.jsonify({"error": "Not Found"}), 404


@app.errorhandler(500)
def internal_server_error(error):
    """500 Handle Internal Errors."""
    return flask.jsonify({"error": "Internal Server Error"}), 500


def db_errors_handler(f):
    """Decorator mapping DB exceptions to 404/409 JSON responses."""

    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        try:
            return f(*args, **kwargs)
        except exceptions.DBRecordNotFound as e:
            return flask.jsonify({"error": str(e)}), 404
        except (exceptions.DBConflict,
                elasticsearch.exceptions.ConflictError) as e:
            return flask.jsonify({"error": str(e)}), 409

    return wrapper


@app.route("/api/v1/config", methods=["GET"])
@secure.check_basic_auth
@db_errors_handler
def config_get():
    """Returns netmet server configuration."""
    server_config = db.get().server_config_get()

    if not server_config:
        return flask.jsonify({
            "message": "Netmet server has not been setup yet"}), 404

    return flask.jsonify(server_config), 200


@app.route("/api/v2/config", methods=["POST"])
@secure.check_basic_auth
@db_errors_handler
def config_set():
    """Sets netmet server configuration."""

    CONFIG_SCHEMA = {
        "type": "object",
        "properties": {
            "deployment": {
                "type": "object",
                "properties": {
                    "static": {
                        "type": "object",
                        "properties": {
                            "clients": {
                                "type": "array",
                                "items": {
                                    "type": "object",
                                    "properties": {
                                        "host": {"type": "string"},
                                        "ip": {"type": "string"},
                                        "port": {"type": "integer"},
                                        "az": {"type": "string"},
                                        "dc": {"type": "string"},
                                        "hypervisor": {"type": "string"}
                                    },
                                    "required": ["host", "ip", "az", "dc"],
                                    "additionalProperties": False
                                }
                            }
                        },
                        "required": ["clients"],
                        "additionalProperties": False
                    }
                },
                "required": ["static"],
                "additionalProperties": False
            },
            "mesher": mesher.Mesher.get_jsonschema(),
            "external": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "dest": {"type": "string"},
                        "protocol": {"enum": ["http", "icmp"]},
                        "period": {"type": "number"},
                        "timeout": {"type": "number"}
                    },
                    "required": ["dest", "protocol", "period", "timeout"],
                    "additionalProperties": False
                }
            }
        },
        "required": ["deployment", "mesher"],
        "additionalProperties": False
    }
    try:
        server_config = flask.request.get_json(silent=False, force=True)
        jsonschema.validate(server_config, CONFIG_SCHEMA)
    except (ValueError, jsonschema.exceptions.ValidationError) as e:
        return flask.jsonify({"error": "Bad request: %s" % e}), 400

    db.get().server_config_add(server_config)
    deployer.Deployer.force_update()
    return flask.jsonify({"message": "Config was updated"}), 201


@app.route("/api/v1/clients", methods=["GET"])
@db_errors_handler
def clients_list():
    """List all hosts."""
    return flask.jsonify(db.get().clients_get()), 200


# Bug fix: the route was missing its URL placeholders ("/clients//"),
# so flask could never supply the host/port arguments the view requires.
@app.route("/api/v1/clients/<host>/<port>", methods=["POST"])
@db_errors_handler
@secure.check_hmac_auth
def client_refresh(host, port):
    """Re-push configuration to a single client."""
    result = mesher.Mesher.get().refresh_client(host, int(port))
    key = "message" if result[0] else "error"
    return flask.jsonify({key: result[2]}), result[1]


@app.route("/api/v1/metrics", methods=["POST", "PUT"])
@db_errors_handler
@secure.check_hmac_auth
def metrics_add():
    """Stores metrics to elastic."""

    # Check just basic schema, let elastic check everything else
    schema = {
        "type": "array",
        "items": {"type": "object"}
    }

    try:
        req_data = flask.request.get_json(silent=False, force=True)
        jsonschema.validate(req_data, schema)
    except (ValueError, jsonschema.exceptions.ValidationError) as e:
        return flask.jsonify({"error": "Bad request: %s" % e}), 400
    else:
        data = {"north-south": [], "east-west": []}
        for d in req_data:
            for key in data:
                if key in d:
                    data[key].append(d[key])
                    break
            else:
                LOG.warning("Ignoring wrong object %s" % json.dumps(d))

        # TODO(boris-42): Use pusher here, to reduce amount of queries
        # from netmet server to elastic, join data from different netmet
        # clients requests before pushing them to elastic
        # items() works on both py2 and py3 (iteritems() is py2-only).
        for k, v in data.items():
            if v:
                db.get().metrics_add(k, v)

        return flask.jsonify({"message": "successfully stored metrics"}), 201


# Bug fix: restored the missing <period> placeholder.
@app.route("/api/v1/metrics/<period>", methods=["GET"])
@db_errors_handler
def metrics_get(period):
    """Get metrics for period."""
    return flask.jsonify({"message": "noop"}), 200


@app.route("/api/v1/events", methods=["GET"])
@db_errors_handler
def events_list():
    """List events, newest window controlled by offset/limit."""
    # type=int coerces the query-string values; without it flask returns
    # raw strings for user-supplied offset/limit.
    offset = flask.request.args.get('offset', 0, type=int)
    limit = flask.request.args.get('limit', 100, type=int)
    active_only = flask.request.args.get('active_only')
    return flask.jsonify(db.get().events_list(offset, limit, active_only)), 200


# Bug fix: restored the missing <event_id> placeholder.
@app.route("/api/v1/events/<event_id>", methods=["GET"])
@db_errors_handler
def event_get(event_id):
    """Return a single event by id."""
    # Bug fix: was db.event_get(...) -- the db module exposes no such
    # function; every other handler goes through the db.get() singleton.
    # NOTE(review): event_get appears to return a tuple whose second item
    # is the event body -- confirm against db.DB.event_get.
    return flask.jsonify(db.get().event_get(event_id)[1]), 200


@app.route("/api/v1/events/<event_id>", methods=["POST"])
@secure.check_basic_auth
@db_errors_handler
def event_create(event_id):
    """If event already exists it recreates it."""
    schema = {
        "type": "object",

        "definitions": {
            "traffic": {
                "type": "object",
                "properties": {
                    "type": {"enum": ["host", "az", "dc"]},
                    "value": {"type": "string"}
                },
                "required": ["type", "value"]
            }
        },
        "properties": {
            "name": {"type": "string"},
            "started_at": {"type": "string"},
            "finished_at": {"type": "string"},
            "traffic_from": {"$ref": "#/definitions/traffic"},
            "traffic_to": {"$ref": "#/definitions/traffic"}
        },
        "required": ["started_at", "name"],
        "additionalProperties": False
    }
    try:
        data = flask.request.get_json(silent=False, force=True)
        jsonschema.validate(data, schema)

    except (ValueError, jsonschema.exceptions.ValidationError) as e:
        return flask.jsonify({"error": "Bad request: %s" % e}), 400

    db.get().event_create(event_id, data)
    return flask.jsonify({"message": "Event created %s" % event_id}), 201


@app.route("/api/v1/events/<event_id>/_stop", methods=["POST"])
@secure.check_basic_auth
@db_errors_handler
def event_stop(event_id):
    """Mark an event as finished."""
    db.get().event_stop(event_id)
    return flask.jsonify({"message": "event %s stopped" % event_id}), 200


@app.route("/api/v1/events/<event_id>", methods=["DELETE"])
@secure.check_basic_auth
@db_errors_handler
def event_delete(event_id):
    """Delete an event."""
    db.get().event_delete(event_id)
    return flask.jsonify({"message": "event %s deleted" % event_id}), 202


app = routing.add_routing_map(app, html_uri=None, json_uri="/")


def die():
    """Tear down all singletons (used on shutdown)."""
    deployer.Deployer.destroy()
    mesher.Mesher.destroy()
    db.DB.destroy()


def load():
    """WSGI entry point: validate env vars and start the singletons."""
    NETMET_SERVER = os.getenv("NETMET_SERVER_URL")
    if not NETMET_SERVER:
        raise ValueError("Set NETMET_SERVER_URL to NetMet server public "
                         "load balanced address")

    NETMET_OWN_URL = os.getenv("NETMET_OWN_URL")
    if not NETMET_OWN_URL:
        raise ValueError("Set NETMET_OWN_URL to NetMet server address")

    ELASTIC = os.getenv("ELASTIC", "")
    if not ELASTIC:
        raise ValueError("Set ELASTIC to list of urls of instances of cluster,"
                         " separated by comma.")

    db.DB.create(NETMET_OWN_URL, ELASTIC.split(","))
    deployer.Deployer.create(mesher.Mesher.force_update)
    mesher.Mesher.create(NETMET_SERVER)

    return app
21 | 22 | "You" (or "Your") shall mean an individual or Legal Entity exercising 23 | permissions granted by this License. 24 | 25 | "Source" form shall mean the preferred form for making modifications, including 26 | but not limited to software source code, documentation source, and configuration 27 | files. 28 | 29 | "Object" form shall mean any form resulting from mechanical transformation or 30 | translation of a Source form, including but not limited to compiled object code, 31 | generated documentation, and conversions to other media types. 32 | 33 | "Work" shall mean the work of authorship, whether in Source or Object form, made 34 | available under the License, as indicated by a copyright notice that is included 35 | in or attached to the work (an example is provided in the Appendix below). 36 | 37 | "Derivative Works" shall mean any work, whether in Source or Object form, that 38 | is based on (or derived from) the Work and for which the editorial revisions, 39 | annotations, elaborations, or other modifications represent, as a whole, an 40 | original work of authorship. For the purposes of this License, Derivative Works 41 | shall not include works that remain separable from, or merely link (or bind by 42 | name) to the interfaces of, the Work and Derivative Works thereof. 43 | 44 | "Contribution" shall mean any work of authorship, including the original version 45 | of the Work and any modifications or additions to that Work or Derivative Works 46 | thereof, that is intentionally submitted to Licensor for inclusion in the Work 47 | by the copyright owner or by an individual or Legal Entity authorized to submit 48 | on behalf of the copyright owner. 
For the purposes of this definition, 49 | "submitted" means any form of electronic, verbal, or written communication sent 50 | to the Licensor or its representatives, including but not limited to 51 | communication on electronic mailing lists, source code control systems, and 52 | issue tracking systems that are managed by, or on behalf of, the Licensor for 53 | the purpose of discussing and improving the Work, but excluding communication 54 | that is conspicuously marked or otherwise designated in writing by the copyright 55 | owner as "Not a Contribution." 56 | 57 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf 58 | of whom a Contribution has been received by Licensor and subsequently 59 | incorporated within the Work. 60 | 61 | 2. Grant of Copyright License. 62 | 63 | Subject to the terms and conditions of this License, each Contributor hereby 64 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 65 | irrevocable copyright license to reproduce, prepare Derivative Works of, 66 | publicly display, publicly perform, sublicense, and distribute the Work and such 67 | Derivative Works in Source or Object form. 68 | 69 | 3. Grant of Patent License. 70 | 71 | Subject to the terms and conditions of this License, each Contributor hereby 72 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 73 | irrevocable (except as stated in this section) patent license to make, have 74 | made, use, offer to sell, sell, import, and otherwise transfer the Work, where 75 | such license applies only to those patent claims licensable by such Contributor 76 | that are necessarily infringed by their Contribution(s) alone or by combination 77 | of their Contribution(s) with the Work to which such Contribution(s) was 78 | submitted. 
If You institute patent litigation against any entity (including a 79 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a 80 | Contribution incorporated within the Work constitutes direct or contributory 81 | patent infringement, then any patent licenses granted to You under this License 82 | for that Work shall terminate as of the date such litigation is filed. 83 | 84 | 4. Redistribution. 85 | 86 | You may reproduce and distribute copies of the Work or Derivative Works thereof 87 | in any medium, with or without modifications, and in Source or Object form, 88 | provided that You meet the following conditions: 89 | 90 | You must give any other recipients of the Work or Derivative Works a copy of 91 | this License; and 92 | You must cause any modified files to carry prominent notices stating that You 93 | changed the files; and 94 | You must retain, in the Source form of any Derivative Works that You distribute, 95 | all copyright, patent, trademark, and attribution notices from the Source form 96 | of the Work, excluding those notices that do not pertain to any part of the 97 | Derivative Works; and 98 | If the Work includes a "NOTICE" text file as part of its distribution, then any 99 | Derivative Works that You distribute must include a readable copy of the 100 | attribution notices contained within such NOTICE file, excluding those notices 101 | that do not pertain to any part of the Derivative Works, in at least one of the 102 | following places: within a NOTICE text file distributed as part of the 103 | Derivative Works; within the Source form or documentation, if provided along 104 | with the Derivative Works; or, within a display generated by the Derivative 105 | Works, if and wherever such third-party notices normally appear. The contents of 106 | the NOTICE file are for informational purposes only and do not modify the 107 | License. 
You may add Your own attribution notices within Derivative Works that 108 | You distribute, alongside or as an addendum to the NOTICE text from the Work, 109 | provided that such additional attribution notices cannot be construed as 110 | modifying the License. 111 | You may add Your own copyright statement to Your modifications and may provide 112 | additional or different license terms and conditions for use, reproduction, or 113 | distribution of Your modifications, or for any such Derivative Works as a whole, 114 | provided Your use, reproduction, and distribution of the Work otherwise complies 115 | with the conditions stated in this License. 116 | 117 | 5. Submission of Contributions. 118 | 119 | Unless You explicitly state otherwise, any Contribution intentionally submitted 120 | for inclusion in the Work by You to the Licensor shall be under the terms and 121 | conditions of this License, without any additional terms or conditions. 122 | Notwithstanding the above, nothing herein shall supersede or modify the terms of 123 | any separate license agreement you may have executed with Licensor regarding 124 | such Contributions. 125 | 126 | 6. Trademarks. 127 | 128 | This License does not grant permission to use the trade names, trademarks, 129 | service marks, or product names of the Licensor, except as required for 130 | reasonable and customary use in describing the origin of the Work and 131 | reproducing the content of the NOTICE file. 132 | 133 | 7. Disclaimer of Warranty. 134 | 135 | Unless required by applicable law or agreed to in writing, Licensor provides the 136 | Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, 137 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, 138 | including, without limitation, any warranties or conditions of TITLE, 139 | NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. 
You are 140 | solely responsible for determining the appropriateness of using or 141 | redistributing the Work and assume any risks associated with Your exercise of 142 | permissions under this License. 143 | 144 | 8. Limitation of Liability. 145 | 146 | In no event and under no legal theory, whether in tort (including negligence), 147 | contract, or otherwise, unless required by applicable law (such as deliberate 148 | and grossly negligent acts) or agreed to in writing, shall any Contributor be 149 | liable to You for damages, including any direct, indirect, special, incidental, 150 | or consequential damages of any character arising as a result of this License or 151 | out of the use or inability to use the Work (including but not limited to 152 | damages for loss of goodwill, work stoppage, computer failure or malfunction, or 153 | any and all other commercial damages or losses), even if such Contributor has 154 | been advised of the possibility of such damages. 155 | 156 | 9. Accepting Warranty or Additional Liability. 157 | 158 | While redistributing the Work or Derivative Works thereof, You may choose to 159 | offer, and charge a fee for, acceptance of support, warranty, indemnity, or 160 | other liability obligations and/or rights consistent with this License. However, 161 | in accepting such obligations, You may act only on Your own behalf and on Your 162 | sole responsibility, not on behalf of any other Contributor, and only if You 163 | agree to indemnify, defend, and hold each Contributor harmless for any liability 164 | incurred by, or claims asserted against, such Contributor by reason of your 165 | accepting any such warranty or additional liability. 166 | 167 | END OF TERMS AND CONDITIONS 168 | 169 | APPENDIX: How to apply the Apache License to your work 170 | 171 | To apply the Apache License to your work, attach the following boilerplate 172 | notice, with the fields enclosed by brackets "[]" replaced with your own 173 | identifying information. 
(Don't include the brackets!) The text should be 174 | enclosed in the appropriate comment syntax for the file format. We also 175 | recommend that a file or class name and description of purpose be included on 176 | the same "printed page" as the copyright notice for easier identification within 177 | third-party archives. 178 | 179 | Copyright [yyyy] [name of copyright owner] 180 | 181 | Licensed under the Apache License, Version 2.0 (the "License"); 182 | you may not use this file except in compliance with the License. 183 | You may obtain a copy of the License at 184 | 185 | http://www.apache.org/licenses/LICENSE-2.0 186 | 187 | Unless required by applicable law or agreed to in writing, software 188 | distributed under the License is distributed on an "AS IS" BASIS, 189 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 190 | See the License for the specific language governing permissions and 191 | limitations under the License. 192 | -------------------------------------------------------------------------------- /tests/unit/server/test_db.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017: GoDaddy Inc. 

import json

import elasticsearch
import mock

from netmet import exceptions
from netmet.server import db
from tests.unit import test


class DBTestCase(test.TestCase):
    """Unit tests for netmet.server.db.DB (Elasticsearch-backed storage).

    All Elasticsearch traffic is mocked; each test that needs a DB instance
    builds the singleton via db.DB.create() and tearDown destroys it so the
    LonelyWorker singleton never leaks between tests.
    """

    def tearDown(self):
        # DB is a process-wide singleton (worker.LonelyWorker); destroy it
        # so every test starts from a clean, uninitialized state.
        db.DB.destroy()
        super(DBTestCase, self).tearDown()

    def test_get_not_init(self):
        # db.get() returns None until db.DB.create() has been called.
        self.assertIsNone(db.get())

    @mock.patch("netmet.server.db.elasticsearch.Elasticsearch")
    @mock.patch("netmet.server.db.DB._rollover_data")
    @mock.patch("netmet.server.db.DB._ensure_schema")
    @mock.patch("netmet.server.db.DB._ensure_elastic")
    def test_create_mocked_all(self, mock_ensure_elastic, mock_ensure_schema,
                               mock_rollover_data, mock_elastic):
        # create() must wire the singleton and run all three init steps once.
        elastics = ["elastic"]
        db.DB.create("own_url", elastics)
        self.assertIsInstance(db.get(), db.DB)
        self.assertEqual(db.get().own_url, "own_url")
        self.assertEqual(db.get().elastic_urls, elastics)
        mock_ensure_elastic.assert_called_once_with()
        mock_ensure_schema.assert_called_once_with()
        mock_rollover_data.assert_called_once_with()
        mock_elastic.assert_called_once_with(elastics)

    @mock.patch("netmet.server.db.elasticsearch.Elasticsearch")
    def test_create_mocked_only_elastic(self, mock_elastic):
        # Exercise _ensure_schema's error path: indices.create raises, but
        # since the index/alias turns out to exist, create() must not fail.
        elastics = ["elastic"]

        melastic = mock_elastic.return_value
        melastic.indices.exists.side_effect = [True, False, True]
        melastic.indices.create.side_effect = (
            elasticsearch.exceptions.ElasticsearchException)

        melastic.indices.exists_alias.side_effect = [False, True]

        db.DB.create("own_url", elastics)
        mock_elastic.assert_called_once_with(elastics)

        melastic.info.assert_called_once_with()
        melastic.indices.exists.assert_has_calls(
            [mock.call("netmet_catalog"), mock.call("netmet_events")],
            any_order=True)

        melastic.indices.create.assert_has_calls(
            [mock.call(index="netmet_events", body=db.DB._EVENTS)])

        melastic.indices.rollover.assert_called_once_with(
            alias="netmet_data_v2", body=mock.ANY)

        melastic.indices.exists_alias.assert_has_calls(
            [
                mock.call(name="netmet_data_v2"),
                mock.call(name="netmet_data_v2")
            ],
            any_order=True)

    @mock.patch("netmet.server.db.DB._rollover_data")
    @mock.patch("netmet.server.db.elasticsearch.Elasticsearch")
    def test_job(self, mock_elastic, mock_rollover_data):
        # _job() is a no-op until create() has set _inited.
        db.DB()._job()
        self.assertEqual(0, mock_rollover_data.call_count)

    @mock.patch("netmet.server.db.DB._rollover_data")
    @mock.patch("netmet.server.db.elasticsearch.Elasticsearch")
    def test_job_inited(self, mock_elastic, mock_rollover_data):
        # create() rolls over once; a subsequent _job() rolls over again.
        db.DB.create("own_url", ["elastics"])
        self.assertEqual(1, mock_rollover_data.call_count)
        db.get()._job()
        self.assertEqual(2, mock_rollover_data.call_count)

    @mock.patch("netmet.server.db.elasticsearch.Elasticsearch")
    def test_clients_get(self, mock_elastic):
        mock_elastic.return_value.search.return_value = {
            "hits": {"hits": [{"_source": {"a": 1}}, {"_source": {"a": 2}}]}
        }
        db.DB.create("a", ["b"])
        self.assertEqual(db.get().clients_get(), [{"a": 1}, {"a": 2}])

    @mock.patch("netmet.server.db.elasticsearch.Elasticsearch")
    def test_clients_set(self, mock_elastic):
        # clients_set wipes the catalog then bulk-indexes newline-delimited
        # action/document pairs (Elasticsearch bulk API format).
        fake_catalog = [{"a": 1}, {"b": 2}]
        expected_body = '{"index": {}}\n{"a": 1}\n{"index": {}}\n{"b": 2}'
        db.DB.create("a", ["b"])
        db.get().clients_set(fake_catalog)

        mock_elastic.return_value.delete_by_query.assert_called_once_with(
            index="netmet_catalog", doc_type="clients",
            body={"query": {"match_all": {}}})

        mock_elastic.return_value.bulk.assert_called_once_with(
            index="netmet_catalog", doc_type="clients", body=expected_body,
            refresh="true")

    @mock.patch("netmet.server.db.elasticsearch.Elasticsearch")
    def test_server_config_get(self, mock_elastic):
        config = {
            "config": json.dumps({"some": "stuff"}),
            "applied": True,
            "meshed": "False"
        }

        # "config" comes back JSON-decoded and the ES _id is attached.
        expected_result = {
            "id": "id",
            "config": {"some": "stuff"},
            "applied": True,
            "meshed": "False"
        }

        query = {
            "sort": {"timestamp": {"order": "desc"}},
            "query": {"term": {"applied": True}}
        }

        # First search returns one hit, second returns none (-> None result).
        mock_elastic.return_value.search.side_effect = [
            {"hits": {"hits": [{"_id": "id", "_source": config}]}},
            {"hits": {"hits": []}}
        ]
        db.DB.create("a", ["b"])
        self.assertEqual(expected_result,
                         db.get().server_config_get(only_applied=True))
        mock_elastic.return_value.search.assert_called_once_with(
            index="netmet_catalog", doc_type="config", body=query, size=1)

        self.assertIsNone(db.get().server_config_get(only_applied=True))

    @mock.patch("netmet.server.db.elasticsearch.Elasticsearch")
    def test_server_config_add(self, mock_elastic):
        db.DB.create("a", ["b"])
        db.get().server_config_add({"a": 1})

        expected_body = {
            "config": '{"a": 1}',
            "applied": False,
            "meshed": False,
            "timestamp": mock.ANY
        }
        mock_elastic.return_value.index.assert_called_once_with(
            index="netmet_catalog", doc_type="config", body=expected_body,
            refresh="true")

    @mock.patch("netmet.server.db.elasticsearch.Elasticsearch")
    def test_server_config_apply(self, mock_elastic):
        db.DB.create("a", ["b"])
        db.get().server_config_apply("id1")
        mock_elastic.return_value.update.assert_called_once_with(
            index="netmet_catalog", doc_type="config", id="id1",
            body={"doc": {"applied": True}}, refresh="true")

    @mock.patch("netmet.server.db.elasticsearch.Elasticsearch")
    def test_server_config_meshed(self, mock_elastic):
        db.DB.create("a", ["b"])
        db.get().server_config_meshed("id2")
        mock_elastic.return_value.update.assert_called_once_with(
            index="netmet_catalog", doc_type="config", id="id2",
            body={"doc": {"meshed": True}}, refresh="true")

    def test_metrics_add_wrong_type(self):
        # Only "east-west" / "north-south" doc types are accepted.
        self.assertRaises(ValueError,
                          db.DB().metrics_add, "some_invalid_type", [])

    @mock.patch("netmet.server.db.elasticsearch.Elasticsearch")
    def test_metrics_add(self, mock_elastic):
        mock_elastic.return_value.bulk.return_value = {
            "items": [
                {"index": {"status": 200}},
                {"index": {"status": 200}},
                {"index": {"status": 500}}
            ]
        }
        db.DB.create("a", ["b"])
        # Nested docs are flattened ("a.b") before bulk insert; the return
        # value is a histogram of per-item HTTP statuses.
        doc = {"a": {"b": 1}, "c": 2}
        expected_bulk = '{"index": {}}\n{"c": 2, "a.b": 1}'
        self.assertEqual({200: 2, 500: 1},
                         db.get().metrics_add("east-west", [doc]))
        mock_elastic.return_value.bulk.assert_called_once_with(
            index="netmet_data_v2", doc_type="east-west", body=expected_bulk)

    @mock.patch("netmet.server.db.elasticsearch.Elasticsearch")
    def test_event_get(self, mock_elastic):
        mock_elastic.return_value.get.return_value = {
            "found": True, "_version": 2, "_source": {"a": 1}
        }
        db.DB.create("a", ["b"])
        # event_get returns a (version, source) tuple for optimistic locking.
        self.assertEqual((2, {"a": 1}), db.get().event_get("some_id"))
        mock_elastic.return_value.get.assert_called_once_with(
            index="netmet_events", doc_type="events", id="some_id")

    @mock.patch("netmet.server.db.elasticsearch.Elasticsearch")
    def test_event_get_not_found(self, mock_elastic):
        mock_elastic.return_value.get.return_value = {"found": False}
        db.DB.create("a", ["b"])
        self.assertRaises(exceptions.DBRecordNotFound,
                          db.get().event_get, "some_id2")
        mock_elastic.return_value.get.assert_called_once_with(
            index="netmet_events", doc_type="events", id="some_id2")

    @mock.patch("netmet.server.db.elasticsearch.Elasticsearch")
    def test_events_list(self, mock_elastic):
        mock_elastic.return_value.search.return_value = {
            "hits": {"hits": [{"_source": {"a": 1}}, {"_source": {"b": 2}}]}
        }

        db.DB.create("a", ["b"])
        self.assertEqual([{"a": 1}, {"b": 2}],
                         db.get().events_list(10, 20, only_active=True))

        # This pins the exact query shape built by events_list(only_active=True):
        # "filter" is a sibling of "bool" rather than nested inside it.
        expected_query = {
            "from": 10,
            "size": 20,
            "query": {
                "bool": {
                    "must_not": [{"term": {"status": "deleted"}}],
                    "should": [
                        {"range": {"finished_at": {"gt": "now/m"}}},
                        {"missing": {"field": "finished_at"}}
                    ]
                },
                "filter": [{"range": {"started_at": {"lte": "now/m"}}}]
            }
        }
        mock_elastic.return_value.search.assert_called_once_with(
            index="netmet_events", body=expected_query)

    @mock.patch("netmet.server.db.elasticsearch.Elasticsearch")
    @mock.patch("netmet.server.db.DB.event_get")
    def test_event_update(self, mock_event_get, mock_elastic):
        # First update succeeds ("updated"); second is a no-op and must raise.
        mock_elastic.return_value.update.side_effect = [
            {"result": "updated"}, {"result": "noop"}
        ]
        mock_event_get.return_value = (2, {})
        db.DB.create("a", ["b"])
        self.assertTrue(db.get()._event_update("some_id", {"a": 1}))
        mock_event_get.assert_called_once_with("some_id")
        mock_elastic.return_value.update.assert_called_once_with(
            index="netmet_events", doc_type="events", id="some_id",
            body={"doc": {"a": 1}}, refresh='true', version=2)

        self.assertRaises(exceptions.DBConflict,
                          db.get()._event_update, "some_other_id", {"a": 1})

    @mock.patch("netmet.server.db.DB.event_get")
    def test_event_update_version_conflict(self, mock_event_get):
        # Caller-supplied version (2) differs from stored version (1).
        mock_event_get.return_value = (1, {})

        self.assertRaises(exceptions.DBConflict,
                          db.DB()._event_update, "some_id", {}, version=2)

    def test_get_query(self):
        event = {
            "started_at": "a",
            "finished_at": "b",
            "traffic_to.type": "to_type",
            "traffic_to.value": "to_value",
            "traffic_from.type": "from_type",
            "traffic_from.value": "from_value"
        }

        expected_filter = [
            {"range": {"timestamp": {"gte": "a", "lte": "b"}}},
            {"term": {"client_dest.to_type": "to_value"}},
            {"term": {"client_src.from_type": "from_value"}}
        ]
        id_query = {"term": {"events": "some_id"}}

        # NOTE(review): the trailing comma after this first assertEqual wraps
        # the statement in a one-element tuple; harmless, but likely a typo.
        self.assertEqual(
            {
                "bool": {
                    "filter": expected_filter,
                    "must": [],
                    "must_not": [id_query]
                }
            },
            db.DB()._get_query(event, "some_id", "add")),

        self.assertEqual(
            {
                "bool": {
                    "filter": expected_filter,
                    "must": [id_query],
                    "must_not": []
                }
            },
            db.DB()._get_query(event, "some_id", "remove"))

    def test_get_script(self):
        self.assertEqual(
            {
                "inline": "ctx._source.events.add('some_id')",
                "lang": "painless"
            },
            db.DB()._get_script("some_id", "add"))

        self.assertEqual(
            {
                "inline": "ctx._source.events.remove"
                          "(ctx._source.events.indexOf('some_id2'))",
                "lang": "painless"
            },
            db.DB()._get_script("some_id2", "remove"))

    @mock.patch("netmet.server.db.DB._get_script")
    @mock.patch("netmet.server.db.DB._get_query")
    @mock.patch("netmet.server.db.DB.event_get")
    @mock.patch("netmet.server.db.DB._event_update")
    @mock.patch("netmet.server.db.elasticsearch.Elasticsearch")
    def test_event_upgrade_metrics(self, mock_elastic, mock_event_update,
                                   mock_event_get, mock_get_query,
                                   mock_get_script):
        melastic = mock_elastic.return_value

        # Existing task is completed, so the update proceeds and a new
        # update_by_query task is recorded on the event.
        mock_event_get.return_value = (1, {"task_id": "some_task2"})
        melastic.tasks.get.return_value = {"completed": True}
        melastic.update_by_query.return_value = {"task": "some_task"}

        db.DB.create("a", ["b"])
        db.get()._event_upgrade_metrics("some_id", "add")

        mock_event_get.assert_called_once_with("some_id")
        melastic.tasks.get.assert_called_once_with(task_id="some_task2")
        mock_event_update.assert_has_calls([
            mock.call("some_id", {"task_id": None, "status": "updating"},
                      version=1),
            mock.call("some_id", {"task_id": "some_task", "status": "created"})
        ])

        body = {
            "query": mock_get_query.return_value,
            "script": mock_get_script.return_value,
        }
        melastic.update_by_query.assert_called_once_with(
            index="netmet_data_v2*", body=body, conflicts="proceed",
            wait_for_completion=False, requests_per_second=1000)
        mock_get_query.assert_called_once_with(mock_event_get.return_value[1],
                                               "some_id", "add")
        mock_get_script.assert_called_once_with("some_id", "add")

    @mock.patch("netmet.server.db.DB._event_upgrade_metrics")
    @mock.patch("netmet.server.db.elasticsearch.Elasticsearch")
    def test_event_create(self, mock_elastic, mock_event_upgrade_metrics):
        db.DB.create("a", ["b"])
        data = {"some_data": 1}

        mock_elastic.return_value.create.return_value = {"created": True}

        self.assertTrue(db.get().event_create("some_id", data))

        mock_elastic.return_value.create.assert_called_once_with(
            index="netmet_events", doc_type="events", id="some_id",
            body={"some_data": 1, "status": "created"}, refresh="true")

        mock_event_upgrade_metrics.assert_called_once_with("some_id", "add")

    @mock.patch("netmet.server.db.DB.event_get")
    @mock.patch("netmet.server.db.DB._event_update")
    def test_event_stop(self, mock_event_update, mock_event_get):
        mock_event_get.return_value = (2, {})
        db.DB().event_stop("22")
        mock_event_get.assert_called_once_with("22")
        mock_event_update.assert_called_once_with(
            "22", {"finished_at": mock.ANY}, 2)

    @mock.patch("netmet.server.db.DB.event_get")
    def test_event_stop_conflict(self, mock_event_get):
        # Stopping an already-finished event must raise DBConflict.
        mock_event_get.return_value = (1,
{"finished_at": "some_value"}) 378 | self.assertRaises(exceptions.DBConflict, 379 | db.DB().event_stop, "42") 380 | mock_event_get.assert_called_once_with("42") 381 | 382 | @mock.patch("netmet.server.db.DB._event_upgrade_metrics") 383 | @mock.patch("netmet.server.db.DB._event_update") 384 | def test_event_delete(self, mock_event_update, mock_event_upgrade_metrics): 385 | db.DB().event_delete("22") 386 | mock_event_update.assert_called_once_with("22", {"status": "deleted"}) 387 | mock_event_upgrade_metrics.assert_called_once_with("22", "remove") 388 | -------------------------------------------------------------------------------- /netmet/server/db.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017: GoDaddy Inc. 2 | 3 | import copy 4 | import datetime 5 | import json 6 | import logging 7 | 8 | import elasticsearch 9 | import morph 10 | 11 | from netmet import exceptions 12 | from netmet.utils import worker 13 | 14 | 15 | LOG = logging.getLogger(__name__) 16 | 17 | # Use streaming API instead of this 18 | # Elastic doesn't allow to query more than 10k elements (for the reason) 19 | MAX_AMOUNT_OF_SERVERS = 10000 20 | 21 | 22 | def get(): 23 | return DB.get() 24 | 25 | 26 | class DB(worker.LonelyWorker): 27 | _period = 600 # every 10 minutes check needs to rollover index 28 | 29 | _CATALOG_IDX = "netmet_catalog" 30 | _DATA_ALIAS = "netmet_data_v2" 31 | _DATA_IDX = "<%s-{now/d}-000001>" % _DATA_ALIAS 32 | _EVENTS_IDX = "netmet_events" 33 | 34 | _CATALOG = { 35 | "settings": { 36 | "index": { 37 | "number_of_shards": 3, 38 | "number_of_replicas": 3 39 | } 40 | }, 41 | "mappings": { 42 | "clients": { 43 | "dynamic": "strict", 44 | "properties": { 45 | "host": {"type": "keyword"}, 46 | "ip": {"type": "ip"}, 47 | "port": {"type": "integer"}, 48 | "mac": {"type": "keyword"}, 49 | "hypervisor": {"type": "keyword"}, 50 | "az": {"type": "keyword"}, 51 | "dc": {"type": "keyword"}, 52 | "configured": {"type": "boolean"} 53 | 
} 54 | }, 55 | "config": { 56 | "dynamic": "strict", 57 | "properties": { 58 | "timestamp": {"type": "date"}, 59 | "config": {"type": "text"}, 60 | "applied": {"type": "boolean"}, 61 | "meshed": {"type": "boolean"} 62 | } 63 | } 64 | } 65 | } 66 | 67 | _DATA = { 68 | "settings": { 69 | "index": { 70 | "number_of_shards": 10, 71 | "number_of_replicas": 1 72 | } 73 | }, 74 | "mappings": { 75 | "north-south": { 76 | "dynamic": "strict", 77 | "properties": { 78 | "client_src.host": {"type": "keyword"}, 79 | "client_src.ip": {"type": "ip"}, 80 | "client_src.port": {"type": "integer"}, 81 | "client_src.hypervisor": {"type": "keyword"}, 82 | "client_src.az": {"type": "keyword"}, 83 | "client_src.dc": {"type": "keyword"}, 84 | "dest": {"type": "keyword"}, 85 | "protocol": {"type": "keyword"}, 86 | "timestamp": {"type": "date"}, 87 | "transmitted": {"type": "integer"}, 88 | "packet_size": {"type": "integer"}, 89 | "lost": {"type": "integer"}, 90 | "latency": {"type": "float"}, 91 | "ret_code": {"type": "integer"}, 92 | "events": {"type": "keyword"} 93 | } 94 | }, 95 | "east-west": { 96 | "dynamic": "strict", 97 | "properties": { 98 | "protocol": {"type": "keyword"}, 99 | "client_src.host": {"type": "keyword"}, 100 | "client_src.ip": {"type": "ip"}, 101 | "client_src.port": {"type": "integer"}, 102 | "client_src.hypervisor": {"type": "keyword"}, 103 | "client_src.az": {"type": "keyword"}, 104 | "client_src.dc": {"type": "keyword"}, 105 | "client_dest.host": {"type": "keyword"}, 106 | "client_dest.ip": {"type": "ip"}, 107 | "client_dest.port": {"type": "integer"}, 108 | "client_dest.hypervisor": {"type": "keyword"}, 109 | "client_dest.az": {"type": "keyword"}, 110 | "client_dest.dc": {"type": "keyword"}, 111 | "timestamp": {"type": "date"}, 112 | "packet_size": {"type": "integer"}, 113 | "transmitted": {"type": "integer"}, 114 | "lost": {"type": "integer"}, 115 | "latency": {"type": "float"}, 116 | "ret_code": {"type": "integer"}, 117 | "events": {"type": "keyword"} 118 | } 
119 | } 120 | } 121 | } 122 | 123 | _EVENTS = { 124 | "settings": { 125 | "index": { 126 | "number_of_shards": 3, 127 | "number_of_replicas": 3 128 | }, 129 | }, 130 | "mappings": { 131 | "events": { 132 | "dynamic": "strict", 133 | "properties": { 134 | "name": {"type": "keyword"}, 135 | "status": {"type": "keyword"}, 136 | "started_at": {"type": "date"}, 137 | "finished_at": {"type": "date"}, 138 | "task_id": {"type": "keyword"}, 139 | "traffic_from.type": {"type": "keyword"}, 140 | "traffic_from.value": {"type": "keyword"}, 141 | "traffic_to.type": {"type": "keyword"}, 142 | "traffic_to.value": {"type": "keyword"} 143 | } 144 | } 145 | } 146 | } 147 | 148 | @classmethod 149 | def create(cls, own_url, elastic): 150 | super(DB, cls).create() 151 | cls._self.own_url = own_url 152 | cls._self.elastic_urls = elastic 153 | cls._self.elastic = elasticsearch.Elasticsearch(elastic) 154 | cls._self._ensure_elastic() 155 | cls._self._ensure_schema() 156 | cls._self._rollover_data() 157 | cls._self._inited = True 158 | 159 | def _job(self): 160 | try: 161 | if getattr(self, "_inited", False): 162 | self._rollover_data() 163 | except Exception: 164 | LOG.exception("DB update failed") 165 | 166 | def _rollover_data(self): 167 | body = {"conditions": {"max_age": "1d", "max_docs": 10000000}} 168 | body.update(self._DATA) 169 | self.elastic.indices.rollover(alias=DB._DATA_ALIAS, body=body) 170 | 171 | def _ensure_elastic(self): 172 | self.elastic.info() 173 | 174 | def _ensure_schema(self): 175 | """Ensures that indexes exist & have right schemas. 176 | 177 | If there is no index this method creates it. 
178 | If there is index but it has different schema process is shutdown 179 | """ 180 | data = [(self._CATALOG_IDX, self._CATALOG), 181 | (self._EVENTS_IDX, self._EVENTS)] 182 | 183 | for idx, mapping in data: 184 | try: 185 | if not self.elastic.indices.exists(idx): 186 | self.elastic.indices.create(index=idx, body=mapping) 187 | except elasticsearch.exceptions.ElasticsearchException as e: 188 | if not self.elastic.indices.exists(idx): 189 | raise exceptions.DBInitFailure( 190 | elastic=self.elastic, message=e) 191 | 192 | try: 193 | if not self.elastic.indices.exists_alias(name=DB._DATA_ALIAS): 194 | new_data = copy.deepcopy(self._DATA) 195 | new_data["aliases"] = {DB._DATA_ALIAS: {}} 196 | self.elastic.indices.create(index=self._DATA_IDX, 197 | body=new_data) 198 | except elasticsearch.exceptions.ElasticsearchException as e: 199 | if not self.elastic.indices.exists_alias(name=DB._DATA_ALIAS): 200 | raise exceptions.DBInitFailure(elastic=self.elastic, message=e) 201 | 202 | def clients_get(self): 203 | data = self.elastic.search(index=DB._CATALOG_IDX, doc_type="clients", 204 | size=MAX_AMOUNT_OF_SERVERS) 205 | 206 | return [morph.unflatten(x["_source"]) for x in data["hits"]["hits"]] 207 | 208 | def clients_set(self, catalog): 209 | bulk_body = [] 210 | for c in catalog: 211 | bulk_body.append(json.dumps({"index": {}})) 212 | bulk_body.append(json.dumps(morph.flatten(c))) 213 | 214 | self.elastic.delete_by_query(index=DB._CATALOG_IDX, 215 | doc_type="clients", 216 | body={"query": {"match_all": {}}}) 217 | 218 | self.elastic.bulk(index=DB._CATALOG_IDX, doc_type="clients", 219 | body="\n".join(bulk_body), 220 | refresh="true") 221 | 222 | def server_config_get(self, only_applied=False): 223 | query = {"sort": {"timestamp": {"order": "desc"}}} 224 | if only_applied: 225 | query["query"] = {"term": {"applied": True}} 226 | result = self.elastic.search(index=DB._CATALOG_IDX, doc_type="config", 227 | body=query, size=1) 228 | 229 | hits = result["hits"]["hits"] 230 | 
if not hits: 231 | return 232 | 233 | result = hits[0]["_source"] 234 | result["config"] = json.loads(result["config"]) 235 | result["id"] = hits[0]["_id"] 236 | return result 237 | 238 | def server_config_add(self, config): 239 | """Adds new server config.""" 240 | body = { 241 | "config": json.dumps(config), 242 | "applied": False, 243 | "meshed": False, 244 | "timestamp": datetime.datetime.now().isoformat() 245 | } 246 | self.elastic.index(index=DB._CATALOG_IDX, 247 | doc_type="config", body=body, 248 | refresh="true") 249 | 250 | def server_config_apply(self, id_): 251 | self.elastic.update(index=DB._CATALOG_IDX, 252 | doc_type="config", id=id_, 253 | body={"doc": {"applied": True}}, 254 | refresh="true") 255 | 256 | def server_config_meshed(self, id_): 257 | self.elastic.update(index=DB._CATALOG_IDX, 258 | doc_type="config", id=id_, 259 | body={"doc": {"meshed": True}}, 260 | refresh="true") 261 | 262 | def metrics_add(self, doc_type, data): 263 | if doc_type not in ["east-west", "north-south"]: 264 | raise ValueError("Wrong doc type") 265 | 266 | bulk_body = [] 267 | for d in data: 268 | bulk_body.append(json.dumps({"index": {}})) 269 | bulk_body.append(json.dumps(morph.flatten(d))) 270 | 271 | # NOTE(boris-42): We should analyze Elastic response here. 
272 | r = self.elastic.bulk(index=DB._DATA_ALIAS, doc_type=doc_type, 273 | body="\n".join(bulk_body)) 274 | 275 | results = {} 276 | for it in r["items"]: 277 | k = it["index"]["status"] 278 | results.setdefault(k, 0) 279 | results[k] += 1 280 | 281 | LOG.info("Metrics bulk insert result: %s" % results) 282 | return results 283 | 284 | def event_get(self, id_): 285 | r = self.elastic.get(index=DB._EVENTS_IDX, doc_type="events", id=id_) 286 | if not r["found"]: 287 | raise exceptions.DBRecordNotFound(record=id_) 288 | return r["_version"], r["_source"] 289 | 290 | def events_list(self, offset, limit, only_active=False): 291 | query = { 292 | "from": offset, 293 | "size": limit, 294 | "query": { 295 | "bool": { 296 | "must_not": [{"term": {"status": "deleted"}}] 297 | } 298 | } 299 | } 300 | if only_active: 301 | query["query"]["filter"] = [ 302 | {"range": {"started_at": {"lte": "now/m"}}}] 303 | 304 | query["query"]["bool"]["should"] = [ 305 | {"range": {"finished_at": {"gt": "now/m"}}}, 306 | {"missing": {"field": "finished_at"}} 307 | ] 308 | 309 | results = self.elastic.search(index=DB._EVENTS_IDX, body=query) 310 | return [r["_source"] for r in results["hits"]["hits"]] 311 | 312 | def _event_update(self, id_, doc, version=None): 313 | v, el = self.event_get(id_) 314 | 315 | version = v if version is None else version 316 | if v != version: 317 | raise exceptions.DBConflict( 318 | "Record %s was updated by another concurrent request" % id_) 319 | 320 | body = {"doc": doc} 321 | r = self.elastic.update(index=DB._EVENTS_IDX, doc_type="events", 322 | id=id_, version=version, 323 | body=body, refresh="true") 324 | 325 | if not r["result"] == "updated": 326 | raise exceptions.DBConflict( 327 | "Record %s was update by other concurrent request." 
% id_) 328 | 329 | return True 330 | 331 | def _get_query(self, event, id_, action): 332 | query = {"must": [], "must_not": [], "filter": []} 333 | 334 | if event["started_at"] or event["finished_at"]: 335 | q = {"timestamp": {}} 336 | if event["started_at"]: 337 | q["timestamp"]["gte"] = event["started_at"] 338 | if event["finished_at"]: 339 | q["timestamp"]["lte"] = event["finished_at"] 340 | query["filter"].append({"range": q}) 341 | 342 | if event.get("traffic_to.type"): 343 | term = "client_dest.%s" % event["traffic_to.type"] 344 | query["filter"].append({"term": {term: event["traffic_to.value"]}}) 345 | 346 | if event.get("traffic_from.type"): 347 | term = "client_src.%s" % event["traffic_from.type"] 348 | query["filter"].append( 349 | {"term": {term: event["traffic_from.value"]}}) 350 | 351 | if action == "remove": 352 | query["must"].append({"term": {"events": id_}}) 353 | elif action == "add": 354 | query["must_not"].append({"term": {"events": id_}}) 355 | 356 | return {"bool": query} 357 | 358 | def _get_script(self, id_, action): 359 | if action == "add": 360 | return { 361 | "inline": "ctx._source.events.add('%s')" % id_, 362 | "lang": "painless" 363 | } 364 | 365 | elif action == "remove": 366 | return { 367 | "inline": "ctx._source.events.remove" 368 | "(ctx._source.events.indexOf('%s'))" % id_, 369 | "lang": "painless" 370 | } 371 | 372 | def _event_upgrade_metrics(self, id_, action): 373 | version, event = self.event_get(id_) 374 | 375 | if event.get("task_id"): 376 | t = self.elastic.tasks.get(task_id=event["task_id"]) 377 | if not t.get("completed", False): 378 | raise exceptions.DBConflict( 379 | "Task %s is still running" % event["task_id"]) 380 | 381 | self._event_update( 382 | id_, {"task_id": None, "status": "updating"}, version=version) 383 | else: 384 | self._event_update(id_, {"status": "updating"}, version=version) 385 | 386 | body = { 387 | "query": self._get_query(event, id_, action), 388 | "script": self._get_script(id_, action) 389 | 
} 390 | 391 | result = self.elastic.update_by_query( 392 | index=DB._DATA_ALIAS + "*", body=body, conflicts="proceed", 393 | wait_for_completion=False, requests_per_second=1000) 394 | 395 | self._event_update(id_, 396 | {"task_id": result["task"], "status": "created"}) 397 | 398 | def event_create(self, id_, data): 399 | data = dict(data) 400 | data["status"] = "created" 401 | r = self.elastic.create(index=DB._EVENTS_IDX, doc_type="events", 402 | id=id_, body=data, refresh="true") 403 | if r["created"]: 404 | self._event_upgrade_metrics(id_, "add") 405 | return True 406 | 407 | def event_stop(self, id_): 408 | version, event = self.event_get(id_) 409 | 410 | if event.get("finished_at", None): 411 | raise exceptions.DBConflict("Event is already stopped.") 412 | 413 | return self._event_update( 414 | id_, {"finished_at": datetime.datetime.now().isoformat()}, version) 415 | 416 | def event_delete(self, id_): 417 | self._event_upgrade_metrics(id_, "remove") 418 | return self._event_update(id_, {"status": "deleted"}) 419 | 420 | def lock_acquire(self, name, ttl): 421 | # release old one if ttl hit 422 | data = { 423 | "updated_at": datetime.datetime.now().isoformat(), 424 | "url": self.own_url, 425 | "ttl": ttl 426 | } 427 | try: 428 | # TODO(boris-42): Check whatever we can delete obsolate lock 429 | idx = "netmet_lock_%s" % name 430 | self.elastic.indices.create(idx, body={}) 431 | self.elastic.index(index=idx, doc_type="lock", id=1, body=data) 432 | return True 433 | except elasticsearch.exceptions.ElasticsearchException: 434 | return False 435 | 436 | def lock_release(self, name): 437 | try: 438 | # TODO(boris-42): Try few times to delete lock 439 | self.elastic.indices.delete("netmet_lock_%s" % name) 440 | return True 441 | except elasticsearch.exceptions.ElasticsearchException: 442 | return False 443 | --------------------------------------------------------------------------------