├── .gitignore
├── Makefile
├── README.md
├── airflow_operators_metrics
│   ├── metrics.py
│   └── server.py
├── alpine.Dockerfile
├── example
│   ├── dags
│   │   ├── __init__.py
│   │   └── memory.py
│   ├── docker-compose.yaml
│   └── prometheus.yml
├── requirements.txt
├── setup.py
└── ubuntu.Dockerfile

/.gitignore:
--------------------------------------------------------------------------------
# Created by .ignore support plugin (hsz.mobi)
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

.idea
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
.PHONY: all build push alpine run

BASE ?= ubuntu
TAG ?= ${BASE}

alpine:
	$(eval BASE := alpine)

all: build push

build:
	docker build -f $(BASE).Dockerfile -t mastak/airflow_operator_stats:$(TAG) .

push:
	docker push mastak/airflow_operator_stats:$(TAG)

run:
	docker run --privileged --cap-add SYS_PTRACE -v /proc:/host-proc:ro \
		-e CUSTOM_PROCFS_PATH=/host-proc \
		mastak/airflow_operator_stats:$(TAG)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
## Airflow operator stats

Gathers system metrics (memory and CPU) for the Airflow task processes running on a node and exposes them to Prometheus.
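
For illustration, the exporter publishes one gauge per metric and per running task process. The metric and label names below match the collector code; the label values and numbers are made up:

```
airflow_process_mem_rss{name="memory.memory_consumer_2019-01-01T00:00:00_local",dag="memory",operator="memory_consumer",exec_date="2019-01-01T00:00:00",hostname="worker-1"} 209715200.0
airflow_process_cpu_percent{name="memory.memory_consumer_2019-01-01T00:00:00_local",dag="memory",operator="memory_consumer",exec_date="2019-01-01T00:00:00",hostname="worker-1"} 1.5
```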

### How to use

* Run the airflow_operators_metrics container on each node that runs Airflow workers, for example:
```
docker run -d \
    -v /proc:/host/proc:ro \
    -v /etc/hostname:/host/hostname:ro \
    -e CUSTOM_PROCFS_PATH=/host/proc \
    -e HOSTNAME_PATH=/host/hostname \
    mastak/airflow_operator_stats:alpine
```

* Add a scrape config to Prometheus. Docker Swarm example (the exporter is expected to run as a service named `airflow-exporter`, so `tasks.airflow-exporter` resolves to all of its tasks):
```
scrape_configs:
  - job_name: 'airflow-exporter'
    dns_sd_configs:
      - names:
          - 'tasks.airflow-exporter'
        type: 'A'
        port: 8000
```
* Add a dashboard to Grafana: create your own or use this one: https://grafana.com/dashboards/9672

### Example

The `example` folder contains a docker-compose.yaml that shows the general setup, but it does not work yet.
--------------------------------------------------------------------------------
/airflow_operators_metrics/metrics.py:
--------------------------------------------------------------------------------
import logging
import typing as t

import psutil
from prometheus_client import Gauge, Summary

logger = logging.getLogger(__name__)

COLLECT_TIME = Summary('airflow_collecting_stats_seconds',
                       'Time spent processing collecting stats')


class ProcessMetrics(t.NamedTuple):
    dag: str
    operator: str
    exec_date: str
    is_local: bool
    is_raw: bool

    mem_rss: int
    mem_vms: int
    mem_shared: int
    mem_text: int
    mem_data: int
    mem_lib: int
    mem_uss: int
    mem_pss: int
    mem_swap: int

    # cpu_num: int
    cpu_percent: float
    cpu_times_user: float
    cpu_times_system: float


class MetricsContainer:
    def __init__(self, prefix=None,
                 global_labels: t.Optional[t.Dict[str, str]] = None):
        self._prefix = prefix
        self._global_labels = global_labels
        labels = ('name', 'dag', 'operator', 'exec_date')
        if global_labels:
            labels = labels + tuple(global_labels.keys())

        def gauge(name, documentation, labelnames=(), *args, **kwargs):
            if prefix:
                name = f'{prefix}_{name}'
            labelnames += labels
            return Gauge(name, documentation, labelnames, *args, **kwargs)

        self._mem_rss = gauge('airflow_process_mem_rss',
                              'Non-swapped physical memory')
        self._mem_vms = gauge('airflow_process_mem_vms',
                              'Amount of virtual memory')
        self._mem_shared = gauge('airflow_process_mem_shared',
                                 'Amount of shared memory')
        self._mem_text = gauge('airflow_process_mem_text',
                               'Devoted to executable code')
        self._mem_data = gauge('airflow_process_mem_data',
                               'Devoted to anything other than executable code')
        self._mem_lib = gauge('airflow_process_mem_lib',
                              'Used by shared libraries')
        self._mem_uss = gauge('airflow_process_mem_uss',
                              'Mem unique to a process and which would be freed '
                              'if the process was terminated right now')
        self._mem_swap = gauge('airflow_process_mem_swap',
                               'Amount of swapped memory')
        self._mem_pss = gauge('airflow_process_mem_pss',
                              'Shared with other processes, accounted in a way that '
                              'the amount is divided evenly between processes '
                              'that share it')

        # self._cpu_num = gauge('airflow_process_cpu_num',
        #                       'CPU this process is currently running on')
        self._cpu_percent = gauge('airflow_process_cpu_percent',
                                  'System-wide CPU utilization as a percentage '
                                  'of the process')
        self._cpu_times_user = gauge('airflow_process_cpu_times_user',
                                     'CPU times user')
        self._cpu_times_system = gauge('airflow_process_cpu_times_system',
                                       'CPU times system')

    @COLLECT_TIME.time()
    def collect(self):
        handled = 0
        self._reset()
        for process_metrics in _get_processes_metrics():
            self._handle_process_metrics(process_metrics)
            handled += 1
        logger.info(f'Gathered metrics from {handled} processes')

    def _handle_process_metrics(self, metrics: ProcessMetrics):
        name = _get_process_name(metrics)
        labels = {'name': name, 'dag': metrics.dag,
                  'operator': metrics.operator, 'exec_date': metrics.exec_date}
        if self._global_labels:
            labels.update(self._global_labels)

        self._mem_rss.labels(**labels).set(metrics.mem_rss)
        self._mem_vms.labels(**labels).set(metrics.mem_vms)
        self._mem_shared.labels(**labels).set(metrics.mem_shared)
        self._mem_text.labels(**labels).set(metrics.mem_text)
        self._mem_data.labels(**labels).set(metrics.mem_data)
        self._mem_lib.labels(**labels).set(metrics.mem_lib)
        self._mem_uss.labels(**labels).set(metrics.mem_uss)
        self._mem_swap.labels(**labels).set(metrics.mem_swap)
        self._mem_pss.labels(**labels).set(metrics.mem_pss)

        self._cpu_percent.labels(**labels).set(metrics.cpu_percent)
        self._cpu_times_user.labels(**labels).set(metrics.cpu_times_user)
        self._cpu_times_system.labels(**labels).set(metrics.cpu_times_system)

    def _reset(self):
        # Drop all previously seen label sets so that metrics for finished
        # task processes do not linger in the exposition.
        self._mem_rss._metrics = {}
        self._mem_vms._metrics = {}
        self._mem_shared._metrics = {}
        self._mem_text._metrics = {}
        self._mem_data._metrics = {}
        self._mem_lib._metrics = {}
        self._mem_uss._metrics = {}
        self._mem_swap._metrics = {}
        self._mem_pss._metrics = {}

        self._cpu_percent._metrics = {}
        self._cpu_times_user._metrics = {}
        self._cpu_times_system._metrics = {}


def _get_processes_metrics() -> t.Iterator[ProcessMetrics]:
    for process in psutil.process_iter():
        try:
            airflow_data = get_airflow_data(process)
            if not airflow_data:
                continue
            mem = process.memory_full_info()
            cpu_times = process.cpu_times()
            cpu_percent = process.cpu_percent()
        except psutil.NoSuchProcess:
            continue

        yield ProcessMetrics(
            dag=airflow_data['dag'],
            operator=airflow_data['operator'],
            exec_date=airflow_data['exec_date'],
            is_local=airflow_data['is_local'],
            is_raw=airflow_data['is_raw'],

            mem_rss=mem.rss,
            mem_vms=mem.vms,
            mem_shared=mem.shared,
            mem_text=mem.text,
            mem_data=mem.data,
            mem_lib=mem.lib,
            mem_uss=mem.uss,
            mem_pss=mem.pss,
            mem_swap=mem.swap,

            cpu_percent=cpu_percent,
            cpu_times_user=cpu_times.user,
            cpu_times_system=cpu_times.system,
        )


def _get_process_name(metrics: ProcessMetrics):
    dag, operator = metrics.dag, metrics.operator
    if dag not in operator:
        name_parts = [f'{dag}.{operator}']
    else:
        name_parts = [operator]
    name_parts.append(metrics.exec_date)
    if metrics.is_local:
        name_parts.append('local')
    if metrics.is_raw:
        name_parts.append('is_raw')
    return '_'.join(name_parts)


def get_airflow_data(
        process: psutil.Process) -> t.Optional[t.Dict[str, t.Union[str, bool]]]:
    cmdline = process.cmdline()
    if not cmdline or not cmdline[0].startswith('/usr/bin/python'):
        return None

    for cmd_arg in cmdline:
        if 'airflow run' not in cmd_arg:
            continue

        # The matched argument is expected to look roughly like
        # "/usr/bin/python /usr/bin/airflow run <dag_id> <task_id> <execution_date> --local --raw ..."
        airflow_args = cmd_arg.split()
        dag = airflow_args[3]
        operator = airflow_args[4]
        exec_date = airflow_args[5][5:25]
        is_local = any(i == '--local' for i in airflow_args)
        is_raw = any(i == '--raw' for i in airflow_args)

        return {
            'dag': dag,
            'operator': operator,
            'exec_date': exec_date,
            'is_local': is_local,
            'is_raw': is_raw,
        }
--------------------------------------------------------------------------------
/airflow_operators_metrics/server.py:
--------------------------------------------------------------------------------
import os
import time
import logging
import socket

import psutil
import prometheus_client as prom

from airflow_operators_metrics.metrics import MetricsContainer


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    http_port = int(os.getenv('HTTP_PORT', 8000))
    sleep_seconds = int(os.getenv('SLEEP_SECONDS', 4))
    name_prefix = os.getenv('METRIC_NAME_PREFIX')
    hostname_path = os.getenv('HOSTNAME_PATH')
    custom_procfs_path = os.getenv('CUSTOM_PROCFS_PATH')

    if custom_procfs_path:
        psutil.PROCFS_PATH = custom_procfs_path

    labels = {'hostname': socket.gethostname()}
    if hostname_path:
        with open(hostname_path) as fp:
            host_hostname = labels['host_hostname'] = fp.read().strip()
        logging.info('hostname from file %s', host_hostname)

    prom.start_http_server(http_port)
    metrics = MetricsContainer(name_prefix, global_labels=labels)
    try:
        while True:
            metrics.collect()
            time.sleep(sleep_seconds)
    except KeyboardInterrupt:
        pass
--------------------------------------------------------------------------------
/alpine.Dockerfile:
--------------------------------------------------------------------------------
FROM python:3.7-alpine as builder

RUN apk add --update \
        build-base \
        python3-dev \
        linux-headers && \
    mkdir /wheels

COPY requirements.txt /requirements.txt

RUN pip3 wheel --wheel-dir=/wheels --find-links=/wheels -r /requirements.txt


FROM python:3.7-alpine

COPY --from=builder /wheels /wheels

COPY ./ /app

RUN pip install --no-index --find-links=/wheels -e /app && \
    rm -rf /wheels

WORKDIR /app

CMD ["python3", "/app/airflow_operators_metrics/server.py"]
--------------------------------------------------------------------------------
/example/dags/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mastak/airflow_operators_metrics/e273c6b84db2b72c7457702ec60c44e9b2845961/example/dags/__init__.py
--------------------------------------------------------------------------------
/example/dags/memory.py:
--------------------------------------------------------------------------------
import signal
import time

from airflow.models import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago


def eat_memory(mem_size, seconds_limit=60 * 10):
    def _eat_memory():
        """Allocate roughly ``mem_size`` bytes and sleep in a loop until
        ``seconds_limit`` passes or SIGINT/SIGTERM is received.
        """
        some = '1' * mem_size  # keep a reference so the memory stays allocated
        _eat_memory.is_stop = False
        end_time = time.time() + seconds_limit

        def stop(signum, frame):
            _eat_memory.is_stop = True

        signal.signal(signal.SIGINT, stop)
        signal.signal(signal.SIGTERM, stop)

        while not _eat_memory.is_stop and time.time() < end_time:
            time.sleep(1)
    return _eat_memory


args = {
    'start_date': days_ago(1),
    'owner': 'airflow',
}


dag = DAG(
    dag_id='memory',
    schedule_interval='@hourly',
    default_args=args,
)


memory_consumer = PythonOperator(
    task_id='memory_consumer',
    dag=dag,
    python_callable=eat_memory(mem_size=1024 * 1024 * 200),  # ~200 MB
)
--------------------------------------------------------------------------------
/example/docker-compose.yaml:
--------------------------------------------------------------------------------
version: '3.4'
services:
  redis:
    image: redis:3.2.7

  postgres:
    image: postgres:11
    environment:
      POSTGRES_USER: airflow
      POSTGRES_PASSWORD: airflow

  airflow_webserver:
    image: puckel/docker-airflow:1.10.1
    command: webserver
    ports:
      - "127.0.0.1:8080:8080"
    environment:
      EXECUTOR: Celery
    volumes:
      - ./dags:/usr/local/airflow/dags

  airflow_scheduler:
    image: puckel/docker-airflow:1.10.1
    command: scheduler
    depends_on:
      - airflow_webserver
    volumes:
      - ./dags:/usr/local/airflow/dags
    environment:
      EXECUTOR: Celery

  airflow_worker:
    image: puckel/docker-airflow:1.10.1
    command: worker
    depends_on:
      - airflow_scheduler
    volumes:
      - ./dags:/usr/local/airflow/dags
    environment:
      EXECUTOR: Celery

  airflow-exporter:
    image: mastak/airflow_operator_stats:ubuntu
    ports:
      - 8000
    volumes:
      - /proc:/host/proc:ro
      - /etc/hostname:/host/hostname:ro
    environment:
      CUSTOM_PROCFS_PATH: /host/proc
      HOSTNAME_PATH: /host/hostname

  prometheus:
    image: prom/prometheus:v2.6.0
    ports:
      - 9090
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro

  grafana:
    image: grafana/grafana:5.4.2
    ports:
      - "127.0.0.1:3000:3000"
    environment:
      GF_SERVER_ROOT_URL: "%(protocol)s://%(domain)s:/grafana"
      GF_AUTH_ANONYMOUS_ENABLED: "false"
      GF_DATABASE_URL: "{{ grafana_database_url }}"
    volumes:
      - monitoring_grafana:/var/lib/grafana

volumes:
  monitoring_grafana:
--------------------------------------------------------------------------------
/example/prometheus.yml:
--------------------------------------------------------------------------------
global:
  scrape_interval: 10s
  evaluation_interval: 10s

scrape_configs:
  - job_name: 'airflow-exporter'
    dns_sd_configs:
      - names:
          - 'tasks.airflow-exporter'
        type: 'A'
        port: 8000
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
prometheus_client==0.7.1
psutil==5.6.3
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python

from setuptools import setup

with open('requirements.txt', 'r') as f:
    install_requires = [
        s for s in [
            line.strip(' \n') for line in f
        ] if not s.startswith('#') and s != ''
    ]


setup(
    name='airflow_operators_metrics',
    description='Collector of system metrics for Airflow task processes',
    version='0.1',
    packages=['airflow_operators_metrics'],
    install_requires=install_requires,
)
--------------------------------------------------------------------------------
/ubuntu.Dockerfile:
--------------------------------------------------------------------------------
FROM ubuntu:18.10

ARG DEBIAN_FRONTEND=noninteractive
ENV LANG=en_US.UTF-8 LC_ALL=en_US.UTF-8

RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        locales \
        locales-all \
        vim \
        libxml2 \
        python3 \
        python3-dev \
        python3-pip \
        libpython3.6 \
        gcc \
        build-essential && \
    locale-gen "en_US.UTF-8" && \
    rm -rf /var/lib/apt/lists/*

RUN pip3 install -U --no-cache-dir \
    pip \
    setuptools \
    wheel

COPY requirements.txt /app/requirements.txt

RUN pip3 install -U -r /app/requirements.txt

COPY ./setup.py /app/setup.py
COPY ./airflow_operators_metrics /app/airflow_operators_metrics

RUN pip3 install -e /app

WORKDIR /app

CMD ["python3", "/app/airflow_operators_metrics/server.py"]
--------------------------------------------------------------------------------