├── pkg └── cluster_agent │ ├── __init__.py │ ├── tests │ ├── __init__.py │ ├── ut │ │ ├── __init__.py │ │ ├── test_epsagon_client.py │ │ ├── conftest.py │ │ ├── test_events_sender.py │ │ ├── test_events_manager.py │ │ ├── test_kubernetes_event.py │ │ ├── test_forwarder.py │ │ └── test_cluster_discovery.py │ └── system │ │ ├── sanity │ │ ├── __init__.py │ │ └── test_sanity.py │ │ ├── cluster_config.yml │ │ ├── README.md │ │ └── conftest.py │ ├── requirements.txt │ ├── requirements-dev.txt │ ├── .dockerignore │ ├── build │ └── Dockerfile │ ├── encoders.py │ ├── cicd │ ├── generate_test_matrix.py │ └── get_test_cluster_config.py │ ├── README.md │ ├── cluster_agent_deployment.yaml │ ├── epsagon_role.yaml │ ├── events_sender.py │ ├── epsagon_client.py │ ├── logger_configurer.py │ ├── events_manager.py │ ├── forwarder.py │ ├── kubernetes_event.py │ ├── main.py │ └── cluster_discovery.py ├── .gitignore ├── .github ├── CODEOWNERS └── workflows │ └── cluster_agent_tests.yml └── README.md /pkg/cluster_agent/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pkg/cluster_agent/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pkg/cluster_agent/tests/ut/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | .vscode 3 | __pycache__ 4 | -------------------------------------------------------------------------------- /pkg/cluster_agent/tests/system/sanity/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /pkg/cluster_agent/requirements.txt: -------------------------------------------------------------------------------- 1 | kubernetes_asyncio 2 | aiohttp 3 | aiohttp-retry 4 | aiofiles 5 | -------------------------------------------------------------------------------- /pkg/cluster_agent/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-asyncio 3 | asynctest 4 | pytest_httpserver 5 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | # Each line is a file pattern followed by one or more owners. 2 | * @epsagon/the-fabulous-team -------------------------------------------------------------------------------- /pkg/cluster_agent/.dockerignore: -------------------------------------------------------------------------------- 1 | test* 2 | build/ 3 | __pycache__/ 4 | README.md 5 | venv 6 | requirements-dev.txt 7 | *.yml 8 | -------------------------------------------------------------------------------- /pkg/cluster_agent/tests/system/cluster_config.yml: -------------------------------------------------------------------------------- 1 | kind: Cluster 2 | apiVersion: kind.x-k8s.io/v1alpha4 3 | nodes: 4 | - role: control-plane 5 | - role: worker 6 | -------------------------------------------------------------------------------- /pkg/cluster_agent/build/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7 2 | 3 | RUN mkdir /app 4 | WORKDIR /app 5 | ADD *.py /app/ 6 | ADD requirements.txt /app/ 7 | RUN pip install -r requirements.txt 8 | 9 | CMD ["python", "-u", "/app/main.py"] 10 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # epsagon-kubernetes 2 | 3 |

4 | 5 | 6 | 7 |
8 |

9 | 10 | # Epsagon Kubernetes 11 | 12 | Official repo for Epsagon Kubernetes agent. 13 | 14 | ## Prerequisites 15 | 16 | Kubernetes 1.16+ 17 | -------------------------------------------------------------------------------- /pkg/cluster_agent/encoders.py: -------------------------------------------------------------------------------- 1 | """ 2 | Common json encoders 3 | """ 4 | from datetime import datetime 5 | from json import JSONEncoder 6 | 7 | 8 | class DateTimeEncoder(JSONEncoder): 9 | """ JSON encoder for datetime class """ 10 | 11 | def default(self, o): # pylint: disable=method-hidden 12 | """ 13 | Overriding for specific serialization 14 | """ 15 | if isinstance(o, datetime): 16 | return str(o) 17 | return super(DateTimeEncoder, self).default(o) 18 | -------------------------------------------------------------------------------- /pkg/cluster_agent/tests/system/sanity/test_sanity.py: -------------------------------------------------------------------------------- 1 | """ 2 | System sanity tests 3 | """ 4 | import asyncio 5 | import pytest 6 | import conftest 7 | 8 | 9 | @pytest.fixture(scope='session', autouse=True) 10 | async def install_agent(): 11 | installer = conftest.ClusterAgentInstaller() 12 | await installer.install_all() 13 | 14 | @pytest.mark.asyncio 15 | async def test_sanity(): 16 | """ 17 | A placeholder test - ran by CICD, used to test the agent pod is 18 | running successfully. 19 | """ 20 | pass 21 | -------------------------------------------------------------------------------- /pkg/cluster_agent/cicd/generate_test_matrix.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generates the system test file paths matrix. 3 | This is written to save time during CI/CD and support custom cluster config. 
4 | """ 5 | import os 6 | import json 7 | 8 | SYSTEM_TEST_DIRPATH= "tests/system/" 9 | 10 | def main(): 11 | dirs = [result[0] for result in os.walk(SYSTEM_TEST_DIRPATH)] 12 | dirs.remove(SYSTEM_TEST_DIRPATH) 13 | print(json.dumps([directory[len(SYSTEM_TEST_DIRPATH):] for directory in dirs])) 14 | 15 | if __name__ == '__main__': 16 | main() 17 | 18 | 19 | -------------------------------------------------------------------------------- /pkg/cluster_agent/README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | 4 | 5 |
6 |

7 | 8 | # Epsagon Kubernetes Cluster Agent 9 | 10 | ![Version: 1.0.2](https://img.shields.io/badge/Version-1.0.2-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) 11 | 12 | The Epsagon kuberenetes cluster agent collect & watches your cluster resources. All collected events are being sent to [Epsagon](https://dashboard.epsagon.com/). 13 | ## Documentation 14 | Further documentation can be found on [Epsagon official docs website](https://docs.epsagon.com/). 15 | 16 | ## Prerequisites 17 | 18 | Kubernetes 1.16+ 19 | -------------------------------------------------------------------------------- /pkg/cluster_agent/tests/system/README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | 4 | 5 |
6 |

7 | 8 | # Epsagon Kubernetes Cluster Agent - System tests 9 | 10 | To run the cluster agent system tests, you need the kubectl context to be set to a 11 | kubernetes cluster. 12 | Those tests also run as part of the CICD by an workflow set for this repo. 13 | The workflow which already take care of the environment setup (using a Kind cluster). 14 | 15 | ## Prerequisites 16 | 17 | * A Kubernetes cluster environment. The cluster can be also a Kind cluster or another non-real environment. The cluster version should be 1.16+. 18 | * `kubectl` context to be set to your environment test cluster. 19 | -------------------------------------------------------------------------------- /pkg/cluster_agent/cluster_agent_deployment.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: epsagon-monitoring 5 | --- 6 | apiVersion: apps/v1 7 | kind: Deployment 8 | metadata: 9 | name: cluster-agent 10 | namespace: epsagon-monitoring 11 | spec: 12 | selector: 13 | matchLabels: 14 | app: epsagon-cluster-agent 15 | replicas: 1 16 | template: 17 | metadata: 18 | labels: 19 | app: epsagon-cluster-agent 20 | spec: 21 | serviceAccountName: cluster-agent 22 | containers: 23 | - name: cluster-agent 24 | image: epsagon/cluster-agent:1.0.0 25 | imagePullPolicy: Always 26 | env: 27 | - name: EPSAGON_TOKEN 28 | value: "" 29 | - name: EPSAGON_CLUSTER_NAME 30 | value: "" 31 | - name: EPSAGON_DEBUG 32 | value: "false" 33 | - name: EPSAGON_COLLECTOR_URL 34 | value: "https://collector.epsagon.com/resources/v1" 35 | -------------------------------------------------------------------------------- /pkg/cluster_agent/cicd/get_test_cluster_config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Gets the cluster config file path according to given testing directory path. 3 | If the config is not found then using the default cluster configuration. 
4 | """ 5 | import os 6 | import sys 7 | import json 8 | 9 | USAGE = ( 10 | f"Usage: python get_test_cluster_config.py " 11 | ) 12 | CLUSTER_CONFIG_FILENAME = "cluster_config.yml" 13 | SYSTEM_TEST_DIRPATH= "tests/system/" 14 | DEFAULT_CLUSTER_CONFIG = os.path.join(SYSTEM_TEST_DIRPATH, CLUSTER_CONFIG_FILENAME) 15 | 16 | def main(test_directory): 17 | cluster_config_path = DEFAULT_CLUSTER_CONFIG 18 | file_path = os.path.join(test_directory, CLUSTER_CONFIG_FILENAME) 19 | if os.path.exists(file_path): 20 | cluster_config_path = file_path 21 | 22 | print(cluster_config_path) 23 | 24 | 25 | if __name__ == '__main__': 26 | args = sys.argv 27 | if len(args) != 2: 28 | print(USAGE) 29 | sys.exit(1) 30 | 31 | main(args[1]) 32 | 33 | 34 | -------------------------------------------------------------------------------- /pkg/cluster_agent/epsagon_role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: epsagon-monitoring 5 | labels: 6 | name: epsagon-monitoring 7 | --- 8 | apiVersion: v1 9 | kind: ServiceAccount 10 | metadata: 11 | name: cluster-agent 12 | namespace: epsagon-monitoring 13 | --- 14 | apiVersion: rbac.authorization.k8s.io/v1 15 | kind: ClusterRole 16 | metadata: 17 | name: cluster-agent 18 | rules: 19 | - apiGroups: [""] 20 | resources: 21 | - nodes 22 | - services 23 | - endpoints 24 | - pods 25 | - namespaces 26 | - configmaps 27 | verbs: ["get", "list", "watch"] 28 | - apiGroups: ["apps"] 29 | resources: ["deployments", "statefulsets", "daemonsets"] 30 | verbs: ["get", "list", "watch"] 31 | --- 32 | apiVersion: rbac.authorization.k8s.io/v1 33 | kind: ClusterRoleBinding 34 | metadata: 35 | name: cluster-agent-binding 36 | roleRef: 37 | apiGroup: rbac.authorization.k8s.io 38 | kind: ClusterRole 39 | name: cluster-agent 40 | subjects: 41 | - kind: ServiceAccount 42 | name: cluster-agent 43 | namespace: epsagon-monitoring 44 | 
-------------------------------------------------------------------------------- /pkg/cluster_agent/tests/ut/test_epsagon_client.py: -------------------------------------------------------------------------------- 1 | """ 2 | EpsagonClient tests 3 | """ 4 | import base64 5 | import pytest 6 | from epsagon_client import EpsagonClient 7 | 8 | TEST_EPSAGON_TOKEN = "123" 9 | ENCODED_TOKEN = base64.b64encode(f"{TEST_EPSAGON_TOKEN}:".encode()).decode() 10 | TEST_PATH = "/post_path" 11 | 12 | @pytest.mark.asyncio 13 | async def test_initialize_no_epsagon_token(): 14 | """ Initialize test - no epsagon token """ 15 | with pytest.raises(ValueError): 16 | await EpsagonClient.create(None) 17 | 18 | 19 | @pytest.mark.asyncio 20 | async def test_post(httpserver): 21 | """ post sanity test """ 22 | data = { 23 | "a": "A" 24 | } 25 | def handler(request): 26 | assert "Authorization" in request.headers 27 | assert request.headers["Authorization"] == f"Basic {ENCODED_TOKEN}" 28 | httpserver.expect_request( 29 | TEST_PATH, 30 | method="POST", 31 | ).respond_with_handler(handler) 32 | client = await EpsagonClient.create(TEST_EPSAGON_TOKEN) 33 | await client.post(httpserver.url_for(TEST_PATH), data) 34 | -------------------------------------------------------------------------------- /pkg/cluster_agent/tests/ut/conftest.py: -------------------------------------------------------------------------------- 1 | """ 2 | Common test settings 3 | """ 4 | import asyncio 5 | from asynctest.mock import MagicMock 6 | 7 | # monkey patch MagicMock 8 | async def async_magic(): 9 | pass 10 | 11 | MagicMock.__await__ = lambda x: async_magic().__await__() 12 | 13 | 14 | async def run_coroutines_with_timeout( 15 | coroutines, 16 | verify_tasks_finished=True, 17 | timeout=1 18 | ): 19 | """ 20 | Convert coroutines to tasks and runs them with a given timeout. 
21 | :param coroutines: to run 22 | :param verify_coroutines_finished: verifies all the given coroutines 23 | finished running 24 | :param timeout: in seconds, to wait for all the coroutines to finish. 25 | :return: a list of the corresponding coroutines created tasks 26 | """ 27 | tasks = [asyncio.create_task(coroutine) for coroutine in coroutines] 28 | finished, _ = await asyncio.wait( 29 | tasks, 30 | timeout=timeout, 31 | return_when=asyncio.ALL_COMPLETED 32 | ) 33 | if verify_tasks_finished: 34 | assert len(finished) == len(tasks) 35 | 36 | return tasks 37 | -------------------------------------------------------------------------------- /pkg/cluster_agent/events_sender.py: -------------------------------------------------------------------------------- 1 | """ 2 | Kubernetes Events sender 3 | """ 4 | import json 5 | import base64 6 | import zlib 7 | from typing import List 8 | from encoders import DateTimeEncoder 9 | from kubernetes_event import KubernetesEvent 10 | 11 | class EventsSender: 12 | """ 13 | Events sender 14 | """ 15 | 16 | def __init__(self, client, url, cluster_name, epsagon_token): 17 | """ 18 | :param client: used to send events by 19 | :param url: to send the events to 20 | """ 21 | self.client = client 22 | self.url = url 23 | self.epsagon_token = epsagon_token 24 | self.cluster_name = cluster_name 25 | 26 | async def send_events(self, events: List[KubernetesEvent]): 27 | """ 28 | Sends the given events 29 | """ 30 | if not events: 31 | return 32 | 33 | events = [event.to_dict() for event in events] 34 | events_json = json.dumps(events, cls=DateTimeEncoder) 35 | compressed_data = base64.b64encode( 36 | zlib.compress(events_json.encode("utf-8")) 37 | ).decode("utf-8") 38 | data_to_send = { 39 | "epsagon_token": self.epsagon_token, 40 | "cluster_name": self.cluster_name, 41 | "data": compressed_data, 42 | } 43 | 44 | await self.client.post(self.url, json.dumps(data_to_send)) 45 | 
-------------------------------------------------------------------------------- /pkg/cluster_agent/epsagon_client.py: -------------------------------------------------------------------------------- 1 | """ 2 | Async Epsagon client 3 | """ 4 | from http import HTTPStatus 5 | from aiohttp.helpers import BasicAuth 6 | from aiohttp.client_exceptions import ClientError 7 | from aiohttp_retry import RetryClient, ExponentialRetry 8 | 9 | class EpsagonClientException(Exception): 10 | pass 11 | 12 | 13 | class EpsagonClient: 14 | """ 15 | Async Epsagon client 16 | """ 17 | 18 | DEFAULT_RETRY_ATTEMPTS = 3 19 | 20 | @classmethod 21 | async def create(cls, epsagon_token, retry_attempts=DEFAULT_RETRY_ATTEMPTS): 22 | """ 23 | Creates a new EpsagonClient instance 24 | :param epsagon_token: used for authorization 25 | """ 26 | self = cls() 27 | if not epsagon_token: 28 | raise ValueError("Epsagon token must be given") 29 | self.epsagon_token = epsagon_token 30 | retry_options = ExponentialRetry( 31 | attempts=retry_attempts, 32 | exceptions=(ClientError,) 33 | ) 34 | self.client = RetryClient( 35 | auth=BasicAuth(login=self.epsagon_token), 36 | headers={ 37 | "Content-Type": "application/json", 38 | }, 39 | retry_options=retry_options, 40 | raise_for_status=True 41 | ) 42 | return self 43 | 44 | async def post(self, url, data): 45 | """ 46 | Posts data to Epsagon given url. 47 | :param url: endpoint to post the data to 48 | :param data: to send 49 | HTTP status code. 50 | """ 51 | async with self.client.post(url, data=data): 52 | pass 53 | 54 | async def close(self): 55 | """ 56 | Closes the client. 57 | """ 58 | await self.client.close() 59 | -------------------------------------------------------------------------------- /pkg/cluster_agent/logger_configurer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Logger configurer helper module. 
3 | """ 4 | 5 | import sys 6 | import logging 7 | from logging.handlers import RotatingFileHandler 8 | 9 | class LoggerConfigurer: 10 | """ 11 | Logger configurer for the collector log. 12 | """ 13 | 14 | MAX_LOG_FILE_SIZE = 10 * 1024 * 1024 # 10MB per log file 15 | FILE_BACKUP_COUNT = 1 16 | 17 | def __init__( 18 | self, 19 | log_format: str, 20 | log_file_path: str, 21 | logger: logging.Logger = None 22 | ): 23 | """ 24 | :param logger: logger to configure, defauls to the root logger. 25 | """ 26 | self.log_format = log_format 27 | self.log_file_path = log_file_path 28 | self.log_file_handler = None 29 | self.output_handler = None 30 | self.logger = logging.getLogger() if not logger else logger 31 | 32 | def configure_logger(self, is_debug: bool): 33 | """ 34 | Configures the logger handlers with the log format & level. 35 | Configure 2 handlers: 36 | - 1 output handler (stdout), level set by given param `is_debug` 37 | - 1 file handler, level set to logging.DEBUG 38 | """ 39 | formatter = logging.Formatter(self.log_format) 40 | self.log_file_handler = RotatingFileHandler( 41 | self.log_file_path, 42 | maxBytes=self.MAX_LOG_FILE_SIZE, 43 | backupCount=self.FILE_BACKUP_COUNT 44 | ) 45 | self.output_handler = logging.StreamHandler(sys.stdout) 46 | self.output_handler.level = logging.DEBUG if is_debug else logging.INFO 47 | self.log_file_handler.level = logging.DEBUG 48 | self.logger.setLevel(logging.DEBUG) 49 | for handler in (self.log_file_handler, self.output_handler): 50 | handler.setFormatter(formatter) 51 | self.logger.addHandler(handler) 52 | 53 | def update_logger_level(self, is_debug: bool): 54 | """ 55 | Updates the logger level. 
Updates only the stdout handler as the file handler 56 | always set to logging.DEBUG 57 | """ 58 | self.output_handler.level = logging.DEBUG if is_debug else logging.INFO 59 | -------------------------------------------------------------------------------- /pkg/cluster_agent/tests/ut/test_events_sender.py: -------------------------------------------------------------------------------- 1 | """ 2 | EventsSender tests 3 | """ 4 | import base64 5 | import json 6 | import zlib 7 | import pytest 8 | from typing import Dict, List 9 | from asynctest.mock import patch, MagicMock 10 | from encoders import DateTimeEncoder 11 | from epsagon_client import EpsagonClient 12 | from events_sender import EventsSender 13 | from kubernetes_event import ( 14 | KubernetesEvent, 15 | WatchKubernetesEvent, 16 | KubernetesEventType, 17 | WatchKubernetesEventType, 18 | ) 19 | 20 | TEST_URL = "http://testurl/1" 21 | TEST_CLUSTER_NAME = "test-cluster-name" 22 | TEST_EPSAGON_TOKEN = "1234" 23 | 24 | 25 | def _get_expected_data( 26 | events_sender: EventsSender, 27 | events: List[KubernetesEvent] 28 | ): 29 | """ 30 | Gets the expected data to be sent given events list and events sender 31 | """ 32 | events = [event.to_dict() for event in events] 33 | events_json = json.dumps(events, cls=DateTimeEncoder) 34 | compressed_data = base64.b64encode( 35 | zlib.compress(events_json.encode("utf-8")) 36 | ).decode("utf-8") 37 | data_to_send = { 38 | "epsagon_token": events_sender.epsagon_token, 39 | "cluster_name": events_sender.cluster_name, 40 | "data": compressed_data, 41 | } 42 | return json.dumps(data_to_send) 43 | 44 | 45 | @pytest.mark.asyncio 46 | @patch("epsagon_client.EpsagonClient") 47 | async def test_send_events_sanity(epsagon_client_mock): 48 | epsagon_client_obj = epsagon_client_mock.return_value 49 | sender = EventsSender( 50 | epsagon_client_obj, 51 | TEST_URL, 52 | TEST_CLUSTER_NAME, 53 | TEST_EPSAGON_TOKEN 54 | ) 55 | events = [ 56 | KubernetesEvent(KubernetesEventType.CLUSTER, {"a": 
"b"}), 57 | WatchKubernetesEvent(WatchKubernetesEventType.ADDED, {"a": "b"}), 58 | ] 59 | await sender.send_events(events) 60 | epsagon_client_obj.post.assert_called_once_with( 61 | TEST_URL, 62 | _get_expected_data(sender, events) 63 | ) 64 | 65 | 66 | @pytest.mark.asyncio 67 | @patch("epsagon_client.EpsagonClient") 68 | async def test_send_no_events(epsagon_client_mock): 69 | epsagon_client_obj = epsagon_client_mock.return_value 70 | sender = EventsSender( 71 | epsagon_client_obj, 72 | TEST_URL, 73 | TEST_CLUSTER_NAME, 74 | TEST_EPSAGON_TOKEN 75 | ) 76 | events = [] 77 | await sender.send_events(events) 78 | epsagon_client_obj.post.assert_not_called() 79 | -------------------------------------------------------------------------------- /.github/workflows/cluster_agent_tests.yml: -------------------------------------------------------------------------------- 1 | name: Create Cluster 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | push: 7 | branches: 8 | - main 9 | 10 | jobs: 11 | unit-testing: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Python 3.7 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: 3.7 19 | 20 | - name: Install dependencies 21 | timeout-minutes: 5 22 | working-directory: ./pkg/cluster_agent 23 | run: | 24 | pip install -r requirements-dev.txt 25 | pip install -r requirements.txt 26 | - name: test 27 | timeout-minutes: 1 28 | working-directory: ./pkg/cluster_agent 29 | run: python -m pytest ./tests/ut 30 | 31 | 32 | system_test_install: 33 | timeout-minutes: 10 34 | runs-on: ubuntu-latest 35 | outputs: 36 | matrixTestPath: ${{ steps.set-matrix.outputs.matrixTestPath }} 37 | steps: 38 | - uses: actions/checkout@v2 39 | - name: set-matrix 40 | id: set-matrix 41 | working-directory: ./pkg/cluster_agent 42 | run: | 43 | echo "::set-output name=matrixTestPath::`python ./cicd/generate_test_matrix.py`" 44 | 45 | system_tests: 46 | runs-on: ubuntu-latest 47 | timeout-minutes: 15 48 | 
needs: system_test_install 49 | strategy: 50 | fail-fast: false 51 | matrix: 52 | test_path: ${{ fromJson(needs.system_test_install.outputs.matrixTestPath) }} 53 | 54 | steps: 55 | - uses: actions/checkout@v2 56 | - name: get-cluster-config-path 57 | id: get-cluster-config-path 58 | run: echo "::set-output name=clusterConfig::`python ./cicd/get_test_cluster_config.py ${{ matrix.test_path }}`" 59 | working-directory: ./pkg/cluster_agent 60 | 61 | - name: Create k8s Kind Cluster 62 | uses: helm/kind-action@v1.2.0 63 | with: 64 | cluster_name: test-cluster 65 | config: ./pkg/cluster_agent/${{ steps.get-cluster-config-path.outputs.clusterConfig }} 66 | 67 | - name: build-image 68 | run: docker build . -t epsagon/cluster-agent:test -f ./build/Dockerfile 69 | working-directory: ./pkg/cluster_agent 70 | 71 | - name: load-image-to-kind 72 | run: kind load docker-image epsagon/cluster-agent:test --name test-cluster 73 | 74 | - name: Set up Python 3.7 75 | uses: actions/setup-python@v2 76 | with: 77 | python-version: 3.7 78 | 79 | - name: Install dependencies 80 | timeout-minutes: 5 81 | working-directory: ./pkg/cluster_agent 82 | run: | 83 | pip install -r requirements-dev.txt 84 | pip install -r requirements.txt 85 | - name: test 86 | timeout-minutes: 1 87 | working-directory: ./pkg/cluster_agent/tests/system/ 88 | run: python -m pytest ${{ matrix.test_path }} 89 | -------------------------------------------------------------------------------- /pkg/cluster_agent/events_manager.py: -------------------------------------------------------------------------------- 1 | """ 2 | Events managers module 3 | """ 4 | import abc 5 | import logging 6 | from asyncio import Queue, wait_for, TimeoutError 7 | from typing import List 8 | from kubernetes_event import KubernetesEvent 9 | 10 | 11 | class EventsManager(abc.ABC): 12 | """ 13 | An abstract asynchronous events manager - used to read from & write to 14 | asynchronously. 
15 | Each specific events manager should inherit from this class. 16 | """ 17 | 18 | @abc.abstractmethod 19 | def is_empty(self) -> bool: 20 | """ 21 | Returns whether there're no unread events 22 | """ 23 | raise NotImplementedError 24 | 25 | @abc.abstractmethod 26 | async def write_event(self, event: KubernetesEvent): 27 | """ 28 | Writes an event 29 | """ 30 | raise NotImplementedError 31 | 32 | @abc.abstractmethod 33 | async def get_event(self) -> KubernetesEvent: 34 | """ 35 | Reads an event 36 | """ 37 | raise NotImplementedError 38 | 39 | async def _read_event(self, timeout: int=None): 40 | """ 41 | Reads and returns an event. If timeout is given, then trying to read event up to 42 | the timeout given value. 43 | In case of timeout, returns None. 44 | """ 45 | event = None 46 | try: 47 | event = await wait_for(self.get_event(), timeout=timeout) 48 | except TimeoutError: 49 | pass 50 | 51 | return event 52 | 53 | async def get_events(self, max_size: int, timeout: int=None) -> List[KubernetesEvent]: 54 | """ 55 | Reads up to max_size events. 56 | The functions waits until the earlier: 57 | - there's at least one event. In this case, returns all the 58 | existing events. 59 | - timeout been passed (in case its given). In this case, an empty list is returned. 60 | :param max_size: of events to read 61 | If max_size < 1, then returning an empty list. 62 | If the current events count in the queue is less than max_size, then 63 | returns just the current events. 64 | :param timeout: If given, then setting this timeout for the first 65 | read event attempt. If no event is read during after the given timeout, 66 | the functions returns with an empty list. 
67 | """ 68 | if max_size < 1: 69 | return [] 70 | 71 | first_event = await self._read_event(timeout=timeout) 72 | if not first_event: 73 | return [] 74 | 75 | events = [first_event] 76 | while not self.is_empty() and len(events) < max_size: 77 | events.append(await self.get_event()) 78 | 79 | return events 80 | 81 | 82 | class InMemoryEventsManager(EventsManager): 83 | """ 84 | Im memory events manager 85 | """ 86 | 87 | def __init__(self, *args, **kwargs): 88 | super().__init__(*args, **kwargs) 89 | self.events_queue = Queue() 90 | 91 | def is_empty(self) -> bool: 92 | return self.events_queue.empty() 93 | 94 | async def write_event(self, event: KubernetesEvent): 95 | await self.events_queue.put(event) 96 | 97 | async def get_event(self) -> KubernetesEvent: 98 | return await self.events_queue.get() 99 | 100 | def clean(self): 101 | """ 102 | Cleans all events. 103 | """ 104 | self.events_queue = Queue() 105 | -------------------------------------------------------------------------------- /pkg/cluster_agent/forwarder.py: -------------------------------------------------------------------------------- 1 | """ 2 | KubernetesEvent forwarder 3 | """ 4 | import asyncio 5 | from typing import List, Set 6 | from kubernetes_event import KubernetesEvent 7 | from events_manager import EventsManager 8 | from events_sender import EventsSender 9 | 10 | 11 | class Forwarder: 12 | """ 13 | A generic KubernetesEvent forwarder 14 | """ 15 | DEFAULT_MAX_WORKERS = 5 16 | DEFAULT_MAX_EVENTS_TO_READ = 100 17 | DEFAULT_GET_EVENTS_TIMEOUT = 1 18 | 19 | def __init__( 20 | self, 21 | events_manager: EventsManager, 22 | events_sender: EventsSender, 23 | max_workers: int = DEFAULT_MAX_WORKERS, 24 | max_events_to_read: int = DEFAULT_MAX_EVENTS_TO_READ 25 | ): 26 | """ 27 | :param events_manager: used to read from events 28 | :param events_sender: used to send read events to 29 | :param max_workers: to forward read events 30 | :param max_events_to_read: to read from the events_manager 31 | 
""" 32 | self.events_manager = events_manager 33 | self.events_sender = events_sender 34 | if max_workers < 1: 35 | raise ValueError("Invalid workers count value, must be > 0") 36 | self.max_workers_count: int = max_workers 37 | if max_events_to_read < 1: 38 | raise ValueError("Invalid max events to read value, must be > 0") 39 | self.max_events_to_read: int = max_events_to_read 40 | self.running_workers: Set[asyncio.Task] = set() 41 | 42 | async def _forward_events(self, events: List[KubernetesEvent]): 43 | """ 44 | Forwards the given events list 45 | """ 46 | try: 47 | await self.events_sender.send_events(events) 48 | except asyncio.CancelledError: 49 | pass 50 | 51 | def _stop_all_workers(self): 52 | """ 53 | Stops all workers 54 | """ 55 | for worker in self.running_workers: 56 | if not worker.done(): 57 | worker.cancel() 58 | elif not worker.cancelled(): 59 | worker.exception() 60 | self.running_workers = set() 61 | 62 | def _check_failed_workers(self, workers): 63 | """ 64 | Checks the finished workers status. If any worker had an error, then 65 | stopping the rest of the workers and raising an error. 66 | """ 67 | for task in workers: 68 | task_exception = task.exception() 69 | if task_exception: 70 | self._stop_all_workers() 71 | raise task_exception 72 | 73 | def _get_finished_workers(self): 74 | """ 75 | Gets the running workers tasks 76 | """ 77 | return [task for task in self.running_workers if task.done()] 78 | 79 | async def start(self): 80 | """ 81 | Starts the Forwarder. The forwarder will read up to MAX_EVENTS_TO_READ 82 | at each iteration using the events_manager, and sends them using the 83 | events_sender. 
84 | """ 85 | try: 86 | while True: 87 | events: List[KubernetesEvent] = await self.events_manager.get_events( 88 | self.max_events_to_read, 89 | timeout=self.DEFAULT_GET_EVENTS_TIMEOUT 90 | ) 91 | self._check_failed_workers(self._get_finished_workers()) 92 | if not events: 93 | continue 94 | if len(self.running_workers) < self.max_workers_count: 95 | self.running_workers.add(asyncio.create_task( 96 | self._forward_events(events) 97 | )) 98 | else: 99 | finished, unfinished = await asyncio.wait( 100 | self.running_workers, 101 | return_when=asyncio.FIRST_COMPLETED 102 | ) 103 | self._check_failed_workers(finished) 104 | self.running_workers = unfinished 105 | self.running_workers.add(asyncio.create_task( 106 | self._forward_events(events) 107 | )) 108 | except asyncio.CancelledError: 109 | self._stop_all_workers() 110 | 111 | -------------------------------------------------------------------------------- /pkg/cluster_agent/tests/ut/test_events_manager.py: -------------------------------------------------------------------------------- 1 | """ 2 | EventsManager tests 3 | """ 4 | import asyncio 5 | import pytest 6 | from events_manager import InMemoryEventsManager 7 | from kubernetes_event import WatchKubernetesEvent, WatchKubernetesEventType 8 | from .conftest import run_coroutines_with_timeout 9 | 10 | DEFAULT_MAX_SIZE = 2 11 | 12 | @pytest.fixture 13 | def in_memory_events_manager(): 14 | """ 15 | In memory events manager fixture 16 | """ 17 | return InMemoryEventsManager() 18 | 19 | 20 | @pytest.mark.asyncio 21 | async def test_is_empty_sanity(in_memory_events_manager): 22 | """ 23 | is_empty sanity test 24 | """ 25 | assert in_memory_events_manager.is_empty() 26 | 27 | 28 | @pytest.mark.asyncio 29 | async def test_is_empty_with_data(in_memory_events_manager): 30 | """ 31 | is_empty test with events being written to events manager 32 | """ 33 | assert in_memory_events_manager.is_empty() 34 | event = WatchKubernetesEvent(WatchKubernetesEventType.ADDED, {"a": 
"b"}), 35 | await in_memory_events_manager.write_event(event) 36 | assert not in_memory_events_manager.is_empty() 37 | await run_coroutines_with_timeout((in_memory_events_manager.get_event(), )) 38 | assert in_memory_events_manager.is_empty() 39 | 40 | 41 | @pytest.mark.asyncio 42 | async def test_get_and_write_sanity(in_memory_events_manager): 43 | """ 44 | sanity test for write_event and get_event 45 | """ 46 | event = WatchKubernetesEvent(WatchKubernetesEventType.ADDED, {"a": "b"}), 47 | await in_memory_events_manager.write_event(event) 48 | task = (await run_coroutines_with_timeout( 49 | (in_memory_events_manager.get_event(), ) 50 | ))[0] 51 | assert event == task.result() 52 | 53 | 54 | @pytest.mark.asyncio 55 | async def test_get_events_sanity(in_memory_events_manager): 56 | """ 57 | sanity test for get_events 58 | """ 59 | events = [ 60 | WatchKubernetesEvent(WatchKubernetesEventType.ADDED, {"a": "b"}), 61 | WatchKubernetesEvent(WatchKubernetesEventType.DELETED, {"a": "c"}), 62 | ] 63 | for event in events: 64 | await in_memory_events_manager.write_event(event) 65 | task = (await run_coroutines_with_timeout( 66 | (in_memory_events_manager.get_events(max_size=DEFAULT_MAX_SIZE), ) 67 | ))[0] 68 | assert events == task.result() 69 | 70 | 71 | @pytest.mark.asyncio 72 | async def test_get_events_custom_max_size(in_memory_events_manager): 73 | """ 74 | test for get_events with a custom max size 75 | """ 76 | custom_max_size = 1 77 | events = [ 78 | WatchKubernetesEvent(WatchKubernetesEventType.ADDED, {"a": "b"}), 79 | WatchKubernetesEvent(WatchKubernetesEventType.DELETED, {"a": "c"}), 80 | ] 81 | for event in events: 82 | await in_memory_events_manager.write_event(event) 83 | for i, event in enumerate(events): 84 | task = (await run_coroutines_with_timeout( 85 | (in_memory_events_manager.get_events(max_size=custom_max_size), ) 86 | ))[0] 87 | assert events[i:i + custom_max_size] == task.result() 88 | 89 | 90 | @pytest.mark.asyncio 91 | async def 
@pytest.mark.asyncio
async def test_clean_sanity(in_memory_events_manager):
    """
    clean() should leave the manager empty after events were written to it
    """
    written_events = [
        WatchKubernetesEvent(watch_type, payload)
        for watch_type, payload in (
            (WatchKubernetesEventType.ADDED, {"a": "b"}),
            (WatchKubernetesEventType.DELETED, {"a": "c"}),
        )
    ]
    for written_event in written_events:
        await in_memory_events_manager.write_event(written_event)
    in_memory_events_manager.clean()
    assert in_memory_events_manager.is_empty()
@pytest.fixture(scope='session')
def event_loop(request):
    """ Session-wide event loop shared by all async tests """
    policy = asyncio.get_event_loop_policy()
    session_loop = policy.new_event_loop()
    yield session_loop
    session_loop.close()
65 | """ 66 | await config.load_kube_config() 67 | 68 | 69 | class ClusterAgentInstaller: 70 | """ Cluster agent installer """ 71 | 72 | def __init__(self, api_client=None): 73 | self.apps_api_client = client.AppsV1Api(api_client=api_client) 74 | self.api_client = self.apps_api_client.api_client 75 | 76 | async def install_epsagon_role(self): 77 | """ Installs the Epsagon role required for the cluster agent """ 78 | await utils.create_from_yaml(self.api_client, '../../epsagon_role.yaml', namespace="epsagon-monitoring") 79 | 80 | async def install_cluster_agent(self, agent_deployment: client.V1Deployment): 81 | """ Installs the cluster agent """ 82 | await self.apps_api_client.create_namespaced_deployment( 83 | agent_deployment.metadata.namespace, 84 | agent_deployment 85 | ) 86 | 87 | async def _wait_for_deployment_pod(self, deployment_name: str, namespace: str): 88 | """ 89 | Waits for one a pod of the given deployment to be ready 90 | """ 91 | timeout = timedelta(seconds=30) 92 | start = datetime.now() 93 | end = datetime.now() 94 | while end - start < timeout: 95 | deployment = await self.apps_api_client.read_namespaced_deployment( 96 | deployment_name, 97 | namespace 98 | ) 99 | ready_replicas = deployment.status.ready_replicas 100 | if ready_replicas and ready_replicas > 0: 101 | print(ready_replicas) 102 | return 103 | time.sleep(2) 104 | end = datetime.now() 105 | 106 | raise Exception("Cluster agent pod failed to start") 107 | 108 | async def install_all( 109 | self, 110 | agent_deployment=None, 111 | wait_for_agent_pod_initialization=True, 112 | ): 113 | """ 114 | Installs the cluster agent. 
115 | """ 116 | await self.install_epsagon_role() 117 | agent_deployment = ( 118 | agent_deployment 119 | if agent_deployment 120 | else default_cluster_agent_deployment() 121 | ) 122 | await self.install_cluster_agent(agent_deployment) 123 | deployment_name = agent_deployment.metadata.name 124 | deployment_namespace = agent_deployment.metadata.namespace 125 | if wait_for_agent_pod_initialization: 126 | await self._wait_for_deployment_pod(deployment_name, deployment_namespace) 127 | 128 | 129 | -------------------------------------------------------------------------------- /pkg/cluster_agent/kubernetes_event.py: -------------------------------------------------------------------------------- 1 | """ 2 | Kubernetes events 3 | """ 4 | import json 5 | import time 6 | from typing import Dict 7 | from enum import Enum 8 | from encoders import DateTimeEncoder 9 | 10 | class KubernetesEventEncoder(DateTimeEncoder): 11 | """ 12 | JSON Encoder for kubernetes events 13 | """ 14 | def default(self, o): # pylint: disable=method-hidden 15 | """ 16 | Overriding for specific serialization 17 | """ 18 | if isinstance(o, KubernetesEvent): 19 | return json.dumps(o.to_dict(), cls=DateTimeEncoder) 20 | 21 | return super(KubernetesEventEncoder, self).default(o) 22 | 23 | 24 | class KubernetesEventException(Exception): 25 | pass 26 | 27 | class InvalidWatchEventException(KubernetesEventException): 28 | pass 29 | 30 | class KubernetesEventType(Enum): 31 | """ 32 | General kubernetes event types, used by Epsagon 33 | """ 34 | CLUSTER = "cluster" 35 | WATCH = "watch" 36 | 37 | 38 | class WatchKubernetesEventType(Enum): 39 | """ 40 | Kubernetes watch (from kubernetes apiserver) event types 41 | """ 42 | ADDED = "ADDED" 43 | MODIFIED = "MODIFIED" 44 | DELETED = "DELETED" 45 | 46 | 47 | class KubernetesEvent: 48 | """ 49 | Abstract kubernetes event 50 | """ 51 | 52 | def __init__(self, event_type: KubernetesEventType, data): 53 | """ 54 | :param event_type: 55 | :param data: the actual event 
    def to_dict(self):
        """
        Gets the event as a plain dict, ready for JSON serialization.

        :return: a dict with a "metadata" section (event kind and creation
            timestamp in nanoseconds, as set in __init__) and a "payload"
            section holding the formatted event data
        """
        return {
            "metadata": {
                "kind": self.event_type.value,
                "timestamp": self.timestamp,
            },
            "payload": self.get_formatted_payload(),
        }
event_type = raw_data["type"] 129 | if event_type not in ( 130 | current_type.value for current_type in WatchKubernetesEventType 131 | ): 132 | raise InvalidWatchEventException( 133 | f"Unsupported `{event_type}` watch event type" 134 | ) 135 | return cls(WatchKubernetesEventType(event_type), obj) 136 | 137 | def get_resource_version(self): 138 | """ 139 | Gets the watch kubernetes object resource version. 140 | If cannot extract resource version, returns None 141 | """ 142 | return self.data.get("metadata", {}).get("resource_version") 143 | 144 | def get_formatted_payload(self): 145 | """ 146 | Gets the watch kubernetes event data formatted. 147 | """ 148 | return { 149 | "type": self.watch_event_type.value, 150 | "object": super().get_formatted_payload() 151 | } 152 | 153 | def __eq__(self, other): 154 | """ 155 | Checks equity by comapring the data and the watch specific event type 156 | """ 157 | return ( 158 | type(self) == type(other) and 159 | self.watch_event_type == other.watch_event_type and 160 | self.data == other.data 161 | ) 162 | 163 | def __hash__(self): 164 | """ gets the item hash """ 165 | return super().__hash__() 166 | -------------------------------------------------------------------------------- /pkg/cluster_agent/tests/ut/test_kubernetes_event.py: -------------------------------------------------------------------------------- 1 | """ 2 | KubernetesEvent tests 3 | """ 4 | import time 5 | import pytest 6 | from asynctest.mock import patch, MagicMock 7 | from kubernetes_event import ( 8 | KubernetesEvent, 9 | WatchKubernetesEvent, 10 | KubernetesEventType, 11 | WatchKubernetesEventType, 12 | ) 13 | 14 | FAKE_TIMESTAMP = time.time_ns() 15 | 16 | @pytest.mark.asyncio 17 | async def test_initialize(): 18 | """ Initialize sanity test """ 19 | for current_type in KubernetesEventType: 20 | KubernetesEvent(current_type, {}) 21 | 22 | 23 | @pytest.mark.asyncio 24 | async def test_get_formatted_payload(): 25 | """ get_formatted_payload test """ 26 | 
data = { 27 | "A": "a" 28 | } 29 | event = KubernetesEvent(KubernetesEventType.CLUSTER, data) 30 | assert event.get_formatted_payload() == data 31 | 32 | 33 | def _get_expected_dict(event): 34 | """ Gets the expected event dict """ 35 | if type(event) == KubernetesEvent: 36 | return { 37 | "metadata": { 38 | "kind": event.event_type.value.lower(), 39 | "timestamp": FAKE_TIMESTAMP, 40 | }, 41 | "payload": event.data, 42 | } 43 | elif type(event) == WatchKubernetesEvent: 44 | return { 45 | "metadata": { 46 | "kind": event.event_type.value.lower(), 47 | "timestamp": FAKE_TIMESTAMP, 48 | }, 49 | "payload": { 50 | "type": event.watch_event_type.value, 51 | "object": event.data, 52 | } 53 | } 54 | 55 | raise Exception(f"Unsupported event type: {type(event)}") 56 | 57 | 58 | @pytest.mark.asyncio 59 | @patch("time.time_ns", MagicMock(return_value=FAKE_TIMESTAMP)) 60 | async def test_to_dict(): 61 | """ to_dict test """ 62 | data = { 63 | "A": "a" 64 | } 65 | event_type = KubernetesEventType.CLUSTER 66 | event = KubernetesEvent(event_type, data) 67 | assert event.to_dict() == _get_expected_dict(event) 68 | 69 | 70 | @pytest.mark.asyncio 71 | async def test_equity(): 72 | """ __eq__ test """ 73 | all_data = [ 74 | { 75 | "A": "a" 76 | }, 77 | { 78 | "A": "a" 79 | }, 80 | { 81 | "B": "a" 82 | }, 83 | ] 84 | event_type = KubernetesEventType.CLUSTER 85 | events = [ 86 | KubernetesEvent(event_type, event_data) 87 | for event_data in all_data 88 | ] 89 | assert events[0] == events[1] 90 | assert events[0] != events[2] 91 | 92 | 93 | @pytest.mark.asyncio 94 | async def test_watch_initialize(): 95 | """ Watch events - initialize sanity test """ 96 | for current_type in WatchKubernetesEventType: 97 | WatchKubernetesEvent(current_type, {}) 98 | 99 | 100 | @pytest.mark.asyncio 101 | async def test_watch_get_formatted_payload(): 102 | """ get_formatted_payload test """ 103 | data = { 104 | "A": "a" 105 | } 106 | event_type = WatchKubernetesEventType.ADDED 107 | event = 
@pytest.mark.asyncio
@patch("time.time_ns", MagicMock(return_value=FAKE_TIMESTAMP))
async def test_watch_to_dict():
    """ to_dict test """
    data = {
        "A": "a"
    }
    event_type = WatchKubernetesEventType.ADDED
    event = WatchKubernetesEvent(event_type, data)
    assert event.to_dict() == _get_expected_dict(event)


@pytest.mark.asyncio
@patch("time.time_ns", MagicMock(return_value=FAKE_TIMESTAMP))
async def test_watch_get_resource_version():
    """ get_resource_version sanity test """
    resource_version = "3"
    data = {
        "A": "a",
        "metadata": {
            # kubernetes_asyncio's to_dict() produces snake_case keys,
            # which is the form get_resource_version reads
            "resource_version": resource_version,
        }
    }
    event_type = WatchKubernetesEventType.ADDED
    event = WatchKubernetesEvent(event_type, data)
    assert event.to_dict() == _get_expected_dict(event)
    assert resource_version == event.get_resource_version()


# NOTE: this test and test_watch_to_dict above were previously defined twice
# with identical names; the re-definitions shadowed the originals so one
# variant of each never ran. The missing-version case is now a distinct test.
@pytest.mark.asyncio
@patch("time.time_ns", MagicMock(return_value=FAKE_TIMESTAMP))
async def test_watch_get_resource_version_missing():
    """ get_resource_version test - no resource version """
    data = {
        "A": "a",
        "metadata2222": {
            "a": "b"
        }
    }
    event_type = WatchKubernetesEventType.ADDED
    event = WatchKubernetesEvent(event_type, data)
    assert event.to_dict() == _get_expected_dict(event)
    assert not event.get_resource_version()
@pytest.mark.asyncio
async def test_watch_equity():
    """ __eq__ test - same data compares equal, different data does not """
    event_type = WatchKubernetesEventType.ADDED
    first = WatchKubernetesEvent(event_type, {"A": "a"})
    second = WatchKubernetesEvent(event_type, {"A": "a"})
    third = WatchKubernetesEvent(event_type, {"B": "a"})
    assert first == second
    assert first != third
def _reload_handler():
    """
    Reload configuration handler - reconfigures the main logger according
    to the current debug mode.

    Registered with loop.add_signal_handler (SIGHUP), so it runs inside the
    event loop but must itself be synchronous. The debug-mode check is an
    async function, so it is scheduled as a task; calling it directly would
    hand an un-awaited coroutine object (always truthy) to
    update_logger_level, forcing debug mode on every reload.
    """
    async def _update_logger_level():
        """ Awaits the debug-mode check and applies the resulting level """
        LOGGER_CONFIGURER.update_logger_level(await _is_debug_mode())

    asyncio.get_event_loop().create_task(_update_logger_level())
103 | """ 104 | asyncio.create_task(_epsagon_conf_watcher(is_debug_mode)) 105 | events_manager = InMemoryEventsManager() 106 | epsagon_client = await EpsagonClient.create(EPSAGON_TOKEN) 107 | events_sender = EventsSender( 108 | epsagon_client, 109 | COLLECTOR_URL, 110 | CLUSTER_NAME, 111 | EPSAGON_TOKEN 112 | ) 113 | cluster_discovery = ClusterDiscovery( 114 | events_manager.write_event, 115 | should_collect_resources=SHOULD_COLLECT_RESOURCES, 116 | should_collect_events=SHOULD_COLLECT_EVENTS, 117 | ) 118 | forwarder = Forwarder( 119 | events_manager, 120 | events_sender 121 | ) 122 | while True: 123 | try: 124 | tasks = [ 125 | asyncio.create_task(forwarder.start()), 126 | asyncio.create_task(cluster_discovery.start()) 127 | ] 128 | await asyncio.gather(*tasks) 129 | except ( 130 | client_exceptions.ClientError, 131 | socket.gaierror, 132 | ConnectionRefusedError, 133 | EpsagonClientException 134 | ): 135 | logging.error( 136 | "Connection error, restarting agent in %d seconds", 137 | RESTART_WAIT_TIME_SECONDS 138 | ) 139 | _cancel_tasks(tasks) 140 | events_manager.clean() 141 | await asyncio.sleep(RESTART_WAIT_TIME_SECONDS) 142 | except Exception as exception: 143 | logging.error(str(exception)) 144 | logging.error(format_exc()) 145 | logging.info("Agent is exiting due to an unexpected error") 146 | _cancel_tasks(tasks) 147 | await epsagon_client.close() 148 | break 149 | 150 | 151 | def main(): 152 | is_debug = asyncio.run(_is_debug_mode()) 153 | LOGGER_CONFIGURER.configure_logger(is_debug) 154 | if not EPSAGON_TOKEN: 155 | logging.error( 156 | "Missing Epsagon token. " 157 | "Make sure to configure EPSAGON_TOKEN in cluster_agent_deployment.yaml" 158 | ) 159 | return 160 | 161 | if not CLUSTER_NAME: 162 | logging.error( 163 | "Missing cluster name. 
" 164 | "Make sure to configure EPSAGON_CLUSTER_NAME in cluster_agent_deployment.yaml" 165 | ) 166 | return 167 | 168 | config.load_incluster_config() 169 | logging.info("Loaded cluster config") 170 | if is_debug: 171 | loaded_conf = client.configuration.Configuration.get_default_copy() 172 | logging.debug( 173 | "Loaded cluster configuration:\nHost: %s\n" 174 | "Using SSL Cert? %s\nUsing API token? %s", 175 | loaded_conf.host, 176 | bool(loaded_conf.ssl_ca_cert), 177 | bool(loaded_conf.api_key) 178 | ) 179 | loop = asyncio.new_event_loop() 180 | loop.add_signal_handler(signal.SIGHUP, _reload_handler) 181 | loop.run_until_complete(run(is_debug)) 182 | loop.close() 183 | 184 | if __name__ == "__main__": 185 | main() 186 | -------------------------------------------------------------------------------- /pkg/cluster_agent/tests/ut/test_forwarder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Forwarder tests 3 | """ 4 | import pytest 5 | import asyncio 6 | from typing import List 7 | from events_manager import EventsManager, InMemoryEventsManager 8 | from forwarder import Forwarder 9 | from kubernetes_event import ( 10 | KubernetesEvent, 11 | WatchKubernetesEvent, 12 | WatchKubernetesEventType, 13 | ) 14 | from .conftest import run_coroutines_with_timeout 15 | 16 | DEFAULT_MAX_EVENTS_TO_READ = Forwarder.DEFAULT_MAX_EVENTS_TO_READ 17 | DEFAULT_MAX_WORKERS = Forwarder.DEFAULT_MAX_WORKERS 18 | DEFAULT_EVENTS_COUNT = 1000 19 | 20 | class EventsManagerMock(InMemoryEventsManager): 21 | """ EventsManager mock, verifies max events to read """ 22 | def __init__(self, expected_max_size): 23 | """ 24 | :param expected_max_size: the expected max size when using get_events 25 | """ 26 | super().__init__() 27 | self.expected_max_size = expected_max_size 28 | 29 | async def get_events(self, max_size: int, timeout: int=None) -> List[KubernetesEvent]: 30 | """ 31 | Asserts the given max size, 32 | """ 33 | assert max_size == 
class EventsSenderMock:
    """ EventsSender mock, verifies max worker senders """

    def __init__(self, expected_max_workers: int, error: Exception = None):
        """
        :param expected_max_workers: the expected max size of workers used
            to send events
        :param error: to raise when send_events is called
        """
        self.expected_max_workers = expected_max_workers
        self.current_workers_count = 0
        self.events = set()
        self.error = error

    async def send_events(self, events: List[KubernetesEvent]):
        """
        Asserts the current number of workers <= expected max workers and
        saves the given events.
        Yields via asyncio.sleep to simulate a "real" send scenario.
        If self.error is set, raises it immediately.
        """
        if self.error:
            raise self.error

        self.current_workers_count += 1
        assert self.current_workers_count <= self.expected_max_workers
        self.events.update(events)
        await asyncio.sleep(0.1)
        self.current_workers_count -= 1
@pytest.mark.asyncio
async def test_sanity():
    """
    sanity test - runs forwarder while writing events, verifies all events are sent.
    """
    manager = EventsManagerMock(DEFAULT_MAX_EVENTS_TO_READ)
    sender = EventsSenderMock(DEFAULT_MAX_WORKERS)
    generated: List[KubernetesEvent] = _generate_kubernetes_events(
        DEFAULT_EVENTS_COUNT
    )
    forwarder = Forwarder(manager, sender)
    writer_task, forwarder_task = await run_coroutines_with_timeout(
        (_write_events(generated, manager), forwarder.start()),
        verify_tasks_finished=False,
        timeout=0.5,
    )
    assert writer_task.done()
    assert not forwarder_task.done()
    forwarder_task.cancel()
    assert set(generated) == sender.events
117 | """ 118 | events_manager = EventsManagerMock(DEFAULT_MAX_EVENTS_TO_READ) 119 | error = Exception("test error") 120 | events_sender = EventsSenderMock(DEFAULT_MAX_WORKERS, error=error) 121 | events: List[KubernetesEvent] = _generate_kubernetes_events(DEFAULT_EVENTS_COUNT) 122 | events_write_task, forwarder_task = await run_coroutines_with_timeout( 123 | ( 124 | _write_events(events, events_manager), 125 | Forwarder(events_manager, events_sender).start() 126 | ), 127 | verify_tasks_finished=False, 128 | timeout=0.5, 129 | ) 130 | assert events_write_task.done() 131 | assert forwarder_task.done() 132 | assert forwarder_task.exception() == error 133 | 134 | 135 | @pytest.mark.asyncio 136 | async def test_max_events_to_read(): 137 | """ 138 | Runs forwarder while writing events, verifies all events are sent and 139 | verifies no more than the given max events to read are read. 140 | """ 141 | max_events_to_read = 10 142 | events_manager = EventsManagerMock(max_events_to_read) 143 | events_sender = EventsSenderMock(DEFAULT_MAX_WORKERS) 144 | events: List[KubernetesEvent] = _generate_kubernetes_events(DEFAULT_EVENTS_COUNT) 145 | events_write_task, forwarder_task = await run_coroutines_with_timeout( 146 | ( 147 | _write_events(events, events_manager), 148 | Forwarder( 149 | events_manager, 150 | events_sender, 151 | max_events_to_read=max_events_to_read 152 | ).start(), 153 | ), 154 | verify_tasks_finished=False, 155 | timeout=3, 156 | ) 157 | assert events_write_task.done() 158 | assert not forwarder_task.done() 159 | forwarder_task.cancel() 160 | assert set(events) == events_sender.events 161 | 162 | @pytest.mark.asyncio 163 | async def test_max_workers(): 164 | """ 165 | Runs forwarder while writing events, verifies all events are sent and 166 | verifies no more than the given workers count ran 167 | concurrently (asyncronously) 168 | """ 169 | max_workers = 2 170 | events_manager = EventsManagerMock(DEFAULT_MAX_EVENTS_TO_READ) 171 | events_sender = 
EventsSenderMock(max_workers) 172 | events: List[KubernetesEvent] = _generate_kubernetes_events(DEFAULT_EVENTS_COUNT) 173 | events_write_task, forwarder_task = await run_coroutines_with_timeout( 174 | ( 175 | _write_events(events, events_manager), 176 | Forwarder( 177 | events_manager, 178 | events_sender, 179 | max_workers=max_workers 180 | ).start(), 181 | ), 182 | verify_tasks_finished=False, 183 | timeout=1, 184 | ) 185 | assert events_write_task.done() 186 | assert not forwarder_task.done() 187 | forwarder_task.cancel() 188 | assert set(events) == events_sender.events 189 | 190 | 191 | @pytest.mark.asyncio 192 | async def test_invalid_max_workers(): 193 | """ 194 | assert value error is raised when initializing a forwarder with < 1 195 | max workers 196 | """ 197 | max_workers = 0 198 | events_manager = EventsManagerMock(DEFAULT_MAX_EVENTS_TO_READ) 199 | events_sender = EventsSenderMock(max_workers) 200 | with pytest.raises(ValueError): 201 | Forwarder( 202 | events_manager, 203 | events_sender, 204 | max_workers=max_workers 205 | ) 206 | 207 | 208 | @pytest.mark.asyncio 209 | async def test_invalid_max_events_to_read(): 210 | """ 211 | assert value error is raised when initializing a forwarder with < 1 212 | max events to read 213 | """ 214 | max_workers = 1 215 | max_events_to_read = 0 216 | events_manager = EventsManagerMock(max_events_to_read) 217 | events_sender = EventsSenderMock(max_workers) 218 | with pytest.raises(ValueError): 219 | Forwarder( 220 | events_manager, 221 | events_sender, 222 | max_events_to_read=max_events_to_read 223 | ) 224 | -------------------------------------------------------------------------------- /pkg/cluster_agent/cluster_discovery.py: -------------------------------------------------------------------------------- 1 | """ 2 | Cluster discovery - watch & publish events in the cluster 3 | """ 4 | import asyncio 5 | import logging 6 | import socket 7 | from dataclasses import dataclass 8 | from typing import Callable, Any, 
Dict 9 | from traceback import format_exc 10 | import kubernetes_asyncio 11 | from aiohttp.client_exceptions import ClientError 12 | from kubernetes_event import ( 13 | KubernetesEvent, 14 | WatchKubernetesEvent, 15 | WatchKubernetesEventType, 16 | KubernetesEventException, 17 | KubernetesEventType, 18 | ) 19 | 20 | @dataclass 21 | class WatchTarget: 22 | """ watch target """ 23 | endpoint: Callable # endpoint to watch 24 | last_resource_version: Any = None # used to avoid full resyncs 25 | 26 | class ClusterDiscoveryException(Exception): 27 | pass 28 | 29 | class ErrorWatchEventException(ClusterDiscoveryException): 30 | pass 31 | 32 | class ClusterDiscovery: 33 | """ 34 | Cluster resources discovery - watches & publish events in cluster 35 | """ 36 | 37 | # default time to wait between watch attemps 38 | RETRY_INTERVAL_SECONDS = 30 39 | 40 | def _create_watch_targets( 41 | self, 42 | should_collect_resources: bool, 43 | should_collect_events: bool, 44 | ) -> Dict[str, WatchTarget]: 45 | """ 46 | Creates watch targets - all pods, nodes & deployments. 
47 | """ 48 | targets = {} 49 | if should_collect_resources: 50 | targets.update( 51 | { 52 | "Pod": WatchTarget(self.client.list_pod_for_all_namespaces), 53 | "Node": WatchTarget(self.client.list_node), 54 | "Namespace": WatchTarget(self.client.list_namespace), 55 | "Deployment": WatchTarget( 56 | self.apps_api_client.list_deployment_for_all_namespaces, 57 | ), 58 | "DaemonSet": WatchTarget( 59 | self.apps_api_client.list_daemon_set_for_all_namespaces, 60 | ), 61 | "StatefulSet": WatchTarget( 62 | self.apps_api_client.list_stateful_set_for_all_namespaces, 63 | ), 64 | } 65 | ) 66 | if should_collect_events: 67 | targets["Event"] = WatchTarget(self.client.list_event_for_all_namespaces) 68 | return targets 69 | 70 | def __init__( 71 | self, 72 | event_handler, 73 | should_collect_resources=True, 74 | should_collect_events=False, 75 | api_client=None, 76 | retry_interval_seconds=RETRY_INTERVAL_SECONDS, 77 | ): 78 | """ 79 | :param event_handler: to write events to 80 | :param api_client: of the cluster to discover. If not given, using the 81 | default one. 82 | """ 83 | self.i = 0 84 | self.event_handler = event_handler 85 | self.client = kubernetes_asyncio.client.CoreV1Api(api_client=api_client) 86 | self.version_client = kubernetes_asyncio.client.VersionApi(api_client=api_client) 87 | self.apps_api_client = kubernetes_asyncio.client.AppsV1Api(api_client=api_client) 88 | self.should_collect_resources = should_collect_resources 89 | self.watch_targets = self._create_watch_targets( 90 | should_collect_resources, 91 | should_collect_events 92 | ) 93 | self.watch_tasks = [] 94 | if retry_interval_seconds < 0: 95 | raise ValueError("Retry interval seconds must be bigger than 0") 96 | 97 | self.retry_interval_seconds = retry_interval_seconds 98 | 99 | 100 | def _update_resource_version( 101 | self, 102 | kind, 103 | target: WatchTarget, 104 | resource_version 105 | ): 106 | """ 107 | Updates the resource version of given kind & watch targets. 
108 | """ 109 | self.watch_targets[kind].last_resource_version = resource_version 110 | 111 | 112 | async def _get_initial_list(self, kind, target): 113 | """ 114 | Performs initial list of given watch target endpoint. 115 | """ 116 | response = await target() 117 | for item in response.items: 118 | item.kind = kind 119 | kubernetes_event = WatchKubernetesEvent( 120 | WatchKubernetesEventType.ADDED, 121 | item.to_dict() 122 | ) 123 | await self.event_handler(kubernetes_event) 124 | 125 | return response.metadata.resource_version 126 | 127 | 128 | async def _run_watch(self, kind, target, stream): 129 | """ 130 | Runs the watch stream of given watch target and watch resource kind. 131 | """ 132 | async for event in stream: 133 | try: 134 | event_type = event.get("type") 135 | if not event_type or event_type.lower() == "error": 136 | raise ErrorWatchEventException("Received an error event") 137 | logging.debug("Received event: %s", event) 138 | kubernetes_event = WatchKubernetesEvent.from_watch_dict(event) 139 | await self.event_handler(kubernetes_event) 140 | resource_version = kubernetes_event.get_resource_version() 141 | self._update_resource_version( 142 | kind, 143 | target, 144 | resource_version 145 | ) 146 | logging.debug("%s new resource version: %s", kind, resource_version) 147 | except KubernetesEventException: 148 | logging.debug("Skipping invalid event") 149 | 150 | 151 | async def _start_watch(self, kind, target): 152 | """ 153 | Watches given cluster endpoint. 154 | For each streamed event, creating KubernetesEvent and writing the 155 | event to the event handler. Ignoring invalid event object. 
156 | """ 157 | if not target.last_resource_version: 158 | # resource first time retrieval 159 | resource_version = await self._get_initial_list(kind, target.endpoint) 160 | self._update_resource_version( 161 | kind, 162 | target, 163 | resource_version 164 | ) 165 | else: 166 | # continue watch from last preserved resource version 167 | resource_version = target.last_resource_version 168 | 169 | try: 170 | while True: 171 | logging.debug("Start watch for %s", kind) 172 | w = kubernetes_asyncio.watch.Watch() 173 | stream = w.stream(target.endpoint, resource_version=resource_version) 174 | await self._run_watch(kind, target, stream) 175 | except ClientError: 176 | logging.debug("Client Error: %s", format_exc()) 177 | # resource version timeout, restarting watch 178 | # from last preserved resource version 179 | await self._start_watch(kind, target) 180 | except ErrorWatchEventException: 181 | logging.debug("Restarting %s watch due to an error event", kind) 182 | self._update_resource_version(kind, target, None) 183 | await self._start_watch(kind, target) 184 | except asyncio.CancelledError: 185 | pass 186 | 187 | def _stop_all(self): 188 | """ 189 | Stops all watch tasks 190 | """ 191 | for task in self.discover_tasks: 192 | if not task.done(): 193 | task.cancel() 194 | elif not task.cancelled(): 195 | task.exception() 196 | 197 | 198 | async def _collect_cluster_info(self): 199 | """ 200 | Collects the cluster info 201 | """ 202 | try: 203 | version = None 204 | try: 205 | version: str = (await self.version_client.get_code()).git_version 206 | except Exception as exception: 207 | logging.debug("Could not extract cluster version") 208 | logging.error(str(exception)) 209 | logging.error(format_exc()) 210 | try: 211 | data = {"version": version} 212 | kubernetes_event = KubernetesEvent(KubernetesEventType.CLUSTER, data) 213 | await self.event_handler(kubernetes_event) 214 | except KubernetesEventException: 215 | logging.debug("Failed to create cluster event") 216 | 
raise 217 | except asyncio.CancelledError: 218 | pass 219 | 220 | async def start(self): 221 | """ 222 | Starts watch task per target (see _create_watch_targets) and runs 223 | more discovery tasks such as retrieving cluster level information. 224 | In case of watch resync issues, restarting all watches from the last 225 | preserved resource version. 226 | In case of other network issues, stopping all tasks and restarting 227 | after RETRY_INTERVAL_SECONDS. 228 | """ 229 | try: 230 | await self._collect_cluster_info() 231 | self.discover_tasks = [ 232 | asyncio.create_task(self._start_watch(kind, target)) 233 | for kind, target in self.watch_targets.items() 234 | ] 235 | await asyncio.gather( 236 | *self.discover_tasks, 237 | loop = asyncio.get_event_loop() 238 | ) 239 | except (socket.gaierror, ClientError): 240 | self.stop() 241 | logging.error( 242 | "Connection error, retrying in %d seconds", 243 | self.retry_interval_seconds 244 | ) 245 | await asyncio.sleep(self.retry_interval_seconds) 246 | await self.start() 247 | except asyncio.CancelledError: 248 | self.stop() 249 | 250 | 251 | def stop(self): 252 | """ 253 | Stops the cluster discovery. 
"""
ClusterDiscovery tests
"""
import asyncio
import socket
import pytest
import kubernetes_asyncio
from dataclasses import dataclass, field
from typing import List, Dict, Set, Any
from asynctest.mock import patch
from cluster_discovery import ClusterDiscovery, WatchTarget
from kubernetes_event import (
    KubernetesEvent,
    WatchKubernetesEvent,
    KubernetesEventType,
    WatchKubernetesEventType,
)
from .conftest import run_coroutines_with_timeout


TEST_VERSION = "v1.18"
CLUSTER_EVENT = KubernetesEvent(
    KubernetesEventType.CLUSTER,
    {
        "version": TEST_VERSION
    }
)
INVALID_CLUSTER_EVENT = KubernetesEvent(
    KubernetesEventType.CLUSTER,
    {
        "version": None,
    }
)
TEST_RESOURCE_VERSION = "123333"


class MockWatchTarget:
    """
    A fake watch-target endpoint: serves an initial resource list and a
    canned stream of watch events, optionally raising configured errors.
    """

    def __init__(self, kind, resource_list, watch_events, list_error, stream_error, delay):
        """
        :param kind: resource kind this target pretends to serve
        :param resource_list: items returned by the initial list call
        :param watch_events: raw events later yielded by the watch stream
        :param list_error: exception to raise on the initial list call
        :param stream_error: exception to raise when the stream is opened
        :param delay: seconds between streamed events
        """
        self.kind = kind
        self.resource_list = resource_list
        self.watch_events = watch_events
        self.list_error = list_error
        self.stream_error = stream_error
        self.delay = delay

    async def __call__(self, *arg, **kwargs):
        """
        Called when the cluster discovery performs its initial list
        """
        if self.list_error:
            raise self.list_error
        current_kind = self.kind

        class ItemWrapper:
            """Mimics a kubernetes model item: settable kind + to_dict."""

            def __init__(self, data: Dict):
                self.data = data
                self._kind = None
                self.expected_kind = current_kind

            @property
            def kind(self):
                # BUGFIX: the getter was missing its return statement and
                # always yielded None
                return self._kind

            @kind.setter
            def kind(self, kind):
                self._kind = kind

            def to_dict(self):
                # the discovery must have stamped the expected kind on the
                # item before serializing it
                assert self._kind == self.expected_kind
                return self.data

        @dataclass
        class ListResponse:
            @dataclass
            class Metadata:
                resource_version: Any = TEST_RESOURCE_VERSION

            items: List[ItemWrapper]
            # BUGFIX: default_factory - the previous `Metadata()` default
            # shared one mutable instance across every ListResponse
            metadata: Metadata = field(default_factory=Metadata)

        return ListResponse([ItemWrapper(resource) for resource in self.resource_list])


class KubernetesResourceObject:
    """ Test kubernetes resource object """
    def __init__(self, data: Dict):
        self.data = data

    def to_dict(self):
        """ to dict - gets the original data """
        return self.data


class EventsManager:
    """
    EventsManager, used for writing & validating given events
    """
    def __init__(self):
        self.events: Set[KubernetesEvent] = set()

    async def write_event(self, event: KubernetesEvent):
        """ Adds an event to the manager """
        self.events.add(event)


class EventsGenerator:
    """
    Events generator, used for each watch target
    """

    def __init__(self, events, delay=0):
        """
        :param events: to return one by one
        :param delay: between each event
        """
        self.i = 0  # index of the next event to emit
        self.events = events
        self.delay = delay

    def __aiter__(self):
        return self

    async def __anext__(self):
        """
        Gets the next event. Waits delay seconds between each event.
        When done, sleeping "forever" - to simulate a "real" scenario where
        the events stream doesn't end.
        """
        i = self.i
        # BUGFIX: loop the sleep so exhaustion can never fall through to an
        # IndexError once a single sleep interval elapses
        while self.i >= len(self.events):
            # sleeps forever, simulating a real scenario
            await asyncio.sleep(1000)
        self.i += 1
        if i:
            await asyncio.sleep(self.delay)
        return self.events[i]
class WatchMock:
    """
    Stand-in for the kubernetes client Watch class.
    """

    def stream(self, target: MockWatchTarget, resource_version=None):
        """
        Opens the canned events stream for *target*. Raises the configured
        stream error instead, when the MockWatchTarget carries one.
        """
        assert resource_version == TEST_RESOURCE_VERSION
        if target.stream_error:
            raise target.stream_error

        return EventsGenerator(target.watch_events, delay=target.delay)


class ClientMock:
    """
    Fake kubernetes API client, used for the cluster version retrieval.
    """

    def __init__(self, error=None):
        """
        :param error: exception to raise whenever the client is used
        """
        self.error = error

    async def get_code(self):
        """
        Returns the cluster version code, or raises the configured error.
        """
        if self.error:
            raise self.error

        class VersionResponse:
            """ Version response, as returned from the API server """
            def __init__(self, git_version):
                """
                :param git_version: the cluster version to return
                """
                self.git_version = git_version

        return VersionResponse(TEST_VERSION)


def _patch_cluster_discovery_watch_targets(
    cluster_discovery: ClusterDiscovery,
    watch_targets: List[MockWatchTarget],
    version_client
):
    """
    Patches the cluster discovery object - swaps every watch target and the
    cluster version client for the `fake` ones.
    """
    replaced = {}
    for mock_target in watch_targets:
        replaced[mock_target.kind] = WatchTarget(mock_target)
    cluster_discovery.watch_targets = replaced
    cluster_discovery.version_client = version_client


@pytest.fixture
def raw_target_events() -> List[List[Dict]]:
    """
    Generate some events. Each events list item is for one `watch target`
    :return: A list of event lists
    """
    def raw_event(event_type, payload):
        # wrap the payload the way the kubernetes watch API would
        return {
            "type": event_type,
            WatchKubernetesEvent.OBJECT_FIELD_KEY: (
                KubernetesResourceObject(payload)
            ),
        }

    return [
        [
            raw_event("ADDED", {"1a": "1a"}),
            raw_event("ADDED", {"1aa": "1aa"}),
            raw_event("MODIFIED", {"1m": "1m"}),
            raw_event("DELETED", {"1d": "1d"}),
        ],
        [
            raw_event("ADDED", {"2a": "2a"}),
            raw_event("MODIFIED", {"2m": "2m"}),
            raw_event("MODIFIED", {"2mm": "2mm"}),
            raw_event("DELETED", {"2d": "1d"}),
        ],
        [
            raw_event("ADDED", {"3a": "3a"}),
        ],
    ]


@pytest.fixture
def target_resource_lists() -> List[List[Dict]]:
    """
    Generate some resources.
    Each resources list item is for one `watch target`
    :return: A list of resources lists
    """
    return [
        [{"a": "A"}, {"T1": "T2"}],
        [{"x": "y"}],
        [{"Q": "T"}],
    ]
def _get_expected_events(
    resource_lists,
    raw_events: List[List[WatchKubernetesEvent]],
    cluster_event: KubernetesEvent
) -> Set[KubernetesEvent]:
    """
    Gets the expected event objects. Skips invalid events.
    If cluster_event is given then it is added to the expected events.
    """
    events = {
        WatchKubernetesEvent.from_watch_dict(raw_event)
        for target_events in raw_events
        for raw_event in target_events
        if WatchKubernetesEvent.OBJECT_FIELD_KEY in raw_event  # skip invalid test events
    }
    for resource_list in resource_lists:
        for resource in resource_list:
            events.add(WatchKubernetesEvent(WatchKubernetesEventType.ADDED, resource))

    if cluster_event:
        events.add(cluster_event)

    return events


async def _run_cluster_discovery(
    cluster_discovery,
    events_manager,
    resource_lists,
    raw_events,
    cluster_event,
    watch_stream_error=None,
    resource_list_error=None,
):
    """
    Runs the cluster discovery (cluster_discovery.start).
    Validates the task status (is running/task had an exception), and
    that the actual written events are the expected ones.
    """
    task = (await run_coroutines_with_timeout(
        (cluster_discovery.start(),),
        verify_tasks_finished=False,
        timeout=0.2
    ))[0]
    if watch_stream_error:
        expected_events = _get_expected_events(
            resource_lists,
            [],
            cluster_event
        )
        # exact type check on purpose: handled errors (e.g. socket.gaierror)
        # are also Exception subclasses and must NOT match this branch
        if type(watch_stream_error) == Exception:
            # unhandled error, task should be done
            assert task.done()
            assert type(task.exception()) == Exception
        else:  # task is expected to run as error should be handled
            assert not task.done()
    elif resource_list_error:
        expected_events = _get_expected_events(
            [],
            [],
            cluster_event
        )
        assert task.done()
    else:
        expected_events = _get_expected_events(
            resource_lists,
            raw_events,
            cluster_event
        )
        # normal run - cluster discovery shouldn't stop
        assert not task.done()

    assert expected_events == events_manager.events
    if not task.done():
        task.cancel()


async def _test_cluster_discovery(
    resource_lists,
    raw_events,
    invalid_cluster_event=False,
    include_invalid_watch_event=False,
    watch_stream_error=None,
    resource_list_error=None,
) -> ClusterDiscovery:
    """
    Tests the cluster discovery run.
    :param resource_lists: initial resources to be listed per watch target
    :param raw_events: to be read by the cluster discovery watch tasks
    :param invalid_cluster_event: indicates whether should expect a
        valid/invalid cluster event
    :param include_invalid_watch_event: indicates whether to include invalid
        watch events
    :param watch_stream_error: error to be raised when the cluster discovery
        tries to watch its targets.
    :param resource_list_error: error to be raised on the initial list call
    :return: the cluster discovery object
    """
    cluster_event = CLUSTER_EVENT
    cluster_error = None
    if invalid_cluster_event:
        cluster_event = INVALID_CLUSTER_EVENT
        cluster_error = Exception()

    version_client = ClientMock(error=cluster_error)
    manager = EventsManager()
    cluster_discovery = ClusterDiscovery(manager.write_event)
    if include_invalid_watch_event:
        for target_events in raw_events:
            target_events.append({"invalid_event": "invalid"})

    # prepare watch targets - with events and possibly an error, if given
    targets = [
        MockWatchTarget(
            str(i),
            resource_lists[i],
            raw_events[i],
            resource_list_error,
            watch_stream_error,
            0.01
        )
        for i in range(len(raw_events))
    ]
    # replace watch targets & version client at cluster_discovery
    _patch_cluster_discovery_watch_targets(
        cluster_discovery, targets, version_client
    )
    # tests the cluster discovery run
    await _run_cluster_discovery(
        cluster_discovery,
        manager,
        resource_lists,
        raw_events,
        cluster_event=cluster_event,
        watch_stream_error=watch_stream_error,
        resource_list_error=resource_list_error,
    )
    return cluster_discovery
@pytest.mark.asyncio
@patch("kubernetes_asyncio.client")
@patch("kubernetes_asyncio.watch.Watch", WatchMock)
async def test_sanity(_, target_resource_lists, raw_target_events):
    """
    Sanity test - the discovery collects every listed resource and event.
    """
    await _test_cluster_discovery(target_resource_lists, raw_target_events)


@pytest.mark.asyncio
@patch("kubernetes_asyncio.client")
@patch("kubernetes_asyncio.watch.Watch", WatchMock)
async def test_invalid_cluster_version(
    _,
    target_resource_lists,
    raw_target_events
):
    """
    Runs the discovery when no cluster version info is available.
    The discovery should keep running and still collect the watch events.
    """
    await _test_cluster_discovery(
        target_resource_lists,
        raw_target_events,
        invalid_cluster_event=True,
    )


@pytest.mark.asyncio
@patch("kubernetes_asyncio.client")
@patch("kubernetes_asyncio.watch.Watch", WatchMock)
async def test_invalid_watch_event(_, target_resource_lists, raw_target_events):
    """
    Runs the discovery with some invalid watch events mixed in.
    The discovery should keep running, collect the valid watch events and
    silently skip the invalid ones.
    """
    await _test_cluster_discovery(
        target_resource_lists,
        raw_target_events,
        include_invalid_watch_event=True,
    )


@pytest.mark.asyncio
@patch("kubernetes_asyncio.client")
@patch("kubernetes_asyncio.watch.Watch", WatchMock)
async def test_watch_stream_unhandled_error(
    _,
    target_resource_lists,
    raw_target_events
):
    """
    Raises an unhandled error from the watch stream - only the resource
    list events are expected, and the discovery task should finish with
    the raised error.
    """
    await _test_cluster_discovery(
        target_resource_lists,
        raw_target_events,
        invalid_cluster_event=False,
        watch_stream_error=Exception(),
    )


@pytest.mark.asyncio
@patch("kubernetes_asyncio.client")
@patch("kubernetes_asyncio.watch.Watch", WatchMock)
async def test_resource_list_unhandled_error(
    _,
    target_resource_lists,
    raw_target_events
):
    """
    Raises an unhandled error from the initial resource list - no watch
    events are expected, and the discovery task should finish with the
    raised error.
    """
    await _test_cluster_discovery(
        target_resource_lists,
        raw_target_events,
        invalid_cluster_event=False,
        resource_list_error=Exception(),
    )


@pytest.mark.asyncio
@patch("kubernetes_asyncio.client")
@patch("kubernetes_asyncio.watch.Watch", WatchMock)
async def test_watch_stream_handled_error(
    _,
    target_resource_lists,
    raw_target_events
):
    """
    Raises a handled (connection) error from the watch stream - no watch
    events are expected, and the discovery task should still be running.
    """
    await _test_cluster_discovery(
        target_resource_lists,
        raw_target_events,
        invalid_cluster_event=False,
        watch_stream_error=socket.gaierror,
    )


@pytest.mark.asyncio
@patch("kubernetes_asyncio.client")
async def test_invalid_retry_interval_seconds(_):
    """
    A negative retry interval must be rejected with a ValueError.
    """
    with pytest.raises(ValueError):
        ClusterDiscovery(None, retry_interval_seconds=-1)


@pytest.mark.asyncio
@patch("kubernetes_asyncio.client")
@patch("kubernetes_asyncio.watch.Watch", WatchMock)
async def test_stop(_, target_resource_lists, raw_target_events):
    """
    Tests cluster_discovery.stop - every discover task must end up done
    or cancelled.
    """
    cluster_discovery: ClusterDiscovery = (
        await _test_cluster_discovery(target_resource_lists, raw_target_events)
    )
    cluster_discovery.stop()
    # tasks are already cancelled - CancelledError will be raised when
    # they are next scheduled to run. Give the loop a moment so the task
    # objects reflect their final status before asserting.
    await asyncio.sleep(0.1)
    assert all(
        task.done() or task.cancelled()
        for task in cluster_discovery.discover_tasks
    )