├── pkg
└── cluster_agent
│ ├── __init__.py
│ ├── tests
│ ├── __init__.py
│ ├── ut
│ │ ├── __init__.py
│ │ ├── test_epsagon_client.py
│ │ ├── conftest.py
│ │ ├── test_events_sender.py
│ │ ├── test_events_manager.py
│ │ ├── test_kubernetes_event.py
│ │ ├── test_forwarder.py
│ │ └── test_cluster_discovery.py
│ └── system
│ │ ├── sanity
│ │ ├── __init__.py
│ │ └── test_sanity.py
│ │ ├── cluster_config.yml
│ │ ├── README.md
│ │ └── conftest.py
│ ├── requirements.txt
│ ├── requirements-dev.txt
│ ├── .dockerignore
│ ├── build
│ └── Dockerfile
│ ├── encoders.py
│ ├── cicd
│ ├── generate_test_matrix.py
│ └── get_test_cluster_config.py
│ ├── README.md
│ ├── cluster_agent_deployment.yaml
│ ├── epsagon_role.yaml
│ ├── events_sender.py
│ ├── epsagon_client.py
│ ├── logger_configurer.py
│ ├── events_manager.py
│ ├── forwarder.py
│ ├── kubernetes_event.py
│ ├── main.py
│ └── cluster_discovery.py
├── .gitignore
├── .github
├── CODEOWNERS
└── workflows
│ └── cluster_agent_tests.yml
└── README.md
/pkg/cluster_agent/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/pkg/cluster_agent/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/pkg/cluster_agent/tests/ut/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .env
2 | .vscode
3 | __pycache__
4 |
--------------------------------------------------------------------------------
/pkg/cluster_agent/tests/system/sanity/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/pkg/cluster_agent/requirements.txt:
--------------------------------------------------------------------------------
1 | kubernetes_asyncio
2 | aiohttp
3 | aiohttp-retry
4 | aiofiles
5 |
--------------------------------------------------------------------------------
/pkg/cluster_agent/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | pytest
2 | pytest-asyncio
3 | asynctest
4 | pytest_httpserver
5 |
--------------------------------------------------------------------------------
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | # Each line is a file pattern followed by one or more owners.
2 | * @epsagon/the-fabulous-team
--------------------------------------------------------------------------------
/pkg/cluster_agent/.dockerignore:
--------------------------------------------------------------------------------
1 | test*
2 | build/
3 | __pycache__/
4 | README.md
5 | venv
6 | requirements-dev.txt
7 | *.yml
8 |
--------------------------------------------------------------------------------
/pkg/cluster_agent/tests/system/cluster_config.yml:
--------------------------------------------------------------------------------
1 | kind: Cluster
2 | apiVersion: kind.x-k8s.io/v1alpha4
3 | nodes:
4 | - role: control-plane
5 | - role: worker
6 |
--------------------------------------------------------------------------------
/pkg/cluster_agent/build/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.7
2 |
3 | RUN mkdir /app
4 | WORKDIR /app
5 | ADD *.py /app/
6 | ADD requirements.txt /app/
7 | RUN pip install -r requirements.txt
8 |
9 | CMD ["python", "-u", "/app/main.py"]
10 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # epsagon-kubernetes
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 | # Epsagon Kubernetes
11 |
12 | Official repo for Epsagon Kubernetes agent.
13 |
14 | ## Prerequisites
15 |
16 | Kubernetes 1.16+
17 |
--------------------------------------------------------------------------------
/pkg/cluster_agent/encoders.py:
--------------------------------------------------------------------------------
1 | """
2 | Common json encoders
3 | """
4 | from datetime import datetime
5 | from json import JSONEncoder
6 |
7 |
class DateTimeEncoder(JSONEncoder):
    """JSON encoder that additionally knows how to serialize datetimes."""

    def default(self, o):  # pylint: disable=method-hidden
        """
        Serializes datetime values via str(); every other type is deferred
        to the base JSONEncoder (which raises TypeError when unsupported).
        """
        return str(o) if isinstance(o, datetime) else super().default(o)
18 |
--------------------------------------------------------------------------------
/pkg/cluster_agent/tests/system/sanity/test_sanity.py:
--------------------------------------------------------------------------------
1 | """
2 | System sanity tests
3 | """
4 | import asyncio
5 | import pytest
6 | import conftest
7 |
8 |
@pytest.fixture(scope='session', autouse=True)
async def install_agent():
    """Installs the cluster agent into the test cluster once per session."""
    agent_installer = conftest.ClusterAgentInstaller()
    await agent_installer.install_all()
13 |
@pytest.mark.asyncio
async def test_sanity():
    """
    A placeholder test - ran by CICD, used to test the agent pod is
    running successfully. The real check is the session-scoped
    install_agent fixture completing without error.
    """
21 |
--------------------------------------------------------------------------------
/pkg/cluster_agent/cicd/generate_test_matrix.py:
--------------------------------------------------------------------------------
1 | """
2 | Generates the system test file paths matrix.
3 | This is written to save time during CI/CD and support custom cluster config.
4 | """
5 | import os
6 | import json
7 |
SYSTEM_TEST_DIRPATH = "tests/system/"

def main():
    """
    Prints a JSON list of the sub-directories found under the system tests
    directory, with the root prefix stripped (one entry per test suite).
    """
    # os.walk yields (dirpath, dirnames, filenames); keep every dirpath
    # except the walk root itself
    walked_dirs = [entry[0] for entry in os.walk(SYSTEM_TEST_DIRPATH)]
    walked_dirs.remove(SYSTEM_TEST_DIRPATH)
    print(json.dumps(
        [path[len(SYSTEM_TEST_DIRPATH):] for path in walked_dirs]
    ))

if __name__ == '__main__':
    main()
17 |
18 |
19 |
--------------------------------------------------------------------------------
/pkg/cluster_agent/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 | # Epsagon Kubernetes Cluster Agent
9 |
10 |  
11 |
12 | The Epsagon Kubernetes cluster agent collects & watches your cluster resources. All collected events are sent to [Epsagon](https://dashboard.epsagon.com/).
13 | ## Documentation
14 | Further documentation can be found on [Epsagon official docs website](https://docs.epsagon.com/).
15 |
16 | ## Prerequisites
17 |
18 | Kubernetes 1.16+
19 |
--------------------------------------------------------------------------------
/pkg/cluster_agent/tests/system/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 | # Epsagon Kubernetes Cluster Agent - System tests
9 |
10 | To run the cluster agent system tests, you need the kubectl context to be set to a
11 | kubernetes cluster.
12 | Those tests also run as part of the CI/CD via a workflow set up for this repo.
13 | The workflow already takes care of the environment setup (using a Kind cluster).
14 |
15 | ## Prerequisites
16 |
17 | * A Kubernetes cluster environment. The cluster can be also a Kind cluster or another non-real environment. The cluster version should be 1.16+.
18 | * `kubectl` context to be set to your environment test cluster.
19 |
--------------------------------------------------------------------------------
/pkg/cluster_agent/cluster_agent_deployment.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Namespace
3 | metadata:
4 | name: epsagon-monitoring
5 | ---
6 | apiVersion: apps/v1
7 | kind: Deployment
8 | metadata:
9 | name: cluster-agent
10 | namespace: epsagon-monitoring
11 | spec:
12 | selector:
13 | matchLabels:
14 | app: epsagon-cluster-agent
15 | replicas: 1
16 | template:
17 | metadata:
18 | labels:
19 | app: epsagon-cluster-agent
20 | spec:
21 | serviceAccountName: cluster-agent
22 | containers:
23 | - name: cluster-agent
24 | image: epsagon/cluster-agent:1.0.0
25 | imagePullPolicy: Always
26 | env:
27 | - name: EPSAGON_TOKEN
28 | value: ""
29 | - name: EPSAGON_CLUSTER_NAME
30 | value: ""
31 | - name: EPSAGON_DEBUG
32 | value: "false"
33 | - name: EPSAGON_COLLECTOR_URL
34 | value: "https://collector.epsagon.com/resources/v1"
35 |
--------------------------------------------------------------------------------
/pkg/cluster_agent/cicd/get_test_cluster_config.py:
--------------------------------------------------------------------------------
1 | """
2 | Gets the cluster config file path according to given testing directory path.
3 | If the config is not found then using the default cluster configuration.
4 | """
5 | import os
6 | import sys
7 | import json
8 |
# fixed: the usage string previously omitted the required argument
USAGE = (
    "Usage: python get_test_cluster_config.py <test_directory>"
)
CLUSTER_CONFIG_FILENAME = "cluster_config.yml"
SYSTEM_TEST_DIRPATH = "tests/system/"
# used when the given test directory has no dedicated cluster config
DEFAULT_CLUSTER_CONFIG = os.path.join(SYSTEM_TEST_DIRPATH, CLUSTER_CONFIG_FILENAME)

def main(test_directory):
    """
    Prints the cluster config path for the given test directory.
    Falls back to the default config when the directory has no
    cluster_config.yml of its own.
    :param test_directory: directory to look for a dedicated config in
    """
    cluster_config_path = DEFAULT_CLUSTER_CONFIG
    file_path = os.path.join(test_directory, CLUSTER_CONFIG_FILENAME)
    if os.path.exists(file_path):
        cluster_config_path = file_path

    print(cluster_config_path)


if __name__ == '__main__':
    args = sys.argv
    if len(args) != 2:
        print(USAGE)
        sys.exit(1)

    main(args[1])
32 |
33 |
34 |
--------------------------------------------------------------------------------
/pkg/cluster_agent/epsagon_role.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Namespace
3 | metadata:
4 | name: epsagon-monitoring
5 | labels:
6 | name: epsagon-monitoring
7 | ---
8 | apiVersion: v1
9 | kind: ServiceAccount
10 | metadata:
11 | name: cluster-agent
12 | namespace: epsagon-monitoring
13 | ---
14 | apiVersion: rbac.authorization.k8s.io/v1
15 | kind: ClusterRole
16 | metadata:
17 | name: cluster-agent
18 | rules:
19 | - apiGroups: [""]
20 | resources:
21 | - nodes
22 | - services
23 | - endpoints
24 | - pods
25 | - namespaces
26 | - configmaps
27 | verbs: ["get", "list", "watch"]
28 | - apiGroups: ["apps"]
29 | resources: ["deployments", "statefulsets", "daemonsets"]
30 | verbs: ["get", "list", "watch"]
31 | ---
32 | apiVersion: rbac.authorization.k8s.io/v1
33 | kind: ClusterRoleBinding
34 | metadata:
35 | name: cluster-agent-binding
36 | roleRef:
37 | apiGroup: rbac.authorization.k8s.io
38 | kind: ClusterRole
39 | name: cluster-agent
40 | subjects:
41 | - kind: ServiceAccount
42 | name: cluster-agent
43 | namespace: epsagon-monitoring
44 |
--------------------------------------------------------------------------------
/pkg/cluster_agent/tests/ut/test_epsagon_client.py:
--------------------------------------------------------------------------------
1 | """
2 | EpsagonClient tests
3 | """
4 | import base64
5 | import pytest
6 | from epsagon_client import EpsagonClient
7 |
# token used by the tests; ENCODED_TOKEN is its expected HTTP basic-auth
# form: "<token>:" (empty password), base64-encoded
TEST_EPSAGON_TOKEN = "123"
ENCODED_TOKEN = base64.b64encode(f"{TEST_EPSAGON_TOKEN}:".encode()).decode()
TEST_PATH = "/post_path"
11 |
@pytest.mark.asyncio
async def test_initialize_no_epsagon_token():
    """Creating a client without an Epsagon token must raise ValueError."""
    missing_token = None
    with pytest.raises(ValueError):
        await EpsagonClient.create(missing_token)
17 |
18 |
@pytest.mark.asyncio
async def test_post(httpserver):
    """ post sanity test - verifies the basic-auth header reaches the server """
    data = {
        "a": "A"
    }
    def handler(request):
        # NOTE(review): these asserts run inside the test HTTP server's
        # handler thread; a failure surfaces as a failed response rather
        # than directly in this test body - confirm pytest_httpserver
        # re-raises handler assertions on teardown.
        assert "Authorization" in request.headers
        assert request.headers["Authorization"] == f"Basic {ENCODED_TOKEN}"
    httpserver.expect_request(
        TEST_PATH,
        method="POST",
    ).respond_with_handler(handler)
    client = await EpsagonClient.create(TEST_EPSAGON_TOKEN)
    await client.post(httpserver.url_for(TEST_PATH), data)
34 |
--------------------------------------------------------------------------------
/pkg/cluster_agent/tests/ut/conftest.py:
--------------------------------------------------------------------------------
1 | """
2 | Common test settings
3 | """
4 | import asyncio
5 | from asynctest.mock import MagicMock
6 |
# Monkey patch MagicMock so mocked attributes can be awaited directly:
# awaiting any MagicMock delegates to a fresh no-op coroutine, so
# `await mock.something()` works without special setup in each test.
async def async_magic():
    pass

MagicMock.__await__ = lambda x: async_magic().__await__()
12 |
13 |
async def run_coroutines_with_timeout(
    coroutines,
    verify_tasks_finished=True,
    timeout=1
):
    """
    Wraps the given coroutines in tasks and waits for them with a timeout.
    :param coroutines: coroutines to schedule and run
    :param verify_tasks_finished: assert that every task completed before
        the timeout expired
    :param timeout: seconds to wait for all the tasks to finish
    :return: the list of tasks created for the coroutines (same order)
    """
    scheduled = [asyncio.create_task(item) for item in coroutines]
    done, _pending = await asyncio.wait(
        scheduled,
        timeout=timeout,
        return_when=asyncio.ALL_COMPLETED
    )
    if verify_tasks_finished:
        assert len(done) == len(scheduled)

    return scheduled
37 |
--------------------------------------------------------------------------------
/pkg/cluster_agent/events_sender.py:
--------------------------------------------------------------------------------
1 | """
2 | Kubernetes Events sender
3 | """
4 | import json
5 | import base64
6 | import zlib
7 | from typing import List
8 | from encoders import DateTimeEncoder
9 | from kubernetes_event import KubernetesEvent
10 |
class EventsSender:
    """
    Sends batches of Kubernetes events to a collector endpoint.
    """

    def __init__(self, client, url, cluster_name, epsagon_token):
        """
        :param client: used to send events by
        :param url: to send the events to
        :param cluster_name: cluster name reported alongside the events
        :param epsagon_token: account token attached to every payload
        """
        self.client = client
        self.url = url
        self.epsagon_token = epsagon_token
        self.cluster_name = cluster_name

    async def send_events(self, events: List[KubernetesEvent]):
        """
        Sends the given events; a falsy/empty batch is a no-op.
        """
        if not events:
            return

        serialized = json.dumps(
            [event.to_dict() for event in events],
            cls=DateTimeEncoder
        )
        # zlib + base64 keeps the payload compact and JSON-safe
        packed = base64.b64encode(
            zlib.compress(serialized.encode("utf-8"))
        ).decode("utf-8")
        payload = {
            "epsagon_token": self.epsagon_token,
            "cluster_name": self.cluster_name,
            "data": packed,
        }

        await self.client.post(self.url, json.dumps(payload))
45 |
--------------------------------------------------------------------------------
/pkg/cluster_agent/epsagon_client.py:
--------------------------------------------------------------------------------
1 | """
2 | Async Epsagon client
3 | """
4 | from http import HTTPStatus
5 | from aiohttp.helpers import BasicAuth
6 | from aiohttp.client_exceptions import ClientError
7 | from aiohttp_retry import RetryClient, ExponentialRetry
8 |
class EpsagonClientException(Exception):
    """Raised for Epsagon client errors."""
11 |
12 |
class EpsagonClient:
    """
    Async Epsagon client, backed by a retrying aiohttp session.
    """

    DEFAULT_RETRY_ATTEMPTS = 3

    @classmethod
    async def create(cls, epsagon_token, retry_attempts=DEFAULT_RETRY_ATTEMPTS):
        """
        Creates a new EpsagonClient instance.
        :param epsagon_token: used for authorization, must be non-empty
        :param retry_attempts: max attempts for requests failing with a
            ClientError (exponential backoff between attempts)
        :raises ValueError: if no epsagon token is given
        """
        # validate before allocating the instance or any client resources
        if not epsagon_token:
            raise ValueError("Epsagon token must be given")
        self = cls()
        self.epsagon_token = epsagon_token
        retry_options = ExponentialRetry(
            attempts=retry_attempts,
            exceptions=(ClientError,)
        )
        # the token is sent as the basic-auth login with an empty password
        self.client = RetryClient(
            auth=BasicAuth(login=self.epsagon_token),
            headers={
                "Content-Type": "application/json",
            },
            retry_options=retry_options,
            raise_for_status=True
        )
        return self

    async def post(self, url, data):
        """
        Posts data to the given Epsagon endpoint; the response body is
        discarded.
        :param url: endpoint to post the data to
        :param data: to send
        NOTE(review): with raise_for_status=True a non-success HTTP status
        surfaces as an aiohttp client error after the retries - confirm.
        """
        async with self.client.post(url, data=data):
            pass

    async def close(self):
        """
        Closes the underlying HTTP client session.
        """
        await self.client.close()
59 |
--------------------------------------------------------------------------------
/pkg/cluster_agent/logger_configurer.py:
--------------------------------------------------------------------------------
1 | """
2 | Logger configurer helper module.
3 | """
4 |
5 | import sys
6 | import logging
7 | from logging.handlers import RotatingFileHandler
8 |
class LoggerConfigurer:
    """
    Logger configurer for the collector log.

    Installs two handlers on the target logger: a rotating file handler
    that always records DEBUG, and a stdout handler whose level follows
    the is_debug flag.
    """

    MAX_LOG_FILE_SIZE = 10 * 1024 * 1024  # 10MB per log file
    FILE_BACKUP_COUNT = 1

    def __init__(
        self,
        log_format: str,
        log_file_path: str,
        logger: logging.Logger = None
    ):
        """
        :param log_format: format string applied to both handlers
        :param log_file_path: path of the rotating log file
        :param logger: logger to configure, defaults to the root logger.
        """
        self.log_format = log_format
        self.log_file_path = log_file_path
        self.log_file_handler = None
        self.output_handler = None
        self.logger = logging.getLogger() if not logger else logger

    def configure_logger(self, is_debug: bool):
        """
        Configures the logger handlers with the log format & level.
        Configures 2 handlers:
        - 1 output handler (stdout), level set by given param `is_debug`
        - 1 file handler, level set to logging.DEBUG
        """
        formatter = logging.Formatter(self.log_format)
        self.log_file_handler = RotatingFileHandler(
            self.log_file_path,
            maxBytes=self.MAX_LOG_FILE_SIZE,
            backupCount=self.FILE_BACKUP_COUNT
        )
        self.output_handler = logging.StreamHandler(sys.stdout)
        # use setLevel() rather than assigning .level so logging's internal
        # bookkeeping stays consistent
        self.output_handler.setLevel(logging.DEBUG if is_debug else logging.INFO)
        self.log_file_handler.setLevel(logging.DEBUG)
        # the logger itself stays at DEBUG; per-handler levels do the filtering
        self.logger.setLevel(logging.DEBUG)
        for handler in (self.log_file_handler, self.output_handler):
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)

    def update_logger_level(self, is_debug: bool):
        """
        Updates the logger level. Updates only the stdout handler as the
        file handler is always set to logging.DEBUG.
        """
        self.output_handler.setLevel(logging.DEBUG if is_debug else logging.INFO)
59 |
--------------------------------------------------------------------------------
/pkg/cluster_agent/tests/ut/test_events_sender.py:
--------------------------------------------------------------------------------
1 | """
2 | EventsSender tests
3 | """
4 | import base64
5 | import json
6 | import zlib
7 | import pytest
8 | from typing import Dict, List
9 | from asynctest.mock import patch, MagicMock
10 | from encoders import DateTimeEncoder
11 | from epsagon_client import EpsagonClient
12 | from events_sender import EventsSender
13 | from kubernetes_event import (
14 | KubernetesEvent,
15 | WatchKubernetesEvent,
16 | KubernetesEventType,
17 | WatchKubernetesEventType,
18 | )
19 |
# fixed fixture values shared by the EventsSender tests below
TEST_URL = "http://testurl/1"
TEST_CLUSTER_NAME = "test-cluster-name"
TEST_EPSAGON_TOKEN = "1234"
23 |
24 |
def _get_expected_data(
    events_sender: EventsSender,
    events: List[KubernetesEvent]
):
    """
    Builds the exact payload EventsSender is expected to post for the
    given events: serialized, zlib-compressed and base64-encoded.
    """
    serialized = json.dumps(
        [event.to_dict() for event in events],
        cls=DateTimeEncoder
    )
    compressed = base64.b64encode(
        zlib.compress(serialized.encode("utf-8"))
    ).decode("utf-8")
    return json.dumps({
        "epsagon_token": events_sender.epsagon_token,
        "cluster_name": events_sender.cluster_name,
        "data": compressed,
    })
43 |
44 |
@pytest.mark.asyncio
@patch("epsagon_client.EpsagonClient")
async def test_send_events_sanity(epsagon_client_mock):
    """Sends a non-empty batch and verifies the exact posted payload."""
    client = epsagon_client_mock.return_value
    sender = EventsSender(
        client,
        TEST_URL,
        TEST_CLUSTER_NAME,
        TEST_EPSAGON_TOKEN
    )
    events = [
        KubernetesEvent(KubernetesEventType.CLUSTER, {"a": "b"}),
        WatchKubernetesEvent(WatchKubernetesEventType.ADDED, {"a": "b"}),
    ]
    await sender.send_events(events)
    client.post.assert_called_once_with(
        TEST_URL,
        _get_expected_data(sender, events)
    )
64 |
65 |
@pytest.mark.asyncio
@patch("epsagon_client.EpsagonClient")
async def test_send_no_events(epsagon_client_mock):
    """An empty batch must not trigger any HTTP call."""
    client = epsagon_client_mock.return_value
    sender = EventsSender(
        client,
        TEST_URL,
        TEST_CLUSTER_NAME,
        TEST_EPSAGON_TOKEN
    )
    await sender.send_events([])
    client.post.assert_not_called()
79 |
--------------------------------------------------------------------------------
/.github/workflows/cluster_agent_tests.yml:
--------------------------------------------------------------------------------
1 | name: Create Cluster
2 |
3 | on:
4 | pull_request:
5 | branches: [main]
6 | push:
7 | branches:
8 | - main
9 |
10 | jobs:
11 | unit-testing:
12 | runs-on: ubuntu-latest
13 | steps:
14 | - uses: actions/checkout@v2
15 | - name: Set up Python 3.7
16 | uses: actions/setup-python@v2
17 | with:
18 | python-version: 3.7
19 |
20 | - name: Install dependencies
21 | timeout-minutes: 5
22 | working-directory: ./pkg/cluster_agent
23 | run: |
24 | pip install -r requirements-dev.txt
25 | pip install -r requirements.txt
26 | - name: test
27 | timeout-minutes: 1
28 | working-directory: ./pkg/cluster_agent
29 | run: python -m pytest ./tests/ut
30 |
31 |
32 | system_test_install:
33 | timeout-minutes: 10
34 | runs-on: ubuntu-latest
35 | outputs:
36 | matrixTestPath: ${{ steps.set-matrix.outputs.matrixTestPath }}
37 | steps:
38 | - uses: actions/checkout@v2
39 | - name: set-matrix
40 | id: set-matrix
41 | working-directory: ./pkg/cluster_agent
42 | run: |
43 | echo "::set-output name=matrixTestPath::`python ./cicd/generate_test_matrix.py`"
44 |
45 | system_tests:
46 | runs-on: ubuntu-latest
47 | timeout-minutes: 15
48 | needs: system_test_install
49 | strategy:
50 | fail-fast: false
51 | matrix:
52 | test_path: ${{ fromJson(needs.system_test_install.outputs.matrixTestPath) }}
53 |
54 | steps:
55 | - uses: actions/checkout@v2
56 | - name: get-cluster-config-path
57 | id: get-cluster-config-path
58 | run: echo "::set-output name=clusterConfig::`python ./cicd/get_test_cluster_config.py ${{ matrix.test_path }}`"
59 | working-directory: ./pkg/cluster_agent
60 |
61 | - name: Create k8s Kind Cluster
62 | uses: helm/kind-action@v1.2.0
63 | with:
64 | cluster_name: test-cluster
65 | config: ./pkg/cluster_agent/${{ steps.get-cluster-config-path.outputs.clusterConfig }}
66 |
67 | - name: build-image
68 | run: docker build . -t epsagon/cluster-agent:test -f ./build/Dockerfile
69 | working-directory: ./pkg/cluster_agent
70 |
71 | - name: load-image-to-kind
72 | run: kind load docker-image epsagon/cluster-agent:test --name test-cluster
73 |
74 | - name: Set up Python 3.7
75 | uses: actions/setup-python@v2
76 | with:
77 | python-version: 3.7
78 |
79 | - name: Install dependencies
80 | timeout-minutes: 5
81 | working-directory: ./pkg/cluster_agent
82 | run: |
83 | pip install -r requirements-dev.txt
84 | pip install -r requirements.txt
85 | - name: test
86 | timeout-minutes: 1
87 | working-directory: ./pkg/cluster_agent/tests/system/
88 | run: python -m pytest ${{ matrix.test_path }}
89 |
--------------------------------------------------------------------------------
/pkg/cluster_agent/events_manager.py:
--------------------------------------------------------------------------------
1 | """
2 | Events managers module
3 | """
4 | import abc
5 | import logging
6 | from asyncio import Queue, wait_for, TimeoutError
7 | from typing import List
8 | from kubernetes_event import KubernetesEvent
9 |
10 |
class EventsManager(abc.ABC):
    """
    An abstract asynchronous events manager - used to read from & write to
    asynchronously.
    Each specific events manager should inherit from this class and
    implement the storage primitives.
    """

    @abc.abstractmethod
    def is_empty(self) -> bool:
        """
        Returns whether there're no unread events
        """
        raise NotImplementedError

    @abc.abstractmethod
    async def write_event(self, event: KubernetesEvent):
        """
        Writes an event
        """
        raise NotImplementedError

    @abc.abstractmethod
    async def get_event(self) -> KubernetesEvent:
        """
        Reads an event
        """
        raise NotImplementedError

    async def _read_event(self, timeout: int=None):
        """
        Reads and returns a single event, waiting up to `timeout` seconds
        when a timeout is given. Returns None when the wait times out.
        """
        try:
            return await wait_for(self.get_event(), timeout=timeout)
        except TimeoutError:
            return None

    async def get_events(self, max_size: int, timeout: int=None) -> List[KubernetesEvent]:
        """
        Reads up to max_size events.
        Waits until the earlier of:
        - at least one event is available; all currently-buffered events
          (up to max_size) are then returned.
        - the timeout expires (when given); an empty list is returned.
        :param max_size: max events to read; if < 1 an empty list is returned
        :param timeout: optional seconds to wait for the first event only;
            subsequent events are taken without waiting.
        """
        if max_size < 1:
            return []

        initial_event = await self._read_event(timeout=timeout)
        if not initial_event:
            return []

        batch = [initial_event]
        while len(batch) < max_size and not self.is_empty():
            batch.append(await self.get_event())

        return batch
80 |
81 |
class InMemoryEventsManager(EventsManager):
    """
    In-memory events manager backed by an asyncio.Queue.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.events_queue = Queue()

    def is_empty(self) -> bool:
        """Returns True when no unread events are queued."""
        return self.events_queue.empty()

    async def write_event(self, event: KubernetesEvent):
        """Enqueues a single event."""
        await self.events_queue.put(event)

    async def get_event(self) -> KubernetesEvent:
        """Dequeues a single event, waiting until one is available."""
        return await self.events_queue.get()

    def clean(self):
        """
        Drops all pending events by swapping in a fresh queue.
        """
        self.events_queue = Queue()
105 |
--------------------------------------------------------------------------------
/pkg/cluster_agent/forwarder.py:
--------------------------------------------------------------------------------
1 | """
2 | KubernetesEvent forwarder
3 | """
4 | import asyncio
5 | from typing import List, Set
6 | from kubernetes_event import KubernetesEvent
7 | from events_manager import EventsManager
8 | from events_sender import EventsSender
9 |
10 |
class Forwarder:
    """
    A generic KubernetesEvent forwarder.

    Repeatedly reads batches of events from an EventsManager and forwards
    them through an EventsSender using a bounded pool of asyncio tasks.
    """
    # upper bound on concurrently-running send tasks
    DEFAULT_MAX_WORKERS = 5
    # max batch size read from the events manager per iteration
    DEFAULT_MAX_EVENTS_TO_READ = 100
    # seconds to wait for the first event of each batch
    DEFAULT_GET_EVENTS_TIMEOUT = 1

    def __init__(
        self,
        events_manager: EventsManager,
        events_sender: EventsSender,
        max_workers: int = DEFAULT_MAX_WORKERS,
        max_events_to_read: int = DEFAULT_MAX_EVENTS_TO_READ
    ):
        """
        :param events_manager: used to read from events
        :param events_sender: used to send read events to
        :param max_workers: to forward read events
        :param max_events_to_read: to read from the events_manager
        :raises ValueError: if max_workers or max_events_to_read is < 1
        """
        self.events_manager = events_manager
        self.events_sender = events_sender
        if max_workers < 1:
            raise ValueError("Invalid workers count value, must be > 0")
        self.max_workers_count: int = max_workers
        if max_events_to_read < 1:
            raise ValueError("Invalid max events to read value, must be > 0")
        self.max_events_to_read: int = max_events_to_read
        # currently-scheduled worker tasks; finished tasks stay in the set
        # until inspected/replaced inside start()
        self.running_workers: Set[asyncio.Task] = set()

    async def _forward_events(self, events: List[KubernetesEvent]):
        """
        Forwards the given events list.
        Cancellation is swallowed so a cancelled worker exits quietly.
        """
        try:
            await self.events_sender.send_events(events)
        except asyncio.CancelledError:
            pass

    def _stop_all_workers(self):
        """
        Stops all workers: cancels unfinished tasks; for finished,
        non-cancelled tasks the exception (if any) is retrieved so asyncio
        does not warn about an un-consumed task exception.
        """
        for worker in self.running_workers:
            if not worker.done():
                worker.cancel()
            elif not worker.cancelled():
                worker.exception()
        self.running_workers = set()

    def _check_failed_workers(self, workers):
        """
        Checks the finished workers status. If any worker had an error, then
        stopping the rest of the workers and raising that error.
        :param workers: finished tasks to inspect
        """
        for task in workers:
            task_exception = task.exception()
            if task_exception:
                self._stop_all_workers()
                raise task_exception

    def _get_finished_workers(self):
        """
        Gets the worker tasks that have already finished running.
        """
        return [task for task in self.running_workers if task.done()]

    async def start(self):
        """
        Starts the Forwarder. The forwarder will read up to MAX_EVENTS_TO_READ
        at each iteration using the events_manager, and sends them using the
        events_sender.
        Runs until cancelled; on cancellation all workers are stopped.
        """
        try:
            while True:
                events: List[KubernetesEvent] = await self.events_manager.get_events(
                    self.max_events_to_read,
                    timeout=self.DEFAULT_GET_EVENTS_TIMEOUT
                )
                # surface any error from workers that finished since the
                # previous iteration
                self._check_failed_workers(self._get_finished_workers())
                if not events:
                    continue
                if len(self.running_workers) < self.max_workers_count:
                    self.running_workers.add(asyncio.create_task(
                        self._forward_events(events)
                    ))
                else:
                    # pool is full - wait for at least one worker to finish
                    # before scheduling the next batch
                    finished, unfinished = await asyncio.wait(
                        self.running_workers,
                        return_when=asyncio.FIRST_COMPLETED
                    )
                    self._check_failed_workers(finished)
                    self.running_workers = unfinished
                    self.running_workers.add(asyncio.create_task(
                        self._forward_events(events)
                    ))
        except asyncio.CancelledError:
            self._stop_all_workers()
110 |
111 |
--------------------------------------------------------------------------------
/pkg/cluster_agent/tests/ut/test_events_manager.py:
--------------------------------------------------------------------------------
1 | """
2 | EventsManager tests
3 | """
4 | import asyncio
5 | import pytest
6 | from events_manager import InMemoryEventsManager
7 | from kubernetes_event import WatchKubernetesEvent, WatchKubernetesEventType
8 | from .conftest import run_coroutines_with_timeout
9 |
# default batch size used by the get_events tests below
DEFAULT_MAX_SIZE = 2

@pytest.fixture
def in_memory_events_manager():
    """
    In memory events manager fixture - a fresh, empty manager per test
    """
    return InMemoryEventsManager()
18 |
19 |
@pytest.mark.asyncio
async def test_is_empty_sanity(in_memory_events_manager):
    """
    is_empty sanity test
    """
    # a freshly-created manager must report no queued events
    assert in_memory_events_manager.is_empty()
26 |
27 |
@pytest.mark.asyncio
async def test_is_empty_with_data(in_memory_events_manager):
    """
    is_empty test with events being written to events manager
    """
    assert in_memory_events_manager.is_empty()
    # fixed: the original line ended with a trailing comma, which made
    # `event` a 1-tuple instead of the event itself
    event = WatchKubernetesEvent(WatchKubernetesEventType.ADDED, {"a": "b"})
    await in_memory_events_manager.write_event(event)
    assert not in_memory_events_manager.is_empty()
    await run_coroutines_with_timeout((in_memory_events_manager.get_event(), ))
    assert in_memory_events_manager.is_empty()
39 |
40 |
@pytest.mark.asyncio
async def test_get_and_write_sanity(in_memory_events_manager):
    """
    sanity test for write_event and get_event
    """
    # fixed: the original line ended with a trailing comma, so the test
    # round-tripped a 1-tuple rather than the event object itself
    event = WatchKubernetesEvent(WatchKubernetesEventType.ADDED, {"a": "b"})
    await in_memory_events_manager.write_event(event)
    task = (await run_coroutines_with_timeout(
        (in_memory_events_manager.get_event(), )
    ))[0]
    assert event == task.result()
52 |
53 |
@pytest.mark.asyncio
async def test_get_events_sanity(in_memory_events_manager):
    """
    get_events returns all written events in write order
    """
    written = [
        WatchKubernetesEvent(WatchKubernetesEventType.ADDED, {"a": "b"}),
        WatchKubernetesEvent(WatchKubernetesEventType.DELETED, {"a": "c"}),
    ]
    for current in written:
        await in_memory_events_manager.write_event(current)
    finished = await run_coroutines_with_timeout(
        (in_memory_events_manager.get_events(max_size=DEFAULT_MAX_SIZE), )
    )
    assert written == finished[0].result()
69 |
70 |
@pytest.mark.asyncio
async def test_get_events_custom_max_size(in_memory_events_manager):
    """
    get_events honours a custom max_size, draining events in chunks
    """
    chunk_size = 1
    written = [
        WatchKubernetesEvent(WatchKubernetesEventType.ADDED, {"a": "b"}),
        WatchKubernetesEvent(WatchKubernetesEventType.DELETED, {"a": "c"}),
    ]
    for current in written:
        await in_memory_events_manager.write_event(current)
    for start in range(len(written)):
        finished = await run_coroutines_with_timeout(
            (in_memory_events_manager.get_events(max_size=chunk_size), )
        )
        assert written[start:start + chunk_size] == finished[0].result()
88 |
89 |
@pytest.mark.asyncio
async def test_get_events_zero_max_size(in_memory_events_manager):
    """
    get_events with max_size=0 resolves to an empty list
    """
    written = [
        WatchKubernetesEvent(WatchKubernetesEventType.ADDED, {"a": "b"}),
        WatchKubernetesEvent(WatchKubernetesEventType.DELETED, {"a": "c"}),
    ]
    for current in written:
        await in_memory_events_manager.write_event(current)
    finished = await run_coroutines_with_timeout(
        (in_memory_events_manager.get_events(max_size=0), )
    )
    assert finished[0].result() == []
105 |
106 |
@pytest.mark.asyncio
async def test_clean_sanity(in_memory_events_manager):
    """
    clean drops every previously written event
    """
    written = (
        WatchKubernetesEvent(WatchKubernetesEventType.ADDED, {"a": "b"}),
        WatchKubernetesEvent(WatchKubernetesEventType.DELETED, {"a": "c"}),
    )
    for current in written:
        await in_memory_events_manager.write_event(current)
    in_memory_events_manager.clean()
    assert in_memory_events_manager.is_empty()
120 |
121 |
@pytest.mark.asyncio
async def test_clean_no_events(in_memory_events_manager):
    """
    test for clean method where the manager is already empty
    """
    assert in_memory_events_manager.is_empty()
    # cleaning an empty manager must not raise and must keep it empty
    in_memory_events_manager.clean()
    assert in_memory_events_manager.is_empty()
130 |
--------------------------------------------------------------------------------
/pkg/cluster_agent/tests/system/conftest.py:
--------------------------------------------------------------------------------
1 | """
2 | Common and builtin fixtures & utilities
3 | """
4 | import asyncio
5 | import yaml
6 | import time
7 | from datetime import datetime, timedelta
8 | import pytest
9 | from kubernetes_asyncio import config
10 | from kubernetes_asyncio import client
11 | from kubernetes_asyncio import utils
12 |
13 |
def default_cluster_agent_deployment():
    """ Builds the default cluster agent V1Deployment object """
    labels = {
        'app': 'epsagon-cluster-agent'
    }
    agent_container = client.V1Container(
        name='cluster-agent',
        image='epsagon/cluster-agent:test',
        # required for pulling from the docker local loaded images
        # and not from Epsagon remote hub
        image_pull_policy='Never',
        env=[
            client.V1EnvVar(name='EPSAGON_TOKEN', value='123'),
            client.V1EnvVar(name='EPSAGON_CLUSTER_NAME', value='test'),
            client.V1EnvVar(name='EPSAGON_DEBUG', value='false'),
            client.V1EnvVar(name='EPSAGON_COLLECTOR_URL', value='http://localhost:5000'),
        ]
    )
    pod_template = client.V1PodTemplateSpec(
        metadata=client.V1ObjectMeta(labels=labels.copy()),
        spec=client.V1PodSpec(
            service_account_name='cluster-agent',
            containers=[agent_container],
        ),
    )
    return client.V1Deployment(
        api_version='apps/v1',
        kind='Deployment',
        metadata=client.V1ObjectMeta(name='cluster-agent', namespace='epsagon-monitoring'),
        spec=client.V1DeploymentSpec(
            selector=client.V1LabelSelector(match_labels=labels.copy()),
            replicas=1,
            template=pod_template,
        ),
    )
51 |
52 |
@pytest.fixture(scope='session')
def event_loop(request):
    """ Session-scoped event loop, closed on session teardown """
    session_loop = asyncio.get_event_loop_policy().new_event_loop()
    yield session_loop
    session_loop.close()
58 |
59 |
@pytest.fixture(scope='session', autouse=True)
async def load_cluster_config():
    """
    Loads the cluster config once per test session (autouse).
    Assumes `kubectl` is set to the environment test cluster.
    """
    await config.load_kube_config()
67 |
68 |
class ClusterAgentInstaller:
    """ Installs the Epsagon cluster agent (role + deployment) in a cluster """

    # how long to wait for an agent pod to become ready
    POD_READY_TIMEOUT = timedelta(seconds=30)
    # seconds between deployment readiness polls
    POD_POLL_INTERVAL_SECONDS = 2

    def __init__(self, api_client=None):
        """
        :param api_client: kubernetes API client to use; when None, the
            client's default configuration is used
        """
        self.apps_api_client = client.AppsV1Api(api_client=api_client)
        self.api_client = self.apps_api_client.api_client

    async def install_epsagon_role(self):
        """ Installs the Epsagon role required for the cluster agent """
        await utils.create_from_yaml(self.api_client, '../../epsagon_role.yaml', namespace="epsagon-monitoring")

    async def install_cluster_agent(self, agent_deployment: client.V1Deployment):
        """ Installs the cluster agent deployment in its own namespace """
        await self.apps_api_client.create_namespaced_deployment(
            agent_deployment.metadata.namespace,
            agent_deployment
        )

    async def _wait_for_deployment_pod(self, deployment_name: str, namespace: str):
        """
        Waits for a pod of the given deployment to be ready.
        :raises Exception: when no pod is ready within POD_READY_TIMEOUT
        """
        start = datetime.now()
        while datetime.now() - start < self.POD_READY_TIMEOUT:
            deployment = await self.apps_api_client.read_namespaced_deployment(
                deployment_name,
                namespace
            )
            ready_replicas = deployment.status.ready_replicas
            if ready_replicas and ready_replicas > 0:
                return
            # await asyncio.sleep (not time.sleep) so polling does not block
            # the event loop
            await asyncio.sleep(self.POD_POLL_INTERVAL_SECONDS)

        raise Exception("Cluster agent pod failed to start")

    async def install_all(
        self,
        agent_deployment=None,
        wait_for_agent_pod_initialization=True,
    ):
        """
        Installs the Epsagon role and the cluster agent deployment.
        :param agent_deployment: deployment to install; defaults to
            default_cluster_agent_deployment()
        :param wait_for_agent_pod_initialization: when True, blocks until an
            agent pod is ready
        """
        await self.install_epsagon_role()
        if not agent_deployment:
            agent_deployment = default_cluster_agent_deployment()
        await self.install_cluster_agent(agent_deployment)
        if wait_for_agent_pod_initialization:
            await self._wait_for_deployment_pod(
                agent_deployment.metadata.name,
                agent_deployment.metadata.namespace
            )
127 |
128 |
129 |
--------------------------------------------------------------------------------
/pkg/cluster_agent/kubernetes_event.py:
--------------------------------------------------------------------------------
1 | """
2 | Kubernetes events
3 | """
4 | import json
5 | import time
6 | from typing import Dict
7 | from enum import Enum
8 | from encoders import DateTimeEncoder
9 |
class KubernetesEventEncoder(DateTimeEncoder):
    """
    JSON Encoder for kubernetes events
    """
    def default(self, o): # pylint: disable=method-hidden
        """
        Overriding for specific serialization
        """
        if isinstance(o, KubernetesEvent):
            # NOTE(review): returning json.dumps(...) here embeds the event
            # as a JSON *string* inside the outer document (double encoding).
            # If the collector expects a nested object this should return
            # o.to_dict() instead - confirm against the collector contract.
            return json.dumps(o.to_dict(), cls=DateTimeEncoder)

        return super(KubernetesEventEncoder, self).default(o)
22 |
23 |
class KubernetesEventException(Exception):
    """ Base exception for kubernetes event errors """
    pass
26 |
class InvalidWatchEventException(KubernetesEventException):
    """ Raised when a raw watch event dict is malformed or unsupported """
    pass
29 |
class KubernetesEventType(Enum):
    """
    General kubernetes event types, used by Epsagon.
    The value is serialized into the event metadata `kind` field.
    """
    CLUSTER = "cluster"
    WATCH = "watch"
36 |
37 |
class WatchKubernetesEventType(Enum):
    """
    Kubernetes watch (from kubernetes apiserver) event types.
    Values match the `type` field of raw apiserver watch events.
    """
    ADDED = "ADDED"
    MODIFIED = "MODIFIED"
    DELETED = "DELETED"
45 |
46 |
class KubernetesEvent:
    """
    Base kubernetes event - carries an event type, the raw event data and
    a creation timestamp.
    """

    def __init__(self, event_type: KubernetesEventType, data):
        """
        :param event_type: the general event type
        :param data: the actual event data
        """
        self.event_type = event_type
        self.data = data
        # nanosecond creation time, serialized into the event metadata
        self.timestamp = time.time_ns()

    def get_formatted_payload(self):
        """
        Gets the kubernetes event data formatted.
        Subclasses may override to reshape the payload; the base
        implementation returns the raw data as given when initialized.
        """
        return self.data

    def to_dict(self):
        """
        Returns the event as a serializable dict
        """
        metadata = {
            "kind": self.event_type.value,
            "timestamp": self.timestamp,
        }
        return {"metadata": metadata, "payload": self.get_formatted_payload()}

    def __eq__(self, other):
        """
        Events are equal when they share the exact type, event type and
        data (the timestamp is deliberately ignored)
        """
        if type(self) != type(other):
            return False
        return self.event_type == other.event_type and self.data == other.data

    def __hash__(self):
        """ Hash of the serialized event, excluding the timestamp """
        serialized = self.to_dict()
        serialized["metadata"].pop("timestamp")
        return hash(str(serialized))
98 |
class WatchKubernetesEvent(KubernetesEvent):
    """
    Kubernetes watch event - wraps an object reported by an apiserver watch
    """
    OBJECT_FIELD_KEY = "object"
    EVENT_FIELDS = (OBJECT_FIELD_KEY, "type")

    def __init__(
        self,
        watch_event_type: WatchKubernetesEventType,
        watched_obj: Dict
    ):
        """
        :param watch_event_type: kubernetes watch type (ADDED/MODIFIED/DELETED)
        :param watched_obj: the actual watched object the event relates to
        """
        super().__init__(KubernetesEventType.WATCH, watched_obj)
        self.watch_event_type: WatchKubernetesEventType = watch_event_type

    @classmethod
    def from_watch_dict(cls, raw_data):
        """
        Instantiates a WatchKubernetesEvent from a raw watch event dict.
        :raises InvalidWatchEventException: on a missing field or an
            unsupported watch event type
        """
        for required_field in cls.EVENT_FIELDS:
            if required_field not in raw_data:
                raise InvalidWatchEventException(
                    f"Missing `{required_field}` in event"
                )

        obj = raw_data[cls.OBJECT_FIELD_KEY].to_dict()
        event_type = raw_data["type"]
        supported_values = (
            current_type.value for current_type in WatchKubernetesEventType
        )
        if event_type not in supported_values:
            raise InvalidWatchEventException(
                f"Unsupported `{event_type}` watch event type"
            )
        return cls(WatchKubernetesEventType(event_type), obj)

    def get_resource_version(self):
        """
        Gets the watched object's resource version (snake_case key, as
        produced by the kubernetes client's to_dict), or None if absent.
        """
        metadata = self.data.get("metadata", {})
        return metadata.get("resource_version")

    def get_formatted_payload(self):
        """
        Formats the payload as {"type": <watch type>, "object": <data>}
        """
        return {
            "type": self.watch_event_type.value,
            "object": super().get_formatted_payload(),
        }

    def __eq__(self, other):
        """
        Watch events are equal when they share the exact type, watch event
        type and data
        """
        if type(self) != type(other):
            return False
        return (
            self.watch_event_type == other.watch_event_type and
            self.data == other.data
        )

    def __hash__(self):
        """ Hash of the serialized event, excluding the timestamp """
        return super().__hash__()
--------------------------------------------------------------------------------
/pkg/cluster_agent/tests/ut/test_kubernetes_event.py:
--------------------------------------------------------------------------------
1 | """
2 | KubernetesEvent tests
3 | """
4 | import time
5 | import pytest
6 | from asynctest.mock import patch, MagicMock
7 | from kubernetes_event import (
8 | KubernetesEvent,
9 | WatchKubernetesEvent,
10 | KubernetesEventType,
11 | WatchKubernetesEventType,
12 | )
13 |
# fixed timestamp patched over time.time_ns for deterministic to_dict output
FAKE_TIMESTAMP = time.time_ns()
15 |
@pytest.mark.asyncio
async def test_initialize():
    """ KubernetesEvent can be constructed for every supported event type """
    for supported_type in KubernetesEventType:
        KubernetesEvent(supported_type, {})
21 |
22 |
@pytest.mark.asyncio
async def test_get_formatted_payload():
    """ the base event returns its raw data as the formatted payload """
    payload = {
        "A": "a"
    }
    event = KubernetesEvent(KubernetesEventType.CLUSTER, payload)
    assert payload == event.get_formatted_payload()
31 |
32 |
def _get_expected_dict(event):
    """ Builds the dict the given event is expected to serialize to """
    metadata = {
        "kind": event.event_type.value.lower(),
        "timestamp": FAKE_TIMESTAMP,
    }
    if type(event) == KubernetesEvent:
        return {"metadata": metadata, "payload": event.data}
    if type(event) == WatchKubernetesEvent:
        return {
            "metadata": metadata,
            "payload": {
                "type": event.watch_event_type.value,
                "object": event.data,
            }
        }

    raise Exception(f"Unsupported event type: {type(event)}")
56 |
57 |
@pytest.mark.asyncio
@patch("time.time_ns", MagicMock(return_value=FAKE_TIMESTAMP))
async def test_to_dict():
    """ to_dict test for the base event, with a frozen timestamp """
    event = KubernetesEvent(KubernetesEventType.CLUSTER, {"A": "a"})
    assert event.to_dict() == _get_expected_dict(event)
68 |
69 |
@pytest.mark.asyncio
async def test_equity():
    """ events with equal data compare equal; differing data do not """
    event_type = KubernetesEventType.CLUSTER
    matching_a = KubernetesEvent(event_type, {"A": "a"})
    matching_b = KubernetesEvent(event_type, {"A": "a"})
    different = KubernetesEvent(event_type, {"B": "a"})
    assert matching_a == matching_b
    assert matching_a != different
91 |
92 |
@pytest.mark.asyncio
async def test_watch_initialize():
    """ WatchKubernetesEvent can be built for every watch event type """
    for watch_type in WatchKubernetesEventType:
        WatchKubernetesEvent(watch_type, {})
98 |
99 |
@pytest.mark.asyncio
async def test_watch_get_formatted_payload():
    """ the watch event wraps its data under `type` and `object` keys """
    watched_obj = {
        "A": "a"
    }
    event = WatchKubernetesEvent(WatchKubernetesEventType.ADDED, watched_obj)
    expected = {
        "type": WatchKubernetesEventType.ADDED.value,
        "object": watched_obj,
    }
    assert event.get_formatted_payload() == expected
112 |
113 |
@pytest.mark.asyncio
@patch("time.time_ns", MagicMock(return_value=FAKE_TIMESTAMP))
async def test_watch_to_dict():
    """ to_dict test for a watch event, with a frozen timestamp """
    event = WatchKubernetesEvent(WatchKubernetesEventType.ADDED, {"A": "a"})
    assert event.to_dict() == _get_expected_dict(event)
124 |
125 |
@pytest.mark.asyncio
@patch("time.time_ns", MagicMock(return_value=FAKE_TIMESTAMP))
async def test_watch_get_resource_version():
    """ get_resource_version sanity test """
    resource_version = "3"
    # get_resource_version looks up the snake_case `resource_version` key
    # (as produced by the kubernetes client's to_dict); the camelCase
    # `resourceVersion` key used before was never found, so the final
    # assertion would have failed had this test not been shadowed
    data = {
        "A": "a",
        "metadata": {
            "resource_version": resource_version,
        }
    }
    event_type = WatchKubernetesEventType.ADDED
    event = WatchKubernetesEvent(event_type, data)
    assert event.to_dict() == _get_expected_dict(event)
    assert resource_version == event.get_resource_version()
141 |
142 |
@pytest.mark.asyncio
@patch("time.time_ns", MagicMock(return_value=FAKE_TIMESTAMP))
async def test_watch_get_resource_version_missing():
    """
    get_resource_version test - no resource version present.
    Renamed: this function previously shared its name with the test above,
    so the earlier definition was shadowed and never collected by pytest.
    """
    data = {
        "A": "a",
        "metadata2222": {
            "a": "b"
        }
    }
    event_type = WatchKubernetesEventType.ADDED
    event = WatchKubernetesEvent(event_type, data)
    assert event.to_dict() == _get_expected_dict(event)
    assert not event.get_resource_version()
157 |
158 |
@pytest.mark.asyncio
@patch("time.time_ns", MagicMock(return_value=FAKE_TIMESTAMP))
async def test_watch_to_dict_deleted_event():
    """
    to_dict test for a DELETED watch event.
    Renamed: this function previously duplicated `test_watch_to_dict`
    verbatim, shadowing the earlier definition so only one copy ran; the
    event type is varied here so the test adds coverage.
    """
    data = {
        "A": "a"
    }
    event_type = WatchKubernetesEventType.DELETED
    event = WatchKubernetesEvent(event_type, data)
    assert event.to_dict() == _get_expected_dict(event)
169 |
170 |
@pytest.mark.asyncio
async def test_watch_equity():
    """ watch events with equal data compare equal; differing data do not """
    event_type = WatchKubernetesEventType.ADDED
    matching_a = WatchKubernetesEvent(event_type, {"A": "a"})
    matching_b = WatchKubernetesEvent(event_type, {"A": "a"})
    different = WatchKubernetesEvent(event_type, {"B": "a"})
    assert matching_a == matching_b
    assert matching_a != different
192 |
--------------------------------------------------------------------------------
/pkg/cluster_agent/main.py:
--------------------------------------------------------------------------------
1 | """
2 | Main module - runs collector & cluster discovery
3 | """
4 | import time
5 | import logging
6 | import asyncio
7 | import socket
8 | import os
9 | import signal
10 |
11 | import aiofiles
12 | from datetime import datetime, timezone
13 | from traceback import format_exc
14 | from aiohttp import client_exceptions
15 | from kubernetes_asyncio import config, client
16 | from cluster_discovery import ClusterDiscovery
17 | from events_manager import InMemoryEventsManager
18 | from events_sender import EventsSender
19 | from epsagon_client import EpsagonClient, EpsagonClientException
20 | from forwarder import Forwarder
21 | from logger_configurer import LoggerConfigurer
22 |
# seconds to wait before restarting after a connection-level failure
RESTART_WAIT_TIME_SECONDS = 60
# Epsagon account token - mandatory, validated in main()
EPSAGON_TOKEN = os.getenv("EPSAGON_TOKEN")
# reported cluster name - mandatory, validated in main()
CLUSTER_NAME = os.getenv("EPSAGON_CLUSTER_NAME")
COLLECTOR_URL = os.getenv(
    "EPSAGON_COLLECTOR_URL",
    "https://collector.epsagon.com/resources/v1"
)
# feature flags - env values compared case-insensitively against "TRUE"
SHOULD_COLLECT_RESOURCES = os.getenv("EPSAGON_COLLECT_RESOURCES", "TRUE").upper() == "TRUE"
SHOULD_COLLECT_EVENTS = os.getenv("EPSAGON_COLLECT_EVENTS", "FALSE").upper() == "TRUE"
EPSAGON_CONF_DIR = "/etc/epsagon"
# file whose content ("true"/"false") toggles debug logging at runtime
IS_DEBUG_FILE_PATH = f"{EPSAGON_CONF_DIR}/epsagon_debug"
34 |
def _get_log_file_name():
    """
    Gets the log file name, according to the configured collected data
    """
    if SHOULD_COLLECT_RESOURCES:
        return (
            "resources_and_events_log" if SHOULD_COLLECT_EVENTS
            else "resources_log"
        )
    return "events_log"
44 |
# log file lives under HOME (fallback /tmp), named by the collection mode
LOG_FILE_PATH = f"{os.getenv('HOME', '/tmp')}/{_get_log_file_name()}"
LOG_FORMAT = "%(asctime)s %(levelname)s %(module)s - %(funcName)s: %(message)s"
# module-level singleton, used by the reload handler and the conf watcher
LOGGER_CONFIGURER = LoggerConfigurer(LOG_FORMAT, LOG_FILE_PATH)
48 |
49 |
async def _is_debug_mode():
    """
    Checks whether the agent runs in debug mode.
    Prefers the boolean content of IS_DEBUG_FILE_PATH; when the file cannot
    be read, falls back to the EPSAGON_DEBUG env var.
    """
    try:
        async with aiofiles.open(IS_DEBUG_FILE_PATH, "r") as conf_reader:
            content = await conf_reader.read()
        return content.lower() == "true"
    except Exception: # pylint: disable=broad-except
        return os.getenv("EPSAGON_DEBUG", "").lower() == "true"
63 |
64 |
def _reload_handler():
    """
    Reload configuration handler - reconfigures the main logger according
    to the current debug mode.
    Runs inside the event loop (registered via loop.add_signal_handler for
    SIGHUP), so the async debug-mode check is scheduled as a task and
    awaited there. Previously `_is_debug_mode()` was passed un-awaited,
    handing a coroutine object (always truthy) to update_logger_level, so
    debug mode was effectively always enabled on reload.
    """
    async def _apply_debug_mode():
        """ Awaits the debug-mode check and applies the result """
        LOGGER_CONFIGURER.update_logger_level(await _is_debug_mode())

    asyncio.get_running_loop().create_task(_apply_debug_mode())
71 |
72 |
73 | def _cancel_tasks(tasks):
74 | """
75 | Cancels the given tasks
76 | """
77 | for task in tasks:
78 | if not task.done():
79 | task.cancel()
80 |
81 |
async def _epsagon_conf_watcher(initial_debug_mode: bool):
    """
    Polls the epsagon conf every 120 seconds.
    When the debug mode changes, updates the main logger with the new
    logging level; errors during the check are deliberately swallowed
    (best-effort polling).
    """
    last_known_debug_mode = initial_debug_mode
    while True:
        try:
            refreshed_debug_mode = await _is_debug_mode()
            if refreshed_debug_mode != last_known_debug_mode:
                last_known_debug_mode = refreshed_debug_mode
                LOGGER_CONFIGURER.update_logger_level(last_known_debug_mode)
        except Exception: # pylint: disable=broad-except
            pass
        await asyncio.sleep(120)
98 |
99 |
async def run(is_debug_mode):
    """
    Runs the cluster discovery & forwarder until an unrecoverable error.
    Connection-level failures restart both workers after
    RESTART_WAIT_TIME_SECONDS; any other exception shuts the agent down.
    :param is_debug_mode: initial debug mode, handed to the conf watcher
    """
    # background task keeping the logger level in sync with the epsagon conf
    asyncio.create_task(_epsagon_conf_watcher(is_debug_mode))
    events_manager = InMemoryEventsManager()
    epsagon_client = await EpsagonClient.create(EPSAGON_TOKEN)
    events_sender = EventsSender(
        epsagon_client,
        COLLECTOR_URL,
        CLUSTER_NAME,
        EPSAGON_TOKEN
    )
    # discovery writes watch events into the manager; the forwarder drains
    # them and ships them through the events sender
    cluster_discovery = ClusterDiscovery(
        events_manager.write_event,
        should_collect_resources=SHOULD_COLLECT_RESOURCES,
        should_collect_events=SHOULD_COLLECT_EVENTS,
    )
    forwarder = Forwarder(
        events_manager,
        events_sender
    )
    while True:
        try:
            tasks = [
                asyncio.create_task(forwarder.start()),
                asyncio.create_task(cluster_discovery.start())
            ]
            await asyncio.gather(*tasks)
        except (
            client_exceptions.ClientError,
            socket.gaierror,
            ConnectionRefusedError,
            EpsagonClientException
        ):
            # transient connectivity problem: cancel both workers, drop any
            # buffered events and retry after a back-off period
            logging.error(
                "Connection error, restarting agent in %d seconds",
                RESTART_WAIT_TIME_SECONDS
            )
            _cancel_tasks(tasks)
            events_manager.clean()
            await asyncio.sleep(RESTART_WAIT_TIME_SECONDS)
        except Exception as exception:
            # unexpected failure: log, cancel the workers, close the HTTP
            # client and exit the loop (the process then ends)
            logging.error(str(exception))
            logging.error(format_exc())
            logging.info("Agent is exiting due to an unexpected error")
            _cancel_tasks(tasks)
            await epsagon_client.close()
            break
149 |
150 |
def main():
    """
    Entry point: configures logging, validates mandatory configuration,
    loads the in-cluster kube config and runs the agent loop with a SIGHUP
    reload handler installed.
    """
    # resolve debug mode before configuring the logger
    is_debug = asyncio.run(_is_debug_mode())
    LOGGER_CONFIGURER.configure_logger(is_debug)
    if not EPSAGON_TOKEN:
        logging.error(
            "Missing Epsagon token. "
            "Make sure to configure EPSAGON_TOKEN in cluster_agent_deployment.yaml"
        )
        return

    if not CLUSTER_NAME:
        logging.error(
            "Missing cluster name. "
            "Make sure to configure EPSAGON_CLUSTER_NAME in cluster_agent_deployment.yaml"
        )
        return

    # in-cluster config: uses the pod's service account credentials
    config.load_incluster_config()
    logging.info("Loaded cluster config")
    if is_debug:
        loaded_conf = client.configuration.Configuration.get_default_copy()
        logging.debug(
            "Loaded cluster configuration:\nHost: %s\n"
            "Using SSL Cert? %s\nUsing API token? %s",
            loaded_conf.host,
            bool(loaded_conf.ssl_ca_cert),
            bool(loaded_conf.api_key)
        )
    loop = asyncio.new_event_loop()
    # SIGHUP re-applies the debug-mode logger level (see _reload_handler)
    loop.add_signal_handler(signal.SIGHUP, _reload_handler)
    loop.run_until_complete(run(is_debug))
    loop.close()
183 |
184 | if __name__ == "__main__":
185 | main()
186 |
--------------------------------------------------------------------------------
/pkg/cluster_agent/tests/ut/test_forwarder.py:
--------------------------------------------------------------------------------
1 | """
2 | Forwarder tests
3 | """
4 | import pytest
5 | import asyncio
6 | from typing import List
7 | from events_manager import EventsManager, InMemoryEventsManager
8 | from forwarder import Forwarder
9 | from kubernetes_event import (
10 | KubernetesEvent,
11 | WatchKubernetesEvent,
12 | WatchKubernetesEventType,
13 | )
14 | from .conftest import run_coroutines_with_timeout
15 |
# mirror the forwarder defaults so the mocks can assert against them
DEFAULT_MAX_EVENTS_TO_READ = Forwarder.DEFAULT_MAX_EVENTS_TO_READ
DEFAULT_MAX_WORKERS = Forwarder.DEFAULT_MAX_WORKERS
# number of events generated per test
DEFAULT_EVENTS_COUNT = 1000
19 |
class EventsManagerMock(InMemoryEventsManager):
    """ EventsManager mock that asserts the max_size used by get_events """

    def __init__(self, expected_max_size):
        """
        :param expected_max_size: max_size every get_events call must use
        """
        super().__init__()
        self.expected_max_size = expected_max_size

    async def get_events(self, max_size: int, timeout: int=None) -> List[KubernetesEvent]:
        """
        Asserts the given max size before delegating to the real manager
        """
        assert max_size == self.expected_max_size
        read_events = await super().get_events(max_size)
        return read_events
35 |
class EventsSenderMock:
    """ EventsSender mock that tracks concurrency and collects sent events """

    def __init__(self, expected_max_workers: int, error: Exception = None):
        """
        :param expected_max_workers: upper bound on concurrent send_events
            calls the test expects
        :param error: optional exception raised by send_events
        """
        self.expected_max_workers = expected_max_workers
        self.current_workers_count = 0
        self.events = set()
        self.error = error

    async def send_events(self, events: List[KubernetesEvent]):
        """
        Collects the given events while asserting the concurrency bound.
        Sleeps briefly to simulate a "real" send; raises self.error
        immediately when configured.
        """
        if self.error:
            raise self.error

        self.current_workers_count += 1
        assert self.current_workers_count <= self.expected_max_workers
        self.events.update(events)
        await asyncio.sleep(0.1)
        self.current_workers_count -= 1
65 |
66 |
async def _write_events(
    events: List[KubernetesEvent],
    events_manager: EventsManager
):
    """
    Writes the given events through the events manager, yielding control to
    the event loop (asyncio.sleep(0)) after each write to simulate a real
    interleaving between writers and the forwarder.
    """
    for current_event in events:
        await events_manager.write_event(current_event)
        await asyncio.sleep(0)
80 |
81 |
def _generate_kubernetes_events(count: int) -> List[KubernetesEvent]:
    """ Generates `count` distinct ADDED watch events """
    generated = []
    for index in range(count):
        generated.append(
            WatchKubernetesEvent(WatchKubernetesEventType.ADDED, {index: index})
        )
    return generated
88 |
89 |
@pytest.mark.asyncio
async def test_sanity():
    """
    Runs the forwarder while events are written and verifies every event
    reaches the sender; the forwarder itself must keep running.
    """
    events_manager = EventsManagerMock(DEFAULT_MAX_EVENTS_TO_READ)
    events_sender = EventsSenderMock(DEFAULT_MAX_WORKERS)
    events = _generate_kubernetes_events(DEFAULT_EVENTS_COUNT)
    forwarder = Forwarder(events_manager, events_sender)
    writer_task, forwarder_task = await run_coroutines_with_timeout(
        (_write_events(events, events_manager), forwarder.start()),
        verify_tasks_finished=False,
        timeout=0.5,
    )
    assert writer_task.done()
    assert not forwarder_task.done()
    forwarder_task.cancel()
    assert events_sender.events == set(events)
110 |
111 |
@pytest.mark.asyncio
async def test_send_events_failure():
    """
    When send_events raises, the forwarder task must finish and surface
    that exact error.
    """
    events_manager = EventsManagerMock(DEFAULT_MAX_EVENTS_TO_READ)
    send_error = Exception("test error")
    events_sender = EventsSenderMock(DEFAULT_MAX_WORKERS, error=send_error)
    events = _generate_kubernetes_events(DEFAULT_EVENTS_COUNT)
    writer_task, forwarder_task = await run_coroutines_with_timeout(
        (
            _write_events(events, events_manager),
            Forwarder(events_manager, events_sender).start()
        ),
        verify_tasks_finished=False,
        timeout=0.5,
    )
    assert writer_task.done()
    assert forwarder_task.done()
    assert forwarder_task.exception() is send_error
133 |
134 |
@pytest.mark.asyncio
async def test_max_events_to_read():
    """
    Runs the forwarder with a custom max_events_to_read and verifies all
    events are sent while every read respects the configured bound.
    """
    max_events_to_read = 10
    events_manager = EventsManagerMock(max_events_to_read)
    events_sender = EventsSenderMock(DEFAULT_MAX_WORKERS)
    events = _generate_kubernetes_events(DEFAULT_EVENTS_COUNT)
    forwarder = Forwarder(
        events_manager,
        events_sender,
        max_events_to_read=max_events_to_read
    )
    writer_task, forwarder_task = await run_coroutines_with_timeout(
        (_write_events(events, events_manager), forwarder.start()),
        verify_tasks_finished=False,
        timeout=3,
    )
    assert writer_task.done()
    assert not forwarder_task.done()
    forwarder_task.cancel()
    assert events_sender.events == set(events)
161 |
@pytest.mark.asyncio
async def test_max_workers():
    """
    Runs the forwarder with a custom max_workers and verifies all events
    are sent while never exceeding the configured sender concurrency.
    """
    max_workers = 2
    events_manager = EventsManagerMock(DEFAULT_MAX_EVENTS_TO_READ)
    events_sender = EventsSenderMock(max_workers)
    events = _generate_kubernetes_events(DEFAULT_EVENTS_COUNT)
    forwarder = Forwarder(
        events_manager,
        events_sender,
        max_workers=max_workers
    )
    writer_task, forwarder_task = await run_coroutines_with_timeout(
        (_write_events(events, events_manager), forwarder.start()),
        verify_tasks_finished=False,
        timeout=1,
    )
    assert writer_task.done()
    assert not forwarder_task.done()
    forwarder_task.cancel()
    assert events_sender.events == set(events)
189 |
190 |
@pytest.mark.asyncio
async def test_invalid_max_workers():
    """
    Forwarder must reject max_workers < 1 with a ValueError
    """
    invalid_max_workers = 0
    events_manager = EventsManagerMock(DEFAULT_MAX_EVENTS_TO_READ)
    events_sender = EventsSenderMock(invalid_max_workers)
    with pytest.raises(ValueError):
        Forwarder(
            events_manager,
            events_sender,
            max_workers=invalid_max_workers
        )
206 |
207 |
@pytest.mark.asyncio
async def test_invalid_max_events_to_read():
    """
    Forwarder must reject max_events_to_read < 1 with a ValueError
    """
    invalid_max_events_to_read = 0
    events_manager = EventsManagerMock(invalid_max_events_to_read)
    events_sender = EventsSenderMock(1)
    with pytest.raises(ValueError):
        Forwarder(
            events_manager,
            events_sender,
            max_events_to_read=invalid_max_events_to_read
        )
224 |
--------------------------------------------------------------------------------
/pkg/cluster_agent/cluster_discovery.py:
--------------------------------------------------------------------------------
1 | """
2 | Cluster discovery - watch & publish events in the cluster
3 | """
4 | import asyncio
5 | import logging
6 | import socket
7 | from dataclasses import dataclass
8 | from typing import Callable, Any, Dict
9 | from traceback import format_exc
10 | import kubernetes_asyncio
11 | from aiohttp.client_exceptions import ClientError
12 | from kubernetes_event import (
13 | KubernetesEvent,
14 | WatchKubernetesEvent,
15 | WatchKubernetesEventType,
16 | KubernetesEventException,
17 | KubernetesEventType,
18 | )
19 |
@dataclass
class WatchTarget:
    """ A single watched resource kind """
    endpoint: Callable # list endpoint handed to the kubernetes watch
    last_resource_version: Any = None # used to avoid full resyncs
25 |
class ClusterDiscoveryException(Exception):
    """ Base exception for cluster discovery errors """
    pass
28 |
class ErrorWatchEventException(ClusterDiscoveryException):
    """ Error watch event exception (raising site is outside this chunk) """
    pass
31 |
class ClusterDiscovery:
    """
    Cluster resources discovery - watches cluster endpoints (pods, nodes,
    deployments etc.) and publishes every change as a kubernetes event to
    the given event handler.
    """

    # default time to wait between watch attempts
    RETRY_INTERVAL_SECONDS = 30

    def _create_watch_targets(
        self,
        should_collect_resources: bool,
        should_collect_events: bool,
    ) -> Dict[str, WatchTarget]:
        """
        Creates the watch targets map (resource kind -> WatchTarget).
        :param should_collect_resources: include pods, nodes, namespaces,
            deployments, daemon sets & stateful sets
        :param should_collect_events: include kubernetes Event resources
        """
        targets = {}
        if should_collect_resources:
            targets.update(
                {
                    "Pod": WatchTarget(self.client.list_pod_for_all_namespaces),
                    "Node": WatchTarget(self.client.list_node),
                    "Namespace": WatchTarget(self.client.list_namespace),
                    "Deployment": WatchTarget(
                        self.apps_api_client.list_deployment_for_all_namespaces,
                    ),
                    "DaemonSet": WatchTarget(
                        self.apps_api_client.list_daemon_set_for_all_namespaces,
                    ),
                    "StatefulSet": WatchTarget(
                        self.apps_api_client.list_stateful_set_for_all_namespaces,
                    ),
                }
            )
        if should_collect_events:
            targets["Event"] = WatchTarget(self.client.list_event_for_all_namespaces)
        return targets

    def __init__(
        self,
        event_handler,
        should_collect_resources=True,
        should_collect_events=False,
        api_client=None,
        retry_interval_seconds=RETRY_INTERVAL_SECONDS,
    ):
        """
        :param event_handler: async callable to write events to
        :param should_collect_resources: whether to watch cluster resources
        :param should_collect_events: whether to watch kubernetes Event objects
        :param api_client: of the cluster to discover. If not given, using the
        default one.
        :param retry_interval_seconds: time to wait before restarting all
            watches after a connection error
        :raises ValueError: if retry_interval_seconds is negative
        """
        # validate before constructing any API client (fail fast).
        # NOTE: the previous message claimed "must be bigger than 0" while
        # the check allows 0 - the message now matches the actual check.
        if retry_interval_seconds < 0:
            raise ValueError("Retry interval seconds must not be negative")

        self.event_handler = event_handler
        self.client = kubernetes_asyncio.client.CoreV1Api(api_client=api_client)
        self.version_client = kubernetes_asyncio.client.VersionApi(api_client=api_client)
        self.apps_api_client = kubernetes_asyncio.client.AppsV1Api(api_client=api_client)
        self.should_collect_resources = should_collect_resources
        self.watch_targets = self._create_watch_targets(
            should_collect_resources,
            should_collect_events
        )
        # tasks created by start(); initialized here so stop() is safe to
        # call even before start() (previously only the unused watch_tasks
        # attribute was initialized, making early stop() raise AttributeError)
        self.discover_tasks = []
        self.retry_interval_seconds = retry_interval_seconds

    def _update_resource_version(
        self,
        kind,
        target: WatchTarget,
        resource_version
    ):
        """
        Updates the last seen resource version for the given resource kind.
        """
        self.watch_targets[kind].last_resource_version = resource_version

    async def _get_initial_list(self, kind, target):
        """
        Performs the initial list of the given watch target endpoint and
        publishes every listed item as an ADDED event.
        :return: the list's resource version, to start the watch from
        """
        response = await target()
        for item in response.items:
            # the list API does not populate the per-item kind - set it so
            # the published event carries it
            item.kind = kind
            kubernetes_event = WatchKubernetesEvent(
                WatchKubernetesEventType.ADDED,
                item.to_dict()
            )
            await self.event_handler(kubernetes_event)

        return response.metadata.resource_version

    async def _run_watch(self, kind, target, stream):
        """
        Consumes the watch stream of the given watch target: publishes each
        valid event and tracks its resource version. Invalid events are
        skipped.
        :raises ErrorWatchEventException: when an error-type event is received
        """
        async for event in stream:
            try:
                event_type = event.get("type")
                if not event_type or event_type.lower() == "error":
                    raise ErrorWatchEventException("Received an error event")
                logging.debug("Received event: %s", event)
                kubernetes_event = WatchKubernetesEvent.from_watch_dict(event)
                await self.event_handler(kubernetes_event)
                resource_version = kubernetes_event.get_resource_version()
                self._update_resource_version(
                    kind,
                    target,
                    resource_version
                )
                logging.debug("%s new resource version: %s", kind, resource_version)
            except KubernetesEventException:
                logging.debug("Skipping invalid event")

    async def _start_watch(self, kind, target):
        """
        Watches given cluster endpoint.
        For each streamed event, creating KubernetesEvent and writing the
        event to the event handler. Ignoring invalid event objects.
        On client errors, restarts the watch from the last preserved
        resource version; on error events, restarts from scratch.
        """
        if not target.last_resource_version:
            # resource first time retrieval
            resource_version = await self._get_initial_list(kind, target.endpoint)
            self._update_resource_version(
                kind,
                target,
                resource_version
            )
        else:
            # continue watch from last preserved resource version
            resource_version = target.last_resource_version

        try:
            while True:
                logging.debug("Start watch for %s", kind)
                w = kubernetes_asyncio.watch.Watch()
                stream = w.stream(target.endpoint, resource_version=resource_version)
                await self._run_watch(kind, target, stream)
        except ClientError:
            logging.debug("Client Error: %s", format_exc())
            # resource version timeout, restarting watch
            # from last preserved resource version
            await self._start_watch(kind, target)
        except ErrorWatchEventException:
            logging.debug("Restarting %s watch due to an error event", kind)
            # reset the resource version - forces a full relist on restart
            self._update_resource_version(kind, target, None)
            await self._start_watch(kind, target)
        except asyncio.CancelledError:
            pass

    def _stop_all(self):
        """
        Stops all discover tasks, consuming exceptions of already-finished
        ones.
        """
        for task in self.discover_tasks:
            if not task.done():
                task.cancel()
            elif not task.cancelled():
                # retrieve the exception (if any) so asyncio doesn't log
                # "exception was never retrieved"
                task.exception()

    async def _collect_cluster_info(self):
        """
        Collects cluster level info (currently: the cluster version) and
        publishes it as a CLUSTER event. Version retrieval failures are
        logged and reported as a None version.
        """
        try:
            version = None
            try:
                version: str = (await self.version_client.get_code()).git_version
            except Exception as exception:
                logging.debug("Could not extract cluster version")
                logging.error(str(exception))
                logging.error(format_exc())
            try:
                data = {"version": version}
                kubernetes_event = KubernetesEvent(KubernetesEventType.CLUSTER, data)
                await self.event_handler(kubernetes_event)
            except KubernetesEventException:
                logging.debug("Failed to create cluster event")
                raise
        except asyncio.CancelledError:
            pass

    async def start(self):
        """
        Starts watch task per target (see _create_watch_targets) and runs
        more discovery tasks such as retrieving cluster level information.
        In case of watch resync issues, restarting all watches from the last
        preserved resource version.
        In case of other network issues, stopping all tasks and restarting
        after retry_interval_seconds.
        """
        try:
            await self._collect_cluster_info()
            self.discover_tasks = [
                asyncio.create_task(self._start_watch(kind, target))
                for kind, target in self.watch_targets.items()
            ]
            # gather on the running loop - the explicit `loop` argument was
            # deprecated in Python 3.8 and removed in 3.10
            await asyncio.gather(*self.discover_tasks)
        except (socket.gaierror, ClientError):
            self.stop()
            logging.error(
                "Connection error, retrying in %d seconds",
                self.retry_interval_seconds
            )
            await asyncio.sleep(self.retry_interval_seconds)
            await self.start()
        except asyncio.CancelledError:
            self.stop()

    def stop(self):
        """
        Stops the cluster discovery and resets all watch progress, so a
        following start() performs a full relist.
        """
        # reset resource version for all watch targets
        for target in self.watch_targets.values():
            target.last_resource_version = None
        self._stop_all()
259 |
260 |
--------------------------------------------------------------------------------
/pkg/cluster_agent/tests/ut/test_cluster_discovery.py:
--------------------------------------------------------------------------------
1 | """
2 | ClusterDiscovery tests
3 | """
4 | import asyncio
5 | import socket
6 | import pytest
7 | import kubernetes_asyncio
8 | from dataclasses import dataclass
9 | from typing import List, Dict, Set, Any
10 | from asynctest.mock import patch
11 | from cluster_discovery import ClusterDiscovery, WatchTarget
12 | from kubernetes_event import (
13 | KubernetesEvent,
14 | WatchKubernetesEvent,
15 | KubernetesEventType,
16 | WatchKubernetesEventType,
17 | )
18 | from .conftest import run_coroutines_with_timeout
19 |
20 |
# cluster version returned by the mocked version client
TEST_VERSION = "v1.18"
# cluster event expected when the version retrieval succeeds
CLUSTER_EVENT = KubernetesEvent(
    KubernetesEventType.CLUSTER,
    {
        "version": TEST_VERSION
    }
)
# cluster event expected when the version retrieval fails
INVALID_CLUSTER_EVENT = KubernetesEvent(
    KubernetesEventType.CLUSTER,
    {
        "version": None,
    }
)
# resource version returned by every mocked list response; WatchMock
# asserts that watches resume from this exact value
TEST_RESOURCE_VERSION = "123333"
35 |
class MockWatchTarget:
    """
    Fake watch-target endpoint. Acts as the list endpoint when awaited
    (returning a list response wrapping its configured resources) and
    exposes the watch events & errors WatchMock should stream.
    """
    def __init__(self, kind, resource_list, watch_events, list_error, stream_error, delay):
        """
        :param kind: resource kind this target serves
        :param resource_list: raw resources returned by the initial list
        :param watch_events: raw watch events to stream
        :param list_error: exception to raise on the initial list, if any
        :param stream_error: exception to raise when streaming, if any
        :param delay: delay (seconds) between streamed events
        """
        self.kind = kind
        self.resource_list = resource_list
        self.watch_events = watch_events
        self.list_error = list_error
        self.stream_error = stream_error
        self.delay = delay

    async def __call__(self, *arg, **kwargs):
        """
        Called when the cluster discovery performs its initial list.
        :raises: self.list_error, if configured
        :return: a list response wrapping self.resource_list
        """
        if self.list_error:
            raise self.list_error
        current_kind = self.kind
        class ItemWrapper:
            """ Wraps a raw resource, verifying the discovery sets its kind """
            def __init__(self, data: Dict):
                self.data = data
                self._kind = None
                self.expected_kind = current_kind

            @property
            def kind(self):
                # fixed: the getter previously evaluated self._kind without
                # returning it, so it always yielded None
                return self._kind

            @kind.setter
            def kind(self, kind):
                self._kind = kind

            def to_dict(self):
                # the discovery must set the correct kind before serializing
                assert self._kind == self.expected_kind
                return self.data

        @dataclass
        class ListResponse:
            """ Mimics a kubernetes list API response """
            @dataclass
            class Metadata:
                resource_version: Any = TEST_RESOURCE_VERSION

            items: List[ItemWrapper]
            metadata: Metadata = Metadata()

        return ListResponse([ItemWrapper(resource) for resource in self.resource_list])
80 |
class KubernetesResourceObject:
    """ Test double for a kubernetes resource object """

    def __init__(self, data: Dict):
        # the raw resource payload this object wraps
        self.data = data

    def to_dict(self):
        """ Returns the original raw payload """
        return self.data
89 |
class EventsManager:
    """
    Collects every event written to it, for validation by the tests
    """

    def __init__(self):
        # all events received so far (order is irrelevant, hence a set)
        self.events: Set[KubernetesEvent] = set()

    async def write_event(self, event: KubernetesEvent):
        """ Records the given event """
        self.events.add(event)
100 |
101 |
class EventsGenerator:
    """
    Async iterator yielding the configured events one by one, simulating a
    single watch target's stream
    """

    def __init__(self, events, delay=0):
        """
        :param events: to return one by one
        :param delay: seconds to wait between events
        """
        self.i = 0  # index of the next event to yield
        self.events = events
        self.delay = delay

    def __aiter__(self):
        return self

    async def __anext__(self):
        """
        Gets the next event, waiting self.delay seconds between events.
        Once all events were yielded, sleeping "forever" - to simulate a
        "real" scenario where the events stream doesn't end.
        """
        current = self.i
        if current >= len(self.events):
            # sleeps forever, simulating a real scenario
            await asyncio.sleep(1000)
        self.i = current + 1
        if current:
            await asyncio.sleep(self.delay)
        return self.events[current]
133 |
134 |
class WatchMock:
    """
    Mock for the kubernetes client Watch class
    """

    def stream(self, target: MockWatchTarget, resource_version=None):
        """
        Returns the target's events stream, or raises the stream error the
        MockWatchTarget is configured with
        """
        # the discovery must resume from the listed resource version
        assert resource_version == TEST_RESOURCE_VERSION
        error = target.stream_error
        if error:
            raise error
        return EventsGenerator(target.watch_events, delay=target.delay)
149 |
150 |
151 |
class ClientMock:
    """
    Mocked kubernetes API client, used for the cluster version retrieval
    """

    def __init__(self, error=None):
        """
        :param error: exception to raise from get_code, if any
        """
        self.error = error

    async def get_code(self):
        """
        Gets the cluster version code, or raises the configured error
        """
        if self.error:
            raise self.error

        class VersionResponse:
            """ Version response, as returned from the API server """
            def __init__(self, git_version):
                # the cluster version string
                self.git_version = git_version

        return VersionResponse(TEST_VERSION)
178 |
179 |
def _patch_cluster_discovery_watch_targets(
    cluster_discovery: ClusterDiscovery,
    watch_targets: List[MockWatchTarget],
    version_client
):
    """
    Patches the cluster discovery object in place - swaps all of its watch
    targets and its cluster version client with the given fakes.
    """
    patched_targets = {}
    for mock_target in watch_targets:
        patched_targets[mock_target.kind] = WatchTarget(mock_target)
    cluster_discovery.watch_targets = patched_targets
    cluster_discovery.version_client = version_client
193 |
194 |
@pytest.fixture
def raw_target_events() -> List[List[Dict]]:
    """
    Generate some events. Each events list item is for one `watch target`
    :return: A list of event lists
    """
    def raw_event(event_type: str, data: Dict) -> Dict:
        # builds a single raw watch event wrapping `data`
        return {
            "type": event_type,
            WatchKubernetesEvent.OBJECT_FIELD_KEY: KubernetesResourceObject(data),
        }

    return [
        [
            raw_event("ADDED", {"1a": "1a"}),
            raw_event("ADDED", {"1aa": "1aa"}),
            raw_event("MODIFIED", {"1m": "1m"}),
            raw_event("DELETED", {"1d": "1d"}),
        ],
        [
            raw_event("ADDED", {"2a": "2a"}),
            raw_event("MODIFIED", {"2m": "2m"}),
            raw_event("MODIFIED", {"2mm": "2mm"}),
            raw_event("DELETED", {"2d": "1d"}),
        ],
        [
            raw_event("ADDED", {"3a": "3a"}),
        ],
    ]
263 |
264 |
@pytest.fixture
def target_resource_lists() -> List[List[Dict]]:
    """
    Generate some resources. Each resources list item is for one `watch target`
    :return: A list of resources lists
    """
    return [
        [{"a": "A"}, {"T1": "T2"}],
        [{"x": "y"}],
        [{"Q": "T"}],
    ]
291 |
292 |
def _get_expected_events(
    resource_lists,
    raw_events: List[List[WatchKubernetesEvent]],
    cluster_event: KubernetesEvent
) -> Set[KubernetesEvent]:
    """
    Gets the expected events objects. Skips invalid events.
    If cluster_event is given then adding it to the expected events
    """
    events = set()
    for target_events in raw_events:
        for raw_event in target_events:
            # skip invalid test events (missing the object field)
            if WatchKubernetesEvent.OBJECT_FIELD_KEY not in raw_event:
                continue
            events.add(WatchKubernetesEvent.from_watch_dict(raw_event))

    # the initial list publishes every resource as an ADDED event
    events.update(
        WatchKubernetesEvent(WatchKubernetesEventType.ADDED, resource)
        for resource_list in resource_lists
        for resource in resource_list
    )

    if cluster_event:
        events.add(cluster_event)

    return events
316 |
317 |
async def _run_cluster_discovery(
    cluster_discovery,
    events_manager,
    resource_lists,
    raw_events,
    cluster_event,
    watch_stream_error=None,
    resource_list_error=None,
):
    """
    Runs the cluster discovery (cluster_discovery.start).
    Validates the task status (is running/task had an exception), and
    that the actual written events are the expected ones.
    :param cluster_discovery: the (already patched) discovery under test
    :param events_manager: EventsManager the discovery writes events to
    :param resource_lists: raw resources served by the mocked list endpoints
    :param raw_events: raw watch events served by the mocked streams
    :param cluster_event: the expected cluster-info event
    :param watch_stream_error: error the mocked watch streams raise, if any
    :param resource_list_error: error the mocked list endpoints raise, if any
    """
    # run start() briefly - it normally never returns, so don't wait for it
    task = (await run_coroutines_with_timeout(
        (cluster_discovery.start(),),
        verify_tasks_finished=False,
        timeout=0.2
    ))[0]
    if watch_stream_error:
        # streams failed - only the initial list + cluster events are expected
        expected_events = _get_expected_events(
            resource_lists,
            [],
            cluster_event
        )
        # NOTE: exact type check on purpose - Exception subclasses (e.g.
        # socket.gaierror) are handled by the discovery and keep it running
        if type(watch_stream_error) == Exception:
            # unhandled error, task should be done
            assert task.done()
            assert type(task.exception()) == Exception
        else: # task is expected to run as error should be handled
            assert not task.done()
    elif resource_list_error:
        # the initial list failed - only the cluster event is expected
        expected_events = _get_expected_events(
            [],
            [],
            cluster_event
        )
        assert task.done()
    else:
        expected_events = _get_expected_events(
            resource_lists,
            raw_events,
            cluster_event
        )
        # normal run - cluster discovery shouldn't stop
        assert not task.done()

    assert expected_events == events_manager.events
    if not task.done():
        task.cancel()
368 |
369 |
async def _test_cluster_discovery(
    resource_lists,
    raw_events,
    invalid_cluster_event=False,
    include_invalid_watch_event=False,
    watch_stream_error=None,
    resource_list_error=None,
) -> ClusterDiscovery:
    """
    Tests the cluster discovery run.
    :param resource_lists: resources to be returned by the initial lists
    :param raw_events: to be read by the cluster discovery watch tasks
    :param invalid_cluster_event: indicates whether should expect a
    valid/invalid cluster event
    :param include_invalid_watch_event: indicates whether to include an
    invalid watch event per target (expected to be skipped)
    :param watch_stream_error: error to be raised when the cluster discovery
    tries to watch its targets.
    :param resource_list_error: error to be raised by the initial lists
    :return: the cluster discovery object
    """
    cluster_event = CLUSTER_EVENT
    cluster_error = None
    if invalid_cluster_event:
        # make the mocked version client fail - a cluster event with a None
        # version is then expected
        cluster_event = INVALID_CLUSTER_EVENT
        cluster_error = Exception()

    version_client = ClientMock(error=cluster_error)
    manager = EventsManager()
    cluster_discovery = ClusterDiscovery(manager.write_event)
    if include_invalid_watch_event:
        # an event without the object field - must be skipped by the discovery
        for target_events in raw_events:
            target_events.append({ "invalid_event": "invalid"})

    # prepare watch targets - with events and possibly an error, if given
    targets = [
        MockWatchTarget(
            str(i),
            resource_lists[i],
            raw_events[i],
            resource_list_error,
            watch_stream_error,
            0.01
        )
        for i in range(len(raw_events))
    ]
    # replace watch targets & version client at cluster_discovery
    _patch_cluster_discovery_watch_targets(
        cluster_discovery, targets, version_client
    )
    # tests the cluster discovery run
    await _run_cluster_discovery(
        cluster_discovery,
        manager,
        resource_lists,
        raw_events,
        cluster_event=cluster_event,
        watch_stream_error=watch_stream_error,
        resource_list_error=resource_list_error,
    )
    return cluster_discovery
429 |
430 |
@pytest.mark.asyncio
@patch("kubernetes_asyncio.client")
@patch("kubernetes_asyncio.watch.Watch", WatchMock)
async def test_sanity(_, target_resource_lists, raw_target_events):
    """
    Sanity flow - the discovery reads multiple events from all targets
    """
    await _test_cluster_discovery(target_resource_lists, raw_target_events)
439 |
440 |
@pytest.mark.asyncio
@patch("kubernetes_asyncio.client")
@patch("kubernetes_asyncio.watch.Watch", WatchMock)
async def test_invalid_cluster_version(
    _,
    target_resource_lists,
    raw_target_events
):
    """
    Runs the discovery when the cluster version cannot be retrieved.
    The watch events should still be collected and the discovery should
    keep running.
    """
    await _test_cluster_discovery(
        target_resource_lists, raw_target_events, invalid_cluster_event=True
    )
458 |
459 |
@pytest.mark.asyncio
@patch("kubernetes_asyncio.client")
@patch("kubernetes_asyncio.watch.Watch", WatchMock)
async def test_invalid_watch_event(_, target_resource_lists, raw_target_events):
    """
    Runs the discovery with an invalid watch event in every stream.
    The valid events should be collected and the invalid ones skipped.
    """
    await _test_cluster_discovery(
        target_resource_lists, raw_target_events, include_invalid_watch_event=True
    )
474 |
475 |
@pytest.mark.asyncio
@patch("kubernetes_asyncio.client")
@patch("kubernetes_asyncio.watch.Watch", WatchMock)
async def test_watch_stream_unhandled_error(
    _,
    target_resource_lists,
    raw_target_events
):
    """
    An unhandled error in the watch stream - only the resource list events
    (and the cluster event) are expected, and the discovery task should
    finish with the raised error.
    """
    await _test_cluster_discovery(
        target_resource_lists,
        raw_target_events,
        invalid_cluster_event=False,
        watch_stream_error=Exception(),
    )
494 |
495 |
@pytest.mark.asyncio
@patch("kubernetes_asyncio.client")
@patch("kubernetes_asyncio.watch.Watch", WatchMock)
async def test_resource_list_unhandled_error(
    _,
    target_resource_lists,
    raw_target_events
):
    """
    An unhandled error at the initial resource list - no watch events are
    expected and the discovery task should finish with the raised error.
    """
    await _test_cluster_discovery(
        target_resource_lists,
        raw_target_events,
        invalid_cluster_event=False,
        resource_list_error=Exception(),
    )
514 |
515 |
@pytest.mark.asyncio
@patch("kubernetes_asyncio.client")
@patch("kubernetes_asyncio.watch.Watch", WatchMock)
async def test_watch_stream_handled_error(
    _,
    target_resource_lists,
    raw_target_events
):
    """
    A handled (connection) error in the watch stream - only the resource
    list events are expected and the discovery task should keep running.
    """
    await _test_cluster_discovery(
        target_resource_lists,
        raw_target_events,
        invalid_cluster_event=False,
        watch_stream_error=socket.gaierror,
    )
534 |
535 |
@pytest.mark.asyncio
@patch("kubernetes_asyncio.client")
async def test_invalid_retry_interval_seconds(_):
    """
    A negative retry interval must be rejected with a ValueError
    """
    with pytest.raises(ValueError):
        ClusterDiscovery(None, retry_interval_seconds=-1)
544 |
545 |
@pytest.mark.asyncio
@patch("kubernetes_asyncio.client")
@patch("kubernetes_asyncio.watch.Watch", WatchMock)
async def test_stop(_, target_resource_lists, raw_target_events):
    """
    Tests cluster_discovery.stop - all discover tasks should end up done
    or cancelled.
    """
    cluster_discovery = await _test_cluster_discovery(
        target_resource_lists, raw_target_events
    )
    cluster_discovery.stop()
    # cancellation is only delivered when the tasks are scheduled again -
    # give the loop a moment so the task states get updated
    await asyncio.sleep(0.1)
    assert all(
        task.done() or task.cancelled()
        for task in cluster_discovery.discover_tasks
    )
563 |
--------------------------------------------------------------------------------