├── .gitignore ├── data └── config.ini.template ├── setup.py ├── NOTICE ├── pyproject.toml ├── .pre-commit-config.yaml ├── setup.cfg ├── .github └── workflows │ └── publish-to-test-pypi.yml ├── src └── telemetry_peak_analyzer │ ├── models │ └── __init__.py │ ├── __init__.py │ ├── backends │ ├── tina.py │ └── __init__.py │ ├── __main__.py │ └── analyzers │ └── __init__.py ├── LICENSE ├── scripts ├── anonymize_telemetry.py └── create_telemetry.py ├── CONTRIBUTING.md ├── CODE_OF_CONDUCT.md ├── tests └── test_telemetry_peak_analyzer.py └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | src/telemetry_peak_analyzer.egg-info 2 | data/config.ini 3 | .tox 4 | venv 5 | global_table.json 6 | -------------------------------------------------------------------------------- /data/config.ini.template: -------------------------------------------------------------------------------- 1 | [tina_backend] 2 | hosts = 3 | port = 4 | timeout_ms = 30000 5 | parallel_bulk = true 6 | scan_size = 10000 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2021 VMware, Inc. 3 | # SPDX-License-Identifier: BSD-2 4 | """ 5 | See https://stackoverflow.com/questions/62983756/what-is-pyproject-toml-file-for 6 | """ 7 | import setuptools 8 | 9 | if __name__ == "__main__": 10 | setuptools.setup() 11 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Telemetry Peak Analyzer 2 | Copyright 2021 VMware, Inc. 3 | 4 | This product is licensed to you under the BSD-2 license (the "License"). You may not use this product except in compliance with the BSD-2 License. 5 | 6 | This product may include a number of subcomponents with separate copyright notices and license terms. Your use of these subcomponents is subject to the terms and conditions of the subcomponent's license, as noted in the LICENSE file. 
7 | 8 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel", 5 | ] 6 | build-backend = "setuptools.build_meta" 7 | 8 | [tool.tox] 9 | legacy_tox_ini = """ 10 | [tox] 11 | envlist = py36, py39 12 | isolated_build = True 13 | 14 | [testenv] 15 | commands=nosetests 16 | deps= 17 | ddt 18 | mock 19 | nose 20 | """ 21 | 22 | [tool.pylint.master] 23 | ignored-modules = "config" 24 | disable = """ 25 | W1514,F0010,useless-super-delegation,E1103,W0108,W0404,R0904,R0922,W0105, 26 | W0142,C0301,C0321,C0322,C0324,R,W0232,E1001,W0212,W0703,C,I0011,I0012,I0013,E0012""" 27 | 28 | [tool.black] 29 | line-length = 98 30 | include = "\\.pyi?$" 31 | exclude = """ 32 | /( 33 | \\.git 34 | | \\.hg 35 | | \\.mypy_cache 36 | | \\.tox 37 | | \\.venv 38 | | _build 39 | | buck-out 40 | | build 41 | | dist 42 | )/ 43 | """ 44 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks.git 3 | rev: v3.4.0 4 | hooks: 5 | - id: check-added-large-files 6 | - id: check-ast 7 | - id: check-byte-order-marker 8 | - id: check-docstring-first 9 | - id: check-executables-have-shebangs 10 | - id: check-json 11 | - id: check-merge-conflict 12 | - id: check-yaml 13 | - id: debug-statements 14 | - id: detect-private-key 15 | - id: end-of-file-fixer 16 | - id: trailing-whitespace 17 | - repo: https://github.com/psf/black 18 | rev: 21.7b0 19 | hooks: 20 | - id: black 21 | - repo: https://github.com/asottile/reorder_python_imports.git 22 | rev: v2.3.6 23 | hooks: 24 | - id: reorder-python-imports 25 | language_version: python3 26 | - repo: local 27 | hooks: 28 | - id: pylint 29 | name: pylint 30 | entry: pylint 31 | exclude: ^tests/ 32 | language: system 33 | types: [python] 34 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = telemetry-peak-analyzer 3 | version = 0.1.6 4 | author = Stefano Ortolani, Jason Zhang 5 | description = Analyzer to detect peaks when analyzing multi-attribute telemetry feeds 6 | long_description = file: README.md 7 | long_description_content_type = text/markdown 8 | url = https://github.com/vmware-labs/telemetry-peak-analyzer 9 | project_urls = 10 | Bug Tracker = https://github.com/vmware-labs/telemetry-peak-analyzer/issues 11 | classifiers = 12 | Development Status :: 3 - Alpha 13 | Intended Audience :: Developers 14 | License :: OSI Approved :: BSD License 15 | Programming Language :: Python :: 3 16 | Operating System :: OS Independent 17 | Topic :: Security 18 | Topic :: System :: Monitoring 19 | 20 | [options] 21 | package_dir = 22 | = src 23 | packages = find: 24 | python_requires = >=3.6 25 | install_requires = 26 | ijson 27 | 28 | [options.entry_points] 29 | console_scripts = 30 | telemetry-peak-analyzer = telemetry_peak_analyzer:parse_and_run_command 31 | 32 | [options.extras_require] 33 | tina = tina-client >=0.2.1 34 | 35 | [options.packages.find] 36 | where = src 37 | -------------------------------------------------------------------------------- /.github/workflows/publish-to-test-pypi.yml: 
-------------------------------------------------------------------------------- 1 | name: Publish Python 🐍 distributions 📦 to PyPI and TestPyPI 2 | 3 | on: push 4 | 5 | jobs: 6 | build-n-publish: 7 | name: Build and publish Python 🐍 distributions 📦 to PyPI and TestPyPI 8 | runs-on: ubuntu-20.04 9 | steps: 10 | - uses: actions/checkout@master 11 | - name: Set up Python 3.8 12 | uses: actions/setup-python@v1 13 | with: 14 | python-version: 3.8 15 | - name: Install pypa/build 16 | run: >- 17 | python -m 18 | pip install 19 | build 20 | --user 21 | - name: Build a binary wheel and a source tarball 22 | run: >- 23 | python -m 24 | build 25 | --sdist 26 | --wheel 27 | --outdir dist/ 28 | . 29 | - name: Publish distribution 📦 to Test PyPI 30 | uses: pypa/gh-action-pypi-publish@v1.5.0 31 | with: 32 | password: ${{ secrets.TEST_PYPI_API_TOKEN }} 33 | repository_url: https://test.pypi.org/legacy/ 34 | skip_existing: true 35 | - name: Publish distribution 📦 to PyPI 36 | if: startsWith(github.ref, 'refs/tags') 37 | uses: pypa/gh-action-pypi-publish@v1.5.0 38 | with: 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /src/telemetry_peak_analyzer/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 VMware, Inc. 2 | # SPDX-License-Identifier: BSD-2 3 | import collections 4 | 5 | 6 | TelemetryPeak = collections.namedtuple( 7 | "TelemetryPeak", 8 | [ 9 | "sub_count", 10 | "samp_count", 11 | "samp_sub_count_max", 12 | "samp_sub_count_mean", 13 | "samp_sub_count_std", 14 | "samp_sub_ratio", 15 | "global_samp_sub_count_max", 16 | "global_threshold_suggested", 17 | ], 18 | ) 19 | 20 | 21 | GlobalTable = collections.namedtuple( 22 | "GlobalTable", 23 | [ 24 | "start_ts", 25 | "end_ts", 26 | "window_count", 27 | "sub_count_avg", 28 | "sub_count_max", 29 | "samp_count_avg", 30 | "samp_count_max", 31 | "samp_sub_count_avg", 32 | "samp_sub_count_max", 33 | "threshold_suggested", 34 | ], 35 | ) 36 | 37 | 38 | LocalTableStats = collections.namedtuple( 39 | "LocalTableStats", 40 | [ 41 | "sub_count", 42 | "samp_count", 43 | "samp_sub_count_max", 44 | "samp_sub_count_mean", 45 | "samp_sub_count_std", 46 | "samp_sub_ratio", 47 | "cross_stats", 48 | ], 49 | ) 50 | 51 | 52 | GlobalTableStats = collections.namedtuple( 53 | "GlobalTableStats", 54 | [ 55 | "samp_sub_count_max", 56 | "threshold", 57 | ], 58 | ) 59 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Telemetry Peak Analyzer 2 | Copyright 2021 VMware, Inc. 3 | 4 | The BSD-2 license (the "License") set forth below applies to all parts of the Telemetry Peak Analyzer project. You may not use this file except in compliance with the License. 5 | 6 | BSD-2 License 7 | 8 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 9 | 10 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 11 | 12 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /scripts/anonymize_telemetry.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2021 VMware, Inc. 3 | # SPDX-License-Identifier: BSD-2 4 | import argparse 5 | import hashlib 6 | import json 7 | import os 8 | import sys 9 | 10 | 11 | def main() -> int: 12 | """Anonymize telemetry JSON files.""" 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument( 15 | "-i", 16 | "--input-file", 17 | dest="input_file", 18 | default=None, 19 | required=True, 20 | type=str, 21 | help="The input file", 22 | ) 23 | args = parser.parse_args() 24 | 25 | with open(args.input_file, "r") as f: 26 | telemetry_data = json.load(f) 27 | 28 | for item in telemetry_data: 29 | item["customer.channel"] = None 30 | item["customer.installation_type"] = None 31 | item["customer.type"] = None 32 | item["file.md5"] = hashlib.md5(item["file.md5"].encode("utf-8")).hexdigest() 33 | item["file.name"] = None 34 | item["file.sha1"] = hashlib.sha1(item["file.sha1"].encode("utf-8")).hexdigest() 35 | if item["file.sha256"]: 36 | item["file.sha256"] = hashlib.sha256(item["file.sha256"].encode("utf-8")).hexdigest() 37 | item["file.size"] = 0 38 | item["source.access_key_id"] = 0 39 | item["source.data_center"] = None 40 | item["source.geo.country_iso_code"] = None 41 | item["source.geo.location"] = "0.00,0.00" 42 | item["source.submitter_ip"] = "0.0.0.0" 43 | item["source.user_id"] = 0 44 | item["submission_id"] = 0 45 | item["task.portal_url"] = None 46 | item["task.uuid"] = "a" * 32 47 | 48 | file_path = f"{os.path.splitext(args.input_file)[0]}.anonymized.json" 49 | with open(file_path, "w") as f: 50 | json.dump(telemetry_data, f, indent=2, sort_keys=True) 51 | 52 | return 0 53 | 54 | 55 | if __name__ == "__main__": 56 | sys.exit(main()) 57 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | 2 | # Contributing to telemetry-peak-analyzer 3 | 4 | The telemetry-peak-analyzer project team welcomes contributions from the community. Before you start working with telemetry-peak-analyzer, please 5 | read our [Developer Certificate of Origin](https://cla.vmware.com/dco). All contributions to this repository must be 6 | signed as described on that page. Your signature certifies that you wrote the patch or have the right to pass it on 7 | as an open-source patch. 
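For example, if the project follows the standard DCO convention of a `Signed-off-by` trailer, each commit can be signed with git's `-s`/`--signoff` flag (shown here only as an illustration; the DCO page linked above is the authoritative reference):

``` shell
# Add a "Signed-off-by: Your Name <you@example.com>" trailer,
# taken from the name and email configured in git.
git commit -s -m "Describe your change"

# Add the trailer to the most recent commit if it was forgotten.
git commit --amend --signoff --no-edit
```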
8 | 
9 | ## Contribution Flow 
10 | 
11 | This is a rough outline of what a contributor's workflow looks like: 
12 | 
13 | - Create a topic branch from where you want to base your work 
14 | - Make commits of logical units 
15 | - Make sure your commit messages are in the proper format (see below) 
16 | - Push your changes to a topic branch in your fork of the repository 
17 | - Submit a pull request 
18 | 
19 | Example: 
20 | 
21 | ``` shell 
22 | git remote add upstream https://github.com/vmware-labs/telemetry-peak-analyzer.git 
23 | git checkout -b my-new-feature main 
24 | git commit -a 
25 | git push origin my-new-feature 
26 | ``` 
27 | 
28 | ### Staying In Sync With Upstream 
29 | 
30 | When your branch gets out of sync with the upstream `main` branch, use the following to update: 
31 | 
32 | ``` shell 
33 | git checkout my-new-feature 
34 | git fetch -a 
35 | git pull --rebase upstream main 
36 | git push --force-with-lease origin my-new-feature 
37 | ``` 
38 | 
39 | ### Updating pull requests 
40 | 
41 | If your PR fails to pass CI or needs changes based on code review, you'll most likely want to squash these changes into 
42 | existing commits. 
43 | 
44 | If your pull request contains a single commit or your changes are related to the most recent commit, you can simply 
45 | amend the commit. 
46 | 
47 | ``` shell 
48 | git add . 
49 | git commit --amend 
50 | git push --force-with-lease origin my-new-feature 
51 | ``` 
52 | 
53 | If you need to squash changes into an earlier commit, you can use: 
54 | 
55 | ``` shell 
56 | git add . 
57 | git commit --fixup <commit> 
58 | git rebase -i --autosquash main 
59 | git push --force-with-lease origin my-new-feature 
60 | ``` 
61 | 
62 | Be sure to add a comment to the PR indicating your new changes are ready to review, as GitHub does not generate a 
63 | notification when you `git push`. 
64 | 
65 | ### Code Style 
66 | 
67 | ### Formatting Commit Messages 
68 | 
69 | We follow the conventions on [How to Write a Git Commit Message](http://chris.beams.io/posts/git-commit/). 
70 | 
71 | Be sure to include any related GitHub issue references in the commit message. See 
72 | [GFM syntax](https://guides.github.com/features/mastering-markdown/#GitHub-flavored-markdown) for referencing issues 
73 | and commits. 
74 | 
75 | ## Reporting Bugs and Creating Issues 
76 | 
77 | When opening a new issue, try to roughly follow the commit message format conventions above. 
78 | 
-------------------------------------------------------------------------------- 
/scripts/create_telemetry.py: 
-------------------------------------------------------------------------------- 
1 | #!/usr/bin/env python 
2 | # Copyright 2021 VMware, Inc. 
3 | # SPDX-License-Identifier: BSD-2 
4 | import argparse 
5 | import csv 
6 | import datetime 
7 | import json 
8 | import os 
9 | import sys 
10 | 
11 | SEVERITY_BENIGN = "benign" 
12 | SEVERITY_MALICIOUS = "malicious" 
13 | SEVERITY_SUSPICIOUS = "suspicious" 
14 | SEVERITY_ALL = [ 
15 | SEVERITY_BENIGN, 
16 | SEVERITY_MALICIOUS, 
17 | SEVERITY_SUSPICIOUS, 
18 | ] 
19 | 
20 | 
21 | def datetime_str_to_ms(date_str: str, fmt: str = "%Y-%m-%d %H:%M:%S") -> int: 
22 | """ 
23 | Convert a given datetime string to milliseconds since the epoch.
24 | 25 | :param str date_str: the datetime string 26 | :param str fmt: the format 27 | :rtype: int 28 | """ 29 | date_obj = datetime.datetime.strptime(date_str, fmt) 30 | return int((date_obj - datetime.datetime.utcfromtimestamp(0)).total_seconds()) * 1000 31 | 32 | 33 | def main() -> int: 34 | """Convert (internal, deprecated) CSV telemetry files into JSON telemetry data.""" 35 | parser = argparse.ArgumentParser() 36 | parser.add_argument( 37 | "-i", 38 | "--input-file", 39 | dest="input_file", 40 | default=None, 41 | required=True, 42 | type=str, 43 | help="The input file", 44 | ) 45 | parser.add_argument( 46 | "-s", 47 | "--severity-filter", 48 | dest="severity_filter", 49 | choices=SEVERITY_ALL, 50 | default=None, 51 | help=f"Optional filter by severity ({','.join(SEVERITY_ALL)})", 52 | ) 53 | args = parser.parse_args() 54 | 55 | telemetry_data = [] 56 | with open(args.input_file, "r") as f: 57 | reader = csv.DictReader(f) 58 | for row in reader: 59 | if args.severity_filter and row["severity"] != args.severity_filter: 60 | continue 61 | telemetry_data.append( 62 | { 63 | "analysis.label": row["vt_label"], 64 | "customer.channel": row["channel"], 65 | "customer.installation_type": row["installation_type"], 66 | "customer.region": row["region"], 67 | "customer.sector": row["sector"], 68 | "customer.type": row["key_type"], 69 | "file.llfile_type": row["file_type"], 70 | "file.magic": None, 71 | "file.md5": row["md5"], 72 | "file.mime_type": row["mime_type"], 73 | "file.name": None, 74 | "file.sha1": row["sha1"], 75 | "file.sha256": None, 76 | "file.size": row["file_size"], 77 | "source.access_key_id": row["access_key_id"], 78 | "source.data_center": row["data_center"], 79 | "source.geo.country_iso_code": None, 80 | "source.geo.location": "0.00,0.00", 81 | "source.origin": row["origin"], 82 | "source.submitter_ip": row["submitter_ip"], 83 | "source.user_id": row["user_id"], 84 | "submission_id": row["submission_id"], 85 | "task.portal_url": None, 86 | "task.score": row["score"], 87 | "task.severity": row["severity"], 88 | "task.uuid": row["task_uuid"], 89 | "utc_timestamp": datetime_str_to_ms(row["ts"]), 90 | } 91 | ) 92 | 93 | file_path = f"{os.path.splitext(args.input_file)[0]}.json" 94 | with open(file_path, "w") as f: 95 | json.dump(telemetry_data, f, indent=2, sort_keys=True) 96 | 97 | return 0 98 | 99 | 100 | if __name__ == "__main__": 101 | sys.exit(main()) 102 | -------------------------------------------------------------------------------- /src/telemetry_peak_analyzer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 VMware, Inc. 2 | # SPDX-License-Identifier: BSD-2 3 | import datetime 4 | import json 5 | import logging 6 | import os 7 | import resource 8 | import sys 9 | from typing import Dict 10 | from typing import List 11 | from typing import Union 12 | 13 | 14 | def ms_to_datetime(milliseconds: int) -> datetime.datetime: 15 | """ 16 | Convert a given amount of milliseconds to a datetime object. 17 | 18 | :param int milliseconds: number of milliseconds 19 | :rtype: datetime.datetime 20 | :return: the datetime object 21 | """ 22 | return datetime.datetime.fromtimestamp(milliseconds / 1000) 23 | 24 | 25 | def datetime_to_sec(timestamp: datetime.datetime) -> int: 26 | """ 27 | Convert a given timestamp to seconds since the epoch. 
28 | 29 | :param datetime.datetime timestamp: the datetime object 30 | :rtype: int 31 | """ 32 | return int((timestamp - datetime.datetime.utcfromtimestamp(0)).total_seconds()) 33 | 34 | 35 | def datetime_to_ms(timestamp: datetime.datetime) -> int: 36 | """ 37 | Convert a given timestamp to milliseconds since the epoch. 38 | 39 | :param datetime.datetime timestamp: the datetime object 40 | :rtype: int 41 | """ 42 | return datetime_to_sec(timestamp) * 1000 43 | 44 | 45 | def save_to_json(obj: Union[Dict, List], file_path: str) -> None: 46 | """ 47 | Save python object. 48 | 49 | :param dict|list obj: the python object 50 | :param str file_path: the path where to save to 51 | """ 52 | with open(file_path, "w") as f: 53 | json.dump(obj, f, indent=2 * " ", sort_keys=True, default=str) 54 | 55 | 56 | class MemoryFootprintFormatter(logging.Formatter): 57 | """Special formatter keeping track how much memory is used.""" 58 | 59 | _DEFAULT_FMT = "%(levelname)s -> [%(asctime)s] %(message)s" 60 | _DEFAULT_DATEFMT = "%Y-%m-%d %H:%M:%S" 61 | 62 | def __init__( 63 | self, 64 | fmt: str = _DEFAULT_FMT, 65 | datefmt: str = _DEFAULT_DATEFMT, 66 | ): 67 | """Override method.""" 68 | super(MemoryFootprintFormatter, self).__init__(fmt, datefmt) 69 | 70 | @classmethod 71 | def configure_logging(cls, level: int) -> None: 72 | """ 73 | Configure some sane defaults. 74 | 75 | :param int level: the debugging level. 76 | """ 77 | handler = logging.StreamHandler() 78 | formatter = cls() 79 | handler.setFormatter(formatter) 80 | logging.root.addHandler(handler) 81 | logging.root.setLevel(level) 82 | 83 | @staticmethod 84 | def _read_procfs_memory(procfs_mem_key: str) -> float: 85 | """ 86 | Read memory usage from the /proc filesystem. 87 | 88 | :param str procfs_mem_key: the key to search 89 | :rtype: float 90 | :return: the value of the related key 91 | """ 92 | procfs_fn = os.path.join("/", "proc", str(os.getpid()), "status") 93 | procfs_stats_scale = { 94 | "kB": 1024.0, 95 | "mB": 1024.0 * 1024.0, 96 | "KB": 1024.0, 97 | "MB": 1024.0 * 1024.0, 98 | } 99 | with open(procfs_fn) as pf: 100 | pf_data = pf.read() 101 | # get VmKey line e.g. "VmRSS: 9999 kB\n ..." 102 | i = pf_data.index(procfs_mem_key) 103 | # remove white-spaces 104 | v = pf_data[i:].split(None, 3) 105 | if len(v) < 3: 106 | return 0.0 107 | # scale to the unit that was asked for 108 | return float(v[1]) * procfs_stats_scale[v[2]] 109 | 110 | @staticmethod 111 | def _get_memory_size() -> float: 112 | """ 113 | Retrieve total memory usage in bytes from /proc filesystem. 114 | 115 | :rtype: float 116 | :return: the value of the memory size 117 | """ 118 | return MemoryFootprintFormatter._read_procfs_memory("VmSize:") 119 | 120 | @staticmethod 121 | def _get_resident_memory_size() -> float: 122 | """ 123 | Retrieve resident memory usage in bytes from /proc filesystem. 124 | 125 | :rtype: float 126 | :return: the value of the memory size 127 | """ 128 | return MemoryFootprintFormatter._read_procfs_memory("VmRSS:") 129 | 130 | @classmethod 131 | def _get_memory_consumption(cls) -> float: 132 | """ 133 | Get the current memory consumption in megabytes. 
134 | 135 | :rtype: float 136 | :return: the memory consumption 137 | """ 138 | try: 139 | # We get the actual memory footprint 140 | res_mem_size = MemoryFootprintFormatter._get_resident_memory_size() 141 | except IOError: 142 | # On OSX we do not have /proc so we fallback to resource 143 | if sys.platform == "darwin": 144 | # IMPORTANT: on OSX results are given in Bytes while on Linux are given in KBytes 145 | res_mem_size = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss 146 | else: 147 | # If we are not on Linux or on OSX just give up 148 | res_mem_size = 1024.0 * 1024.0 149 | res_mem_size /= 1024.0 * 1024.0 150 | return res_mem_size 151 | 152 | def format(self, record: logging.LogRecord) -> str: 153 | """Override.""" 154 | # A record can be formatted multiple times by SimpleLogger. 155 | if not hasattr(record, "formatted"): 156 | record.msg = "[%04dmb] %s" % (self._get_memory_consumption(), record.msg) 157 | setattr(record, "formatted", True) 158 | return super(MemoryFootprintFormatter, self).format(record) 159 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | 2 | # Contributor Covenant Code of Conduct 3 | 4 | ## Our Pledge 5 | 6 | We as members, contributors, and leaders pledge to make participation in telemetry-peak-analyzer project and our 7 | community a harassment-free experience for everyone, regardless of age, body 8 | size, visible or invisible disability, ethnicity, sex characteristics, gender 9 | identity and expression, level of experience, education, socio-economic status, 10 | nationality, personal appearance, race, religion, or sexual identity 11 | and orientation. 12 | 13 | We pledge to act and interact in ways that contribute to an open, welcoming, 14 | diverse, inclusive, and healthy community. 15 | 16 | ## Our Standards 17 | 18 | Examples of behavior that contributes to a positive environment for our 19 | community include: 20 | 21 | * Demonstrating empathy and kindness toward other people 22 | * Being respectful of differing opinions, viewpoints, and experiences 23 | * Giving and gracefully accepting constructive feedback 24 | * Accepting responsibility and apologizing to those affected by our mistakes, 25 | and learning from the experience 26 | * Focusing on what is best not just for us as individuals, but for the 27 | overall community 28 | 29 | Examples of unacceptable behavior include: 30 | 31 | * The use of sexualized language or imagery, and sexual attention or 32 | advances of any kind 33 | * Trolling, insulting or derogatory comments, and personal or political attacks 34 | * Public or private harassment 35 | * Publishing others' private information, such as a physical or email 36 | address, without their explicit permission 37 | * Other conduct which could reasonably be considered inappropriate in a 38 | professional setting 39 | 40 | ## Enforcement Responsibilities 41 | 42 | Community leaders are responsible for clarifying and enforcing our standards of 43 | acceptable behavior and will take appropriate and fair corrective action in 44 | response to any behavior that they deem inappropriate, threatening, offensive, 45 | or harmful. 46 | 47 | Community leaders have the right and responsibility to remove, edit, or reject 48 | comments, commits, code, wiki edits, issues, and other contributions that are 49 | not aligned to this Code of Conduct, and will communicate reasons for moderation 50 | decisions when appropriate. 
51 | 52 | ## Scope 53 | 54 | This Code of Conduct applies within all community spaces, and also applies when 55 | an individual is officially representing the community in public spaces. 56 | Examples of representing our community include using an official e-mail address, 57 | posting via an official social media account, or acting as an appointed 58 | representative at an online or offline event. 59 | 60 | ## Enforcement 61 | 62 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 63 | reported to the community leaders responsible for enforcement at oss-coc@vmware.com. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 
-------------------------------------------------------------------------------- /tests/test_telemetry_peak_analyzer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 VMware, Inc. 2 | # SPDX-License-Identifier: BSD-2 3 | import configparser 4 | import datetime 5 | import unittest 6 | 7 | import ddt 8 | import mock 9 | from telemetry_peak_analyzer import analyzers 10 | from telemetry_peak_analyzer import backends 11 | from telemetry_peak_analyzer import models 12 | 13 | 14 | TEST_GLOBAL_TABLE_1 = { 15 | "malicious": { 16 | "file_type": models.GlobalTable( 17 | start_ts=datetime.datetime.strptime("2020-06-17", "%Y-%m-%d"), 18 | end_ts=datetime.datetime.strptime("2020-07-12", "%Y-%m-%d"), 19 | window_count=25, 20 | sub_count_avg=1, 21 | sub_count_max=10, 22 | samp_count_max=10, 23 | samp_count_avg=4, 24 | samp_sub_count_avg=0, 25 | samp_sub_count_max=0.0, 26 | threshold_suggested=10, 27 | ) 28 | } 29 | } 30 | 31 | TEST_GLOBAL_STATS_1 = { 32 | "malicious": { 33 | "file_type": models.GlobalTableStats( 34 | samp_sub_count_max=0.0, 35 | threshold=10, 36 | ) 37 | } 38 | } 39 | 40 | TEST_LOCAL_TABLE_1 = { 41 | "malicious": { 42 | "file_type": { 43 | "file.sha1": { 44 | "s1": 10, 45 | "s2": 10, 46 | }, 47 | "source.user_id": { 48 | "u1": 5, 49 | "u2": 5, 50 | }, 51 | "source.origin": { 52 | "o1": 5, 53 | "o2": 5, 54 | }, 55 | } 56 | }, 57 | } 58 | 59 | TEST_LOCAL_STATS_1 = { 60 | "malicious": { 61 | "file_type": models.LocalTableStats( 62 | sub_count=20, 63 | samp_count=2, 64 | samp_sub_count_max=10, 65 | samp_sub_count_mean=10, 66 | samp_sub_count_std=0.0, 67 | samp_sub_ratio=0.0, 68 | cross_stats={ 69 | "source.user_id": { 70 | "u1": 5, 71 | "u2": 5, 72 | }, 73 | "source.origin": { 74 | "o1": 5, 75 | "o2": 5, 76 | }, 77 | }, 78 | ) 79 | } 80 | } 81 | 82 | TEST_PEAKS_1 = { 83 | "malicious": { 84 | "file_type": models.TelemetryPeak( 85 | sub_count=20, 86 | samp_count=2, 87 | samp_sub_count_max=10, 88 | samp_sub_count_mean=10, 89 | samp_sub_count_std=0.0, 90 | samp_sub_ratio=0.0, 91 | global_samp_sub_count_max=0.0, 92 | global_threshold_suggested=10, 93 | ) 94 | } 95 | } 96 | 97 | TEST_GLOBAL_TABLE_2 = { 98 | "malicious": {}, 99 | } 100 | 101 | TEST_GLOBAL_STATS_2 = {} 102 | 103 | TEST_LOCAL_TABLE_2 = {} 104 | 105 | TEST_LOCAL_STATS_2 = {} 106 | 107 | TEST_PEAKS_2 = {} 108 | 109 | TEST_GLOBAL_TABLE_3 = {} 110 | 111 | TEST_GLOBAL_STATS_3 = {} 112 | 113 | TEST_LOCAL_TABLE_3 = {} 114 | 115 | TEST_LOCAL_STATS_3 = {} 116 | 117 | TEST_PEAKS_3 = {} 118 | 119 | 120 | @ddt.ddt 121 | class TestFileTypePeakAnalyzerTinaBackend(unittest.TestCase): 122 | """Class to test the manager.""" 123 | 124 | @ddt.data( 125 | (TEST_GLOBAL_TABLE_1, TEST_GLOBAL_STATS_1), 126 | (TEST_GLOBAL_TABLE_2, TEST_GLOBAL_STATS_2), 127 | (TEST_GLOBAL_TABLE_3, TEST_GLOBAL_STATS_3), 128 | ) 129 | def test_get_global_tables_stats(self, args): 130 | """Test the 'get_global_tables_stats' method.""" 131 | global_tables, expected_stats = args 132 | backend_mock = mock.MagicMock(spec=backends.TwoIndexTwoDimensionBackend) 133 | peak_analyzer = analyzers.FileTypePeakAnalyzer( 134 | conf=configparser.ConfigParser(), 135 | backend=backend_mock, 136 | start_ts=datetime.datetime.utcnow() - datetime.timedelta(days=7), 137 | end_ts=datetime.datetime.utcnow(), 138 | ) 139 | stats = peak_analyzer.get_global_tables_stats(global_tables) 140 | self.assertEqual(stats, expected_stats) 141 | 142 | @ddt.data( 143 | (TEST_LOCAL_TABLE_1, TEST_LOCAL_STATS_1), 144 | (TEST_LOCAL_TABLE_2, TEST_LOCAL_STATS_2), 145 
| (TEST_LOCAL_TABLE_3, TEST_LOCAL_STATS_3), 146 | ) 147 | def test_get_local_tables_stats(self, args): 148 | """Test the 'get_local_tables_stats' method.""" 149 | local_tables, expected_stats = args 150 | backend_mock = mock.MagicMock(spec=backends.TwoIndexTwoDimensionBackend) 151 | peak_analyzer = analyzers.FileTypePeakAnalyzer( 152 | conf=configparser.ConfigParser(), 153 | backend=backend_mock, 154 | start_ts=datetime.datetime.utcnow() - datetime.timedelta(days=7), 155 | end_ts=datetime.datetime.utcnow(), 156 | ) 157 | stats = peak_analyzer.get_local_tables_stats(local_tables) 158 | self.assertEqual(stats, expected_stats) 159 | 160 | @ddt.data( 161 | (TEST_GLOBAL_TABLE_1, TEST_LOCAL_TABLE_1, TEST_PEAKS_1), 162 | (TEST_GLOBAL_TABLE_2, TEST_LOCAL_TABLE_2, TEST_PEAKS_2), 163 | (TEST_GLOBAL_TABLE_3, TEST_LOCAL_TABLE_3, TEST_PEAKS_3), 164 | ) 165 | def test_get_peaks(self, args): 166 | """Test the 'get_peaks' method.""" 167 | global_tables, local_tables, expected_peaks = args 168 | backend_mock = mock.MagicMock(spec=backends.TwoIndexTwoDimensionBackend) 169 | peak_analyzer = analyzers.FileTypePeakAnalyzer( 170 | conf=configparser.ConfigParser(), 171 | backend=backend_mock, 172 | start_ts=datetime.datetime.utcnow() - datetime.timedelta(days=7), 173 | end_ts=datetime.datetime.utcnow(), 174 | ) 175 | peaks = peak_analyzer.get_peaks(global_tables, local_tables) 176 | self.assertEqual(peaks, expected_peaks) 177 | 178 | 179 | if __name__ == "__main__": 180 | unittest.main() 181 | -------------------------------------------------------------------------------- /src/telemetry_peak_analyzer/backends/tina.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 VMware, Inc. 2 | # SPDX-License-Identifier: BSD-2 3 | import collections 4 | import configparser 5 | import datetime 6 | import logging 7 | from typing import Any 8 | from typing import Dict 9 | from typing import List 10 | 11 | from telemetry_peak_analyzer import backends 12 | 13 | try: 14 | from tina_client.storage import readers 15 | except ImportError: 16 | raise ImportError("The Tina backend requires tina-client.") from None 17 | 18 | 19 | logging.getLogger("elasticsearch").setLevel(logging.WARNING) 20 | 21 | 22 | class TinaBackend(backends.TwoIndexTwoDimensionBackend): 23 | """ 24 | Backend using Tina (an internal Elasticsearch cluster). 25 | 26 | Note: in this class we hide all the specialization required to port the existing logic; note 27 | that this is unlikely to work with different analyzer and generalizing this bit further 28 | might not be worth the cost. 29 | """ 30 | 31 | @staticmethod 32 | def _get_aggregation_query( 33 | index: List[str], 34 | dimensions: List[str], 35 | dimensions_values: Dict[str, List[str]], 36 | ) -> Dict[str, Dict]: 37 | """ 38 | Return a fully loaded Elasticsearch aggregation query. 
39 | 40 | :param list[str] index: the index 41 | :param list[str] dimensions: the dimensions 42 | :param dict[str, list[str]] dimensions_values: the values of dimensions (if available) 43 | :rtype: dict[str, dict] 44 | :return: the aggregation to run 45 | """ 46 | 47 | def _get_aggregation() -> Dict[str, Dict]: 48 | """Return the aggregation for each dimension.""" 49 | return { 50 | "sub_per_day": { 51 | "date_histogram": { 52 | "field": index[0], 53 | "calendar_interval": "day", 54 | "min_doc_count": 0, 55 | }, 56 | "aggs": { 57 | "samp_count": { 58 | "cardinality": { 59 | "field": index[1], 60 | } 61 | }, 62 | "samp_sub_count": { 63 | "bucket_script": { 64 | "buckets_path": { 65 | "sub_count": "_count", 66 | "samp_count": "samp_count", 67 | }, 68 | "script": "params.sub_count/params.samp_count", 69 | } 70 | }, 71 | }, 72 | }, 73 | "samp_count_avg": {"avg_bucket": {"buckets_path": "sub_per_day.samp_count"}}, 74 | "samp_count_max": {"max_bucket": {"buckets_path": "sub_per_day.samp_count"}}, 75 | "samp_sub_count_max": { 76 | "max_bucket": {"buckets_path": "sub_per_day>samp_sub_count"} 77 | }, 78 | "sub_count_avg": {"avg_bucket": {"buckets_path": "sub_per_day._count"}}, 79 | "sub_count_max": {"max_bucket": {"buckets_path": "sub_per_day._count"}}, 80 | } 81 | 82 | dimension_0_values = dimensions_values.get(dimensions[0], []) 83 | return { 84 | "aggs": { 85 | "my_buckets": { 86 | "composite": { 87 | "sources": [{"dimension": {"terms": {"field": dimensions[1]}}}] 88 | }, 89 | "aggs": { 90 | dimension_0_values[0]: { 91 | "filter": {"term": {dimensions[0]: dimension_0_values[0]}}, 92 | "aggs": _get_aggregation(), 93 | }, 94 | dimension_0_values[1]: { 95 | "filter": {"term": {dimensions[0]: dimension_0_values[1]}}, 96 | "aggs": _get_aggregation(), 97 | }, 98 | }, 99 | } 100 | }, 101 | "size": 0, 102 | } 103 | 104 | @staticmethod 105 | def _parse_aggregation_output( 106 | buckets: List[Dict[str, Dict[str, Any]]], 107 | dimensions: List[str], 108 | dimensions_values: Dict[str, List[str]], 109 | ) -> Dict[str, Dict[str, Dict[str, float]]]: 110 | """ 111 | Parse the aggregation from Elasticsearch into a backend-independent format. 
112 | 113 | :param list[dict[str, dict[str, any]]] buckets: the aggregation buckets 114 | :param list[str] dimensions: the dimensions 115 | :param dict[str, list[str]] dimensions_values: the values of some dimensions (if available) 116 | :rtype: dict[str, dict[str, dict[str, float]]] 117 | :return: the aggregation results indexed by dimension 118 | """ 119 | aggregation_keys = frozenset( 120 | [ 121 | "sub_count_avg", 122 | "sub_count_max", 123 | "samp_count_avg", 124 | "samp_count_max", 125 | "samp_sub_count_max", 126 | ] 127 | ) 128 | dimension_0_values = dimensions_values.get(dimensions[0], []) 129 | ret = collections.defaultdict(dict) 130 | for bucket in buckets: 131 | for dimension_0 in dimension_0_values: 132 | if bucket[dimension_0]["doc_count"] > 0: 133 | dimension_1 = bucket["key"]["dimension"] 134 | ret[dimension_0][dimension_1] = { 135 | k: bucket[dimension_0][k]["value"] 136 | for k in bucket[dimension_0] 137 | if k in aggregation_keys 138 | } 139 | return ret 140 | 141 | def __init__(self, conf: configparser.ConfigParser, section_name: str) -> None: 142 | """Constructor.""" 143 | super(TinaBackend, self).__init__(conf, section_name) 144 | self._tina_reader = readers.BulkFileSubmissionReader(conf, section_name) 145 | self._logger.info( 146 | "Loading backend '%s' from section '%s'", 147 | self.__class__.__name__, 148 | section_name, 149 | ) 150 | 151 | def stats( 152 | self, 153 | start_date: datetime.datetime, 154 | end_date: datetime.datetime, 155 | index: List[str], 156 | dimensions: List[str], 157 | dimensions_values: Dict[str, List[str]], 158 | ) -> Dict[str, Dict[str, Dict[str, float]]]: 159 | """Implement interface.""" 160 | query = self._get_aggregation_query(index, dimensions, dimensions_values) 161 | buckets = [] 162 | while True: 163 | ret = self._tina_reader.search_raw( 164 | start_ts=start_date, 165 | end_ts=end_date, 166 | query=query, 167 | limit=0, 168 | ) 169 | buckets.extend(ret["aggregations"]["my_buckets"]["buckets"]) 170 | after_key = ret["aggregations"]["my_buckets"].get("after_key") 171 | if not after_key: 172 | break 173 | query["aggs"]["my_buckets"]["composite"]["after"] = after_key 174 | return self._parse_aggregation_output(buckets, dimensions, dimensions_values) 175 | 176 | def group_by( 177 | self, 178 | start_date: datetime.datetime, 179 | end_date: datetime.datetime, 180 | index: List[str], 181 | dimensions: List[str], 182 | ) -> List[Dict[str, str]]: 183 | """Implement interface.""" 184 | buckets = self._tina_reader.aggregate( 185 | start_ts=start_date, 186 | end_ts=end_date, 187 | terms=dimensions + [index[1]], 188 | limit=None, 189 | ) 190 | # This might or might not be a generator so in-place edit can't work 191 | new_buckets = [] 192 | terms_set = set(dimensions + [index[1]]) 193 | for bucket in buckets: 194 | for k in bucket.keys(): 195 | if k in terms_set: 196 | # Make sure we always deal with strings 197 | bucket[k] = str(bucket[k]) 198 | new_buckets.append(bucket) 199 | return new_buckets 200 | 201 | 202 | class NetworkTinaBackend(TinaBackend): 203 | """New backend specialized for network events.""" 204 | 205 | def __init__(self, conf: configparser.ConfigParser, section_name: str) -> None: 206 | """Constructor.""" 207 | backends.TwoIndexTwoDimensionBackend.__init__(self, conf, section_name) 208 | self._tina_reader = readers.BulkEventReader(conf, section_name) 209 | self._logger.info( 210 | "Loading backend '%s' from section '%s'", 211 | self.__class__.__name__, 212 | section_name, 213 | ) 214 | 215 | def stats( 216 | self, 217 | 
start_date: datetime.datetime, 218 | end_date: datetime.datetime, 219 | index: List[str], 220 | dimensions: List[str], 221 | dimensions_values: Dict[str, List[str]], 222 | ) -> Dict[str, Dict[str, Dict[str, float]]]: 223 | return super(NetworkTinaBackend, self).stats(start_date, end_date, index, dimensions, dimensions_values) 224 | 225 | def group_by( 226 | self, 227 | start_date: datetime.datetime, 228 | end_date: datetime.datetime, 229 | index: List[str], 230 | dimensions: List[str], 231 | ) -> List[Dict[str, str]]: 232 | return super(NetworkTinaBackend, self).group_by(start_date, end_date, index, dimensions) 233 | -------------------------------------------------------------------------------- /src/telemetry_peak_analyzer/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 VMware, Inc. 2 | # SPDX-License-Identifier: BSD-2 3 | import argparse 4 | import configparser 5 | import datetime 6 | import logging 7 | import os 8 | import sys 9 | from typing import Optional 10 | from typing import Tuple 11 | 12 | import telemetry_peak_analyzer 13 | 14 | 15 | def import_class(clazz_name: str) -> type: 16 | """ 17 | Import the module and return the class. 18 | 19 | Example: 20 | > clazz = get_clazz_from_module("package.module.ClassName") 21 | > logging.debug("Instantiating %s instance...", clazz.__name__) 22 | > obj = clazz(conf) 23 | 24 | :param str clazz_name: class name in 'module.Class' form 25 | :rtype: type 26 | :return: the loadable type 27 | :raises ImportError: if the class name is not valid 28 | """ 29 | if "." not in clazz_name: 30 | raise ImportError(f"Class '{clazz_name}' does not appear to be in module.Class form") 31 | 32 | try: 33 | only_clazz = clazz_name.split(".")[-1] 34 | only_module = ".".join(clazz_name.split(".")[:-1]) 35 | mod = __import__(only_module, fromlist=[only_clazz]) 36 | return getattr(mod, only_clazz) 37 | except AttributeError as ae: 38 | raise ImportError(f"Class '{clazz_name}' not found") from ae 39 | 40 | 41 | def is_valid_date(date_str: str) -> datetime.date: 42 | """ 43 | Validate a date and return a datetime object. 44 | 45 | :param str date_str: the datetime object as a string 46 | :rtype: datetime.date 47 | :return: the parsed date 48 | :raises ValueError: if the date is not valid 49 | """ 50 | try: 51 | return datetime.datetime.strptime(date_str, "%Y-%m-%d").date() 52 | except ValueError: 53 | raise ValueError(f"Not a valid date: '{date_str}'") from None 54 | 55 | 56 | def parse_date_options( 57 | start_ts: datetime.date, 58 | end_ts: datetime.date, 59 | delta: int, 60 | delay: int, 61 | ) -> Tuple[datetime.datetime, datetime.datetime]: 62 | """ 63 | Validate the date options. 
64 | 65 | :param datetime.date start_ts: the start of the time interval 66 | :param datetime.date end_ts: the end of the time interval 67 | :param int delta: the length of the time interval 68 | :param int delay: the delay of the time interval 69 | :rtype: tuple[datetime.datetime, datetime.datetime] 70 | :return: the validated datetime objects 71 | :raises ValueError: if the provided interval is not valid 72 | """ 73 | if start_ts and end_ts: 74 | if end_ts <= start_ts: 75 | raise ValueError(f"Invalid time interval {start_ts} - {end_ts}") 76 | else: 77 | utc_now = datetime.datetime.utcnow() 78 | end_ts = (utc_now - datetime.timedelta(days=delay)).date() 79 | start_ts = end_ts - datetime.timedelta(days=delta) 80 | return ( 81 | datetime.datetime.combine(start_ts, datetime.datetime.min.time()), 82 | datetime.datetime.combine(end_ts, datetime.datetime.min.time()), 83 | ) 84 | 85 | 86 | def run( 87 | config: configparser.ConfigParser, 88 | analyzer_class: type, 89 | backend_class: type, 90 | backend_input: str, 91 | start_ts: datetime.datetime, 92 | end_ts: datetime.datetime, 93 | threshold: int, 94 | global_table_path: str, 95 | output_file_path: Optional[str] = None, 96 | ) -> int: 97 | """Run the telemetry peak analyzer.""" 98 | logger = logging.getLogger(__name__) 99 | logger.info("Loading Peak Analyzer from %s to %s with t=%s", start_ts, end_ts, threshold) 100 | peak_analyzer = analyzer_class( 101 | conf=config, 102 | backend=backend_class(config, backend_input), 103 | start_ts=start_ts, 104 | end_ts=end_ts, 105 | ) 106 | try: 107 | logger.info("Loading global tables from file '%s'", global_table_path) 108 | global_tables = peak_analyzer.get_global_tables_from_file(global_table_path) 109 | except IOError as ioe: 110 | logger.info("\tFailed: %s", str(ioe)) 111 | logger.info("Loading global tables from the backend") 112 | global_tables = peak_analyzer.get_global_tables() 113 | logger.info("Loading local tables") 114 | local_tables = peak_analyzer.get_local_tables() 115 | peaks = peak_analyzer.get_peaks(global_tables, local_tables, threshold=threshold) 116 | 117 | logger.info("Getting peaks") 118 | for dimension_0 in peaks: 119 | for dimension_1 in peaks[dimension_0]: 120 | logger.info("TelemetryPeak(%s, %s)", dimension_0, dimension_1) 121 | peak = peaks[dimension_0][dimension_1] 122 | for k, v in peak._asdict().items(): 123 | logger.info("\t%s: %s", k, round(v, 2)) 124 | if output_file_path: 125 | logger.info("Saving output to: %s", output_file_path) 126 | telemetry_peak_analyzer.save_to_json(peaks, output_file_path) 127 | 128 | logger.info("Refreshing global tables") 129 | global_tables = peak_analyzer.refresh_global_tables(global_tables, local_tables) 130 | 131 | logger.info("Saving global tables to '%s'", global_table_path) 132 | telemetry_peak_analyzer.save_to_json(global_tables, global_table_path) 133 | return 0 134 | 135 | 136 | def parse_and_run_command(): 137 | """ 138 | Examples: 139 | # python -m telemetry_peak_analyzer \ 140 | -b telemetry_peak_analyzer.backends.JsonBackend -n "~/data.*.json" \ 141 | -s 2020-07-01 -e 2021-08-01 -t 10 142 | # python -m telemetry_peak_analyzer -c config.ini \ 143 | -b telemetry_peak_analyzer.backends.tina.TinaBackend -n tina_backend \ 144 | -s 2020-07-01 -e 2021-08-01 -t 10 145 | """ 146 | parser = argparse.ArgumentParser() 147 | parser.add_argument( 148 | "-c", 149 | "--config-file", 150 | dest="config_file", 151 | default="./data/config.ini", 152 | type=str, 153 | help="read config from here", 154 | ) 155 | # Time interval option 1: specify start 
and end datetime 156 | parser.add_argument( 157 | "-s", 158 | "--start-date", 159 | dest="start_date", 160 | default=None, 161 | type=is_valid_date, 162 | help="the start of the time interval in 'YYYY:mm:dd' format", 163 | ) 164 | parser.add_argument( 165 | "-e", 166 | "--end-date", 167 | dest="end_date", 168 | default=None, 169 | type=is_valid_date, 170 | help="the end of the time interval in 'YYYY:mm:dd' format", 171 | ) 172 | # Time interval option 2: specify the length and the delay of the time interval 173 | parser.add_argument( 174 | "-d", 175 | "--delta", 176 | dest="delta", 177 | default=1, 178 | type=int, 179 | help="the length of the time interval starting from now", 180 | ) 181 | parser.add_argument( 182 | "-k", 183 | "--delay", 184 | dest="delay", 185 | default=0, 186 | type=int, 187 | help="the delay of the time interval expressed in days", 188 | ) 189 | # Other options 190 | parser.add_argument( 191 | "-t", 192 | "--threshold", 193 | dest="threshold", 194 | default=None, 195 | type=int, 196 | help="the threshold used by the telemetry peak analyzer", 197 | ) 198 | parser.add_argument( 199 | "-a", 200 | "--analyzer", 201 | dest="analyzer_class", 202 | default="telemetry_peak_analyzer.analyzers.FileTypePeakAnalyzer", 203 | type=import_class, 204 | help="the full class name of the analyzer used to process telemetry data", 205 | ) 206 | parser.add_argument( 207 | "-b", 208 | "--backend-class", 209 | dest="backend_class", 210 | default="telemetry_peak_analyzer.backends.JsonBackend", 211 | type=import_class, 212 | help="the full class name of the backend used to read the telemetry from", 213 | ) 214 | parser.add_argument( 215 | "-n", 216 | "--backend-input", 217 | dest="backend_input", 218 | required=True, 219 | type=str, 220 | help="the backend input, section name when reading remotely or a file for local input", 221 | ) 222 | parser.add_argument( 223 | "-m", 224 | "--global-table", 225 | dest="global_table_path", 226 | default="global_table.json", 227 | type=str, 228 | help="the path to the global tables, either to load from, or to save to", 229 | ) 230 | parser.add_argument( 231 | "-o", 232 | "--output-file", 233 | dest="output_file", 234 | default=None, 235 | type=str, 236 | help="the path to output file", 237 | ) 238 | parser.add_argument( 239 | "-v", 240 | "--verbose", 241 | dest="verbose", 242 | default=False, 243 | action="store_true", 244 | help="whether to be verbose", 245 | ) 246 | 247 | # Parse options and init the logger 248 | args = parser.parse_args() 249 | conf = configparser.ConfigParser() 250 | conf.read(args.config_file) 251 | start_date, end_date = parse_date_options( 252 | args.start_date, 253 | args.end_date, 254 | args.delta, 255 | args.delay, 256 | ) 257 | log_level = logging.DEBUG if args.verbose else logging.INFO 258 | telemetry_peak_analyzer.MemoryFootprintFormatter.configure_logging(log_level) 259 | 260 | # Run 261 | return run( 262 | conf, 263 | args.analyzer_class, 264 | args.backend_class, 265 | args.backend_input, 266 | start_date, 267 | end_date, 268 | args.threshold, 269 | os.path.abspath(args.global_table_path), 270 | os.path.abspath(args.output_file) if args.output_file else None, 271 | ) 272 | 273 | 274 | if __name__ == "__main__": 275 | sys.exit(parse_and_run_command()) 276 | -------------------------------------------------------------------------------- /src/telemetry_peak_analyzer/backends/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 VMware, Inc. 
2 | # SPDX-License-Identifier: BSD-2 3 | import abc 4 | import collections 5 | import configparser 6 | import datetime 7 | import glob 8 | import logging 9 | import os 10 | import statistics 11 | from typing import Any 12 | from typing import Dict 13 | from typing import List 14 | from typing import TypeVar 15 | 16 | import ijson 17 | import telemetry_peak_analyzer 18 | 19 | BackendType = TypeVar("BackendType", bound="AbstractBackend") 20 | 21 | 22 | class AbstractBackend(abc.ABC): 23 | """Abstract backend.""" 24 | 25 | def __init__(self, conf: configparser.ConfigParser, section_name: str) -> None: 26 | """ 27 | Constructor. 28 | 29 | :param configparser.ConfigParser conf: the conf object 30 | :param str section_name: the name of the section 31 | """ 32 | self._conf = conf 33 | self._section_name = section_name 34 | self._logger = logging.getLogger(__name__) 35 | self._logger.info("Loading backend '%s'", self.__class__.__name__) 36 | 37 | @abc.abstractmethod 38 | def stats( 39 | self, 40 | start_date: datetime.datetime, 41 | end_date: datetime.datetime, 42 | index: List[str], 43 | dimensions: List[str], 44 | dimensions_values: Dict[str, List[str]], 45 | ) -> Dict[str, Any]: 46 | """ 47 | Create statistics. 48 | 49 | :param datetime.datetime start_date: the start of the time interval 50 | :param datetime.datetime end_date: the end of the time interval 51 | :param list[str] index: the index 52 | :param list[str] dimensions: the dimensions 53 | :param dict[str, list[str]] dimensions_values: the values of dimensions (if available) 54 | :rtype: dict[str, any] 55 | :return: statistics for each dimension combination 56 | """ 57 | 58 | @abc.abstractmethod 59 | def group_by( 60 | self, 61 | start_date: datetime.datetime, 62 | end_date: datetime.datetime, 63 | index: List[str], 64 | dimensions: List[str], 65 | ) -> List[Dict[str, str]]: 66 | """ 67 | Group by. 68 | 69 | :param datetime.datetime start_date: the start of the time interval 70 | :param datetime.datetime end_date: the end of the time interval 71 | :param list[str] index: the index 72 | :param list[str] dimensions: list of dimensions to group by 73 | :rtype: list[dict[str, str]] 74 | :return: buckets for each dimension combination 75 | """ 76 | 77 | 78 | class TwoIndexTwoDimensionBackend(AbstractBackend, abc.ABC): 79 | """Backend accepting an index with two fields and two dimensions.""" 80 | 81 | @abc.abstractmethod 82 | def stats( 83 | self, 84 | start_date: datetime.datetime, 85 | end_date: datetime.datetime, 86 | index: List[str], 87 | dimensions: List[str], 88 | dimensions_values: Dict[str, List[str]], 89 | ) -> Dict[str, Dict[str, Dict[str, float]]]: 90 | """ 91 | Create statistics. 92 | 93 | :param datetime.datetime start_date: the start of the time interval 94 | :param datetime.datetime end_date: the end of the time interval 95 | :param list[str] index: the index 96 | :param list[str] dimensions: the dimensions 97 | :param dict[str, list[str]] dimensions_values: the values of dimensions (if available) 98 | :rtype: dict[str, dict[str, dict[str, float]]] 99 | :return: a dictionary like: 100 | { 101 | "dimension_0_value": { 102 | "dimension_1_value": { 103 | "sub_count_avg": average of submissions 104 | "sub_count_max": maximum number of submissions 105 | "samp_count_avg": average of samples 106 | "samp_count_max": maximum number of samples 107 | "samp_sub_count_max": maximum ratio between samples and submissions 108 | } 109 | ... 110 | } 111 | ... 
112 | } 113 | """ 114 | 115 | @abc.abstractmethod 116 | def group_by( 117 | self, 118 | start_date: datetime.datetime, 119 | end_date: datetime.datetime, 120 | index: List[str], 121 | dimensions: List[str], 122 | ) -> List[Dict[str, str]]: 123 | """ 124 | Group by. 125 | 126 | :param datetime.datetime start_date: the start of the time interval 127 | :param datetime.datetime end_date: the end of the time interval 128 | :param list[str] index: the index 129 | :param list[str] dimensions: the dimensions 130 | :rtype: list[dict[str, str]] 131 | :return: a list of dictionaries like: 132 | [ 133 | { 134 | "dimension_0": "benign", 135 | "dimension_1": "ExcelMsDocFile", 136 | "index_1": "0015cc85a17d707e00b9881a149c232d181ad451", 137 | "additional_dimension_0": "3549", 138 | "additional_dimension_1": "API", 139 | "count": 61 140 | } 141 | ... 142 | ] 143 | """ 144 | 145 | 146 | class JsonBackend(TwoIndexTwoDimensionBackend): 147 | """Backend using JSON files.""" 148 | 149 | def __init__(self, conf: configparser.ConfigParser, file_path_wildcard: str) -> None: 150 | """Constructor.""" 151 | super(JsonBackend, self).__init__(conf, section_name="not_used") 152 | self._file_paths = [] 153 | for name in glob.glob(file_path_wildcard): 154 | self._file_paths.append(os.path.abspath(name)) 155 | self._logger.info("Loaded files:") 156 | for file_path in self._file_paths: 157 | self._logger.info("\t%s", file_path) 158 | 159 | def stats( 160 | self, 161 | start_date: datetime.datetime, 162 | end_date: datetime.datetime, 163 | index: List[str], 164 | dimensions: List[str], 165 | dimensions_values: Dict[str, List[str]], 166 | ) -> Dict[str, Dict[str, Dict[str, float]]]: 167 | """Implement interface.""" 168 | # we do several passes so to keep memory usage to a minimum 169 | # pass 1, let us get all the dates 170 | dates = set([]) 171 | for file_path in self._file_paths: 172 | with open(file_path, "r") as f: 173 | for json_doc in ijson.items(f, "item"): 174 | index_0 = telemetry_peak_analyzer.ms_to_datetime(json_doc[index[0]]) 175 | if start_date <= index_0 < end_date: 176 | dates.add(index_0.date()) 177 | 178 | # pass 2, for each date get the stats 179 | buckets = collections.defaultdict(lambda: collections.defaultdict(dict)) 180 | all_dims_0 = set([]) 181 | all_dims_1 = set([]) 182 | for day_date in sorted(dates): 183 | sub_count = collections.defaultdict(lambda: collections.defaultdict(int)) 184 | samp_set = collections.defaultdict(lambda: collections.defaultdict(set)) 185 | dims_0 = set([]) 186 | dims_1 = set([]) 187 | for file_path in self._file_paths: 188 | with open(file_path, "r") as f: 189 | for json_doc in ijson.items(f, "item"): 190 | index_0 = telemetry_peak_analyzer.ms_to_datetime( 191 | json_doc[index[0]] 192 | ).date() 193 | index_1 = json_doc[index[1]] 194 | dimension_0 = json_doc[dimensions[0]] 195 | dimension_1 = json_doc[dimensions[1]] 196 | if index_0 == day_date: 197 | sub_count[dimension_0][dimension_1] += 1 198 | samp_set[dimension_0][dimension_1].add(index_1) 199 | dims_0.add(dimension_0) 200 | dims_1.add(dimension_1) 201 | for dim_0 in dims_0: 202 | for dim_1 in dims_1: 203 | try: 204 | samp_sub_count = sub_count[dim_0][dim_1] / len(samp_set[dim_0][dim_1]) 205 | except ZeroDivisionError: 206 | samp_sub_count = 0 207 | buckets[day_date][dim_0][dim_1] = { 208 | "sub_count": sub_count[dim_0][dim_1], 209 | "samp_count": len(samp_set[dim_0][dim_1]), 210 | "samp_sub_count": samp_sub_count, 211 | } 212 | all_dims_0.update(dims_0) 213 | all_dims_1.update(dims_1) 214 | ret = 
collections.defaultdict(dict)
215 |         for dim_0 in all_dims_0:
216 |             for dim_1 in all_dims_1:
217 |                 d_slice = [buckets[x].get(dim_0, {}).get(dim_1, {}) for x in dates]
218 |                 ret[dim_0][dim_1] = {
219 |                     "sub_count_avg": statistics.mean([x.get("sub_count", 0) for x in d_slice]),
220 |                     "sub_count_max": max([x.get("sub_count", 0) for x in d_slice]),
221 |                     "samp_count_avg": statistics.mean([x.get("samp_count", 0) for x in d_slice]),
222 |                     "samp_count_max": max([x.get("samp_count", 0) for x in d_slice]),
223 |                     "samp_sub_count_max": max([x.get("samp_sub_count", 0) for x in d_slice]),
224 |                 }
225 |         return ret
226 | 
227 |     def group_by(
228 |         self,
229 |         start_date: datetime.datetime,
230 |         end_date: datetime.datetime,
231 |         index: List[str],
232 |         dimensions: List[str],
233 |     ) -> List[Dict[str, str]]:
234 |         """Implement interface."""
235 |         counters = collections.Counter()
236 |         dimensions = dimensions + [index[1]]
237 |         for file_path in self._file_paths:
238 |             with open(file_path, "r") as f:
239 |                 for json_doc in ijson.items(f, "item"):
240 |                     index_0 = telemetry_peak_analyzer.ms_to_datetime(json_doc[index[0]])
241 |                     if start_date <= index_0 < end_date:
242 |                         counters[tuple([json_doc[dimension] for dimension in dimensions])] += 1
243 |         ret = []
244 |         for key, count in counters.items():
245 |             value = {attr: key[idx] for idx, attr in enumerate(dimensions)}
246 |             value["count"] = count
247 |             ret.append(value)
248 |         return ret
249 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ![GitHub tag (latest SemVer)](https://img.shields.io/github/tag/vmware-labs/telemetry-peak-analyzer)
2 | ![GitHub](https://img.shields.io/pypi/l/telemetry-peak-analyzer)
3 | ![GitHub issues](https://img.shields.io/github/issues/vmware-labs/telemetry-peak-analyzer)
4 | 
5 | # Telemetry Peak Analyzer
6 | 
7 | ## Overview
8 | 
9 | Telemetry Peak Analyzer is a framework to analyze and detect peaks in telemetry data with multiple
10 | dimensions, indexes, and attributes. The analyzer detects meaningful peaks based on statistical
11 | measurements computed over a short _local window_ and a longer _global window_ of telemetry data:
12 | 
13 | - _Local window_ - a short data window in which we want to detect peaks of a given attribute
14 | or dimension, e.g., file type. During the detection process, the analyzer generates a local
15 | statistics table (LST) with all the necessary statistical measurements.
16 | 
17 | - _Global window_ - a longer, historical data window which serves as a global benchmark to
18 | determine if a detected peak within the _local window_ is meaningful. During the detection process,
19 | the analyzer generates (or updates) a global statistics table (GST) with all the necessary statistical
20 | measurements.
21 | 
22 | Telemetry data is dynamic; therefore, the global benchmark as reflected by the GST needs to be updated
23 | over time. To make the global benchmark adaptive, we use a sliding window mechanism which allows
24 | us to quickly derive the new GST from the previous GST and the LST.
25 | 
26 | *Note*: this implementation is a generalization of a research tool that was tailored to detect waves
27 | of malicious files sharing the same file type; to fully generalize terms and components, the source
28 | code relies on the following terms to describe different parts of the telemetry feed:
29 | - `index`: a tuple of attributes used to uniquely identify a telemetry data record.
30 | - `dimensions`: the attributes used to decompose a time-series into independent and orthogonal
31 | time-series.
32 | 
33 | Such generalization is not perfect (for example, the current implementation does not support more
34 | than two dimensions) and some backends have obvious limitations; things will improve as the
35 | analyzer supports more types of telemetry data.
36 | 
37 | ## Try it out
38 | 
39 | ### Build & Run
40 | 
41 | This package can be installed via pip: just run `pip install telemetry-peak-analyzer` or
42 | `pip install -e .`.
43 | 
44 | If you want to install the dependencies required by the `tina` backend (a custom backend based
45 | on Elasticsearch used internally), you should append the `[tina]` extra option; you might need to
46 | use double quotes when doing a dev install, i.e., `pip install -e ".[tina]"`; note that a valid
47 | configuration file might be required. See `data/config.ini.template` for an example.
48 | 
49 | Extra backends might require private dependencies; if that is the case, remember to select the
50 | internal index server using the `-i` option; if you require access, contact one of the maintainers.
51 | 
52 | ### Scripts
53 | 
54 | This package includes a console script ready to be used. Examples:
55 | 
56 | * `python -m telemetry_peak_analyzer -b
57 | telemetry_peak_analyzer.backends.JsonBackend -n "./data/telemetry_example_*" -t 10`:
58 | in this example the peak analyzer reads from some local files using the JSON backend
59 | (note the double quotes) and sets the threshold to 10; note that when `-t` is specified, it
60 | will override any suggested global threshold defined in the GST.
61 | * `python -m telemetry_peak_analyzer -c config.ini -b
62 | telemetry_peak_analyzer.backends.tina.TinaBackend -n tina_nlemea -d 2`:
63 | in this example the peak analyzer reads the last 2 days of data from Tina, using the
64 | configuration file `config.ini` and its `tina_nlemea` section to know how to connect to the
65 | backend.
66 | 
67 | ### Test
68 | There are a number of JSON files in the `data` directory for testing with the JSON backend.
69 | Note that all the test files have been completely anonymized, to the point that even file hashes
70 | do not refer to actual files anymore.
71 | 
72 | As mentioned above, the analyzer detects peaks based on statistical measurements of both a
73 | _local window_ and a _global window_. In the detailed example below, the process comprises two steps.
74 | 
75 | 1) `python -m telemetry_peak_analyzer -n ./data/telemetry_example_3.json -s 2020-11-01 -e 2020-11-04`
76 | 
77 | This step generates an initial GST table as the global benchmark from the initial
78 | _global window_, as specified by the `-s` and `-e` options in the command. This step is only required
79 | the first time the analyzer is executed. Subsequent runs will update the GST using previously
80 | computed GST and LST tables.
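The GST produced by this step is persisted to `global_table.json` (see the output below). Judging
from how `TwoIndexTwoDimensionAnalyzer.get_global_tables_from_file()` parses the file back, each
entry maps a pair of dimension values to a flat list of statistics; the following snippet is a
minimal, hypothetical inspection helper (not part of the package) written under that assumption:

```
import json

# Field order mirrors TwoIndexTwoDimensionAnalyzer.get_global_tables_from_file()
GST_FIELDS = [
    "start_ts", "end_ts", "window_count",
    "sub_count_avg", "sub_count_max",
    "samp_count_avg", "samp_count_max",
    "samp_sub_count_avg", "samp_sub_count_max",
    "threshold_suggested",
]

with open("global_table.json", "r") as f:
    gst = json.load(f)

for dimension_0, tables in gst.items():      # e.g., "malicious"
    for dimension_1, row in tables.items():  # e.g., "ZipArchiveFile"
        print(dimension_0, dimension_1, dict(zip(GST_FIELDS, row)))
```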
81 | 
82 | Expected output:
83 | 
84 | ```
85 | test@localhost telemetry-peak-analyzer % python -m telemetry_peak_analyzer -n ./data/telemetry_example_3.json -s 2020-11-01 -e 2020-11-04
86 | INFO -> [2021-09-15 12:00:11] [0010mb] Loading Peak Analyzer from 2020-11-01 00:00:00 to 2020-11-04 00:00:00 with t=None
87 | INFO -> [2021-09-15 12:00:11] [0010mb] Loading backend 'JsonBackend'
88 | INFO -> [2021-09-15 12:00:11] [0010mb] Loaded files:
89 | INFO -> [2021-09-15 12:00:11] [0010mb] /Users/test/telemetry-peak-analyzer/data/telemetry_example_3.json
90 | INFO -> [2021-09-15 12:00:11] [0010mb] Loading analyzer 'FileTypePeakAnalyzer' with backend 'JsonBackend'
91 | INFO -> [2021-09-15 12:00:11] [0010mb] Loading global tables from file '/Users/test/telemetry-peak-analyzer/global_table.json'
92 | INFO -> [2021-09-15 12:00:11] [0010mb] Failed: [Errno 2] No such file or directory: '/Users/test/telemetry-peak-analyzer/global_table.json'
93 | INFO -> [2021-09-15 12:00:11] [0010mb] Loading global tables from the backend
94 | INFO -> [2021-09-15 12:00:12] [0012mb] Loading local tables
95 | INFO -> [2021-09-15 12:00:12] [0013mb] Getting peaks
96 | INFO -> [2021-09-15 12:00:12] [0013mb] Refreshing global tables
97 | INFO -> [2021-09-15 12:00:12] [0013mb] Saving global tables to '/Users/test/telemetry-peak-analyzer/global_table.json'
98 | ```
99 | 
100 | As the output shows, the process creates a JSON file `global_table.json`, which is the initial
101 | GST table containing the global statistics.
102 | 
103 | 2) `python -m telemetry_peak_analyzer -n ./data/telemetry_example_3.json -s 2020-11-04 -e 2020-11-05`
104 | 
105 | This step finally detects peaks within a _local window_ (as specified by the `-s` and `-e` options)
106 | by leveraging the statistics in the GST and LST tables. This run will also update the GST (ideally,
107 | in production, you want to execute this second command on a daily basis to minimize the data to be
108 | processed).
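Internally, a combination of dimension values observed in the _local window_ is only reported as a
peak when its submission count reaches the global threshold (the `-t` override or the suggested
global threshold) and at least one of three statistical conditions holds. The snippet below is a
simplified, standalone restatement of `TwoIndexTwoDimensionAnalyzer._is_peak()`; the actual method
operates on the `LocalTableStats` and `GlobalTableStats` models:

```
PEAK_GLOBAL_COUNT_WEIGHT = 0.8
PEAK_MIN_SAMP_SUB_RATIO = 0.5

def is_peak(local: dict, global_: dict) -> bool:
    # No peak unless the local submission count reaches the global threshold.
    if local["sub_count"] < global_["threshold"]:
        return False
    # 1) mean submissions-per-sample well above the global maximum, or
    # 2) a single sample dominating the submissions, or
    # 3) submissions-per-sample spiking above mean + standard deviation.
    high_mean = (
        local["samp_sub_count_mean"]
        > PEAK_GLOBAL_COUNT_WEIGHT * global_["samp_sub_count_max"]
    )
    dominant = local["samp_sub_ratio"] > PEAK_MIN_SAMP_SUB_RATIO
    spiky = (
        local["samp_sub_count_max"]
        > local["samp_sub_count_mean"] + local["samp_sub_count_std"]
    )
    return high_mean or dominant or spiky
```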
109 | 
110 | Expected output:
111 | 
112 | ```
113 | test@localhost telemetry-peak-analyzer % python -m telemetry_peak_analyzer -n ./data/telemetry_example_3.json -s 2020-11-04 -e 2020-11-05
114 | INFO -> [2021-09-15 12:00:46] [0010mb] Loading Peak Analyzer from 2020-11-04 00:00:00 to 2020-11-05 00:00:00 with t=None
115 | INFO -> [2021-09-15 12:00:46] [0010mb] Loading backend 'JsonBackend'
116 | INFO -> [2021-09-15 12:00:46] [0010mb] Loaded files:
117 | INFO -> [2021-09-15 12:00:46] [0010mb] /Users/test/telemetry-peak-analyzer/data/telemetry_example_3.json
118 | INFO -> [2021-09-15 12:00:46] [0010mb] Loading analyzer 'FileTypePeakAnalyzer' with backend 'JsonBackend'
119 | INFO -> [2021-09-15 12:00:46] [0010mb] Loading global tables from file '/Users/test/telemetry-peak-analyzer/global_table.json'
120 | INFO -> [2021-09-15 12:00:46] [0010mb] Loading local tables
121 | INFO -> [2021-09-15 12:00:46] [0015mb] Getting peaks
122 | INFO -> [2021-09-15 12:00:46] [0015mb] TelemetryPeak(malicious, ZipArchiveFile)
123 | INFO -> [2021-09-15 12:00:46] [0015mb] sub_count: 11083
124 | INFO -> [2021-09-15 12:00:46] [0015mb] samp_count: 3028
125 | INFO -> [2021-09-15 12:00:46] [0015mb] samp_sub_count_max: 426
126 | INFO -> [2021-09-15 12:00:46] [0015mb] samp_sub_count_mean: 3.66
127 | INFO -> [2021-09-15 12:00:46] [0015mb] samp_sub_count_std: 11.54
128 | INFO -> [2021-09-15 12:00:46] [0015mb] samp_sub_ratio: 0.04
129 | INFO -> [2021-09-15 12:00:46] [0015mb] global_samp_sub_count_max: 2
130 | INFO -> [2021-09-15 12:00:46] [0015mb] global_threshold_suggested: 629
131 | INFO -> [2021-09-15 12:00:46] [0015mb] Refreshing global tables
132 | INFO -> [2021-09-15 12:00:46] [0015mb] Saving global tables to '/Users/test/telemetry-peak-analyzer/global_table.json'
133 | ```
134 | 
135 | As the output shows, the analyzer loads the GST generated in the first step, successfully detects a
136 | ZipArchiveFile-based peak within the _local window_, and prints out some key statistical
137 | measurements generated during the detection process.
138 | 
139 | At the end of the process, the GST table gets updated.
140 | 
141 | 
142 | ## Contributing
143 | 
144 | The telemetry-peak-analyzer project team welcomes contributions from the community. Before you
145 | start working with telemetry-peak-analyzer, please read our
146 | [Developer Certificate of Origin](https://cla.vmware.com/dco). All contributions to this repository
147 | must be signed as described on that page. Your signature certifies that you wrote the patch or
148 | have the right to pass it on as an open-source patch. For more detailed information,
149 | refer to [CONTRIBUTING.md](CONTRIBUTING.md).
150 | 
151 | ## Development
152 | 
153 | Create the virtual env:
154 | 
155 | `python3 -m venv venv`
156 | 
157 | Activate the virtual env:
158 | 
159 | `source ./venv/bin/activate`
160 | 
161 | Install `tox`:
162 | 
163 | `pip install tox`
164 | 
165 | Run tests:
166 | 
167 | `tox`
168 | 
169 | Due to a bug in `tox`, if you update the dependencies in `setup.cfg` the environments will not be
170 | re-created, leading to errors when running the tests
171 | (see https://github.com/tox-dev/tox/issues/93).
172 | As a workaround, pass the `--recreate` flag after updating the dependencies.
173 | 
174 | Before committing, install the package in dev mode (needed by `pylint`) following the instructions
175 | detailed in the `Build & Run` section.
176 | 
177 | Then install `pylint` and `pre-commit`:
178 | 
179 | `pip install pylint pre-commit`
180 | 
181 | Install the hook:
182 | 
183 | `pre-commit install`
184 | 
185 | If you want to run `pre-commit` on all files, use the following command:
186 | 
187 | `pre-commit run --all-files`
188 | 
189 | ## License
190 | [BSD 2-Clause](https://spdx.org/licenses/BSD-2-Clause.html)
191 | 
192 | ## Extra
193 | The peak analyzer can also be used to detect network peaks. To do so, pass the `-a` option to select the `NetworkTypePeakAnalyzer` and the `-b` option to select the matching backend, i.e., the `NetworkTinaBackend`.
194 | 
195 | Because network peaks are far more numerous than file peaks in a given time range, it is recommended to use the `-e` and `-d` parameters to constrain the date and time range.
196 | 
197 | Test command:
198 | `python -m telemetry_peak_analyzer -c ./data/config.ini -a telemetry_peak_analyzer.analyzers.NetworkTypePeakAnalyzer -b telemetry_peak_analyzer.backends.tina.NetworkTinaBackend -n tina_westus -e 2022-04-01 -d 1 -o ./output.json`
--------------------------------------------------------------------------------
/src/telemetry_peak_analyzer/analyzers/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 VMware, Inc.
2 | # SPDX-License-Identifier: BSD-2
3 | import abc
4 | import collections
5 | import configparser
6 | import datetime
7 | import json
8 | import logging
9 | import statistics
10 | from abc import ABC
11 | from typing import Dict
12 | from typing import List
13 | from typing import Optional
14 | from typing import Set
15 | 
16 | import telemetry_peak_analyzer
17 | from telemetry_peak_analyzer import backends
18 | from telemetry_peak_analyzer import models
19 | 
20 | 
21 | class AbstractAnalyzer(abc.ABC):
22 |     """Abstract analyzer."""
23 | 
24 |     DEFAULT_GLOBAL_TABLE_AGE = datetime.timedelta(days=7)
25 | 
26 |     @staticmethod
27 |     def _get_window_count(start_ts: int, end_ts: int) -> int:
28 |         """
29 |         Get the window count.
30 | 
31 |         :param int start_ts: milliseconds
32 |         :param int end_ts: milliseconds
33 |         :rtype: int
34 |         :return: the window count
35 |         """
36 |         delta_hours = (end_ts - start_ts) // 1000 // 60 // 60
37 |         window_count = delta_hours // 24 if delta_hours % 24 == 0 else delta_hours
38 |         # if start_ts and end_ts from the buckets are the same, then count it as one window
39 |         if window_count == 0:
40 |             window_count = 1
41 |         return window_count
42 | 
43 |     def __init__(
44 |         self,
45 |         conf: configparser.ConfigParser,
46 |         backend: backends.BackendType,
47 |         index: List[str],
48 |         dimensions: List[str],
49 |         start_ts: datetime.datetime,
50 |         end_ts: datetime.datetime,
51 |     ) -> None:
52 |         """
53 |         Constructor.
54 | 55 | :param configparser.ConfigParser conf: the conf object 56 | :param BackendType backend: the backend 57 | :param list[str] index: the index 58 | :param list[str] dimensions: the dimensions 59 | :param datetime.datetime start_ts: the beginning of the time interval 60 | :param datetime.datetime end_ts: the end of the time interval 61 | """ 62 | self._conf = conf 63 | self._backend = backend 64 | self._index = index 65 | self._dimensions = dimensions 66 | self._start_ts = start_ts 67 | self._end_ts = end_ts 68 | self._logger = logging.getLogger(__name__) 69 | self._logger.info( 70 | "Loading analyzer '%s' with backend '%s'", 71 | self.__class__.__name__, 72 | self._backend.__class__.__name__, 73 | ) 74 | 75 | @abc.abstractmethod 76 | def get_peaks( 77 | self, 78 | global_tables: Dict, 79 | local_tables: Dict, 80 | threshold: Optional[int] = None, 81 | ) -> Dict: 82 | """ 83 | Get the peaks. 84 | 85 | :param dict global_tables: the global tables 86 | :param dict local_tables: the local tables 87 | :param int|None threshold: optional threshold 88 | :rtype: dict 89 | :return: the telemetry peaks 90 | """ 91 | 92 | @abc.abstractmethod 93 | def refresh_global_tables(self, global_tables: Dict, local_tables: Dict) -> Dict: 94 | """ 95 | Refresh the global tables. 96 | 97 | :param dict global_tables: the global tables 98 | :param dict local_tables: the local tables 99 | :rtype: dict 100 | :return: the global tables 101 | """ 102 | 103 | @abc.abstractmethod 104 | def get_global_tables_stats( 105 | self, 106 | global_tables: Dict, 107 | threshold: Optional[int] = None, 108 | ) -> Dict: 109 | """ 110 | Get statistics from the global table. 111 | 112 | :param dict global_tables: the global tables 113 | :param int|None threshold: optional threshold 114 | :rtype: dict 115 | :return: the global tables stats 116 | """ 117 | 118 | @abc.abstractmethod 119 | def get_global_tables(self) -> Dict: 120 | """ 121 | Get the global tables from our backend. 122 | 123 | :rtype: dict 124 | :return: global tables 125 | """ 126 | 127 | @abc.abstractmethod 128 | def get_global_tables_from_file(self, file_path: str) -> Dict: 129 | """ 130 | Load global tables from a file. 131 | 132 | :param str file_path: the file path where the load from 133 | :rtype: dict 134 | :return: the loaded object 135 | """ 136 | 137 | @abc.abstractmethod 138 | def get_local_tables_stats(self, local_tables: Dict) -> Dict: 139 | """ 140 | Get statistics from the local table. 141 | 142 | :param dict local_tables: local tables 143 | :rtype: dict 144 | :return: the local tables stats 145 | """ 146 | 147 | @abc.abstractmethod 148 | def get_local_tables(self) -> Dict: 149 | """ 150 | Get the local tables from our backend. 
151 | 152 | :rtype: dict 153 | :return: the local tables 154 | """ 155 | 156 | 157 | class TwoIndexTwoDimensionAnalyzer(AbstractAnalyzer, ABC): 158 | """Analyzer using index and dimension with cardinality set to two.""" 159 | 160 | LOCAL_MIN_COUNT = 50 161 | 162 | PEAK_GLOBAL_COUNT_WEIGHT = 0.8 163 | 164 | PEAK_MIN_SAMP_SUB_RATIO = 0.5 165 | 166 | CROSS_DIMENSIONS = [] 167 | 168 | DIMENSIONS_METADATA = {} 169 | 170 | def _get_dimension_threshold(self, dimension: str, value: str) -> int: 171 | """Get the threshold for a given dimension and value, if available, 0 otherwise.""" 172 | try: 173 | return self.DIMENSIONS_METADATA[dimension]["threshold"][value] 174 | except KeyError: 175 | return 0 176 | 177 | def _get_dimension_values(self, dimension: str) -> List[str]: 178 | """Get all values that a dimension can exhibit, if available.""" 179 | try: 180 | return self.DIMENSIONS_METADATA[dimension]["values"] 181 | except KeyError: 182 | return [] 183 | 184 | def _is_peak( 185 | self, 186 | local_table_stats: models.LocalTableStats, 187 | global_table_stats: models.GlobalTableStats, 188 | ) -> bool: 189 | """ 190 | Return whether the peak is valid. 191 | 192 | :param LocalTableStats local_table_stats: stats about the local table 193 | :param GlobalTableStats global_table_stats: stats about the global table 194 | :rtype: bool 195 | :return: whether the peak is valid 196 | """ 197 | if local_table_stats.sub_count < global_table_stats.threshold: 198 | return False 199 | temp = self.PEAK_GLOBAL_COUNT_WEIGHT * global_table_stats.samp_sub_count_max 200 | is_susp_overall_sub_samp_r = local_table_stats.samp_sub_count_mean > temp 201 | is_dominant_samp_sub = local_table_stats.samp_sub_ratio > self.PEAK_MIN_SAMP_SUB_RATIO 202 | temp = local_table_stats.samp_sub_count_mean + local_table_stats.samp_sub_count_std 203 | is_susp_samp_sub_var = local_table_stats.samp_sub_count_max > temp 204 | return is_susp_overall_sub_samp_r or is_dominant_samp_sub or is_susp_samp_sub_var 205 | 206 | def _update_global_table( 207 | self, 208 | global_table: models.GlobalTable, 209 | local_table: Dict[str, Dict[str, int]], 210 | ) -> models.GlobalTable: 211 | """ 212 | Update the global table 213 | 214 | :param GlobalTable global_table: the global table 215 | :param dict[str, dict[str, int]] local_table: the local table 216 | :rtype: GlobalTable 217 | :return: a refreshed global table 218 | """ 219 | window_count = global_table.window_count + 1 220 | sub_count = sum(local_table[self._index[1]].values()) 221 | samp_count = len(local_table[self._index[1]]) 222 | try: 223 | samp_sub_count = int(round(sub_count / samp_count)) 224 | except ZeroDivisionError: 225 | samp_sub_count = 0 226 | sub_count_avg = int( 227 | round( 228 | (global_table.sub_count_avg * global_table.window_count + sub_count) 229 | / window_count 230 | ) 231 | ) 232 | samp_count_avg = int( 233 | round( 234 | (global_table.samp_count_avg * global_table.window_count + samp_count) 235 | / window_count 236 | ) 237 | ) 238 | samp_sub_count_avg = int( 239 | round( 240 | (global_table.samp_sub_count_avg * global_table.window_count + samp_sub_count) 241 | / window_count 242 | ) 243 | ) 244 | return models.GlobalTable( 245 | start_ts=global_table.start_ts, 246 | end_ts=self._end_ts, 247 | window_count=window_count, 248 | sub_count_avg=sub_count_avg, 249 | sub_count_max=max(global_table.sub_count_max, sub_count), 250 | samp_count_avg=samp_count_avg, 251 | samp_count_max=max(global_table.samp_count_max, samp_count), 252 | samp_sub_count_avg=samp_sub_count_avg, 253 | 
samp_sub_count_max=max(global_table.samp_sub_count_max, samp_sub_count), 254 | threshold_suggested=max(global_table.threshold_suggested, sub_count_avg), 255 | ) 256 | 257 | def _infer_global_table( 258 | self, 259 | local_table: Dict[str, Dict[str, int]], 260 | threshold: int, 261 | ) -> models.GlobalTable: 262 | """ 263 | Infer the global table from the local table. 264 | 265 | :param dict[str, dict[str, int]] local_table: the local table 266 | :param int threshold: the threshold to suggest 267 | :rtype: GlobalTable 268 | :return: the inferred global table 269 | """ 270 | sub_count = sum(local_table[self._index[1]].values()) 271 | samp_count = len(local_table[self._index[1]]) 272 | try: 273 | samp_sub_count = int(round(sub_count / samp_count)) 274 | except ZeroDivisionError: 275 | samp_sub_count = 0 276 | return models.GlobalTable( 277 | start_ts=self._start_ts, 278 | end_ts=self._end_ts, 279 | window_count=1, 280 | sub_count_avg=sub_count, 281 | sub_count_max=sub_count, 282 | samp_count_avg=samp_count, 283 | samp_count_max=samp_count, 284 | samp_sub_count_avg=samp_sub_count, 285 | samp_sub_count_max=samp_sub_count, 286 | threshold_suggested=max(sub_count, threshold), 287 | ) 288 | 289 | def get_peaks( 290 | self, 291 | global_tables: Dict[str, Dict[str, models.GlobalTable]], 292 | local_tables: Dict[str, Dict[str, Dict[str, Dict[str, int]]]], 293 | threshold: Optional[int] = None, 294 | ) -> Dict[str, Dict[str, models.TelemetryPeak]]: 295 | """ 296 | Get the peaks. 297 | 298 | :param dict[str, dict[str, GlobalTable]] global_tables: the global tables 299 | :param dict[str, dict[str, dict[str, dict[str, int]]]] local_tables: the local tables 300 | :param int|None threshold: optional threshold 301 | :rtype: dict[str, dict[str, TelemetryPeak]] 302 | :return: the telemetry peaks 303 | """ 304 | local_tables_stats = self.get_local_tables_stats(local_tables) 305 | global_tables_stats = self.get_global_tables_stats(global_tables, threshold=threshold) 306 | peaks = collections.defaultdict(dict) 307 | for dimension_0 in local_tables_stats: 308 | for dimension_1 in local_tables_stats[dimension_0]: 309 | try: 310 | local_table_stats = local_tables_stats[dimension_0][dimension_1] 311 | global_table_stats = global_tables_stats[dimension_0][dimension_1] 312 | except KeyError: 313 | continue 314 | if self._is_peak(local_table_stats, global_table_stats): 315 | peaks[dimension_0][dimension_1] = models.TelemetryPeak( 316 | sub_count=local_table_stats.sub_count, 317 | samp_count=local_table_stats.samp_count, 318 | samp_sub_count_max=local_table_stats.samp_sub_count_max, 319 | samp_sub_count_mean=local_table_stats.samp_sub_count_mean, 320 | samp_sub_count_std=local_table_stats.samp_sub_count_std, 321 | samp_sub_ratio=local_table_stats.samp_sub_ratio, 322 | global_samp_sub_count_max=global_table_stats.samp_sub_count_max, 323 | global_threshold_suggested=global_table_stats.threshold, 324 | ) 325 | return peaks 326 | 327 | def refresh_global_tables( 328 | self, 329 | global_tables: Dict[str, Dict[str, models.GlobalTable]], 330 | local_tables: Dict[str, Dict[str, Dict[str, Dict[str, int]]]], 331 | ) -> Dict[str, Dict[str, models.GlobalTable]]: 332 | """ 333 | Refresh the global tables. 
334 | 335 | :param dict[str, dict[str, GlobalTable]] global_tables: the global tables 336 | :param dict[str, dict[str, dict[str, dict[str, int]]]] local_tables: the local tables 337 | :rtype: dict[str, dict[str, GlobalTable]] 338 | :return: the global tables 339 | """ 340 | 341 | def get_dimensions_0() -> Set[str]: 342 | """Get the first dimension from all the global and local tables.""" 343 | return set(global_tables.keys()).union(local_tables.keys()) 344 | 345 | def get_dimensions_1(dim_0: str) -> Set[str]: 346 | """Get the second dimension from all the global and local tables.""" 347 | return set(global_tables.get(dim_0, {}).keys()).union( 348 | local_tables.get(dim_0, {}).keys() 349 | ) 350 | 351 | refreshed_global_tables = collections.defaultdict(dict) 352 | for dimension_0 in get_dimensions_0(): 353 | for dimension_1 in get_dimensions_1(dimension_0): 354 | global_table = global_tables.get(dimension_0, {}).get(dimension_1, None) 355 | local_table = local_tables.get(dimension_0, {}).get(dimension_1, None) 356 | # If we have both tables for all dimensions, check their age 357 | if global_table and local_table: 358 | # if the global table is more recent than the current time interval 359 | if self._start_ts < global_table.end_ts: 360 | new_table = global_table 361 | # otherwise update the global table 362 | else: 363 | new_table = self._update_global_table(global_table, local_table) 364 | # if we only have the global table, then use the global table 365 | elif global_table and not local_table: 366 | new_table = global_table 367 | # and if we only have the local table, infer a global table 368 | elif not global_table and local_table: 369 | threshold = self._get_dimension_threshold(self._dimensions[0], dimension_0) 370 | new_table = self._infer_global_table(local_table, threshold) 371 | # otherwise just skip 372 | else: 373 | continue 374 | refreshed_global_tables[dimension_0][dimension_1] = new_table 375 | return refreshed_global_tables 376 | 377 | def get_global_tables_stats( 378 | self, 379 | global_tables: Dict[str, Dict[str, models.GlobalTable]], 380 | threshold: Optional[int] = None, 381 | ) -> Dict[str, Dict[str, models.GlobalTableStats]]: 382 | """ 383 | Get statistics from the global table. 384 | 385 | :param dict[str, dict[str, GlobalTable]] global_tables: the global tables 386 | :param int|None threshold: optional threshold 387 | :rtype: dict[str, dict[str, GlobalTableStats]] 388 | :return: the global tables stats 389 | """ 390 | global_stats = collections.defaultdict(dict) 391 | for dimension_0 in global_tables: 392 | for dimension_1 in global_tables[dimension_0]: 393 | global_table = global_tables[dimension_0][dimension_1] 394 | global_stats[dimension_0][dimension_1] = models.GlobalTableStats( 395 | threshold=threshold or global_table.threshold_suggested, 396 | samp_sub_count_max=global_table.samp_sub_count_max, 397 | ) 398 | return global_stats 399 | 400 | def get_global_tables_from_file( 401 | self, 402 | file_path: str, 403 | ) -> Dict[str, Dict[str, models.GlobalTable]]: 404 | """ 405 | Load global tables from a file. 
406 | 407 | :param str file_path: the file path where the load from 408 | :rtype: dict[str, dict[str, GlobalTable]] 409 | :return: the global tables 410 | """ 411 | with open(file_path, "r") as f: 412 | json_data = json.load(f) 413 | for dimension_0 in json_data: 414 | for dimension_1 in json_data[dimension_0]: 415 | table = json_data[dimension_0][dimension_1] 416 | json_data[dimension_0][dimension_1] = models.GlobalTable( 417 | start_ts=datetime.datetime.strptime(table[0][:19], "%Y-%m-%d %H:%M:%S"), 418 | end_ts=datetime.datetime.strptime(table[1][:19], "%Y-%m-%d %H:%M:%S"), 419 | window_count=table[2], 420 | sub_count_avg=table[3], 421 | sub_count_max=table[4], 422 | samp_count_avg=table[5], 423 | samp_count_max=table[6], 424 | samp_sub_count_avg=table[7], 425 | samp_sub_count_max=table[8], 426 | threshold_suggested=table[9], 427 | ) 428 | return json_data 429 | 430 | def get_global_tables(self) -> Dict[str, Dict[str, models.GlobalTable]]: 431 | """ 432 | Get the global tables. 433 | 434 | :rtype: dict[str, dict[str, GlobalTable]] 435 | :return: the global tables 436 | """ 437 | end_date = datetime.datetime.utcnow() 438 | start_date = end_date - self.DEFAULT_GLOBAL_TABLE_AGE 439 | ret = self._backend.stats( 440 | start_date=start_date, 441 | end_date=end_date, 442 | index=self._index, 443 | dimensions=self._dimensions, 444 | dimensions_values={x: self._get_dimension_values(x) for x in self._dimensions}, 445 | ) 446 | global_tables = collections.defaultdict(dict) 447 | for dimension_0 in ret: 448 | for dimension_1 in ret[dimension_0]: 449 | start_ts = telemetry_peak_analyzer.datetime_to_ms(start_date) 450 | end_ts = telemetry_peak_analyzer.datetime_to_ms(end_date) 451 | threshold = self._get_dimension_threshold(self._dimensions[0], dimension_0) 452 | sub_count_avg = ret[dimension_0][dimension_1]["sub_count_avg"] 453 | sub_count_max = ret[dimension_0][dimension_1]["sub_count_max"] 454 | samp_count_avg = ret[dimension_0][dimension_1]["samp_count_avg"] 455 | samp_count_max = ret[dimension_0][dimension_1]["samp_count_max"] 456 | samp_sub_count_max = ret[dimension_0][dimension_1]["samp_sub_count_max"] 457 | try: 458 | samp_sub_count_avg = int(round(sub_count_avg / samp_count_avg)) 459 | except ZeroDivisionError: 460 | samp_sub_count_avg = 0 461 | global_tables[dimension_0][dimension_1] = models.GlobalTable( 462 | start_ts=start_date, 463 | end_ts=end_date, 464 | window_count=self._get_window_count(start_ts, end_ts), 465 | sub_count_avg=int(round(sub_count_avg)), 466 | sub_count_max=int(sub_count_max), 467 | samp_count_avg=int(round(samp_count_avg)), 468 | samp_count_max=int(samp_count_max), 469 | samp_sub_count_avg=samp_sub_count_avg, 470 | samp_sub_count_max=samp_sub_count_max, 471 | threshold_suggested=max(int(round(sub_count_avg)), threshold), 472 | ) 473 | return global_tables 474 | 475 | def get_local_tables_stats( 476 | self, local_tables: Dict[str, Dict[str, Dict[str, Dict[str, int]]]] 477 | ) -> Dict[str, Dict[str, models.LocalTableStats]]: 478 | """ 479 | Get statistics from the local table. 
480 | 481 | :param dict[str, dict[str, dict[str, dict[str, int]]]] local_tables: local tables 482 | :rtype: dict[str, dict[str, LocalTableStats]] 483 | :return: some statistics for each dimension combination 484 | """ 485 | local_stats = collections.defaultdict(dict) 486 | for dimension_0 in local_tables: 487 | for dimension_1 in local_tables[dimension_0]: 488 | local_table = local_tables[dimension_0][dimension_1] 489 | sub_count = sum(local_table[self._index[1]].values()) 490 | samp_count = len(local_table[self._index[1]]) 491 | samp_sub_count = local_table[self._index[1]].values() 492 | # fails if len < 1, one fails, both fail 493 | try: 494 | samp_sub_count_mean = statistics.mean(samp_sub_count) 495 | samp_sub_count_max = max(samp_sub_count) 496 | except (statistics.StatisticsError, ValueError): 497 | samp_sub_count_mean = 0.0 498 | samp_sub_count_max = 0 499 | # fails if len < 2 500 | try: 501 | samp_sub_count_std = statistics.stdev(samp_sub_count) 502 | except statistics.StatisticsError: 503 | samp_sub_count_std = 0.0 504 | if sub_count > self.LOCAL_MIN_COUNT: 505 | samp_sub_ratio = round(samp_sub_count_max / sub_count, 2) 506 | else: 507 | samp_sub_ratio = 0.0 508 | local_stats[dimension_0][dimension_1] = models.LocalTableStats( 509 | sub_count=sub_count, 510 | samp_count=samp_count, 511 | samp_sub_count_max=samp_sub_count_max, 512 | samp_sub_count_mean=samp_sub_count_mean, 513 | samp_sub_count_std=samp_sub_count_std, 514 | samp_sub_ratio=samp_sub_ratio, 515 | cross_stats={x: local_table[x] for x in self.CROSS_DIMENSIONS}, 516 | ) 517 | return local_stats 518 | 519 | def get_local_tables(self) -> Dict[str, Dict[str, Dict[str, Dict[str, int]]]]: 520 | """ 521 | Get the local tables from our backend. 522 | 523 | :rtype: dict[str, dict[str, dict[str, dict[str, int]]]] 524 | :return: the local tables 525 | """ 526 | terms = self._dimensions + self.CROSS_DIMENSIONS + [self._index[1]] 527 | json_data = self._backend.group_by( 528 | start_date=self._start_ts, 529 | end_date=self._end_ts, 530 | index=self._index, 531 | dimensions=self._dimensions + self.CROSS_DIMENSIONS, 532 | ) 533 | local_tables = collections.defaultdict(dict) 534 | for item in json_data: 535 | dimension_0 = item[self._dimensions[0]] 536 | dimension_1 = item[self._dimensions[1]] 537 | if dimension_1 not in local_tables[dimension_0]: 538 | local_tables[dimension_0][dimension_1] = { 539 | term: collections.defaultdict(int) for term in terms 540 | } 541 | for term in terms: 542 | local_tables[dimension_0][dimension_1][term][item[term]] += item["count"] 543 | return local_tables 544 | 545 | 546 | class FileTypePeakAnalyzer(TwoIndexTwoDimensionAnalyzer): 547 | """Analyzer using index and dimension to track file types.""" 548 | 549 | CROSS_DIMENSIONS = [ 550 | "source.user_id", 551 | "source.origin", 552 | ] 553 | 554 | DIMENSIONS_METADATA = { 555 | "task.severity": { 556 | "values": ["malicious", "benign"], 557 | "threshold": { 558 | "malicious": 90, 559 | "benign": 500, 560 | }, 561 | } 562 | } 563 | 564 | _INDEX = [ 565 | "utc_timestamp", 566 | "file.sha1", 567 | ] 568 | 569 | _DIMENSIONS = [ 570 | "task.severity", 571 | "file.llfile_type", 572 | ] 573 | 574 | def __init__( 575 | self, 576 | conf: configparser.ConfigParser, 577 | backend: backends.BackendType, 578 | start_ts: datetime.datetime, 579 | end_ts: datetime.datetime, 580 | ) -> None: 581 | """ 582 | Constructor. 
583 | 
584 |         :param configparser.ConfigParser conf: the conf object
585 |         :param BackendType backend: the backend
586 |         :param datetime.datetime start_ts: the beginning of the time interval
587 |         :param datetime.datetime end_ts: the end of the time interval
588 |         """
589 |         if not isinstance(backend, backends.TwoIndexTwoDimensionBackend):
590 |             raise ValueError("Backend is not compatible with the chosen analyzer")
591 |         super(FileTypePeakAnalyzer, self).__init__(
592 |             conf=conf,
593 |             index=self._INDEX,
594 |             dimensions=self._DIMENSIONS,
595 |             backend=backend,
596 |             start_ts=start_ts,
597 |             end_ts=end_ts,
598 |         )
599 | 
600 | 
601 | class NetworkTypePeakAnalyzer(TwoIndexTwoDimensionAnalyzer):
602 |     """Analyzer using index and dimension to track network peaks."""
603 | 
604 |     CROSS_DIMENSIONS = [
605 |         "source.user_id",
606 |     ]
607 | 
608 |     DIMENSIONS_METADATA = {
609 |         "event.impact": {
610 |             "values": ["70", "30"],
611 |         }
612 |     }
613 | 
614 |     _INDEX = [
615 |         "utc_timestamp",
616 |         "event.id",
617 |     ]
618 | 
619 |     _DIMENSIONS = [
620 |         "event.impact",
621 |         "threat.name.keyword",
622 |     ]
623 | 
624 |     def __init__(
625 |         self,
626 |         conf: configparser.ConfigParser,
627 |         backend: backends.BackendType,
628 |         start_ts: datetime.datetime,
629 |         end_ts: datetime.datetime,
630 |     ) -> None:
631 |         """
632 |         Constructor.
633 | 
634 |         :param configparser.ConfigParser conf: the conf object
635 |         :param BackendType backend: the backend
636 |         :param datetime.datetime start_ts: the beginning of the time interval
637 |         :param datetime.datetime end_ts: the end of the time interval
638 |         """
639 |         if not isinstance(backend, backends.TwoIndexTwoDimensionBackend):
640 |             raise ValueError("Backend is not compatible with the chosen analyzer")
641 |         # Relax the default age of a global table (used when no saved global table is present)
642 |         self.DEFAULT_GLOBAL_TABLE_AGE = datetime.timedelta(days=3)
643 |         super(NetworkTypePeakAnalyzer, self).__init__(
644 |             conf=conf,
645 |             index=self._INDEX,
646 |             dimensions=self._DIMENSIONS,
647 |             backend=backend,
648 |             start_ts=start_ts,
649 |             end_ts=end_ts,
650 |         )
651 | 
--------------------------------------------------------------------------------