├── .gitignore ├── data └── config.ini.template ├── setup.py ├── NOTICE ├── pyproject.toml ├── .pre-commit-config.yaml ├── setup.cfg ├── .github └── workflows │ └── publish-to-test-pypi.yml ├── src └── telemetry_peak_analyzer │ ├── models │ └── __init__.py │ ├── __init__.py │ ├── backends │ ├── tina.py │ └── __init__.py │ ├── __main__.py │ └── analyzers │ └── __init__.py ├── LICENSE ├── scripts ├── anonymize_telemetry.py └── create_telemetry.py ├── CONTRIBUTING.md ├── CODE_OF_CONDUCT.md ├── tests └── test_telemetry_peak_analyzer.py └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | src/telemetry_peak_analyzer.egg-info 2 | data/config.ini 3 | .tox 4 | venv 5 | global_table.json 6 | -------------------------------------------------------------------------------- /data/config.ini.template: -------------------------------------------------------------------------------- 1 | [tina_backend] 2 | hosts = 3 | port = 4 | timeout_ms = 30000 5 | parallel_bulk = true 6 | scan_size = 10000 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2021 VMware, Inc. 3 | # SPDX-License-Identifier: BSD-2 4 | """ 5 | See https://stackoverflow.com/questions/62983756/what-is-pyproject-toml-file-for 6 | """ 7 | import setuptools 8 | 9 | if __name__ == "__main__": 10 | setuptools.setup() 11 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Telemetry Peak Analyzer 2 | Copyright 2021 VMware, Inc. 3 | 4 | This product is licensed to you under the BSD-2 license (the "License"). You may not use this product except in compliance with the BSD-2 License. 5 | 6 | This product may include a number of subcomponents with separate copyright notices and license terms. Your use of these subcomponents is subject to the terms and conditions of the subcomponent's license, as noted in the LICENSE file. 
7 | 8 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "setuptools>=42", 4 | "wheel", 5 | ] 6 | build-backend = "setuptools.build_meta" 7 | 8 | [tool.tox] 9 | legacy_tox_ini = """ 10 | [tox] 11 | envlist = py36, py39 12 | isolated_build = True 13 | 14 | [testenv] 15 | commands=nosetests 16 | deps= 17 | ddt 18 | mock 19 | nose 20 | """ 21 | 22 | [tool.pylint.master] 23 | ignored-modules = "config" 24 | disable = """ 25 | W1514,F0010,useless-super-delegation,E1103,W0108,W0404,R0904,R0922,W0105, 26 | W0142,C0301,C0321,C0322,C0324,R,W0232,E1001,W0212,W0703,C,I0011,I0012,I0013,E0012""" 27 | 28 | [tool.black] 29 | line-length = 98 30 | include = "\\.pyi?$" 31 | exclude = """ 32 | /( 33 | \\.git 34 | | \\.hg 35 | | \\.mypy_cache 36 | | \\.tox 37 | | \\.venv 38 | | _build 39 | | buck-out 40 | | build 41 | | dist 42 | )/ 43 | """ 44 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks.git 3 | rev: v3.4.0 4 | hooks: 5 | - id: check-added-large-files 6 | - id: check-ast 7 | - id: check-byte-order-marker 8 | - id: check-docstring-first 9 | - id: check-executables-have-shebangs 10 | - id: check-json 11 | - id: check-merge-conflict 12 | - id: check-yaml 13 | - id: debug-statements 14 | - id: detect-private-key 15 | - id: end-of-file-fixer 16 | - id: trailing-whitespace 17 | - repo: https://github.com/psf/black 18 | rev: 21.7b0 19 | hooks: 20 | - id: black 21 | - repo: https://github.com/asottile/reorder_python_imports.git 22 | rev: v2.3.6 23 | hooks: 24 | - id: reorder-python-imports 25 | language_version: python3 26 | - repo: local 27 | hooks: 28 | - id: pylint 29 | name: pylint 30 | entry: pylint 31 | exclude: ^tests/ 32 | language: system 33 | types: [python] 34 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = telemetry-peak-analyzer 3 | version = 0.1.6 4 | author = Stefano Ortolani, Jason Zhang 5 | description = Analyzer to detect peaks when analyzing multi-attribute telemetry feeds 6 | long_description = file: README.md 7 | long_description_content_type = text/markdown 8 | url = https://github.com/vmware-labs/telemetry-peak-analyzer 9 | project_urls = 10 | Bug Tracker = https://github.com/vmware-labs/telemetry-peak-analyzer/issues 11 | classifiers = 12 | Development Status :: 3 - Alpha 13 | Intended Audience :: Developers 14 | License :: OSI Approved :: BSD License 15 | Programming Language :: Python :: 3 16 | Operating System :: OS Independent 17 | Topic :: Security 18 | Topic :: System :: Monitoring 19 | 20 | [options] 21 | package_dir = 22 | = src 23 | packages = find: 24 | python_requires = >=3.6 25 | install_requires = 26 | ijson 27 | 28 | [options.entry_points] 29 | console_scripts = 30 | telemetry-peak-analyzer = telemetry_peak_analyzer:parse_and_run_command 31 | 32 | [options.extras_require] 33 | tina = tina-client >=0.2.1 34 | 35 | [options.packages.find] 36 | where = src 37 | -------------------------------------------------------------------------------- /.github/workflows/publish-to-test-pypi.yml: 
-------------------------------------------------------------------------------- 1 | name: Publish Python 🐍 distributions 📦 to PyPI and TestPyPI 2 | 3 | on: push 4 | 5 | jobs: 6 | build-n-publish: 7 | name: Build and publish Python 🐍 distributions 📦 to PyPI and TestPyPI 8 | runs-on: ubuntu-20.04 9 | steps: 10 | - uses: actions/checkout@master 11 | - name: Set up Python 3.8 12 | uses: actions/setup-python@v1 13 | with: 14 | python-version: 3.8 15 | - name: Install pypa/build 16 | run: >- 17 | python -m 18 | pip install 19 | build 20 | --user 21 | - name: Build a binary wheel and a source tarball 22 | run: >- 23 | python -m 24 | build 25 | --sdist 26 | --wheel 27 | --outdir dist/ 28 | . 29 | - name: Publish distribution 📦 to Test PyPI 30 | uses: pypa/gh-action-pypi-publish@v1.5.0 31 | with: 32 | password: ${{ secrets.TEST_PYPI_API_TOKEN }} 33 | repository_url: https://test.pypi.org/legacy/ 34 | skip_existing: true 35 | - name: Publish distribution 📦 to PyPI 36 | if: startsWith(github.ref, 'refs/tags') 37 | uses: pypa/gh-action-pypi-publish@v1.5.0 38 | with: 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /src/telemetry_peak_analyzer/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 VMware, Inc. 2 | # SPDX-License-Identifier: BSD-2 3 | import collections 4 | 5 | 6 | TelemetryPeak = collections.namedtuple( 7 | "TelemetryPeak", 8 | [ 9 | "sub_count", 10 | "samp_count", 11 | "samp_sub_count_max", 12 | "samp_sub_count_mean", 13 | "samp_sub_count_std", 14 | "samp_sub_ratio", 15 | "global_samp_sub_count_max", 16 | "global_threshold_suggested", 17 | ], 18 | ) 19 | 20 | 21 | GlobalTable = collections.namedtuple( 22 | "GlobalTable", 23 | [ 24 | "start_ts", 25 | "end_ts", 26 | "window_count", 27 | "sub_count_avg", 28 | "sub_count_max", 29 | "samp_count_avg", 30 | "samp_count_max", 31 | "samp_sub_count_avg", 32 | "samp_sub_count_max", 33 | "threshold_suggested", 34 | ], 35 | ) 36 | 37 | 38 | LocalTableStats = collections.namedtuple( 39 | "LocalTableStats", 40 | [ 41 | "sub_count", 42 | "samp_count", 43 | "samp_sub_count_max", 44 | "samp_sub_count_mean", 45 | "samp_sub_count_std", 46 | "samp_sub_ratio", 47 | "cross_stats", 48 | ], 49 | ) 50 | 51 | 52 | GlobalTableStats = collections.namedtuple( 53 | "GlobalTableStats", 54 | [ 55 | "samp_sub_count_max", 56 | "threshold", 57 | ], 58 | ) 59 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Telemetry Peak Analyzer 2 | Copyright 2021 VMware, Inc. 3 | 4 | The BSD-2 license (the "License") set forth below applies to all parts of the Telemetry Peak Analyzer project. You may not use this file except in compliance with the License. 5 | 6 | BSD-2 License 7 | 8 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 9 | 10 | Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 11 | 12 | Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /scripts/anonymize_telemetry.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Copyright 2021 VMware, Inc. 3 | # SPDX-License-Identifier: BSD-2 4 | import argparse 5 | import hashlib 6 | import json 7 | import os 8 | import sys 9 | 10 | 11 | def main() -> int: 12 | """Anonymize telemetry JSON files.""" 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument( 15 | "-i", 16 | "--input-file", 17 | dest="input_file", 18 | default=None, 19 | required=True, 20 | type=str, 21 | help="The input file", 22 | ) 23 | args = parser.parse_args() 24 | 25 | with open(args.input_file, "r") as f: 26 | telemetry_data = json.load(f) 27 | 28 | for item in telemetry_data: 29 | item["customer.channel"] = None 30 | item["customer.installation_type"] = None 31 | item["customer.type"] = None 32 | item["file.md5"] = hashlib.md5(item["file.md5"].encode("utf-8")).hexdigest() 33 | item["file.name"] = None 34 | item["file.sha1"] = hashlib.sha1(item["file.sha1"].encode("utf-8")).hexdigest() 35 | if item["file.sha256"]: 36 | item["file.sha256"] = hashlib.sha256(item["file.sha256"].encode("utf-8")).hexdigest() 37 | item["file.size"] = 0 38 | item["source.access_key_id"] = 0 39 | item["source.data_center"] = None 40 | item["source.geo.country_iso_code"] = None 41 | item["source.geo.location"] = "0.00,0.00" 42 | item["source.submitter_ip"] = "0.0.0.0" 43 | item["source.user_id"] = 0 44 | item["submission_id"] = 0 45 | item["task.portal_url"] = None 46 | item["task.uuid"] = "a" * 32 47 | 48 | file_path = f"{os.path.splitext(args.input_file)[0]}.anonymized.json" 49 | with open(file_path, "w") as f: 50 | json.dump(telemetry_data, f, indent=2, sort_keys=True) 51 | 52 | return 0 53 | 54 | 55 | if __name__ == "__main__": 56 | sys.exit(main()) 57 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | 2 | # Contributing to telemetry-peak-analyzer 3 | 4 | The telemetry-peak-analyzer project team welcomes contributions from the community. Before you start working with telemetry-peak-analyzer, please 5 | read our [Developer Certificate of Origin](https://cla.vmware.com/dco). All contributions to this repository must be 6 | signed as described on that page. Your signature certifies that you wrote the patch or have the right to pass it on 7 | as an open-source patch. 
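For example, if the project follows the standard DCO convention of a `Signed-off-by` trailer, each commit can be signed with git's `-s`/`--signoff` flag (shown here only as an illustration; the DCO page linked above is the authoritative reference):

``` shell
# Add a "Signed-off-by: Your Name <you@example.com>" trailer,
# taken from the name and email configured in git.
git commit -s -m "Describe your change"

# Add the trailer to the most recent commit if it was forgotten.
git commit --amend --signoff --no-edit
```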
8 | 
9 | ## Contribution Flow 
10 | 
11 | This is a rough outline of what a contributor's workflow looks like: 
12 | 
13 | - Create a topic branch from where you want to base your work 
14 | - Make commits of logical units 
15 | - Make sure your commit messages are in the proper format (see below) 
16 | - Push your changes to a topic branch in your fork of the repository 
17 | - Submit a pull request 
18 | 
19 | Example: 
20 | 
21 | ``` shell 
22 | git remote add upstream https://github.com/vmware-labs/telemetry-peak-analyzer.git 
23 | git checkout -b my-new-feature main 
24 | git commit -a 
25 | git push origin my-new-feature 
26 | ``` 
27 | 
28 | ### Staying In Sync With Upstream 
29 | 
30 | When your branch gets out of sync with the upstream `main` branch, use the following to update: 
31 | 
32 | ``` shell 
33 | git checkout my-new-feature 
34 | git fetch -a 
35 | git pull --rebase upstream main 
36 | git push --force-with-lease origin my-new-feature 
37 | ``` 
38 | 
39 | ### Updating pull requests 
40 | 
41 | If your PR fails to pass CI or needs changes based on code review, you'll most likely want to squash these changes into 
42 | existing commits. 
43 | 
44 | If your pull request contains a single commit or your changes are related to the most recent commit, you can simply 
45 | amend the commit. 
46 | 
47 | ``` shell 
48 | git add . 
49 | git commit --amend 
50 | git push --force-with-lease origin my-new-feature 
51 | ``` 
52 | 
53 | If you need to squash changes into an earlier commit, you can use: 
54 | 
55 | ``` shell 
56 | git add . 
57 | git commit --fixup <commit> 
58 | git rebase -i --autosquash main 
59 | git push --force-with-lease origin my-new-feature 
60 | ``` 
61 | 
62 | Be sure to add a comment to the PR indicating your new changes are ready to review, as GitHub does not generate a 
63 | notification when you `git push`. 
64 | 
65 | ### Code Style 
66 | 
67 | ### Formatting Commit Messages 
68 | 
69 | We follow the conventions on [How to Write a Git Commit Message](http://chris.beams.io/posts/git-commit/). 
70 | 
71 | Be sure to include any related GitHub issue references in the commit message. See 
72 | [GFM syntax](https://guides.github.com/features/mastering-markdown/#GitHub-flavored-markdown) for referencing issues 
73 | and commits. 
74 | 
75 | ## Reporting Bugs and Creating Issues 
76 | 
77 | When opening a new issue, try to roughly follow the commit message format conventions above. 
78 | 
-------------------------------------------------------------------------------- 
/scripts/create_telemetry.py: 
-------------------------------------------------------------------------------- 
1 | #!/usr/bin/env python 
2 | # Copyright 2021 VMware, Inc. 
3 | # SPDX-License-Identifier: BSD-2 
4 | import argparse 
5 | import csv 
6 | import datetime 
7 | import json 
8 | import os 
9 | import sys 
10 | 
11 | SEVERITY_BENIGN = "benign" 
12 | SEVERITY_MALICIOUS = "malicious" 
13 | SEVERITY_SUSPICIOUS = "suspicious" 
14 | SEVERITY_ALL = [ 
15 | SEVERITY_BENIGN, 
16 | SEVERITY_MALICIOUS, 
17 | SEVERITY_SUSPICIOUS, 
18 | ] 
19 | 
20 | 
21 | def datetime_str_to_ms(date_str: str, fmt: str = "%Y-%m-%d %H:%M:%S") -> int: 
22 | """ 
23 | Convert a given datetime string to milliseconds since the epoch.
24 | 25 | :param str date_str: the datetime string 26 | :param str fmt: the format 27 | :rtype: int 28 | """ 29 | date_obj = datetime.datetime.strptime(date_str, fmt) 30 | return int((date_obj - datetime.datetime.utcfromtimestamp(0)).total_seconds()) * 1000 31 | 32 | 33 | def main() -> int: 34 | """Convert (internal, deprecated) CSV telemetry files into JSON telemetry data.""" 35 | parser = argparse.ArgumentParser() 36 | parser.add_argument( 37 | "-i", 38 | "--input-file", 39 | dest="input_file", 40 | default=None, 41 | required=True, 42 | type=str, 43 | help="The input file", 44 | ) 45 | parser.add_argument( 46 | "-s", 47 | "--severity-filter", 48 | dest="severity_filter", 49 | choices=SEVERITY_ALL, 50 | default=None, 51 | help=f"Optional filter by severity ({','.join(SEVERITY_ALL)})", 52 | ) 53 | args = parser.parse_args() 54 | 55 | telemetry_data = [] 56 | with open(args.input_file, "r") as f: 57 | reader = csv.DictReader(f) 58 | for row in reader: 59 | if args.severity_filter and row["severity"] != args.severity_filter: 60 | continue 61 | telemetry_data.append( 62 | { 63 | "analysis.label": row["vt_label"], 64 | "customer.channel": row["channel"], 65 | "customer.installation_type": row["installation_type"], 66 | "customer.region": row["region"], 67 | "customer.sector": row["sector"], 68 | "customer.type": row["key_type"], 69 | "file.llfile_type": row["file_type"], 70 | "file.magic": None, 71 | "file.md5": row["md5"], 72 | "file.mime_type": row["mime_type"], 73 | "file.name": None, 74 | "file.sha1": row["sha1"], 75 | "file.sha256": None, 76 | "file.size": row["file_size"], 77 | "source.access_key_id": row["access_key_id"], 78 | "source.data_center": row["data_center"], 79 | "source.geo.country_iso_code": None, 80 | "source.geo.location": "0.00,0.00", 81 | "source.origin": row["origin"], 82 | "source.submitter_ip": row["submitter_ip"], 83 | "source.user_id": row["user_id"], 84 | "submission_id": row["submission_id"], 85 | "task.portal_url": None, 86 | "task.score": row["score"], 87 | "task.severity": row["severity"], 88 | "task.uuid": row["task_uuid"], 89 | "utc_timestamp": datetime_str_to_ms(row["ts"]), 90 | } 91 | ) 92 | 93 | file_path = f"{os.path.splitext(args.input_file)[0]}.json" 94 | with open(file_path, "w") as f: 95 | json.dump(telemetry_data, f, indent=2, sort_keys=True) 96 | 97 | return 0 98 | 99 | 100 | if __name__ == "__main__": 101 | sys.exit(main()) 102 | -------------------------------------------------------------------------------- /src/telemetry_peak_analyzer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 VMware, Inc. 2 | # SPDX-License-Identifier: BSD-2 3 | import datetime 4 | import json 5 | import logging 6 | import os 7 | import resource 8 | import sys 9 | from typing import Dict 10 | from typing import List 11 | from typing import Union 12 | 13 | 14 | def ms_to_datetime(milliseconds: int) -> datetime.datetime: 15 | """ 16 | Convert a given amount of milliseconds to a datetime object. 17 | 18 | :param int milliseconds: number of milliseconds 19 | :rtype: datetime.datetime 20 | :return: the datetime object 21 | """ 22 | return datetime.datetime.fromtimestamp(milliseconds / 1000) 23 | 24 | 25 | def datetime_to_sec(timestamp: datetime.datetime) -> int: 26 | """ 27 | Convert a given timestamp to seconds since the epoch. 
28 | 29 | :param datetime.datetime timestamp: the datetime object 30 | :rtype: int 31 | """ 32 | return int((timestamp - datetime.datetime.utcfromtimestamp(0)).total_seconds()) 33 | 34 | 35 | def datetime_to_ms(timestamp: datetime.datetime) -> int: 36 | """ 37 | Convert a given timestamp to milliseconds since the epoch. 38 | 39 | :param datetime.datetime timestamp: the datetime object 40 | :rtype: int 41 | """ 42 | return datetime_to_sec(timestamp) * 1000 43 | 44 | 45 | def save_to_json(obj: Union[Dict, List], file_path: str) -> None: 46 | """ 47 | Save python object. 48 | 49 | :param dict|list obj: the python object 50 | :param str file_path: the path where to save to 51 | """ 52 | with open(file_path, "w") as f: 53 | json.dump(obj, f, indent=2 * " ", sort_keys=True, default=str) 54 | 55 | 56 | class MemoryFootprintFormatter(logging.Formatter): 57 | """Special formatter keeping track how much memory is used.""" 58 | 59 | _DEFAULT_FMT = "%(levelname)s -> [%(asctime)s] %(message)s" 60 | _DEFAULT_DATEFMT = "%Y-%m-%d %H:%M:%S" 61 | 62 | def __init__( 63 | self, 64 | fmt: str = _DEFAULT_FMT, 65 | datefmt: str = _DEFAULT_DATEFMT, 66 | ): 67 | """Override method.""" 68 | super(MemoryFootprintFormatter, self).__init__(fmt, datefmt) 69 | 70 | @classmethod 71 | def configure_logging(cls, level: int) -> None: 72 | """ 73 | Configure some sane defaults. 74 | 75 | :param int level: the debugging level. 76 | """ 77 | handler = logging.StreamHandler() 78 | formatter = cls() 79 | handler.setFormatter(formatter) 80 | logging.root.addHandler(handler) 81 | logging.root.setLevel(level) 82 | 83 | @staticmethod 84 | def _read_procfs_memory(procfs_mem_key: str) -> float: 85 | """ 86 | Read memory usage from the /proc filesystem. 87 | 88 | :param str procfs_mem_key: the key to search 89 | :rtype: float 90 | :return: the value of the related key 91 | """ 92 | procfs_fn = os.path.join("/", "proc", str(os.getpid()), "status") 93 | procfs_stats_scale = { 94 | "kB": 1024.0, 95 | "mB": 1024.0 * 1024.0, 96 | "KB": 1024.0, 97 | "MB": 1024.0 * 1024.0, 98 | } 99 | with open(procfs_fn) as pf: 100 | pf_data = pf.read() 101 | # get VmKey line e.g. "VmRSS: 9999 kB\n ..." 102 | i = pf_data.index(procfs_mem_key) 103 | # remove white-spaces 104 | v = pf_data[i:].split(None, 3) 105 | if len(v) < 3: 106 | return 0.0 107 | # scale to the unit that was asked for 108 | return float(v[1]) * procfs_stats_scale[v[2]] 109 | 110 | @staticmethod 111 | def _get_memory_size() -> float: 112 | """ 113 | Retrieve total memory usage in bytes from /proc filesystem. 114 | 115 | :rtype: float 116 | :return: the value of the memory size 117 | """ 118 | return MemoryFootprintFormatter._read_procfs_memory("VmSize:") 119 | 120 | @staticmethod 121 | def _get_resident_memory_size() -> float: 122 | """ 123 | Retrieve resident memory usage in bytes from /proc filesystem. 124 | 125 | :rtype: float 126 | :return: the value of the memory size 127 | """ 128 | return MemoryFootprintFormatter._read_procfs_memory("VmRSS:") 129 | 130 | @classmethod 131 | def _get_memory_consumption(cls) -> float: 132 | """ 133 | Get the current memory consumption in megabytes. 
134 | 135 | :rtype: float 136 | :return: the memory consumption 137 | """ 138 | try: 139 | # We get the actual memory footprint 140 | res_mem_size = MemoryFootprintFormatter._get_resident_memory_size() 141 | except IOError: 142 | # On OSX we do not have /proc so we fallback to resource 143 | if sys.platform == "darwin": 144 | # IMPORTANT: on OSX results are given in Bytes while on Linux are given in KBytes 145 | res_mem_size = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss 146 | else: 147 | # If we are not on Linux or on OSX just give up 148 | res_mem_size = 1024.0 * 1024.0 149 | res_mem_size /= 1024.0 * 1024.0 150 | return res_mem_size 151 | 152 | def format(self, record: logging.LogRecord) -> str: 153 | """Override.""" 154 | # A record can be formatted multiple times by SimpleLogger. 155 | if not hasattr(record, "formatted"): 156 | record.msg = "[%04dmb] %s" % (self._get_memory_consumption(), record.msg) 157 | setattr(record, "formatted", True) 158 | return super(MemoryFootprintFormatter, self).format(record) 159 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | 2 | # Contributor Covenant Code of Conduct 3 | 4 | ## Our Pledge 5 | 6 | We as members, contributors, and leaders pledge to make participation in telemetry-peak-analyzer project and our 7 | community a harassment-free experience for everyone, regardless of age, body 8 | size, visible or invisible disability, ethnicity, sex characteristics, gender 9 | identity and expression, level of experience, education, socio-economic status, 10 | nationality, personal appearance, race, religion, or sexual identity 11 | and orientation. 12 | 13 | We pledge to act and interact in ways that contribute to an open, welcoming, 14 | diverse, inclusive, and healthy community. 15 | 16 | ## Our Standards 17 | 18 | Examples of behavior that contributes to a positive environment for our 19 | community include: 20 | 21 | * Demonstrating empathy and kindness toward other people 22 | * Being respectful of differing opinions, viewpoints, and experiences 23 | * Giving and gracefully accepting constructive feedback 24 | * Accepting responsibility and apologizing to those affected by our mistakes, 25 | and learning from the experience 26 | * Focusing on what is best not just for us as individuals, but for the 27 | overall community 28 | 29 | Examples of unacceptable behavior include: 30 | 31 | * The use of sexualized language or imagery, and sexual attention or 32 | advances of any kind 33 | * Trolling, insulting or derogatory comments, and personal or political attacks 34 | * Public or private harassment 35 | * Publishing others' private information, such as a physical or email 36 | address, without their explicit permission 37 | * Other conduct which could reasonably be considered inappropriate in a 38 | professional setting 39 | 40 | ## Enforcement Responsibilities 41 | 42 | Community leaders are responsible for clarifying and enforcing our standards of 43 | acceptable behavior and will take appropriate and fair corrective action in 44 | response to any behavior that they deem inappropriate, threatening, offensive, 45 | or harmful. 46 | 47 | Community leaders have the right and responsibility to remove, edit, or reject 48 | comments, commits, code, wiki edits, issues, and other contributions that are 49 | not aligned to this Code of Conduct, and will communicate reasons for moderation 50 | decisions when appropriate. 
51 | 52 | ## Scope 53 | 54 | This Code of Conduct applies within all community spaces, and also applies when 55 | an individual is officially representing the community in public spaces. 56 | Examples of representing our community include using an official e-mail address, 57 | posting via an official social media account, or acting as an appointed 58 | representative at an online or offline event. 59 | 60 | ## Enforcement 61 | 62 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 63 | reported to the community leaders responsible for enforcement at oss-coc@vmware.com. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 
-------------------------------------------------------------------------------- /tests/test_telemetry_peak_analyzer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 VMware, Inc. 2 | # SPDX-License-Identifier: BSD-2 3 | import configparser 4 | import datetime 5 | import unittest 6 | 7 | import ddt 8 | import mock 9 | from telemetry_peak_analyzer import analyzers 10 | from telemetry_peak_analyzer import backends 11 | from telemetry_peak_analyzer import models 12 | 13 | 14 | TEST_GLOBAL_TABLE_1 = { 15 | "malicious": { 16 | "file_type": models.GlobalTable( 17 | start_ts=datetime.datetime.strptime("2020-06-17", "%Y-%m-%d"), 18 | end_ts=datetime.datetime.strptime("2020-07-12", "%Y-%m-%d"), 19 | window_count=25, 20 | sub_count_avg=1, 21 | sub_count_max=10, 22 | samp_count_max=10, 23 | samp_count_avg=4, 24 | samp_sub_count_avg=0, 25 | samp_sub_count_max=0.0, 26 | threshold_suggested=10, 27 | ) 28 | } 29 | } 30 | 31 | TEST_GLOBAL_STATS_1 = { 32 | "malicious": { 33 | "file_type": models.GlobalTableStats( 34 | samp_sub_count_max=0.0, 35 | threshold=10, 36 | ) 37 | } 38 | } 39 | 40 | TEST_LOCAL_TABLE_1 = { 41 | "malicious": { 42 | "file_type": { 43 | "file.sha1": { 44 | "s1": 10, 45 | "s2": 10, 46 | }, 47 | "source.user_id": { 48 | "u1": 5, 49 | "u2": 5, 50 | }, 51 | "source.origin": { 52 | "o1": 5, 53 | "o2": 5, 54 | }, 55 | } 56 | }, 57 | } 58 | 59 | TEST_LOCAL_STATS_1 = { 60 | "malicious": { 61 | "file_type": models.LocalTableStats( 62 | sub_count=20, 63 | samp_count=2, 64 | samp_sub_count_max=10, 65 | samp_sub_count_mean=10, 66 | samp_sub_count_std=0.0, 67 | samp_sub_ratio=0.0, 68 | cross_stats={ 69 | "source.user_id": { 70 | "u1": 5, 71 | "u2": 5, 72 | }, 73 | "source.origin": { 74 | "o1": 5, 75 | "o2": 5, 76 | }, 77 | }, 78 | ) 79 | } 80 | } 81 | 82 | TEST_PEAKS_1 = { 83 | "malicious": { 84 | "file_type": models.TelemetryPeak( 85 | sub_count=20, 86 | samp_count=2, 87 | samp_sub_count_max=10, 88 | samp_sub_count_mean=10, 89 | samp_sub_count_std=0.0, 90 | samp_sub_ratio=0.0, 91 | global_samp_sub_count_max=0.0, 92 | global_threshold_suggested=10, 93 | ) 94 | } 95 | } 96 | 97 | TEST_GLOBAL_TABLE_2 = { 98 | "malicious": {}, 99 | } 100 | 101 | TEST_GLOBAL_STATS_2 = {} 102 | 103 | TEST_LOCAL_TABLE_2 = {} 104 | 105 | TEST_LOCAL_STATS_2 = {} 106 | 107 | TEST_PEAKS_2 = {} 108 | 109 | TEST_GLOBAL_TABLE_3 = {} 110 | 111 | TEST_GLOBAL_STATS_3 = {} 112 | 113 | TEST_LOCAL_TABLE_3 = {} 114 | 115 | TEST_LOCAL_STATS_3 = {} 116 | 117 | TEST_PEAKS_3 = {} 118 | 119 | 120 | @ddt.ddt 121 | class TestFileTypePeakAnalyzerTinaBackend(unittest.TestCase): 122 | """Class to test the manager.""" 123 | 124 | @ddt.data( 125 | (TEST_GLOBAL_TABLE_1, TEST_GLOBAL_STATS_1), 126 | (TEST_GLOBAL_TABLE_2, TEST_GLOBAL_STATS_2), 127 | (TEST_GLOBAL_TABLE_3, TEST_GLOBAL_STATS_3), 128 | ) 129 | def test_get_global_tables_stats(self, args): 130 | """Test the 'get_global_tables_stats' method.""" 131 | global_tables, expected_stats = args 132 | backend_mock = mock.MagicMock(spec=backends.TwoIndexTwoDimensionBackend) 133 | peak_analyzer = analyzers.FileTypePeakAnalyzer( 134 | conf=configparser.ConfigParser(), 135 | backend=backend_mock, 136 | start_ts=datetime.datetime.utcnow() - datetime.timedelta(days=7), 137 | end_ts=datetime.datetime.utcnow(), 138 | ) 139 | stats = peak_analyzer.get_global_tables_stats(global_tables) 140 | self.assertEqual(stats, expected_stats) 141 | 142 | @ddt.data( 143 | (TEST_LOCAL_TABLE_1, TEST_LOCAL_STATS_1), 144 | (TEST_LOCAL_TABLE_2, TEST_LOCAL_STATS_2), 145 
| (TEST_LOCAL_TABLE_3, TEST_LOCAL_STATS_3), 146 | ) 147 | def test_get_local_tables_stats(self, args): 148 | """Test the 'get_local_tables_stats' method.""" 149 | local_tables, expected_stats = args 150 | backend_mock = mock.MagicMock(spec=backends.TwoIndexTwoDimensionBackend) 151 | peak_analyzer = analyzers.FileTypePeakAnalyzer( 152 | conf=configparser.ConfigParser(), 153 | backend=backend_mock, 154 | start_ts=datetime.datetime.utcnow() - datetime.timedelta(days=7), 155 | end_ts=datetime.datetime.utcnow(), 156 | ) 157 | stats = peak_analyzer.get_local_tables_stats(local_tables) 158 | self.assertEqual(stats, expected_stats) 159 | 160 | @ddt.data( 161 | (TEST_GLOBAL_TABLE_1, TEST_LOCAL_TABLE_1, TEST_PEAKS_1), 162 | (TEST_GLOBAL_TABLE_2, TEST_LOCAL_TABLE_2, TEST_PEAKS_2), 163 | (TEST_GLOBAL_TABLE_3, TEST_LOCAL_TABLE_3, TEST_PEAKS_3), 164 | ) 165 | def test_get_peaks(self, args): 166 | """Test the 'get_peaks' method.""" 167 | global_tables, local_tables, expected_peaks = args 168 | backend_mock = mock.MagicMock(spec=backends.TwoIndexTwoDimensionBackend) 169 | peak_analyzer = analyzers.FileTypePeakAnalyzer( 170 | conf=configparser.ConfigParser(), 171 | backend=backend_mock, 172 | start_ts=datetime.datetime.utcnow() - datetime.timedelta(days=7), 173 | end_ts=datetime.datetime.utcnow(), 174 | ) 175 | peaks = peak_analyzer.get_peaks(global_tables, local_tables) 176 | self.assertEqual(peaks, expected_peaks) 177 | 178 | 179 | if __name__ == "__main__": 180 | unittest.main() 181 | -------------------------------------------------------------------------------- /src/telemetry_peak_analyzer/backends/tina.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 VMware, Inc. 2 | # SPDX-License-Identifier: BSD-2 3 | import collections 4 | import configparser 5 | import datetime 6 | import logging 7 | from typing import Any 8 | from typing import Dict 9 | from typing import List 10 | 11 | from telemetry_peak_analyzer import backends 12 | 13 | try: 14 | from tina_client.storage import readers 15 | except ImportError: 16 | raise ImportError("The Tina backend requires tina-client.") from None 17 | 18 | 19 | logging.getLogger("elasticsearch").setLevel(logging.WARNING) 20 | 21 | 22 | class TinaBackend(backends.TwoIndexTwoDimensionBackend): 23 | """ 24 | Backend using Tina (an internal Elasticsearch cluster). 25 | 26 | Note: in this class we hide all the specialization required to port the existing logic; note 27 | that this is unlikely to work with different analyzer and generalizing this bit further 28 | might not be worth the cost. 29 | """ 30 | 31 | @staticmethod 32 | def _get_aggregation_query( 33 | index: List[str], 34 | dimensions: List[str], 35 | dimensions_values: Dict[str, List[str]], 36 | ) -> Dict[str, Dict]: 37 | """ 38 | Return a fully loaded Elasticsearch aggregation query. 
39 | 40 | :param list[str] index: the index 41 | :param list[str] dimensions: the dimensions 42 | :param dict[str, list[str]] dimensions_values: the values of dimensions (if available) 43 | :rtype: dict[str, dict] 44 | :return: the aggregation to run 45 | """ 46 | 47 | def _get_aggregation() -> Dict[str, Dict]: 48 | """Return the aggregation for each dimension.""" 49 | return { 50 | "sub_per_day": { 51 | "date_histogram": { 52 | "field": index[0], 53 | "calendar_interval": "day", 54 | "min_doc_count": 0, 55 | }, 56 | "aggs": { 57 | "samp_count": { 58 | "cardinality": { 59 | "field": index[1], 60 | } 61 | }, 62 | "samp_sub_count": { 63 | "bucket_script": { 64 | "buckets_path": { 65 | "sub_count": "_count", 66 | "samp_count": "samp_count", 67 | }, 68 | "script": "params.sub_count/params.samp_count", 69 | } 70 | }, 71 | }, 72 | }, 73 | "samp_count_avg": {"avg_bucket": {"buckets_path": "sub_per_day.samp_count"}}, 74 | "samp_count_max": {"max_bucket": {"buckets_path": "sub_per_day.samp_count"}}, 75 | "samp_sub_count_max": { 76 | "max_bucket": {"buckets_path": "sub_per_day>samp_sub_count"} 77 | }, 78 | "sub_count_avg": {"avg_bucket": {"buckets_path": "sub_per_day._count"}}, 79 | "sub_count_max": {"max_bucket": {"buckets_path": "sub_per_day._count"}}, 80 | } 81 | 82 | dimension_0_values = dimensions_values.get(dimensions[0], []) 83 | return { 84 | "aggs": { 85 | "my_buckets": { 86 | "composite": { 87 | "sources": [{"dimension": {"terms": {"field": dimensions[1]}}}] 88 | }, 89 | "aggs": { 90 | dimension_0_values[0]: { 91 | "filter": {"term": {dimensions[0]: dimension_0_values[0]}}, 92 | "aggs": _get_aggregation(), 93 | }, 94 | dimension_0_values[1]: { 95 | "filter": {"term": {dimensions[0]: dimension_0_values[1]}}, 96 | "aggs": _get_aggregation(), 97 | }, 98 | }, 99 | } 100 | }, 101 | "size": 0, 102 | } 103 | 104 | @staticmethod 105 | def _parse_aggregation_output( 106 | buckets: List[Dict[str, Dict[str, Any]]], 107 | dimensions: List[str], 108 | dimensions_values: Dict[str, List[str]], 109 | ) -> Dict[str, Dict[str, Dict[str, float]]]: 110 | """ 111 | Parse the aggregation from Elasticsearch into a backend-independent format. 
112 | 113 | :param list[dict[str, dict[str, any]]] buckets: the aggregation buckets 114 | :param list[str] dimensions: the dimensions 115 | :param dict[str, list[str]] dimensions_values: the values of some dimensions (if available) 116 | :rtype: dict[str, dict[str, dict[str, float]]] 117 | :return: the aggregation results indexed by dimension 118 | """ 119 | aggregation_keys = frozenset( 120 | [ 121 | "sub_count_avg", 122 | "sub_count_max", 123 | "samp_count_avg", 124 | "samp_count_max", 125 | "samp_sub_count_max", 126 | ] 127 | ) 128 | dimension_0_values = dimensions_values.get(dimensions[0], []) 129 | ret = collections.defaultdict(dict) 130 | for bucket in buckets: 131 | for dimension_0 in dimension_0_values: 132 | if bucket[dimension_0]["doc_count"] > 0: 133 | dimension_1 = bucket["key"]["dimension"] 134 | ret[dimension_0][dimension_1] = { 135 | k: bucket[dimension_0][k]["value"] 136 | for k in bucket[dimension_0] 137 | if k in aggregation_keys 138 | } 139 | return ret 140 | 141 | def __init__(self, conf: configparser.ConfigParser, section_name: str) -> None: 142 | """Constructor.""" 143 | super(TinaBackend, self).__init__(conf, section_name) 144 | self._tina_reader = readers.BulkFileSubmissionReader(conf, section_name) 145 | self._logger.info( 146 | "Loading backend '%s' from section '%s'", 147 | self.__class__.__name__, 148 | section_name, 149 | ) 150 | 151 | def stats( 152 | self, 153 | start_date: datetime.datetime, 154 | end_date: datetime.datetime, 155 | index: List[str], 156 | dimensions: List[str], 157 | dimensions_values: Dict[str, List[str]], 158 | ) -> Dict[str, Dict[str, Dict[str, float]]]: 159 | """Implement interface.""" 160 | query = self._get_aggregation_query(index, dimensions, dimensions_values) 161 | buckets = [] 162 | while True: 163 | ret = self._tina_reader.search_raw( 164 | start_ts=start_date, 165 | end_ts=end_date, 166 | query=query, 167 | limit=0, 168 | ) 169 | buckets.extend(ret["aggregations"]["my_buckets"]["buckets"]) 170 | after_key = ret["aggregations"]["my_buckets"].get("after_key") 171 | if not after_key: 172 | break 173 | query["aggs"]["my_buckets"]["composite"]["after"] = after_key 174 | return self._parse_aggregation_output(buckets, dimensions, dimensions_values) 175 | 176 | def group_by( 177 | self, 178 | start_date: datetime.datetime, 179 | end_date: datetime.datetime, 180 | index: List[str], 181 | dimensions: List[str], 182 | ) -> List[Dict[str, str]]: 183 | """Implement interface.""" 184 | buckets = self._tina_reader.aggregate( 185 | start_ts=start_date, 186 | end_ts=end_date, 187 | terms=dimensions + [index[1]], 188 | limit=None, 189 | ) 190 | # This might or might not be a generator so in-place edit can't work 191 | new_buckets = [] 192 | terms_set = set(dimensions + [index[1]]) 193 | for bucket in buckets: 194 | for k in bucket.keys(): 195 | if k in terms_set: 196 | # Make sure we always deal with strings 197 | bucket[k] = str(bucket[k]) 198 | new_buckets.append(bucket) 199 | return new_buckets 200 | 201 | 202 | class NetworkTinaBackend(TinaBackend): 203 | """New backend specialized for network events.""" 204 | 205 | def __init__(self, conf: configparser.ConfigParser, section_name: str) -> None: 206 | """Constructor.""" 207 | backends.TwoIndexTwoDimensionBackend.__init__(self, conf, section_name) 208 | self._tina_reader = readers.BulkEventReader(conf, section_name) 209 | self._logger.info( 210 | "Loading backend '%s' from section '%s'", 211 | self.__class__.__name__, 212 | section_name, 213 | ) 214 | 215 | def stats( 216 | self, 217 | 
start_date: datetime.datetime, 218 | end_date: datetime.datetime, 219 | index: List[str], 220 | dimensions: List[str], 221 | dimensions_values: Dict[str, List[str]], 222 | ) -> Dict[str, Dict[str, Dict[str, float]]]: 223 | return super(NetworkTinaBackend, self).stats(start_date, end_date, index, dimensions, dimensions_values) 224 | 225 | def group_by( 226 | self, 227 | start_date: datetime.datetime, 228 | end_date: datetime.datetime, 229 | index: List[str], 230 | dimensions: List[str], 231 | ) -> List[Dict[str, str]]: 232 | return super(NetworkTinaBackend, self).group_by(start_date, end_date, index, dimensions) 233 | -------------------------------------------------------------------------------- /src/telemetry_peak_analyzer/__main__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 VMware, Inc. 2 | # SPDX-License-Identifier: BSD-2 3 | import argparse 4 | import configparser 5 | import datetime 6 | import logging 7 | import os 8 | import sys 9 | from typing import Optional 10 | from typing import Tuple 11 | 12 | import telemetry_peak_analyzer 13 | 14 | 15 | def import_class(clazz_name: str) -> type: 16 | """ 17 | Import the module and return the class. 18 | 19 | Example: 20 | > clazz = get_clazz_from_module("package.module.ClassName") 21 | > logging.debug("Instantiating %s instance...", clazz.__name__) 22 | > obj = clazz(conf) 23 | 24 | :param str clazz_name: class name in 'module.Class' form 25 | :rtype: type 26 | :return: the loadable type 27 | :raises ImportError: if the class name is not valid 28 | """ 29 | if "." not in clazz_name: 30 | raise ImportError(f"Class '{clazz_name}' does not appear to be in module.Class form") 31 | 32 | try: 33 | only_clazz = clazz_name.split(".")[-1] 34 | only_module = ".".join(clazz_name.split(".")[:-1]) 35 | mod = __import__(only_module, fromlist=[only_clazz]) 36 | return getattr(mod, only_clazz) 37 | except AttributeError as ae: 38 | raise ImportError(f"Class '{clazz_name}' not found") from ae 39 | 40 | 41 | def is_valid_date(date_str: str) -> datetime.date: 42 | """ 43 | Validate a date and return a datetime object. 44 | 45 | :param str date_str: the datetime object as a string 46 | :rtype: datetime.date 47 | :return: the parsed date 48 | :raises ValueError: if the date is not valid 49 | """ 50 | try: 51 | return datetime.datetime.strptime(date_str, "%Y-%m-%d").date() 52 | except ValueError: 53 | raise ValueError(f"Not a valid date: '{date_str}'") from None 54 | 55 | 56 | def parse_date_options( 57 | start_ts: datetime.date, 58 | end_ts: datetime.date, 59 | delta: int, 60 | delay: int, 61 | ) -> Tuple[datetime.datetime, datetime.datetime]: 62 | """ 63 | Validate the date options. 
64 | 65 | :param datetime.date start_ts: the start of the time interval 66 | :param datetime.date end_ts: the end of the time interval 67 | :param int delta: the length of the time interval 68 | :param int delay: the delay of the time interval 69 | :rtype: tuple[datetime.datetime, datetime.datetime] 70 | :return: the validated datetime objects 71 | :raises ValueError: if the provided interval is not valid 72 | """ 73 | if start_ts and end_ts: 74 | if end_ts <= start_ts: 75 | raise ValueError(f"Invalid time interval {start_ts} - {end_ts}") 76 | else: 77 | utc_now = datetime.datetime.utcnow() 78 | end_ts = (utc_now - datetime.timedelta(days=delay)).date() 79 | start_ts = end_ts - datetime.timedelta(days=delta) 80 | return ( 81 | datetime.datetime.combine(start_ts, datetime.datetime.min.time()), 82 | datetime.datetime.combine(end_ts, datetime.datetime.min.time()), 83 | ) 84 | 85 | 86 | def run( 87 | config: configparser.ConfigParser, 88 | analyzer_class: type, 89 | backend_class: type, 90 | backend_input: str, 91 | start_ts: datetime.datetime, 92 | end_ts: datetime.datetime, 93 | threshold: int, 94 | global_table_path: str, 95 | output_file_path: Optional[str] = None, 96 | ) -> int: 97 | """Run the telemetry peak analyzer.""" 98 | logger = logging.getLogger(__name__) 99 | logger.info("Loading Peak Analyzer from %s to %s with t=%s", start_ts, end_ts, threshold) 100 | peak_analyzer = analyzer_class( 101 | conf=config, 102 | backend=backend_class(config, backend_input), 103 | start_ts=start_ts, 104 | end_ts=end_ts, 105 | ) 106 | try: 107 | logger.info("Loading global tables from file '%s'", global_table_path) 108 | global_tables = peak_analyzer.get_global_tables_from_file(global_table_path) 109 | except IOError as ioe: 110 | logger.info("\tFailed: %s", str(ioe)) 111 | logger.info("Loading global tables from the backend") 112 | global_tables = peak_analyzer.get_global_tables() 113 | logger.info("Loading local tables") 114 | local_tables = peak_analyzer.get_local_tables() 115 | peaks = peak_analyzer.get_peaks(global_tables, local_tables, threshold=threshold) 116 | 117 | logger.info("Getting peaks") 118 | for dimension_0 in peaks: 119 | for dimension_1 in peaks[dimension_0]: 120 | logger.info("TelemetryPeak(%s, %s)", dimension_0, dimension_1) 121 | peak = peaks[dimension_0][dimension_1] 122 | for k, v in peak._asdict().items(): 123 | logger.info("\t%s: %s", k, round(v, 2)) 124 | if output_file_path: 125 | logger.info("Saving output to: %s", output_file_path) 126 | telemetry_peak_analyzer.save_to_json(peaks, output_file_path) 127 | 128 | logger.info("Refreshing global tables") 129 | global_tables = peak_analyzer.refresh_global_tables(global_tables, local_tables) 130 | 131 | logger.info("Saving global tables to '%s'", global_table_path) 132 | telemetry_peak_analyzer.save_to_json(global_tables, global_table_path) 133 | return 0 134 | 135 | 136 | def parse_and_run_command(): 137 | """ 138 | Examples: 139 | # python -m telemetry_peak_analyzer \ 140 | -b telemetry_peak_analyzer.backends.JsonBackend -n "~/data.*.json" \ 141 | -s 2020-07-01 -e 2021-08-01 -t 10 142 | # python -m telemetry_peak_analyzer -c config.ini \ 143 | -b telemetry_peak_analyzer.backends.tina.TinaBackend -n tina_backend \ 144 | -s 2020-07-01 -e 2021-08-01 -t 10 145 | """ 146 | parser = argparse.ArgumentParser() 147 | parser.add_argument( 148 | "-c", 149 | "--config-file", 150 | dest="config_file", 151 | default="./data/config.ini", 152 | type=str, 153 | help="read config from here", 154 | ) 155 | # Time interval option 1: specify start 
and end datetime 156 | parser.add_argument( 157 | "-s", 158 | "--start-date", 159 | dest="start_date", 160 | default=None, 161 | type=is_valid_date, 162 | help="the start of the time interval in 'YYYY:mm:dd' format", 163 | ) 164 | parser.add_argument( 165 | "-e", 166 | "--end-date", 167 | dest="end_date", 168 | default=None, 169 | type=is_valid_date, 170 | help="the end of the time interval in 'YYYY:mm:dd' format", 171 | ) 172 | # Time interval option 2: specify the length and the delay of the time interval 173 | parser.add_argument( 174 | "-d", 175 | "--delta", 176 | dest="delta", 177 | default=1, 178 | type=int, 179 | help="the length of the time interval starting from now", 180 | ) 181 | parser.add_argument( 182 | "-k", 183 | "--delay", 184 | dest="delay", 185 | default=0, 186 | type=int, 187 | help="the delay of the time interval expressed in days", 188 | ) 189 | # Other options 190 | parser.add_argument( 191 | "-t", 192 | "--threshold", 193 | dest="threshold", 194 | default=None, 195 | type=int, 196 | help="the threshold used by the telemetry peak analyzer", 197 | ) 198 | parser.add_argument( 199 | "-a", 200 | "--analyzer", 201 | dest="analyzer_class", 202 | default="telemetry_peak_analyzer.analyzers.FileTypePeakAnalyzer", 203 | type=import_class, 204 | help="the full class name of the analyzer used to process telemetry data", 205 | ) 206 | parser.add_argument( 207 | "-b", 208 | "--backend-class", 209 | dest="backend_class", 210 | default="telemetry_peak_analyzer.backends.JsonBackend", 211 | type=import_class, 212 | help="the full class name of the backend used to read the telemetry from", 213 | ) 214 | parser.add_argument( 215 | "-n", 216 | "--backend-input", 217 | dest="backend_input", 218 | required=True, 219 | type=str, 220 | help="the backend input, section name when reading remotely or a file for local input", 221 | ) 222 | parser.add_argument( 223 | "-m", 224 | "--global-table", 225 | dest="global_table_path", 226 | default="global_table.json", 227 | type=str, 228 | help="the path to the global tables, either to load from, or to save to", 229 | ) 230 | parser.add_argument( 231 | "-o", 232 | "--output-file", 233 | dest="output_file", 234 | default=None, 235 | type=str, 236 | help="the path to output file", 237 | ) 238 | parser.add_argument( 239 | "-v", 240 | "--verbose", 241 | dest="verbose", 242 | default=False, 243 | action="store_true", 244 | help="whether to be verbose", 245 | ) 246 | 247 | # Parse options and init the logger 248 | args = parser.parse_args() 249 | conf = configparser.ConfigParser() 250 | conf.read(args.config_file) 251 | start_date, end_date = parse_date_options( 252 | args.start_date, 253 | args.end_date, 254 | args.delta, 255 | args.delay, 256 | ) 257 | log_level = logging.DEBUG if args.verbose else logging.INFO 258 | telemetry_peak_analyzer.MemoryFootprintFormatter.configure_logging(log_level) 259 | 260 | # Run 261 | return run( 262 | conf, 263 | args.analyzer_class, 264 | args.backend_class, 265 | args.backend_input, 266 | start_date, 267 | end_date, 268 | args.threshold, 269 | os.path.abspath(args.global_table_path), 270 | os.path.abspath(args.output_file) if args.output_file else None, 271 | ) 272 | 273 | 274 | if __name__ == "__main__": 275 | sys.exit(parse_and_run_command()) 276 | -------------------------------------------------------------------------------- /src/telemetry_peak_analyzer/backends/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 VMware, Inc. 
2 | # SPDX-License-Identifier: BSD-2 3 | import abc 4 | import collections 5 | import configparser 6 | import datetime 7 | import glob 8 | import logging 9 | import os 10 | import statistics 11 | from typing import Any 12 | from typing import Dict 13 | from typing import List 14 | from typing import TypeVar 15 | 16 | import ijson 17 | import telemetry_peak_analyzer 18 | 19 | BackendType = TypeVar("BackendType", bound="AbstractBackend") 20 | 21 | 22 | class AbstractBackend(abc.ABC): 23 | """Abstract backend.""" 24 | 25 | def __init__(self, conf: configparser.ConfigParser, section_name: str) -> None: 26 | """ 27 | Constructor. 28 | 29 | :param configparser.ConfigParser conf: the conf object 30 | :param str section_name: the name of the section 31 | """ 32 | self._conf = conf 33 | self._section_name = section_name 34 | self._logger = logging.getLogger(__name__) 35 | self._logger.info("Loading backend '%s'", self.__class__.__name__) 36 | 37 | @abc.abstractmethod 38 | def stats( 39 | self, 40 | start_date: datetime.datetime, 41 | end_date: datetime.datetime, 42 | index: List[str], 43 | dimensions: List[str], 44 | dimensions_values: Dict[str, List[str]], 45 | ) -> Dict[str, Any]: 46 | """ 47 | Create statistics. 48 | 49 | :param datetime.datetime start_date: the start of the time interval 50 | :param datetime.datetime end_date: the end of the time interval 51 | :param list[str] index: the index 52 | :param list[str] dimensions: the dimensions 53 | :param dict[str, list[str]] dimensions_values: the values of dimensions (if available) 54 | :rtype: dict[str, any] 55 | :return: statistics for each dimension combination 56 | """ 57 | 58 | @abc.abstractmethod 59 | def group_by( 60 | self, 61 | start_date: datetime.datetime, 62 | end_date: datetime.datetime, 63 | index: List[str], 64 | dimensions: List[str], 65 | ) -> List[Dict[str, str]]: 66 | """ 67 | Group by. 68 | 69 | :param datetime.datetime start_date: the start of the time interval 70 | :param datetime.datetime end_date: the end of the time interval 71 | :param list[str] index: the index 72 | :param list[str] dimensions: list of dimensions to group by 73 | :rtype: list[dict[str, str]] 74 | :return: buckets for each dimension combination 75 | """ 76 | 77 | 78 | class TwoIndexTwoDimensionBackend(AbstractBackend, abc.ABC): 79 | """Backend accepting an index with two fields and two dimensions.""" 80 | 81 | @abc.abstractmethod 82 | def stats( 83 | self, 84 | start_date: datetime.datetime, 85 | end_date: datetime.datetime, 86 | index: List[str], 87 | dimensions: List[str], 88 | dimensions_values: Dict[str, List[str]], 89 | ) -> Dict[str, Dict[str, Dict[str, float]]]: 90 | """ 91 | Create statistics. 92 | 93 | :param datetime.datetime start_date: the start of the time interval 94 | :param datetime.datetime end_date: the end of the time interval 95 | :param list[str] index: the index 96 | :param list[str] dimensions: the dimensions 97 | :param dict[str, list[str]] dimensions_values: the values of dimensions (if available) 98 | :rtype: dict[str, dict[str, dict[str, float]]] 99 | :return: a dictionary like: 100 | { 101 | "dimension_0_value": { 102 | "dimension_1_value": { 103 | "sub_count_avg": average of submissions 104 | "sub_count_max": maximum number of submissions 105 | "samp_count_avg": average of samples 106 | "samp_count_max": maximum number of samples 107 | "samp_sub_count_max": maximum ratio between samples and submissions 108 | } 109 | ... 110 | } 111 | ... 
112 | } 113 | """ 114 | 115 | @abc.abstractmethod 116 | def group_by( 117 | self, 118 | start_date: datetime.datetime, 119 | end_date: datetime.datetime, 120 | index: List[str], 121 | dimensions: List[str], 122 | ) -> List[Dict[str, str]]: 123 | """ 124 | Group by. 125 | 126 | :param datetime.datetime start_date: the start of the time interval 127 | :param datetime.datetime end_date: the end of the time interval 128 | :param list[str] index: the index 129 | :param list[str] dimensions: the dimensions 130 | :rtype: list[dict[str, str]] 131 | :return: a list of dictionaries like: 132 | [ 133 | { 134 | "dimension_0": "benign", 135 | "dimension_1": "ExcelMsDocFile", 136 | "index_1": "0015cc85a17d707e00b9881a149c232d181ad451", 137 | "additional_dimension_0": "3549", 138 | "additional_dimension_1": "API", 139 | "count": 61 140 | } 141 | ... 142 | ] 143 | """ 144 | 145 | 146 | class JsonBackend(TwoIndexTwoDimensionBackend): 147 | """Backend using JSON files.""" 148 | 149 | def __init__(self, conf: configparser.ConfigParser, file_path_wildcard: str) -> None: 150 | """Constructor.""" 151 | super(JsonBackend, self).__init__(conf, section_name="not_used") 152 | self._file_paths = [] 153 | for name in glob.glob(file_path_wildcard): 154 | self._file_paths.append(os.path.abspath(name)) 155 | self._logger.info("Loaded files:") 156 | for file_path in self._file_paths: 157 | self._logger.info("\t%s", file_path) 158 | 159 | def stats( 160 | self, 161 | start_date: datetime.datetime, 162 | end_date: datetime.datetime, 163 | index: List[str], 164 | dimensions: List[str], 165 | dimensions_values: Dict[str, List[str]], 166 | ) -> Dict[str, Dict[str, Dict[str, float]]]: 167 | """Implement interface.""" 168 | # we do several passes so to keep memory usage to a minimum 169 | # pass 1, let us get all the dates 170 | dates = set([]) 171 | for file_path in self._file_paths: 172 | with open(file_path, "r") as f: 173 | for json_doc in ijson.items(f, "item"): 174 | index_0 = telemetry_peak_analyzer.ms_to_datetime(json_doc[index[0]]) 175 | if start_date <= index_0 < end_date: 176 | dates.add(index_0.date()) 177 | 178 | # pass 2, for each date get the stats 179 | buckets = collections.defaultdict(lambda: collections.defaultdict(dict)) 180 | all_dims_0 = set([]) 181 | all_dims_1 = set([]) 182 | for day_date in sorted(dates): 183 | sub_count = collections.defaultdict(lambda: collections.defaultdict(int)) 184 | samp_set = collections.defaultdict(lambda: collections.defaultdict(set)) 185 | dims_0 = set([]) 186 | dims_1 = set([]) 187 | for file_path in self._file_paths: 188 | with open(file_path, "r") as f: 189 | for json_doc in ijson.items(f, "item"): 190 | index_0 = telemetry_peak_analyzer.ms_to_datetime( 191 | json_doc[index[0]] 192 | ).date() 193 | index_1 = json_doc[index[1]] 194 | dimension_0 = json_doc[dimensions[0]] 195 | dimension_1 = json_doc[dimensions[1]] 196 | if index_0 == day_date: 197 | sub_count[dimension_0][dimension_1] += 1 198 | samp_set[dimension_0][dimension_1].add(index_1) 199 | dims_0.add(dimension_0) 200 | dims_1.add(dimension_1) 201 | for dim_0 in dims_0: 202 | for dim_1 in dims_1: 203 | try: 204 | samp_sub_count = sub_count[dim_0][dim_1] / len(samp_set[dim_0][dim_1]) 205 | except ZeroDivisionError: 206 | samp_sub_count = 0 207 | buckets[day_date][dim_0][dim_1] = { 208 | "sub_count": sub_count[dim_0][dim_1], 209 | "samp_count": len(samp_set[dim_0][dim_1]), 210 | "samp_sub_count": samp_sub_count, 211 | } 212 | all_dims_0.update(dims_0) 213 | all_dims_1.update(dims_1) 214 | ret = 
collections.defaultdict(dict)
215 |         for dim_0 in all_dims_0:
216 |             for dim_1 in all_dims_1:
217 |                 d_slice = [buckets[x].get(dim_0, {}).get(dim_1, {}) for x in dates]
218 |                 ret[dim_0][dim_1] = {
219 |                     "sub_count_avg": statistics.mean([x.get("sub_count", 0) for x in d_slice]),
220 |                     "sub_count_max": max([x.get("sub_count", 0) for x in d_slice]),
221 |                     "samp_count_avg": statistics.mean([x.get("samp_count", 0) for x in d_slice]),
222 |                     "samp_count_max": max([x.get("samp_count", 0) for x in d_slice]),
223 |                     "samp_sub_count_max": max([x.get("samp_sub_count", 0) for x in d_slice]),
224 |                 }
225 |         return ret
226 | 
227 |     def group_by(
228 |         self,
229 |         start_date: datetime.datetime,
230 |         end_date: datetime.datetime,
231 |         index: List[str],
232 |         dimensions: List[str],
233 |     ) -> List[Dict[str, str]]:
234 |         """Implement interface."""
235 |         counters = collections.Counter()
236 |         dimensions = dimensions + [index[1]]
237 |         for file_path in self._file_paths:
238 |             with open(file_path, "r") as f:
239 |                 for json_doc in ijson.items(f, "item"):
240 |                     index_0 = telemetry_peak_analyzer.ms_to_datetime(json_doc[index[0]])
241 |                     if start_date <= index_0 < end_date:
242 |                         counters[tuple([json_doc[dimension] for dimension in dimensions])] += 1
243 |         ret = []
244 |         for key, count in counters.items():
245 |             value = {attr: key[idx] for idx, attr in enumerate(dimensions)}
246 |             value["count"] = count
247 |             ret.append(value)
248 |         return ret
249 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ![GitHub tag (latest SemVer)](https://img.shields.io/github/tag/vmware-labs/telemetry-peak-analyzer)
2 | ![GitHub](https://img.shields.io/pypi/l/telemetry-peak-analyzer)
3 | ![GitHub issues](https://img.shields.io/github/issues/vmware-labs/telemetry-peak-analyzer)
4 | 
5 | # Telemetry Peak Analyzer
6 | 
7 | ## Overview
8 | 
9 | Telemetry Peak Analyzer is a framework to analyze and detect peaks in telemetry data with multiple
10 | dimensions, indexes, and attributes. The analyzer detects meaningful peaks based on statistical
11 | measurements computed over a short _local window_ and a longer _global window_ of telemetry data:
12 | 
13 | - _Local window_ - a short data window in which we want to detect peaks of a given attribute
14 | or dimension, e.g., file type. During the detection process, the analyzer generates a local
15 | statistics table (LST) with all the necessary statistical measurements.
16 | 
17 | - _Global window_ - a longer, historical data window which serves as a global benchmark to
18 | determine if a detected peak within the _local window_ is meaningful. During the detection process,
19 | the analyzer generates (or updates) a global statistics table (GST) with all the necessary statistical
20 | measurements.
21 | 
22 | Telemetry data is dynamic; therefore, the global benchmark as reflected by the GST needs to be updated
23 | over time. To make the global benchmark adaptive, we use a sliding window mechanism which allows
24 | us to quickly derive the new GST from the previous GST and the LST.
25 | 
26 | *Note*: this implementation is a generalization of a research tool that was tailored to detect waves
27 | of malicious files sharing the same file type; to fully generalize terms and components, the source
28 | code relies on the following terms to describe different parts of the telemetry feed:
29 | - `index`: a tuple of attributes used to uniquely identify a telemetry data record.
30 | - `dimensions`: the attributes used to decompose a time-series into independent and orthogonal
31 | time-series.
32 | 
33 | Such generalization is not perfect (for example, the current implementation does not support more
34 | than two dimensions) and some backends have obvious limitations; things will improve as the
35 | analyzer supports more types of telemetry data.
36 | 
37 | ## Try it out
38 | 
39 | ### Build & Run
40 | 
41 | This package can be installed via pip: just run `pip install telemetry-peak-analyzer` or
42 | `pip install -e .`.
43 | 
44 | If you want to install the dependencies required by the `tina` backend (a custom backend based
45 | on Elasticsearch used internally), you should append the `[tina]` extra option; you might need to
46 | use double quotes when doing a dev install, i.e., `pip install -e ".[tina]"`; note that a valid
47 | configuration file might be required. See `data/config.ini.template` for an example.
48 | 
49 | Extra backends might require private dependencies; if that is the case, remember to select the
50 | internal index server using the `-i` option; if you require access, contact one of the maintainers.
51 | 
52 | ### Scripts
53 | 
54 | This package includes a console script ready to be used. Examples:
55 | 
56 | * `python -m telemetry_peak_analyzer -b
57 | telemetry_peak_analyzer.backends.JsonBackend -n "./data/telemetry_example_*" -t 10`:
58 | in this example the peak analyzer reads from some local files using the JSON backend
59 | (note the double quotes) and sets the threshold to 10; note that when `-t` is specified, it
60 | will override any suggested global threshold defined in the GST.
61 | * `python -m telemetry_peak_analyzer -c config.ini -b
62 | telemetry_peak_analyzer.backends.tina.TinaBackend -n tina_nlemea -d 2`:
63 | in this example the peak analyzer reads the last 2 days of data from Tina, using the
64 | configuration file `config.ini` and its `tina_nlemea` section to know how to connect to the
65 | backend.
66 | 
67 | ### Test
68 | There are a number of JSON files in the `data` directory for testing with the JSON backend.
69 | Note that all the test files have been completely anonymized, to the point that even file hashes
70 | do not refer to actual files anymore.
71 | 
72 | As mentioned above, the analyzer detects peaks based on statistical measurements of both a
73 | _local window_ and a _global window_. In the detailed example below, the process comprises two steps.
74 | 
75 | 1) `python -m telemetry_peak_analyzer -n ./data/telemetry_example_3.json -s 2020-11-01 -e 2020-11-04`
76 | 
77 | This step generates an initial GST table as the global benchmark from the initial
78 | _global window_, as specified by the `-s` and `-e` options in the command. This step is only required
79 | the first time the analyzer is executed. Subsequent runs will update the GST using previously
80 | computed GST and LST tables.
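The GST produced by this step is persisted to `global_table.json` (see the output below). Judging
from how `TwoIndexTwoDimensionAnalyzer.get_global_tables_from_file()` parses the file back, each
entry maps a pair of dimension values to a flat list of statistics; the following snippet is a
minimal, hypothetical inspection helper (not part of the package) written under that assumption:

```
import json

# Field order mirrors TwoIndexTwoDimensionAnalyzer.get_global_tables_from_file()
GST_FIELDS = [
    "start_ts", "end_ts", "window_count",
    "sub_count_avg", "sub_count_max",
    "samp_count_avg", "samp_count_max",
    "samp_sub_count_avg", "samp_sub_count_max",
    "threshold_suggested",
]

with open("global_table.json", "r") as f:
    gst = json.load(f)

for dimension_0, tables in gst.items():      # e.g., "malicious"
    for dimension_1, row in tables.items():  # e.g., "ZipArchiveFile"
        print(dimension_0, dimension_1, dict(zip(GST_FIELDS, row)))
```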
81 | 
82 | Expected output:
83 | 
84 | ```
85 | test@localhost telemetry-peak-analyzer % python -m telemetry_peak_analyzer -n ./data/telemetry_example_3.json -s 2020-11-01 -e 2020-11-04
86 | INFO -> [2021-09-15 12:00:11] [0010mb] Loading Peak Analyzer from 2020-11-01 00:00:00 to 2020-11-04 00:00:00 with t=None
87 | INFO -> [2021-09-15 12:00:11] [0010mb] Loading backend 'JsonBackend'
88 | INFO -> [2021-09-15 12:00:11] [0010mb] Loaded files:
89 | INFO -> [2021-09-15 12:00:11] [0010mb] /Users/test/telemetry-peak-analyzer/data/telemetry_example_3.json
90 | INFO -> [2021-09-15 12:00:11] [0010mb] Loading analyzer 'FileTypePeakAnalyzer' with backend 'JsonBackend'
91 | INFO -> [2021-09-15 12:00:11] [0010mb] Loading global tables from file '/Users/test/telemetry-peak-analyzer/global_table.json'
92 | INFO -> [2021-09-15 12:00:11] [0010mb] Failed: [Errno 2] No such file or directory: '/Users/test/telemetry-peak-analyzer/global_table.json'
93 | INFO -> [2021-09-15 12:00:11] [0010mb] Loading global tables from the backend
94 | INFO -> [2021-09-15 12:00:12] [0012mb] Loading local tables
95 | INFO -> [2021-09-15 12:00:12] [0013mb] Getting peaks
96 | INFO -> [2021-09-15 12:00:12] [0013mb] Refreshing global tables
97 | INFO -> [2021-09-15 12:00:12] [0013mb] Saving global tables to '/Users/test/telemetry-peak-analyzer/global_table.json'
98 | ```
99 | 
100 | As the output shows, the process creates a JSON file `global_table.json`, which is the initial
101 | GST table containing the global statistics.
102 | 
103 | 2) `python -m telemetry_peak_analyzer -n ./data/telemetry_example_3.json -s 2020-11-04 -e 2020-11-05`
104 | 
105 | This step finally detects peaks within a _local window_ (as specified by the `-s` and `-e` options)
106 | by leveraging the statistics in the GST and LST tables. This run will also update the GST (ideally,
107 | in production, you want to execute this second command on a daily basis to minimize the data to be
108 | processed).
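Internally, a combination of dimension values observed in the _local window_ is only reported as a
peak when its submission count reaches the global threshold (the `-t` override or the suggested
global threshold) and at least one of three statistical conditions holds. The snippet below is a
simplified, standalone restatement of `TwoIndexTwoDimensionAnalyzer._is_peak()`; the actual method
operates on the `LocalTableStats` and `GlobalTableStats` models:

```
PEAK_GLOBAL_COUNT_WEIGHT = 0.8
PEAK_MIN_SAMP_SUB_RATIO = 0.5

def is_peak(local: dict, global_: dict) -> bool:
    # No peak unless the local submission count reaches the global threshold.
    if local["sub_count"] < global_["threshold"]:
        return False
    # 1) mean submissions-per-sample well above the global maximum, or
    # 2) a single sample dominating the submissions, or
    # 3) submissions-per-sample spiking above mean + standard deviation.
    high_mean = (
        local["samp_sub_count_mean"]
        > PEAK_GLOBAL_COUNT_WEIGHT * global_["samp_sub_count_max"]
    )
    dominant = local["samp_sub_ratio"] > PEAK_MIN_SAMP_SUB_RATIO
    spiky = (
        local["samp_sub_count_max"]
        > local["samp_sub_count_mean"] + local["samp_sub_count_std"]
    )
    return high_mean or dominant or spiky
```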
109 | 
110 | Expected output:
111 | 
112 | ```
113 | test@localhost telemetry-peak-analyzer % python -m telemetry_peak_analyzer -n ./data/telemetry_example_3.json -s 2020-11-04 -e 2020-11-05
114 | INFO -> [2021-09-15 12:00:46] [0010mb] Loading Peak Analyzer from 2020-11-04 00:00:00 to 2020-11-05 00:00:00 with t=None
115 | INFO -> [2021-09-15 12:00:46] [0010mb] Loading backend 'JsonBackend'
116 | INFO -> [2021-09-15 12:00:46] [0010mb] Loaded files:
117 | INFO -> [2021-09-15 12:00:46] [0010mb] /Users/test/telemetry-peak-analyzer/data/telemetry_example_3.json
118 | INFO -> [2021-09-15 12:00:46] [0010mb] Loading analyzer 'FileTypePeakAnalyzer' with backend 'JsonBackend'
119 | INFO -> [2021-09-15 12:00:46] [0010mb] Loading global tables from file '/Users/test/telemetry-peak-analyzer/global_table.json'
120 | INFO -> [2021-09-15 12:00:46] [0010mb] Loading local tables
121 | INFO -> [2021-09-15 12:00:46] [0015mb] Getting peaks
122 | INFO -> [2021-09-15 12:00:46] [0015mb] TelemetryPeak(malicious, ZipArchiveFile)
123 | INFO -> [2021-09-15 12:00:46] [0015mb] sub_count: 11083
124 | INFO -> [2021-09-15 12:00:46] [0015mb] samp_count: 3028
125 | INFO -> [2021-09-15 12:00:46] [0015mb] samp_sub_count_max: 426
126 | INFO -> [2021-09-15 12:00:46] [0015mb] samp_sub_count_mean: 3.66
127 | INFO -> [2021-09-15 12:00:46] [0015mb] samp_sub_count_std: 11.54
128 | INFO -> [2021-09-15 12:00:46] [0015mb] samp_sub_ratio: 0.04
129 | INFO -> [2021-09-15 12:00:46] [0015mb] global_samp_sub_count_max: 2
130 | INFO -> [2021-09-15 12:00:46] [0015mb] global_threshold_suggested: 629
131 | INFO -> [2021-09-15 12:00:46] [0015mb] Refreshing global tables
132 | INFO -> [2021-09-15 12:00:46] [0015mb] Saving global tables to '/Users/test/telemetry-peak-analyzer/global_table.json'
133 | ```
134 | 
135 | As the output shows, the analyzer loads the GST generated in the first step, successfully detects a
136 | ZipArchiveFile-based peak within the _local window_, and prints out some key statistical
137 | measurements generated during the detection process.
138 | 
139 | At the end of the process, the GST table gets updated.
140 | 
141 | 
142 | ## Contributing
143 | 
144 | The telemetry-peak-analyzer project team welcomes contributions from the community. Before you
145 | start working with telemetry-peak-analyzer, please read our
146 | [Developer Certificate of Origin](https://cla.vmware.com/dco). All contributions to this repository
147 | must be signed as described on that page. Your signature certifies that you wrote the patch or
148 | have the right to pass it on as an open-source patch. For more detailed information,
149 | refer to [CONTRIBUTING.md](CONTRIBUTING.md).
150 | 
151 | ## Development
152 | 
153 | Create the virtual env:
154 | 
155 | `python3 -m venv venv`
156 | 
157 | Activate the virtual env:
158 | 
159 | `source ./venv/bin/activate`
160 | 
161 | Install `tox`:
162 | 
163 | `pip install tox`
164 | 
165 | Run tests:
166 | 
167 | `tox`
168 | 
169 | Due to a bug in `tox`, if you update the dependencies in `setup.cfg` the environments will not be
170 | re-created, leading to errors when running the tests
171 | (see https://github.com/tox-dev/tox/issues/93).
172 | As a workaround, pass the `--recreate` flag after updating the dependencies.
173 | 
174 | Before committing, install the package in dev mode (needed by `pylint`) following the instructions
175 | detailed in the `Build & Run` section.
176 | 
177 | Then install `pylint` and `pre-commit`:
178 | 
179 | `pip install pylint pre-commit`
180 | 
181 | Install the hook:
182 | 
183 | `pre-commit install`
184 | 
185 | If you want to run `pre-commit` on all files, use the following command:
186 | 
187 | `pre-commit run --all-files`
188 | 
189 | ## License
190 | [BSD 2-Clause](https://spdx.org/licenses/BSD-2-Clause.html)
191 | 
192 | ## Extra
193 | The peak analyzer can also be used to detect network peaks. To do so, pass the `-a` option to select the `NetworkTypePeakAnalyzer` and the `-b` option to select the matching backend, i.e., the `NetworkTinaBackend`.
194 | 
195 | Because network peaks are far more numerous than file peaks in a given time range, it is recommended to use the `-e` and `-d` parameters to constrain the date and time range.
196 | 
197 | Test command:
198 | `python -m telemetry_peak_analyzer -c ./data/config.ini -a telemetry_peak_analyzer.analyzers.NetworkTypePeakAnalyzer -b telemetry_peak_analyzer.backends.tina.NetworkTinaBackend -n tina_westus -e 2022-04-01 -d 1 -o ./output.json`
--------------------------------------------------------------------------------
/src/telemetry_peak_analyzer/analyzers/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 VMware, Inc.
2 | # SPDX-License-Identifier: BSD-2
3 | import abc
4 | import collections
5 | import configparser
6 | import datetime
7 | import json
8 | import logging
9 | import statistics
10 | from abc import ABC
11 | from typing import Dict
12 | from typing import List
13 | from typing import Optional
14 | from typing import Set
15 | 
16 | import telemetry_peak_analyzer
17 | from telemetry_peak_analyzer import backends
18 | from telemetry_peak_analyzer import models
19 | 
20 | 
21 | class AbstractAnalyzer(abc.ABC):
22 |     """Abstract analyzer."""
23 | 
24 |     DEFAULT_GLOBAL_TABLE_AGE = datetime.timedelta(days=7)
25 | 
26 |     @staticmethod
27 |     def _get_window_count(start_ts: int, end_ts: int) -> int:
28 |         """
29 |         Get the window count.
30 | 
31 |         :param int start_ts: milliseconds
32 |         :param int end_ts: milliseconds
33 |         :rtype: int
34 |         :return: the window count
35 |         """
36 |         delta_hours = (end_ts - start_ts) // 1000 // 60 // 60
37 |         window_count = delta_hours // 24 if delta_hours % 24 == 0 else delta_hours
38 |         # if start_ts and end_ts from the buckets are the same, then count it as one window
39 |         if window_count == 0:
40 |             window_count = 1
41 |         return window_count
42 | 
43 |     def __init__(
44 |         self,
45 |         conf: configparser.ConfigParser,
46 |         backend: backends.BackendType,
47 |         index: List[str],
48 |         dimensions: List[str],
49 |         start_ts: datetime.datetime,
50 |         end_ts: datetime.datetime,
51 |     ) -> None:
52 |         """
53 |         Constructor.
54 | 55 | :param configparser.ConfigParser conf: the conf object 56 | :param BackendType backend: the backend 57 | :param list[str] index: the index 58 | :param list[str] dimensions: the dimensions 59 | :param datetime.datetime start_ts: the beginning of the time interval 60 | :param datetime.datetime end_ts: the end of the time interval 61 | """ 62 | self._conf = conf 63 | self._backend = backend 64 | self._index = index 65 | self._dimensions = dimensions 66 | self._start_ts = start_ts 67 | self._end_ts = end_ts 68 | self._logger = logging.getLogger(__name__) 69 | self._logger.info( 70 | "Loading analyzer '%s' with backend '%s'", 71 | self.__class__.__name__, 72 | self._backend.__class__.__name__, 73 | ) 74 | 75 | @abc.abstractmethod 76 | def get_peaks( 77 | self, 78 | global_tables: Dict, 79 | local_tables: Dict, 80 | threshold: Optional[int] = None, 81 | ) -> Dict: 82 | """ 83 | Get the peaks. 84 | 85 | :param dict global_tables: the global tables 86 | :param dict local_tables: the local tables 87 | :param int|None threshold: optional threshold 88 | :rtype: dict 89 | :return: the telemetry peaks 90 | """ 91 | 92 | @abc.abstractmethod 93 | def refresh_global_tables(self, global_tables: Dict, local_tables: Dict) -> Dict: 94 | """ 95 | Refresh the global tables. 96 | 97 | :param dict global_tables: the global tables 98 | :param dict local_tables: the local tables 99 | :rtype: dict 100 | :return: the global tables 101 | """ 102 | 103 | @abc.abstractmethod 104 | def get_global_tables_stats( 105 | self, 106 | global_tables: Dict, 107 | threshold: Optional[int] = None, 108 | ) -> Dict: 109 | """ 110 | Get statistics from the global table. 111 | 112 | :param dict global_tables: the global tables 113 | :param int|None threshold: optional threshold 114 | :rtype: dict 115 | :return: the global tables stats 116 | """ 117 | 118 | @abc.abstractmethod 119 | def get_global_tables(self) -> Dict: 120 | """ 121 | Get the global tables from our backend. 122 | 123 | :rtype: dict 124 | :return: global tables 125 | """ 126 | 127 | @abc.abstractmethod 128 | def get_global_tables_from_file(self, file_path: str) -> Dict: 129 | """ 130 | Load global tables from a file. 131 | 132 | :param str file_path: the file path where the load from 133 | :rtype: dict 134 | :return: the loaded object 135 | """ 136 | 137 | @abc.abstractmethod 138 | def get_local_tables_stats(self, local_tables: Dict) -> Dict: 139 | """ 140 | Get statistics from the local table. 141 | 142 | :param dict local_tables: local tables 143 | :rtype: dict 144 | :return: the local tables stats 145 | """ 146 | 147 | @abc.abstractmethod 148 | def get_local_tables(self) -> Dict: 149 | """ 150 | Get the local tables from our backend. 
151 | 152 | :rtype: dict 153 | :return: the local tables 154 | """ 155 | 156 | 157 | class TwoIndexTwoDimensionAnalyzer(AbstractAnalyzer, ABC): 158 | """Analyzer using index and dimension with cardinality set to two.""" 159 | 160 | LOCAL_MIN_COUNT = 50 161 | 162 | PEAK_GLOBAL_COUNT_WEIGHT = 0.8 163 | 164 | PEAK_MIN_SAMP_SUB_RATIO = 0.5 165 | 166 | CROSS_DIMENSIONS = [] 167 | 168 | DIMENSIONS_METADATA = {} 169 | 170 | def _get_dimension_threshold(self, dimension: str, value: str) -> int: 171 | """Get the threshold for a given dimension and value, if available, 0 otherwise.""" 172 | try: 173 | return self.DIMENSIONS_METADATA[dimension]["threshold"][value] 174 | except KeyError: 175 | return 0 176 | 177 | def _get_dimension_values(self, dimension: str) -> List[str]: 178 | """Get all values that a dimension can exhibit, if available.""" 179 | try: 180 | return self.DIMENSIONS_METADATA[dimension]["values"] 181 | except KeyError: 182 | return [] 183 | 184 | def _is_peak( 185 | self, 186 | local_table_stats: models.LocalTableStats, 187 | global_table_stats: models.GlobalTableStats, 188 | ) -> bool: 189 | """ 190 | Return whether the peak is valid. 191 | 192 | :param LocalTableStats local_table_stats: stats about the local table 193 | :param GlobalTableStats global_table_stats: stats about the global table 194 | :rtype: bool 195 | :return: whether the peak is valid 196 | """ 197 | if local_table_stats.sub_count < global_table_stats.threshold: 198 | return False 199 | temp = self.PEAK_GLOBAL_COUNT_WEIGHT * global_table_stats.samp_sub_count_max 200 | is_susp_overall_sub_samp_r = local_table_stats.samp_sub_count_mean > temp 201 | is_dominant_samp_sub = local_table_stats.samp_sub_ratio > self.PEAK_MIN_SAMP_SUB_RATIO 202 | temp = local_table_stats.samp_sub_count_mean + local_table_stats.samp_sub_count_std 203 | is_susp_samp_sub_var = local_table_stats.samp_sub_count_max > temp 204 | return is_susp_overall_sub_samp_r or is_dominant_samp_sub or is_susp_samp_sub_var 205 | 206 | def _update_global_table( 207 | self, 208 | global_table: models.GlobalTable, 209 | local_table: Dict[str, Dict[str, int]], 210 | ) -> models.GlobalTable: 211 | """ 212 | Update the global table 213 | 214 | :param GlobalTable global_table: the global table 215 | :param dict[str, dict[str, int]] local_table: the local table 216 | :rtype: GlobalTable 217 | :return: a refreshed global table 218 | """ 219 | window_count = global_table.window_count + 1 220 | sub_count = sum(local_table[self._index[1]].values()) 221 | samp_count = len(local_table[self._index[1]]) 222 | try: 223 | samp_sub_count = int(round(sub_count / samp_count)) 224 | except ZeroDivisionError: 225 | samp_sub_count = 0 226 | sub_count_avg = int( 227 | round( 228 | (global_table.sub_count_avg * global_table.window_count + sub_count) 229 | / window_count 230 | ) 231 | ) 232 | samp_count_avg = int( 233 | round( 234 | (global_table.samp_count_avg * global_table.window_count + samp_count) 235 | / window_count 236 | ) 237 | ) 238 | samp_sub_count_avg = int( 239 | round( 240 | (global_table.samp_sub_count_avg * global_table.window_count + samp_sub_count) 241 | / window_count 242 | ) 243 | ) 244 | return models.GlobalTable( 245 | start_ts=global_table.start_ts, 246 | end_ts=self._end_ts, 247 | window_count=window_count, 248 | sub_count_avg=sub_count_avg, 249 | sub_count_max=max(global_table.sub_count_max, sub_count), 250 | samp_count_avg=samp_count_avg, 251 | samp_count_max=max(global_table.samp_count_max, samp_count), 252 | samp_sub_count_avg=samp_sub_count_avg, 253 | 
samp_sub_count_max=max(global_table.samp_sub_count_max, samp_sub_count), 254 | threshold_suggested=max(global_table.threshold_suggested, sub_count_avg), 255 | ) 256 | 257 | def _infer_global_table( 258 | self, 259 | local_table: Dict[str, Dict[str, int]], 260 | threshold: int, 261 | ) -> models.GlobalTable: 262 | """ 263 | Infer the global table from the local table. 264 | 265 | :param dict[str, dict[str, int]] local_table: the local table 266 | :param int threshold: the threshold to suggest 267 | :rtype: GlobalTable 268 | :return: the inferred global table 269 | """ 270 | sub_count = sum(local_table[self._index[1]].values()) 271 | samp_count = len(local_table[self._index[1]]) 272 | try: 273 | samp_sub_count = int(round(sub_count / samp_count)) 274 | except ZeroDivisionError: 275 | samp_sub_count = 0 276 | return models.GlobalTable( 277 | start_ts=self._start_ts, 278 | end_ts=self._end_ts, 279 | window_count=1, 280 | sub_count_avg=sub_count, 281 | sub_count_max=sub_count, 282 | samp_count_avg=samp_count, 283 | samp_count_max=samp_count, 284 | samp_sub_count_avg=samp_sub_count, 285 | samp_sub_count_max=samp_sub_count, 286 | threshold_suggested=max(sub_count, threshold), 287 | ) 288 | 289 | def get_peaks( 290 | self, 291 | global_tables: Dict[str, Dict[str, models.GlobalTable]], 292 | local_tables: Dict[str, Dict[str, Dict[str, Dict[str, int]]]], 293 | threshold: Optional[int] = None, 294 | ) -> Dict[str, Dict[str, models.TelemetryPeak]]: 295 | """ 296 | Get the peaks. 297 | 298 | :param dict[str, dict[str, GlobalTable]] global_tables: the global tables 299 | :param dict[str, dict[str, dict[str, dict[str, int]]]] local_tables: the local tables 300 | :param int|None threshold: optional threshold 301 | :rtype: dict[str, dict[str, TelemetryPeak]] 302 | :return: the telemetry peaks 303 | """ 304 | local_tables_stats = self.get_local_tables_stats(local_tables) 305 | global_tables_stats = self.get_global_tables_stats(global_tables, threshold=threshold) 306 | peaks = collections.defaultdict(dict) 307 | for dimension_0 in local_tables_stats: 308 | for dimension_1 in local_tables_stats[dimension_0]: 309 | try: 310 | local_table_stats = local_tables_stats[dimension_0][dimension_1] 311 | global_table_stats = global_tables_stats[dimension_0][dimension_1] 312 | except KeyError: 313 | continue 314 | if self._is_peak(local_table_stats, global_table_stats): 315 | peaks[dimension_0][dimension_1] = models.TelemetryPeak( 316 | sub_count=local_table_stats.sub_count, 317 | samp_count=local_table_stats.samp_count, 318 | samp_sub_count_max=local_table_stats.samp_sub_count_max, 319 | samp_sub_count_mean=local_table_stats.samp_sub_count_mean, 320 | samp_sub_count_std=local_table_stats.samp_sub_count_std, 321 | samp_sub_ratio=local_table_stats.samp_sub_ratio, 322 | global_samp_sub_count_max=global_table_stats.samp_sub_count_max, 323 | global_threshold_suggested=global_table_stats.threshold, 324 | ) 325 | return peaks 326 | 327 | def refresh_global_tables( 328 | self, 329 | global_tables: Dict[str, Dict[str, models.GlobalTable]], 330 | local_tables: Dict[str, Dict[str, Dict[str, Dict[str, int]]]], 331 | ) -> Dict[str, Dict[str, models.GlobalTable]]: 332 | """ 333 | Refresh the global tables. 
334 | 335 | :param dict[str, dict[str, GlobalTable]] global_tables: the global tables 336 | :param dict[str, dict[str, dict[str, dict[str, int]]]] local_tables: the local tables 337 | :rtype: dict[str, dict[str, GlobalTable]] 338 | :return: the global tables 339 | """ 340 | 341 | def get_dimensions_0() -> Set[str]: 342 | """Get the first dimension from all the global and local tables.""" 343 | return set(global_tables.keys()).union(local_tables.keys()) 344 | 345 | def get_dimensions_1(dim_0: str) -> Set[str]: 346 | """Get the second dimension from all the global and local tables.""" 347 | return set(global_tables.get(dim_0, {}).keys()).union( 348 | local_tables.get(dim_0, {}).keys() 349 | ) 350 | 351 | refreshed_global_tables = collections.defaultdict(dict) 352 | for dimension_0 in get_dimensions_0(): 353 | for dimension_1 in get_dimensions_1(dimension_0): 354 | global_table = global_tables.get(dimension_0, {}).get(dimension_1, None) 355 | local_table = local_tables.get(dimension_0, {}).get(dimension_1, None) 356 | # If we have both tables for all dimensions, check their age 357 | if global_table and local_table: 358 | # if the global table is more recent than the current time interval 359 | if self._start_ts < global_table.end_ts: 360 | new_table = global_table 361 | # otherwise update the global table 362 | else: 363 | new_table = self._update_global_table(global_table, local_table) 364 | # if we only have the global table, then use the global table 365 | elif global_table and not local_table: 366 | new_table = global_table 367 | # and if we only have the local table, infer a global table 368 | elif not global_table and local_table: 369 | threshold = self._get_dimension_threshold(self._dimensions[0], dimension_0) 370 | new_table = self._infer_global_table(local_table, threshold) 371 | # otherwise just skip 372 | else: 373 | continue 374 | refreshed_global_tables[dimension_0][dimension_1] = new_table 375 | return refreshed_global_tables 376 | 377 | def get_global_tables_stats( 378 | self, 379 | global_tables: Dict[str, Dict[str, models.GlobalTable]], 380 | threshold: Optional[int] = None, 381 | ) -> Dict[str, Dict[str, models.GlobalTableStats]]: 382 | """ 383 | Get statistics from the global table. 384 | 385 | :param dict[str, dict[str, GlobalTable]] global_tables: the global tables 386 | :param int|None threshold: optional threshold 387 | :rtype: dict[str, dict[str, GlobalTableStats]] 388 | :return: the global tables stats 389 | """ 390 | global_stats = collections.defaultdict(dict) 391 | for dimension_0 in global_tables: 392 | for dimension_1 in global_tables[dimension_0]: 393 | global_table = global_tables[dimension_0][dimension_1] 394 | global_stats[dimension_0][dimension_1] = models.GlobalTableStats( 395 | threshold=threshold or global_table.threshold_suggested, 396 | samp_sub_count_max=global_table.samp_sub_count_max, 397 | ) 398 | return global_stats 399 | 400 | def get_global_tables_from_file( 401 | self, 402 | file_path: str, 403 | ) -> Dict[str, Dict[str, models.GlobalTable]]: 404 | """ 405 | Load global tables from a file. 
406 | 407 | :param str file_path: the file path where the load from 408 | :rtype: dict[str, dict[str, GlobalTable]] 409 | :return: the global tables 410 | """ 411 | with open(file_path, "r") as f: 412 | json_data = json.load(f) 413 | for dimension_0 in json_data: 414 | for dimension_1 in json_data[dimension_0]: 415 | table = json_data[dimension_0][dimension_1] 416 | json_data[dimension_0][dimension_1] = models.GlobalTable( 417 | start_ts=datetime.datetime.strptime(table[0][:19], "%Y-%m-%d %H:%M:%S"), 418 | end_ts=datetime.datetime.strptime(table[1][:19], "%Y-%m-%d %H:%M:%S"), 419 | window_count=table[2], 420 | sub_count_avg=table[3], 421 | sub_count_max=table[4], 422 | samp_count_avg=table[5], 423 | samp_count_max=table[6], 424 | samp_sub_count_avg=table[7], 425 | samp_sub_count_max=table[8], 426 | threshold_suggested=table[9], 427 | ) 428 | return json_data 429 | 430 | def get_global_tables(self) -> Dict[str, Dict[str, models.GlobalTable]]: 431 | """ 432 | Get the global tables. 433 | 434 | :rtype: dict[str, dict[str, GlobalTable]] 435 | :return: the global tables 436 | """ 437 | end_date = datetime.datetime.utcnow() 438 | start_date = end_date - self.DEFAULT_GLOBAL_TABLE_AGE 439 | ret = self._backend.stats( 440 | start_date=start_date, 441 | end_date=end_date, 442 | index=self._index, 443 | dimensions=self._dimensions, 444 | dimensions_values={x: self._get_dimension_values(x) for x in self._dimensions}, 445 | ) 446 | global_tables = collections.defaultdict(dict) 447 | for dimension_0 in ret: 448 | for dimension_1 in ret[dimension_0]: 449 | start_ts = telemetry_peak_analyzer.datetime_to_ms(start_date) 450 | end_ts = telemetry_peak_analyzer.datetime_to_ms(end_date) 451 | threshold = self._get_dimension_threshold(self._dimensions[0], dimension_0) 452 | sub_count_avg = ret[dimension_0][dimension_1]["sub_count_avg"] 453 | sub_count_max = ret[dimension_0][dimension_1]["sub_count_max"] 454 | samp_count_avg = ret[dimension_0][dimension_1]["samp_count_avg"] 455 | samp_count_max = ret[dimension_0][dimension_1]["samp_count_max"] 456 | samp_sub_count_max = ret[dimension_0][dimension_1]["samp_sub_count_max"] 457 | try: 458 | samp_sub_count_avg = int(round(sub_count_avg / samp_count_avg)) 459 | except ZeroDivisionError: 460 | samp_sub_count_avg = 0 461 | global_tables[dimension_0][dimension_1] = models.GlobalTable( 462 | start_ts=start_date, 463 | end_ts=end_date, 464 | window_count=self._get_window_count(start_ts, end_ts), 465 | sub_count_avg=int(round(sub_count_avg)), 466 | sub_count_max=int(sub_count_max), 467 | samp_count_avg=int(round(samp_count_avg)), 468 | samp_count_max=int(samp_count_max), 469 | samp_sub_count_avg=samp_sub_count_avg, 470 | samp_sub_count_max=samp_sub_count_max, 471 | threshold_suggested=max(int(round(sub_count_avg)), threshold), 472 | ) 473 | return global_tables 474 | 475 | def get_local_tables_stats( 476 | self, local_tables: Dict[str, Dict[str, Dict[str, Dict[str, int]]]] 477 | ) -> Dict[str, Dict[str, models.LocalTableStats]]: 478 | """ 479 | Get statistics from the local table. 
480 | 481 | :param dict[str, dict[str, dict[str, dict[str, int]]]] local_tables: local tables 482 | :rtype: dict[str, dict[str, LocalTableStats]] 483 | :return: some statistics for each dimension combination 484 | """ 485 | local_stats = collections.defaultdict(dict) 486 | for dimension_0 in local_tables: 487 | for dimension_1 in local_tables[dimension_0]: 488 | local_table = local_tables[dimension_0][dimension_1] 489 | sub_count = sum(local_table[self._index[1]].values()) 490 | samp_count = len(local_table[self._index[1]]) 491 | samp_sub_count = local_table[self._index[1]].values() 492 | # fails if len < 1, one fails, both fail 493 | try: 494 | samp_sub_count_mean = statistics.mean(samp_sub_count) 495 | samp_sub_count_max = max(samp_sub_count) 496 | except (statistics.StatisticsError, ValueError): 497 | samp_sub_count_mean = 0.0 498 | samp_sub_count_max = 0 499 | # fails if len < 2 500 | try: 501 | samp_sub_count_std = statistics.stdev(samp_sub_count) 502 | except statistics.StatisticsError: 503 | samp_sub_count_std = 0.0 504 | if sub_count > self.LOCAL_MIN_COUNT: 505 | samp_sub_ratio = round(samp_sub_count_max / sub_count, 2) 506 | else: 507 | samp_sub_ratio = 0.0 508 | local_stats[dimension_0][dimension_1] = models.LocalTableStats( 509 | sub_count=sub_count, 510 | samp_count=samp_count, 511 | samp_sub_count_max=samp_sub_count_max, 512 | samp_sub_count_mean=samp_sub_count_mean, 513 | samp_sub_count_std=samp_sub_count_std, 514 | samp_sub_ratio=samp_sub_ratio, 515 | cross_stats={x: local_table[x] for x in self.CROSS_DIMENSIONS}, 516 | ) 517 | return local_stats 518 | 519 | def get_local_tables(self) -> Dict[str, Dict[str, Dict[str, Dict[str, int]]]]: 520 | """ 521 | Get the local tables from our backend. 522 | 523 | :rtype: dict[str, dict[str, dict[str, dict[str, int]]]] 524 | :return: the local tables 525 | """ 526 | terms = self._dimensions + self.CROSS_DIMENSIONS + [self._index[1]] 527 | json_data = self._backend.group_by( 528 | start_date=self._start_ts, 529 | end_date=self._end_ts, 530 | index=self._index, 531 | dimensions=self._dimensions + self.CROSS_DIMENSIONS, 532 | ) 533 | local_tables = collections.defaultdict(dict) 534 | for item in json_data: 535 | dimension_0 = item[self._dimensions[0]] 536 | dimension_1 = item[self._dimensions[1]] 537 | if dimension_1 not in local_tables[dimension_0]: 538 | local_tables[dimension_0][dimension_1] = { 539 | term: collections.defaultdict(int) for term in terms 540 | } 541 | for term in terms: 542 | local_tables[dimension_0][dimension_1][term][item[term]] += item["count"] 543 | return local_tables 544 | 545 | 546 | class FileTypePeakAnalyzer(TwoIndexTwoDimensionAnalyzer): 547 | """Analyzer using index and dimension to track file types.""" 548 | 549 | CROSS_DIMENSIONS = [ 550 | "source.user_id", 551 | "source.origin", 552 | ] 553 | 554 | DIMENSIONS_METADATA = { 555 | "task.severity": { 556 | "values": ["malicious", "benign"], 557 | "threshold": { 558 | "malicious": 90, 559 | "benign": 500, 560 | }, 561 | } 562 | } 563 | 564 | _INDEX = [ 565 | "utc_timestamp", 566 | "file.sha1", 567 | ] 568 | 569 | _DIMENSIONS = [ 570 | "task.severity", 571 | "file.llfile_type", 572 | ] 573 | 574 | def __init__( 575 | self, 576 | conf: configparser.ConfigParser, 577 | backend: backends.BackendType, 578 | start_ts: datetime.datetime, 579 | end_ts: datetime.datetime, 580 | ) -> None: 581 | """ 582 | Constructor. 
583 | 
584 |         :param configparser.ConfigParser conf: the conf object
585 |         :param BackendType backend: the backend
586 |         :param datetime.datetime start_ts: the beginning of the time interval
587 |         :param datetime.datetime end_ts: the end of the time interval
588 |         """
589 |         if not isinstance(backend, backends.TwoIndexTwoDimensionBackend):
590 |             raise ValueError("Backend is not compatible with the chosen analyzer")
591 |         super(FileTypePeakAnalyzer, self).__init__(
592 |             conf=conf,
593 |             index=self._INDEX,
594 |             dimensions=self._DIMENSIONS,
595 |             backend=backend,
596 |             start_ts=start_ts,
597 |             end_ts=end_ts,
598 |         )
599 | 
600 | 
601 | class NetworkTypePeakAnalyzer(TwoIndexTwoDimensionAnalyzer):
602 |     """Analyzer using index and dimension to track network peaks."""
603 | 
604 |     CROSS_DIMENSIONS = [
605 |         "source.user_id",
606 |     ]
607 | 
608 |     DIMENSIONS_METADATA = {
609 |         "event.impact": {
610 |             "values": ["70", "30"],
611 |         }
612 |     }
613 | 
614 |     _INDEX = [
615 |         "utc_timestamp",
616 |         "event.id",
617 |     ]
618 | 
619 |     _DIMENSIONS = [
620 |         "event.impact",
621 |         "threat.name.keyword",
622 |     ]
623 | 
624 |     def __init__(
625 |         self,
626 |         conf: configparser.ConfigParser,
627 |         backend: backends.BackendType,
628 |         start_ts: datetime.datetime,
629 |         end_ts: datetime.datetime,
630 |     ) -> None:
631 |         """
632 |         Constructor.
633 | 
634 |         :param configparser.ConfigParser conf: the conf object
635 |         :param BackendType backend: the backend
636 |         :param datetime.datetime start_ts: the beginning of the time interval
637 |         :param datetime.datetime end_ts: the end of the time interval
638 |         """
639 |         if not isinstance(backend, backends.TwoIndexTwoDimensionBackend):
640 |             raise ValueError("Backend is not compatible with the chosen analyzer")
641 |         # Relax the default age of a global table (used when no saved global table is present)
642 |         self.DEFAULT_GLOBAL_TABLE_AGE = datetime.timedelta(days=3)
643 |         super(NetworkTypePeakAnalyzer, self).__init__(
644 |             conf=conf,
645 |             index=self._INDEX,
646 |             dimensions=self._DIMENSIONS,
647 |             backend=backend,
648 |             start_ts=start_ts,
649 |             end_ts=end_ts,
650 |         )
651 | 
--------------------------------------------------------------------------------